262 files changed, 24507 insertions, 6803 deletions
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 11270ca22c0..4f3006b600e 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -3,16 +3,30 @@
 
 config TILE
 	def_bool y
+	select HAVE_PERF_EVENTS
+	select USE_PMC if PERF_EVENTS
+	select HAVE_DMA_ATTRS
+	select HAVE_DMA_API_DEBUG
 	select HAVE_KVM if !TILEGX
 	select GENERIC_FIND_FIRST_BIT
-	select USE_GENERIC_SMP_HELPERS
+	select SYSCTL_EXCEPTION_TRACE
 	select CC_OPTIMIZE_FOR_SIZE
-	select HAVE_GENERIC_HARDIRQS
+	select HAVE_DEBUG_KMEMLEAK
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
 	select GENERIC_IRQ_SHOW
+	select HAVE_DEBUG_BUGVERBOSE
+	select VIRT_TO_BUS
 	select SYS_HYPERVISOR
-	select ARCH_HAVE_NMI_SAFE_CMPXCHG if !M386
+	select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
+	select ARCH_HAVE_NMI_SAFE_CMPXCHG
+	select GENERIC_CLOCKEVENTS
+	select MODULES_USE_ELF_RELA
+	select HAVE_ARCH_TRACEHOOK
+	select HAVE_SYSCALL_TRACEPOINTS
+	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
+	select HAVE_DEBUG_STACKOVERFLOW
+	select ARCH_WANT_FRAME_POINTERS
 
 # FIXME: investigate whether we need/want these options.
 #	select HAVE_IOREMAP_PROT
@@ -31,9 +45,6 @@ config MMU
 config GENERIC_CSUM
 	def_bool y
 
-config SEMAPHORE_SLEEPERS
-	def_bool y
-
 config HAVE_ARCH_ALLOC_REMAP
 	def_bool y
 
@@ -46,19 +57,25 @@ config NEED_PER_CPU_PAGE_FIRST_CHUNK
 config SYS_SUPPORTS_HUGETLBFS
 	def_bool y
 
-config GENERIC_CLOCKEVENTS
+# Support for additional huge page sizes besides HPAGE_SIZE.
+# The software support is currently only present in the TILE-Gx
+# hypervisor. TILEPro in any case does not support page sizes
+# larger than the default HPAGE_SIZE.
+config HUGETLB_SUPER_PAGES
+	depends on HUGETLB_PAGE && TILEGX
+	def_bool y
+
+config GENERIC_TIME_VSYSCALL
 	def_bool y
 
+# Enable PMC if PERF_EVENTS, OPROFILE, or WATCHPOINTS are enabled.
+config USE_PMC
+	bool
+
 # FIXME: tilegx can implement a more efficient rwsem.
 config RWSEM_GENERIC_SPINLOCK
 	def_bool y
 
-# We have a very flat architecture from a migration point of view,
-# so save boot time by presetting this (particularly useful on tile-sim).
-config DEFAULT_MIGRATION_COST
-	int
-	default "10000000"
-
 # We only support gcc 4.4 and above, so this should work.
 config ARCH_SUPPORTS_OPTIMIZED_INLINING
 	def_bool y
@@ -69,6 +86,12 @@ config ARCH_PHYS_ADDR_T_64BIT
 config ARCH_DMA_ADDR_T_64BIT
 	def_bool y
 
+config NEED_DMA_MAP_STATE
+	def_bool y
+
+config ARCH_HAS_DMA_SET_COHERENT_MASK
+	bool
+
 config LOCKDEP_SUPPORT
 	def_bool y
 
@@ -94,35 +117,40 @@ config STRICT_DEVMEM
 config SMP
 	def_bool y
 
-# Allow checking for compile-time determined overflow errors in
-# copy_from_user().  There are still unprovable places in the
-# generic code as of 2.6.34, so this option is not really compatible
-# with -Werror, which is more useful in general.
-config DEBUG_COPY_FROM_USER
-	def_bool n
-
 config HVC_TILE
+	depends on TTY
 	select HVC_DRIVER
+	select HVC_IRQ if TILEGX
 	def_bool y
 
-# Please note: TILE-Gx support is not yet finalized; this is
-# the preliminary support.  TILE-Gx drivers are only provided
-# with the alpha or beta test versions for Tilera customers.
 config TILEGX
-	depends on EXPERIMENTAL
-	bool "Building with TILE-Gx (64-bit) compiler and toolchain"
+	bool "Building for TILE-Gx (64-bit) processor"
+	select SPARSE_IRQ
+	select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+	select HAVE_FUNCTION_TRACER
+	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	select HAVE_FUNCTION_GRAPH_TRACER
+	select HAVE_DYNAMIC_FTRACE
+	select HAVE_FTRACE_MCOUNT_RECORD
+	select HAVE_KPROBES
+	select HAVE_KRETPROBES
+	select HAVE_ARCH_KGDB
+
+config TILEPRO
+	def_bool !TILEGX
 
 config 64BIT
-	depends on TILEGX
-	def_bool y
+	def_bool TILEGX
 
 config ARCH_DEFCONFIG
 	string
-	default "arch/tile/configs/tile_defconfig" if !TILEGX
+	default "arch/tile/configs/tilepro_defconfig" if !TILEGX
 	default "arch/tile/configs/tilegx_defconfig" if TILEGX
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "Tilera-specific configuration"
 
 config NR_CPUS
@@ -135,7 +163,30 @@ config NR_CPUS
 	  smaller kernel memory footprint results from using a smaller
 	  value on chips with fewer tiles.
 
-source "kernel/time/Kconfig"
+if TILEGX
+
+choice
+	prompt "Kernel page size"
+	default PAGE_SIZE_64KB
+	help
+	  This lets you select the page size of the kernel.  For best
+	  performance on memory-intensive applications, a page size of 64KB
+	  is recommended.  For workloads involving many small files, many
+	  connections, etc., it may be better to select 16KB, which uses
+	  memory more efficiently at some cost in TLB performance.
+
+	  Note that this option is TILE-Gx specific; currently
+	  TILEPro page size is set by rebuilding the hypervisor.
+
+config PAGE_SIZE_16KB
+	bool "16KB"
+
+config PAGE_SIZE_64KB
+	bool "64KB"
+
+endchoice
+
+endif
 
 source "kernel/Kconfig.hz"
 
@@ -162,7 +213,7 @@ config SYSVIPC_COMPAT
 	def_bool y
 	depends on COMPAT && SYSVIPC
 
-# We do not currently support disabling HIGHMEM on tile64 and tilepro.
+# We do not currently support disabling HIGHMEM on tilepro.
 config HIGHMEM
 	bool # "Support for more than 512 MB of RAM"
 	default !TILEGX
@@ -181,6 +232,22 @@ config HIGHMEM
 
 	  If unsure, say "true".
 
+config ZONE_DMA
+	def_bool y
+
+config IOMMU_HELPER
+	bool
+
+config NEED_SG_DMA_LENGTH
+	bool
+
+config SWIOTLB
+	bool
+	default TILEGX
+	select IOMMU_HELPER
+	select NEED_SG_DMA_LENGTH
+	select ARCH_HAS_DMA_SET_COHERENT_MASK
+
 # We do not currently support disabling NUMA.
 config NUMA
 	bool # "NUMA Memory Allocation and Scheduler Support"
@@ -240,6 +307,7 @@ endchoice
 
 config PAGE_OFFSET
 	hex
+	depends on !64BIT
 	default 0xF0000000 if VMSPLIT_3_75G
 	default 0xE0000000 if VMSPLIT_3_5G
 	default 0xB0000000 if VMSPLIT_2_75G
@@ -251,6 +319,8 @@ config PAGE_OFFSET
 
 source "mm/Kconfig"
 
+source "kernel/Kconfig.preempt"
+
 config CMDLINE_BOOL
 	bool "Built-in kernel command line"
 	default n
@@ -298,7 +368,7 @@ config CMDLINE_OVERRIDE
 
 config VMALLOC_RESERVE
 	hex
-	default 0x1000000
+	default 0x2000000
 
 config HARDWALL
 	bool "Hardwall support to allow access to user dynamic network"
@@ -307,11 +377,19 @@ config HARDWALL
 config KERNEL_PL
 	int "Processor protection level for kernel"
 	range 1 2
-	default "1"
+	default 2 if TILEGX
+	default 1 if !TILEGX
 	---help---
-	  This setting determines the processor protection level the
-	  kernel will be built to run at.  Generally you should use
-	  the default value here.
+	  Since MDE 4.2, the Tilera hypervisor runs the kernel
+	  at PL2 by default.  If running under an older hypervisor,
+	  or as a KVM guest, you must run at PL1.  (The current
+	  hypervisor may also be recompiled with "make HV_PL=2" to
+	  allow it to run a kernel at PL1, but clients running at PL1
+	  are not expected to be supported indefinitely.)
+
+	  If you're not sure, don't change the default.
+
+source "arch/tile/gxio/Kconfig"
 
 endmenu  # Tilera-specific configuration
 
@@ -322,6 +400,8 @@ config PCI
 	default y
 	select PCI_DOMAINS
 	select GENERIC_PCI_IOMAP
+	select TILE_GXIO_TRIO if TILEGX
+	select PCI_MSI if TILEGX
 	---help---
 	  Enable PCI root complex support, so PCIe endpoint devices can
 	  be attached to the Tile chip.  Many, but not all, PCI devices
@@ -333,18 +413,32 @@ config PCI_DOMAINS
 config NO_IOMEM
 	def_bool !PCI
 
-config NO_IOPORT
+config NO_IOPORT_MAP
 	def_bool !PCI
 
+config TILE_PCI_IO
+	bool "PCI I/O space support"
+	default n
+	depends on PCI
+	depends on TILEGX
+	---help---
+	  Enable PCI I/O space support on TILEGx. Since the PCI I/O space
+	  is used by few modern PCIe endpoint devices, its support is disabled
+	  by default to save the TRIO PIO Region resource for other purposes.
+
 source "drivers/pci/Kconfig"
 
-config HOTPLUG
-	bool "Support for hot-pluggable devices"
+source "drivers/pci/pcie/Kconfig"
+
+config TILE_USB
+	tristate "Tilera USB host adapter support"
+	default y
+	depends on USB
+	depends on TILEGX
+	select TILE_GXIO_USB_HOST
 	---help---
-	  Say Y here if you want to plug devices into your computer while
-	  the system is running, and be able to use them quickly.  In many
-	  cases, the devices can likewise be unplugged at any time too.
-	  One well-known example of this is USB.
+	  Provides USB host adapter support for the built-in EHCI and OHCI
+	  interfaces on TILE-Gx chips.
 
 source "drivers/pci/hotplug/Kconfig"
 
@@ -352,11 +446,6 @@ endmenu
 
 menu "Executable file formats"
 
-# only elf supported
-config KCORE_ELF
-	def_bool y
-	depends on PROC_FS
-
 source "fs/Kconfig.binfmt"
 
 endmenu
diff --git a/arch/tile/Kconfig.debug b/arch/tile/Kconfig.debug
index ddbfc3322d7..19734d3ab1e 100644
--- a/arch/tile/Kconfig.debug
+++ b/arch/tile/Kconfig.debug
@@ -14,21 +14,12 @@ config EARLY_PRINTK
 	  with klogd/syslogd. You should normally N here,
 	  unless you want to debug such a crash.
 
-config DEBUG_STACKOVERFLOW
-	bool "Check for stack overflows"
-	depends on DEBUG_KERNEL
+config TILE_HVGLUE_TRACE
+	bool "Provide wrapper functions for hypervisor ABI calls"
+	default n
 	help
-	  This option will cause messages to be printed if free stack space
-	  drops below a certain limit.
-
-config DEBUG_EXTRA_FLAGS
-	string "Additional compiler arguments when building with '-g'"
-	depends on DEBUG_INFO
-	default ""
-	help
-	  Debug info can be large, and flags like
-	  `-femit-struct-debug-baseonly' can reduce the kernel file
-	  size and build time noticeably.  Such flags are often
-	  helpful if the main use of debug info is line number info.
+	  Provide wrapper functions for the hypervisor ABI calls
+	  defined in arch/tile/kernel/hvglue.S.  This allows tracing
+	  mechanisms, etc., to have visibility into those calls.
 
 endmenu
diff --git a/arch/tile/Makefile b/arch/tile/Makefile
index 17acce70569..4dc380a519d 100644
--- a/arch/tile/Makefile
+++ b/arch/tile/Makefile
@@ -26,14 +26,20 @@ $(error Set TILERA_ROOT or CROSS_COMPILE when building $(ARCH) on $(HOST_ARCH))
   endif
 endif
 
-ifneq ($(CONFIG_DEBUG_EXTRA_FLAGS),"")
-KBUILD_CFLAGS   += $(CONFIG_DEBUG_EXTRA_FLAGS)
-endif
+# The tile compiler may emit .eh_frame information for backtracing.
+# In kernel modules, this causes load failures due to unsupported relocations.
+KBUILD_CFLAGS   += -fno-asynchronous-unwind-tables
 
-LIBGCC_PATH     := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
+LIBGCC_PATH     := \
+  $(shell $(CC) $(KBUILD_CFLAGS) $(KCFLAGS) -print-libgcc-file-name)
 
 # Provide the path to use for "make defconfig".
-KBUILD_DEFCONFIG := $(ARCH)_defconfig
+# We default to the newer TILE-Gx architecture if only "tile" is given.
+ifeq ($(ARCH),tile)
+        KBUILD_DEFCONFIG := tilegx_defconfig
+else
+        KBUILD_DEFCONFIG := $(ARCH)_defconfig
+endif
 
 # Used as a file extension when useful, e.g. head_$(BITS).o
 # Not needed for (e.g.) "$(CC) -m32" since the compiler automatically
@@ -53,7 +59,7 @@ libs-y		+= $(LIBGCC_PATH)
 # See arch/tile/Kbuild for content of core part of the kernel
 core-y		+= arch/tile/
 
-core-$(CONFIG_KVM) += arch/tile/kvm/
+core-$(CONFIG_TILE_GXIO) += arch/tile/gxio/
 
 ifdef TILERA_ROOT
 INSTALL_PATH ?= $(TILERA_ROOT)/tile/boot
diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig
index b8d99aca543..730e40d9cf6 100644
--- a/arch/tile/configs/tilegx_defconfig
+++ b/arch/tile/configs/tilegx_defconfig
@@ -1,16 +1,15 @@
 CONFIG_TILEGX=y
-CONFIG_EXPERIMENTAL=y
-# CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_FHANDLE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_FHANDLE=y
 CONFIG_TASKSTATS=y
 CONFIG_TASK_DELAY_ACCT=y
 CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_AUDIT=y
 CONFIG_LOG_BUF_SHIFT=19
 CONFIG_CGROUPS=y
 CONFIG_CGROUP_DEBUG=y
@@ -18,18 +17,18 @@ CONFIG_CGROUP_DEVICE=y
 CONFIG_CPUSETS=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_RESOURCE_COUNTERS=y
-CONFIG_CGROUP_MEM_RES_CTLR=y
-CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_BLK_CGROUP=y
 CONFIG_NAMESPACES=y
 CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_RD_XZ=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_EMBEDDED=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
+CONFIG_KPROBES=y
 CONFIG_MODULES=y
 CONFIG_MODULE_FORCE_LOAD=y
 CONFIG_MODULE_UNLOAD=y
@@ -45,12 +44,12 @@ CONFIG_UNIXWARE_DISKLABEL=y
 CONFIG_SGI_PARTITION=y
 CONFIG_SUN_PARTITION=y
 CONFIG_KARMA_PARTITION=y
-CONFIG_EFI_PARTITION=y
 CONFIG_CFQ_GROUP_IOSCHED=y
 CONFIG_NR_CPUS=100
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
 CONFIG_HZ_100=y
+# CONFIG_COMPACTION is not set
+CONFIG_PREEMPT_VOLUNTARY=y
+CONFIG_TILE_PCI_IO=y
 CONFIG_PCI_DEBUG=y
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 CONFIG_BINFMT_MISC=y
@@ -108,151 +107,9 @@ CONFIG_IPV6_MULTIPLE_TABLES=y
 CONFIG_IPV6_MROUTE=y
 CONFIG_IPV6_PIMSM_V2=y
 CONFIG_NETLABEL=y
-CONFIG_NETFILTER=y
-CONFIG_NF_CONNTRACK=m
-CONFIG_NF_CONNTRACK_SECMARK=y
-CONFIG_NF_CONNTRACK_ZONES=y
-CONFIG_NF_CONNTRACK_EVENTS=y
-CONFIG_NF_CT_PROTO_DCCP=m
-CONFIG_NF_CT_PROTO_UDPLITE=m
-CONFIG_NF_CONNTRACK_AMANDA=m
-CONFIG_NF_CONNTRACK_FTP=m
-CONFIG_NF_CONNTRACK_H323=m
-CONFIG_NF_CONNTRACK_IRC=m
-CONFIG_NF_CONNTRACK_NETBIOS_NS=m
-CONFIG_NF_CONNTRACK_PPTP=m
-CONFIG_NF_CONNTRACK_SANE=m
-CONFIG_NF_CONNTRACK_SIP=m
-CONFIG_NF_CONNTRACK_TFTP=m
-CONFIG_NETFILTER_TPROXY=m
-CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
-CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
-CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
-CONFIG_NETFILTER_XT_TARGET_CT=m
-CONFIG_NETFILTER_XT_TARGET_DSCP=m
-CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m
-CONFIG_NETFILTER_XT_TARGET_MARK=m
-CONFIG_NETFILTER_XT_TARGET_NFLOG=m
-CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
-CONFIG_NETFILTER_XT_TARGET_NOTRACK=m
-CONFIG_NETFILTER_XT_TARGET_TEE=m
-CONFIG_NETFILTER_XT_TARGET_TPROXY=m
-CONFIG_NETFILTER_XT_TARGET_TRACE=m
-CONFIG_NETFILTER_XT_TARGET_SECMARK=m
-CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
-CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
-CONFIG_NETFILTER_XT_MATCH_CLUSTER=m
-CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
-CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
-CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
-CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
-CONFIG_NETFILTER_XT_MATCH_DSCP=m
-CONFIG_NETFILTER_XT_MATCH_ESP=m
-CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
-CONFIG_NETFILTER_XT_MATCH_HELPER=m
-CONFIG_NETFILTER_XT_MATCH_IPRANGE=m
-CONFIG_NETFILTER_XT_MATCH_IPVS=m
-CONFIG_NETFILTER_XT_MATCH_LENGTH=m
-CONFIG_NETFILTER_XT_MATCH_LIMIT=m
-CONFIG_NETFILTER_XT_MATCH_MAC=m
-CONFIG_NETFILTER_XT_MATCH_MARK=m
-CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
-CONFIG_NETFILTER_XT_MATCH_OSF=m
-CONFIG_NETFILTER_XT_MATCH_OWNER=m
-CONFIG_NETFILTER_XT_MATCH_POLICY=m
-CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m
-CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
-CONFIG_NETFILTER_XT_MATCH_QUOTA=m
-CONFIG_NETFILTER_XT_MATCH_RATEEST=m
-CONFIG_NETFILTER_XT_MATCH_REALM=m
-CONFIG_NETFILTER_XT_MATCH_RECENT=m
-CONFIG_NETFILTER_XT_MATCH_SOCKET=m
-CONFIG_NETFILTER_XT_MATCH_STATE=m
-CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
-CONFIG_NETFILTER_XT_MATCH_STRING=m
-CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
-CONFIG_NETFILTER_XT_MATCH_TIME=m
-CONFIG_NETFILTER_XT_MATCH_U32=m
-CONFIG_IP_VS=m
-CONFIG_IP_VS_IPV6=y
-CONFIG_IP_VS_PROTO_TCP=y
-CONFIG_IP_VS_PROTO_UDP=y
-CONFIG_IP_VS_PROTO_ESP=y
-CONFIG_IP_VS_PROTO_AH=y
-CONFIG_IP_VS_PROTO_SCTP=y
-CONFIG_IP_VS_RR=m
-CONFIG_IP_VS_WRR=m
-CONFIG_IP_VS_LC=m
-CONFIG_IP_VS_WLC=m
-CONFIG_IP_VS_LBLC=m
-CONFIG_IP_VS_LBLCR=m
-CONFIG_IP_VS_SED=m
-CONFIG_IP_VS_NQ=m
-CONFIG_NF_CONNTRACK_IPV4=m
-# CONFIG_NF_CONNTRACK_PROC_COMPAT is not set
-CONFIG_IP_NF_QUEUE=m
-CONFIG_IP_NF_IPTABLES=y
-CONFIG_IP_NF_MATCH_AH=m
-CONFIG_IP_NF_MATCH_ECN=m
-CONFIG_IP_NF_MATCH_TTL=m
-CONFIG_IP_NF_FILTER=y
-CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_IP_NF_TARGET_LOG=m
-CONFIG_IP_NF_TARGET_ULOG=m
-CONFIG_IP_NF_MANGLE=m
-CONFIG_IP_NF_TARGET_ECN=m
-CONFIG_IP_NF_TARGET_TTL=m
-CONFIG_IP_NF_RAW=m
-CONFIG_IP_NF_SECURITY=m
-CONFIG_IP_NF_ARPTABLES=m
-CONFIG_IP_NF_ARPFILTER=m
-CONFIG_IP_NF_ARP_MANGLE=m
-CONFIG_NF_CONNTRACK_IPV6=m
-CONFIG_IP6_NF_QUEUE=m
-CONFIG_IP6_NF_IPTABLES=m
-CONFIG_IP6_NF_MATCH_AH=m
-CONFIG_IP6_NF_MATCH_EUI64=m
-CONFIG_IP6_NF_MATCH_FRAG=m
-CONFIG_IP6_NF_MATCH_OPTS=m
-CONFIG_IP6_NF_MATCH_HL=m
-CONFIG_IP6_NF_MATCH_IPV6HEADER=m
-CONFIG_IP6_NF_MATCH_MH=m
-CONFIG_IP6_NF_MATCH_RT=m
-CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_TARGET_LOG=m
-CONFIG_IP6_NF_FILTER=m
-CONFIG_IP6_NF_TARGET_REJECT=m
-CONFIG_IP6_NF_MANGLE=m
-CONFIG_IP6_NF_RAW=m
-CONFIG_IP6_NF_SECURITY=m
-CONFIG_BRIDGE_NF_EBTABLES=m
-CONFIG_BRIDGE_EBT_BROUTE=m
-CONFIG_BRIDGE_EBT_T_FILTER=m
-CONFIG_BRIDGE_EBT_T_NAT=m
-CONFIG_BRIDGE_EBT_802_3=m
-CONFIG_BRIDGE_EBT_AMONG=m
-CONFIG_BRIDGE_EBT_ARP=m
-CONFIG_BRIDGE_EBT_IP=m
-CONFIG_BRIDGE_EBT_IP6=m
-CONFIG_BRIDGE_EBT_LIMIT=m
-CONFIG_BRIDGE_EBT_MARK=m
-CONFIG_BRIDGE_EBT_PKTTYPE=m
-CONFIG_BRIDGE_EBT_STP=m
-CONFIG_BRIDGE_EBT_VLAN=m
-CONFIG_BRIDGE_EBT_ARPREPLY=m
-CONFIG_BRIDGE_EBT_DNAT=m
-CONFIG_BRIDGE_EBT_MARK_T=m
-CONFIG_BRIDGE_EBT_REDIRECT=m
-CONFIG_BRIDGE_EBT_SNAT=m
-CONFIG_BRIDGE_EBT_LOG=m
-CONFIG_BRIDGE_EBT_ULOG=m
-CONFIG_BRIDGE_EBT_NFLOG=m
 CONFIG_RDS=m
 CONFIG_RDS_TCP=m
 CONFIG_BRIDGE=m
-CONFIG_NET_DSA=y
 CONFIG_VLAN_8021Q=m
 CONFIG_VLAN_8021Q_GVRP=y
 CONFIG_PHONET=m
@@ -293,13 +150,13 @@ CONFIG_NET_ACT_POLICE=m
 CONFIG_NET_ACT_GACT=m
 CONFIG_GACT_PROB=y
 CONFIG_NET_ACT_MIRRED=m
-CONFIG_NET_ACT_IPT=m
 CONFIG_NET_ACT_NAT=m
 CONFIG_NET_ACT_PEDIT=m
 CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
 CONFIG_NET_CLS_IND=y
 CONFIG_DCB=y
+CONFIG_DNS_RESOLVER=y
 # CONFIG_WIRELESS is not set
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
@@ -318,10 +175,12 @@ CONFIG_BLK_DEV_SD=y
 CONFIG_SCSI_CONSTANTS=y
 CONFIG_SCSI_LOGGING=y
 CONFIG_SCSI_SAS_ATA=y
+CONFIG_ISCSI_TCP=m
 CONFIG_SCSI_MVSAS=y
 # CONFIG_SCSI_MVSAS_DEBUG is not set
 CONFIG_SCSI_MVSAS_TASKLET=y
 CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
 CONFIG_SATA_SIL24=y
 # CONFIG_ATA_SFF is not set
 CONFIG_MD=y
@@ -331,7 +190,6 @@ CONFIG_MD_RAID0=m
 CONFIG_MD_RAID1=m
 CONFIG_MD_RAID10=m
 CONFIG_MD_RAID456=m
-CONFIG_MULTICORE_RAID456=y
 CONFIG_MD_FAULTY=m
 CONFIG_BLK_DEV_DM=m
 CONFIG_DM_DEBUG=y
@@ -345,6 +203,12 @@ CONFIG_DM_MULTIPATH_QL=m
 CONFIG_DM_MULTIPATH_ST=m
 CONFIG_DM_DELAY=m
 CONFIG_DM_UEVENT=y
+CONFIG_TARGET_CORE=m
+CONFIG_TCM_IBLOCK=m
+CONFIG_TCM_FILEIO=m
+CONFIG_TCM_PSCSI=m
+CONFIG_LOOPBACK_TARGET=m
+CONFIG_ISCSI_TARGET=m
 CONFIG_FUSION=y
 CONFIG_FUSION_SAS=y
 CONFIG_NETDEVICES=y
@@ -361,42 +225,8 @@ CONFIG_VETH=m
 CONFIG_NET_DSA_MV88E6060=y
 CONFIG_NET_DSA_MV88E6131=y
 CONFIG_NET_DSA_MV88E6123_61_65=y
-# CONFIG_NET_VENDOR_3COM is not set
-# CONFIG_NET_VENDOR_ADAPTEC is not set
-# CONFIG_NET_VENDOR_ALTEON is not set
-# CONFIG_NET_VENDOR_AMD is not set
-# CONFIG_NET_VENDOR_ATHEROS is not set
-# CONFIG_NET_VENDOR_BROADCOM is not set
-# CONFIG_NET_VENDOR_BROCADE is not set
-# CONFIG_NET_VENDOR_CHELSIO is not set
-# CONFIG_NET_VENDOR_CISCO is not set
-# CONFIG_NET_VENDOR_DEC is not set
-# CONFIG_NET_VENDOR_DLINK is not set
-# CONFIG_NET_VENDOR_EMULEX is not set
-# CONFIG_NET_VENDOR_EXAR is not set
-# CONFIG_NET_VENDOR_HP is not set
-# CONFIG_NET_VENDOR_INTEL is not set
-# CONFIG_NET_VENDOR_MARVELL is not set
-# CONFIG_NET_VENDOR_MELLANOX is not set
-# CONFIG_NET_VENDOR_MICREL is not set
-# CONFIG_NET_VENDOR_MYRI is not set
-# CONFIG_NET_VENDOR_NATSEMI is not set
-# CONFIG_NET_VENDOR_NVIDIA is not set
-# CONFIG_NET_VENDOR_OKI is not set
-# CONFIG_NET_PACKET_ENGINE is not set
-# CONFIG_NET_VENDOR_QLOGIC is not set
-# CONFIG_NET_VENDOR_REALTEK is not set
-# CONFIG_NET_VENDOR_RDC is not set
-# CONFIG_NET_VENDOR_SEEQ is not set
-# CONFIG_NET_VENDOR_SILAN is not set
-# CONFIG_NET_VENDOR_SIS is not set
-# CONFIG_NET_VENDOR_SMSC is not set
-# CONFIG_NET_VENDOR_STMICRO is not set
-# CONFIG_NET_VENDOR_SUN is not set
-# CONFIG_NET_VENDOR_TEHUTI is not set
-# CONFIG_NET_VENDOR_TI is not set
-# CONFIG_TILE_NET is not set
-# CONFIG_NET_VENDOR_VIA is not set
+CONFIG_SKY2=y
+CONFIG_PTP_1588_CLOCK_TILEGX=y
 # CONFIG_WLAN is not set
 # CONFIG_INPUT_MOUSEDEV is not set
 # CONFIG_INPUT_KEYBOARD is not set
@@ -404,6 +234,7 @@ CONFIG_NET_DSA_MV88E6123_61_65=y
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
 # CONFIG_LEGACY_PTYS is not set
+CONFIG_SERIAL_TILEGX=y
 CONFIG_HW_RANDOM=y
 CONFIG_HW_RANDOM_TIMERIOMEM=m
 CONFIG_I2C=y
@@ -412,13 +243,16 @@ CONFIG_I2C_CHARDEV=y
 CONFIG_WATCHDOG=y
 CONFIG_WATCHDOG_NOWAYOUT=y
 # CONFIG_VGA_ARB is not set
-# CONFIG_HID_SUPPORT is not set
+CONFIG_DRM=m
+CONFIG_DRM_TDFX=m
+CONFIG_DRM_R128=m
+CONFIG_DRM_MGA=m
+CONFIG_DRM_VIA=m
+CONFIG_DRM_SAVAGE=m
 CONFIG_USB=y
-# CONFIG_USB_DEVICE_CLASS is not set
 CONFIG_USB_EHCI_HCD=y
 CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_STORAGE=y
-CONFIG_USB_LIBUSUAL=y
 CONFIG_EDAC=y
 CONFIG_EDAC_MM_EDAC=y
 CONFIG_RTC_CLASS=y
@@ -466,9 +300,8 @@ CONFIG_ECRYPT_FS=m
 CONFIG_CRAMFS=m
 CONFIG_SQUASHFS=m
 CONFIG_NFS_FS=m
-CONFIG_NFS_V3=y
 CONFIG_NFS_V3_ACL=y
-CONFIG_NFS_V4=y
+CONFIG_NFS_V4=m
 CONFIG_NFS_V4_1=y
 CONFIG_NFS_FSCACHE=y
 CONFIG_NFSD=m
@@ -521,25 +354,28 @@ CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_KOI8_R=m
 CONFIG_NLS_KOI8_U=m
 CONFIG_NLS_UTF8=m
+CONFIG_DLM=m
 CONFIG_DLM_DEBUG=y
+CONFIG_DYNAMIC_DEBUG=y
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_REDUCED=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
-CONFIG_MAGIC_SYSRQ=y
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_FS=y
 CONFIG_HEADERS_CHECK=y
+# CONFIG_FRAME_POINTER is not set
+CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_LOCKUP_DETECTOR=y
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
-CONFIG_DEBUG_INFO=y
-CONFIG_DEBUG_INFO_REDUCED=y
-CONFIG_DEBUG_VM=y
-CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DEBUG_LIST=y
 CONFIG_DEBUG_CREDENTIALS=y
-CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y
-CONFIG_DYNAMIC_DEBUG=y
+CONFIG_RCU_CPU_STALL_TIMEOUT=60
 CONFIG_ASYNC_RAID6_TEST=m
-CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_KGDB=y
 CONFIG_KEYS_DEBUG_PROC_KEYS=y
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
@@ -548,7 +384,6 @@ CONFIG_SECURITY_NETWORK_XFRM=y
 CONFIG_SECURITY_SELINUX=y
 CONFIG_SECURITY_SELINUX_BOOTPARAM=y
 CONFIG_SECURITY_SELINUX_DISABLE=y
-CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_PCRYPT=m
 CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_TEST=m
@@ -561,14 +396,12 @@ CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_CRC32C=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
 CONFIG_CRYPTO_RMD256=m
 CONFIG_CRYPTO_RMD320=m
 CONFIG_CRYPTO_SHA1=y
-CONFIG_CRYPTO_SHA256=m
 CONFIG_CRYPTO_SHA512=m
 CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig
index 2b1fd31894f..80fc32ed049 100644
--- a/arch/tile/configs/tilepro_defconfig
+++ b/arch/tile/configs/tilepro_defconfig
@@ -1,15 +1,14 @@
-CONFIG_EXPERIMENTAL=y
-# CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
-CONFIG_FHANDLE=y
 CONFIG_TASKSTATS=y
 CONFIG_TASK_DELAY_ACCT=y
 CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_AUDIT=y
 CONFIG_LOG_BUF_SHIFT=19
 CONFIG_CGROUPS=y
 CONFIG_CGROUP_DEBUG=y
@@ -17,14 +16,13 @@ CONFIG_CGROUP_DEVICE=y
 CONFIG_CPUSETS=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_RESOURCE_COUNTERS=y
-CONFIG_CGROUP_MEM_RES_CTLR=y
-CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_RT_GROUP_SCHED=y
 CONFIG_BLK_CGROUP=y
 CONFIG_NAMESPACES=y
 CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_RD_XZ=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_EMBEDDED=y
 # CONFIG_COMPAT_BRK is not set
@@ -44,11 +42,10 @@ CONFIG_UNIXWARE_DISKLABEL=y
 CONFIG_SGI_PARTITION=y
 CONFIG_SUN_PARTITION=y
 CONFIG_KARMA_PARTITION=y
-CONFIG_EFI_PARTITION=y
 CONFIG_CFQ_GROUP_IOSCHED=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
 CONFIG_HZ_100=y
+# CONFIG_COMPACTION is not set
+CONFIG_PREEMPT_VOLUNTARY=y
 CONFIG_PCI_DEBUG=y
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 CONFIG_BINFMT_MISC=y
@@ -122,11 +119,9 @@ CONFIG_NF_CONNTRACK_PPTP=m
 CONFIG_NF_CONNTRACK_SANE=m
 CONFIG_NF_CONNTRACK_SIP=m
 CONFIG_NF_CONNTRACK_TFTP=m
-CONFIG_NETFILTER_TPROXY=m
 CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
 CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
 CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
-CONFIG_NETFILTER_XT_TARGET_CT=m
 CONFIG_NETFILTER_XT_TARGET_DSCP=m
 CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m
 CONFIG_NETFILTER_XT_TARGET_MARK=m
@@ -190,14 +185,12 @@ CONFIG_IP_VS_SED=m
 CONFIG_IP_VS_NQ=m
 CONFIG_NF_CONNTRACK_IPV4=m
 # CONFIG_NF_CONNTRACK_PROC_COMPAT is not set
-CONFIG_IP_NF_QUEUE=m
 CONFIG_IP_NF_IPTABLES=y
 CONFIG_IP_NF_MATCH_AH=m
 CONFIG_IP_NF_MATCH_ECN=m
 CONFIG_IP_NF_MATCH_TTL=m
 CONFIG_IP_NF_FILTER=y
 CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_IP_NF_TARGET_LOG=m
 CONFIG_IP_NF_TARGET_ULOG=m
 CONFIG_IP_NF_MANGLE=m
 CONFIG_IP_NF_TARGET_ECN=m
@@ -208,8 +201,6 @@ CONFIG_IP_NF_ARPTABLES=m
 CONFIG_IP_NF_ARPFILTER=m
 CONFIG_IP_NF_ARP_MANGLE=m
 CONFIG_NF_CONNTRACK_IPV6=m
-CONFIG_IP6_NF_QUEUE=m
-CONFIG_IP6_NF_IPTABLES=m
 CONFIG_IP6_NF_MATCH_AH=m
 CONFIG_IP6_NF_MATCH_EUI64=m
 CONFIG_IP6_NF_MATCH_FRAG=m
@@ -219,7 +210,6 @@ CONFIG_IP6_NF_MATCH_IPV6HEADER=m
 CONFIG_IP6_NF_MATCH_MH=m
 CONFIG_IP6_NF_MATCH_RT=m
 CONFIG_IP6_NF_TARGET_HL=m
-CONFIG_IP6_NF_TARGET_LOG=m
 CONFIG_IP6_NF_FILTER=m
 CONFIG_IP6_NF_TARGET_REJECT=m
 CONFIG_IP6_NF_MANGLE=m
@@ -250,7 +240,6 @@ CONFIG_BRIDGE_EBT_NFLOG=m
 CONFIG_RDS=m
 CONFIG_RDS_TCP=m
 CONFIG_BRIDGE=m
-CONFIG_NET_DSA=y
 CONFIG_VLAN_8021Q=m
 CONFIG_VLAN_8021Q_GVRP=y
 CONFIG_PHONET=m
@@ -298,6 +287,7 @@ CONFIG_NET_ACT_SIMP=m
 CONFIG_NET_ACT_SKBEDIT=m
 CONFIG_NET_CLS_IND=y
 CONFIG_DCB=y
+CONFIG_DNS_RESOLVER=y
 # CONFIG_WIRELESS is not set
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
@@ -325,7 +315,6 @@ CONFIG_MD_RAID0=m
 CONFIG_MD_RAID1=m
 CONFIG_MD_RAID10=m
 CONFIG_MD_RAID456=m
-CONFIG_MULTICORE_RAID456=y
 CONFIG_MD_FAULTY=m
 CONFIG_BLK_DEV_DM=m
 CONFIG_DM_DEBUG=y
@@ -356,40 +345,7 @@ CONFIG_NET_DSA_MV88E6060=y
 CONFIG_NET_DSA_MV88E6131=y
 CONFIG_NET_DSA_MV88E6123_61_65=y
 # CONFIG_NET_VENDOR_3COM is not set
-# CONFIG_NET_VENDOR_ADAPTEC is not set
-# CONFIG_NET_VENDOR_ALTEON is not set
-# CONFIG_NET_VENDOR_AMD is not set
-# CONFIG_NET_VENDOR_ATHEROS is not set
-# CONFIG_NET_VENDOR_BROADCOM is not set
-# CONFIG_NET_VENDOR_BROCADE is not set
-# CONFIG_NET_VENDOR_CHELSIO is not set
-# CONFIG_NET_VENDOR_CISCO is not set
-# CONFIG_NET_VENDOR_DEC is not set
-# CONFIG_NET_VENDOR_DLINK is not set
-# CONFIG_NET_VENDOR_EMULEX is not set
-# CONFIG_NET_VENDOR_EXAR is not set
-# CONFIG_NET_VENDOR_HP is not set
-# CONFIG_NET_VENDOR_INTEL is not set
-# CONFIG_NET_VENDOR_MARVELL is not set
-# CONFIG_NET_VENDOR_MELLANOX is not set
-# CONFIG_NET_VENDOR_MICREL is not set
-# CONFIG_NET_VENDOR_MYRI is not set
-# CONFIG_NET_VENDOR_NATSEMI is not set
-# CONFIG_NET_VENDOR_NVIDIA is not set
-# CONFIG_NET_VENDOR_OKI is not set
-# CONFIG_NET_PACKET_ENGINE is not set
-# CONFIG_NET_VENDOR_QLOGIC is not set
-# CONFIG_NET_VENDOR_REALTEK is not set
-# CONFIG_NET_VENDOR_RDC is not set
-# CONFIG_NET_VENDOR_SEEQ is not set
-# CONFIG_NET_VENDOR_SILAN is not set
-# CONFIG_NET_VENDOR_SIS is not set
-# CONFIG_NET_VENDOR_SMSC is not set
-# CONFIG_NET_VENDOR_STMICRO is not set
-# CONFIG_NET_VENDOR_SUN is not set
-# CONFIG_NET_VENDOR_TEHUTI is not set
-# CONFIG_NET_VENDOR_TI is not set
-# CONFIG_NET_VENDOR_VIA is not set
+CONFIG_E1000E=y
 # CONFIG_WLAN is not set
 # CONFIG_INPUT_MOUSEDEV is not set
 # CONFIG_INPUT_KEYBOARD is not set
@@ -405,7 +361,6 @@ CONFIG_I2C_CHARDEV=y
 CONFIG_WATCHDOG=y
 CONFIG_WATCHDOG_NOWAYOUT=y
 # CONFIG_VGA_ARB is not set
-# CONFIG_HID_SUPPORT is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_EDAC=y
 CONFIG_EDAC_MM_EDAC=y
@@ -450,13 +405,13 @@ CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_HUGETLBFS=y
+CONFIG_CONFIGFS_FS=m
 CONFIG_ECRYPT_FS=m
 CONFIG_CRAMFS=m
 CONFIG_SQUASHFS=m
 CONFIG_NFS_FS=m
-CONFIG_NFS_V3=y
 CONFIG_NFS_V3_ACL=y
-CONFIG_NFS_V4=y
+CONFIG_NFS_V4=m
 CONFIG_NFS_V4_1=y
 CONFIG_NFS_FSCACHE=y
 CONFIG_NFSD=m
@@ -510,26 +465,29 @@ CONFIG_NLS_ISO8859_15=m
 CONFIG_NLS_KOI8_R=m
 CONFIG_NLS_KOI8_U=m
 CONFIG_NLS_UTF8=m
+CONFIG_DLM=m
 CONFIG_DLM_DEBUG=y
+CONFIG_DYNAMIC_DEBUG=y
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_REDUCED=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 CONFIG_FRAME_WARN=2048
-CONFIG_MAGIC_SYSRQ=y
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_FS=y
 CONFIG_HEADERS_CHECK=y
+# CONFIG_FRAME_POINTER is not set
+CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_LOCKUP_DETECTOR=y
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
-CONFIG_DEBUG_INFO=y
-CONFIG_DEBUG_INFO_REDUCED=y
-CONFIG_DEBUG_VM=y
-CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DEBUG_LIST=y
 CONFIG_DEBUG_CREDENTIALS=y
-CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y
-CONFIG_DYNAMIC_DEBUG=y
+CONFIG_RCU_CPU_STALL_TIMEOUT=60
 CONFIG_ASYNC_RAID6_TEST=m
-CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_KEYS_DEBUG_PROC_KEYS=y
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
@@ -538,7 +496,6 @@ CONFIG_SECURITY_NETWORK_XFRM=y
 CONFIG_SECURITY_SELINUX=y
 CONFIG_SECURITY_SELINUX_BOOTPARAM=y
 CONFIG_SECURITY_SELINUX_DISABLE=y
-CONFIG_CRYPTO_NULL=m
 CONFIG_CRYPTO_PCRYPT=m
 CONFIG_CRYPTO_CRYPTD=m
 CONFIG_CRYPTO_TEST=m
@@ -551,14 +508,12 @@ CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_CRC32C=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
 CONFIG_CRYPTO_RMD256=m
 CONFIG_CRYPTO_RMD320=m
 CONFIG_CRYPTO_SHA1=y
-CONFIG_CRYPTO_SHA256=m
 CONFIG_CRYPTO_SHA512=m
 CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
diff --git a/arch/tile/gxio/Kconfig b/arch/tile/gxio/Kconfig
new file mode 100644
index 00000000000..d4e10d58071
--- /dev/null
+++ b/arch/tile/gxio/Kconfig
@@ -0,0 +1,33 @@
+# Support direct access to TILE-Gx hardware from user space, via the
+# gxio library, or from kernel space, via kernel IORPC support.
+config TILE_GXIO
+	bool
+	depends on TILEGX
+
+# Support direct access to the common I/O DMA facility within the
+# TILE-Gx mPIPE and Trio hardware from kernel space.
+config TILE_GXIO_DMA
+	bool
+	select TILE_GXIO
+
+# Support direct access to the TILE-Gx mPIPE hardware from kernel space.
+config TILE_GXIO_MPIPE
+	bool
+	select TILE_GXIO
+	select TILE_GXIO_DMA
+
+# Support direct access to the TILE-Gx TRIO hardware from kernel space.
+config TILE_GXIO_TRIO
+	bool
+	select TILE_GXIO
+	select TILE_GXIO_DMA
+
+# Support direct access to the TILE-Gx USB hardware from kernel space.
+config TILE_GXIO_USB_HOST
+	bool
+	select TILE_GXIO
+
+# Support direct access to the TILE-Gx UART hardware from kernel space.
+config TILE_GXIO_UART
+	bool
+	select TILE_GXIO
diff --git a/arch/tile/gxio/Makefile b/arch/tile/gxio/Makefile
new file mode 100644
index 00000000000..26ae2c72746
--- /dev/null
+++ b/arch/tile/gxio/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the Tile-Gx device access support.
+#
+
+obj-$(CONFIG_TILE_GXIO) += iorpc_globals.o kiorpc.o
+obj-$(CONFIG_TILE_GXIO_DMA) += dma_queue.o
+obj-$(CONFIG_TILE_GXIO_MPIPE) += mpipe.o iorpc_mpipe.o iorpc_mpipe_info.o
+obj-$(CONFIG_TILE_GXIO_TRIO) += trio.o iorpc_trio.o
+obj-$(CONFIG_TILE_GXIO_UART) += uart.o iorpc_uart.o
+obj-$(CONFIG_TILE_GXIO_USB_HOST) += usb_host.o iorpc_usb_host.o
diff --git a/arch/tile/gxio/dma_queue.c b/arch/tile/gxio/dma_queue.c
new file mode 100644
index 00000000000..baa60357f8b
--- /dev/null
+++ b/arch/tile/gxio/dma_queue.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/io.h>
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <gxio/dma_queue.h>
+
+/* Wait for a memory read to complete. */
+#define wait_for_value(val)                             \
+  __asm__ __volatile__("move %0, %0" :: "r"(val))
+
+/* The index is in the low 16. */
+#define DMA_QUEUE_INDEX_MASK ((1 << 16) - 1)
+
+/*
+ * The hardware descriptor-ring type.
+ * This matches the types used by mpipe (MPIPE_EDMA_POST_REGION_VAL_t)
+ * and trio (TRIO_PUSH_DMA_REGION_VAL_t or TRIO_PULL_DMA_REGION_VAL_t).
+ * See those types for more documentation on the individual fields.
+ */
+typedef union {
+	struct {
+#ifndef __BIG_ENDIAN__
+		uint64_t ring_idx:16;
+		uint64_t count:16;
+		uint64_t gen:1;
+		uint64_t __reserved:31;
+#else
+		uint64_t __reserved:31;
+		uint64_t gen:1;
+		uint64_t count:16;
+		uint64_t ring_idx:16;
+#endif
+	};
+	uint64_t word;
+} __gxio_ring_t;
+
+void __gxio_dma_queue_init(__gxio_dma_queue_t *dma_queue,
+			   void *post_region_addr, unsigned int num_entries)
+{
+	/*
+	 * Limit 65536 entry rings to 65535 credits because we only have a
+	 * 16 bit completion counter.
+	 */
+	int64_t credits = (num_entries < 65536) ? num_entries : 65535;
+
+	memset(dma_queue, 0, sizeof(*dma_queue));
+
+	dma_queue->post_region_addr = post_region_addr;
+	dma_queue->hw_complete_count = 0;
+	dma_queue->credits_and_next_index = credits << DMA_QUEUE_CREDIT_SHIFT;
+}
+
+EXPORT_SYMBOL_GPL(__gxio_dma_queue_init);
+
+void __gxio_dma_queue_update_credits(__gxio_dma_queue_t *dma_queue)
+{
+	__gxio_ring_t val;
+	uint64_t count;
+	uint64_t delta;
+	uint64_t new_count;
+
+	/*
+	 * Read the 64-bit completion count without touching the cache, so
+	 * we later avoid having to evict any sharers of this cache line
+	 * when we update it below.
+	 */
+	uint64_t orig_hw_complete_count =
+		cmpxchg(&dma_queue->hw_complete_count,
+			-1, -1);
+
+	/* Make sure the load completes before we access the hardware. */
+	wait_for_value(orig_hw_complete_count);
+
+	/* Read the 16-bit count of how many packets it has completed. */
+	val.word = __gxio_mmio_read(dma_queue->post_region_addr);
+	count = val.count;
+
+	/*
+	 * Calculate the number of completions since we last updated the
+	 * 64-bit counter.  It's safe to ignore the high bits because the
+	 * maximum credit value is 65535.
+	 */
+	delta = (count - orig_hw_complete_count) & 0xffff;
+	if (delta == 0)
+		return;
+
+	/*
+	 * Try to write back the count, advanced by delta.  If we race with
+	 * another thread, this might fail, in which case we return
+	 * immediately on the assumption that some credits are (or at least
+	 * were) available.
+	 */
+	new_count = orig_hw_complete_count + delta;
+	if (cmpxchg(&dma_queue->hw_complete_count,
+		    orig_hw_complete_count,
+		    new_count) != orig_hw_complete_count)
+		return;
+
+	/*
+	 * We succeeded in advancing the completion count; add back the
+	 * corresponding number of egress credits.
+	 */
+	__insn_fetchadd(&dma_queue->credits_and_next_index,
+			(delta << DMA_QUEUE_CREDIT_SHIFT));
+}
+
+EXPORT_SYMBOL_GPL(__gxio_dma_queue_update_credits);
+
+/*
+ * A separate 'blocked' method for put() so that backtraces and
+ * profiles will clearly indicate that we're wasting time spinning on
+ * egress availability rather than actually posting commands.
+ */
+int64_t __gxio_dma_queue_wait_for_credits(__gxio_dma_queue_t *dma_queue,
+					  int64_t modifier)
+{
+	int backoff = 16;
+	int64_t old;
+
+	do {
+		int i;
+		/* Back off to avoid spamming memory networks. */
+		for (i = backoff; i > 0; i--)
+			__insn_mfspr(SPR_PASS);
+
+		/* Check credits again. */
+		__gxio_dma_queue_update_credits(dma_queue);
+		old = __insn_fetchaddgez(&dma_queue->credits_and_next_index,
+					 modifier);
+
+		/* Calculate bounded exponential backoff for next iteration. */
+		if (backoff < 256)
+			backoff *= 2;
+	} while (old + modifier < 0);
+
+	return old;
+}
+
+EXPORT_SYMBOL_GPL(__gxio_dma_queue_wait_for_credits);
+
+int64_t __gxio_dma_queue_reserve_aux(__gxio_dma_queue_t *dma_queue,
+				     unsigned int num, int wait)
+{
+	return __gxio_dma_queue_reserve(dma_queue, num, wait != 0, true);
+}
+
+EXPORT_SYMBOL_GPL(__gxio_dma_queue_reserve_aux);
+
+int __gxio_dma_queue_is_complete(__gxio_dma_queue_t *dma_queue,
+				 int64_t completion_slot, int update)
+{
+	if (update) {
+		if (ACCESS_ONCE(dma_queue->hw_complete_count) >
+		    completion_slot)
+			return 1;
+
+		__gxio_dma_queue_update_credits(dma_queue);
+	}
+
+	return ACCESS_ONCE(dma_queue->hw_complete_count) > completion_slot;
+}
+
+EXPORT_SYMBOL_GPL(__gxio_dma_queue_is_complete);
diff --git a/arch/tile/gxio/iorpc_globals.c b/arch/tile/gxio/iorpc_globals.c
new file mode 100644
index 00000000000..e178e90805a
--- /dev/null
+++ b/arch/tile/gxio/iorpc_globals.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#include "gxio/iorpc_globals.h"
+
+struct arm_pollfd_param {
+	union iorpc_pollfd pollfd;
+};
+
+int __iorpc_arm_pollfd(int fd, int pollfd_cookie)
+{
+	struct arm_pollfd_param temp;
+	struct arm_pollfd_param *params = &temp;
+
+	params->pollfd.kernel.cookie = pollfd_cookie;
+
+	return hv_dev_pwrite(fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			     IORPC_OP_ARM_POLLFD);
+}
+
+EXPORT_SYMBOL(__iorpc_arm_pollfd);
+
+struct close_pollfd_param {
+	union iorpc_pollfd pollfd;
+};
+
+int __iorpc_close_pollfd(int fd, int pollfd_cookie)
+{
+	struct close_pollfd_param temp;
+	struct close_pollfd_param *params = &temp;
+
+	params->pollfd.kernel.cookie = pollfd_cookie;
+
+	return hv_dev_pwrite(fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			     IORPC_OP_CLOSE_POLLFD);
+}
+
+EXPORT_SYMBOL(__iorpc_close_pollfd);
+
+struct get_mmio_base_param {
+	HV_PTE base;
+};
+
+int __iorpc_get_mmio_base(int fd, HV_PTE *base)
+{
+	int __result;
+	struct get_mmio_base_param temp;
+	struct get_mmio_base_param *params = &temp;
+
+	__result =
+	    hv_dev_pread(fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			 IORPC_OP_GET_MMIO_BASE);
+	*base = params->base;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(__iorpc_get_mmio_base);
+
+struct check_mmio_offset_param {
+	unsigned long offset;
+	unsigned long size;
+};
+
+int __iorpc_check_mmio_offset(int fd, unsigned long offset, unsigned long size)
+{
+	struct check_mmio_offset_param temp;
+	struct check_mmio_offset_param *params = &temp;
+
+	params->offset = offset;
+	params->size = size;
+
+	return hv_dev_pwrite(fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			     IORPC_OP_CHECK_MMIO_OFFSET);
+}
+
+EXPORT_SYMBOL(__iorpc_check_mmio_offset);
diff --git a/arch/tile/gxio/iorpc_mpipe.c b/arch/tile/gxio/iorpc_mpipe.c
new file mode 100644
index 00000000000..e19325c4c43
--- /dev/null
+++ b/arch/tile/gxio/iorpc_mpipe.c
@@ -0,0 +1,593 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#include "gxio/iorpc_mpipe.h"
+
+struct alloc_buffer_stacks_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+int gxio_mpipe_alloc_buffer_stacks(gxio_mpipe_context_t *context,
+				   unsigned int count, unsigned int first,
+				   unsigned int flags)
+{
+	struct alloc_buffer_stacks_param temp;
+	struct alloc_buffer_stacks_param *params = &temp;
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_MPIPE_OP_ALLOC_BUFFER_STACKS);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_buffer_stacks);
+
+struct init_buffer_stack_aux_param {
+	union iorpc_mem_buffer buffer;
+	unsigned int stack;
+	unsigned int buffer_size_enum;
+};
+
+int gxio_mpipe_init_buffer_stack_aux(gxio_mpipe_context_t *context,
+				     void *mem_va, size_t mem_size,
+				     unsigned int mem_flags, unsigned int stack,
+				     unsigned int buffer_size_enum)
+{
+	int __result;
+	unsigned long long __cpa;
+	pte_t __pte;
+	struct init_buffer_stack_aux_param temp;
+	struct init_buffer_stack_aux_param *params = &temp;
+
+	__result = va_to_cpa_and_pte(mem_va, &__cpa, &__pte);
+	if (__result != 0)
+		return __result;
+	params->buffer.kernel.cpa = __cpa;
+	params->buffer.kernel.size = mem_size;
+	params->buffer.kernel.pte = __pte;
+	params->buffer.kernel.flags = mem_flags;
+	params->stack = stack;
+	params->buffer_size_enum = buffer_size_enum;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_MPIPE_OP_INIT_BUFFER_STACK_AUX);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_buffer_stack_aux);
+
+
+struct alloc_notif_rings_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+int gxio_mpipe_alloc_notif_rings(gxio_mpipe_context_t *context,
+				 unsigned int count, unsigned int first,
+				 unsigned int flags)
+{
+	struct alloc_notif_rings_param temp;
+	struct alloc_notif_rings_param *params = &temp;
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_OP_ALLOC_NOTIF_RINGS);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_notif_rings);
+
+struct init_notif_ring_aux_param {
+	union iorpc_mem_buffer buffer;
+	unsigned int ring;
+};
+
+int gxio_mpipe_init_notif_ring_aux(gxio_mpipe_context_t *context, void *mem_va,
+				   size_t mem_size, unsigned int mem_flags,
+				   unsigned int ring)
+{
+	int __result;
+	unsigned long long __cpa;
+	pte_t __pte;
+	struct init_notif_ring_aux_param temp;
+	struct init_notif_ring_aux_param *params = &temp;
+
+	__result = va_to_cpa_and_pte(mem_va, &__cpa, &__pte);
+	if (__result != 0)
+		return __result;
+	params->buffer.kernel.cpa = __cpa;
+	params->buffer.kernel.size = mem_size;
+	params->buffer.kernel.pte = __pte;
+	params->buffer.kernel.flags = mem_flags;
+	params->ring = ring;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_MPIPE_OP_INIT_NOTIF_RING_AUX);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_notif_ring_aux);
+
+struct request_notif_ring_interrupt_param {
+	union iorpc_interrupt interrupt;
+	unsigned int ring;
+};
+
+int gxio_mpipe_request_notif_ring_interrupt(gxio_mpipe_context_t *context,
+					    int inter_x, int inter_y,
+					    int inter_ipi, int inter_event,
+					    unsigned int ring)
+{
+	struct request_notif_ring_interrupt_param temp;
+	struct request_notif_ring_interrupt_param *params = &temp;
+
+	params->interrupt.kernel.x = inter_x;
+	params->interrupt.kernel.y = inter_y;
+	params->interrupt.kernel.ipi = inter_ipi;
+	params->interrupt.kernel.event = inter_event;
+	params->ring = ring;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_MPIPE_OP_REQUEST_NOTIF_RING_INTERRUPT);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_request_notif_ring_interrupt);
+
+struct enable_notif_ring_interrupt_param {
+	unsigned int ring;
+};
+
+int gxio_mpipe_enable_notif_ring_interrupt(gxio_mpipe_context_t *context,
+					   unsigned int ring)
+{
+	struct enable_notif_ring_interrupt_param temp;
+	struct enable_notif_ring_interrupt_param *params = &temp;
+
+	params->ring = ring;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_MPIPE_OP_ENABLE_NOTIF_RING_INTERRUPT);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_enable_notif_ring_interrupt);
+
+struct alloc_notif_groups_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+int gxio_mpipe_alloc_notif_groups(gxio_mpipe_context_t *context,
+				  unsigned int count, unsigned int first,
+				  unsigned int flags)
+{
+	struct alloc_notif_groups_param temp;
+	struct alloc_notif_groups_param *params = &temp;
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_OP_ALLOC_NOTIF_GROUPS);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_notif_groups);
+
+struct init_notif_group_param {
+	unsigned int group;
+	gxio_mpipe_notif_group_bits_t bits;
+};
+
+int gxio_mpipe_init_notif_group(gxio_mpipe_context_t *context,
+				unsigned int group,
+				gxio_mpipe_notif_group_bits_t bits)
+{
+	struct init_notif_group_param temp;
+	struct init_notif_group_param *params = &temp;
+
+	params->group = group;
+	params->bits = bits;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_OP_INIT_NOTIF_GROUP);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_notif_group);
+
+struct alloc_buckets_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+int gxio_mpipe_alloc_buckets(gxio_mpipe_context_t *context, unsigned int count,
+			     unsigned int first, unsigned int flags)
+{
+	struct alloc_buckets_param temp;
+	struct alloc_buckets_param *params = &temp;
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_OP_ALLOC_BUCKETS);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_buckets);
+
+struct init_bucket_param {
+	unsigned int bucket;
+	MPIPE_LBL_INIT_DAT_BSTS_TBL_t bucket_info;
+};
+
+int gxio_mpipe_init_bucket(gxio_mpipe_context_t *context, unsigned int bucket,
+			   MPIPE_LBL_INIT_DAT_BSTS_TBL_t bucket_info)
+{
+	struct init_bucket_param temp;
+	struct init_bucket_param *params = &temp;
+
+	params->bucket = bucket;
+	params->bucket_info = bucket_info;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_OP_INIT_BUCKET);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_bucket);
+
+struct alloc_edma_rings_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+int gxio_mpipe_alloc_edma_rings(gxio_mpipe_context_t *context,
+				unsigned int count, unsigned int first,
+				unsigned int flags)
+{
+	struct alloc_edma_rings_param temp;
+	struct alloc_edma_rings_param *params = &temp;
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_OP_ALLOC_EDMA_RINGS);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_alloc_edma_rings);
+
+struct init_edma_ring_aux_param {
+	union iorpc_mem_buffer buffer;
+	unsigned int ring;
+	unsigned int channel;
+};
+
+int gxio_mpipe_init_edma_ring_aux(gxio_mpipe_context_t *context, void *mem_va,
+				  size_t mem_size, unsigned int mem_flags,
+				  unsigned int ring, unsigned int channel)
+{
+	int __result;
+	unsigned long long __cpa;
+	pte_t __pte;
+	struct init_edma_ring_aux_param temp;
+	struct init_edma_ring_aux_param *params = &temp;
+
+	__result = va_to_cpa_and_pte(mem_va, &__cpa, &__pte);
+	if (__result != 0)
+		return __result;
+	params->buffer.kernel.cpa = __cpa;
+	params->buffer.kernel.size = mem_size;
+	params->buffer.kernel.pte = __pte;
+	params->buffer.kernel.flags = mem_flags;
+	params->ring = ring;
+	params->channel = channel;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_OP_INIT_EDMA_RING_AUX);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_init_edma_ring_aux);
+
+
+int gxio_mpipe_commit_rules(gxio_mpipe_context_t *context, const void *blob,
+			    size_t blob_size)
+{
+	const void *params = blob;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params, blob_size,
+			     GXIO_MPIPE_OP_COMMIT_RULES);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_commit_rules);
+
+struct register_client_memory_param {
+	unsigned int iotlb;
+	HV_PTE pte;
+	unsigned int flags;
+};
+
+int gxio_mpipe_register_client_memory(gxio_mpipe_context_t *context,
+				      unsigned int iotlb, HV_PTE pte,
+				      unsigned int flags)
+{
+	struct register_client_memory_param temp;
+	struct register_client_memory_param *params = &temp;
+
+	params->iotlb = iotlb;
+	params->pte = pte;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_MPIPE_OP_REGISTER_CLIENT_MEMORY);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_register_client_memory);
+
+struct link_open_aux_param {
+	_gxio_mpipe_link_name_t name;
+	unsigned int flags;
+};
+
+int gxio_mpipe_link_open_aux(gxio_mpipe_context_t *context,
+			     _gxio_mpipe_link_name_t name, unsigned int flags)
+{
+	struct link_open_aux_param temp;
+	struct link_open_aux_param *params = &temp;
+
+	params->name = name;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_OP_LINK_OPEN_AUX);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_link_open_aux);
+
+struct link_close_aux_param {
+	int mac;
+};
+
+int gxio_mpipe_link_close_aux(gxio_mpipe_context_t *context, int mac)
+{
+	struct link_close_aux_param temp;
+	struct link_close_aux_param *params = &temp;
+
+	params->mac = mac;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_OP_LINK_CLOSE_AUX);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_link_close_aux);
+
+struct link_set_attr_aux_param {
+	int mac;
+	uint32_t attr;
+	int64_t val;
+};
+
+int gxio_mpipe_link_set_attr_aux(gxio_mpipe_context_t *context, int mac,
+				 uint32_t attr, int64_t val)
+{
+	struct link_set_attr_aux_param temp;
+	struct link_set_attr_aux_param *params = &temp;
+
+	params->mac = mac;
+	params->attr = attr;
+	params->val = val;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_OP_LINK_SET_ATTR_AUX);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_link_set_attr_aux);
+
+struct get_timestamp_aux_param {
+	uint64_t sec;
+	uint64_t nsec;
+	uint64_t cycles;
+};
+
+int gxio_mpipe_get_timestamp_aux(gxio_mpipe_context_t *context, uint64_t *sec,
+				 uint64_t *nsec, uint64_t *cycles)
+{
+	int __result;
+	struct get_timestamp_aux_param temp;
+	struct get_timestamp_aux_param *params = &temp;
+
+	__result =
+	    hv_dev_pread(context->fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			 GXIO_MPIPE_OP_GET_TIMESTAMP_AUX);
+	*sec = params->sec;
+	*nsec = params->nsec;
+	*cycles = params->cycles;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_get_timestamp_aux);
+
+struct set_timestamp_aux_param {
+	uint64_t sec;
+	uint64_t nsec;
+	uint64_t cycles;
+};
+
+int gxio_mpipe_set_timestamp_aux(gxio_mpipe_context_t *context, uint64_t sec,
+				 uint64_t nsec, uint64_t cycles)
+{
+	struct set_timestamp_aux_param temp;
+	struct set_timestamp_aux_param *params = &temp;
+
+	params->sec = sec;
+	params->nsec = nsec;
+	params->cycles = cycles;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_OP_SET_TIMESTAMP_AUX);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_set_timestamp_aux);
+
+struct adjust_timestamp_aux_param {
+	int64_t nsec;
+};
+
+int gxio_mpipe_adjust_timestamp_aux(gxio_mpipe_context_t *context, int64_t nsec)
+{
+	struct adjust_timestamp_aux_param temp;
+	struct adjust_timestamp_aux_param *params = &temp;
+
+	params->nsec = nsec;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_MPIPE_OP_ADJUST_TIMESTAMP_AUX);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_adjust_timestamp_aux);
+
+struct config_edma_ring_blks_param {
+	unsigned int ering;
+	unsigned int max_blks;
+	unsigned int min_snf_blks;
+	unsigned int db;
+};
+
+int gxio_mpipe_config_edma_ring_blks(gxio_mpipe_context_t *context,
+				     unsigned int ering, unsigned int max_blks,
+				     unsigned int min_snf_blks, unsigned int db)
+{
+	struct config_edma_ring_blks_param temp;
+	struct config_edma_ring_blks_param *params = &temp;
+
+	params->ering = ering;
+	params->max_blks = max_blks;
+	params->min_snf_blks = min_snf_blks;
+	params->db = db;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_MPIPE_OP_CONFIG_EDMA_RING_BLKS);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_config_edma_ring_blks);
+
+struct adjust_timestamp_freq_param {
+	int32_t ppb;
+};
+
+int gxio_mpipe_adjust_timestamp_freq(gxio_mpipe_context_t *context, int32_t ppb)
+{
+	struct adjust_timestamp_freq_param temp;
+	struct adjust_timestamp_freq_param *params = &temp;
+
+	params->ppb = ppb;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_MPIPE_OP_ADJUST_TIMESTAMP_FREQ);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_adjust_timestamp_freq);
+
+struct arm_pollfd_param {
+	union iorpc_pollfd pollfd;
+};
+
+int gxio_mpipe_arm_pollfd(gxio_mpipe_context_t *context, int pollfd_cookie)
+{
+	struct arm_pollfd_param temp;
+	struct arm_pollfd_param *params = &temp;
+
+	params->pollfd.kernel.cookie = pollfd_cookie;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_OP_ARM_POLLFD);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_arm_pollfd);
+
+struct close_pollfd_param {
+	union iorpc_pollfd pollfd;
+};
+
+int gxio_mpipe_close_pollfd(gxio_mpipe_context_t *context, int pollfd_cookie)
+{
+	struct close_pollfd_param temp;
+	struct close_pollfd_param *params = &temp;
+
+	params->pollfd.kernel.cookie = pollfd_cookie;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_OP_CLOSE_POLLFD);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_close_pollfd);
+
+struct get_mmio_base_param {
+	HV_PTE base;
+};
+
+int gxio_mpipe_get_mmio_base(gxio_mpipe_context_t *context, HV_PTE *base)
+{
+	int __result;
+	struct get_mmio_base_param temp;
+	struct get_mmio_base_param *params = &temp;
+
+	__result =
+	    hv_dev_pread(context->fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			 GXIO_MPIPE_OP_GET_MMIO_BASE);
+	*base = params->base;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_get_mmio_base);
+
+struct check_mmio_offset_param {
+	unsigned long offset;
+	unsigned long size;
+};
+
+int gxio_mpipe_check_mmio_offset(gxio_mpipe_context_t *context,
+				 unsigned long offset, unsigned long size)
+{
+	struct check_mmio_offset_param temp;
+	struct check_mmio_offset_param *params = &temp;
+
+	params->offset = offset;
+	params->size = size;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_OP_CHECK_MMIO_OFFSET);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_check_mmio_offset);
diff --git a/arch/tile/gxio/iorpc_mpipe_info.c b/arch/tile/gxio/iorpc_mpipe_info.c
new file mode 100644
index 00000000000..77019c6e9b4
--- /dev/null
+++ b/arch/tile/gxio/iorpc_mpipe_info.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#include "gxio/iorpc_mpipe_info.h"
+
+struct instance_aux_param {
+	_gxio_mpipe_link_name_t name;
+};
+
+int gxio_mpipe_info_instance_aux(gxio_mpipe_info_context_t *context,
+				 _gxio_mpipe_link_name_t name)
+{
+	struct instance_aux_param temp;
+	struct instance_aux_param *params = &temp;
+
+	params->name = name;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_MPIPE_INFO_OP_INSTANCE_AUX);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_info_instance_aux);
+
+struct enumerate_aux_param {
+	_gxio_mpipe_link_name_t name;
+	_gxio_mpipe_link_mac_t mac;
+};
+
+int gxio_mpipe_info_enumerate_aux(gxio_mpipe_info_context_t *context,
+				  unsigned int idx,
+				  _gxio_mpipe_link_name_t *name,
+				  _gxio_mpipe_link_mac_t *mac)
+{
+	int __result;
+	struct enumerate_aux_param temp;
+	struct enumerate_aux_param *params = &temp;
+
+	__result =
+	    hv_dev_pread(context->fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			 (((uint64_t)idx << 32) |
+			  GXIO_MPIPE_INFO_OP_ENUMERATE_AUX));
+	*name = params->name;
+	*mac = params->mac;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_info_enumerate_aux);
+
+struct get_mmio_base_param {
+	HV_PTE base;
+};
+
+int gxio_mpipe_info_get_mmio_base(gxio_mpipe_info_context_t *context,
+				  HV_PTE *base)
+{
+	int __result;
+	struct get_mmio_base_param temp;
+	struct get_mmio_base_param *params = &temp;
+
+	__result =
+	    hv_dev_pread(context->fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			 GXIO_MPIPE_INFO_OP_GET_MMIO_BASE);
+	*base = params->base;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_mpipe_info_get_mmio_base);
+
+struct check_mmio_offset_param {
+	unsigned long offset;
+	unsigned long size;
+};
+
+int gxio_mpipe_info_check_mmio_offset(gxio_mpipe_info_context_t *context,
+				      unsigned long offset, unsigned long size)
+{
+	struct check_mmio_offset_param temp;
+	struct check_mmio_offset_param *params = &temp;
+
+	params->offset = offset;
+	params->size = size;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_MPIPE_INFO_OP_CHECK_MMIO_OFFSET);
+}
+
+EXPORT_SYMBOL(gxio_mpipe_info_check_mmio_offset);
diff --git a/arch/tile/gxio/iorpc_trio.c b/arch/tile/gxio/iorpc_trio.c
new file mode 100644
index 00000000000..1d3cedb9aeb
--- /dev/null
+++ b/arch/tile/gxio/iorpc_trio.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#include "gxio/iorpc_trio.h"
+
+struct alloc_asids_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+int gxio_trio_alloc_asids(gxio_trio_context_t *context, unsigned int count,
+			  unsigned int first, unsigned int flags)
+{
+	struct alloc_asids_param temp;
+	struct alloc_asids_param *params = &temp;
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_TRIO_OP_ALLOC_ASIDS);
+}
+
+EXPORT_SYMBOL(gxio_trio_alloc_asids);
+
+
+struct alloc_memory_maps_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+int gxio_trio_alloc_memory_maps(gxio_trio_context_t *context,
+				unsigned int count, unsigned int first,
+				unsigned int flags)
+{
+	struct alloc_memory_maps_param temp;
+	struct alloc_memory_maps_param *params = &temp;
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_TRIO_OP_ALLOC_MEMORY_MAPS);
+}
+
+EXPORT_SYMBOL(gxio_trio_alloc_memory_maps);
+
+struct alloc_scatter_queues_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+int gxio_trio_alloc_scatter_queues(gxio_trio_context_t *context,
+				   unsigned int count, unsigned int first,
+				   unsigned int flags)
+{
+	struct alloc_scatter_queues_param temp;
+	struct alloc_scatter_queues_param *params = &temp;
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_TRIO_OP_ALLOC_SCATTER_QUEUES);
+}
+
+EXPORT_SYMBOL(gxio_trio_alloc_scatter_queues);
+
+struct alloc_pio_regions_param {
+	unsigned int count;
+	unsigned int first;
+	unsigned int flags;
+};
+
+int gxio_trio_alloc_pio_regions(gxio_trio_context_t *context,
+				unsigned int count, unsigned int first,
+				unsigned int flags)
+{
+	struct alloc_pio_regions_param temp;
+	struct alloc_pio_regions_param *params = &temp;
+
+	params->count = count;
+	params->first = first;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_TRIO_OP_ALLOC_PIO_REGIONS);
+}
+
+EXPORT_SYMBOL(gxio_trio_alloc_pio_regions);
+
+struct init_pio_region_aux_param {
+	unsigned int pio_region;
+	unsigned int mac;
+	uint32_t bus_address_hi;
+	unsigned int flags;
+};
+
+int gxio_trio_init_pio_region_aux(gxio_trio_context_t *context,
+				  unsigned int pio_region, unsigned int mac,
+				  uint32_t bus_address_hi, unsigned int flags)
+{
+	struct init_pio_region_aux_param temp;
+	struct init_pio_region_aux_param *params = &temp;
+
+	params->pio_region = pio_region;
+	params->mac = mac;
+	params->bus_address_hi = bus_address_hi;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_TRIO_OP_INIT_PIO_REGION_AUX);
+}
+
+EXPORT_SYMBOL(gxio_trio_init_pio_region_aux);
+
+
+struct init_memory_map_mmu_aux_param {
+	unsigned int map;
+	unsigned long va;
+	uint64_t size;
+	unsigned int asid;
+	unsigned int mac;
+	uint64_t bus_address;
+	unsigned int node;
+	unsigned int order_mode;
+};
+
+int gxio_trio_init_memory_map_mmu_aux(gxio_trio_context_t *context,
+				      unsigned int map, unsigned long va,
+				      uint64_t size, unsigned int asid,
+				      unsigned int mac, uint64_t bus_address,
+				      unsigned int node,
+				      unsigned int order_mode)
+{
+	struct init_memory_map_mmu_aux_param temp;
+	struct init_memory_map_mmu_aux_param *params = &temp;
+
+	params->map = map;
+	params->va = va;
+	params->size = size;
+	params->asid = asid;
+	params->mac = mac;
+	params->bus_address = bus_address;
+	params->node = node;
+	params->order_mode = order_mode;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_TRIO_OP_INIT_MEMORY_MAP_MMU_AUX);
+}
+
+EXPORT_SYMBOL(gxio_trio_init_memory_map_mmu_aux);
+
+struct get_port_property_param {
+	struct pcie_trio_ports_property trio_ports;
+};
+
+int gxio_trio_get_port_property(gxio_trio_context_t *context,
+				struct pcie_trio_ports_property *trio_ports)
+{
+	int __result;
+	struct get_port_property_param temp;
+	struct get_port_property_param *params = &temp;
+
+	__result =
+	    hv_dev_pread(context->fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			 GXIO_TRIO_OP_GET_PORT_PROPERTY);
+	*trio_ports = params->trio_ports;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_trio_get_port_property);
+
+struct config_legacy_intr_param {
+	union iorpc_interrupt interrupt;
+	unsigned int mac;
+	unsigned int intx;
+};
+
+int gxio_trio_config_legacy_intr(gxio_trio_context_t *context, int inter_x,
+				 int inter_y, int inter_ipi, int inter_event,
+				 unsigned int mac, unsigned int intx)
+{
+	struct config_legacy_intr_param temp;
+	struct config_legacy_intr_param *params = &temp;
+
+	params->interrupt.kernel.x = inter_x;
+	params->interrupt.kernel.y = inter_y;
+	params->interrupt.kernel.ipi = inter_ipi;
+	params->interrupt.kernel.event = inter_event;
+	params->mac = mac;
+	params->intx = intx;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_TRIO_OP_CONFIG_LEGACY_INTR);
+}
+
+EXPORT_SYMBOL(gxio_trio_config_legacy_intr);
+
+struct config_msi_intr_param {
+	union iorpc_interrupt interrupt;
+	unsigned int mac;
+	unsigned int mem_map;
+	uint64_t mem_map_base;
+	uint64_t mem_map_limit;
+	unsigned int asid;
+};
+
+int gxio_trio_config_msi_intr(gxio_trio_context_t *context, int inter_x,
+			      int inter_y, int inter_ipi, int inter_event,
+			      unsigned int mac, unsigned int mem_map,
+			      uint64_t mem_map_base, uint64_t mem_map_limit,
+			      unsigned int asid)
+{
+	struct config_msi_intr_param temp;
+	struct config_msi_intr_param *params = &temp;
+
+	params->interrupt.kernel.x = inter_x;
+	params->interrupt.kernel.y = inter_y;
+	params->interrupt.kernel.ipi = inter_ipi;
+	params->interrupt.kernel.event = inter_event;
+	params->mac = mac;
+	params->mem_map = mem_map;
+	params->mem_map_base = mem_map_base;
+	params->mem_map_limit = mem_map_limit;
+	params->asid = asid;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_TRIO_OP_CONFIG_MSI_INTR);
+}
+
+EXPORT_SYMBOL(gxio_trio_config_msi_intr);
+
+
+struct set_mps_mrs_param {
+	uint16_t mps;
+	uint16_t mrs;
+	unsigned int mac;
+};
+
+int gxio_trio_set_mps_mrs(gxio_trio_context_t *context, uint16_t mps,
+			  uint16_t mrs, unsigned int mac)
+{
+	struct set_mps_mrs_param temp;
+	struct set_mps_mrs_param *params = &temp;
+
+	params->mps = mps;
+	params->mrs = mrs;
+	params->mac = mac;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_TRIO_OP_SET_MPS_MRS);
+}
+
+EXPORT_SYMBOL(gxio_trio_set_mps_mrs);
+
+struct force_rc_link_up_param {
+	unsigned int mac;
+};
+
+int gxio_trio_force_rc_link_up(gxio_trio_context_t *context, unsigned int mac)
+{
+	struct force_rc_link_up_param temp;
+	struct force_rc_link_up_param *params = &temp;
+
+	params->mac = mac;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_TRIO_OP_FORCE_RC_LINK_UP);
+}
+
+EXPORT_SYMBOL(gxio_trio_force_rc_link_up);
+
+struct force_ep_link_up_param {
+	unsigned int mac;
+};
+
+int gxio_trio_force_ep_link_up(gxio_trio_context_t *context, unsigned int mac)
+{
+	struct force_ep_link_up_param temp;
+	struct force_ep_link_up_param *params = &temp;
+
+	params->mac = mac;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_TRIO_OP_FORCE_EP_LINK_UP);
+}
+
+EXPORT_SYMBOL(gxio_trio_force_ep_link_up);
+
+struct get_mmio_base_param {
+	HV_PTE base;
+};
+
+int gxio_trio_get_mmio_base(gxio_trio_context_t *context, HV_PTE *base)
+{
+	int __result;
+	struct get_mmio_base_param temp;
+	struct get_mmio_base_param *params = &temp;
+
+	__result =
+	    hv_dev_pread(context->fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			 GXIO_TRIO_OP_GET_MMIO_BASE);
+	*base = params->base;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_trio_get_mmio_base);
+
+struct check_mmio_offset_param {
+	unsigned long offset;
+	unsigned long size;
+};
+
+int gxio_trio_check_mmio_offset(gxio_trio_context_t *context,
+				unsigned long offset, unsigned long size)
+{
+	struct check_mmio_offset_param temp;
+	struct check_mmio_offset_param *params = &temp;
+
+	params->offset = offset;
+	params->size = size;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_TRIO_OP_CHECK_MMIO_OFFSET);
+}
+
+EXPORT_SYMBOL(gxio_trio_check_mmio_offset);
diff --git a/arch/tile/gxio/iorpc_uart.c b/arch/tile/gxio/iorpc_uart.c
new file mode 100644
index 00000000000..b9a6d6193d7
--- /dev/null
+++ b/arch/tile/gxio/iorpc_uart.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#include "gxio/iorpc_uart.h"
+
+struct cfg_interrupt_param {
+	union iorpc_interrupt interrupt;
+};
+
+int gxio_uart_cfg_interrupt(gxio_uart_context_t *context, int inter_x,
+			    int inter_y, int inter_ipi, int inter_event)
+{
+	struct cfg_interrupt_param temp;
+	struct cfg_interrupt_param *params = &temp;
+
+	params->interrupt.kernel.x = inter_x;
+	params->interrupt.kernel.y = inter_y;
+	params->interrupt.kernel.ipi = inter_ipi;
+	params->interrupt.kernel.event = inter_event;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_UART_OP_CFG_INTERRUPT);
+}
+
+EXPORT_SYMBOL(gxio_uart_cfg_interrupt);
+
+struct get_mmio_base_param {
+	HV_PTE base;
+};
+
+int gxio_uart_get_mmio_base(gxio_uart_context_t *context, HV_PTE *base)
+{
+	int __result;
+	struct get_mmio_base_param temp;
+	struct get_mmio_base_param *params = &temp;
+
+	__result =
+	    hv_dev_pread(context->fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			 GXIO_UART_OP_GET_MMIO_BASE);
+	*base = params->base;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_uart_get_mmio_base);
+
+struct check_mmio_offset_param {
+	unsigned long offset;
+	unsigned long size;
+};
+
+int gxio_uart_check_mmio_offset(gxio_uart_context_t *context,
+				unsigned long offset, unsigned long size)
+{
+	struct check_mmio_offset_param temp;
+	struct check_mmio_offset_param *params = &temp;
+
+	params->offset = offset;
+	params->size = size;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_UART_OP_CHECK_MMIO_OFFSET);
+}
+
+EXPORT_SYMBOL(gxio_uart_check_mmio_offset);
diff --git a/arch/tile/gxio/iorpc_usb_host.c b/arch/tile/gxio/iorpc_usb_host.c
new file mode 100644
index 00000000000..9c820073bfc
--- /dev/null
+++ b/arch/tile/gxio/iorpc_usb_host.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#include "gxio/iorpc_usb_host.h"
+
+struct cfg_interrupt_param {
+	union iorpc_interrupt interrupt;
+};
+
+int gxio_usb_host_cfg_interrupt(gxio_usb_host_context_t *context, int inter_x,
+				int inter_y, int inter_ipi, int inter_event)
+{
+	struct cfg_interrupt_param temp;
+	struct cfg_interrupt_param *params = &temp;
+
+	params->interrupt.kernel.x = inter_x;
+	params->interrupt.kernel.y = inter_y;
+	params->interrupt.kernel.ipi = inter_ipi;
+	params->interrupt.kernel.event = inter_event;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params), GXIO_USB_HOST_OP_CFG_INTERRUPT);
+}
+
+EXPORT_SYMBOL(gxio_usb_host_cfg_interrupt);
+
+struct register_client_memory_param {
+	HV_PTE pte;
+	unsigned int flags;
+};
+
+int gxio_usb_host_register_client_memory(gxio_usb_host_context_t *context,
+					 HV_PTE pte, unsigned int flags)
+{
+	struct register_client_memory_param temp;
+	struct register_client_memory_param *params = &temp;
+
+	params->pte = pte;
+	params->flags = flags;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_USB_HOST_OP_REGISTER_CLIENT_MEMORY);
+}
+
+EXPORT_SYMBOL(gxio_usb_host_register_client_memory);
+
+struct get_mmio_base_param {
+	HV_PTE base;
+};
+
+int gxio_usb_host_get_mmio_base(gxio_usb_host_context_t *context, HV_PTE *base)
+{
+	int __result;
+	struct get_mmio_base_param temp;
+	struct get_mmio_base_param *params = &temp;
+
+	__result =
+	    hv_dev_pread(context->fd, 0, (HV_VirtAddr) params, sizeof(*params),
+			 GXIO_USB_HOST_OP_GET_MMIO_BASE);
+	*base = params->base;
+
+	return __result;
+}
+
+EXPORT_SYMBOL(gxio_usb_host_get_mmio_base);
+
+struct check_mmio_offset_param {
+	unsigned long offset;
+	unsigned long size;
+};
+
+int gxio_usb_host_check_mmio_offset(gxio_usb_host_context_t *context,
+				    unsigned long offset, unsigned long size)
+{
+	struct check_mmio_offset_param temp;
+	struct check_mmio_offset_param *params = &temp;
+
+	params->offset = offset;
+	params->size = size;
+
+	return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+			     sizeof(*params),
+			     GXIO_USB_HOST_OP_CHECK_MMIO_OFFSET);
+}
+
+EXPORT_SYMBOL(gxio_usb_host_check_mmio_offset);
diff --git a/arch/tile/gxio/kiorpc.c b/arch/tile/gxio/kiorpc.c
new file mode 100644
index 00000000000..c8096aa5a3f
--- /dev/null
+++ b/arch/tile/gxio/kiorpc.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx IORPC support for kernel I/O drivers.
+ */
+
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/io.h>
+#include <gxio/iorpc_globals.h>
+#include <gxio/kiorpc.h>
+
+#ifdef DEBUG_IORPC
+#define TRACE(FMT, ...) pr_info(SIMPLE_MSG_LINE FMT, ## __VA_ARGS__)
+#else
+#define TRACE(...)
+#endif
+
+/* Create kernel-VA-space MMIO mapping for an on-chip IO device. */
+void __iomem *iorpc_ioremap(int hv_fd, resource_size_t offset,
+			    unsigned long size)
+{
+	pgprot_t mmio_base, prot = { 0 };
+	unsigned long pfn;
+	int err;
+
+	/* Look up the shim's lotar and base PA. */
+	err = __iorpc_get_mmio_base(hv_fd, &mmio_base);
+	if (err) {
+		TRACE("get_mmio_base() failure: %d\n", err);
+		return NULL;
+	}
+
+	/* Make sure the HV driver approves of our offset and size. */
+	err = __iorpc_check_mmio_offset(hv_fd, offset, size);
+	if (err) {
+		TRACE("check_mmio_offset() failure: %d\n", err);
+		return NULL;
+	}
+
+	/*
+	 * mmio_base contains a base pfn and homing coordinates.  Turn
+	 * it into an MMIO pgprot and offset pfn.
+	 */
+	prot = hv_pte_set_lotar(prot, hv_pte_get_lotar(mmio_base));
+	pfn = pte_pfn(mmio_base) + PFN_DOWN(offset);
+
+	return ioremap_prot(PFN_PHYS(pfn), size, prot);
+}
+
+EXPORT_SYMBOL(iorpc_ioremap);
diff --git a/arch/tile/gxio/mpipe.c b/arch/tile/gxio/mpipe.c
new file mode 100644
index 00000000000..5301a9ffbae
--- /dev/null
+++ b/arch/tile/gxio/mpipe.c
@@ -0,0 +1,578 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/*
+ * Implementation of mpipe gxio calls.
+ */
+
+#include <linux/errno.h>
+#include <linux/io.h>
+#include <linux/module.h>
+
+#include <gxio/iorpc_globals.h>
+#include <gxio/iorpc_mpipe.h>
+#include <gxio/iorpc_mpipe_info.h>
+#include <gxio/kiorpc.h>
+#include <gxio/mpipe.h>
+
+/* HACK: Avoid pointless "shadow" warnings. */
+#define link link_shadow
+
+int gxio_mpipe_init(gxio_mpipe_context_t *context, unsigned int mpipe_index)
+{
+	char file[32];
+
+	int fd;
+	int i;
+
+	if (mpipe_index >= GXIO_MPIPE_INSTANCE_MAX)
+		return -EINVAL;
+
+	snprintf(file, sizeof(file), "mpipe/%d/iorpc", mpipe_index);
+	fd = hv_dev_open((HV_VirtAddr) file, 0);
+
+	context->fd = fd;
+
+	if (fd < 0) {
+		if (fd >= GXIO_ERR_MIN && fd <= GXIO_ERR_MAX)
+			return fd;
+		else
+			return -ENODEV;
+	}
+
+	/* Map in the MMIO space. */
+	context->mmio_cfg_base = (void __force *)
+		iorpc_ioremap(fd, HV_MPIPE_CONFIG_MMIO_OFFSET,
+			      HV_MPIPE_CONFIG_MMIO_SIZE);
+	if (context->mmio_cfg_base == NULL)
+		goto cfg_failed;
+
+	context->mmio_fast_base = (void __force *)
+		iorpc_ioremap(fd, HV_MPIPE_FAST_MMIO_OFFSET,
+			      HV_MPIPE_FAST_MMIO_SIZE);
+	if (context->mmio_fast_base == NULL)
+		goto fast_failed;
+
+	/* Initialize the stacks. */
+	for (i = 0; i < 8; i++)
+		context->__stacks.stacks[i] = 255;
+
+	context->instance = mpipe_index;
+
+	return 0;
+
+      fast_failed:
+	iounmap((void __force __iomem *)(context->mmio_cfg_base));
+      cfg_failed:
+	hv_dev_close(context->fd);
+	context->fd = -1;
+	return -ENODEV;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_init);
+
+int gxio_mpipe_destroy(gxio_mpipe_context_t *context)
+{
+	iounmap((void __force __iomem *)(context->mmio_cfg_base));
+	iounmap((void __force __iomem *)(context->mmio_fast_base));
+	return hv_dev_close(context->fd);
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_destroy);
+
+static int16_t gxio_mpipe_buffer_sizes[8] =
+	{ 128, 256, 512, 1024, 1664, 4096, 10368, 16384 };
+
+gxio_mpipe_buffer_size_enum_t gxio_mpipe_buffer_size_to_buffer_size_enum(size_t
+									 size)
+{
+	int i;
+	for (i = 0; i < 7; i++)
+		if (size <= gxio_mpipe_buffer_sizes[i])
+			break;
+	return i;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_buffer_size_to_buffer_size_enum);
+
+size_t gxio_mpipe_buffer_size_enum_to_buffer_size(gxio_mpipe_buffer_size_enum_t
+						  buffer_size_enum)
+{
+	if (buffer_size_enum > 7)
+		buffer_size_enum = 7;
+
+	return gxio_mpipe_buffer_sizes[buffer_size_enum];
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_buffer_size_enum_to_buffer_size);
+
+size_t gxio_mpipe_calc_buffer_stack_bytes(unsigned long buffers)
+{
+	const int BUFFERS_PER_LINE = 12;
+
+	/* Count the number of cachlines. */
+	unsigned long lines =
+		(buffers + BUFFERS_PER_LINE - 1) / BUFFERS_PER_LINE;
+
+	/* Convert to bytes. */
+	return lines * CHIP_L2_LINE_SIZE();
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_calc_buffer_stack_bytes);
+
+int gxio_mpipe_init_buffer_stack(gxio_mpipe_context_t *context,
+				 unsigned int stack,
+				 gxio_mpipe_buffer_size_enum_t
+				 buffer_size_enum, void *mem, size_t mem_size,
+				 unsigned int mem_flags)
+{
+	int result;
+
+	memset(mem, 0, mem_size);
+
+	result = gxio_mpipe_init_buffer_stack_aux(context, mem, mem_size,
+						  mem_flags, stack,
+						  buffer_size_enum);
+	if (result < 0)
+		return result;
+
+	/* Save the stack. */
+	context->__stacks.stacks[buffer_size_enum] = stack;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_init_buffer_stack);
+
+int gxio_mpipe_init_notif_ring(gxio_mpipe_context_t *context,
+			       unsigned int ring,
+			       void *mem, size_t mem_size,
+			       unsigned int mem_flags)
+{
+	return gxio_mpipe_init_notif_ring_aux(context, mem, mem_size,
+					      mem_flags, ring);
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_init_notif_ring);
+
+int gxio_mpipe_init_notif_group_and_buckets(gxio_mpipe_context_t *context,
+					    unsigned int group,
+					    unsigned int ring,
+					    unsigned int num_rings,
+					    unsigned int bucket,
+					    unsigned int num_buckets,
+					    gxio_mpipe_bucket_mode_t mode)
+{
+	int i;
+	int result;
+
+	gxio_mpipe_bucket_info_t bucket_info = { {
+						  .group = group,
+						  .mode = mode,
+						  }
+	};
+
+	gxio_mpipe_notif_group_bits_t bits = { {0} };
+
+	for (i = 0; i < num_rings; i++)
+		gxio_mpipe_notif_group_add_ring(&bits, ring + i);
+
+	result = gxio_mpipe_init_notif_group(context, group, bits);
+	if (result != 0)
+		return result;
+
+	for (i = 0; i < num_buckets; i++) {
+		bucket_info.notifring = ring + (i % num_rings);
+
+		result = gxio_mpipe_init_bucket(context, bucket + i,
+						bucket_info);
+		if (result != 0)
+			return result;
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_init_notif_group_and_buckets);
+
+int gxio_mpipe_init_edma_ring(gxio_mpipe_context_t *context,
+			      unsigned int ring, unsigned int channel,
+			      void *mem, size_t mem_size,
+			      unsigned int mem_flags)
+{
+	memset(mem, 0, mem_size);
+
+	return gxio_mpipe_init_edma_ring_aux(context, mem, mem_size, mem_flags,
+					     ring, channel);
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_init_edma_ring);
+
+void gxio_mpipe_rules_init(gxio_mpipe_rules_t *rules,
+			   gxio_mpipe_context_t *context)
+{
+	rules->context = context;
+	memset(&rules->list, 0, sizeof(rules->list));
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_rules_init);
+
+int gxio_mpipe_rules_begin(gxio_mpipe_rules_t *rules,
+			   unsigned int bucket, unsigned int num_buckets,
+			   gxio_mpipe_rules_stacks_t *stacks)
+{
+	int i;
+	int stack = 255;
+
+	gxio_mpipe_rules_list_t *list = &rules->list;
+
+	/* Current rule. */
+	gxio_mpipe_rules_rule_t *rule =
+		(gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+
+	unsigned int head = list->tail;
+
+	/*
+	 * Align next rule properly.
+	 *Note that "dmacs_and_vlans" will also be aligned.
+	 */
+	unsigned int pad = 0;
+	while (((head + pad) % __alignof__(gxio_mpipe_rules_rule_t)) != 0)
+		pad++;
+
+	/*
+	 * Verify room.
+	 * ISSUE: Mark rules as broken on error?
+	 */
+	if (head + pad + sizeof(*rule) >= sizeof(list->rules))
+		return GXIO_MPIPE_ERR_RULES_FULL;
+
+	/* Verify num_buckets is a power of 2. */
+	if (__builtin_popcount(num_buckets) != 1)
+		return GXIO_MPIPE_ERR_RULES_INVALID;
+
+	/* Add padding to previous rule. */
+	rule->size += pad;
+
+	/* Start a new rule. */
+	list->head = head + pad;
+
+	rule = (gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+
+	/* Default some values. */
+	rule->headroom = 2;
+	rule->tailroom = 0;
+	rule->capacity = 16384;
+
+	/* Save the bucket info. */
+	rule->bucket_mask = num_buckets - 1;
+	rule->bucket_first = bucket;
+
+	for (i = 8 - 1; i >= 0; i--) {
+		int maybe =
+			stacks ? stacks->stacks[i] : rules->context->__stacks.
+			stacks[i];
+		if (maybe != 255)
+			stack = maybe;
+		rule->stacks.stacks[i] = stack;
+	}
+
+	if (stack == 255)
+		return GXIO_MPIPE_ERR_RULES_INVALID;
+
+	/* NOTE: Only entries at the end of the array can be 255. */
+	for (i = 8 - 1; i > 0; i--) {
+		if (rule->stacks.stacks[i] == 255) {
+			rule->stacks.stacks[i] = stack;
+			rule->capacity =
+				gxio_mpipe_buffer_size_enum_to_buffer_size(i -
+									   1);
+		}
+	}
+
+	rule->size = sizeof(*rule);
+	list->tail = list->head + rule->size;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_rules_begin);
+
+int gxio_mpipe_rules_add_channel(gxio_mpipe_rules_t *rules,
+				 unsigned int channel)
+{
+	gxio_mpipe_rules_list_t *list = &rules->list;
+
+	gxio_mpipe_rules_rule_t *rule =
+		(gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+
+	/* Verify channel. */
+	if (channel >= 32)
+		return GXIO_MPIPE_ERR_RULES_INVALID;
+
+	/* Verify begun. */
+	if (list->tail == 0)
+		return GXIO_MPIPE_ERR_RULES_EMPTY;
+
+	rule->channel_bits |= (1UL << channel);
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_rules_add_channel);
+
+int gxio_mpipe_rules_set_headroom(gxio_mpipe_rules_t *rules, uint8_t headroom)
+{
+	gxio_mpipe_rules_list_t *list = &rules->list;
+
+	gxio_mpipe_rules_rule_t *rule =
+		(gxio_mpipe_rules_rule_t *) (list->rules + list->head);
+
+	/* Verify begun. */
+	if (list->tail == 0)
+		return GXIO_MPIPE_ERR_RULES_EMPTY;
+
+	rule->headroom = headroom;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_rules_set_headroom);
+
+int gxio_mpipe_rules_commit(gxio_mpipe_rules_t *rules)
+{
+	gxio_mpipe_rules_list_t *list = &rules->list;
+	unsigned int size =
+		offsetof(gxio_mpipe_rules_list_t, rules) + list->tail;
+	return gxio_mpipe_commit_rules(rules->context, list, size);
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_rules_commit);
+
+int gxio_mpipe_iqueue_init(gxio_mpipe_iqueue_t *iqueue,
+			   gxio_mpipe_context_t *context,
+			   unsigned int ring,
+			   void *mem, size_t mem_size, unsigned int mem_flags)
+{
+	/* The init call below will verify that "mem_size" is legal. */
+	unsigned int num_entries = mem_size / sizeof(gxio_mpipe_idesc_t);
+
+	iqueue->context = context;
+	iqueue->idescs = (gxio_mpipe_idesc_t *)mem;
+	iqueue->ring = ring;
+	iqueue->num_entries = num_entries;
+	iqueue->mask_num_entries = num_entries - 1;
+	iqueue->log2_num_entries = __builtin_ctz(num_entries);
+	iqueue->head = 1;
+#ifdef __BIG_ENDIAN__
+	iqueue->swapped = 0;
+#endif
+
+	/* Initialize the "tail". */
+	__gxio_mmio_write(mem, iqueue->head);
+
+	return gxio_mpipe_init_notif_ring(context, ring, mem, mem_size,
+					  mem_flags);
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_iqueue_init);
+
+int gxio_mpipe_equeue_init(gxio_mpipe_equeue_t *equeue,
+			   gxio_mpipe_context_t *context,
+			   unsigned int ering,
+			   unsigned int channel,
+			   void *mem, unsigned int mem_size,
+			   unsigned int mem_flags)
+{
+	/* The init call below will verify that "mem_size" is legal. */
+	unsigned int num_entries = mem_size / sizeof(gxio_mpipe_edesc_t);
+
+	/* Offset used to read number of completed commands. */
+	MPIPE_EDMA_POST_REGION_ADDR_t offset;
+
+	int result = gxio_mpipe_init_edma_ring(context, ering, channel,
+					       mem, mem_size, mem_flags);
+	if (result < 0)
+		return result;
+
+	memset(equeue, 0, sizeof(*equeue));
+
+	offset.word = 0;
+	offset.region =
+		MPIPE_MMIO_ADDR__REGION_VAL_EDMA -
+		MPIPE_MMIO_ADDR__REGION_VAL_IDMA;
+	offset.ring = ering;
+
+	__gxio_dma_queue_init(&equeue->dma_queue,
+			      context->mmio_fast_base + offset.word,
+			      num_entries);
+	equeue->edescs = mem;
+	equeue->mask_num_entries = num_entries - 1;
+	equeue->log2_num_entries = __builtin_ctz(num_entries);
+	equeue->context = context;
+	equeue->ering = ering;
+	equeue->channel = channel;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_equeue_init);
+
+int gxio_mpipe_set_timestamp(gxio_mpipe_context_t *context,
+			     const struct timespec *ts)
+{
+	cycles_t cycles = get_cycles();
+	return gxio_mpipe_set_timestamp_aux(context, (uint64_t)ts->tv_sec,
+					    (uint64_t)ts->tv_nsec,
+					    (uint64_t)cycles);
+}
+
+int gxio_mpipe_get_timestamp(gxio_mpipe_context_t *context,
+			     struct timespec *ts)
+{
+	int ret;
+	cycles_t cycles_prev, cycles_now, clock_rate;
+	cycles_prev = get_cycles();
+	ret = gxio_mpipe_get_timestamp_aux(context, (uint64_t *)&ts->tv_sec,
+					   (uint64_t *)&ts->tv_nsec,
+					   (uint64_t *)&cycles_now);
+	if (ret < 0) {
+		return ret;
+	}
+
+	clock_rate = get_clock_rate();
+	ts->tv_nsec -= (cycles_now - cycles_prev) * 1000000000LL / clock_rate;
+	if (ts->tv_nsec < 0) {
+		ts->tv_nsec += 1000000000LL;
+		ts->tv_sec -= 1;
+	}
+	return ret;
+}
+
+int gxio_mpipe_adjust_timestamp(gxio_mpipe_context_t *context, int64_t delta)
+{
+	return gxio_mpipe_adjust_timestamp_aux(context, delta);
+}
+
+/* Get our internal context used for link name access.  This context is
+ *  special in that it is not associated with an mPIPE service domain.
+ */
+static gxio_mpipe_context_t *_gxio_get_link_context(void)
+{
+	static gxio_mpipe_context_t context;
+	static gxio_mpipe_context_t *contextp;
+	static int tried_open = 0;
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);
+
+	if (!tried_open) {
+		int i = 0;
+		tried_open = 1;
+
+		/*
+		 * "4" here is the maximum possible number of mPIPE shims; it's
+		 * an exaggeration but we shouldn't ever go beyond 2 anyway.
+		 */
+		for (i = 0; i < 4; i++) {
+			char file[80];
+
+			snprintf(file, sizeof(file), "mpipe/%d/iorpc_info", i);
+			context.fd = hv_dev_open((HV_VirtAddr) file, 0);
+			if (context.fd < 0)
+				continue;
+
+			contextp = &context;
+			break;
+		}
+	}
+
+	mutex_unlock(&mutex);
+
+	return contextp;
+}
+
+int gxio_mpipe_link_instance(const char *link_name)
+{
+	_gxio_mpipe_link_name_t name;
+	gxio_mpipe_context_t *context = _gxio_get_link_context();
+
+	if (!context)
+		return GXIO_ERR_NO_DEVICE;
+
+	strncpy(name.name, link_name, sizeof(name.name));
+	name.name[GXIO_MPIPE_LINK_NAME_LEN - 1] = '\0';
+
+	return gxio_mpipe_info_instance_aux(context, name);
+}
+
+int gxio_mpipe_link_enumerate_mac(int idx, char *link_name, uint8_t *link_mac)
+{
+	int rv;
+	_gxio_mpipe_link_name_t name;
+	_gxio_mpipe_link_mac_t mac;
+
+	gxio_mpipe_context_t *context = _gxio_get_link_context();
+	if (!context)
+		return GXIO_ERR_NO_DEVICE;
+
+	rv = gxio_mpipe_info_enumerate_aux(context, idx, &name, &mac);
+	if (rv >= 0) {
+		strncpy(link_name, name.name, sizeof(name.name));
+		memcpy(link_mac, mac.mac, sizeof(mac.mac));
+	}
+
+	return rv;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_link_enumerate_mac);
+
+int gxio_mpipe_link_open(gxio_mpipe_link_t *link,
+			 gxio_mpipe_context_t *context, const char *link_name,
+			 unsigned int flags)
+{
+	_gxio_mpipe_link_name_t name;
+	int rv;
+
+	strncpy(name.name, link_name, sizeof(name.name));
+	name.name[GXIO_MPIPE_LINK_NAME_LEN - 1] = '\0';
+
+	rv = gxio_mpipe_link_open_aux(context, name, flags);
+	if (rv < 0)
+		return rv;
+
+	link->context = context;
+	link->channel = rv >> 8;
+	link->mac = rv & 0xFF;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_link_open);
+
+int gxio_mpipe_link_close(gxio_mpipe_link_t *link)
+{
+	return gxio_mpipe_link_close_aux(link->context, link->mac);
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_link_close);
+
+int gxio_mpipe_link_set_attr(gxio_mpipe_link_t *link, uint32_t attr,
+			     int64_t val)
+{
+	return gxio_mpipe_link_set_attr_aux(link->context, link->mac, attr,
+					    val);
+}
+
+EXPORT_SYMBOL_GPL(gxio_mpipe_link_set_attr);
diff --git a/arch/tile/gxio/trio.c b/arch/tile/gxio/trio.c
new file mode 100644
index 00000000000..69f0b8df3ce
--- /dev/null
+++ b/arch/tile/gxio/trio.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/*
+ * Implementation of trio gxio calls.
+ */
+
+#include <linux/errno.h>
+#include <linux/io.h>
+#include <linux/module.h>
+
+#include <gxio/trio.h>
+#include <gxio/iorpc_globals.h>
+#include <gxio/iorpc_trio.h>
+#include <gxio/kiorpc.h>
+
+int gxio_trio_init(gxio_trio_context_t *context, unsigned int trio_index)
+{
+	char file[32];
+	int fd;
+
+	snprintf(file, sizeof(file), "trio/%d/iorpc", trio_index);
+	fd = hv_dev_open((HV_VirtAddr) file, 0);
+	if (fd < 0) {
+		context->fd = -1;
+
+		if (fd >= GXIO_ERR_MIN && fd <= GXIO_ERR_MAX)
+			return fd;
+		else
+			return -ENODEV;
+	}
+
+	context->fd = fd;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_trio_init);
diff --git a/arch/tile/gxio/uart.c b/arch/tile/gxio/uart.c
new file mode 100644
index 00000000000..ba585175ef8
--- /dev/null
+++ b/arch/tile/gxio/uart.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/*
+ * Implementation of UART gxio calls.
+ */
+
+#include <linux/io.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+#include <gxio/uart.h>
+#include <gxio/iorpc_globals.h>
+#include <gxio/iorpc_uart.h>
+#include <gxio/kiorpc.h>
+
+int gxio_uart_init(gxio_uart_context_t *context, int uart_index)
+{
+	char file[32];
+	int fd;
+
+	snprintf(file, sizeof(file), "uart/%d/iorpc", uart_index);
+	fd = hv_dev_open((HV_VirtAddr) file, 0);
+	if (fd < 0) {
+		if (fd >= GXIO_ERR_MIN && fd <= GXIO_ERR_MAX)
+			return fd;
+		else
+			return -ENODEV;
+	}
+
+	context->fd = fd;
+
+	/* Map in the MMIO space. */
+	context->mmio_base = (void __force *)
+		iorpc_ioremap(fd, HV_UART_MMIO_OFFSET, HV_UART_MMIO_SIZE);
+
+	if (context->mmio_base == NULL) {
+		hv_dev_close(context->fd);
+		context->fd = -1;
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_uart_init);
+
+int gxio_uart_destroy(gxio_uart_context_t *context)
+{
+	iounmap((void __force __iomem *)(context->mmio_base));
+	hv_dev_close(context->fd);
+
+	context->mmio_base = NULL;
+	context->fd = -1;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_uart_destroy);
+
+/* UART register write wrapper. */
+void gxio_uart_write(gxio_uart_context_t *context, uint64_t offset,
+		     uint64_t word)
+{
+	__gxio_mmio_write(context->mmio_base + offset, word);
+}
+
+EXPORT_SYMBOL_GPL(gxio_uart_write);
+
+/* UART register read wrapper. */
+uint64_t gxio_uart_read(gxio_uart_context_t *context, uint64_t offset)
+{
+	return __gxio_mmio_read(context->mmio_base + offset);
+}
+
+EXPORT_SYMBOL_GPL(gxio_uart_read);
diff --git a/arch/tile/gxio/usb_host.c b/arch/tile/gxio/usb_host.c
new file mode 100644
index 00000000000..785afad7922
--- /dev/null
+++ b/arch/tile/gxio/usb_host.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/*
+ *
+ * Implementation of USB gxio calls.
+ */
+
+#include <linux/io.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+#include <gxio/iorpc_globals.h>
+#include <gxio/iorpc_usb_host.h>
+#include <gxio/kiorpc.h>
+#include <gxio/usb_host.h>
+
+int gxio_usb_host_init(gxio_usb_host_context_t *context, int usb_index,
+		       int is_ehci)
+{
+	char file[32];
+	int fd;
+
+	if (is_ehci)
+		snprintf(file, sizeof(file), "usb_host/%d/iorpc/ehci",
+			 usb_index);
+	else
+		snprintf(file, sizeof(file), "usb_host/%d/iorpc/ohci",
+			 usb_index);
+
+	fd = hv_dev_open((HV_VirtAddr) file, 0);
+	if (fd < 0) {
+		if (fd >= GXIO_ERR_MIN && fd <= GXIO_ERR_MAX)
+			return fd;
+		else
+			return -ENODEV;
+	}
+
+	context->fd = fd;
+
+	// Map in the MMIO space.
+	context->mmio_base =
+		(void __force *)iorpc_ioremap(fd, 0, HV_USB_HOST_MMIO_SIZE);
+
+	if (context->mmio_base == NULL) {
+		hv_dev_close(context->fd);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_usb_host_init);
+
+int gxio_usb_host_destroy(gxio_usb_host_context_t *context)
+{
+	iounmap((void __force __iomem *)(context->mmio_base));
+	hv_dev_close(context->fd);
+
+	context->mmio_base = NULL;
+	context->fd = -1;
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(gxio_usb_host_destroy);
+
+void *gxio_usb_host_get_reg_start(gxio_usb_host_context_t *context)
+{
+	return context->mmio_base;
+}
+
+EXPORT_SYMBOL_GPL(gxio_usb_host_get_reg_start);
+
+size_t gxio_usb_host_get_reg_len(gxio_usb_host_context_t *context)
+{
+	return HV_USB_HOST_MMIO_SIZE;
+}
+
+EXPORT_SYMBOL_GPL(gxio_usb_host_get_reg_len);
diff --git a/arch/tile/include/arch/Kbuild b/arch/tile/include/arch/Kbuild
index 9c0ea24cc94..3751c9fabcf 100644
--- a/arch/tile/include/arch/Kbuild
+++ b/arch/tile/include/arch/Kbuild
@@ -1,17 +1 @@
-header-y += abi.h
-header-y += chip.h
-header-y += chip_tile64.h
-header-y += chip_tilegx.h
-header-y += chip_tilepro.h
-header-y += icache.h
-header-y += interrupts.h
-header-y += interrupts_32.h
-header-y += interrupts_64.h
-header-y += opcode.h
-header-y += opcode_tilegx.h
-header-y += opcode_tilepro.h
-header-y += sim.h
-header-y += sim_def.h
-header-y += spr_def.h
-header-y += spr_def_32.h
-header-y += spr_def_64.h
+# Tile arch headers
diff --git a/arch/tile/include/arch/chip_tile64.h b/arch/tile/include/arch/chip_tile64.h
deleted file mode 100644
index 261aaba092d..00000000000
--- a/arch/tile/include/arch/chip_tile64.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- */
-
-/*
- * @file
- * Global header file.
- * This header file specifies defines for TILE64.
- */
-
-#ifndef __ARCH_CHIP_H__
-#define __ARCH_CHIP_H__
-
-/** Specify chip version.
- * When possible, prefer the CHIP_xxx symbols below for future-proofing.
- * This is intended for cross-compiling; native compilation should
- * use the predefined __tile_chip__ symbol.
- */
-#define TILE_CHIP 0
-
-/** Specify chip revision.
- * This provides for the case of a respin of a particular chip type;
- * the normal value for this symbol is "0".
- * This is intended for cross-compiling; native compilation should
- * use the predefined __tile_chip_rev__ symbol.
- */
-#define TILE_CHIP_REV 0
-
-/** The name of this architecture. */
-#define CHIP_ARCH_NAME "tile64"
-
-/** The ELF e_machine type for binaries for this chip. */
-#define CHIP_ELF_TYPE() EM_TILE64
-
-/** The alternate ELF e_machine type for binaries for this chip. */
-#define CHIP_COMPAT_ELF_TYPE() 0x2506
-
-/** What is the native word size of the machine? */
-#define CHIP_WORD_SIZE() 32
-
-/** How many bits of a virtual address are used. Extra bits must be
- * the sign extension of the low bits.
- */
-#define CHIP_VA_WIDTH() 32
-
-/** How many bits are in a physical address? */
-#define CHIP_PA_WIDTH() 36
-
-/** Size of the L2 cache, in bytes. */
-#define CHIP_L2_CACHE_SIZE() 65536
-
-/** Log size of an L2 cache line in bytes. */
-#define CHIP_L2_LOG_LINE_SIZE() 6
-
-/** Size of an L2 cache line, in bytes. */
-#define CHIP_L2_LINE_SIZE() (1 << CHIP_L2_LOG_LINE_SIZE())
-
-/** Associativity of the L2 cache. */
-#define CHIP_L2_ASSOC() 2
-
-/** Size of the L1 data cache, in bytes. */
-#define CHIP_L1D_CACHE_SIZE() 8192
-
-/** Log size of an L1 data cache line in bytes. */
-#define CHIP_L1D_LOG_LINE_SIZE() 4
-
-/** Size of an L1 data cache line, in bytes. */
-#define CHIP_L1D_LINE_SIZE() (1 << CHIP_L1D_LOG_LINE_SIZE())
-
-/** Associativity of the L1 data cache. */
-#define CHIP_L1D_ASSOC() 2
-
-/** Size of the L1 instruction cache, in bytes. */
-#define CHIP_L1I_CACHE_SIZE() 8192
-
-/** Log size of an L1 instruction cache line in bytes. */
-#define CHIP_L1I_LOG_LINE_SIZE() 6
-
-/** Size of an L1 instruction cache line, in bytes. */
-#define CHIP_L1I_LINE_SIZE() (1 << CHIP_L1I_LOG_LINE_SIZE())
-
-/** Associativity of the L1 instruction cache. */
-#define CHIP_L1I_ASSOC() 1
-
-/** Stride with which flush instructions must be issued. */
-#define CHIP_FLUSH_STRIDE() CHIP_L2_LINE_SIZE()
-
-/** Stride with which inv instructions must be issued. */
-#define CHIP_INV_STRIDE() CHIP_L1D_LINE_SIZE()
-
-/** Stride with which finv instructions must be issued. */
-#define CHIP_FINV_STRIDE() CHIP_L1D_LINE_SIZE()
-
-/** Can the local cache coherently cache data that is homed elsewhere? */
-#define CHIP_HAS_COHERENT_LOCAL_CACHE() 0
-
-/** How many simultaneous outstanding victims can the L2 cache have? */
-#define CHIP_MAX_OUTSTANDING_VICTIMS() 2
-
-/** Does the TLB support the NC and NOALLOC bits? */
-#define CHIP_HAS_NC_AND_NOALLOC_BITS() 0
-
-/** Does the chip support hash-for-home caching? */
-#define CHIP_HAS_CBOX_HOME_MAP() 0
-
-/** Number of entries in the chip's home map tables. */
-/* #define CHIP_CBOX_HOME_MAP_SIZE() -- does not apply to chip 0 */
-
-/** Do uncacheable requests miss in the cache regardless of whether
- * there is matching data? */
-#define CHIP_HAS_ENFORCED_UNCACHEABLE_REQUESTS() 0
-
-/** Does the mf instruction wait for victims? */
-#define CHIP_HAS_MF_WAITS_FOR_VICTIMS() 1
-
-/** Does the chip have an "inv" instruction that doesn't also flush? */
-#define CHIP_HAS_INV() 0
-
-/** Does the chip have a "wh64" instruction? */
-#define CHIP_HAS_WH64() 0
-
-/** Does this chip have a 'dword_align' instruction? */
-#define CHIP_HAS_DWORD_ALIGN() 0
-
-/** Number of performance counters. */
-#define CHIP_PERFORMANCE_COUNTERS() 2
-
-/** Does this chip have auxiliary performance counters? */
-#define CHIP_HAS_AUX_PERF_COUNTERS() 0
-
-/** Is the CBOX_MSR1 SPR supported? */
-#define CHIP_HAS_CBOX_MSR1() 0
-
-/** Is the TILE_RTF_HWM SPR supported? */
-#define CHIP_HAS_TILE_RTF_HWM() 0
-
-/** Is the TILE_WRITE_PENDING SPR supported? */
-#define CHIP_HAS_TILE_WRITE_PENDING() 0
-
-/** Is the PROC_STATUS SPR supported? */
-#define CHIP_HAS_PROC_STATUS_SPR() 0
-
-/** Is the DSTREAM_PF SPR supported? */
-#define CHIP_HAS_DSTREAM_PF() 0
-
-/** Log of the number of mshims we have. */
-#define CHIP_LOG_NUM_MSHIMS() 2
-
-/** Are the bases of the interrupt vector areas fixed? */
-#define CHIP_HAS_FIXED_INTVEC_BASE() 1
-
-/** Are the interrupt masks split up into 2 SPRs? */
-#define CHIP_HAS_SPLIT_INTR_MASK() 1
-
-/** Is the cycle count split up into 2 SPRs? */
-#define CHIP_HAS_SPLIT_CYCLE() 1
-
-/** Does the chip have a static network? */
-#define CHIP_HAS_SN() 1
-
-/** Does the chip have a static network processor? */
-#define CHIP_HAS_SN_PROC() 1
-
-/** Size of the L1 static network processor instruction cache, in bytes. */
-#define CHIP_L1SNI_CACHE_SIZE() 2048
-
-/** Does the chip have DMA support in each tile? */
-#define CHIP_HAS_TILE_DMA() 1
-
-/** Does the chip have the second revision of the directly accessible
- *  dynamic networks?  This encapsulates a number of characteristics,
- *  including the absence of the catch-all, the absence of inline message
- *  tags, the absence of support for network context-switching, and so on.
- */
-#define CHIP_HAS_REV1_XDN() 0
-
-/** Does the chip have cmpexch and similar (fetchadd, exch, etc.)? */
-#define CHIP_HAS_CMPEXCH() 0
-
-/** Does the chip have memory-mapped I/O support? */
-#define CHIP_HAS_MMIO() 0
-
-/** Does the chip have post-completion interrupts? */
-#define CHIP_HAS_POST_COMPLETION_INTERRUPTS() 0
-
-/** Does the chip have native single step support? */
-#define CHIP_HAS_SINGLE_STEP() 0
-
-#ifndef __OPEN_SOURCE__  /* features only relevant to hypervisor-level code */
-
-/** How many entries are present in the instruction TLB? */
-#define CHIP_ITLB_ENTRIES() 8
-
-/** How many entries are present in the data TLB? */
-#define CHIP_DTLB_ENTRIES() 16
-
-/** How many MAF entries does the XAUI shim have? */
-#define CHIP_XAUI_MAF_ENTRIES() 16
-
-/** Does the memory shim have a source-id table? */
-#define CHIP_HAS_MSHIM_SRCID_TABLE() 1
-
-/** Does the L1 instruction cache clear on reset? */
-#define CHIP_HAS_L1I_CLEAR_ON_RESET() 0
-
-/** Does the chip come out of reset with valid coordinates on all tiles?
- * Note that if defined, this also implies that the upper left is 1,1.
- */
-#define CHIP_HAS_VALID_TILE_COORD_RESET() 0
-
-/** Does the chip have unified packet formats? */
-#define CHIP_HAS_UNIFIED_PACKET_FORMATS() 0
-
-/** Does the chip support write reordering? */
-#define CHIP_HAS_WRITE_REORDERING() 0
-
-/** Does the chip support Y-X routing as well as X-Y? */
-#define CHIP_HAS_Y_X_ROUTING() 0
-
-/** Is INTCTRL_3 managed with the correct MPL? */
-#define CHIP_HAS_INTCTRL_3_STATUS_FIX() 0
-
-/** Is it possible to configure the chip to be big-endian? */
-#define CHIP_HAS_BIG_ENDIAN_CONFIG() 0
-
-/** Is the CACHE_RED_WAY_OVERRIDDEN SPR supported? */
-#define CHIP_HAS_CACHE_RED_WAY_OVERRIDDEN() 0
-
-/** Is the DIAG_TRACE_WAY SPR supported? */
-#define CHIP_HAS_DIAG_TRACE_WAY() 0
-
-/** Is the MEM_STRIPE_CONFIG SPR supported? */
-#define CHIP_HAS_MEM_STRIPE_CONFIG() 0
-
-/** Are the TLB_PERF SPRs supported? */
-#define CHIP_HAS_TLB_PERF() 0
-
-/** Is the VDN_SNOOP_SHIM_CTL SPR supported? */
-#define CHIP_HAS_VDN_SNOOP_SHIM_CTL() 0
-
-/** Does the chip support rev1 DMA packets? */
-#define CHIP_HAS_REV1_DMA_PACKETS() 0
-
-/** Does the chip have an IPI shim? */
-#define CHIP_HAS_IPI() 0
-
-#endif /* !__OPEN_SOURCE__ */
-#endif /* __ARCH_CHIP_H__ */
diff --git a/arch/tile/include/arch/interrupts_32.h b/arch/tile/include/arch/interrupts_32.h
deleted file mode 100644
index 96b5710505b..00000000000
--- a/arch/tile/include/arch/interrupts_32.h
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- */
-
-#ifndef __ARCH_INTERRUPTS_H__
-#define __ARCH_INTERRUPTS_H__
-
-/** Mask for an interrupt. */
-/* Note: must handle breaking interrupts into high and low words manually. */
-#define INT_MASK_LO(intno) (1 << (intno))
-#define INT_MASK_HI(intno) (1 << ((intno) - 32))
-
-#ifndef __ASSEMBLER__
-#define INT_MASK(intno) (1ULL << (intno))
-#endif
-
-
-/** Where a given interrupt executes */
-#define INTERRUPT_VECTOR(i, pl) (0xFC000000 + ((pl) << 24) + ((i) << 8))
-
-/** Where to store a vector for a given interrupt. */
-#define USER_INTERRUPT_VECTOR(i) INTERRUPT_VECTOR(i, 0)
-
-/** The base address of user-level interrupts. */
-#define USER_INTERRUPT_VECTOR_BASE INTERRUPT_VECTOR(0, 0)
-
-
-/** Additional synthetic interrupt. */
-#define INT_BREAKPOINT (63)
-
-#define INT_ITLB_MISS    0
-#define INT_MEM_ERROR    1
-#define INT_ILL    2
-#define INT_GPV    3
-#define INT_SN_ACCESS    4
-#define INT_IDN_ACCESS    5
-#define INT_UDN_ACCESS    6
-#define INT_IDN_REFILL    7
-#define INT_UDN_REFILL    8
-#define INT_IDN_COMPLETE    9
-#define INT_UDN_COMPLETE   10
-#define INT_SWINT_3   11
-#define INT_SWINT_2   12
-#define INT_SWINT_1   13
-#define INT_SWINT_0   14
-#define INT_UNALIGN_DATA   15
-#define INT_DTLB_MISS   16
-#define INT_DTLB_ACCESS   17
-#define INT_DMATLB_MISS   18
-#define INT_DMATLB_ACCESS   19
-#define INT_SNITLB_MISS   20
-#define INT_SN_NOTIFY   21
-#define INT_SN_FIREWALL   22
-#define INT_IDN_FIREWALL   23
-#define INT_UDN_FIREWALL   24
-#define INT_TILE_TIMER   25
-#define INT_IDN_TIMER   26
-#define INT_UDN_TIMER   27
-#define INT_DMA_NOTIFY   28
-#define INT_IDN_CA   29
-#define INT_UDN_CA   30
-#define INT_IDN_AVAIL   31
-#define INT_UDN_AVAIL   32
-#define INT_PERF_COUNT   33
-#define INT_INTCTRL_3   34
-#define INT_INTCTRL_2   35
-#define INT_INTCTRL_1   36
-#define INT_INTCTRL_0   37
-#define INT_BOOT_ACCESS   38
-#define INT_WORLD_ACCESS   39
-#define INT_I_ASID   40
-#define INT_D_ASID   41
-#define INT_DMA_ASID   42
-#define INT_SNI_ASID   43
-#define INT_DMA_CPL   44
-#define INT_SN_CPL   45
-#define INT_DOUBLE_FAULT   46
-#define INT_SN_STATIC_ACCESS   47
-#define INT_AUX_PERF_COUNT   48
-
-#define NUM_INTERRUPTS 49
-
-#ifndef __ASSEMBLER__
-#define QUEUED_INTERRUPTS ( \
-    INT_MASK(INT_MEM_ERROR) | \
-    INT_MASK(INT_DMATLB_MISS) | \
-    INT_MASK(INT_DMATLB_ACCESS) | \
-    INT_MASK(INT_SNITLB_MISS) | \
-    INT_MASK(INT_SN_NOTIFY) | \
-    INT_MASK(INT_SN_FIREWALL) | \
-    INT_MASK(INT_IDN_FIREWALL) | \
-    INT_MASK(INT_UDN_FIREWALL) | \
-    INT_MASK(INT_TILE_TIMER) | \
-    INT_MASK(INT_IDN_TIMER) | \
-    INT_MASK(INT_UDN_TIMER) | \
-    INT_MASK(INT_DMA_NOTIFY) | \
-    INT_MASK(INT_IDN_CA) | \
-    INT_MASK(INT_UDN_CA) | \
-    INT_MASK(INT_IDN_AVAIL) | \
-    INT_MASK(INT_UDN_AVAIL) | \
-    INT_MASK(INT_PERF_COUNT) | \
-    INT_MASK(INT_INTCTRL_3) | \
-    INT_MASK(INT_INTCTRL_2) | \
-    INT_MASK(INT_INTCTRL_1) | \
-    INT_MASK(INT_INTCTRL_0) | \
-    INT_MASK(INT_BOOT_ACCESS) | \
-    INT_MASK(INT_WORLD_ACCESS) | \
-    INT_MASK(INT_I_ASID) | \
-    INT_MASK(INT_D_ASID) | \
-    INT_MASK(INT_DMA_ASID) | \
-    INT_MASK(INT_SNI_ASID) | \
-    INT_MASK(INT_DMA_CPL) | \
-    INT_MASK(INT_SN_CPL) | \
-    INT_MASK(INT_DOUBLE_FAULT) | \
-    INT_MASK(INT_AUX_PERF_COUNT) | \
-    0)
-#define NONQUEUED_INTERRUPTS ( \
-    INT_MASK(INT_ITLB_MISS) | \
-    INT_MASK(INT_ILL) | \
-    INT_MASK(INT_GPV) | \
-    INT_MASK(INT_SN_ACCESS) | \
-    INT_MASK(INT_IDN_ACCESS) | \
-    INT_MASK(INT_UDN_ACCESS) | \
-    INT_MASK(INT_IDN_REFILL) | \
-    INT_MASK(INT_UDN_REFILL) | \
-    INT_MASK(INT_IDN_COMPLETE) | \
-    INT_MASK(INT_UDN_COMPLETE) | \
-    INT_MASK(INT_SWINT_3) | \
-    INT_MASK(INT_SWINT_2) | \
-    INT_MASK(INT_SWINT_1) | \
-    INT_MASK(INT_SWINT_0) | \
-    INT_MASK(INT_UNALIGN_DATA) | \
-    INT_MASK(INT_DTLB_MISS) | \
-    INT_MASK(INT_DTLB_ACCESS) | \
-    INT_MASK(INT_SN_STATIC_ACCESS) | \
-    0)
-#define CRITICAL_MASKED_INTERRUPTS ( \
-    INT_MASK(INT_MEM_ERROR) | \
-    INT_MASK(INT_DMATLB_MISS) | \
-    INT_MASK(INT_DMATLB_ACCESS) | \
-    INT_MASK(INT_SNITLB_MISS) | \
-    INT_MASK(INT_SN_NOTIFY) | \
-    INT_MASK(INT_SN_FIREWALL) | \
-    INT_MASK(INT_IDN_FIREWALL) | \
-    INT_MASK(INT_UDN_FIREWALL) | \
-    INT_MASK(INT_TILE_TIMER) | \
-    INT_MASK(INT_IDN_TIMER) | \
-    INT_MASK(INT_UDN_TIMER) | \
-    INT_MASK(INT_DMA_NOTIFY) | \
-    INT_MASK(INT_IDN_CA) | \
-    INT_MASK(INT_UDN_CA) | \
-    INT_MASK(INT_IDN_AVAIL) | \
-    INT_MASK(INT_UDN_AVAIL) | \
-    INT_MASK(INT_PERF_COUNT) | \
-    INT_MASK(INT_INTCTRL_3) | \
-    INT_MASK(INT_INTCTRL_2) | \
-    INT_MASK(INT_INTCTRL_1) | \
-    INT_MASK(INT_INTCTRL_0) | \
-    INT_MASK(INT_AUX_PERF_COUNT) | \
-    0)
-#define CRITICAL_UNMASKED_INTERRUPTS ( \
-    INT_MASK(INT_ITLB_MISS) | \
-    INT_MASK(INT_ILL) | \
-    INT_MASK(INT_GPV) | \
-    INT_MASK(INT_SN_ACCESS) | \
-    INT_MASK(INT_IDN_ACCESS) | \
-    INT_MASK(INT_UDN_ACCESS) | \
-    INT_MASK(INT_IDN_REFILL) | \
-    INT_MASK(INT_UDN_REFILL) | \
-    INT_MASK(INT_IDN_COMPLETE) | \
-    INT_MASK(INT_UDN_COMPLETE) | \
-    INT_MASK(INT_SWINT_3) | \
-    INT_MASK(INT_SWINT_2) | \
-    INT_MASK(INT_SWINT_1) | \
-    INT_MASK(INT_SWINT_0) | \
-    INT_MASK(INT_UNALIGN_DATA) | \
-    INT_MASK(INT_DTLB_MISS) | \
-    INT_MASK(INT_DTLB_ACCESS) | \
-    INT_MASK(INT_BOOT_ACCESS) | \
-    INT_MASK(INT_WORLD_ACCESS) | \
-    INT_MASK(INT_I_ASID) | \
-    INT_MASK(INT_D_ASID) | \
-    INT_MASK(INT_DMA_ASID) | \
-    INT_MASK(INT_SNI_ASID) | \
-    INT_MASK(INT_DMA_CPL) | \
-    INT_MASK(INT_SN_CPL) | \
-    INT_MASK(INT_DOUBLE_FAULT) | \
-    INT_MASK(INT_SN_STATIC_ACCESS) | \
-    0)
-#define MASKABLE_INTERRUPTS ( \
-    INT_MASK(INT_MEM_ERROR) | \
-    INT_MASK(INT_IDN_REFILL) | \
-    INT_MASK(INT_UDN_REFILL) | \
-    INT_MASK(INT_IDN_COMPLETE) | \
-    INT_MASK(INT_UDN_COMPLETE) | \
-    INT_MASK(INT_DMATLB_MISS) | \
-    INT_MASK(INT_DMATLB_ACCESS) | \
-    INT_MASK(INT_SNITLB_MISS) | \
-    INT_MASK(INT_SN_NOTIFY) | \
-    INT_MASK(INT_SN_FIREWALL) | \
-    INT_MASK(INT_IDN_FIREWALL) | \
-    INT_MASK(INT_UDN_FIREWALL) | \
-    INT_MASK(INT_TILE_TIMER) | \
-    INT_MASK(INT_IDN_TIMER) | \
-    INT_MASK(INT_UDN_TIMER) | \
-    INT_MASK(INT_DMA_NOTIFY) | \
-    INT_MASK(INT_IDN_CA) | \
-    INT_MASK(INT_UDN_CA) | \
-    INT_MASK(INT_IDN_AVAIL) | \
-    INT_MASK(INT_UDN_AVAIL) | \
-    INT_MASK(INT_PERF_COUNT) | \
-    INT_MASK(INT_INTCTRL_3) | \
-    INT_MASK(INT_INTCTRL_2) | \
-    INT_MASK(INT_INTCTRL_1) | \
-    INT_MASK(INT_INTCTRL_0) | \
-    INT_MASK(INT_AUX_PERF_COUNT) | \
-    0)
-#define UNMASKABLE_INTERRUPTS ( \
-    INT_MASK(INT_ITLB_MISS) | \
-    INT_MASK(INT_ILL) | \
-    INT_MASK(INT_GPV) | \
-    INT_MASK(INT_SN_ACCESS) | \
-    INT_MASK(INT_IDN_ACCESS) | \
-    INT_MASK(INT_UDN_ACCESS) | \
-    INT_MASK(INT_SWINT_3) | \
-    INT_MASK(INT_SWINT_2) | \
-    INT_MASK(INT_SWINT_1) | \
-    INT_MASK(INT_SWINT_0) | \
-    INT_MASK(INT_UNALIGN_DATA) | \
-    INT_MASK(INT_DTLB_MISS) | \
-    INT_MASK(INT_DTLB_ACCESS) | \
-    INT_MASK(INT_BOOT_ACCESS) | \
-    INT_MASK(INT_WORLD_ACCESS) | \
-    INT_MASK(INT_I_ASID) | \
-    INT_MASK(INT_D_ASID) | \
-    INT_MASK(INT_DMA_ASID) | \
-    INT_MASK(INT_SNI_ASID) | \
-    INT_MASK(INT_DMA_CPL) | \
-    INT_MASK(INT_SN_CPL) | \
-    INT_MASK(INT_DOUBLE_FAULT) | \
-    INT_MASK(INT_SN_STATIC_ACCESS) | \
-    0)
-#define SYNC_INTERRUPTS ( \
-    INT_MASK(INT_ITLB_MISS) | \
-    INT_MASK(INT_ILL) | \
-    INT_MASK(INT_GPV) | \
-    INT_MASK(INT_SN_ACCESS) | \
-    INT_MASK(INT_IDN_ACCESS) | \
-    INT_MASK(INT_UDN_ACCESS) | \
-    INT_MASK(INT_IDN_REFILL) | \
-    INT_MASK(INT_UDN_REFILL) | \
-    INT_MASK(INT_IDN_COMPLETE) | \
-    INT_MASK(INT_UDN_COMPLETE) | \
-    INT_MASK(INT_SWINT_3) | \
-    INT_MASK(INT_SWINT_2) | \
-    INT_MASK(INT_SWINT_1) | \
-    INT_MASK(INT_SWINT_0) | \
-    INT_MASK(INT_UNALIGN_DATA) | \
-    INT_MASK(INT_DTLB_MISS) | \
-    INT_MASK(INT_DTLB_ACCESS) | \
-    INT_MASK(INT_SN_STATIC_ACCESS) | \
-    0)
-#define NON_SYNC_INTERRUPTS ( \
-    INT_MASK(INT_MEM_ERROR) | \
-    INT_MASK(INT_DMATLB_MISS) | \
-    INT_MASK(INT_DMATLB_ACCESS) | \
-    INT_MASK(INT_SNITLB_MISS) | \
-    INT_MASK(INT_SN_NOTIFY) | \
-    INT_MASK(INT_SN_FIREWALL) | \
-    INT_MASK(INT_IDN_FIREWALL) | \
-    INT_MASK(INT_UDN_FIREWALL) | \
-    INT_MASK(INT_TILE_TIMER) | \
-    INT_MASK(INT_IDN_TIMER) | \
-    INT_MASK(INT_UDN_TIMER) | \
-    INT_MASK(INT_DMA_NOTIFY) | \
-    INT_MASK(INT_IDN_CA) | \
-    INT_MASK(INT_UDN_CA) | \
-    INT_MASK(INT_IDN_AVAIL) | \
-    INT_MASK(INT_UDN_AVAIL) | \
-    INT_MASK(INT_PERF_COUNT) | \
-    INT_MASK(INT_INTCTRL_3) | \
-    INT_MASK(INT_INTCTRL_2) | \
-    INT_MASK(INT_INTCTRL_1) | \
-    INT_MASK(INT_INTCTRL_0) | \
-    INT_MASK(INT_BOOT_ACCESS) | \
-    INT_MASK(INT_WORLD_ACCESS) | \
-    INT_MASK(INT_I_ASID) | \
-    INT_MASK(INT_D_ASID) | \
-    INT_MASK(INT_DMA_ASID) | \
-    INT_MASK(INT_SNI_ASID) | \
-    INT_MASK(INT_DMA_CPL) | \
-    INT_MASK(INT_SN_CPL) | \
-    INT_MASK(INT_DOUBLE_FAULT) | \
-    INT_MASK(INT_AUX_PERF_COUNT) | \
-    0)
-#endif /* !__ASSEMBLER__ */
-#endif /* !__ARCH_INTERRUPTS_H__ */
diff --git a/arch/tile/include/arch/interrupts_64.h b/arch/tile/include/arch/interrupts_64.h
deleted file mode 100644
index 5bb58b2e4e6..00000000000
--- a/arch/tile/include/arch/interrupts_64.h
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright 2011 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- */
-
-#ifndef __ARCH_INTERRUPTS_H__
-#define __ARCH_INTERRUPTS_H__
-
-/** Mask for an interrupt. */
-#ifdef __ASSEMBLER__
-/* Note: must handle breaking interrupts into high and low words manually. */
-#define INT_MASK(intno) (1 << (intno))
-#else
-#define INT_MASK(intno) (1ULL << (intno))
-#endif
-
-
-/** Where a given interrupt executes */
-#define INTERRUPT_VECTOR(i, pl) (0xFC000000 + ((pl) << 24) + ((i) << 8))
-
-/** Where to store a vector for a given interrupt. */
-#define USER_INTERRUPT_VECTOR(i) INTERRUPT_VECTOR(i, 0)
-
-/** The base address of user-level interrupts. */
-#define USER_INTERRUPT_VECTOR_BASE INTERRUPT_VECTOR(0, 0)
-
-
-/** Additional synthetic interrupt. */
-#define INT_BREAKPOINT (63)
-
-#define INT_MEM_ERROR    0
-#define INT_SINGLE_STEP_3    1
-#define INT_SINGLE_STEP_2    2
-#define INT_SINGLE_STEP_1    3
-#define INT_SINGLE_STEP_0    4
-#define INT_IDN_COMPLETE    5
-#define INT_UDN_COMPLETE    6
-#define INT_ITLB_MISS    7
-#define INT_ILL    8
-#define INT_GPV    9
-#define INT_IDN_ACCESS   10
-#define INT_UDN_ACCESS   11
-#define INT_SWINT_3   12
-#define INT_SWINT_2   13
-#define INT_SWINT_1   14
-#define INT_SWINT_0   15
-#define INT_ILL_TRANS   16
-#define INT_UNALIGN_DATA   17
-#define INT_DTLB_MISS   18
-#define INT_DTLB_ACCESS   19
-#define INT_IDN_FIREWALL   20
-#define INT_UDN_FIREWALL   21
-#define INT_TILE_TIMER   22
-#define INT_AUX_TILE_TIMER   23
-#define INT_IDN_TIMER   24
-#define INT_UDN_TIMER   25
-#define INT_IDN_AVAIL   26
-#define INT_UDN_AVAIL   27
-#define INT_IPI_3   28
-#define INT_IPI_2   29
-#define INT_IPI_1   30
-#define INT_IPI_0   31
-#define INT_PERF_COUNT   32
-#define INT_AUX_PERF_COUNT   33
-#define INT_INTCTRL_3   34
-#define INT_INTCTRL_2   35
-#define INT_INTCTRL_1   36
-#define INT_INTCTRL_0   37
-#define INT_BOOT_ACCESS   38
-#define INT_WORLD_ACCESS   39
-#define INT_I_ASID   40
-#define INT_D_ASID   41
-#define INT_DOUBLE_FAULT   42
-
-#define NUM_INTERRUPTS 43
-
-#ifndef __ASSEMBLER__
-#define QUEUED_INTERRUPTS ( \
-    INT_MASK(INT_MEM_ERROR) | \
-    INT_MASK(INT_IDN_COMPLETE) | \
-    INT_MASK(INT_UDN_COMPLETE) | \
-    INT_MASK(INT_IDN_FIREWALL) | \
-    INT_MASK(INT_UDN_FIREWALL) | \
-    INT_MASK(INT_TILE_TIMER) | \
-    INT_MASK(INT_AUX_TILE_TIMER) | \
-    INT_MASK(INT_IDN_TIMER) | \
-    INT_MASK(INT_UDN_TIMER) | \
-    INT_MASK(INT_IDN_AVAIL) | \
-    INT_MASK(INT_UDN_AVAIL) | \
-    INT_MASK(INT_IPI_3) | \
-    INT_MASK(INT_IPI_2) | \
-    INT_MASK(INT_IPI_1) | \
-    INT_MASK(INT_IPI_0) | \
-    INT_MASK(INT_PERF_COUNT) | \
-    INT_MASK(INT_AUX_PERF_COUNT) | \
-    INT_MASK(INT_INTCTRL_3) | \
-    INT_MASK(INT_INTCTRL_2) | \
-    INT_MASK(INT_INTCTRL_1) | \
-    INT_MASK(INT_INTCTRL_0) | \
-    INT_MASK(INT_BOOT_ACCESS) | \
-    INT_MASK(INT_WORLD_ACCESS) | \
-    INT_MASK(INT_I_ASID) | \
-    INT_MASK(INT_D_ASID) | \
-    INT_MASK(INT_DOUBLE_FAULT) | \
-    0)
-#define NONQUEUED_INTERRUPTS ( \
-    INT_MASK(INT_SINGLE_STEP_3) | \
-    INT_MASK(INT_SINGLE_STEP_2) | \
-    INT_MASK(INT_SINGLE_STEP_1) | \
-    INT_MASK(INT_SINGLE_STEP_0) | \
-    INT_MASK(INT_ITLB_MISS) | \
-    INT_MASK(INT_ILL) | \
-    INT_MASK(INT_GPV) | \
-    INT_MASK(INT_IDN_ACCESS) | \
-    INT_MASK(INT_UDN_ACCESS) | \
-    INT_MASK(INT_SWINT_3) | \
-    INT_MASK(INT_SWINT_2) | \
-    INT_MASK(INT_SWINT_1) | \
-    INT_MASK(INT_SWINT_0) | \
-    INT_MASK(INT_ILL_TRANS) | \
-    INT_MASK(INT_UNALIGN_DATA) | \
-    INT_MASK(INT_DTLB_MISS) | \
-    INT_MASK(INT_DTLB_ACCESS) | \
-    0)
-#define CRITICAL_MASKED_INTERRUPTS ( \
-    INT_MASK(INT_MEM_ERROR) | \
-    INT_MASK(INT_SINGLE_STEP_3) | \
-    INT_MASK(INT_SINGLE_STEP_2) | \
-    INT_MASK(INT_SINGLE_STEP_1) | \
-    INT_MASK(INT_SINGLE_STEP_0) | \
-    INT_MASK(INT_IDN_COMPLETE) | \
-    INT_MASK(INT_UDN_COMPLETE) | \
-    INT_MASK(INT_IDN_FIREWALL) | \
-    INT_MASK(INT_UDN_FIREWALL) | \
-    INT_MASK(INT_TILE_TIMER) | \
-    INT_MASK(INT_AUX_TILE_TIMER) | \
-    INT_MASK(INT_IDN_TIMER) | \
-    INT_MASK(INT_UDN_TIMER) | \
-    INT_MASK(INT_IDN_AVAIL) | \
-    INT_MASK(INT_UDN_AVAIL) | \
-    INT_MASK(INT_IPI_3) | \
-    INT_MASK(INT_IPI_2) | \
-    INT_MASK(INT_IPI_1) | \
-    INT_MASK(INT_IPI_0) | \
-    INT_MASK(INT_PERF_COUNT) | \
-    INT_MASK(INT_AUX_PERF_COUNT) | \
-    INT_MASK(INT_INTCTRL_3) | \
-    INT_MASK(INT_INTCTRL_2) | \
-    INT_MASK(INT_INTCTRL_1) | \
-    INT_MASK(INT_INTCTRL_0) | \
-    0)
-#define CRITICAL_UNMASKED_INTERRUPTS ( \
-    INT_MASK(INT_ITLB_MISS) | \
-    INT_MASK(INT_ILL) | \
-    INT_MASK(INT_GPV) | \
-    INT_MASK(INT_IDN_ACCESS) | \
-    INT_MASK(INT_UDN_ACCESS) | \
-    INT_MASK(INT_SWINT_3) | \
-    INT_MASK(INT_SWINT_2) | \
-    INT_MASK(INT_SWINT_1) | \
-    INT_MASK(INT_SWINT_0) | \
-    INT_MASK(INT_ILL_TRANS) | \
-    INT_MASK(INT_UNALIGN_DATA) | \
-    INT_MASK(INT_DTLB_MISS) | \
-    INT_MASK(INT_DTLB_ACCESS) | \
-    INT_MASK(INT_BOOT_ACCESS) | \
-    INT_MASK(INT_WORLD_ACCESS) | \
-    INT_MASK(INT_I_ASID) | \
-    INT_MASK(INT_D_ASID) | \
-    INT_MASK(INT_DOUBLE_FAULT) | \
-    0)
-#define MASKABLE_INTERRUPTS ( \
-    INT_MASK(INT_MEM_ERROR) | \
-    INT_MASK(INT_SINGLE_STEP_3) | \
-    INT_MASK(INT_SINGLE_STEP_2) | \
-    INT_MASK(INT_SINGLE_STEP_1) | \
-    INT_MASK(INT_SINGLE_STEP_0) | \
-    INT_MASK(INT_IDN_COMPLETE) | \
-    INT_MASK(INT_UDN_COMPLETE) | \
-    INT_MASK(INT_IDN_FIREWALL) | \
-    INT_MASK(INT_UDN_FIREWALL) | \
-    INT_MASK(INT_TILE_TIMER) | \
-    INT_MASK(INT_AUX_TILE_TIMER) | \
-    INT_MASK(INT_IDN_TIMER) | \
-    INT_MASK(INT_UDN_TIMER) | \
-    INT_MASK(INT_IDN_AVAIL) | \
-    INT_MASK(INT_UDN_AVAIL) | \
-    INT_MASK(INT_IPI_3) | \
-    INT_MASK(INT_IPI_2) | \
-    INT_MASK(INT_IPI_1) | \
-    INT_MASK(INT_IPI_0) | \
-    INT_MASK(INT_PERF_COUNT) | \
-    INT_MASK(INT_AUX_PERF_COUNT) | \
-    INT_MASK(INT_INTCTRL_3) | \
-    INT_MASK(INT_INTCTRL_2) | \
-    INT_MASK(INT_INTCTRL_1) | \
-    INT_MASK(INT_INTCTRL_0) | \
-    0)
-#define UNMASKABLE_INTERRUPTS ( \
-    INT_MASK(INT_ITLB_MISS) | \
-    INT_MASK(INT_ILL) | \
-    INT_MASK(INT_GPV) | \
-    INT_MASK(INT_IDN_ACCESS) | \
-    INT_MASK(INT_UDN_ACCESS) | \
-    INT_MASK(INT_SWINT_3) | \
-    INT_MASK(INT_SWINT_2) | \
-    INT_MASK(INT_SWINT_1) | \
-    INT_MASK(INT_SWINT_0) | \
-    INT_MASK(INT_ILL_TRANS) | \
-    INT_MASK(INT_UNALIGN_DATA) | \
-    INT_MASK(INT_DTLB_MISS) | \
-    INT_MASK(INT_DTLB_ACCESS) | \
-    INT_MASK(INT_BOOT_ACCESS) | \
-    INT_MASK(INT_WORLD_ACCESS) | \
-    INT_MASK(INT_I_ASID) | \
-    INT_MASK(INT_D_ASID) | \
-    INT_MASK(INT_DOUBLE_FAULT) | \
-    0)
-#define SYNC_INTERRUPTS ( \
-    INT_MASK(INT_SINGLE_STEP_3) | \
-    INT_MASK(INT_SINGLE_STEP_2) | \
-    INT_MASK(INT_SINGLE_STEP_1) | \
-    INT_MASK(INT_SINGLE_STEP_0) | \
-    INT_MASK(INT_IDN_COMPLETE) | \
-    INT_MASK(INT_UDN_COMPLETE) | \
-    INT_MASK(INT_ITLB_MISS) | \
-    INT_MASK(INT_ILL) | \
-    INT_MASK(INT_GPV) | \
-    INT_MASK(INT_IDN_ACCESS) | \
-    INT_MASK(INT_UDN_ACCESS) | \
-    INT_MASK(INT_SWINT_3) | \
-    INT_MASK(INT_SWINT_2) | \
-    INT_MASK(INT_SWINT_1) | \
-    INT_MASK(INT_SWINT_0) | \
-    INT_MASK(INT_ILL_TRANS) | \
-    INT_MASK(INT_UNALIGN_DATA) | \
-    INT_MASK(INT_DTLB_MISS) | \
-    INT_MASK(INT_DTLB_ACCESS) | \
-    0)
-#define NON_SYNC_INTERRUPTS ( \
-    INT_MASK(INT_MEM_ERROR) | \
-    INT_MASK(INT_IDN_FIREWALL) | \
-    INT_MASK(INT_UDN_FIREWALL) | \
-    INT_MASK(INT_TILE_TIMER) | \
-    INT_MASK(INT_AUX_TILE_TIMER) | \
-    INT_MASK(INT_IDN_TIMER) | \
-    INT_MASK(INT_UDN_TIMER) | \
-    INT_MASK(INT_IDN_AVAIL) | \
-    INT_MASK(INT_UDN_AVAIL) | \
-    INT_MASK(INT_IPI_3) | \
-    INT_MASK(INT_IPI_2) | \
-    INT_MASK(INT_IPI_1) | \
-    INT_MASK(INT_IPI_0) | \
-    INT_MASK(INT_PERF_COUNT) | \
-    INT_MASK(INT_AUX_PERF_COUNT) | \
-    INT_MASK(INT_INTCTRL_3) | \
-    INT_MASK(INT_INTCTRL_2) | \
-    INT_MASK(INT_INTCTRL_1) | \
-    INT_MASK(INT_INTCTRL_0) | \
-    INT_MASK(INT_BOOT_ACCESS) | \
-    INT_MASK(INT_WORLD_ACCESS) | \
-    INT_MASK(INT_I_ASID) | \
-    INT_MASK(INT_D_ASID) | \
-    INT_MASK(INT_DOUBLE_FAULT) | \
-    0)
-#endif /* !__ASSEMBLER__ */
-#endif /* !__ARCH_INTERRUPTS_H__ */
diff --git a/arch/tile/include/arch/mpipe.h b/arch/tile/include/arch/mpipe.h
new file mode 100644
index 00000000000..904538e754d
--- /dev/null
+++ b/arch/tile/include/arch/mpipe.h
@@ -0,0 +1,371 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_MPIPE_H__
+#define __ARCH_MPIPE_H__
+
+#include <arch/abi.h>
+#include <arch/mpipe_def.h>
+
+#ifndef __ASSEMBLER__
+
+/*
+ * MMIO Ingress DMA Release Region Address.
+ * This is a description of the physical addresses used to manipulate ingress
+ * credit counters.  Accesses to this address space should use an address of
+ * this form and a value like that specified in IDMA_RELEASE_REGION_VAL.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Reserved. */
+    uint_reg_t __reserved_0  : 3;
+    /* NotifRing to be released */
+    uint_reg_t ring          : 8;
+    /* Bucket to be released */
+    uint_reg_t bucket        : 13;
+    /* Enable NotifRing release */
+    uint_reg_t ring_enable   : 1;
+    /* Enable Bucket release */
+    uint_reg_t bucket_enable : 1;
+    /*
+     * This field of the address selects the region (address space) to be
+     * accessed.  For the iDMA release region, this field must be 4.
+     */
+    uint_reg_t region        : 3;
+    /* Reserved. */
+    uint_reg_t __reserved_1  : 6;
+    /* This field of the address indexes the 32 entry service domain table. */
+    uint_reg_t svc_dom       : 5;
+    /* Reserved. */
+    uint_reg_t __reserved_2  : 24;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_2  : 24;
+    uint_reg_t svc_dom       : 5;
+    uint_reg_t __reserved_1  : 6;
+    uint_reg_t region        : 3;
+    uint_reg_t bucket_enable : 1;
+    uint_reg_t ring_enable   : 1;
+    uint_reg_t bucket        : 13;
+    uint_reg_t ring          : 8;
+    uint_reg_t __reserved_0  : 3;
+#endif
+  };
+
+  uint_reg_t word;
+} MPIPE_IDMA_RELEASE_REGION_ADDR_t;
+
+/*
+ * MMIO Ingress DMA Release Region Value - Release NotifRing and/or Bucket.
+ * Provides release of the associated NotifRing.  The address of the MMIO
+ * operation is described in IDMA_RELEASE_REGION_ADDR.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * Number of packets being released.  The load balancer's count of
+     * inflight packets will be decremented by this amount for the associated
+     * Bucket and/or NotifRing
+     */
+    uint_reg_t count      : 16;
+    /* Reserved. */
+    uint_reg_t __reserved : 48;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved : 48;
+    uint_reg_t count      : 16;
+#endif
+  };
+
+  uint_reg_t word;
+} MPIPE_IDMA_RELEASE_REGION_VAL_t;
+
+/*
+ * MMIO Buffer Stack Manager Region Address.
+ * This MMIO region is used for posting or fetching buffers to/from the
+ * buffer stack manager.  On an MMIO load, this pops a buffer descriptor from
+ * the top of stack if one is available.  On an MMIO store, this pushes a
+ * buffer to the stack.  The value read or written is described in
+ * BSM_REGION_VAL.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 3;
+    /* BufferStack being accessed. */
+    uint_reg_t stack        : 5;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 18;
+    /*
+     * This field of the address selects the region (address space) to be
+     * accessed.  For the buffer stack manager region, this field must be 6.
+     */
+    uint_reg_t region       : 3;
+    /* Reserved. */
+    uint_reg_t __reserved_2 : 6;
+    /* This field of the address indexes the 32 entry service domain table. */
+    uint_reg_t svc_dom      : 5;
+    /* Reserved. */
+    uint_reg_t __reserved_3 : 24;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_3 : 24;
+    uint_reg_t svc_dom      : 5;
+    uint_reg_t __reserved_2 : 6;
+    uint_reg_t region       : 3;
+    uint_reg_t __reserved_1 : 18;
+    uint_reg_t stack        : 5;
+    uint_reg_t __reserved_0 : 3;
+#endif
+  };
+
+  uint_reg_t word;
+} MPIPE_BSM_REGION_ADDR_t;
+
+/*
+ * MMIO Buffer Stack Manager Region Value.
+ * This MMIO region is used for posting or fetching buffers to/from the
+ * buffer stack manager.  On an MMIO load, this pops a buffer descriptor from
+ * the top of stack if one is available. On an MMIO store, this pushes a
+ * buffer to the stack.  The address of the MMIO operation is described in
+ * BSM_REGION_ADDR.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 7;
+    /*
+     * Base virtual address of the buffer.  Must be sign extended by consumer.
+     */
+    int_reg_t va           : 35;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 6;
+    /*
+     * Index of the buffer stack to which this buffer belongs.  Ignored on
+     * writes since the offset bits specify the stack being accessed.
+     */
+    uint_reg_t stack_idx    : 5;
+    /* Reserved. */
+    uint_reg_t __reserved_2 : 3;
+    /*
+     * Instance ID.  For devices that support automatic buffer return between
+     * mPIPE instances, this field indicates the buffer owner.  If the INST
+     * field does not match the mPIPE's instance number when a packet is
+     * egressed, buffers with HWB set will be returned to the other mPIPE
+     * instance.  Note that not all devices support multi-mPIPE buffer
+     * return.  The MPIPE_EDMA_INFO.REMOTE_BUFF_RTN_SUPPORT bit indicates
+     * whether the INST field in the buffer descriptor is populated by iDMA
+     * hardware. This field is ignored on writes.
+     */
+    uint_reg_t inst         : 2;
+    /*
+     * Reads as one to indicate that this is a hardware managed buffer.
+     * Ignored on writes since all buffers on a given stack are the same size.
+     */
+    uint_reg_t hwb          : 1;
+    /*
+     * Encoded size of buffer (ignored on writes):
+     * 0 = 128 bytes
+     * 1 = 256 bytes
+     * 2 = 512 bytes
+     * 3 = 1024 bytes
+     * 4 = 1664 bytes
+     * 5 = 4096 bytes
+     * 6 = 10368 bytes
+     * 7 = 16384 bytes
+     */
+    uint_reg_t size         : 3;
+    /*
+     * Valid indication for the buffer.  Ignored on writes.
+     * 0 : Valid buffer descriptor popped from stack.
+     * 3 : Could not pop a buffer from the stack.  Either the stack is empty,
+     * or the hardware's prefetch buffer is empty for this stack.
+     */
+    uint_reg_t c            : 2;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t c            : 2;
+    uint_reg_t size         : 3;
+    uint_reg_t hwb          : 1;
+    uint_reg_t inst         : 2;
+    uint_reg_t __reserved_2 : 3;
+    uint_reg_t stack_idx    : 5;
+    uint_reg_t __reserved_1 : 6;
+    int_reg_t va           : 35;
+    uint_reg_t __reserved_0 : 7;
+#endif
+  };
+
+  uint_reg_t word;
+} MPIPE_BSM_REGION_VAL_t;
+
+/*
+ * MMIO Egress DMA Post Region Address.
+ * Used to post descriptor locations to the eDMA descriptor engine.  The
+ * value to be written is described in EDMA_POST_REGION_VAL
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 3;
+    /* eDMA ring being accessed */
+    uint_reg_t ring         : 6;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 17;
+    /*
+     * This field of the address selects the region (address space) to be
+     * accessed.  For the egress DMA post region, this field must be 5.
+     */
+    uint_reg_t region       : 3;
+    /* Reserved. */
+    uint_reg_t __reserved_2 : 6;
+    /* This field of the address indexes the 32 entry service domain table. */
+    uint_reg_t svc_dom      : 5;
+    /* Reserved. */
+    uint_reg_t __reserved_3 : 24;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_3 : 24;
+    uint_reg_t svc_dom      : 5;
+    uint_reg_t __reserved_2 : 6;
+    uint_reg_t region       : 3;
+    uint_reg_t __reserved_1 : 17;
+    uint_reg_t ring         : 6;
+    uint_reg_t __reserved_0 : 3;
+#endif
+  };
+
+  uint_reg_t word;
+} MPIPE_EDMA_POST_REGION_ADDR_t;
+
+/*
+ * MMIO Egress DMA Post Region Value.
+ * Used to post descriptor locations to the eDMA descriptor engine.  The
+ * address is described in EDMA_POST_REGION_ADDR.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * For writes, this specifies the current ring tail pointer prior to any
+     * post.  For example, to post 1 or more descriptors starting at location
+     * 23, this would contain 23 (not 24).  On writes, this index must be
+     * masked based on the ring size.  The new tail pointer after this post
+     * is COUNT+RING_IDX (masked by the ring size).
+     *
+     * For reads, this provides the hardware descriptor fetcher's head
+     * pointer.  The descriptors prior to the head pointer, however, may not
+     * yet have been processed so this indicator is only used to determine
+     * how full the ring is and if software may post more descriptors.
+     */
+    uint_reg_t ring_idx   : 16;
+    /*
+     * For writes, this specifies number of contiguous descriptors that are
+     * being posted.  Software may post up to RingSize descriptors with a
+     * single MMIO store.  A zero in this field on a write will "wake up" an
+     * eDMA ring and cause it fetch descriptors regardless of the hardware's
+     * current view of the state of the tail pointer.
+     *
+     * For reads, this field provides a rolling count of the number of
+     * descriptors that have been completely processed.  This may be used by
+     * software to determine when buffers associated with a descriptor may be
+     * returned or reused.  When the ring's flush bit is cleared by software
+     * (after having been set by HW or SW), the COUNT will be cleared.
+     */
+    uint_reg_t count      : 16;
+    /*
+     * For writes, this specifies the generation number of the tail being
+     * posted. Note that if tail+cnt wraps to the beginning of the ring, the
+     * eDMA hardware assumes that the descriptors posted at the beginning of
+     * the ring are also valid so it is okay to post around the wrap point.
+     *
+     * For reads, this is the current generation number.  Valid descriptors
+     * will have the inverse of this generation number.
+     */
+    uint_reg_t gen        : 1;
+    /* Reserved. */
+    uint_reg_t __reserved : 31;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved : 31;
+    uint_reg_t gen        : 1;
+    uint_reg_t count      : 16;
+    uint_reg_t ring_idx   : 16;
+#endif
+  };
+
+  uint_reg_t word;
+} MPIPE_EDMA_POST_REGION_VAL_t;
+
+/*
+ * Load Balancer Bucket Status Data.
+ * Read/Write data for load balancer Bucket-Status Table. 4160 entries
+ * indexed by LBL_INIT_CTL.IDX when LBL_INIT_CTL.STRUCT_SEL is BSTS_TBL
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* NotifRing currently assigned to this bucket. */
+    uint_reg_t notifring  : 8;
+    /* Current reference count. */
+    uint_reg_t count      : 16;
+    /* Group associated with this bucket. */
+    uint_reg_t group      : 5;
+    /* Mode select for this bucket. */
+    uint_reg_t mode       : 3;
+    /* Reserved. */
+    uint_reg_t __reserved : 32;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved : 32;
+    uint_reg_t mode       : 3;
+    uint_reg_t group      : 5;
+    uint_reg_t count      : 16;
+    uint_reg_t notifring  : 8;
+#endif
+  };
+
+  uint_reg_t word;
+} MPIPE_LBL_INIT_DAT_BSTS_TBL_t;
+#endif /* !defined(__ASSEMBLER__) */
+
+#endif /* !defined(__ARCH_MPIPE_H__) */
diff --git a/arch/tile/include/arch/mpipe_constants.h b/arch/tile/include/arch/mpipe_constants.h
new file mode 100644
index 00000000000..84022ac5fe8
--- /dev/null
+++ b/arch/tile/include/arch/mpipe_constants.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+
+#ifndef __ARCH_MPIPE_CONSTANTS_H__
+#define __ARCH_MPIPE_CONSTANTS_H__
+
+#define MPIPE_NUM_CLASSIFIERS 16
+#define MPIPE_CLS_MHZ 1200
+
+#define MPIPE_NUM_EDMA_RINGS 64
+
+#define MPIPE_NUM_SGMII_MACS 16
+#define MPIPE_NUM_XAUI_MACS 16
+#define MPIPE_NUM_LOOPBACK_CHANNELS 4
+#define MPIPE_NUM_NON_LB_CHANNELS 28
+
+#define MPIPE_NUM_IPKT_BLOCKS 1536
+
+#define MPIPE_NUM_BUCKETS 4160
+
+#define MPIPE_NUM_NOTIF_RINGS 256
+
+#define MPIPE_NUM_NOTIF_GROUPS 32
+
+#define MPIPE_NUM_TLBS_PER_ASID 16
+#define MPIPE_TLB_IDX_WIDTH 4
+
+#define MPIPE_MMIO_NUM_SVC_DOM 32
+
+#endif /* __ARCH_MPIPE_CONSTANTS_H__ */
diff --git a/arch/tile/include/arch/mpipe_def.h b/arch/tile/include/arch/mpipe_def.h
new file mode 100644
index 00000000000..c3d30217fc6
--- /dev/null
+++ b/arch/tile/include/arch/mpipe_def.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_MPIPE_DEF_H__
+#define __ARCH_MPIPE_DEF_H__
+#define MPIPE_MMIO_ADDR__REGION_SHIFT 26
+#define MPIPE_MMIO_ADDR__REGION_VAL_CFG 0x0
+#define MPIPE_MMIO_ADDR__REGION_VAL_IDMA 0x4
+#define MPIPE_MMIO_ADDR__REGION_VAL_EDMA 0x5
+#define MPIPE_MMIO_ADDR__REGION_VAL_BSM 0x6
+#define MPIPE_BSM_REGION_VAL__VA_SHIFT 7
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_128 0x0
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_256 0x1
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_512 0x2
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_1024 0x3
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_1664 0x4
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_4096 0x5
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_10368 0x6
+#define MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_16384 0x7
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_DFA 0x0
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_FIXED 0x1
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_ALWAYS_PICK 0x2
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_STICKY 0x3
+#define MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_STICKY_RAND 0x7
+#define MPIPE_LBL_NR_STATE__FIRST_WORD 0x2138
+#endif /* !defined(__ARCH_MPIPE_DEF_H__) */
diff --git a/arch/tile/include/arch/mpipe_shm.h b/arch/tile/include/arch/mpipe_shm.h
new file mode 100644
index 00000000000..13b3c4300e5
--- /dev/null
+++ b/arch/tile/include/arch/mpipe_shm.h
@@ -0,0 +1,521 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+
+#ifndef __ARCH_MPIPE_SHM_H__
+#define __ARCH_MPIPE_SHM_H__
+
+#include <arch/abi.h>
+#include <arch/mpipe_shm_def.h>
+
+#ifndef __ASSEMBLER__
+/**
+ * MPIPE eDMA Descriptor.
+ * The eDMA descriptor is written by software and consumed by hardware.  It
+ * is used to specify the location of egress packet data to be sent out of
+ * the chip via one of the packet interfaces.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+    /* Word 0 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Generation number.  Used to indicate a valid descriptor in ring.  When
+     * a new descriptor is written into the ring, software must toggle this
+     * bit.  The net effect is that the GEN bit being written into new
+     * descriptors toggles each time the ring tail pointer wraps.
+     */
+    uint_reg_t gen        : 1;
+    /**
+     * For devices with EDMA reorder support, this field allows the
+     * descriptor to select the egress FIFO.  The associated DMA ring must
+     * have ALLOW_EFIFO_SEL enabled.
+     */
+    uint_reg_t efifo_sel  : 6;
+    /** Reserved.  Must be zero. */
+    uint_reg_t r0         : 1;
+    /** Checksum generation enabled for this transfer. */
+    uint_reg_t csum       : 1;
+    /**
+     * Nothing to be sent.  Used, for example, when software has dropped a
+     * packet but still wishes to return all of the associated buffers.
+     */
+    uint_reg_t ns         : 1;
+    /**
+     * Notification interrupt will be delivered when packet has been egressed.
+     */
+    uint_reg_t notif      : 1;
+    /**
+     * Boundary indicator.  When 1, this transfer includes the EOP for this
+     * command.  Must be clear on all but the last descriptor for an egress
+     * packet.
+     */
+    uint_reg_t bound      : 1;
+    /** Reserved.  Must be zero. */
+    uint_reg_t r1         : 4;
+    /**
+     * Number of bytes to be sent for this descriptor.  When zero, no data
+     * will be moved and the buffer descriptor will be ignored.  If the
+     * buffer descriptor indicates that it is chained, the low 7 bits of the
+     * VA indicate the offset within the first buffer (e.g. 127 bytes is the
+     * maximum offset into the first buffer).  If the size exceeds a single
+     * buffer, subsequent buffer descriptors will be fetched prior to
+     * processing the next eDMA descriptor in the ring.
+     */
+    uint_reg_t xfer_size  : 14;
+    /** Reserved.  Must be zero. */
+    uint_reg_t r2         : 2;
+    /**
+     * Destination of checksum relative to CSUM_START relative to the first
+     * byte moved by this descriptor.  Must be zero if CSUM=0 in this
+     * descriptor.  Must be less than XFER_SIZE (e.g. the first byte of the
+     * CSUM_DEST must be within the span of this descriptor).
+     */
+    uint_reg_t csum_dest  : 8;
+    /**
+     * Start byte of checksum relative to the first byte moved by this
+     * descriptor.  If this is not the first descriptor for the egress
+     * packet, CSUM_START is still relative to the first byte in this
+     * descriptor.  Must be zero if CSUM=0 in this descriptor.
+     */
+    uint_reg_t csum_start : 8;
+    /**
+     * Initial value for 16-bit 1's compliment checksum if enabled via CSUM.
+     * Specified in network order.  That is, bits[7:0] will be added to the
+     * byte pointed to by CSUM_START and bits[15:8] will be added to the byte
+     * pointed to by CSUM_START+1 (with appropriate 1's compliment carries).
+     * Must be zero if CSUM=0 in this descriptor.
+     */
+    uint_reg_t csum_seed  : 16;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t csum_seed  : 16;
+    uint_reg_t csum_start : 8;
+    uint_reg_t csum_dest  : 8;
+    uint_reg_t r2         : 2;
+    uint_reg_t xfer_size  : 14;
+    uint_reg_t r1         : 4;
+    uint_reg_t bound      : 1;
+    uint_reg_t notif      : 1;
+    uint_reg_t ns         : 1;
+    uint_reg_t csum       : 1;
+    uint_reg_t r0         : 1;
+    uint_reg_t efifo_sel  : 6;
+    uint_reg_t gen        : 1;
+#endif
+
+    /* Word 1 */
+
+#ifndef __BIG_ENDIAN__
+    /** Virtual address.  Must be sign extended by consumer. */
+    int_reg_t va           : 42;
+    /** Reserved. */
+    uint_reg_t __reserved_0 : 6;
+    /** Index of the buffer stack to which this buffer belongs. */
+    uint_reg_t stack_idx    : 5;
+    /** Reserved. */
+    uint_reg_t __reserved_1 : 3;
+    /**
+     * Instance ID.  For devices that support automatic buffer return between
+     * mPIPE instances, this field indicates the buffer owner.  If the INST
+     * field does not match the mPIPE's instance number when a packet is
+     * egressed, buffers with HWB set will be returned to the other mPIPE
+     * instance.  Note that not all devices support multi-mPIPE buffer
+     * return.  The MPIPE_EDMA_INFO.REMOTE_BUFF_RTN_SUPPORT bit indicates
+     * whether the INST field in the buffer descriptor is populated by iDMA
+     * hardware.
+     */
+    uint_reg_t inst         : 2;
+    /**
+     * Always set to one by hardware in iDMA packet descriptors.  For eDMA,
+     * indicates whether the buffer will be released to the buffer stack
+     * manager.  When 0, software is responsible for releasing the buffer.
+     */
+    uint_reg_t hwb          : 1;
+    /**
+     * Encoded size of buffer.  Set by the ingress hardware for iDMA packet
+     * descriptors.  For eDMA descriptors, indicates the buffer size if .c
+     * indicates a chained packet.  If an eDMA descriptor is not chained and
+     * the .hwb bit is not set, this field is ignored and the size is
+     * specified by the .xfer_size field.
+     * 0 = 128 bytes
+     * 1 = 256 bytes
+     * 2 = 512 bytes
+     * 3 = 1024 bytes
+     * 4 = 1664 bytes
+     * 5 = 4096 bytes
+     * 6 = 10368 bytes
+     * 7 = 16384 bytes
+     */
+    uint_reg_t size         : 3;
+    /**
+     * Chaining configuration for the buffer.  Indicates that an ingress
+     * packet or egress command is chained across multiple buffers, with each
+     * buffer's size indicated by the .size field.
+     */
+    uint_reg_t c            : 2;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t c            : 2;
+    uint_reg_t size         : 3;
+    uint_reg_t hwb          : 1;
+    uint_reg_t inst         : 2;
+    uint_reg_t __reserved_1 : 3;
+    uint_reg_t stack_idx    : 5;
+    uint_reg_t __reserved_0 : 6;
+    int_reg_t va           : 42;
+#endif
+
+  };
+
+  /** Word access */
+  uint_reg_t words[2];
+} MPIPE_EDMA_DESC_t;
+
+/**
+ * MPIPE Packet Descriptor.
+ * The packet descriptor is filled by the mPIPE's classification,
+ * load-balancing, and buffer management services.  Some fields are consumed
+ * by mPIPE hardware, and others are consumed by Tile software.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+    /* Word 0 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Notification ring into which this packet descriptor is written.
+     * Typically written by load balancer, but can be overridden by
+     * classification program if NR is asserted.
+     */
+    uint_reg_t notif_ring   : 8;
+    /** Source channel for this packet.  Written by mPIPE DMA hardware. */
+    uint_reg_t channel      : 5;
+    /** Reserved. */
+    uint_reg_t __reserved_0 : 1;
+    /**
+     * MAC Error.
+     * Generated by the MAC interface.  Asserted if there was an overrun of
+     * the MAC's receive FIFO.  This condition generally only occurs if the
+     * mPIPE clock is running too slowly.
+     */
+    uint_reg_t me           : 1;
+    /**
+     * Truncation Error.
+     * Written by the iDMA hardware.  Asserted if packet was truncated due to
+     * insufficient space in iPkt buffer
+     */
+    uint_reg_t tr           : 1;
+    /**
+     * Written by the iDMA hardware.  Indicates the number of bytes written
+     * to Tile memory.  In general, this is the actual size of the packet as
+     * received from the MAC.  But if the packet is truncated due to running
+     * out of buffers or due to the iPkt buffer filling up, then the L2_SIZE
+     * will be reduced to reflect the actual number of valid bytes written to
+     * Tile memory.
+     */
+    uint_reg_t l2_size      : 14;
+    /**
+     * CRC Error.
+     * Generated by the MAC.  Asserted if MAC indicated an L2 CRC error or
+     * other L2 error (bad length etc.) on the packet.
+     */
+    uint_reg_t ce           : 1;
+    /**
+     * Cut Through.
+     * Written by the iDMA hardware.  Asserted if packet was not completely
+     * received before being sent to classifier.  L2_Size will indicate
+     * number of bytes received so far.
+     */
+    uint_reg_t ct           : 1;
+    /**
+     * Written by the classification program.  Used by the load balancer to
+     * select the ring into which this packet descriptor is written.
+     */
+    uint_reg_t bucket_id    : 13;
+    /** Reserved. */
+    uint_reg_t __reserved_1 : 3;
+    /**
+     * Checksum.
+     * Written by classification program.  When 1, the checksum engine will
+     * perform checksum based on the CSUM_SEED, CSUM_START, and CSUM_BYTES
+     * fields.  The result will be placed in CSUM_VAL.
+     */
+    uint_reg_t cs           : 1;
+    /**
+     * Notification Ring Select.
+     * Written by the classification program.  When 1, the NotifRingIDX is
+     * set by classification program rather than being set by load balancer.
+     */
+    uint_reg_t nr           : 1;
+    /**
+     * Written by classification program.  Indicates whether packet and
+     * descriptor should both be dropped, both be delivered, or only the
+     * descriptor should be delivered.
+     */
+    uint_reg_t dest         : 2;
+    /**
+     * General Purpose Sequence Number Enable.
+     * Written by the classification program.  When 1, the GP_SQN_SEL field
+     * contains the sequence number selector and the GP_SQN field will be
+     * replaced with the associated sequence number.  When clear, the GP_SQN
+     * field is left intact and be used as "Custom" bytes.
+     */
+    uint_reg_t sq           : 1;
+    /**
+     * TimeStamp Enable.
+     * Enable TimeStamp insertion.  When clear, timestamp field may be filled
+     * with custom data by classifier.  When set, hardware inserts the
+     * timestamp when the start of packet is received from the MAC.
+     */
+    uint_reg_t ts           : 1;
+    /**
+     * Packet Sequence Number Enable.
+     * Enable PacketSQN insertion.  When clear, PacketSQN field may be filled
+     * with custom data by classifier.  When set, hardware inserts the packet
+     * sequence number when the packet descriptor is written to a
+     * notification ring.
+     */
+    uint_reg_t ps           : 1;
+    /**
+     * Buffer Error.
+     * Written by the iDMA hardware.  Asserted if iDMA ran out of buffers
+     * while writing the packet. Software must still return any buffer
+     * descriptors whose C field indicates a valid descriptor was consumed.
+     */
+    uint_reg_t be           : 1;
+    /**
+     * Written by  the classification program.  The associated counter is
+     * incremented when the packet is sent.
+     */
+    uint_reg_t ctr0         : 5;
+    /** Reserved. */
+    uint_reg_t __reserved_2 : 3;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_2 : 3;
+    uint_reg_t ctr0         : 5;
+    uint_reg_t be           : 1;
+    uint_reg_t ps           : 1;
+    uint_reg_t ts           : 1;
+    uint_reg_t sq           : 1;
+    uint_reg_t dest         : 2;
+    uint_reg_t nr           : 1;
+    uint_reg_t cs           : 1;
+    uint_reg_t __reserved_1 : 3;
+    uint_reg_t bucket_id    : 13;
+    uint_reg_t ct           : 1;
+    uint_reg_t ce           : 1;
+    uint_reg_t l2_size      : 14;
+    uint_reg_t tr           : 1;
+    uint_reg_t me           : 1;
+    uint_reg_t __reserved_0 : 1;
+    uint_reg_t channel      : 5;
+    uint_reg_t notif_ring   : 8;
+#endif
+
+    /* Word 1 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Written by  the classification program.  The associated counter is
+     * incremented when the packet is sent.
+     */
+    uint_reg_t ctr1          : 5;
+    /** Reserved. */
+    uint_reg_t __reserved_3  : 3;
+    /**
+     * Written by classification program.  Indicates the start byte for
+     * checksum.  Relative to 1st byte received from MAC.
+     */
+    uint_reg_t csum_start    : 8;
+    /**
+     * Checksum seed written by classification program.  Overwritten with
+     * resultant checksum if CS bit is asserted.  The endianness of the CSUM
+     * value bits when viewed by Tile software match the packet byte order.
+     * That is, bits[7:0] of the resulting checksum value correspond to
+     * earlier (more significant) bytes in the packet.  To avoid classifier
+     * software from having to byte swap the CSUM_SEED, the iDMA checksum
+     * engine byte swaps the classifier's result before seeding the checksum
+     * calculation.  Thus, the CSUM_START byte of packet data is added to
+     * bits[15:8] of the CSUM_SEED field generated by the classifier.  This
+     * byte swap will be visible to Tile software if the CS bit is clear.
+     */
+    uint_reg_t csum_seed_val : 16;
+    /**
+     * Written by  the classification program.  Not interpreted by mPIPE
+     * hardware.
+     */
+    uint_reg_t custom0       : 32;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t custom0       : 32;
+    uint_reg_t csum_seed_val : 16;
+    uint_reg_t csum_start    : 8;
+    uint_reg_t __reserved_3  : 3;
+    uint_reg_t ctr1          : 5;
+#endif
+
+    /* Word 2 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Written by  the classification program.  Not interpreted by mPIPE
+     * hardware.
+     */
+    uint_reg_t custom1 : 64;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t custom1 : 64;
+#endif
+
+    /* Word 3 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Written by  the classification program.  Not interpreted by mPIPE
+     * hardware.
+     */
+    uint_reg_t custom2 : 64;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t custom2 : 64;
+#endif
+
+    /* Word 4 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Written by  the classification program.  Not interpreted by mPIPE
+     * hardware.
+     */
+    uint_reg_t custom3 : 64;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t custom3 : 64;
+#endif
+
+    /* Word 5 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Sequence number applied when packet is distributed.   Classifier
+     * selects which sequence number is to be applied by writing the 13-bit
+     * SQN-selector into this field.  For devices that support EXT_SQN (as
+     * indicated in IDMA_INFO.EXT_SQN_SUPPORT), the GP_SQN can be extended to
+     * 32-bits via the IDMA_CTL.EXT_SQN register.  In this case the
+     * PACKET_SQN will be reduced to 32 bits.
+     */
+    uint_reg_t gp_sqn     : 16;
+    /**
+     * Written by notification hardware.  The packet sequence number is
+     * incremented for each packet that wasn't dropped.
+     */
+    uint_reg_t packet_sqn : 48;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t packet_sqn : 48;
+    uint_reg_t gp_sqn     : 16;
+#endif
+
+    /* Word 6 */
+
+#ifndef __BIG_ENDIAN__
+    /**
+     * Written by hardware when the start-of-packet is received by the mPIPE
+     * from the MAC.  This is the nanoseconds part of the packet timestamp.
+     */
+    uint_reg_t time_stamp_ns  : 32;
+    /**
+     * Written by hardware when the start-of-packet is received by the mPIPE
+     * from the MAC.  This is the seconds part of the packet timestamp.
+     */
+    uint_reg_t time_stamp_sec : 32;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t time_stamp_sec : 32;
+    uint_reg_t time_stamp_ns  : 32;
+#endif
+
+    /* Word 7 */
+
+#ifndef __BIG_ENDIAN__
+    /** Virtual address.  Must be sign extended by consumer. */
+    int_reg_t va           : 42;
+    /** Reserved. */
+    uint_reg_t __reserved_4 : 6;
+    /** Index of the buffer stack to which this buffer belongs. */
+    uint_reg_t stack_idx    : 5;
+    /** Reserved. */
+    uint_reg_t __reserved_5 : 3;
+    /**
+     * Instance ID.  For devices that support automatic buffer return between
+     * mPIPE instances, this field indicates the buffer owner.  If the INST
+     * field does not match the mPIPE's instance number when a packet is
+     * egressed, buffers with HWB set will be returned to the other mPIPE
+     * instance.  Note that not all devices support multi-mPIPE buffer
+     * return.  The MPIPE_EDMA_INFO.REMOTE_BUFF_RTN_SUPPORT bit indicates
+     * whether the INST field in the buffer descriptor is populated by iDMA
+     * hardware.
+     */
+    uint_reg_t inst         : 2;
+    /**
+     * Always set to one by hardware in iDMA packet descriptors.  For eDMA,
+     * indicates whether the buffer will be released to the buffer stack
+     * manager.  When 0, software is responsible for releasing the buffer.
+     */
+    uint_reg_t hwb          : 1;
+    /**
+     * Encoded size of buffer.  Set by the ingress hardware for iDMA packet
+     * descriptors.  For eDMA descriptors, indicates the buffer size if .c
+     * indicates a chained packet.  If an eDMA descriptor is not chained and
+     * the .hwb bit is not set, this field is ignored and the size is
+     * specified by the .xfer_size field.
+     * 0 = 128 bytes
+     * 1 = 256 bytes
+     * 2 = 512 bytes
+     * 3 = 1024 bytes
+     * 4 = 1664 bytes
+     * 5 = 4096 bytes
+     * 6 = 10368 bytes
+     * 7 = 16384 bytes
+     */
+    uint_reg_t size         : 3;
+    /**
+     * Chaining configuration for the buffer.  Indicates that an ingress
+     * packet or egress command is chained across multiple buffers, with each
+     * buffer's size indicated by the .size field.
+     */
+    uint_reg_t c            : 2;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t c            : 2;
+    uint_reg_t size         : 3;
+    uint_reg_t hwb          : 1;
+    uint_reg_t inst         : 2;
+    uint_reg_t __reserved_5 : 3;
+    uint_reg_t stack_idx    : 5;
+    uint_reg_t __reserved_4 : 6;
+    int_reg_t va           : 42;
+#endif
+
+  };
+
+  /** Word access */
+  uint_reg_t words[8];
+} MPIPE_PDESC_t;
+#endif /* !defined(__ASSEMBLER__) */
+
+#endif /* !defined(__ARCH_MPIPE_SHM_H__) */
diff --git a/arch/tile/include/arch/mpipe_shm_def.h b/arch/tile/include/arch/mpipe_shm_def.h
new file mode 100644
index 00000000000..6124d39c831
--- /dev/null
+++ b/arch/tile/include/arch/mpipe_shm_def.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_MPIPE_SHM_DEF_H__
+#define __ARCH_MPIPE_SHM_DEF_H__
+#define MPIPE_EDMA_DESC_WORD1__C_VAL_UNCHAINED 0x0
+#define MPIPE_EDMA_DESC_WORD1__C_VAL_CHAINED 0x1
+#define MPIPE_EDMA_DESC_WORD1__C_VAL_NOT_RDY 0x2
+#define MPIPE_EDMA_DESC_WORD1__C_VAL_INVALID 0x3
+#endif /* !defined(__ARCH_MPIPE_SHM_DEF_H__) */
diff --git a/arch/tile/include/arch/spr_def.h b/arch/tile/include/arch/spr_def.h
index f548efeb2de..2de83e7aff3 100644
--- a/arch/tile/include/arch/spr_def.h
+++ b/arch/tile/include/arch/spr_def.h
@@ -11,15 +11,11 @@
  *   NON INFRINGEMENT.  See the GNU General Public License for
  *   more details.
  */
+#ifndef __ARCH_SPR_DEF_H__
+#define __ARCH_SPR_DEF_H__
 
-/* Include the proper base SPR definition file. */
-#ifdef __tilegx__
-#include <arch/spr_def_64.h>
-#else
-#include <arch/spr_def_32.h>
-#endif
+#include <uapi/arch/spr_def.h>
 
-#ifdef __KERNEL__
 
 /*
  * In addition to including the proper base SPR definition file, depending
@@ -60,8 +56,8 @@
 	_concat4(SPR_IPI_EVENT_, CONFIG_KERNEL_PL,,)
 #define SPR_IPI_EVENT_RESET_K \
 	_concat4(SPR_IPI_EVENT_RESET_, CONFIG_KERNEL_PL,,)
-#define SPR_IPI_MASK_SET_K \
-	_concat4(SPR_IPI_MASK_SET_, CONFIG_KERNEL_PL,,)
+#define SPR_IPI_EVENT_SET_K \
+	_concat4(SPR_IPI_EVENT_SET_, CONFIG_KERNEL_PL,,)
 #define INT_IPI_K \
 	_concat4(INT_IPI_, CONFIG_KERNEL_PL,,)
 
@@ -110,4 +106,4 @@
 #define INT_INTCTRL_K \
 	_concat4(INT_INTCTRL_, CONFIG_KERNEL_PL,,)
 
-#endif /* __KERNEL__ */
+#endif /* __ARCH_SPR_DEF_H__ */
diff --git a/arch/tile/include/arch/trio.h b/arch/tile/include/arch/trio.h
new file mode 100644
index 00000000000..c0ddedcae08
--- /dev/null
+++ b/arch/tile/include/arch/trio.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_TRIO_H__
+#define __ARCH_TRIO_H__
+
+#include <arch/abi.h>
+#include <arch/trio_def.h>
+
+#ifndef __ASSEMBLER__
+
+/*
+ * Map SQ Doorbell Format.
+ * This describes the format of the write-only doorbell register that exists
+ * in the last 8-bytes of the MAP_SQ_BASE/LIM range.  This register is only
+ * writable from PCIe space.  Writes to this register will not be written to
+ * Tile memory space and thus no IO VA translation is required if the last
+ * page of the BASE/LIM range is not otherwise written.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * When written with a 1, the associated MAP_SQ region's doorbell
+     * interrupt will be triggered once all previous writes are visible to
+     * Tile software.
+     */
+    uint_reg_t doorbell   : 1;
+    /*
+     * When written with a 1, the descriptor at the head of the associated
+     * MAP_SQ's FIFO will be dequeued.
+     */
+    uint_reg_t pop        : 1;
+    /* Reserved. */
+    uint_reg_t __reserved : 62;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved : 62;
+    uint_reg_t pop        : 1;
+    uint_reg_t doorbell   : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} TRIO_MAP_SQ_DOORBELL_FMT_t;
+
+
+/*
+ * Tile PIO Region Configuration - CFG Address Format.
+ * This register describes the address format for PIO accesses when the
+ * associated region is setup with TYPE=CFG.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Register Address (full byte address). */
+    uint_reg_t reg_addr     : 12;
+    /* Function Number */
+    uint_reg_t fn           : 3;
+    /* Device Number */
+    uint_reg_t dev          : 5;
+    /* BUS Number */
+    uint_reg_t bus          : 8;
+    /* Config Type: 0 for access to directly-attached device.  1 otherwise. */
+    uint_reg_t type         : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 1;
+    /*
+     * MAC select.  This must match the configuration in
+     * TILE_PIO_REGION_SETUP.MAC.
+     */
+    uint_reg_t mac          : 2;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 32;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_1 : 32;
+    uint_reg_t mac          : 2;
+    uint_reg_t __reserved_0 : 1;
+    uint_reg_t type         : 1;
+    uint_reg_t bus          : 8;
+    uint_reg_t dev          : 5;
+    uint_reg_t fn           : 3;
+    uint_reg_t reg_addr     : 12;
+#endif
+  };
+
+  uint_reg_t word;
+} TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR_t;
+#endif /* !defined(__ASSEMBLER__) */
+
+#endif /* !defined(__ARCH_TRIO_H__) */
diff --git a/arch/tile/include/arch/trio_constants.h b/arch/tile/include/arch/trio_constants.h
new file mode 100644
index 00000000000..85647e91a45
--- /dev/null
+++ b/arch/tile/include/arch/trio_constants.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+
+#ifndef __ARCH_TRIO_CONSTANTS_H__
+#define __ARCH_TRIO_CONSTANTS_H__
+
+#define TRIO_NUM_ASIDS 32
+#define TRIO_NUM_TLBS_PER_ASID 16
+
+#define TRIO_NUM_TPIO_REGIONS 8
+#define TRIO_LOG2_NUM_TPIO_REGIONS 3
+
+#define TRIO_NUM_MAP_MEM_REGIONS 32
+#define TRIO_LOG2_NUM_MAP_MEM_REGIONS 5
+#define TRIO_NUM_MAP_SQ_REGIONS 8
+#define TRIO_LOG2_NUM_MAP_SQ_REGIONS 3
+
+#define TRIO_LOG2_NUM_SQ_FIFO_ENTRIES 6
+
+#define TRIO_NUM_PUSH_DMA_RINGS 64
+
+#define TRIO_NUM_PULL_DMA_RINGS 64
+
+#endif /* __ARCH_TRIO_CONSTANTS_H__ */
diff --git a/arch/tile/include/arch/trio_def.h b/arch/tile/include/arch/trio_def.h
new file mode 100644
index 00000000000..e80500317dc
--- /dev/null
+++ b/arch/tile/include/arch/trio_def.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_TRIO_DEF_H__
+#define __ARCH_TRIO_DEF_H__
+#define TRIO_CFG_REGION_ADDR__REG_SHIFT 0
+#define TRIO_CFG_REGION_ADDR__INTFC_SHIFT 16
+#define TRIO_CFG_REGION_ADDR__INTFC_VAL_TRIO 0x0
+#define TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE 0x1
+#define TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_STANDARD 0x2
+#define TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_PROTECTED 0x3
+#define TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT 18
+#define TRIO_CFG_REGION_ADDR__PROT_SHIFT 20
+#define TRIO_PIO_REGIONS_ADDR__REGION_SHIFT 32
+#define TRIO_MAP_MEM_REG_INT0 0x1000000000
+#define TRIO_MAP_MEM_REG_INT1 0x1000000008
+#define TRIO_MAP_MEM_REG_INT2 0x1000000010
+#define TRIO_MAP_MEM_REG_INT3 0x1000000018
+#define TRIO_MAP_MEM_REG_INT4 0x1000000020
+#define TRIO_MAP_MEM_REG_INT5 0x1000000028
+#define TRIO_MAP_MEM_REG_INT6 0x1000000030
+#define TRIO_MAP_MEM_REG_INT7 0x1000000038
+#define TRIO_MAP_MEM_LIM__ADDR_SHIFT 12
+#define TRIO_MAP_MEM_SETUP__ORDER_MODE_VAL_UNORDERED 0x0
+#define TRIO_MAP_MEM_SETUP__ORDER_MODE_VAL_STRICT 0x1
+#define TRIO_MAP_MEM_SETUP__ORDER_MODE_VAL_REL_ORD 0x2
+#define TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT 30
+#endif /* !defined(__ARCH_TRIO_DEF_H__) */
diff --git a/arch/tile/include/arch/trio_pcie_intfc.h b/arch/tile/include/arch/trio_pcie_intfc.h
new file mode 100644
index 00000000000..0487fdb9d58
--- /dev/null
+++ b/arch/tile/include/arch/trio_pcie_intfc.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_TRIO_PCIE_INTFC_H__
+#define __ARCH_TRIO_PCIE_INTFC_H__
+
+#include <arch/abi.h>
+#include <arch/trio_pcie_intfc_def.h>
+
+#ifndef __ASSEMBLER__
+
+/*
+ * Port Configuration.
+ * Configuration of the PCIe Port
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Provides the state of the strapping pins for this port. */
+    uint_reg_t strap_state      : 3;
+    /* Reserved. */
+    uint_reg_t __reserved_0     : 1;
+    /*
+     * When 1, the device type will be overridden using OVD_DEV_TYPE_VAL.
+     * When 0, the device type is determined based on the STRAP_STATE.
+     */
+    uint_reg_t ovd_dev_type     : 1;
+    /* Provides the device type when OVD_DEV_TYPE is 1. */
+    uint_reg_t ovd_dev_type_val : 4;
+    /* Determines how link is trained. */
+    uint_reg_t train_mode       : 2;
+    /* Reserved. */
+    uint_reg_t __reserved_1     : 1;
+    /*
+     * For PCIe, used to flip physical RX lanes that were not properly wired.
+     *  This is not the same as lane reversal which is handled automatically
+     * during link training.  When 0, RX Lane0 must be wired to the link
+     * partner (either to its Lane0 or it's LaneN).  When RX_LANE_FLIP is 1,
+     * the highest numbered lane for this port becomes Lane0 and Lane0 does
+     * NOT have to be wired to the link partner.
+     */
+    uint_reg_t rx_lane_flip     : 1;
+    /*
+     * For PCIe, used to flip physical TX lanes that were not properly wired.
+     *  This is not the same as lane reversal which is handled automatically
+     * during link training.  When 0, TX Lane0 must be wired to the link
+     * partner (either to its Lane0 or it's LaneN).  When TX_LANE_FLIP is 1,
+     * the highest numbered lane for this port becomes Lane0 and Lane0 does
+     * NOT have to be wired to the link partner.
+     */
+    uint_reg_t tx_lane_flip     : 1;
+    /*
+     * For StreamIO port, configures the width of the port when TRAIN_MODE is
+     * not STRAP.
+     */
+    uint_reg_t stream_width     : 2;
+    /*
+     * For StreamIO port, configures the rate of the port when TRAIN_MODE is
+     * not STRAP.
+     */
+    uint_reg_t stream_rate      : 2;
+    /* Reserved. */
+    uint_reg_t __reserved_2     : 46;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_2     : 46;
+    uint_reg_t stream_rate      : 2;
+    uint_reg_t stream_width     : 2;
+    uint_reg_t tx_lane_flip     : 1;
+    uint_reg_t rx_lane_flip     : 1;
+    uint_reg_t __reserved_1     : 1;
+    uint_reg_t train_mode       : 2;
+    uint_reg_t ovd_dev_type_val : 4;
+    uint_reg_t ovd_dev_type     : 1;
+    uint_reg_t __reserved_0     : 1;
+    uint_reg_t strap_state      : 3;
+#endif
+  };
+
+  uint_reg_t word;
+} TRIO_PCIE_INTFC_PORT_CONFIG_t;
+
+/*
+ * Port Status.
+ * Status of the PCIe Port.  This register applies to the StreamIO port when
+ * StreamIO is enabled.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * Indicates the DL state of the port.  When 1, the port is up and ready
+     * to receive traffic.
+     */
+    uint_reg_t dl_up        : 1;
+    /*
+     * Indicates the number of times the link has gone down.  Clears on read.
+     */
+    uint_reg_t dl_down_cnt  : 7;
+    /* Indicates the SERDES PLL has spun up and is providing a valid clock. */
+    uint_reg_t clock_ready  : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 7;
+    /* Device revision ID. */
+    uint_reg_t device_rev   : 8;
+    /* Link state (PCIe). */
+    uint_reg_t ltssm_state  : 6;
+    /* Link power management state (PCIe). */
+    uint_reg_t pm_state     : 3;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 31;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_1 : 31;
+    uint_reg_t pm_state     : 3;
+    uint_reg_t ltssm_state  : 6;
+    uint_reg_t device_rev   : 8;
+    uint_reg_t __reserved_0 : 7;
+    uint_reg_t clock_ready  : 1;
+    uint_reg_t dl_down_cnt  : 7;
+    uint_reg_t dl_up        : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} TRIO_PCIE_INTFC_PORT_STATUS_t;
+
+/*
+ * Transmit FIFO Control.
+ * Contains TX FIFO thresholds.  These registers are for diagnostics purposes
+ * only.  Changing these values causes undefined behavior.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * Almost-Empty level for TX0 data.  Typically set to at least
+     * roundup(38.0*M/N) where N=tclk frequency and M=MAC symbol rate in MHz
+     * for a x4 port (250MHz).
+     */
+    uint_reg_t tx0_data_ae_lvl : 7;
+    /* Reserved. */
+    uint_reg_t __reserved_0    : 1;
+    /* Almost-Empty level for TX1 data. */
+    uint_reg_t tx1_data_ae_lvl : 7;
+    /* Reserved. */
+    uint_reg_t __reserved_1    : 1;
+    /* Almost-Full level for TX0 data. */
+    uint_reg_t tx0_data_af_lvl : 7;
+    /* Reserved. */
+    uint_reg_t __reserved_2    : 1;
+    /* Almost-Full level for TX1 data. */
+    uint_reg_t tx1_data_af_lvl : 7;
+    /* Reserved. */
+    uint_reg_t __reserved_3    : 1;
+    /* Almost-Full level for TX0 info. */
+    uint_reg_t tx0_info_af_lvl : 5;
+    /* Reserved. */
+    uint_reg_t __reserved_4    : 3;
+    /* Almost-Full level for TX1 info. */
+    uint_reg_t tx1_info_af_lvl : 5;
+    /* Reserved. */
+    uint_reg_t __reserved_5    : 3;
+    /*
+     * This register provides performance adjustment for high bandwidth
+     * flows.  The MAC will assert almost-full to TRIO if non-posted credits
+     * fall below this level.  Note that setting this larger than the initial
+     * PORT_CREDIT.NPH value will cause READS to never be sent.  If the
+     * initial credit value from the link partner is smaller than this value
+     * when the link comes up, the value will be reset to the initial credit
+     * value to prevent lockup.
+     */
+    uint_reg_t min_np_credits  : 8;
+    /*
+     * This register provides performance adjustment for high bandwidth
+     * flows.  The MAC will assert almost-full to TRIO if posted credits fall
+     * below this level.  Note that setting this larger than the initial
+     * PORT_CREDIT.PH value will cause WRITES to never be sent.  If the
+     * initial credit value from the link partner is smaller than this value
+     * when the link comes up, the value will be reset to the initial credit
+     * value to prevent lockup.
+     */
+    uint_reg_t min_p_credits   : 8;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t min_p_credits   : 8;
+    uint_reg_t min_np_credits  : 8;
+    uint_reg_t __reserved_5    : 3;
+    uint_reg_t tx1_info_af_lvl : 5;
+    uint_reg_t __reserved_4    : 3;
+    uint_reg_t tx0_info_af_lvl : 5;
+    uint_reg_t __reserved_3    : 1;
+    uint_reg_t tx1_data_af_lvl : 7;
+    uint_reg_t __reserved_2    : 1;
+    uint_reg_t tx0_data_af_lvl : 7;
+    uint_reg_t __reserved_1    : 1;
+    uint_reg_t tx1_data_ae_lvl : 7;
+    uint_reg_t __reserved_0    : 1;
+    uint_reg_t tx0_data_ae_lvl : 7;
+#endif
+  };
+
+  uint_reg_t word;
+} TRIO_PCIE_INTFC_TX_FIFO_CTL_t;
+#endif /* !defined(__ASSEMBLER__) */
+
+#endif /* !defined(__ARCH_TRIO_PCIE_INTFC_H__) */
diff --git a/arch/tile/include/arch/trio_pcie_intfc_def.h b/arch/tile/include/arch/trio_pcie_intfc_def.h
new file mode 100644
index 00000000000..d3fd6781fb2
--- /dev/null
+++ b/arch/tile/include/arch/trio_pcie_intfc_def.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_TRIO_PCIE_INTFC_DEF_H__
+#define __ARCH_TRIO_PCIE_INTFC_DEF_H__
+#define TRIO_PCIE_INTFC_MAC_INT_STS 0x0000
+#define TRIO_PCIE_INTFC_MAC_INT_STS__INT_LEVEL_MASK  0xf000
+#define TRIO_PCIE_INTFC_PORT_CONFIG 0x0018
+#define TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_DISABLED 0x0
+#define TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT 0x1
+#define TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC 0x2
+#define TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT_G1 0x3
+#define TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC_G1 0x4
+#define TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_XLINK 0x5
+#define TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_STREAM_X1 0x6
+#define TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_STREAM_X4 0x7
+#define TRIO_PCIE_INTFC_PORT_STATUS 0x0020
+#define TRIO_PCIE_INTFC_TX_FIFO_CTL 0x0050
+#endif /* !defined(__ARCH_TRIO_PCIE_INTFC_DEF_H__) */
diff --git a/arch/tile/include/arch/trio_pcie_rc.h b/arch/tile/include/arch/trio_pcie_rc.h
new file mode 100644
index 00000000000..6a25d0aca85
--- /dev/null
+++ b/arch/tile/include/arch/trio_pcie_rc.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_TRIO_PCIE_RC_H__
+#define __ARCH_TRIO_PCIE_RC_H__
+
+#include <arch/abi.h>
+#include <arch/trio_pcie_rc_def.h>
+
+#ifndef __ASSEMBLER__
+
+/* Device Capabilities Register. */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * Max_Payload_Size Supported, writablethrough the MAC_STANDARD interface
+     */
+    uint_reg_t mps_sup                    : 3;
+    /*
+     * This field is writable through the MAC_STANDARD interface.  However,
+     * Phantom Function is not  supported. Therefore, the application must
+     * not write any value other than 0x0 to this  field.
+     */
+    uint_reg_t phantom_function_supported : 2;
+    /* This bit is writable through the MAC_STANDARD interface. */
+    uint_reg_t ext_tag_field_supported    : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_0               : 3;
+    /* Endpoint L1 Acceptable Latency Must be 0x0 for non-Endpoint devices. */
+    uint_reg_t l1_lat                     : 3;
+    /*
+     * Undefined since PCI Express 1.1 (Was Attention Button Present for PCI
+     * Express 1.0a)
+     */
+    uint_reg_t r1                         : 1;
+    /*
+     * Undefined since PCI Express 1.1 (Was Attention Indicator Present for
+     * PCI  Express 1.0a)
+     */
+    uint_reg_t r2                         : 1;
+    /*
+     * Undefined since PCI Express 1.1 (Was Power Indicator Present for PCI
+     * Express 1.0a)
+     */
+    uint_reg_t r3                         : 1;
+    /*
+     * Role-Based Error Reporting, writable through the MAC_STANDARD
+     * interface.  Required to be set for device compliant to 1.1  spec and
+     * later.
+     */
+    uint_reg_t rer                        : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_1               : 2;
+    /* Captured Slot Power Limit Value Upstream port only. */
+    uint_reg_t slot_pwr_lim               : 8;
+    /* Captured Slot Power Limit Scale Upstream port only. */
+    uint_reg_t slot_pwr_scale             : 2;
+    /* Reserved. */
+    uint_reg_t __reserved_2               : 4;
+    /* Endpoint L0s Acceptable LatencyMust be 0x0 for non-Endpoint devices. */
+    uint_reg_t l0s_lat                    : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_3               : 31;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_3               : 31;
+    uint_reg_t l0s_lat                    : 1;
+    uint_reg_t __reserved_2               : 4;
+    uint_reg_t slot_pwr_scale             : 2;
+    uint_reg_t slot_pwr_lim               : 8;
+    uint_reg_t __reserved_1               : 2;
+    uint_reg_t rer                        : 1;
+    uint_reg_t r3                         : 1;
+    uint_reg_t r2                         : 1;
+    uint_reg_t r1                         : 1;
+    uint_reg_t l1_lat                     : 3;
+    uint_reg_t __reserved_0               : 3;
+    uint_reg_t ext_tag_field_supported    : 1;
+    uint_reg_t phantom_function_supported : 2;
+    uint_reg_t mps_sup                    : 3;
+#endif
+  };
+
+  uint_reg_t word;
+} TRIO_PCIE_RC_DEVICE_CAP_t;
+
+/* Device Control Register. */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Correctable Error Reporting Enable */
+    uint_reg_t cor_err_ena      : 1;
+    /* Non-Fatal Error Reporting Enable */
+    uint_reg_t nf_err_ena       : 1;
+    /* Fatal Error Reporting Enable */
+    uint_reg_t fatal_err_ena    : 1;
+    /* Unsupported Request Reporting Enable */
+    uint_reg_t ur_ena           : 1;
+    /* Relaxed orderring enable */
+    uint_reg_t ro_ena           : 1;
+    /* Max Payload Size */
+    uint_reg_t max_payload_size : 3;
+    /* Extended Tag Field Enable */
+    uint_reg_t ext_tag          : 1;
+    /* Phantom Function Enable */
+    uint_reg_t ph_fn_ena        : 1;
+    /* AUX Power PM Enable */
+    uint_reg_t aux_pm_ena       : 1;
+    /* Enable NoSnoop */
+    uint_reg_t no_snoop         : 1;
+    /* Max read request size */
+    uint_reg_t max_read_req_sz  : 3;
+    /* Reserved. */
+    uint_reg_t __reserved       : 49;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved       : 49;
+    uint_reg_t max_read_req_sz  : 3;
+    uint_reg_t no_snoop         : 1;
+    uint_reg_t aux_pm_ena       : 1;
+    uint_reg_t ph_fn_ena        : 1;
+    uint_reg_t ext_tag          : 1;
+    uint_reg_t max_payload_size : 3;
+    uint_reg_t ro_ena           : 1;
+    uint_reg_t ur_ena           : 1;
+    uint_reg_t fatal_err_ena    : 1;
+    uint_reg_t nf_err_ena       : 1;
+    uint_reg_t cor_err_ena      : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} TRIO_PCIE_RC_DEVICE_CONTROL_t;
+#endif /* !defined(__ASSEMBLER__) */
+
+#endif /* !defined(__ARCH_TRIO_PCIE_RC_H__) */
diff --git a/arch/tile/include/arch/trio_pcie_rc_def.h b/arch/tile/include/arch/trio_pcie_rc_def.h
new file mode 100644
index 00000000000..74081a65b6f
--- /dev/null
+++ b/arch/tile/include/arch/trio_pcie_rc_def.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_TRIO_PCIE_RC_DEF_H__
+#define __ARCH_TRIO_PCIE_RC_DEF_H__
+#define TRIO_PCIE_RC_DEVICE_CAP 0x0074
+#define TRIO_PCIE_RC_DEVICE_CONTROL 0x0078
+#define TRIO_PCIE_RC_DEVICE_ID_VEN_ID 0x0000
+#define TRIO_PCIE_RC_DEVICE_ID_VEN_ID__DEV_ID_SHIFT 16
+#define TRIO_PCIE_RC_REVISION_ID 0x0008
+#endif /* !defined(__ARCH_TRIO_PCIE_RC_DEF_H__) */
diff --git a/arch/tile/include/arch/trio_shm.h b/arch/tile/include/arch/trio_shm.h
new file mode 100644
index 00000000000..3382e38245a
--- /dev/null
+++ b/arch/tile/include/arch/trio_shm.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+
+#ifndef __ARCH_TRIO_SHM_H__
+#define __ARCH_TRIO_SHM_H__
+
+#include <arch/abi.h>
+#include <arch/trio_shm_def.h>
+
+#ifndef __ASSEMBLER__
+/**
+ * TRIO DMA Descriptor.
+ * The TRIO DMA descriptor is written by software and consumed by hardware.
+ * It is used to specify the location of transaction data in the IO and Tile
+ * domains.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+    /* Word 0 */
+
+#ifndef __BIG_ENDIAN__
+    /** Tile side virtual address. */
+    int_reg_t va           : 42;
+    /**
+     * Encoded size of buffer used on push DMA when C=1:
+     * 0 = 128 bytes
+     * 1 = 256 bytes
+     * 2 = 512 bytes
+     * 3 = 1024 bytes
+     * 4 = 1664 bytes
+     * 5 = 4096 bytes
+     * 6 = 10368 bytes
+     * 7 = 16384 bytes
+     */
+    uint_reg_t bsz          : 3;
+    /**
+     * Chaining designation.  Always zero for pull DMA
+     * 0 : Unchained buffer pointer
+     * 1 : Chained buffer pointer.  Next buffer descriptor (e.g. VA) stored
+     * in 1st 8-bytes in buffer.  For chained buffers, first 8-bytes of each
+     * buffer contain the next buffer descriptor formatted exactly like a PDE
+     * buffer descriptor.  This allows a chained PDE buffer to be sent using
+     * push DMA.
+     */
+    uint_reg_t c            : 1;
+    /**
+     * Notification interrupt will be delivered when the transaction has
+     * completed (all data has been read from or written to the Tile-side
+     * buffer).
+     */
+    uint_reg_t notif        : 1;
+    /**
+     * When 0, the XSIZE field specifies the total byte count for the
+     * transaction.  When 1, the XSIZE field is encoded as 2^(N+14) for N in
+     * {0..6}:
+     * 0 = 16KB
+     * 1 = 32KB
+     * 2 = 64KB
+     * 3 = 128KB
+     * 4 = 256KB
+     * 5 = 512KB
+     * 6 = 1MB
+     * All other encodings of the XSIZE field are reserved when SMOD=1
+     */
+    uint_reg_t smod         : 1;
+    /**
+     * Total number of bytes to move for this transaction.   When SMOD=1,
+     * this field is encoded - see SMOD description.
+     */
+    uint_reg_t xsize        : 14;
+    /** Reserved. */
+    uint_reg_t __reserved_0 : 1;
+    /**
+     * Generation number.  Used to indicate a valid descriptor in ring.  When
+     * a new descriptor is written into the ring, software must toggle this
+     * bit.  The net effect is that the GEN bit being written into new
+     * descriptors toggles each time the ring tail pointer wraps.
+     */
+    uint_reg_t gen          : 1;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t gen          : 1;
+    uint_reg_t __reserved_0 : 1;
+    uint_reg_t xsize        : 14;
+    uint_reg_t smod         : 1;
+    uint_reg_t notif        : 1;
+    uint_reg_t c            : 1;
+    uint_reg_t bsz          : 3;
+    int_reg_t va           : 42;
+#endif
+
+    /* Word 1 */
+
+#ifndef __BIG_ENDIAN__
+    /** IO-side address */
+    uint_reg_t io_address : 64;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t io_address : 64;
+#endif
+
+  };
+
+  /** Word access */
+  uint_reg_t words[2];
+} TRIO_DMA_DESC_t;
+#endif /* !defined(__ASSEMBLER__) */
+
+#endif /* !defined(__ARCH_TRIO_SHM_H__) */
diff --git a/arch/tile/include/arch/trio_shm_def.h b/arch/tile/include/arch/trio_shm_def.h
new file mode 100644
index 00000000000..72a59c88b06
--- /dev/null
+++ b/arch/tile/include/arch/trio_shm_def.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_TRIO_SHM_DEF_H__
+#define __ARCH_TRIO_SHM_DEF_H__
+#endif /* !defined(__ARCH_TRIO_SHM_DEF_H__) */
diff --git a/arch/tile/include/arch/uart.h b/arch/tile/include/arch/uart.h
new file mode 100644
index 00000000000..07966970ada
--- /dev/null
+++ b/arch/tile/include/arch/uart.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_UART_H__
+#define __ARCH_UART_H__
+
+#include <arch/abi.h>
+#include <arch/uart_def.h>
+
+#ifndef __ASSEMBLER__
+
+/* Divisor. */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * Baud Rate Divisor.  Desired_baud_rate = REF_CLK frequency / (baud *
+     * 16).
+     *                       Note: REF_CLK is always 125 MHz, the default
+     * divisor = 68, baud rate = 125M/(68*16) = 115200 baud.
+     */
+    uint_reg_t divisor    : 12;
+    /* Reserved. */
+    uint_reg_t __reserved : 52;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved : 52;
+    uint_reg_t divisor    : 12;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_DIVISOR_t;
+
+/* FIFO Count. */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /*
+     * n: n active entries in the receive FIFO (max is 2**8). Each entry has
+     * 8 bits.
+     * 0: no active entry in the receive FIFO (that is empty).
+     */
+    uint_reg_t rfifo_count  : 9;
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 7;
+    /*
+     * n: n active entries in the transmit FIFO (max is 2**8). Each entry has
+     * 8 bits.
+     * 0: no active entry in the transmit FIFO (that is empty).
+     */
+    uint_reg_t tfifo_count  : 9;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 7;
+    /*
+     * n: n active entries in the write FIFO (max is 2**2). Each entry has 8
+     * bits.
+     * 0: no active entry in the write FIFO (that is empty).
+     */
+    uint_reg_t wfifo_count  : 3;
+    /* Reserved. */
+    uint_reg_t __reserved_2 : 29;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_2 : 29;
+    uint_reg_t wfifo_count  : 3;
+    uint_reg_t __reserved_1 : 7;
+    uint_reg_t tfifo_count  : 9;
+    uint_reg_t __reserved_0 : 7;
+    uint_reg_t rfifo_count  : 9;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_FIFO_COUNT_t;
+
+/* FLAG. */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 1;
+    /* 1: receive FIFO is empty */
+    uint_reg_t rfifo_empty  : 1;
+    /* 1: write FIFO is empty. */
+    uint_reg_t wfifo_empty  : 1;
+    /* 1: transmit FIFO is empty. */
+    uint_reg_t tfifo_empty  : 1;
+    /* 1: receive FIFO is full. */
+    uint_reg_t rfifo_full   : 1;
+    /* 1: write FIFO is full. */
+    uint_reg_t wfifo_full   : 1;
+    /* 1: transmit FIFO is full. */
+    uint_reg_t tfifo_full   : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 57;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_1 : 57;
+    uint_reg_t tfifo_full   : 1;
+    uint_reg_t wfifo_full   : 1;
+    uint_reg_t rfifo_full   : 1;
+    uint_reg_t tfifo_empty  : 1;
+    uint_reg_t wfifo_empty  : 1;
+    uint_reg_t rfifo_empty  : 1;
+    uint_reg_t __reserved_0 : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_FLAG_t;
+
+/*
+ * Interrupt Vector Mask.
+ * Each bit in this register corresponds to a specific interrupt. When set,
+ * the associated interrupt will not be dispatched.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Read data FIFO read and no data available */
+    uint_reg_t rdat_err       : 1;
+    /* Write FIFO was written but it was full */
+    uint_reg_t wdat_err       : 1;
+    /* Stop bit not found when current data was received */
+    uint_reg_t frame_err      : 1;
+    /* Parity error was detected when current data was received */
+    uint_reg_t parity_err     : 1;
+    /* Data was received but the receive FIFO was full */
+    uint_reg_t rfifo_overflow : 1;
+    /*
+     * An almost full event is reached when data is to be written to the
+     * receive FIFO, and the receive FIFO has more than or equal to
+     * BUFFER_THRESHOLD.RFIFO_AFULL bytes.
+     */
+    uint_reg_t rfifo_afull    : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_0   : 1;
+    /* An entry in the transmit FIFO was popped */
+    uint_reg_t tfifo_re       : 1;
+    /* An entry has been pushed into the receive FIFO */
+    uint_reg_t rfifo_we       : 1;
+    /* An entry of the write FIFO has been popped */
+    uint_reg_t wfifo_re       : 1;
+    /* Rshim read receive FIFO in protocol mode */
+    uint_reg_t rfifo_err      : 1;
+    /*
+     * An almost empty event is reached when data is to be read from the
+     * transmit FIFO, and the transmit FIFO has less than or equal to
+     * BUFFER_THRESHOLD.TFIFO_AEMPTY bytes.
+     */
+    uint_reg_t tfifo_aempty   : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_1   : 52;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_1   : 52;
+    uint_reg_t tfifo_aempty   : 1;
+    uint_reg_t rfifo_err      : 1;
+    uint_reg_t wfifo_re       : 1;
+    uint_reg_t rfifo_we       : 1;
+    uint_reg_t tfifo_re       : 1;
+    uint_reg_t __reserved_0   : 1;
+    uint_reg_t rfifo_afull    : 1;
+    uint_reg_t rfifo_overflow : 1;
+    uint_reg_t parity_err     : 1;
+    uint_reg_t frame_err      : 1;
+    uint_reg_t wdat_err       : 1;
+    uint_reg_t rdat_err       : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_INTERRUPT_MASK_t;
+
+/*
+ * Interrupt vector, write-one-to-clear.
+ * Each bit in this register corresponds to a specific interrupt. Hardware
+ * sets the bit when the associated condition has occurred. Writing a 1
+ * clears the status bit.
+ */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Read data FIFO read and no data available */
+    uint_reg_t rdat_err       : 1;
+    /* Write FIFO was written but it was full */
+    uint_reg_t wdat_err       : 1;
+    /* Stop bit not found when current data was received */
+    uint_reg_t frame_err      : 1;
+    /* Parity error was detected when current data was received */
+    uint_reg_t parity_err     : 1;
+    /* Data was received but the receive FIFO was full */
+    uint_reg_t rfifo_overflow : 1;
+    /*
+     * Data was received and the receive FIFO is now almost full (more than
+     * BUFFER_THRESHOLD.RFIFO_AFULL bytes in it)
+     */
+    uint_reg_t rfifo_afull    : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_0   : 1;
+    /* An entry in the transmit FIFO was popped */
+    uint_reg_t tfifo_re       : 1;
+    /* An entry has been pushed into the receive FIFO */
+    uint_reg_t rfifo_we       : 1;
+    /* An entry of the write FIFO has been popped */
+    uint_reg_t wfifo_re       : 1;
+    /* Rshim read receive FIFO in protocol mode */
+    uint_reg_t rfifo_err      : 1;
+    /*
+     * Data was read from the transmit FIFO and now it is almost empty (less
+     * than or equal to BUFFER_THRESHOLD.TFIFO_AEMPTY bytes in it).
+     */
+    uint_reg_t tfifo_aempty   : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_1   : 52;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_1   : 52;
+    uint_reg_t tfifo_aempty   : 1;
+    uint_reg_t rfifo_err      : 1;
+    uint_reg_t wfifo_re       : 1;
+    uint_reg_t rfifo_we       : 1;
+    uint_reg_t tfifo_re       : 1;
+    uint_reg_t __reserved_0   : 1;
+    uint_reg_t rfifo_afull    : 1;
+    uint_reg_t rfifo_overflow : 1;
+    uint_reg_t parity_err     : 1;
+    uint_reg_t frame_err      : 1;
+    uint_reg_t wdat_err       : 1;
+    uint_reg_t rdat_err       : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_INTERRUPT_STATUS_t;
+
+/* Type. */
+
+__extension__
+typedef union
+{
+  struct
+  {
+#ifndef __BIG_ENDIAN__
+    /* Number of stop bits, rx and tx */
+    uint_reg_t sbits        : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_0 : 1;
+    /* Data word size, rx and tx */
+    uint_reg_t dbits        : 1;
+    /* Reserved. */
+    uint_reg_t __reserved_1 : 1;
+    /* Parity selection, rx and tx */
+    uint_reg_t ptype        : 3;
+    /* Reserved. */
+    uint_reg_t __reserved_2 : 57;
+#else   /* __BIG_ENDIAN__ */
+    uint_reg_t __reserved_2 : 57;
+    uint_reg_t ptype        : 3;
+    uint_reg_t __reserved_1 : 1;
+    uint_reg_t dbits        : 1;
+    uint_reg_t __reserved_0 : 1;
+    uint_reg_t sbits        : 1;
+#endif
+  };
+
+  uint_reg_t word;
+} UART_TYPE_t;
+#endif /* !defined(__ASSEMBLER__) */
+
+#endif /* !defined(__ARCH_UART_H__) */
diff --git a/arch/tile/include/arch/uart_def.h b/arch/tile/include/arch/uart_def.h
new file mode 100644
index 00000000000..42bcaf53537
--- /dev/null
+++ b/arch/tile/include/arch/uart_def.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_UART_DEF_H__
+#define __ARCH_UART_DEF_H__
+#define UART_DIVISOR 0x0158
+#define UART_FIFO_COUNT 0x0110
+#define UART_FLAG 0x0108
+#define UART_INTERRUPT_MASK 0x0208
+#define UART_INTERRUPT_MASK__RDAT_ERR_SHIFT 0
+#define UART_INTERRUPT_MASK__RDAT_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__RDAT_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RDAT_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__RDAT_ERR_MASK  0x1
+#define UART_INTERRUPT_MASK__RDAT_ERR_FIELD 0,0
+#define UART_INTERRUPT_MASK__WDAT_ERR_SHIFT 1
+#define UART_INTERRUPT_MASK__WDAT_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__WDAT_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__WDAT_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__WDAT_ERR_MASK  0x2
+#define UART_INTERRUPT_MASK__WDAT_ERR_FIELD 1,1
+#define UART_INTERRUPT_MASK__FRAME_ERR_SHIFT 2
+#define UART_INTERRUPT_MASK__FRAME_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__FRAME_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__FRAME_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__FRAME_ERR_MASK  0x4
+#define UART_INTERRUPT_MASK__FRAME_ERR_FIELD 2,2
+#define UART_INTERRUPT_MASK__PARITY_ERR_SHIFT 3
+#define UART_INTERRUPT_MASK__PARITY_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__PARITY_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__PARITY_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__PARITY_ERR_MASK  0x8
+#define UART_INTERRUPT_MASK__PARITY_ERR_FIELD 3,3
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_SHIFT 4
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_WIDTH 1
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_RMASK 0x1
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_MASK  0x10
+#define UART_INTERRUPT_MASK__RFIFO_OVERFLOW_FIELD 4,4
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_SHIFT 5
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_WIDTH 1
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_RMASK 0x1
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_MASK  0x20
+#define UART_INTERRUPT_MASK__RFIFO_AFULL_FIELD 5,5
+#define UART_INTERRUPT_MASK__TFIFO_RE_SHIFT 7
+#define UART_INTERRUPT_MASK__TFIFO_RE_WIDTH 1
+#define UART_INTERRUPT_MASK__TFIFO_RE_RESET_VAL 1
+#define UART_INTERRUPT_MASK__TFIFO_RE_RMASK 0x1
+#define UART_INTERRUPT_MASK__TFIFO_RE_MASK  0x80
+#define UART_INTERRUPT_MASK__TFIFO_RE_FIELD 7,7
+#define UART_INTERRUPT_MASK__RFIFO_WE_SHIFT 8
+#define UART_INTERRUPT_MASK__RFIFO_WE_WIDTH 1
+#define UART_INTERRUPT_MASK__RFIFO_WE_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RFIFO_WE_RMASK 0x1
+#define UART_INTERRUPT_MASK__RFIFO_WE_MASK  0x100
+#define UART_INTERRUPT_MASK__RFIFO_WE_FIELD 8,8
+#define UART_INTERRUPT_MASK__WFIFO_RE_SHIFT 9
+#define UART_INTERRUPT_MASK__WFIFO_RE_WIDTH 1
+#define UART_INTERRUPT_MASK__WFIFO_RE_RESET_VAL 1
+#define UART_INTERRUPT_MASK__WFIFO_RE_RMASK 0x1
+#define UART_INTERRUPT_MASK__WFIFO_RE_MASK  0x200
+#define UART_INTERRUPT_MASK__WFIFO_RE_FIELD 9,9
+#define UART_INTERRUPT_MASK__RFIFO_ERR_SHIFT 10
+#define UART_INTERRUPT_MASK__RFIFO_ERR_WIDTH 1
+#define UART_INTERRUPT_MASK__RFIFO_ERR_RESET_VAL 1
+#define UART_INTERRUPT_MASK__RFIFO_ERR_RMASK 0x1
+#define UART_INTERRUPT_MASK__RFIFO_ERR_MASK  0x400
+#define UART_INTERRUPT_MASK__RFIFO_ERR_FIELD 10,10
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_SHIFT 11
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_WIDTH 1
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_RESET_VAL 1
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_RMASK 0x1
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_MASK  0x800
+#define UART_INTERRUPT_MASK__TFIFO_AEMPTY_FIELD 11,11
+#define UART_INTERRUPT_STATUS 0x0200
+#define UART_RECEIVE_DATA 0x0148
+#define UART_TRANSMIT_DATA 0x0140
+#define UART_TYPE 0x0160
+#define UART_TYPE__SBITS_SHIFT 0
+#define UART_TYPE__SBITS_WIDTH 1
+#define UART_TYPE__SBITS_RESET_VAL 1
+#define UART_TYPE__SBITS_RMASK 0x1
+#define UART_TYPE__SBITS_MASK  0x1
+#define UART_TYPE__SBITS_FIELD 0,0
+#define UART_TYPE__SBITS_VAL_ONE_SBITS 0x0
+#define UART_TYPE__SBITS_VAL_TWO_SBITS 0x1
+#define UART_TYPE__DBITS_SHIFT 2
+#define UART_TYPE__DBITS_WIDTH 1
+#define UART_TYPE__DBITS_RESET_VAL 0
+#define UART_TYPE__DBITS_RMASK 0x1
+#define UART_TYPE__DBITS_MASK  0x4
+#define UART_TYPE__DBITS_FIELD 2,2
+#define UART_TYPE__DBITS_VAL_EIGHT_DBITS 0x0
+#define UART_TYPE__DBITS_VAL_SEVEN_DBITS 0x1
+#define UART_TYPE__PTYPE_SHIFT 4
+#define UART_TYPE__PTYPE_WIDTH 3
+#define UART_TYPE__PTYPE_RESET_VAL 3
+#define UART_TYPE__PTYPE_RMASK 0x7
+#define UART_TYPE__PTYPE_MASK  0x70
+#define UART_TYPE__PTYPE_FIELD 4,6
+#define UART_TYPE__PTYPE_VAL_NONE 0x0
+#define UART_TYPE__PTYPE_VAL_MARK 0x1
+#define UART_TYPE__PTYPE_VAL_SPACE 0x2
+#define UART_TYPE__PTYPE_VAL_EVEN 0x3
+#define UART_TYPE__PTYPE_VAL_ODD 0x4
+#endif /* !defined(__ARCH_UART_DEF_H__) */
diff --git a/arch/tile/include/arch/usb_host.h b/arch/tile/include/arch/usb_host.h
new file mode 100644
index 00000000000..d09f3268396
--- /dev/null
+++ b/arch/tile/include/arch/usb_host.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_USB_HOST_H__
+#define __ARCH_USB_HOST_H__
+
+#include <arch/abi.h>
+#include <arch/usb_host_def.h>
+
+#ifndef __ASSEMBLER__
+#endif /* !defined(__ASSEMBLER__) */
+
+#endif /* !defined(__ARCH_USB_HOST_H__) */
diff --git a/arch/tile/include/arch/usb_host_def.h b/arch/tile/include/arch/usb_host_def.h
new file mode 100644
index 00000000000..aeed7753e8e
--- /dev/null
+++ b/arch/tile/include/arch/usb_host_def.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Machine-generated file; do not edit. */
+
+#ifndef __ARCH_USB_HOST_DEF_H__
+#define __ARCH_USB_HOST_DEF_H__
+#endif /* !defined(__ARCH_USB_HOST_DEF_H__) */
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index 0bb42642343..0aa5675e702 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -1,33 +1,32 @@
-include include/asm-generic/Kbuild.asm
 
 header-y += ../arch/
 
-header-y += ucontext.h
-header-y += hardwall.h
-
 generic-y += bug.h
 generic-y += bugs.h
+generic-y += clkdev.h
 generic-y += cputime.h
-generic-y += device.h
 generic-y += div64.h
 generic-y += emergency-restart.h
 generic-y += errno.h
+generic-y += exec.h
 generic-y += fb.h
 generic-y += fcntl.h
+generic-y += hash.h
+generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ioctls.h
-generic-y += ipc.h
 generic-y += ipcbuf.h
 generic-y += irq_regs.h
-generic-y += kdebug.h
 generic-y += local.h
-generic-y += module.h
+generic-y += local64.h
+generic-y += mcs_spinlock.h
 generic-y += msgbuf.h
 generic-y += mutex.h
 generic-y += param.h
 generic-y += parport.h
 generic-y += poll.h
 generic-y += posix_types.h
+generic-y += preempt.h
 generic-y += resource.h
 generic-y += scatterlist.h
 generic-y += sembuf.h
@@ -39,6 +38,6 @@ generic-y += sockios.h
 generic-y += statfs.h
 generic-y += termbits.h
 generic-y += termios.h
+generic-y += trace_clock.h
 generic-y += types.h
-generic-y += ucontext.h
 generic-y += xor.h
diff --git a/arch/tile/include/asm/atomic.h b/arch/tile/include/asm/atomic.h
index 921dbeb8a70..70979846076 100644
--- a/arch/tile/include/asm/atomic.h
+++ b/arch/tile/include/asm/atomic.h
@@ -17,10 +17,12 @@
 #ifndef _ASM_TILE_ATOMIC_H
 #define _ASM_TILE_ATOMIC_H
 
+#include <asm/cmpxchg.h>
+
 #ifndef __ASSEMBLY__
 
 #include <linux/compiler.h>
-#include <asm/system.h>
+#include <linux/types.h>
 
 #define ATOMIC_INIT(i)	{ (i) }
 
@@ -112,6 +114,32 @@ static inline int atomic_read(const atomic_t *v)
 #define atomic_inc_and_test(v)		(atomic_inc_return(v) == 0)
 
 /**
+ * atomic_xchg - atomically exchange contents of memory with a new value
+ * @v: pointer of type atomic_t
+ * @i: integer value to store in memory
+ *
+ * Atomically sets @v to @i and returns old @v
+ */
+static inline int atomic_xchg(atomic_t *v, int n)
+{
+	return xchg(&v->counter, n);
+}
+
+/**
+ * atomic_cmpxchg - atomically exchange contents of memory if it matches
+ * @v: pointer of type atomic_t
+ * @o: old value that memory should have
+ * @n: new value to write to memory if it matches
+ *
+ * Atomically checks if @v holds @o and replaces it with @n if so.
+ * Returns the old value at @v.
+ */
+static inline int atomic_cmpxchg(atomic_t *v, int o, int n)
+{
+	return cmpxchg(&v->counter, o, n);
+}
+
+/**
  * atomic_add_negative - add and test if negative
  * @v: pointer of type atomic_t
  * @i: integer value to add
@@ -121,54 +149,6 @@ static inline int atomic_read(const atomic_t *v)
  */
 #define atomic_add_negative(i, v)	(atomic_add_return((i), (v)) < 0)
 
-/* Nonexistent functions intended to cause link errors. */
-extern unsigned long __xchg_called_with_bad_pointer(void);
-extern unsigned long __cmpxchg_called_with_bad_pointer(void);
-
-#define xchg(ptr, x)							\
-	({								\
-		typeof(*(ptr)) __x;					\
-		switch (sizeof(*(ptr))) {				\
-		case 4:							\
-			__x = (typeof(__x))(typeof(__x-__x))atomic_xchg( \
-				(atomic_t *)(ptr),			\
-				(u32)(typeof((x)-(x)))(x));		\
-			break;						\
-		case 8:							\
-			__x = (typeof(__x))(typeof(__x-__x))atomic64_xchg( \
-				(atomic64_t *)(ptr),			\
-				(u64)(typeof((x)-(x)))(x));		\
-			break;						\
-		default:						\
-			__xchg_called_with_bad_pointer();		\
-		}							\
-		__x;							\
-	})
-
-#define cmpxchg(ptr, o, n)						\
-	({								\
-		typeof(*(ptr)) __x;					\
-		switch (sizeof(*(ptr))) {				\
-		case 4:							\
-			__x = (typeof(__x))(typeof(__x-__x))atomic_cmpxchg( \
-				(atomic_t *)(ptr),			\
-				(u32)(typeof((o)-(o)))(o),		\
-				(u32)(typeof((n)-(n)))(n));		\
-			break;						\
-		case 8:							\
-			__x = (typeof(__x))(typeof(__x-__x))atomic64_cmpxchg( \
-				(atomic64_t *)(ptr),			\
-				(u64)(typeof((o)-(o)))(o),		\
-				(u64)(typeof((n)-(n)))(n));		\
-			break;						\
-		default:						\
-			__cmpxchg_called_with_bad_pointer();		\
-		}							\
-		__x;							\
-	})
-
-#define tas(ptr) (xchg((ptr), 1))
-
 #endif /* __ASSEMBLY__ */
 
 #ifndef __tilegx__
@@ -177,4 +157,52 @@ extern unsigned long __cmpxchg_called_with_bad_pointer(void);
 #include <asm/atomic_64.h>
 #endif
 
+#ifndef __ASSEMBLY__
+
+/**
+ * atomic64_xchg - atomically exchange contents of memory with a new value
+ * @v: pointer of type atomic64_t
+ * @i: integer value to store in memory
+ *
+ * Atomically sets @v to @i and returns old @v
+ */
+static inline long long atomic64_xchg(atomic64_t *v, long long n)
+{
+	return xchg64(&v->counter, n);
+}
+
+/**
+ * atomic64_cmpxchg - atomically exchange contents of memory if it matches
+ * @v: pointer of type atomic64_t
+ * @o: old value that memory should have
+ * @n: new value to write to memory if it matches
+ *
+ * Atomically checks if @v holds @o and replaces it with @n if so.
+ * Returns the old value at @v.
+ */
+static inline long long atomic64_cmpxchg(atomic64_t *v, long long o,
+					long long n)
+{
+	return cmpxchg64(&v->counter, o, n);
+}
+
+static inline long long atomic64_dec_if_positive(atomic64_t *v)
+{
+	long long c, old, dec;
+
+	c = atomic64_read(v);
+	for (;;) {
+		dec = c - 1;
+		if (unlikely(dec < 0))
+			break;
+		old = atomic64_cmpxchg((v), c, dec);
+		if (likely(old == c))
+			break;
+		c = old;
+	}
+	return dec;
+}
+
+#endif /* __ASSEMBLY__ */
+
 #endif /* _ASM_TILE_ATOMIC_H */
diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h
index c03349e0ca9..1b109fad9ff 100644
--- a/arch/tile/include/asm/atomic_32.h
+++ b/arch/tile/include/asm/atomic_32.h
@@ -17,44 +17,11 @@
 #ifndef _ASM_TILE_ATOMIC_32_H
 #define _ASM_TILE_ATOMIC_32_H
 
+#include <asm/barrier.h>
 #include <arch/chip.h>
 
 #ifndef __ASSEMBLY__
 
-/* Tile-specific routines to support <linux/atomic.h>. */
-int _atomic_xchg(atomic_t *v, int n);
-int _atomic_xchg_add(atomic_t *v, int i);
-int _atomic_xchg_add_unless(atomic_t *v, int a, int u);
-int _atomic_cmpxchg(atomic_t *v, int o, int n);
-
-/**
- * atomic_xchg - atomically exchange contents of memory with a new value
- * @v: pointer of type atomic_t
- * @i: integer value to store in memory
- *
- * Atomically sets @v to @i and returns old @v
- */
-static inline int atomic_xchg(atomic_t *v, int n)
-{
-	smp_mb();  /* barrier for proper semantics */
-	return _atomic_xchg(v, n);
-}
-
-/**
- * atomic_cmpxchg - atomically exchange contents of memory if it matches
- * @v: pointer of type atomic_t
- * @o: old value that memory should have
- * @n: new value to write to memory if it matches
- *
- * Atomically checks if @v holds @o and replaces it with @n if so.
- * Returns the old value at @v.
- */
-static inline int atomic_cmpxchg(atomic_t *v, int o, int n)
-{
-	smp_mb();  /* barrier for proper semantics */
-	return _atomic_cmpxchg(v, o, n);
-}
-
 /**
  * atomic_add - add integer to atomic variable
  * @i: integer value to add
@@ -64,7 +31,7 @@ static inline int atomic_cmpxchg(atomic_t *v, int o, int n)
  */
 static inline void atomic_add(int i, atomic_t *v)
 {
-	_atomic_xchg_add(v, i);
+	_atomic_xchg_add(&v->counter, i);
 }
 
 /**
@@ -77,7 +44,7 @@ static inline void atomic_add(int i, atomic_t *v)
 static inline int atomic_add_return(int i, atomic_t *v)
 {
 	smp_mb();  /* barrier for proper semantics */
-	return _atomic_xchg_add(v, i) + i;
+	return _atomic_xchg_add(&v->counter, i) + i;
 }
 
 /**
@@ -92,7 +59,7 @@ static inline int atomic_add_return(int i, atomic_t *v)
 static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
 	smp_mb();  /* barrier for proper semantics */
-	return _atomic_xchg_add_unless(v, a, u);
+	return _atomic_xchg_add_unless(&v->counter, a, u);
 }
 
 /**
@@ -107,64 +74,31 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
  */
 static inline void atomic_set(atomic_t *v, int n)
 {
-	_atomic_xchg(v, n);
+	_atomic_xchg(&v->counter, n);
 }
 
 /* A 64bit atomic type */
 
 typedef struct {
-	u64 __aligned(8) counter;
+	long long counter;
 } atomic64_t;
 
 #define ATOMIC64_INIT(val) { (val) }
 
-u64 _atomic64_xchg(atomic64_t *v, u64 n);
-u64 _atomic64_xchg_add(atomic64_t *v, u64 i);
-u64 _atomic64_xchg_add_unless(atomic64_t *v, u64 a, u64 u);
-u64 _atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n);
-
 /**
  * atomic64_read - read atomic variable
  * @v: pointer of type atomic64_t
  *
  * Atomically reads the value of @v.
  */
-static inline u64 atomic64_read(const atomic64_t *v)
+static inline long long atomic64_read(const atomic64_t *v)
 {
 	/*
 	 * Requires an atomic op to read both 32-bit parts consistently.
 	 * Casting away const is safe since the atomic support routines
 	 * do not write to memory if the value has not been modified.
 	 */
-	return _atomic64_xchg_add((atomic64_t *)v, 0);
-}
-
-/**
- * atomic64_xchg - atomically exchange contents of memory with a new value
- * @v: pointer of type atomic64_t
- * @i: integer value to store in memory
- *
- * Atomically sets @v to @i and returns old @v
- */
-static inline u64 atomic64_xchg(atomic64_t *v, u64 n)
-{
-	smp_mb();  /* barrier for proper semantics */
-	return _atomic64_xchg(v, n);
-}
-
-/**
- * atomic64_cmpxchg - atomically exchange contents of memory if it matches
- * @v: pointer of type atomic64_t
- * @o: old value that memory should have
- * @n: new value to write to memory if it matches
- *
- * Atomically checks if @v holds @o and replaces it with @n if so.
- * Returns the old value at @v.
- */
-static inline u64 atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
-{
-	smp_mb();  /* barrier for proper semantics */
-	return _atomic64_cmpxchg(v, o, n);
+	return _atomic64_xchg_add((long long *)&v->counter, 0);
 }
 
 /**
@@ -174,9 +108,9 @@ static inline u64 atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
  *
  * Atomically adds @i to @v.
  */
-static inline void atomic64_add(u64 i, atomic64_t *v)
+static inline void atomic64_add(long long i, atomic64_t *v)
 {
-	_atomic64_xchg_add(v, i);
+	_atomic64_xchg_add(&v->counter, i);
 }
 
 /**
@@ -186,10 +120,10 @@ static inline void atomic64_add(u64 i, atomic64_t *v)
  *
  * Atomically adds @i to @v and returns @i + @v
  */
-static inline u64 atomic64_add_return(u64 i, atomic64_t *v)
+static inline long long atomic64_add_return(long long i, atomic64_t *v)
 {
 	smp_mb();  /* barrier for proper semantics */
-	return _atomic64_xchg_add(v, i) + i;
+	return _atomic64_xchg_add(&v->counter, i) + i;
 }
 
 /**
@@ -199,12 +133,13 @@ static inline u64 atomic64_add_return(u64 i, atomic64_t *v)
  * @u: ...unless v is equal to u.
  *
  * Atomically adds @a to @v, so long as @v was not already @u.
- * Returns the old value of @v.
+ * Returns non-zero if @v was not @u, and zero otherwise.
  */
-static inline u64 atomic64_add_unless(atomic64_t *v, u64 a, u64 u)
+static inline long long atomic64_add_unless(atomic64_t *v, long long a,
+					long long u)
 {
 	smp_mb();  /* barrier for proper semantics */
-	return _atomic64_xchg_add_unless(v, a, u) != u;
+	return _atomic64_xchg_add_unless(&v->counter, a, u) != u;
 }
 
 /**
@@ -217,9 +152,9 @@ static inline u64 atomic64_add_unless(atomic64_t *v, u64 a, u64 u)
  * atomic64_set() can't be just a raw store, since it would be lost if it
  * fell between the load and store of one of the other atomic ops.
  */
-static inline void atomic64_set(atomic64_t *v, u64 n)
+static inline void atomic64_set(atomic64_t *v, long long n)
 {
-	_atomic64_xchg(v, n);
+	_atomic64_xchg(&v->counter, n);
 }
 
 #define atomic64_add_negative(a, v)	(atomic64_add_return((a), (v)) < 0)
@@ -234,16 +169,6 @@ static inline void atomic64_set(atomic64_t *v, u64 n)
 #define atomic64_dec_and_test(v)	(atomic64_dec_return((v)) == 0)
 #define atomic64_inc_not_zero(v)	atomic64_add_unless((v), 1LL, 0LL)
 
-/*
- * We need to barrier before modifying the word, since the _atomic_xxx()
- * routines just tns the lock and then read/modify/write of the word.
- * But after the word is updated, the routine issues an "mf" before returning,
- * and since it's a function call, we don't even need a compiler barrier.
- */
-#define smp_mb__before_atomic_dec()	smp_mb()
-#define smp_mb__before_atomic_inc()	smp_mb()
-#define smp_mb__after_atomic_dec()	do { } while (0)
-#define smp_mb__after_atomic_inc()	do { } while (0)
 
 #endif /* !__ASSEMBLY__ */
 
@@ -251,21 +176,6 @@ static inline void atomic64_set(atomic64_t *v, u64 n)
  * Internal definitions only beyond this point.
  */
 
-#define ATOMIC_LOCKS_FOUND_VIA_TABLE() \
-  (!CHIP_HAS_CBOX_HOME_MAP() && defined(CONFIG_SMP))
-
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-/* Number of entries in atomic_lock_ptr[]. */
-#define ATOMIC_HASH_L1_SHIFT 6
-#define ATOMIC_HASH_L1_SIZE (1 << ATOMIC_HASH_L1_SHIFT)
-
-/* Number of locks in each struct pointed to by atomic_lock_ptr[]. */
-#define ATOMIC_HASH_L2_SHIFT (CHIP_L2_LOG_LINE_SIZE() - 2)
-#define ATOMIC_HASH_L2_SIZE (1 << ATOMIC_HASH_L2_SHIFT)
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 /*
  * Number of atomic locks in atomic_locks[]. Must be a power of two.
  * There is no reason for more than PAGE_SIZE / 8 entries, since that
@@ -280,8 +190,6 @@ static inline void atomic64_set(atomic64_t *v, u64 n)
 extern int atomic_locks[];
 #endif
 
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 /*
  * All the code that may fault while holding an atomic lock must
  * place the pointer to the lock in ATOMIC_LOCK_REG so the fault code
@@ -302,7 +210,14 @@ void __init_atomic_per_cpu(void);
 void __atomic_fault_unlock(int *lock_ptr);
 #endif
 
+/* Return a pointer to the lock for the given address. */
+int *__atomic_hashed_lock(volatile void *v);
+
 /* Private helper routines in lib/atomic_asm_32.S */
+struct __get_user {
+	unsigned long val;
+	int err;
+};
 extern struct __get_user __atomic_cmpxchg(volatile int *p,
 					  int *lock, int o, int n);
 extern struct __get_user __atomic_xchg(volatile int *p, int *lock, int n);
@@ -312,11 +227,16 @@ extern struct __get_user __atomic_xchg_add_unless(volatile int *p,
 extern struct __get_user __atomic_or(volatile int *p, int *lock, int n);
 extern struct __get_user __atomic_andn(volatile int *p, int *lock, int n);
 extern struct __get_user __atomic_xor(volatile int *p, int *lock, int n);
-extern u64 __atomic64_cmpxchg(volatile u64 *p, int *lock, u64 o, u64 n);
-extern u64 __atomic64_xchg(volatile u64 *p, int *lock, u64 n);
-extern u64 __atomic64_xchg_add(volatile u64 *p, int *lock, u64 n);
-extern u64 __atomic64_xchg_add_unless(volatile u64 *p,
-				      int *lock, u64 o, u64 n);
+extern long long __atomic64_cmpxchg(volatile long long *p, int *lock,
+					long long o, long long n);
+extern long long __atomic64_xchg(volatile long long *p, int *lock, long long n);
+extern long long __atomic64_xchg_add(volatile long long *p, int *lock,
+					long long n);
+extern long long __atomic64_xchg_add_unless(volatile long long *p,
+					int *lock, long long o, long long n);
+
+/* Return failure from the atomic wrappers. */
+struct __get_user __atomic_bad_address(int __user *addr);
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h
index 27fe667fddf..7b11c5fadd4 100644
--- a/arch/tile/include/asm/atomic_64.h
+++ b/arch/tile/include/asm/atomic_64.h
@@ -19,6 +19,7 @@
 
 #ifndef __ASSEMBLY__
 
+#include <asm/barrier.h>
 #include <arch/spr_def.h>
 
 /* First, the 32-bit atomic ops that are "real" on our 64-bit platform. */
@@ -31,25 +32,6 @@
  * on any routine which updates memory and returns a value.
  */
 
-static inline int atomic_cmpxchg(atomic_t *v, int o, int n)
-{
-	int val;
-	__insn_mtspr(SPR_CMPEXCH_VALUE, o);
-	smp_mb();  /* barrier for proper semantics */
-	val = __insn_cmpexch4((void *)&v->counter, n);
-	smp_mb();  /* barrier for proper semantics */
-	return val;
-}
-
-static inline int atomic_xchg(atomic_t *v, int n)
-{
-	int val;
-	smp_mb();  /* barrier for proper semantics */
-	val = __insn_exch4((void *)&v->counter, n);
-	smp_mb();  /* barrier for proper semantics */
-	return val;
-}
-
 static inline void atomic_add(int i, atomic_t *v)
 {
 	__insn_fetchadd4((void *)&v->counter, i);
@@ -71,7 +53,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 		if (oldval == u)
 			break;
 		guess = oldval;
-		oldval = atomic_cmpxchg(v, guess, guess + a);
+		oldval = cmpxchg(&v->counter, guess, guess + a);
 	} while (guess != oldval);
 	return oldval;
 }
@@ -83,25 +65,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 #define atomic64_read(v)		((v)->counter)
 #define atomic64_set(v, i) ((v)->counter = (i))
 
-static inline long atomic64_cmpxchg(atomic64_t *v, long o, long n)
-{
-	long val;
-	smp_mb();  /* barrier for proper semantics */
-	__insn_mtspr(SPR_CMPEXCH_VALUE, o);
-	val = __insn_cmpexch((void *)&v->counter, n);
-	smp_mb();  /* barrier for proper semantics */
-	return val;
-}
-
-static inline long atomic64_xchg(atomic64_t *v, long n)
-{
-	long val;
-	smp_mb();  /* barrier for proper semantics */
-	val = __insn_exch((void *)&v->counter, n);
-	smp_mb();  /* barrier for proper semantics */
-	return val;
-}
-
 static inline void atomic64_add(long i, atomic64_t *v)
 {
 	__insn_fetchadd((void *)&v->counter, i);
@@ -123,7 +86,7 @@ static inline long atomic64_add_unless(atomic64_t *v, long a, long u)
 		if (oldval == u)
 			break;
 		guess = oldval;
-		oldval = atomic64_cmpxchg(v, guess, guess + a);
+		oldval = cmpxchg(&v->counter, guess, guess + a);
 	} while (guess != oldval);
 	return oldval != u;
 }
@@ -142,12 +105,6 @@ static inline long atomic64_add_unless(atomic64_t *v, long a, long u)
 
 #define atomic64_inc_not_zero(v)	atomic64_add_unless((v), 1, 0)
 
-/* Atomic dec and inc don't implement barrier, so provide them if needed. */
-#define smp_mb__before_atomic_dec()	smp_mb()
-#define smp_mb__after_atomic_dec()	smp_mb()
-#define smp_mb__before_atomic_inc()	smp_mb()
-#define smp_mb__after_atomic_inc()	smp_mb()
-
 /* Define this to indicate that cmpxchg is an efficient operation. */
 #define __HAVE_ARCH_CMPXCHG
 
diff --git a/arch/tile/include/asm/barrier.h b/arch/tile/include/asm/barrier.h
new file mode 100644
index 00000000000..96a42ae79f4
--- /dev/null
+++ b/arch/tile/include/asm/barrier.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_BARRIER_H
+#define _ASM_TILE_BARRIER_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <arch/chip.h>
+#include <arch/spr_def.h>
+#include <asm/timex.h>
+
+#define __sync()	__insn_mf()
+
+#include <hv/syscall_public.h>
+/*
+ * Issue an uncacheable load to each memory controller, then
+ * wait until those loads have completed.
+ */
+static inline void __mb_incoherent(void)
+{
+	long clobber_r10;
+	asm volatile("swint2"
+		     : "=R10" (clobber_r10)
+		     : "R10" (HV_SYS_fence_incoherent)
+		     : "r0", "r1", "r2", "r3", "r4",
+		       "r5", "r6", "r7", "r8", "r9",
+		       "r11", "r12", "r13", "r14",
+		       "r15", "r16", "r17", "r18", "r19",
+		       "r20", "r21", "r22", "r23", "r24",
+		       "r25", "r26", "r27", "r28", "r29");
+}
+
+/* Fence to guarantee visibility of stores to incoherent memory. */
+static inline void
+mb_incoherent(void)
+{
+	__insn_mf();
+
+	{
+#if CHIP_HAS_TILE_WRITE_PENDING()
+		const unsigned long WRITE_TIMEOUT_CYCLES = 400;
+		unsigned long start = get_cycles_low();
+		do {
+			if (__insn_mfspr(SPR_TILE_WRITE_PENDING) == 0)
+				return;
+		} while ((get_cycles_low() - start) < WRITE_TIMEOUT_CYCLES);
+#endif /* CHIP_HAS_TILE_WRITE_PENDING() */
+		(void) __mb_incoherent();
+	}
+}
+
+#define fast_wmb()	__sync()
+#define fast_rmb()	__sync()
+#define fast_mb()	__sync()
+#define fast_iob()	mb_incoherent()
+
+#define wmb()		fast_wmb()
+#define rmb()		fast_rmb()
+#define mb()		fast_mb()
+#define iob()		fast_iob()
+
+#ifndef __tilegx__ /* 32 bit */
+/*
+ * We need to barrier before modifying the word, since the _atomic_xxx()
+ * routines just tns the lock and then read/modify/write of the word.
+ * But after the word is updated, the routine issues an "mf" before returning,
+ * and since it's a function call, we don't even need a compiler barrier.
+ */
+#define smp_mb__before_atomic()	smp_mb()
+#define smp_mb__after_atomic()	do { } while (0)
+#else /* 64 bit */
+#define smp_mb__before_atomic()	smp_mb()
+#define smp_mb__after_atomic()	smp_mb()
+#endif
+
+#include <asm-generic/barrier.h>
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_TILE_BARRIER_H */
diff --git a/arch/tile/include/asm/bitops.h b/arch/tile/include/asm/bitops.h
index 16f1fa51fea..20caa346ac0 100644
--- a/arch/tile/include/asm/bitops.h
+++ b/arch/tile/include/asm/bitops.h
@@ -17,6 +17,7 @@
 #define _ASM_TILE_BITOPS_H
 
 #include <linux/types.h>
+#include <asm/barrier.h>
 
 #ifndef _LINUX_BITOPS_H
 #error only <linux/bitops.h> can be included directly
@@ -29,17 +30,6 @@
 #endif
 
 /**
- * __ffs - find first set bit in word
- * @word: The word to search
- *
- * Undefined if no set bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
-	return __builtin_ctzl(word);
-}
-
-/**
  * ffz - find first zero bit in word
  * @word: The word to search
  *
@@ -50,31 +40,9 @@ static inline unsigned long ffz(unsigned long word)
 	return __builtin_ctzl(~word);
 }
 
-/**
- * __fls - find last set bit in word
- * @word: The word to search
- *
- * Undefined if no set bit exists, so code should check against 0 first.
- */
-static inline unsigned long __fls(unsigned long word)
-{
-	return (sizeof(word) * 8) - 1 - __builtin_clzl(word);
-}
-
-/**
- * ffs - find first set bit in word
- * @x: the word to search
- *
- * This is defined the same way as the libc and compiler builtin ffs
- * routines, therefore differs in spirit from the other bitops.
- *
- * ffs(value) returns 0 if value is 0 or the position of the first
- * set bit if value is nonzero. The first (least significant) bit
- * is at position 1.
- */
-static inline int ffs(int x)
+static inline int fls64(__u64 w)
 {
-	return __builtin_ffs(x);
+	return (sizeof(__u64) * 8) - __builtin_clzll(w);
 }
 
 /**
@@ -90,12 +58,7 @@ static inline int ffs(int x)
  */
 static inline int fls(int x)
 {
-	return (sizeof(int) * 8) - __builtin_clz(x);
-}
-
-static inline int fls64(__u64 w)
-{
-	return (sizeof(__u64) * 8) - __builtin_clzll(w);
+	return fls64((unsigned int) x);
 }
 
 static inline unsigned int __arch_hweight32(unsigned int w)
@@ -118,6 +81,9 @@ static inline unsigned long __arch_hweight64(__u64 w)
 	return __builtin_popcountll(w);
 }
 
+#include <asm-generic/bitops/builtin-__ffs.h>
+#include <asm-generic/bitops/builtin-__fls.h>
+#include <asm-generic/bitops/builtin-ffs.h>
 #include <asm-generic/bitops/const_hweight.h>
 #include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/find.h>
diff --git a/arch/tile/include/asm/bitops_32.h b/arch/tile/include/asm/bitops_32.h
index 571b118bfd9..bbf7b666f21 100644
--- a/arch/tile/include/asm/bitops_32.h
+++ b/arch/tile/include/asm/bitops_32.h
@@ -16,8 +16,7 @@
 #define _ASM_TILE_BITOPS_32_H
 
 #include <linux/compiler.h>
-#include <linux/atomic.h>
-#include <asm/system.h>
+#include <asm/barrier.h>
 
 /* Tile-specific routines to support <asm/bitops.h>. */
 unsigned long _atomic_or(volatile unsigned long *p, unsigned long mask);
@@ -50,8 +49,8 @@ static inline void set_bit(unsigned nr, volatile unsigned long *addr)
  * restricted to acting on a single-word quantity.
  *
  * clear_bit() may not contain a memory barrier, so if it is used for
- * locking purposes, you should call smp_mb__before_clear_bit() and/or
- * smp_mb__after_clear_bit() to ensure changes are visible on other cpus.
+ * locking purposes, you should call smp_mb__before_atomic() and/or
+ * smp_mb__after_atomic() to ensure changes are visible on other cpus.
  */
 static inline void clear_bit(unsigned nr, volatile unsigned long *addr)
 {
@@ -122,10 +121,6 @@ static inline int test_and_change_bit(unsigned nr,
 	return (_atomic_xor(addr, mask) & mask) != 0;
 }
 
-/* See discussion at smp_mb__before_atomic_dec() in <asm/atomic_32.h>. */
-#define smp_mb__before_clear_bit()	smp_mb()
-#define smp_mb__after_clear_bit()	do {} while (0)
-
 #include <asm-generic/bitops/ext2-atomic.h>
 
 #endif /* _ASM_TILE_BITOPS_32_H */
diff --git a/arch/tile/include/asm/bitops_64.h b/arch/tile/include/asm/bitops_64.h
index e9c8e381ee0..bb1a29221fc 100644
--- a/arch/tile/include/asm/bitops_64.h
+++ b/arch/tile/include/asm/bitops_64.h
@@ -16,8 +16,7 @@
 #define _ASM_TILE_BITOPS_64_H
 
 #include <linux/compiler.h>
-#include <linux/atomic.h>
-#include <asm/system.h>
+#include <asm/cmpxchg.h>
 
 /* See <asm/bitops.h> for API comments. */
 
@@ -33,20 +32,15 @@ static inline void clear_bit(unsigned nr, volatile unsigned long *addr)
 	__insn_fetchand((void *)(addr + nr / BITS_PER_LONG), ~mask);
 }
 
-#define smp_mb__before_clear_bit()	smp_mb()
-#define smp_mb__after_clear_bit()	smp_mb()
-
-
 static inline void change_bit(unsigned nr, volatile unsigned long *addr)
 {
-	unsigned long old, mask = (1UL << (nr % BITS_PER_LONG));
-	long guess, oldval;
+	unsigned long mask = (1UL << (nr % BITS_PER_LONG));
+	unsigned long guess, oldval;
 	addr += nr / BITS_PER_LONG;
-	old = *addr;
+	oldval = *addr;
 	do {
 		guess = oldval;
-		oldval = atomic64_cmpxchg((atomic64_t *)addr,
-					  guess, guess ^ mask);
+		oldval = cmpxchg(addr, guess, guess ^ mask);
 	} while (guess != oldval);
 }
 
@@ -86,13 +80,12 @@ static inline int test_and_change_bit(unsigned nr,
 				      volatile unsigned long *addr)
 {
 	unsigned long mask = (1UL << (nr % BITS_PER_LONG));
-	long guess, oldval = *addr;
+	unsigned long guess, oldval;
 	addr += nr / BITS_PER_LONG;
 	oldval = *addr;
 	do {
 		guess = oldval;
-		oldval = atomic64_cmpxchg((atomic64_t *)addr,
-					  guess, guess ^ mask);
+		oldval = cmpxchg(addr, guess, guess ^ mask);
 	} while (guess != oldval);
 	return (oldval & mask) != 0;
 }
diff --git a/arch/tile/include/asm/byteorder.h b/arch/tile/include/asm/byteorder.h
deleted file mode 100644
index 9558416d578..00000000000
--- a/arch/tile/include/asm/byteorder.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <linux/byteorder/little_endian.h>
diff --git a/arch/tile/include/asm/cache.h b/arch/tile/include/asm/cache.h
index 392e5333dd8..6160761d5f6 100644
--- a/arch/tile/include/asm/cache.h
+++ b/arch/tile/include/asm/cache.h
@@ -27,11 +27,17 @@
 #define L2_CACHE_ALIGN(x)	(((x)+(L2_CACHE_BYTES-1)) & -L2_CACHE_BYTES)
 
 /*
- * TILE-Gx is fully coherent so we don't need to define ARCH_DMA_MINALIGN.
+ * TILEPro I/O is not always coherent (networking typically uses coherent
+ * I/O, but PCI traffic does not) and setting ARCH_DMA_MINALIGN to the
+ * L2 cacheline size helps ensure that kernel heap allocations are aligned.
+ * TILE-Gx I/O is always coherent when used on hash-for-home pages.
+ *
+ * However, it's possible at runtime to request not to use hash-for-home
+ * for the kernel heap, in which case the kernel will use flush-and-inval
+ * to manage coherence.  As a result, we use L2_CACHE_BYTES for the
+ * DMA minimum alignment to avoid false sharing in the kernel heap.
  */
-#ifndef __tilegx__
 #define ARCH_DMA_MINALIGN	L2_CACHE_BYTES
-#endif
 
 /* use the cache line size for the L2, which is where it counts */
 #define SMP_CACHE_BYTES_SHIFT	L2_CACHE_SHIFT
@@ -43,9 +49,16 @@
 #define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
 /*
- * Attribute for data that is kept read/write coherent until the end of
- * initialization, then bumped to read/only incoherent for performance.
+ * Originally we used small TLB pages for kernel data and grouped some
+ * things together as "write once", enforcing the property at the end
+ * of initialization by making those pages read-only and non-coherent.
+ * This allowed better cache utilization since cache inclusion did not
+ * need to be maintained.  However, to do this requires an extra TLB
+ * entry, which on balance is more of a performance hit than the
+ * non-coherence is a performance gain, so we now just make "read
+ * mostly" and "write once" be synonyms.  We keep the attribute
+ * separate in case we change our minds at a future date.
  */
-#define __write_once __attribute__((__section__(".w1data")))
+#define __write_once __read_mostly
 
 #endif /* _ASM_TILE_CACHE_H */
diff --git a/arch/tile/include/asm/cacheflush.h b/arch/tile/include/asm/cacheflush.h
index e925f4bb498..92ee4c8a4f7 100644
--- a/arch/tile/include/asm/cacheflush.h
+++ b/arch/tile/include/asm/cacheflush.h
@@ -20,7 +20,6 @@
 /* Keep includes the same across arches.  */
 #include <linux/mm.h>
 #include <linux/cache.h>
-#include <asm/system.h>
 #include <arch/icache.h>
 
 /* Caches are physically-indexed and so don't need special treatment */
@@ -76,23 +75,6 @@ static inline void copy_to_user_page(struct vm_area_struct *vma,
 #define copy_from_user_page(vma, page, vaddr, dst, src, len) \
 	memcpy((dst), (src), (len))
 
-/*
- * Invalidate a VA range; pads to L2 cacheline boundaries.
- *
- * Note that on TILE64, __inv_buffer() actually flushes modified
- * cache lines in addition to invalidating them, i.e., it's the
- * same as __finv_buffer().
- */
-static inline void __inv_buffer(void *buffer, size_t size)
-{
-	char *next = (char *)((long)buffer & -L2_CACHE_BYTES);
-	char *finish = (char *)L2_CACHE_ALIGN((long)buffer + size);
-	while (next < finish) {
-		__insn_inv(next);
-		next += CHIP_INV_STRIDE();
-	}
-}
-
 /* Flush a VA range; pads to L2 cacheline boundaries. */
 static inline void __flush_buffer(void *buffer, size_t size)
 {
@@ -116,13 +98,6 @@ static inline void __finv_buffer(void *buffer, size_t size)
 }
 
 
-/* Invalidate a VA range and wait for it to be complete. */
-static inline void inv_buffer(void *buffer, size_t size)
-{
-	__inv_buffer(buffer, size);
-	mb();
-}
-
 /*
  * Flush a locally-homecached VA range and wait for the evicted
  * cachelines to hit memory.
@@ -143,6 +118,26 @@ static inline void finv_buffer_local(void *buffer, size_t size)
 	mb_incoherent();
 }
 
+#ifdef __tilepro__
+/* Invalidate a VA range; pads to L2 cacheline boundaries. */
+static inline void __inv_buffer(void *buffer, size_t size)
+{
+	char *next = (char *)((long)buffer & -L2_CACHE_BYTES);
+	char *finish = (char *)L2_CACHE_ALIGN((long)buffer + size);
+	while (next < finish) {
+		__insn_inv(next);
+		next += CHIP_INV_STRIDE();
+	}
+}
+
+/* Invalidate a VA range and wait for it to be complete. */
+static inline void inv_buffer(void *buffer, size_t size)
+{
+	__inv_buffer(buffer, size);
+	mb();
+}
+#endif
+
 /*
  * Flush and invalidate a VA range that is homed remotely, waiting
  * until the memory controller holds the flushed values.  If "hfh" is
@@ -152,4 +147,14 @@ static inline void finv_buffer_local(void *buffer, size_t size)
  */
 void finv_buffer_remote(void *buffer, size_t size, int hfh);
 
+/*
+ * On SMP systems, when the scheduler does migration-cost autodetection,
+ * it needs a way to flush as much of the CPU's caches as possible:
+ *
+ * TODO: fill this in!
+ */
+static inline void sched_cacheflush(void)
+{
+}
+
 #endif /* _ASM_TILE_CACHEFLUSH_H */
diff --git a/arch/tile/include/asm/checksum.h b/arch/tile/include/asm/checksum.h
index a120766c726..b21a2fdec9f 100644
--- a/arch/tile/include/asm/checksum.h
+++ b/arch/tile/include/asm/checksum.h
@@ -21,4 +21,22 @@
 __wsum do_csum(const unsigned char *buff, int len);
 #define do_csum do_csum
 
+/*
+ * Return the sum of all the 16-bit subwords in a long.
+ * This sums two subwords on a 32-bit machine, and four on 64 bits.
+ * The implementation does two vector adds to capture any overflow.
+ */
+static inline unsigned int csum_long(unsigned long x)
+{
+	unsigned long ret;
+#ifdef __tilegx__
+	ret = __insn_v2sadu(x, 0);
+	ret = __insn_v2sadu(ret, 0);
+#else
+	ret = __insn_sadh_u(x, 0);
+	ret = __insn_sadh_u(ret, 0);
+#endif
+	return ret;
+}
+
 #endif /* _ASM_TILE_CHECKSUM_H */
diff --git a/arch/tile/include/asm/cmpxchg.h b/arch/tile/include/asm/cmpxchg.h
new file mode 100644
index 00000000000..0ccda3c425b
--- /dev/null
+++ b/arch/tile/include/asm/cmpxchg.h
@@ -0,0 +1,134 @@
+/*
+ * cmpxchg.h -- forked from asm/atomic.h with this copyright:
+ *
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ */
+
+#ifndef _ASM_TILE_CMPXCHG_H
+#define _ASM_TILE_CMPXCHG_H
+
+#ifndef __ASSEMBLY__
+
+#include <asm/barrier.h>
+
+/* Nonexistent functions intended to cause compile errors. */
+extern void __xchg_called_with_bad_pointer(void)
+	__compiletime_error("Bad argument size for xchg");
+extern void __cmpxchg_called_with_bad_pointer(void)
+	__compiletime_error("Bad argument size for cmpxchg");
+
+#ifndef __tilegx__
+
+/* Note the _atomic_xxx() routines include a final mb(). */
+int _atomic_xchg(int *ptr, int n);
+int _atomic_xchg_add(int *v, int i);
+int _atomic_xchg_add_unless(int *v, int a, int u);
+int _atomic_cmpxchg(int *ptr, int o, int n);
+long long _atomic64_xchg(long long *v, long long n);
+long long _atomic64_xchg_add(long long *v, long long i);
+long long _atomic64_xchg_add_unless(long long *v, long long a, long long u);
+long long _atomic64_cmpxchg(long long *v, long long o, long long n);
+
+#define xchg(ptr, n)							\
+	({								\
+		if (sizeof(*(ptr)) != 4)				\
+			__xchg_called_with_bad_pointer();		\
+		smp_mb();						\
+		(typeof(*(ptr)))_atomic_xchg((int *)(ptr), (int)(n));	\
+	})
+
+#define cmpxchg(ptr, o, n)						\
+	({								\
+		if (sizeof(*(ptr)) != 4)				\
+			__cmpxchg_called_with_bad_pointer();		\
+		smp_mb();						\
+		(typeof(*(ptr)))_atomic_cmpxchg((int *)ptr, (int)o,	\
+						(int)n);		\
+	})
+
+#define xchg64(ptr, n)							\
+	({								\
+		if (sizeof(*(ptr)) != 8)				\
+			__xchg_called_with_bad_pointer();		\
+		smp_mb();						\
+		(typeof(*(ptr)))_atomic64_xchg((long long *)(ptr),	\
+						(long long)(n));	\
+	})
+
+#define cmpxchg64(ptr, o, n)						\
+	({								\
+		if (sizeof(*(ptr)) != 8)				\
+			__cmpxchg_called_with_bad_pointer();		\
+		smp_mb();						\
+		(typeof(*(ptr)))_atomic64_cmpxchg((long long *)ptr,	\
+					(long long)o, (long long)n);	\
+	})
+
+#else
+
+#define xchg(ptr, n)							\
+	({								\
+		typeof(*(ptr)) __x;					\
+		smp_mb();						\
+		switch (sizeof(*(ptr))) {				\
+		case 4:							\
+			__x = (typeof(__x))(unsigned long)		\
+				__insn_exch4((ptr),			\
+					(u32)(unsigned long)(n));	\
+			break;						\
+		case 8:							\
+			__x = (typeof(__x))				\
+				__insn_exch((ptr), (unsigned long)(n));	\
+			break;						\
+		default:						\
+			__xchg_called_with_bad_pointer();		\
+			break;						\
+		}							\
+		smp_mb();						\
+		__x;							\
+	})
+
+#define cmpxchg(ptr, o, n)						\
+	({								\
+		typeof(*(ptr)) __x;					\
+		__insn_mtspr(SPR_CMPEXCH_VALUE, (unsigned long)(o));	\
+		smp_mb();						\
+		switch (sizeof(*(ptr))) {				\
+		case 4:							\
+			__x = (typeof(__x))(unsigned long)		\
+				__insn_cmpexch4((ptr),			\
+					(u32)(unsigned long)(n));	\
+			break;						\
+		case 8:							\
+			__x = (typeof(__x))__insn_cmpexch((ptr),	\
+						(long long)(n));	\
+			break;						\
+		default:						\
+			__cmpxchg_called_with_bad_pointer();		\
+			break;						\
+		}							\
+		smp_mb();						\
+		__x;							\
+	})
+
+#define xchg64 xchg
+#define cmpxchg64 cmpxchg
+
+#endif
+
+#define tas(ptr) xchg((ptr), 1)
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_TILE_CMPXCHG_H */
diff --git a/arch/tile/include/asm/compat.h b/arch/tile/include/asm/compat.h
index bf95f55b82b..ffd4493efc7 100644
--- a/arch/tile/include/asm/compat.h
+++ b/arch/tile/include/asm/compat.h
@@ -44,7 +44,6 @@ typedef __kernel_uid32_t __compat_gid32_t;
 typedef __kernel_mode_t compat_mode_t;
 typedef __kernel_dev_t compat_dev_t;
 typedef __kernel_loff_t compat_loff_t;
-typedef __kernel_nlink_t compat_nlink_t;
 typedef __kernel_ipc_pid_t compat_ipc_pid_t;
 typedef __kernel_daddr_t compat_daddr_t;
 typedef __kernel_fsid_t	compat_fsid_t;
@@ -111,6 +110,68 @@ struct compat_flock64 {
 
 typedef u32               compat_sigset_word;
 
+typedef union compat_sigval {
+	compat_int_t	sival_int;
+	compat_uptr_t	sival_ptr;
+} compat_sigval_t;
+
+#define COMPAT_SI_PAD_SIZE	(128/sizeof(int) - 3)
+
+typedef struct compat_siginfo {
+	int si_signo;
+	int si_errno;
+	int si_code;
+
+	union {
+		int _pad[COMPAT_SI_PAD_SIZE];
+
+		/* kill() */
+		struct {
+			unsigned int _pid;	/* sender's pid */
+			unsigned int _uid;	/* sender's uid */
+		} _kill;
+
+		/* POSIX.1b timers */
+		struct {
+			compat_timer_t _tid;	/* timer id */
+			int _overrun;		/* overrun count */
+			compat_sigval_t _sigval;	/* same as below */
+			int _sys_private;	/* not to be passed to user */
+			int _overrun_incr;	/* amount to add to overrun */
+		} _timer;
+
+		/* POSIX.1b signals */
+		struct {
+			unsigned int _pid;	/* sender's pid */
+			unsigned int _uid;	/* sender's uid */
+			compat_sigval_t _sigval;
+		} _rt;
+
+		/* SIGCHLD */
+		struct {
+			unsigned int _pid;	/* which child */
+			unsigned int _uid;	/* sender's uid */
+			int _status;		/* exit code */
+			compat_clock_t _utime;
+			compat_clock_t _stime;
+		} _sigchld;
+
+		/* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
+		struct {
+			unsigned int _addr;	/* faulting insn/memory ref. */
+#ifdef __ARCH_SI_TRAPNO
+			int _trapno;	/* TRAP # which caused the signal */
+#endif
+		} _sigfault;
+
+		/* SIGPOLL */
+		struct {
+			int _band;	/* POLL_IN, POLL_OUT, POLL_MSG */
+			int _fd;
+		} _sigpoll;
+	} _sifields;
+} compat_siginfo_t;
+
 #define COMPAT_OFF_T_MAX	0x7fffffff
 #define COMPAT_LOFF_T_MAX	0x7fffffffffffffffL
 
@@ -211,57 +272,26 @@ extern int compat_setup_rt_frame(int sig, struct k_sigaction *ka,
 				 struct pt_regs *regs);
 
 /* Compat syscalls. */
-struct compat_sigaction;
 struct compat_siginfo;
 struct compat_sigaltstack;
-long compat_sys_execve(const char __user *path,
-		       compat_uptr_t __user *argv,
-		       compat_uptr_t __user *envp, struct pt_regs *);
-long compat_sys_rt_sigaction(int sig, struct compat_sigaction __user *act,
-			     struct compat_sigaction __user *oact,
-			     size_t sigsetsize);
-long compat_sys_rt_sigqueueinfo(int pid, int sig,
-				struct compat_siginfo __user *uinfo);
-long compat_sys_rt_sigreturn(struct pt_regs *);
-long compat_sys_sigaltstack(const struct compat_sigaltstack __user *uss_ptr,
-			    struct compat_sigaltstack __user *uoss_ptr,
-			    struct pt_regs *);
+long compat_sys_rt_sigreturn(void);
 long compat_sys_truncate64(char __user *filename, u32 dummy, u32 low, u32 high);
 long compat_sys_ftruncate64(unsigned int fd, u32 dummy, u32 low, u32 high);
 long compat_sys_pread64(unsigned int fd, char __user *ubuf, size_t count,
 			u32 dummy, u32 low, u32 high);
 long compat_sys_pwrite64(unsigned int fd, char __user *ubuf, size_t count,
 			 u32 dummy, u32 low, u32 high);
-long compat_sys_lookup_dcookie(u32 low, u32 high, char __user *buf, size_t len);
 long compat_sys_sync_file_range2(int fd, unsigned int flags,
 				 u32 offset_lo, u32 offset_hi,
 				 u32 nbytes_lo, u32 nbytes_hi);
 long compat_sys_fallocate(int fd, int mode,
 			  u32 offset_lo, u32 offset_hi,
 			  u32 len_lo, u32 len_hi);
-long compat_sys_sched_rr_get_interval(compat_pid_t pid,
-				      struct compat_timespec __user *interval);
-
-/* Versions of compat functions that differ from generic Linux. */
-struct compat_msgbuf;
-long tile_compat_sys_msgsnd(int msqid,
-			    struct compat_msgbuf __user *msgp,
-			    size_t msgsz, int msgflg);
-long tile_compat_sys_msgrcv(int msqid,
-			    struct compat_msgbuf __user *msgp,
-			    size_t msgsz, long msgtyp, int msgflg);
-long tile_compat_sys_ptrace(compat_long_t request, compat_long_t pid,
-			    compat_long_t addr, compat_long_t data);
-
-/* Tilera Linux syscalls that don't have "compat" versions. */
-#define compat_sys_flush_cache sys_flush_cache
-
-/* These are the intvec_64.S trampolines. */
-long _compat_sys_execve(const char __user *path,
-			const compat_uptr_t __user *argv,
-			const compat_uptr_t __user *envp);
-long _compat_sys_sigaltstack(const struct compat_sigaltstack __user *uss_ptr,
-			    struct compat_sigaltstack __user *uoss_ptr);
+long compat_sys_llseek(unsigned int fd, unsigned int offset_high,
+		       unsigned int offset_low, loff_t __user * result,
+		       unsigned int origin);
+
+/* Assembly trampoline to avoid clobbering r0. */
 long _compat_sys_rt_sigreturn(void);
 
 #endif /* _ASM_TILE_COMPAT_H */
diff --git a/arch/tile/include/asm/device.h b/arch/tile/include/asm/device.h
new file mode 100644
index 00000000000..6ab8bf146d4
--- /dev/null
+++ b/arch/tile/include/asm/device.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ * Arch specific extensions to struct device
+ */
+
+#ifndef _ASM_TILE_DEVICE_H
+#define _ASM_TILE_DEVICE_H
+
+struct dev_archdata {
+	/* DMA operations on that device */
+        struct dma_map_ops	*dma_ops;
+
+	/* Offset of the DMA address from the PA. */
+	dma_addr_t		dma_offset;
+
+	/*
+	 * Highest DMA address that can be generated by devices that
+	 * have limited DMA capability, i.e. non 64-bit capable.
+	 */
+	dma_addr_t		max_direct_dma_addr;
+};
+
+struct pdev_archdata {
+};
+
+#endif /* _ASM_TILE_DEVICE_H */
diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index eaa06d175b3..1eae359d831 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -20,69 +20,94 @@
 #include <linux/cache.h>
 #include <linux/io.h>
 
-/*
- * Note that on x86 and powerpc, there is a "struct dma_mapping_ops"
- * that is used for all the DMA operations.  For now, we don't have an
- * equivalent on tile, because we only have a single way of doing DMA.
- * (Tilera bug 7994 to use dma_mapping_ops.)
- */
+#ifdef __tilegx__
+#define ARCH_HAS_DMA_GET_REQUIRED_MASK
+#endif
+
+extern struct dma_map_ops *tile_dma_map_ops;
+extern struct dma_map_ops *gx_pci_dma_map_ops;
+extern struct dma_map_ops *gx_legacy_pci_dma_map_ops;
+extern struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
+
+static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+{
+	if (dev && dev->archdata.dma_ops)
+		return dev->archdata.dma_ops;
+	else
+		return tile_dma_map_ops;
+}
+
+static inline dma_addr_t get_dma_offset(struct device *dev)
+{
+	return dev->archdata.dma_offset;
+}
+
+static inline void set_dma_offset(struct device *dev, dma_addr_t off)
+{
+	dev->archdata.dma_offset = off;
+}
+
+static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+	return paddr;
+}
+
+static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
+{
+	return daddr;
+}
+
+static inline void dma_mark_clean(void *addr, size_t size) {}
+
+#include <asm-generic/dma-mapping-common.h>
+
+static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
+{
+	dev->archdata.dma_ops = ops;
+}
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
-extern dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
-			  enum dma_data_direction);
-extern void dma_unmap_single(struct device *dev, dma_addr_t dma_addr,
-			     size_t size, enum dma_data_direction);
-extern int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
-	       enum dma_data_direction);
-extern void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-			 int nhwentries, enum dma_data_direction);
-extern dma_addr_t dma_map_page(struct device *dev, struct page *page,
-			       unsigned long offset, size_t size,
-			       enum dma_data_direction);
-extern void dma_unmap_page(struct device *dev, dma_addr_t dma_address,
-			   size_t size, enum dma_data_direction);
-extern void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
-				int nelems, enum dma_data_direction);
-extern void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
-				   int nelems, enum dma_data_direction);
-
-
-void *dma_alloc_coherent(struct device *dev, size_t size,
-			   dma_addr_t *dma_handle, gfp_t flag);
-
-void dma_free_coherent(struct device *dev, size_t size,
-			 void *vaddr, dma_addr_t dma_handle);
-
-extern void dma_sync_single_for_cpu(struct device *, dma_addr_t, size_t,
-				    enum dma_data_direction);
-extern void dma_sync_single_for_device(struct device *, dma_addr_t,
-				       size_t, enum dma_data_direction);
-extern void dma_sync_single_range_for_cpu(struct device *, dma_addr_t,
-					  unsigned long offset, size_t,
-					  enum dma_data_direction);
-extern void dma_sync_single_range_for_device(struct device *, dma_addr_t,
-					     unsigned long offset, size_t,
-					     enum dma_data_direction);
-extern void dma_cache_sync(struct device *dev, void *vaddr, size_t,
-			   enum dma_data_direction);
+static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+{
+	if (!dev->dma_mask)
+		return 0;
+
+	return addr + size - 1 <= *dev->dma_mask;
+}
 
 static inline int
 dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
-	return 0;
+	debug_dma_mapping_error(dev, dma_addr);
+	return get_dma_ops(dev)->mapping_error(dev, dma_addr);
 }
 
 static inline int
 dma_supported(struct device *dev, u64 mask)
 {
-	return 1;
+	return get_dma_ops(dev)->dma_supported(dev, mask);
 }
 
 static inline int
 dma_set_mask(struct device *dev, u64 mask)
 {
+	struct dma_map_ops *dma_ops = get_dma_ops(dev);
+
+	/*
+	 * For PCI devices with 64-bit DMA addressing capability, promote
+	 * the dma_ops to hybrid, with the consistent memory DMA space limited
+	 * to 32-bit. For 32-bit capable devices, limit the streaming DMA
+	 * address range to max_direct_dma_addr.
+	 */
+	if (dma_ops == gx_pci_dma_map_ops ||
+	    dma_ops == gx_hybrid_pci_dma_map_ops ||
+	    dma_ops == gx_legacy_pci_dma_map_ops) {
+		if (mask == DMA_BIT_MASK(64) &&
+		    dma_ops == gx_legacy_pci_dma_map_ops)
+			set_dma_ops(dev, gx_hybrid_pci_dma_map_ops);
+		else if (mask > dev->archdata.max_direct_dma_addr)
+			mask = dev->archdata.max_direct_dma_addr;
+	}
+
 	if (!dev->dma_mask || !dma_supported(dev, mask))
 		return -EIO;
 
@@ -91,4 +116,43 @@ dma_set_mask(struct device *dev, u64 mask)
 	return 0;
 }
 
+static inline void *dma_alloc_attrs(struct device *dev, size_t size,
+				    dma_addr_t *dma_handle, gfp_t flag,
+				    struct dma_attrs *attrs)
+{
+	struct dma_map_ops *dma_ops = get_dma_ops(dev);
+	void *cpu_addr;
+
+	cpu_addr = dma_ops->alloc(dev, size, dma_handle, flag, attrs);
+
+	debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+
+	return cpu_addr;
+}
+
+static inline void dma_free_attrs(struct device *dev, size_t size,
+				  void *cpu_addr, dma_addr_t dma_handle,
+				  struct dma_attrs *attrs)
+{
+	struct dma_map_ops *dma_ops = get_dma_ops(dev);
+
+	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
+
+	dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
+}
+
+#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
+#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
+#define dma_free_coherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
+#define dma_free_noncoherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
+
+/*
+ * dma_alloc_noncoherent() is #defined to return coherent memory,
+ * so there's no need to do any flushing here.
+ */
+static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+				  enum dma_data_direction direction)
+{
+}
+
 #endif /* _ASM_TILE_DMA_MAPPING_H */
diff --git a/arch/tile/include/asm/elf.h b/arch/tile/include/asm/elf.h
index 623a6bb741c..41d9878a968 100644
--- a/arch/tile/include/asm/elf.h
+++ b/arch/tile/include/asm/elf.h
@@ -30,7 +30,6 @@ typedef unsigned long elf_greg_t;
 #define ELF_NGREG (sizeof(struct pt_regs) / sizeof(elf_greg_t))
 typedef elf_greg_t elf_gregset_t[ELF_NGREG];
 
-#define EM_TILE64  187
 #define EM_TILEPRO 188
 #define EM_TILEGX  191
 
@@ -44,7 +43,11 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
 #else
 #define ELF_CLASS	ELFCLASS32
 #endif
+#ifdef __BIG_ENDIAN__
+#define ELF_DATA	ELFDATA2MSB
+#else
 #define ELF_DATA	ELFDATA2LSB
+#endif
 
 /*
  * There seems to be a bug in how compat_binfmt_elf.c works: it
@@ -59,6 +62,7 @@ enum { ELF_ARCH = CHIP_ELF_TYPE() };
  */
 #define elf_check_arch(x)  \
 	((x)->e_ident[EI_CLASS] == ELF_CLASS && \
+	 (x)->e_ident[EI_DATA] == ELF_DATA && \
 	 (x)->e_machine == CHIP_ELF_TYPE())
 
 /* The module loader only handles a few relocation types. */
@@ -127,6 +131,15 @@ extern int dump_task_regs(struct task_struct *, elf_gregset_t *);
 struct linux_binprm;
 extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 				       int executable_stack);
+#define ARCH_DLINFO \
+do { \
+	NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_BASE); \
+} while (0)
+
+struct mm_struct;
+extern unsigned long arch_randomize_brk(struct mm_struct *mm);
+#define arch_randomize_brk arch_randomize_brk
+
 #ifdef CONFIG_COMPAT
 
 #define COMPAT_ELF_PLATFORM "tilegx-m32"
@@ -143,6 +156,7 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 #define compat_start_thread(regs, ip, usp) do { \
 		regs->pc = ptr_to_compat_reg((void *)(ip)); \
 		regs->sp = ptr_to_compat_reg((void *)(usp)); \
+		single_step_execve();	\
 	} while (0)
 
 /*
@@ -151,12 +165,12 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 #undef SET_PERSONALITY
 #define SET_PERSONALITY(ex) \
 do { \
-	current->personality = PER_LINUX; \
+	set_personality(PER_LINUX | (current->personality & (~PER_MASK))); \
 	current_thread_info()->status &= ~TS_COMPAT; \
 } while (0)
 #define COMPAT_SET_PERSONALITY(ex) \
 do { \
-	current->personality = PER_LINUX_32BIT; \
+	set_personality(PER_LINUX | (current->personality & (~PER_MASK))); \
 	current_thread_info()->status |= TS_COMPAT; \
 } while (0)
 
@@ -164,4 +178,6 @@ do { \
 
 #endif /* CONFIG_COMPAT */
 
+#define CORE_DUMP_USE_REGSET
+
 #endif /* _ASM_TILE_ELF_H */
diff --git a/arch/tile/include/asm/fixmap.h b/arch/tile/include/asm/fixmap.h
index c66f7933bea..ffe2637aeb3 100644
--- a/arch/tile/include/asm/fixmap.h
+++ b/arch/tile/include/asm/fixmap.h
@@ -25,9 +25,6 @@
 #include <asm/kmap_types.h>
 #endif
 
-#define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x)	((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
 /*
  * Here we define all the compile-time 'special' virtual
  * addresses. The point is to have a constant address at
@@ -45,15 +42,23 @@
  *
  * TLB entries of such buffers will not be flushed across
  * task switches.
- *
- * We don't bother with a FIX_HOLE since above the fixmaps
- * is unmapped memory in any case.
  */
 enum fixed_addresses {
+#ifdef __tilegx__
+	/*
+	 * TILEPro has unmapped memory above so the hole isn't needed,
+	 * and in any case the hole pushes us over a single 16MB pmd.
+	 */
+	FIX_HOLE,
+#endif
 #ifdef CONFIG_HIGHMEM
 	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
 	FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
 #endif
+#ifdef __tilegx__  /* see homecache.c */
+	FIX_HOMECACHE_BEGIN,
+	FIX_HOMECACHE_END = FIX_HOMECACHE_BEGIN+(NR_CPUS)-1,
+#endif
 	__end_of_permanent_fixed_addresses,
 
 	/*
@@ -70,48 +75,12 @@ enum fixed_addresses {
 #endif
 };
 
-extern void __set_fixmap(enum fixed_addresses idx,
-			 unsigned long phys, pgprot_t flags);
-
-#define set_fixmap(idx, phys) \
-		__set_fixmap(idx, phys, PAGE_KERNEL)
-#define clear_fixmap(idx) \
-		__set_fixmap(idx, 0, __pgprot(0))
-
 #define __FIXADDR_SIZE	(__end_of_permanent_fixed_addresses << PAGE_SHIFT)
 #define __FIXADDR_BOOT_SIZE	(__end_of_fixed_addresses << PAGE_SHIFT)
 #define FIXADDR_START		(FIXADDR_TOP + PAGE_SIZE - __FIXADDR_SIZE)
 #define FIXADDR_BOOT_START	(FIXADDR_TOP + PAGE_SIZE - __FIXADDR_BOOT_SIZE)
 
-extern void __this_fixmap_does_not_exist(void);
-
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without tranlation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
-{
-	/*
-	 * this branch gets completely eliminated after inlining,
-	 * except when someone tries to use fixaddr indices in an
-	 * illegal way. (such as mixing up address types or using
-	 * out-of-range indices).
-	 *
-	 * If it doesn't get removed, the linker will complain
-	 * loudly with a reasonably clear error message..
-	 */
-	if (idx >= __end_of_fixed_addresses)
-		__this_fixmap_does_not_exist();
-
-	return __fix_to_virt(idx);
-}
-
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
-	BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
-	return __virt_to_fix(vaddr);
-}
+#include <asm-generic/fixmap.h>
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/tile/include/asm/ftrace.h b/arch/tile/include/asm/ftrace.h
index 461459b06d9..13a9bb81a8a 100644
--- a/arch/tile/include/asm/ftrace.h
+++ b/arch/tile/include/asm/ftrace.h
@@ -15,6 +15,26 @@
 #ifndef _ASM_TILE_FTRACE_H
 #define _ASM_TILE_FTRACE_H
 
-/* empty */
+#ifdef CONFIG_FUNCTION_TRACER
+
+#define MCOUNT_ADDR ((unsigned long)(__mcount))
+#define MCOUNT_INSN_SIZE 8		/* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+extern void __mcount(void);
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+	return addr;
+}
+
+struct dyn_arch_ftrace {
+};
+#endif /*  CONFIG_DYNAMIC_FTRACE */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* CONFIG_FUNCTION_TRACER */
 
 #endif /* _ASM_TILE_FTRACE_H */
diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h
index d03ec124a59..1a6ef1b69cb 100644
--- a/arch/tile/include/asm/futex.h
+++ b/arch/tile/include/asm/futex.h
@@ -28,29 +28,82 @@
 #include <linux/futex.h>
 #include <linux/uaccess.h>
 #include <linux/errno.h>
+#include <asm/atomic.h>
 
-extern struct __get_user futex_set(u32 __user *v, int i);
-extern struct __get_user futex_add(u32 __user *v, int n);
-extern struct __get_user futex_or(u32 __user *v, int n);
-extern struct __get_user futex_andn(u32 __user *v, int n);
-extern struct __get_user futex_cmpxchg(u32 __user *v, int o, int n);
+/*
+ * Support macros for futex operations.  Do not use these macros directly.
+ * They assume "ret", "val", "oparg", and "uaddr" in the lexical context.
+ * __futex_cmpxchg() additionally assumes "oldval".
+ */
+
+#ifdef __tilegx__
+
+#define __futex_asm(OP) \
+	asm("1: {" #OP " %1, %3, %4; movei %0, 0 }\n"		\
+	    ".pushsection .fixup,\"ax\"\n"			\
+	    "0: { movei %0, %5; j 9f }\n"			\
+	    ".section __ex_table,\"a\"\n"			\
+	    ".align 8\n"					\
+	    ".quad 1b, 0b\n"					\
+	    ".popsection\n"					\
+	    "9:"						\
+	    : "=r" (ret), "=r" (val), "+m" (*(uaddr))		\
+	    : "r" (uaddr), "r" (oparg), "i" (-EFAULT))
+
+#define __futex_set() __futex_asm(exch4)
+#define __futex_add() __futex_asm(fetchadd4)
+#define __futex_or() __futex_asm(fetchor4)
+#define __futex_andn() ({ oparg = ~oparg; __futex_asm(fetchand4); })
+#define __futex_cmpxchg() \
+	({ __insn_mtspr(SPR_CMPEXCH_VALUE, oldval); __futex_asm(cmpexch4); })
+
+#define __futex_xor()						\
+	({							\
+		u32 oldval, n = oparg;				\
+		if ((ret = __get_user(oldval, uaddr)) == 0) {	\
+			do {					\
+				oparg = oldval ^ n;		\
+				__futex_cmpxchg();		\
+			} while (ret == 0 && oldval != val);	\
+		}						\
+	})
+
+/* No need to prefetch, since the atomic ops go to the home cache anyway. */
+#define __futex_prolog()
 
-#ifndef __tilegx__
-extern struct __get_user futex_xor(u32 __user *v, int n);
 #else
-static inline struct __get_user futex_xor(u32 __user *uaddr, int n)
-{
-	struct __get_user asm_ret = __get_user_4(uaddr);
-	if (!asm_ret.err) {
-		int oldval, newval;
-		do {
-			oldval = asm_ret.val;
-			newval = oldval ^ n;
-			asm_ret = futex_cmpxchg(uaddr, oldval, newval);
-		} while (asm_ret.err == 0 && oldval != asm_ret.val);
+
+#define __futex_call(FN)						\
+	{								\
+		struct __get_user gu = FN((u32 __force *)uaddr, lock, oparg); \
+		val = gu.val;						\
+		ret = gu.err;						\
 	}
-	return asm_ret;
-}
+
+#define __futex_set() __futex_call(__atomic_xchg)
+#define __futex_add() __futex_call(__atomic_xchg_add)
+#define __futex_or() __futex_call(__atomic_or)
+#define __futex_andn() __futex_call(__atomic_andn)
+#define __futex_xor() __futex_call(__atomic_xor)
+
+#define __futex_cmpxchg()						\
+	{								\
+		struct __get_user gu = __atomic_cmpxchg((u32 __force *)uaddr, \
+							lock, oldval, oparg); \
+		val = gu.val;						\
+		ret = gu.err;						\
+	}
+
+/*
+ * Find the lock pointer for the atomic calls to use, and issue a
+ * prefetch to the user address to bring it into cache.  Similar to
+ * __atomic_setup(), but we can't do a read into the L1 since it might
+ * fault; instead we do a prefetch into the L2.
+ */
+#define __futex_prolog()					\
+	int *lock;						\
+	__insn_prefetch(uaddr);					\
+	lock = __atomic_hashed_lock((int __force *)uaddr)
 #endif
 
 static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
@@ -59,8 +112,12 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	int cmp = (encoded_op >> 24) & 15;
 	int oparg = (encoded_op << 8) >> 20;
 	int cmparg = (encoded_op << 20) >> 20;
-	int ret;
-	struct __get_user asm_ret;
+	int uninitialized_var(val), ret;
+
+	__futex_prolog();
+
+	/* The 32-bit futex code makes this assumption, so validate it here. */
+	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));
 
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
@@ -71,46 +128,45 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	pagefault_disable();
 	switch (op) {
 	case FUTEX_OP_SET:
-		asm_ret = futex_set(uaddr, oparg);
+		__futex_set();
 		break;
 	case FUTEX_OP_ADD:
-		asm_ret = futex_add(uaddr, oparg);
+		__futex_add();
 		break;
 	case FUTEX_OP_OR:
-		asm_ret = futex_or(uaddr, oparg);
+		__futex_or();
 		break;
 	case FUTEX_OP_ANDN:
-		asm_ret = futex_andn(uaddr, oparg);
+		__futex_andn();
 		break;
 	case FUTEX_OP_XOR:
-		asm_ret = futex_xor(uaddr, oparg);
+		__futex_xor();
 		break;
 	default:
-		asm_ret.err = -ENOSYS;
+		ret = -ENOSYS;
+		break;
 	}
 	pagefault_enable();
 
-	ret = asm_ret.err;
-
 	if (!ret) {
 		switch (cmp) {
 		case FUTEX_OP_CMP_EQ:
-			ret = (asm_ret.val == cmparg);
+			ret = (val == cmparg);
 			break;
 		case FUTEX_OP_CMP_NE:
-			ret = (asm_ret.val != cmparg);
+			ret = (val != cmparg);
 			break;
 		case FUTEX_OP_CMP_LT:
-			ret = (asm_ret.val < cmparg);
+			ret = (val < cmparg);
 			break;
 		case FUTEX_OP_CMP_GE:
-			ret = (asm_ret.val >= cmparg);
+			ret = (val >= cmparg);
 			break;
 		case FUTEX_OP_CMP_LE:
-			ret = (asm_ret.val <= cmparg);
+			ret = (val <= cmparg);
 			break;
 		case FUTEX_OP_CMP_GT:
-			ret = (asm_ret.val > cmparg);
+			ret = (val > cmparg);
 			break;
 		default:
 			ret = -ENOSYS;
@@ -120,22 +176,20 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 }
 
 static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
-						u32 oldval, u32 newval)
+						u32 oldval, u32 oparg)
 {
-	struct __get_user asm_ret;
+	int ret, val;
+
+	__futex_prolog();
 
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
-	asm_ret = futex_cmpxchg(uaddr, oldval, newval);
-	*uval = asm_ret.val;
-	return asm_ret.err;
-}
+	__futex_cmpxchg();
 
-#ifndef __tilegx__
-/* Return failure from the atomic wrappers. */
-struct __get_user __atomic_bad_address(int __user *addr);
-#endif
+	*uval = val;
+	return ret;
+}
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/tile/include/asm/hardirq.h b/arch/tile/include/asm/hardirq.h
index 822390f9a15..54110af2398 100644
--- a/arch/tile/include/asm/hardirq.h
+++ b/arch/tile/include/asm/hardirq.h
@@ -42,6 +42,4 @@ DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
 
 #include <linux/irq_cpustat.h>	/* Standard mappings for irq_cpustat_t above */
 
-#define HARDIRQ_BITS	8
-
 #endif /* _ASM_TILE_HARDIRQ_H */
diff --git a/arch/tile/include/asm/hardwall.h b/arch/tile/include/asm/hardwall.h
index 2ac422848c7..2f572b6b7bc 100644
--- a/arch/tile/include/asm/hardwall.h
+++ b/arch/tile/include/asm/hardwall.h
@@ -11,45 +11,13 @@
  *   NON INFRINGEMENT.  See the GNU General Public License for
  *   more details.
  *
- * Provide methods for the HARDWALL_FILE for accessing the UDN.
+ * Provide methods for access control of per-cpu resources like
+ * UDN, IDN, or IPI.
  */
-
 #ifndef _ASM_TILE_HARDWALL_H
 #define _ASM_TILE_HARDWALL_H
 
-#include <linux/ioctl.h>
-
-#define HARDWALL_IOCTL_BASE 0xa2
-
-/*
- * The HARDWALL_CREATE() ioctl is a macro with a "size" argument.
- * The resulting ioctl value is passed to the kernel in conjunction
- * with a pointer to a little-endian bitmask of cpus, which must be
- * physically in a rectangular configuration on the chip.
- * The "size" is the number of bytes of cpu mask data.
- */
-#define _HARDWALL_CREATE 1
-#define HARDWALL_CREATE(size) \
-  _IOC(_IOC_READ, HARDWALL_IOCTL_BASE, _HARDWALL_CREATE, (size))
-
-#define _HARDWALL_ACTIVATE 2
-#define HARDWALL_ACTIVATE \
-  _IO(HARDWALL_IOCTL_BASE, _HARDWALL_ACTIVATE)
-
-#define _HARDWALL_DEACTIVATE 3
-#define HARDWALL_DEACTIVATE \
- _IO(HARDWALL_IOCTL_BASE, _HARDWALL_DEACTIVATE)
-
-#define _HARDWALL_GET_ID 4
-#define HARDWALL_GET_ID \
- _IO(HARDWALL_IOCTL_BASE, _HARDWALL_GET_ID)
-
-#ifndef __KERNEL__
-
-/* This is the canonical name expected by userspace. */
-#define HARDWALL_FILE "/dev/hardwall"
-
-#else
+#include <uapi/asm/hardwall.h>
 
 /* /proc hooks for hardwall. */
 struct proc_dir_entry;
@@ -59,7 +27,4 @@ int proc_pid_hardwall(struct task_struct *task, char *buffer);
 #else
 static inline void proc_tile_hardwall_init(struct proc_dir_entry *root) {}
 #endif
-
-#endif
-
 #endif /* _ASM_TILE_HARDWALL_H */
diff --git a/arch/tile/include/asm/highmem.h b/arch/tile/include/asm/highmem.h
index b2a6c5de79a..fc8429a31c8 100644
--- a/arch/tile/include/asm/highmem.h
+++ b/arch/tile/include/asm/highmem.h
@@ -59,7 +59,7 @@ void *kmap_fix_kpte(struct page *page, int finished);
 /* This macro is used only in map_new_virtual() to map "page". */
 #define kmap_prot page_to_kpgprot(page)
 
-void *__kmap_atomic(struct page *page);
+void *kmap_atomic(struct page *page);
 void __kunmap_atomic(void *kvaddr);
 void *kmap_atomic_pfn(unsigned long pfn);
 void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
diff --git a/arch/tile/include/asm/homecache.h b/arch/tile/include/asm/homecache.h
index a8243865d49..7ddd1b8d691 100644
--- a/arch/tile/include/asm/homecache.h
+++ b/arch/tile/include/asm/homecache.h
@@ -33,8 +33,7 @@ struct zone;
 
 /*
  * Is this page immutable (unwritable) and thus able to be cached more
- * widely than would otherwise be possible?  On tile64 this means we
- * mark the PTE to cache locally; on tilepro it means we have "nc" set.
+ * widely than would otherwise be possible?  This means we have "nc" set.
  */
 #define PAGE_HOME_IMMUTABLE -2
 
@@ -44,16 +43,8 @@ struct zone;
  */
 #define PAGE_HOME_INCOHERENT -3
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 /* Home for the page is distributed via hash-for-home. */
 #define PAGE_HOME_HASH -4
-#endif
-
-/* Homing is unknown or unspecified.  Not valid for page_home(). */
-#define PAGE_HOME_UNKNOWN -5
-
-/* Home on the current cpu.  Not valid for page_home(). */
-#define PAGE_HOME_HERE -6
 
 /* Support wrapper to use instead of explicit hv_flush_remote(). */
 extern void flush_remote(unsigned long cache_pfn, unsigned long cache_length,
@@ -79,10 +70,17 @@ extern void homecache_change_page_home(struct page *, int order, int home);
 /*
  * Flush a page out of whatever cache(s) it is in.
  * This is more than just finv, since it properly handles waiting
- * for the data to reach memory on tilepro, but it can be quite
- * heavyweight, particularly on hash-for-home memory.
+ * for the data to reach memory, but it can be quite
+ * heavyweight, particularly on incoherent or immutable memory.
+ */
+extern void homecache_finv_page(struct page *);
+
+/*
+ * Flush a page out of the specified home cache.
+ * Note that the specified home need not be the actual home of the page,
+ * as for example might be the case when coordinating with I/O devices.
  */
-extern void homecache_flush_cache(struct page *, int order);
+extern void homecache_finv_map_page(struct page *, int home);
 
 /*
  * Allocate a page with the given GFP flags, home, and optionally
@@ -104,10 +102,10 @@ extern struct page *homecache_alloc_pages_node(int nid, gfp_t gfp_mask,
  * routines use homecache_change_page_home() to reset the home
  * back to the default before returning the page to the allocator.
  */
+void __homecache_free_pages(struct page *, unsigned int order);
 void homecache_free_pages(unsigned long addr, unsigned int order);
-#define homecache_free_page(page) \
-  homecache_free_pages((page), 0)
-
+#define __homecache_free_page(page) __homecache_free_pages((page), 0)
+#define homecache_free_page(page) homecache_free_pages((page), 0)
 
 
 /*
diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h
index d396d180516..3257733003f 100644
--- a/arch/tile/include/asm/hugetlb.h
+++ b/arch/tile/include/asm/hugetlb.h
@@ -16,6 +16,7 @@
 #define _ASM_TILE_HUGETLB_H
 
 #include <asm/page.h>
+#include <asm-generic/hugetlb.h>
 
 
 static inline int is_hugepage_only_range(struct mm_struct *mm,
@@ -106,4 +107,29 @@ static inline void arch_release_hugepage(struct page *page)
 {
 }
 
+static inline void arch_clear_hugepage_flags(struct page *page)
+{
+}
+
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+				       struct page *page, int writable)
+{
+	size_t pagesize = huge_page_size(hstate_vma(vma));
+	if (pagesize != PUD_SIZE && pagesize != PMD_SIZE)
+		entry = pte_mksuper(entry);
+	return entry;
+}
+#define arch_make_huge_pte arch_make_huge_pte
+
+/* Sizes to scale up page size for PTEs with HV_PTE_SUPER bit. */
+enum {
+	HUGE_SHIFT_PGDIR = 0,
+	HUGE_SHIFT_PMD = 1,
+	HUGE_SHIFT_PAGE = 2,
+	HUGE_SHIFT_ENTRIES
+};
+extern int huge_shift[HUGE_SHIFT_ENTRIES];
+#endif
+
 #endif /* _ASM_TILE_HUGETLB_H */
diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
index d2152deb1f3..9fe434969fa 100644
--- a/arch/tile/include/asm/io.h
+++ b/arch/tile/include/asm/io.h
@@ -19,7 +19,8 @@
 #include <linux/bug.h>
 #include <asm/page.h>
 
-#define IO_SPACE_LIMIT 0xfffffffful
+/* Maximum PCI I/O space address supported. */
+#define IO_SPACE_LIMIT 0xffffffff
 
 /*
  * Convert a physical pointer to a virtual kernel pointer for /dev/mem
@@ -62,6 +63,92 @@ extern void iounmap(volatile void __iomem *addr);
 #define mm_ptov(addr)		((void *)phys_to_virt(addr))
 #define mm_vtop(addr)		((unsigned long)virt_to_phys(addr))
 
+#if CHIP_HAS_MMIO()
+
+/*
+ * We use inline assembly to guarantee that the compiler does not
+ * split an access into multiple byte-sized accesses as it might
+ * sometimes do if a register data structure is marked "packed".
+ * Obviously on tile we can't tolerate such an access being
+ * actually unaligned, but we want to avoid the case where the
+ * compiler conservatively would generate multiple accesses even
+ * for an aligned read or write.
+ */
+
+static inline u8 __raw_readb(const volatile void __iomem *addr)
+{
+	return *(const volatile u8 __force *)addr;
+}
+
+static inline u16 __raw_readw(const volatile void __iomem *addr)
+{
+	u16 ret;
+	asm volatile("ld2u %0, %1" : "=r" (ret) : "r" (addr));
+	barrier();
+	return le16_to_cpu(ret);
+}
+
+static inline u32 __raw_readl(const volatile void __iomem *addr)
+{
+	u32 ret;
+	/* Sign-extend to conform to u32 ABI sign-extension convention. */
+	asm volatile("ld4s %0, %1" : "=r" (ret) : "r" (addr));
+	barrier();
+	return le32_to_cpu(ret);
+}
+
+static inline u64 __raw_readq(const volatile void __iomem *addr)
+{
+	u64 ret;
+	asm volatile("ld %0, %1" : "=r" (ret) : "r" (addr));
+	barrier();
+	return le64_to_cpu(ret);
+}
+
+static inline void __raw_writeb(u8 val, volatile void __iomem *addr)
+{
+	*(volatile u8 __force *)addr = val;
+}
+
+static inline void __raw_writew(u16 val, volatile void __iomem *addr)
+{
+	asm volatile("st2 %0, %1" :: "r" (addr), "r" (cpu_to_le16(val)));
+}
+
+static inline void __raw_writel(u32 val, volatile void __iomem *addr)
+{
+	asm volatile("st4 %0, %1" :: "r" (addr), "r" (cpu_to_le32(val)));
+}
+
+static inline void __raw_writeq(u64 val, volatile void __iomem *addr)
+{
+	asm volatile("st %0, %1" :: "r" (addr), "r" (cpu_to_le64(val)));
+}
+
+/*
+ * The on-chip I/O hardware on tilegx is configured with VA=PA for the
+ * kernel's PA range.  The low-level APIs and field names use "va" and
+ * "void *" nomenclature, to be consistent with the general notion
+ * that the addresses in question are virtualizable, but in the kernel
+ * context we are actually manipulating PA values.  (In other contexts,
+ * e.g. access from user space, we do in fact use real virtual addresses
+ * in the va fields.)  To allow readers of the code to understand what's
+ * happening, we direct their attention to this comment by using the
+ * following two functions that just duplicate __va() and __pa().
+ */
+typedef unsigned long tile_io_addr_t;
+static inline tile_io_addr_t va_to_tile_io_addr(void *va)
+{
+	BUILD_BUG_ON(sizeof(phys_addr_t) != sizeof(tile_io_addr_t));
+	return __pa(va);
+}
+static inline void *tile_io_addr_to_va(tile_io_addr_t tile_io_addr)
+{
+	return __va(tile_io_addr);
+}
+
+#else /* CHIP_HAS_MMIO() */
+
 #ifdef CONFIG_PCI
 
 extern u8 _tile_readb(unsigned long addr);
@@ -73,10 +160,19 @@ extern void _tile_writew(u16 val, unsigned long addr);
 extern void _tile_writel(u32 val, unsigned long addr);
 extern void _tile_writeq(u64 val, unsigned long addr);
 
-#else
+#define __raw_readb(addr) _tile_readb((unsigned long)addr)
+#define __raw_readw(addr) _tile_readw((unsigned long)addr)
+#define __raw_readl(addr) _tile_readl((unsigned long)addr)
+#define __raw_readq(addr) _tile_readq((unsigned long)addr)
+#define __raw_writeb(val, addr) _tile_writeb(val, (unsigned long)addr)
+#define __raw_writew(val, addr) _tile_writew(val, (unsigned long)addr)
+#define __raw_writel(val, addr) _tile_writel(val, (unsigned long)addr)
+#define __raw_writeq(val, addr) _tile_writeq(val, (unsigned long)addr)
+
+#else /* CONFIG_PCI */
 
 /*
- * The Tile architecture does not support IOMEM unless PCI is enabled.
+ * The tilepro architecture does not support IOMEM unless PCI is enabled.
  * Unfortunately we can't yet simply not declare these methods,
  * since some generic code that compiles into the kernel, but
  * we never run, uses them unconditionally.
@@ -88,65 +184,58 @@ static inline int iomem_panic(void)
 	return 0;
 }
 
-static inline u8 _tile_readb(unsigned long addr)
+static inline u8 readb(unsigned long addr)
 {
 	return iomem_panic();
 }
 
-static inline u16 _tile_readw(unsigned long addr)
+static inline u16 _readw(unsigned long addr)
 {
 	return iomem_panic();
 }
 
-static inline u32 _tile_readl(unsigned long addr)
+static inline u32 readl(unsigned long addr)
 {
 	return iomem_panic();
 }
 
-static inline u64 _tile_readq(unsigned long addr)
+static inline u64 readq(unsigned long addr)
 {
 	return iomem_panic();
 }
 
-static inline void _tile_writeb(u8  val, unsigned long addr)
+static inline void writeb(u8  val, unsigned long addr)
 {
 	iomem_panic();
 }
 
-static inline void _tile_writew(u16 val, unsigned long addr)
+static inline void writew(u16 val, unsigned long addr)
 {
 	iomem_panic();
 }
 
-static inline void _tile_writel(u32 val, unsigned long addr)
+static inline void writel(u32 val, unsigned long addr)
 {
 	iomem_panic();
 }
 
-static inline void _tile_writeq(u64 val, unsigned long addr)
+static inline void writeq(u64 val, unsigned long addr)
 {
 	iomem_panic();
 }
 
-#endif
+#endif /* CONFIG_PCI */
 
-#define readb(addr) _tile_readb((unsigned long)addr)
-#define readw(addr) _tile_readw((unsigned long)addr)
-#define readl(addr) _tile_readl((unsigned long)addr)
-#define readq(addr) _tile_readq((unsigned long)addr)
-#define writeb(val, addr) _tile_writeb(val, (unsigned long)addr)
-#define writew(val, addr) _tile_writew(val, (unsigned long)addr)
-#define writel(val, addr) _tile_writel(val, (unsigned long)addr)
-#define writeq(val, addr) _tile_writeq(val, (unsigned long)addr)
-
-#define __raw_readb readb
-#define __raw_readw readw
-#define __raw_readl readl
-#define __raw_readq readq
-#define __raw_writeb writeb
-#define __raw_writew writew
-#define __raw_writel writel
-#define __raw_writeq writeq
+#endif /* CHIP_HAS_MMIO() */
+
+#define readb __raw_readb
+#define readw __raw_readw
+#define readl __raw_readl
+#define readq __raw_readq
+#define writeb __raw_writeb
+#define writew __raw_writew
+#define writel __raw_writel
+#define writeq __raw_writeq
 
 #define readb_relaxed readb
 #define readw_relaxed readw
@@ -162,9 +251,11 @@ static inline void _tile_writeq(u64 val, unsigned long addr)
 #define iowrite32 writel
 #define iowrite64 writeq
 
-static inline void memset_io(void *dst, int val, size_t len)
+#if CHIP_HAS_MMIO() || defined(CONFIG_PCI)
+
+static inline void memset_io(volatile void *dst, int val, size_t len)
 {
-	int x;
+	size_t x;
 	BUG_ON((unsigned long)dst & 0x3);
 	val = (val & 0xff) * 0x01010101;
 	for (x = 0; x < len; x += 4)
@@ -174,7 +265,7 @@ static inline void memset_io(void *dst, int val, size_t len)
 static inline void memcpy_fromio(void *dst, const volatile void __iomem *src,
 				 size_t len)
 {
-	int x;
+	size_t x;
 	BUG_ON((unsigned long)src & 0x3);
 	for (x = 0; x < len; x += 4)
 		*(u32 *)(dst + x) = readl(src + x);
@@ -183,14 +274,116 @@ static inline void memcpy_fromio(void *dst, const volatile void __iomem *src,
 static inline void memcpy_toio(volatile void __iomem *dst, const void *src,
 				size_t len)
 {
-	int x;
+	size_t x;
 	BUG_ON((unsigned long)dst & 0x3);
 	for (x = 0; x < len; x += 4)
 		writel(*(u32 *)(src + x), dst + x);
 }
 
+#endif
+
+#if CHIP_HAS_MMIO() && defined(CONFIG_TILE_PCI_IO)
+
+static inline u8 inb(unsigned long addr)
+{
+	return readb((volatile void __iomem *) addr);
+}
+
+static inline u16 inw(unsigned long addr)
+{
+	return readw((volatile void __iomem *) addr);
+}
+
+static inline u32 inl(unsigned long addr)
+{
+	return readl((volatile void __iomem *) addr);
+}
+
+static inline void outb(u8 b, unsigned long addr)
+{
+	writeb(b, (volatile void __iomem *) addr);
+}
+
+static inline void outw(u16 b, unsigned long addr)
+{
+	writew(b, (volatile void __iomem *) addr);
+}
+
+static inline void outl(u32 b, unsigned long addr)
+{
+	writel(b, (volatile void __iomem *) addr);
+}
+
+static inline void insb(unsigned long addr, void *buffer, int count)
+{
+	if (count) {
+		u8 *buf = buffer;
+		do {
+			u8 x = inb(addr);
+			*buf++ = x;
+		} while (--count);
+	}
+}
+
+static inline void insw(unsigned long addr, void *buffer, int count)
+{
+	if (count) {
+		u16 *buf = buffer;
+		do {
+			u16 x = inw(addr);
+			*buf++ = x;
+		} while (--count);
+	}
+}
+
+static inline void insl(unsigned long addr, void *buffer, int count)
+{
+	if (count) {
+		u32 *buf = buffer;
+		do {
+			u32 x = inl(addr);
+			*buf++ = x;
+		} while (--count);
+	}
+}
+
+static inline void outsb(unsigned long addr, const void *buffer, int count)
+{
+	if (count) {
+		const u8 *buf = buffer;
+		do {
+			outb(*buf++, addr);
+		} while (--count);
+	}
+}
+
+static inline void outsw(unsigned long addr, const void *buffer, int count)
+{
+	if (count) {
+		const u16 *buf = buffer;
+		do {
+			outw(*buf++, addr);
+		} while (--count);
+	}
+}
+
+static inline void outsl(unsigned long addr, const void *buffer, int count)
+{
+	if (count) {
+		const u32 *buf = buffer;
+		do {
+			outl(*buf++, addr);
+		} while (--count);
+	}
+}
+
+extern void __iomem *ioport_map(unsigned long port, unsigned int len);
+extern void ioport_unmap(void __iomem *addr);
+
+#else
+
 /*
- * The Tile architecture does not support IOPORT, even with PCI.
+ * The TilePro architecture does not support IOPORT, even with PCI.
  * Unfortunately we can't yet simply not declare these methods,
  * since some generic code that compiles into the kernel, but
  * we never run, uses them unconditionally.
@@ -198,7 +391,12 @@ static inline void memcpy_toio(volatile void __iomem *dst, const void *src,
 
 static inline long ioport_panic(void)
 {
+#ifdef __tilegx__
+	panic("PCI IO space support is disabled. Configure the kernel with"
+	      " CONFIG_TILE_PCI_IO to enable it");
+#else
 	panic("inb/outb and friends do not exist on tile");
+#endif
 	return 0;
 }
 
@@ -243,13 +441,6 @@ static inline void outl(u32 b, unsigned long addr)
 	ioport_panic();
 }
 
-#define inb_p(addr)	inb(addr)
-#define inw_p(addr)	inw(addr)
-#define inl_p(addr)	inl(addr)
-#define outb_p(x, addr)	outb((x), (addr))
-#define outw_p(x, addr)	outw((x), (addr))
-#define outl_p(x, addr)	outl((x), (addr))
-
 static inline void insb(unsigned long addr, void *buffer, int count)
 {
 	ioport_panic();
@@ -280,6 +471,15 @@ static inline void outsl(unsigned long addr, const void *buffer, int count)
 	ioport_panic();
 }
 
+#endif /* CHIP_HAS_MMIO() && defined(CONFIG_TILE_PCI_IO) */
+
+#define inb_p(addr)	inb(addr)
+#define inw_p(addr)	inw(addr)
+#define inl_p(addr)	inl(addr)
+#define outb_p(x, addr)	outb((x), (addr))
+#define outw_p(x, addr)	outw((x), (addr))
+#define outl_p(x, addr)	outl((x), (addr))
+
 #define ioread16be(addr)	be16_to_cpu(ioread16(addr))
 #define ioread32be(addr)	be32_to_cpu(ioread32(addr))
 #define iowrite16be(v, addr)	iowrite16(be16_to_cpu(v), (addr))
diff --git a/arch/tile/include/asm/irq.h b/arch/tile/include/asm/irq.h
index f80f8ceabc6..1fe86911838 100644
--- a/arch/tile/include/asm/irq.h
+++ b/arch/tile/include/asm/irq.h
@@ -18,10 +18,12 @@
 #include <linux/hardirq.h>
 
 /* The hypervisor interface provides 32 IRQs. */
-#define NR_IRQS 32
+#define NR_IRQS			32
 
 /* IRQ numbers used for linux IPIs. */
-#define IRQ_RESCHEDULE 1
+#define IRQ_RESCHEDULE	0
+/* Interrupts for dynamic allocation start at 1. Let the core allocate irq0 */
+#define NR_IRQS_LEGACY	1
 
 #define irq_canonicalize(irq)   (irq)
 
diff --git a/arch/tile/include/asm/irqflags.h b/arch/tile/include/asm/irqflags.h
index 5db0ce54284..71af5747874 100644
--- a/arch/tile/include/asm/irqflags.h
+++ b/arch/tile/include/asm/irqflags.h
@@ -18,32 +18,20 @@
 #include <arch/interrupts.h>
 #include <arch/chip.h>
 
-#if !defined(__tilegx__) && defined(__ASSEMBLY__)
-
 /*
  * The set of interrupts we want to allow when interrupts are nominally
  * disabled.  The remainder are effectively "NMI" interrupts from
  * the point of view of the generic Linux code.  Note that synchronous
  * interrupts (aka "non-queued") are not blocked by the mask in any case.
  */
-#if CHIP_HAS_AUX_PERF_COUNTERS()
-#define LINUX_MASKABLE_INTERRUPTS_HI \
-       (~(INT_MASK_HI(INT_PERF_COUNT) | INT_MASK_HI(INT_AUX_PERF_COUNT)))
-#else
-#define LINUX_MASKABLE_INTERRUPTS_HI \
-       (~(INT_MASK_HI(INT_PERF_COUNT)))
-#endif
-
-#else
-
-#if CHIP_HAS_AUX_PERF_COUNTERS()
-#define LINUX_MASKABLE_INTERRUPTS \
-	(~(INT_MASK(INT_PERF_COUNT) | INT_MASK(INT_AUX_PERF_COUNT)))
-#else
 #define LINUX_MASKABLE_INTERRUPTS \
-	(~(INT_MASK(INT_PERF_COUNT)))
-#endif
+	(~((_AC(1,ULL) << INT_PERF_COUNT) | (_AC(1,ULL) << INT_AUX_PERF_COUNT)))
 
+#if CHIP_HAS_SPLIT_INTR_MASK()
+/* The same macro, but for the two 32-bit SPRs separately. */
+#define LINUX_MASKABLE_INTERRUPTS_LO (-1)
+#define LINUX_MASKABLE_INTERRUPTS_HI \
+	(~((1 << (INT_PERF_COUNT - 32)) | (1 << (INT_AUX_PERF_COUNT - 32))))
 #endif
 
 #ifndef __ASSEMBLY__
@@ -52,7 +40,15 @@
 #include <asm/percpu.h>
 #include <arch/spr_def.h>
 
-/* Set and clear kernel interrupt masks. */
+/*
+ * Set and clear kernel interrupt masks.
+ *
+ * NOTE: __insn_mtspr() is a compiler builtin marked as a memory
+ * clobber.  We rely on it being equivalent to a compiler barrier in
+ * this code since arch_local_irq_save() and friends must act as
+ * compiler barriers.  This compiler semantic is baked into enough
+ * places that the compiler will maintain it going forward.
+ */
 #if CHIP_HAS_SPLIT_INTR_MASK()
 #if INT_PERF_COUNT < 32 || INT_AUX_PERF_COUNT < 32 || INT_MEM_ERROR >= 32
 # error Fix assumptions about which word various interrupts are in
@@ -90,6 +86,14 @@
 	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K_0, (unsigned long)(__m)); \
 	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K_1, (unsigned long)(__m>>32)); \
 } while (0)
+#define interrupt_mask_save_mask() \
+	(__insn_mfspr(SPR_INTERRUPT_MASK_SET_K_0) | \
+	 (((unsigned long long)__insn_mfspr(SPR_INTERRUPT_MASK_SET_K_1))<<32))
+#define interrupt_mask_restore_mask(mask) do { \
+	unsigned long long __m = (mask); \
+	__insn_mtspr(SPR_INTERRUPT_MASK_K_0, (unsigned long)(__m)); \
+	__insn_mtspr(SPR_INTERRUPT_MASK_K_1, (unsigned long)(__m>>32)); \
+} while (0)
 #else
 #define interrupt_mask_set(n) \
 	__insn_mtspr(SPR_INTERRUPT_MASK_SET_K, (1UL << (n)))
@@ -101,6 +105,10 @@
 	__insn_mtspr(SPR_INTERRUPT_MASK_SET_K, (mask))
 #define interrupt_mask_reset_mask(mask) \
 	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K, (mask))
+#define interrupt_mask_save_mask() \
+	__insn_mfspr(SPR_INTERRUPT_MASK_K)
+#define interrupt_mask_restore_mask(mask) \
+	__insn_mtspr(SPR_INTERRUPT_MASK_K, (mask))
 #endif
 
 /*
@@ -114,7 +122,13 @@
  * to know our current state.
  */
 DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
-#define INITIAL_INTERRUPTS_ENABLED INT_MASK(INT_MEM_ERROR)
+#define INITIAL_INTERRUPTS_ENABLED (1ULL << INT_MEM_ERROR)
+
+#ifdef CONFIG_DEBUG_PREEMPT
+/* Due to inclusion issues, we can't rely on <linux/smp.h> here. */
+extern unsigned int debug_smp_processor_id(void);
+# define smp_processor_id() debug_smp_processor_id()
+#endif
 
 /* Disable interrupts. */
 #define arch_local_irq_disable() \
@@ -122,11 +136,20 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 
 /* Disable all interrupts, including NMIs. */
 #define arch_local_irq_disable_all() \
-	interrupt_mask_set_mask(-1UL)
+	interrupt_mask_set_mask(-1ULL)
+
+/*
+ * Read the set of maskable interrupts.
+ * We avoid the preemption warning here via __this_cpu_ptr since even
+ * if irqs are already enabled, it's harmless to read the wrong cpu's
+ * enabled mask.
+ */
+#define arch_local_irqs_enabled() \
+	(*__this_cpu_ptr(&interrupts_enabled_mask))
 
 /* Re-enable all maskable interrupts. */
 #define arch_local_irq_enable() \
-	interrupt_mask_reset_mask(__get_cpu_var(interrupts_enabled_mask))
+	interrupt_mask_reset_mask(arch_local_irqs_enabled())
 
 /* Disable or enable interrupts based on flag argument. */
 #define arch_local_irq_restore(disabled) do { \
@@ -153,7 +176,7 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 
 /* Prevent the given interrupt from being enabled next time we enable irqs. */
 #define arch_local_irq_mask(interrupt) \
-	(__get_cpu_var(interrupts_enabled_mask) &= ~INT_MASK(interrupt))
+	this_cpu_and(interrupts_enabled_mask, ~(1ULL << (interrupt)))
 
 /* Prevent the given interrupt from being enabled immediately. */
 #define arch_local_irq_mask_now(interrupt) do { \
@@ -163,7 +186,7 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 
 /* Allow the given interrupt to be enabled next time we enable irqs. */
 #define arch_local_irq_unmask(interrupt) \
-	(__get_cpu_var(interrupts_enabled_mask) |= INT_MASK(interrupt))
+	this_cpu_or(interrupts_enabled_mask, (1ULL << (interrupt)))
 
 /* Allow the given interrupt to be enabled immediately, if !irqs_disabled. */
 #define arch_local_irq_unmask_now(interrupt) do { \
@@ -179,7 +202,7 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 #ifdef __tilegx__
 
 #if INT_MEM_ERROR != 0
-# error Fix IRQ_DISABLED() macro
+# error Fix IRQS_DISABLED() macro
 #endif
 
 /* Return 0 or 1 to indicate whether interrupts are currently disabled. */
@@ -207,9 +230,10 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 	mtspr   SPR_INTERRUPT_MASK_SET_K, tmp
 
 /* Enable interrupts. */
-#define IRQ_ENABLE(tmp0, tmp1)					\
+#define IRQ_ENABLE_LOAD(tmp0, tmp1)				\
 	GET_INTERRUPTS_ENABLED_MASK_PTR(tmp0);			\
-	ld      tmp0, tmp0;					\
+	ld      tmp0, tmp0
+#define IRQ_ENABLE_APPLY(tmp0, tmp1)				\
 	mtspr   SPR_INTERRUPT_MASK_RESET_K, tmp0
 
 #else /* !__tilegx__ */
@@ -237,7 +261,7 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 /* Disable interrupts. */
 #define IRQ_DISABLE(tmp0, tmp1)					\
 	{							\
-	 movei  tmp0, -1;					\
+	 movei  tmp0, LINUX_MASKABLE_INTERRUPTS_LO;		\
 	 moveli tmp1, lo16(LINUX_MASKABLE_INTERRUPTS_HI)	\
 	};							\
 	{							\
@@ -253,17 +277,22 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 	mtspr   SPR_INTERRUPT_MASK_SET_K_1, tmp
 
 /* Enable interrupts. */
-#define IRQ_ENABLE(tmp0, tmp1)					\
+#define IRQ_ENABLE_LOAD(tmp0, tmp1)				\
 	GET_INTERRUPTS_ENABLED_MASK_PTR(tmp0);			\
 	{							\
 	 lw     tmp0, tmp0;					\
 	 addi   tmp1, tmp0, 4					\
 	};							\
-	lw      tmp1, tmp1;					\
+	lw      tmp1, tmp1
+#define IRQ_ENABLE_APPLY(tmp0, tmp1)				\
 	mtspr   SPR_INTERRUPT_MASK_RESET_K_0, tmp0;		\
 	mtspr   SPR_INTERRUPT_MASK_RESET_K_1, tmp1
 #endif
 
+#define IRQ_ENABLE(tmp0, tmp1)					\
+	IRQ_ENABLE_LOAD(tmp0, tmp1);				\
+	IRQ_ENABLE_APPLY(tmp0, tmp1)
+
 /*
  * Do the CPU's IRQ-state tracing from assembly code. We call a
  * C function, but almost everywhere we do, we don't mind clobbering
diff --git a/arch/tile/include/asm/kdebug.h b/arch/tile/include/asm/kdebug.h
new file mode 100644
index 00000000000..5bbbfa904c2
--- /dev/null
+++ b/arch/tile/include/asm/kdebug.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_KDEBUG_H
+#define _ASM_TILE_KDEBUG_H
+
+#include <linux/notifier.h>
+
+enum die_val {
+	DIE_OOPS = 1,
+	DIE_BREAK,
+	DIE_SSTEPBP,
+	DIE_PAGE_FAULT,
+	DIE_COMPILED_BPT
+};
+
+#endif /* _ASM_TILE_KDEBUG_H */
diff --git a/arch/tile/include/asm/kexec.h b/arch/tile/include/asm/kexec.h
index c11a6cc73bb..fc98ccfc98a 100644
--- a/arch/tile/include/asm/kexec.h
+++ b/arch/tile/include/asm/kexec.h
@@ -19,12 +19,24 @@
 
 #include <asm/page.h>
 
+#ifndef __tilegx__
 /* Maximum physical address we can use pages from. */
 #define KEXEC_SOURCE_MEMORY_LIMIT TASK_SIZE
 /* Maximum address we can reach in physical address mode. */
 #define KEXEC_DESTINATION_MEMORY_LIMIT TASK_SIZE
 /* Maximum address we can use for the control code buffer. */
 #define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
+#else
+/* We need to limit the memory below PGDIR_SIZE since
+ * we only setup page table for [0, PGDIR_SIZE) before final kexec.
+ */
+/* Maximum physical address we can use pages from. */
+#define KEXEC_SOURCE_MEMORY_LIMIT PGDIR_SIZE
+/* Maximum address we can reach in physical address mode. */
+#define KEXEC_DESTINATION_MEMORY_LIMIT PGDIR_SIZE
+/* Maximum address we can use for the control code buffer. */
+#define KEXEC_CONTROL_MEMORY_LIMIT PGDIR_SIZE
+#endif
 
 #define KEXEC_CONTROL_PAGE_SIZE	PAGE_SIZE
 
diff --git a/arch/tile/include/asm/kgdb.h b/arch/tile/include/asm/kgdb.h
new file mode 100644
index 00000000000..280c181cf0d
--- /dev/null
+++ b/arch/tile/include/asm/kgdb.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx KGDB support.
+ */
+
+#ifndef __TILE_KGDB_H__
+#define __TILE_KGDB_H__
+
+#include <linux/kdebug.h>
+#include <arch/opcode.h>
+
+#define GDB_SIZEOF_REG		sizeof(unsigned long)
+
+/*
+ * TILE-Gx gdb is expecting the following register layout:
+ * 56 GPRs(R0 - R52, TP, SP, LR), 8 special GPRs(networks and ZERO),
+ * plus the PC and the faultnum.
+ *
+ * Even though kernel not use the 8 special GPRs, they need to be present
+ * in the registers sent for correct processing in the host-side gdb.
+ *
+ */
+#define DBG_MAX_REG_NUM		(56+8+2)
+#define NUMREGBYTES		(DBG_MAX_REG_NUM * GDB_SIZEOF_REG)
+
+/*
+ * BUFMAX defines the maximum number of characters in inbound/outbound
+ * buffers at least NUMREGBYTES*2 are needed for register packets,
+ * Longer buffer is needed to list all threads.
+ */
+#define BUFMAX			2048
+
+#define BREAK_INSTR_SIZE	TILEGX_BUNDLE_SIZE_IN_BYTES
+
+/*
+ * Require cache flush for set/clear a software breakpoint or write memory.
+ */
+#define CACHE_FLUSH_IS_SAFE	1
+
+/*
+ * The compiled-in breakpoint instruction can be used to "break" into
+ * the debugger via magic system request key (sysrq-G).
+ */
+static tile_bundle_bits compiled_bpt = TILEGX_BPT_BUNDLE | DIE_COMPILED_BPT;
+
+enum tilegx_regnum {
+	TILEGX_PC_REGNUM = TREG_LAST_GPR + 9,
+	TILEGX_FAULTNUM_REGNUM,
+};
+
+/*
+ * Generate a breakpoint exception to "break" into the debugger.
+ */
+static inline void arch_kgdb_breakpoint(void)
+{
+	asm volatile (".quad %0\n\t"
+		      ::""(compiled_bpt));
+}
+
+#endif /* __TILE_KGDB_H__ */
diff --git a/arch/tile/include/asm/kmap_types.h b/arch/tile/include/asm/kmap_types.h
index 3d0f2024626..92b28e3e997 100644
--- a/arch/tile/include/asm/kmap_types.h
+++ b/arch/tile/include/asm/kmap_types.h
@@ -23,35 +23,6 @@
  * adds 4MB of required address-space.  For now we leave KM_TYPE_NR
  * set to depth 8.
  */
-enum km_type {
-	KM_TYPE_NR = 8
-};
-
-/*
- * We provide dummy definitions of all the stray values that used to be
- * required for kmap_atomic() and no longer are.
- */
-enum {
-	KM_BOUNCE_READ,
-	KM_SKB_SUNRPC_DATA,
-	KM_SKB_DATA_SOFTIRQ,
-	KM_USER0,
-	KM_USER1,
-	KM_BIO_SRC_IRQ,
-	KM_BIO_DST_IRQ,
-	KM_PTE0,
-	KM_PTE1,
-	KM_IRQ0,
-	KM_IRQ1,
-	KM_SOFTIRQ0,
-	KM_SOFTIRQ1,
-	KM_SYNC_ICACHE,
-	KM_SYNC_DCACHE,
-	KM_UML_USERCOPY,
-	KM_IRQ_PTE,
-	KM_NMI,
-	KM_NMI_PTE,
-	KM_KDB
-};
+#define	KM_TYPE_NR 8
 
 #endif /* _ASM_TILE_KMAP_TYPES_H */
diff --git a/arch/tile/include/asm/kprobes.h b/arch/tile/include/asm/kprobes.h
new file mode 100644
index 00000000000..d8f9a83943b
--- /dev/null
+++ b/arch/tile/include/asm/kprobes.h
@@ -0,0 +1,79 @@
+/*
+ * arch/tile/include/asm/kprobes.h
+ *
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_KPROBES_H
+#define _ASM_TILE_KPROBES_H
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+
+#include <arch/opcode.h>
+
+#define __ARCH_WANT_KPROBES_INSN_SLOT
+#define MAX_INSN_SIZE			2
+
+#define kretprobe_blacklist_size 0
+
+typedef tile_bundle_bits kprobe_opcode_t;
+
+#define flush_insn_slot(p)						\
+	flush_icache_range((unsigned long)p->addr,			\
+			   (unsigned long)p->addr +			\
+			   (MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+
+struct kprobe;
+
+/* Architecture specific copy of original instruction. */
+struct arch_specific_insn {
+	kprobe_opcode_t *insn;
+};
+
+struct prev_kprobe {
+	struct kprobe *kp;
+	unsigned long status;
+	unsigned long saved_pc;
+};
+
+#define MAX_JPROBES_STACK_SIZE 128
+#define MAX_JPROBES_STACK_ADDR \
+	(((unsigned long)current_thread_info()) + THREAD_SIZE - 32 \
+		- sizeof(struct pt_regs))
+
+#define MIN_JPROBES_STACK_SIZE(ADDR)					\
+	((((ADDR) + MAX_JPROBES_STACK_SIZE) > MAX_JPROBES_STACK_ADDR)	\
+		? MAX_JPROBES_STACK_ADDR - (ADDR)			\
+		: MAX_JPROBES_STACK_SIZE)
+
+/* per-cpu kprobe control block. */
+struct kprobe_ctlblk {
+	unsigned long kprobe_status;
+	unsigned long kprobe_saved_pc;
+	unsigned long jprobe_saved_sp;
+	struct prev_kprobe prev_kprobe;
+	struct pt_regs jprobe_saved_regs;
+	char jprobes_stack[MAX_JPROBES_STACK_SIZE];
+};
+
+extern tile_bundle_bits breakpoint2_insn;
+extern tile_bundle_bits breakpoint_insn;
+
+void arch_remove_kprobe(struct kprobe *);
+
+extern int kprobe_exceptions_notify(struct notifier_block *self,
+			     unsigned long val, void *data);
+
+#endif /* _ASM_TILE_KPROBES_H */
diff --git a/arch/tile/include/asm/memprof.h b/arch/tile/include/asm/memprof.h
deleted file mode 100644
index 359949be28c..00000000000
--- a/arch/tile/include/asm/memprof.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- *
- * The hypervisor's memory controller profiling infrastructure allows
- * the programmer to find out what fraction of the available memory
- * bandwidth is being consumed at each memory controller.  The
- * profiler provides start, stop, and clear operations to allows
- * profiling over a specific time window, as well as an interface for
- * reading the most recent profile values.
- *
- * This header declares IOCTL codes necessary to control memprof.
- */
-#ifndef _ASM_TILE_MEMPROF_H
-#define _ASM_TILE_MEMPROF_H
-
-#include <linux/ioctl.h>
-
-#define MEMPROF_IOCTL_TYPE 0xB4
-#define MEMPROF_IOCTL_START _IO(MEMPROF_IOCTL_TYPE, 0)
-#define MEMPROF_IOCTL_STOP _IO(MEMPROF_IOCTL_TYPE, 1)
-#define MEMPROF_IOCTL_CLEAR _IO(MEMPROF_IOCTL_TYPE, 2)
-
-#endif /* _ASM_TILE_MEMPROF_H */
diff --git a/arch/tile/include/asm/mmu.h b/arch/tile/include/asm/mmu.h
index 92f94c77b6e..0cab1182bde 100644
--- a/arch/tile/include/asm/mmu.h
+++ b/arch/tile/include/asm/mmu.h
@@ -21,7 +21,8 @@ struct mm_context {
 	 * Written under the mmap_sem semaphore; read without the
 	 * semaphore but atomically, but it is conservatively set.
 	 */
-	unsigned int priority_cached;
+	unsigned long priority_cached;
+	unsigned long vdso_base;
 };
 
 typedef struct mm_context mm_context_t;
diff --git a/arch/tile/include/asm/mmu_context.h b/arch/tile/include/asm/mmu_context.h
index 15fb2464112..4734215e2ad 100644
--- a/arch/tile/include/asm/mmu_context.h
+++ b/arch/tile/include/asm/mmu_context.h
@@ -30,18 +30,22 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	return 0;
 }
 
-/* Note that arch/tile/kernel/head.S also calls hv_install_context() */
+/*
+ * Note that arch/tile/kernel/head_NN.S and arch/tile/mm/migrate_NN.S
+ * also call hv_install_context().
+ */
 static inline void __install_page_table(pgd_t *pgdir, int asid, pgprot_t prot)
 {
 	/* FIXME: DIRECTIO should not always be set. FIXME. */
-	int rc = hv_install_context(__pa(pgdir), prot, asid, HV_CTX_DIRECTIO);
+	int rc = hv_install_context(__pa(pgdir), prot, asid,
+				    HV_CTX_DIRECTIO | CTX_PAGE_FLAG);
 	if (rc < 0)
 		panic("hv_install_context failed: %d", rc);
 }
 
 static inline void install_page_table(pgd_t *pgdir, int asid)
 {
-	pte_t *ptep = virt_to_pte(NULL, (unsigned long)pgdir);
+	pte_t *ptep = virt_to_kpte((unsigned long)pgdir);
 	__install_page_table(pgdir, asid, *ptep);
 }
 
diff --git a/arch/tile/include/asm/mmzone.h b/arch/tile/include/asm/mmzone.h
index 9d3dbce8f95..804f1098b6c 100644
--- a/arch/tile/include/asm/mmzone.h
+++ b/arch/tile/include/asm/mmzone.h
@@ -42,7 +42,7 @@ static inline int pfn_to_nid(unsigned long pfn)
 
 #define kern_addr_valid(kaddr)	virt_addr_valid((void *)kaddr)
 
-static inline int pfn_valid(int pfn)
+static inline int pfn_valid(unsigned long pfn)
 {
 	int nid = pfn_to_nid(pfn);
 
diff --git a/arch/tile/include/asm/module.h b/arch/tile/include/asm/module.h
new file mode 100644
index 00000000000..44ed07ccd3d
--- /dev/null
+++ b/arch/tile/include/asm/module.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_MODULE_H
+#define _ASM_TILE_MODULE_H
+
+#include <arch/chip.h>
+
+#include <asm-generic/module.h>
+
+/* We can't use modules built with different page sizes. */
+#if defined(CONFIG_PAGE_SIZE_16KB)
+# define MODULE_PGSZ " 16KB"
+#elif defined(CONFIG_PAGE_SIZE_64KB)
+# define MODULE_PGSZ " 64KB"
+#else
+# define MODULE_PGSZ ""
+#endif
+
+/* We don't really support no-SMP so tag if someone tries. */
+#ifdef CONFIG_SMP
+#define MODULE_NOSMP ""
+#else
+#define MODULE_NOSMP " nosmp"
+#endif
+
+#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_NOSMP
+
+#endif /* _ASM_TILE_MODULE_H */
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index db93518fac0..67276800861 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -20,8 +20,17 @@
 #include <arch/chip.h>
 
 /* PAGE_SHIFT and HPAGE_SHIFT determine the page sizes. */
-#define PAGE_SHIFT	HV_LOG2_PAGE_SIZE_SMALL
-#define HPAGE_SHIFT	HV_LOG2_PAGE_SIZE_LARGE
+#if defined(CONFIG_PAGE_SIZE_16KB)
+#define PAGE_SHIFT	14
+#define CTX_PAGE_FLAG	HV_CTX_PG_SM_16K
+#elif defined(CONFIG_PAGE_SIZE_64KB)
+#define PAGE_SHIFT	16
+#define CTX_PAGE_FLAG	HV_CTX_PG_SM_64K
+#else
+#define PAGE_SHIFT	HV_LOG2_DEFAULT_PAGE_SIZE_SMALL
+#define CTX_PAGE_FLAG	0
+#endif
+#define HPAGE_SHIFT	HV_LOG2_DEFAULT_PAGE_SIZE_LARGE
 
 #define PAGE_SIZE	(_AC(1, UL) << PAGE_SHIFT)
 #define HPAGE_SIZE	(_AC(1, UL) << HPAGE_SHIFT)
@@ -30,6 +39,12 @@
 #define HPAGE_MASK	(~(HPAGE_SIZE - 1))
 
 /*
+ * We do define AT_SYSINFO_EHDR to support vDSO,
+ * but don't use the gate mechanism.
+ */
+#define __HAVE_ARCH_GATE_AREA		1
+
+/*
  * If the Kconfig doesn't specify, set a maximum zone order that
  * is enough so that we can create huge pages from small pages given
  * the respective sizes of the two page types.  See <linux/mmzone.h>.
@@ -78,8 +93,7 @@ typedef HV_PTE pgprot_t;
 /*
  * User L2 page tables are managed as one L2 page table per page,
  * because we use the page allocator for them.  This keeps the allocation
- * simple and makes it potentially useful to implement HIGHPTE at some point.
- * However, it's also inefficient, since L2 page tables are much smaller
+ * simple, but it's also inefficient, since L2 page tables are much smaller
  * than pages (currently 2KB vs 64KB).  So we should revisit this.
  */
 typedef struct page *pgtable_t;
@@ -128,14 +142,18 @@ static inline __attribute_const__ int get_order(unsigned long size)
 
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
 
-#define HUGE_MAX_HSTATE		2
+#define HUGE_MAX_HSTATE		6
 
 #ifdef CONFIG_HUGETLB_PAGE
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 #endif
 
+/* Allow overriding how much VA or PA the kernel will use. */
+#define MAX_PA_WIDTH CHIP_PA_WIDTH()
+#define MAX_VA_WIDTH CHIP_VA_WIDTH()
+
 /* Each memory controller has PAs distinct in their high bits. */
-#define NR_PA_HIGHBIT_SHIFT (CHIP_PA_WIDTH() - CHIP_LOG_NUM_MSHIMS())
+#define NR_PA_HIGHBIT_SHIFT (MAX_PA_WIDTH - CHIP_LOG_NUM_MSHIMS())
 #define NR_PA_HIGHBIT_VALUES (1 << CHIP_LOG_NUM_MSHIMS())
 #define __pa_to_highbits(pa) ((phys_addr_t)(pa) >> NR_PA_HIGHBIT_SHIFT)
 #define __pfn_to_highbits(pfn) ((pfn) >> (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT))
@@ -146,7 +164,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
  * We reserve the lower half of memory for user-space programs, and the
  * upper half for system code.  We re-map all of physical memory in the
  * upper half, which takes a quarter of our VA space.  Then we have
- * the vmalloc regions.  The supervisor code lives at 0xfffffff700000000,
+ * the vmalloc regions.  The supervisor code lives at the highest address,
  * with the hypervisor above that.
  *
  * Loadable kernel modules are placed immediately after the static
@@ -158,27 +176,18 @@ static inline __attribute_const__ int get_order(unsigned long size)
  * Similarly, for now we don't play any struct page mapping games.
  */
 
-#if CHIP_PA_WIDTH() + 2 > CHIP_VA_WIDTH()
+#if MAX_PA_WIDTH + 2 > MAX_VA_WIDTH
 # error Too much PA to map with the VA available!
 #endif
-#define HALF_VA_SPACE           (_AC(1, UL) << (CHIP_VA_WIDTH() - 1))
-
-#define MEM_LOW_END		(HALF_VA_SPACE - 1)         /* low half */
-#define MEM_HIGH_START		(-HALF_VA_SPACE)            /* high half */
-#define PAGE_OFFSET		MEM_HIGH_START
-#define _VMALLOC_START		_AC(0xfffffff500000000, UL) /* 4 GB */
-#define HUGE_VMAP_BASE		_AC(0xfffffff600000000, UL) /* 4 GB */
-#define MEM_SV_START		_AC(0xfffffff700000000, UL) /* 256 MB */
-#define MEM_SV_INTRPT		MEM_SV_START
-#define MEM_MODULE_START	_AC(0xfffffff710000000, UL) /* 256 MB */
-#define MEM_MODULE_END		(MEM_MODULE_START + (256*1024*1024))
-#define MEM_HV_START		_AC(0xfffffff800000000, UL) /* 32 GB */
-
-/* Highest DTLB address we will use */
-#define KERNEL_HIGH_VADDR	MEM_SV_START
 
-/* Since we don't currently provide any fixmaps, we use an impossible VA. */
-#define FIXADDR_TOP             MEM_HV_START
+#define PAGE_OFFSET		(-(_AC(1, UL) << (MAX_VA_WIDTH - 1)))
+#define KERNEL_HIGH_VADDR	_AC(0xfffffff800000000, UL)  /* high 32GB */
+#define FIXADDR_BASE		(KERNEL_HIGH_VADDR - 0x300000000) /* 4 GB */
+#define FIXADDR_TOP		(KERNEL_HIGH_VADDR - 0x200000000) /* 4 GB */
+#define _VMALLOC_START		FIXADDR_TOP
+#define MEM_SV_START		(KERNEL_HIGH_VADDR - 0x100000000) /* 256 MB */
+#define MEM_MODULE_START	(MEM_SV_START + (256*1024*1024)) /* 256 MB */
+#define MEM_MODULE_END		(MEM_MODULE_START + (256*1024*1024))
 
 #else /* !__tilegx__ */
 
@@ -200,25 +209,18 @@ static inline __attribute_const__ int get_order(unsigned long size)
  * values, and after that, we show "typical" values, since the actual
  * addresses depend on kernel #defines.
  *
- * MEM_HV_INTRPT                   0xfe000000
- * MEM_SV_INTRPT (kernel code)     0xfd000000
+ * MEM_HV_START                    0xfe000000
+ * MEM_SV_START  (kernel code)     0xfd000000
  * MEM_USER_INTRPT (user vector)   0xfc000000
- * FIX_KMAP_xxx                    0xf8000000 (via NR_CPUS * KM_TYPE_NR)
- * PKMAP_BASE                      0xf7000000 (via LAST_PKMAP)
- * HUGE_VMAP                       0xf3000000 (via CONFIG_NR_HUGE_VMAPS)
- * VMALLOC_START                   0xf0000000 (via __VMALLOC_RESERVE)
+ * FIX_KMAP_xxx                    0xfa000000 (via NR_CPUS * KM_TYPE_NR)
+ * PKMAP_BASE                      0xf9000000 (via LAST_PKMAP)
+ * VMALLOC_START                   0xf7000000 (via VMALLOC_RESERVE)
  * mapped LOWMEM                   0xc0000000
  */
 
 #define MEM_USER_INTRPT		_AC(0xfc000000, UL)
-#if CONFIG_KERNEL_PL == 1
-#define MEM_SV_INTRPT		_AC(0xfd000000, UL)
-#define MEM_HV_INTRPT		_AC(0xfe000000, UL)
-#else
-#define MEM_GUEST_INTRPT	_AC(0xfd000000, UL)
-#define MEM_SV_INTRPT		_AC(0xfe000000, UL)
-#define MEM_HV_INTRPT		_AC(0xff000000, UL)
-#endif
+#define MEM_SV_START		_AC(0xfd000000, UL)
+#define MEM_HV_START		_AC(0xfe000000, UL)
 
 #define INTRPT_SIZE		0x4000
 
@@ -239,7 +241,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
 
 #endif /* __tilegx__ */
 
-#ifndef __ASSEMBLY__
+#if !defined(__ASSEMBLY__) && !defined(VDSO_BUILD)
 
 #ifdef CONFIG_HIGHMEM
 
@@ -325,6 +327,7 @@ static inline int pfn_valid(unsigned long pfn)
 
 struct mm_struct;
 extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr);
+extern pte_t *virt_to_kpte(unsigned long kaddr);
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/tile/include/asm/pci.h b/arch/tile/include/asm/pci.h
index 5d5a635530b..dfedd7ac729 100644
--- a/arch/tile/include/asm/pci.h
+++ b/arch/tile/include/asm/pci.h
@@ -15,9 +15,12 @@
 #ifndef _ASM_TILE_PCI_H
 #define _ASM_TILE_PCI_H
 
+#include <linux/dma-mapping.h>
 #include <linux/pci.h>
 #include <asm-generic/pci_iomap.h>
 
+#ifndef __tilegx__
+
 /*
  * Structure of a PCI controller (host bridge)
  */
@@ -25,7 +28,6 @@ struct pci_controller {
 	int index;		/* PCI domain number */
 	struct pci_bus *root_bus;
 
-	int first_busno;
 	int last_busno;
 
 	int hv_cfg_fd[2];	/* config{0,1} fds for this PCIe controller */
@@ -41,20 +43,161 @@ struct pci_controller {
 };
 
 /*
+ * This flag tells if the platform is TILEmpower that needs
+ * special configuration for the PLX switch chip.
+ */
+extern int tile_plx_gen1;
+
+static inline void pci_iounmap(struct pci_dev *dev, void __iomem *addr) {}
+
+#define	TILE_NUM_PCIE	2
+
+/*
  * The hypervisor maps the entirety of CPA-space as bus addresses, so
  * bus addresses are physical addresses.  The networking and block
  * device layers use this boolean for bounce buffer decisions.
  */
 #define PCI_DMA_BUS_IS_PHYS     1
 
-int __devinit tile_pci_init(void);
-int __devinit pcibios_init(void);
+/* generic pci stuff */
+#include <asm-generic/pci.h>
 
-static inline void pci_iounmap(struct pci_dev *dev, void __iomem *addr) {}
+#else
 
-void __devinit pcibios_fixup_bus(struct pci_bus *bus);
+#include <asm/page.h>
+#include <gxio/trio.h>
 
-#define	TILE_NUM_PCIE	2
+/**
+ * We reserve the hugepage-size address range at the top of the 64-bit address
+ * space to serve as the PCI window, emulating the BAR0 space of an endpoint
+ * device. This window is used by the chip-to-chip applications running on
+ * the RC node. The reason for carving out this window is that Mem-Maps that
+ * back up this window will not overlap with those that map the real physical
+ * memory.
+ */
+#define PCIE_HOST_BAR0_SIZE		HPAGE_SIZE
+#define PCIE_HOST_BAR0_START		HPAGE_MASK
+
+/**
+ * The first PAGE_SIZE of the above "BAR" window is mapped to the
+ * gxpci_host_regs structure.
+ */
+#define PCIE_HOST_REGS_SIZE		PAGE_SIZE
+
+/*
+ * This is the PCI address where the Mem-Map interrupt regions start.
+ * We use the 2nd to the last huge page of the 64-bit address space.
+ * The last huge page is used for the rootcomplex "bar", for C2C purpose.
+ */
+#define	MEM_MAP_INTR_REGIONS_BASE	(HPAGE_MASK - HPAGE_SIZE)
+
+/*
+ * Each Mem-Map interrupt region occupies 4KB.
+ */
+#define	MEM_MAP_INTR_REGION_SIZE	(1 << TRIO_MAP_MEM_LIM__ADDR_SHIFT)
+
+/*
+ * Allocate the PCI BAR window right below 4GB.
+ */
+#define	TILE_PCI_BAR_WINDOW_TOP		(1ULL << 32)
+
+/*
+ * Allocate 1GB for the PCI BAR window.
+ */
+#define	TILE_PCI_BAR_WINDOW_SIZE	(1 << 30)
+
+/*
+ * This is the highest bus address targeting the host memory that
+ * can be generated by legacy PCI devices with 32-bit or less
+ * DMA capability, dictated by the BAR window size and location.
+ */
+#define	TILE_PCI_MAX_DIRECT_DMA_ADDRESS \
+	(TILE_PCI_BAR_WINDOW_TOP - TILE_PCI_BAR_WINDOW_SIZE - 1)
+
+/*
+ * We shift the PCI bus range for all the physical memory up by the whole PA
+ * range. The corresponding CPA of an incoming PCI request will be the PCI
+ * address minus TILE_PCI_MEM_MAP_BASE_OFFSET. This also implies
+ * that the 64-bit capable devices will be given DMA addresses as
+ * the CPA plus TILE_PCI_MEM_MAP_BASE_OFFSET. To support 32-bit
+ * devices, we create a separate map region that handles the low
+ * 4GB.
+ *
+ * This design lets us avoid the "PCI hole" problem where the host bridge
+ * won't pass DMA traffic with target addresses that happen to fall within the
+ * BAR space. This enables us to use all the physical memory for DMA, instead
+ * of wasting the same amount of physical memory as the BAR window size.
+ */
+#define	TILE_PCI_MEM_MAP_BASE_OFFSET	(1ULL << CHIP_PA_WIDTH())
+
+/*
+ * Start of the PCI memory resource, which starts at the end of the
+ * maximum system physical RAM address.
+ */
+#define	TILE_PCI_MEM_START	(1ULL << CHIP_PA_WIDTH())
+
+/*
+ * Structure of a PCI controller (host bridge) on Gx.
+ */
+struct pci_controller {
+
+	/* Pointer back to the TRIO that this PCIe port is connected to. */
+	gxio_trio_context_t *trio;
+	int mac;		/* PCIe mac index on the TRIO shim */
+	int trio_index;		/* Index of TRIO shim that contains the MAC. */
+
+	int pio_mem_index;	/* PIO region index for memory access */
+
+#ifdef CONFIG_TILE_PCI_IO
+	int pio_io_index;	/* PIO region index for I/O space access */
+#endif
+
+	/*
+	 * Mem-Map regions for all the memory controllers so that Linux can
+	 * map all of its physical memory space to the PCI bus.
+	 */
+	int mem_maps[MAX_NUMNODES];
+
+	int index;		/* PCI domain number */
+	struct pci_bus *root_bus;
+
+	/* PCI I/O space resource for this controller. */
+	struct resource io_space;
+	char io_space_name[32];
+
+	/* PCI memory space resource for this controller. */
+	struct resource mem_space;
+	char mem_space_name[32];
+
+	uint64_t mem_offset;	/* cpu->bus memory mapping offset. */
+
+	int first_busno;
+
+	struct pci_ops *ops;
+
+	/* Table that maps the INTx numbers to Linux irq numbers. */
+	int irq_intx_table[4];
+};
+
+extern struct pci_controller pci_controllers[TILEGX_NUM_TRIO * TILEGX_TRIO_PCIES];
+extern gxio_trio_context_t trio_contexts[TILEGX_NUM_TRIO];
+extern int num_trio_shims;
+
+extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
+
+/*
+ * The PCI address space does not equal the physical memory address
+ * space (we have an IOMMU). The IDE and SCSI device layers use this
+ * boolean for bounce buffer decisions.
+ */
+#define PCI_DMA_BUS_IS_PHYS     0
+
+#endif /* __tilegx__ */
+
+int __init tile_pci_init(void);
+int __init pcibios_init(void);
+
+void pcibios_fixup_bus(struct pci_bus *bus);
 
 #define pci_domain_nr(bus) (((struct pci_controller *)(bus)->sysdata)->index)
 
@@ -77,13 +220,8 @@ static inline int pcibios_assign_all_busses(void)
 }
 
 #define PCIBIOS_MIN_MEM		0
-#define PCIBIOS_MIN_IO		0
-
-/*
- * This flag tells if the platform is TILEmpower that needs
- * special configuration for the PLX switch chip.
- */
-extern int tile_plx_gen1;
+/* Minimum PCI I/O address, starting at the page boundary. */
+#define PCIBIOS_MIN_IO		PAGE_SIZE
 
 /* Use any cpu for PCI. */
 #define cpumask_of_pcibus(bus) cpu_online_mask
@@ -91,7 +229,4 @@ extern int tile_plx_gen1;
 /* implement the pci_ DMA API in terms of the generic device dma_ one */
 #include <asm-generic/pci-dma-compat.h>
 
-/* generic pci stuff */
-#include <asm-generic/pci.h>
-
 #endif /* _ASM_TILE_PCI_H */
diff --git a/arch/tile/include/asm/percpu.h b/arch/tile/include/asm/percpu.h
index 63294f5a8ef..4f7ae39fa20 100644
--- a/arch/tile/include/asm/percpu.h
+++ b/arch/tile/include/asm/percpu.h
@@ -15,9 +15,37 @@
 #ifndef _ASM_TILE_PERCPU_H
 #define _ASM_TILE_PERCPU_H
 
-register unsigned long __my_cpu_offset __asm__("tp");
-#define __my_cpu_offset __my_cpu_offset
-#define set_my_cpu_offset(tp) (__my_cpu_offset = (tp))
+register unsigned long my_cpu_offset_reg asm("tp");
+
+#ifdef CONFIG_PREEMPT
+/*
+ * For full preemption, we can't just use the register variable
+ * directly, since we need barrier() to hazard against it, causing the
+ * compiler to reload anything computed from a previous "tp" value.
+ * But we also don't want to use volatile asm, since we'd like the
+ * compiler to be able to cache the value across multiple percpu reads.
+ * So we use a fake stack read as a hazard against barrier().
+ * The 'U' constraint is like 'm' but disallows postincrement.
+ */
+static inline unsigned long __my_cpu_offset(void)
+{
+	unsigned long tp;
+	register unsigned long *sp asm("sp");
+	asm("move %0, tp" : "=r" (tp) : "U" (*sp));
+	return tp;
+}
+#define __my_cpu_offset __my_cpu_offset()
+#else
+/*
+ * We don't need to hazard against barrier() since "tp" doesn't ever
+ * change with PREEMPT_NONE, and with PREEMPT_VOLUNTARY it only
+ * changes at function call points, at which we are already re-reading
+ * the value of "tp" due to "my_cpu_offset_reg" being a global variable.
+ */
+#define __my_cpu_offset my_cpu_offset_reg
+#endif
+
+#define set_my_cpu_offset(tp) (my_cpu_offset_reg = (tp))
 
 #include <asm-generic/percpu.h>
 
diff --git a/arch/tile/include/asm/perf_event.h b/arch/tile/include/asm/perf_event.h
new file mode 100644
index 00000000000..59c5b164e5b
--- /dev/null
+++ b/arch/tile/include/asm/perf_event.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright 2014 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_PERF_EVENT_H
+#define _ASM_TILE_PERF_EVENT_H
+
+#include <linux/percpu.h>
+DECLARE_PER_CPU(u64, perf_irqs);
+
+unsigned long handle_syscall_link_address(void);
+#endif /* _ASM_TILE_PERF_EVENT_H */
diff --git a/arch/tile/include/asm/pgalloc.h b/arch/tile/include/asm/pgalloc.h
index e919c0bdc22..1b902508b66 100644
--- a/arch/tile/include/asm/pgalloc.h
+++ b/arch/tile/include/asm/pgalloc.h
@@ -19,24 +19,24 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <asm/fixmap.h>
+#include <asm/page.h>
 #include <hv/hypervisor.h>
 
 /* Bits for the size of the second-level page table. */
-#define L2_KERNEL_PGTABLE_SHIFT \
-  (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL + HV_LOG2_PTE_SIZE)
+#define L2_KERNEL_PGTABLE_SHIFT _HV_LOG2_L2_SIZE(HPAGE_SHIFT, PAGE_SHIFT)
+
+/* How big is a kernel L2 page table? */
+#define L2_KERNEL_PGTABLE_SIZE (1UL << L2_KERNEL_PGTABLE_SHIFT)
 
 /* We currently allocate user L2 page tables by page (unlike kernel L2s). */
-#if L2_KERNEL_PGTABLE_SHIFT < HV_LOG2_PAGE_SIZE_SMALL
-#define L2_USER_PGTABLE_SHIFT HV_LOG2_PAGE_SIZE_SMALL
+#if L2_KERNEL_PGTABLE_SHIFT < PAGE_SHIFT
+#define L2_USER_PGTABLE_SHIFT PAGE_SHIFT
 #else
 #define L2_USER_PGTABLE_SHIFT L2_KERNEL_PGTABLE_SHIFT
 #endif
 
 /* How many pages do we need, as an "order", for a user L2 page table? */
-#define L2_USER_PGTABLE_ORDER (L2_USER_PGTABLE_SHIFT - HV_LOG2_PAGE_SIZE_SMALL)
-
-/* How big is a kernel L2 page table? */
-#define L2_KERNEL_PGTABLE_SIZE (1 << L2_KERNEL_PGTABLE_SHIFT)
+#define L2_USER_PGTABLE_ORDER (L2_USER_PGTABLE_SHIFT - PAGE_SHIFT)
 
 static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
@@ -50,14 +50,14 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 static inline void pmd_populate_kernel(struct mm_struct *mm,
 				       pmd_t *pmd, pte_t *ptep)
 {
-	set_pmd(pmd, ptfn_pmd(__pa(ptep) >> HV_LOG2_PAGE_TABLE_ALIGN,
+	set_pmd(pmd, ptfn_pmd(HV_CPA_TO_PTFN(__pa(ptep)),
 			      __pgprot(_PAGE_PRESENT)));
 }
 
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 				pgtable_t page)
 {
-	set_pmd(pmd, ptfn_pmd(HV_PFN_TO_PTFN(page_to_pfn(page)),
+	set_pmd(pmd, ptfn_pmd(HV_CPA_TO_PTFN(PFN_PHYS(page_to_pfn(page))),
 			      __pgprot(_PAGE_PRESENT)));
 }
 
@@ -68,8 +68,20 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 
-extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address);
-extern void pte_free(struct mm_struct *mm, struct page *pte);
+extern pgtable_t pgtable_alloc_one(struct mm_struct *mm, unsigned long address,
+				   int order);
+extern void pgtable_free(struct mm_struct *mm, struct page *pte, int order);
+
+static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
+				      unsigned long address)
+{
+	return pgtable_alloc_one(mm, address, L2_USER_PGTABLE_ORDER);
+}
+
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
+{
+	pgtable_free(mm, pte, L2_USER_PGTABLE_ORDER);
+}
 
 #define pmd_pgtable(pmd) pmd_page(pmd)
 
@@ -85,8 +97,13 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 	pte_free(mm, virt_to_page(pte));
 }
 
-extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
-			   unsigned long address);
+extern void __pgtable_free_tlb(struct mmu_gather *tlb, struct page *pte,
+			       unsigned long address, int order);
+static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
+				  unsigned long address)
+{
+	__pgtable_free_tlb(tlb, pte, address, L2_USER_PGTABLE_ORDER);
+}
 
 #define check_pgt_cache()	do { } while (0)
 
@@ -104,19 +121,44 @@ void shatter_pmd(pmd_t *pmd);
 void shatter_huge_page(unsigned long addr);
 
 #ifdef __tilegx__
-/* We share a single page allocator for both L1 and L2 page tables. */
-#if HV_L1_SIZE != HV_L2_SIZE
-# error Rework assumption that L1 and L2 page tables are same size.
-#endif
-#define L1_USER_PGTABLE_ORDER L2_USER_PGTABLE_ORDER
+
 #define pud_populate(mm, pud, pmd) \
   pmd_populate_kernel((mm), (pmd_t *)(pud), (pte_t *)(pmd))
-#define pmd_alloc_one(mm, addr) \
-  ((pmd_t *)page_to_virt(pte_alloc_one((mm), (addr))))
-#define pmd_free(mm, pmdp) \
-  pte_free((mm), virt_to_page(pmdp))
-#define __pmd_free_tlb(tlb, pmdp, address) \
-  __pte_free_tlb((tlb), virt_to_page(pmdp), (address))
+
+/* Bits for the size of the L1 (intermediate) page table. */
+#define L1_KERNEL_PGTABLE_SHIFT _HV_LOG2_L1_SIZE(HPAGE_SHIFT)
+
+/* How big is a kernel L2 page table? */
+#define L1_KERNEL_PGTABLE_SIZE (1UL << L1_KERNEL_PGTABLE_SHIFT)
+
+/* We currently allocate L1 page tables by page. */
+#if L1_KERNEL_PGTABLE_SHIFT < PAGE_SHIFT
+#define L1_USER_PGTABLE_SHIFT PAGE_SHIFT
+#else
+#define L1_USER_PGTABLE_SHIFT L1_KERNEL_PGTABLE_SHIFT
 #endif
 
+/* How many pages do we need, as an "order", for an L1 page table? */
+#define L1_USER_PGTABLE_ORDER (L1_USER_PGTABLE_SHIFT - PAGE_SHIFT)
+
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+	struct page *p = pgtable_alloc_one(mm, address, L1_USER_PGTABLE_ORDER);
+	return (pmd_t *)page_to_virt(p);
+}
+
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmdp)
+{
+	pgtable_free(mm, virt_to_page(pmdp), L1_USER_PGTABLE_ORDER);
+}
+
+static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
+				  unsigned long address)
+{
+	__pgtable_free_tlb(tlb, virt_to_page(pmdp), address,
+			   L1_USER_PGTABLE_ORDER);
+}
+
+#endif /* __tilegx__ */
+
 #endif /* _ASM_TILE_PGALLOC_H */
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 1a20b7ef8ea..33587f16c15 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -27,9 +27,10 @@
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
+#include <linux/pfn.h>
 #include <asm/processor.h>
 #include <asm/fixmap.h>
-#include <asm/system.h>
+#include <asm/page.h>
 
 struct mm_struct;
 struct vm_area_struct;
@@ -70,6 +71,7 @@ extern void set_page_homes(void);
 
 #define _PAGE_PRESENT           HV_PTE_PRESENT
 #define _PAGE_HUGE_PAGE         HV_PTE_PAGE
+#define _PAGE_SUPER_PAGE        HV_PTE_SUPER
 #define _PAGE_READABLE          HV_PTE_READABLE
 #define _PAGE_WRITABLE          HV_PTE_WRITABLE
 #define _PAGE_EXECUTABLE        HV_PTE_EXECUTABLE
@@ -86,6 +88,7 @@ extern void set_page_homes(void);
 #define _PAGE_ALL (\
   _PAGE_PRESENT | \
   _PAGE_HUGE_PAGE | \
+  _PAGE_SUPER_PAGE | \
   _PAGE_READABLE | \
   _PAGE_WRITABLE | \
   _PAGE_EXECUTABLE | \
@@ -163,7 +166,7 @@ extern void set_page_homes(void);
   (pgprot_t) { ((oldprot).val & ~_PAGE_ALL) | (newprot).val }
 
 /* Just setting the PFN to zero suffices. */
-#define pte_pgprot(x) hv_pte_set_pfn((x), 0)
+#define pte_pgprot(x) hv_pte_set_pa((x), 0)
 
 /*
  * For PTEs and PDEs, we must clear the Present bit first when
@@ -188,6 +191,7 @@ static inline void __pte_clear(pte_t *ptep)
  * Undefined behaviour if not..
  */
 #define pte_present hv_pte_get_present
+#define pte_mknotpresent hv_pte_clear_present
 #define pte_user hv_pte_get_user
 #define pte_read hv_pte_get_readable
 #define pte_dirty hv_pte_get_dirty
@@ -195,6 +199,7 @@ static inline void __pte_clear(pte_t *ptep)
 #define pte_write hv_pte_get_writable
 #define pte_exec hv_pte_get_executable
 #define pte_huge hv_pte_get_page
+#define pte_super hv_pte_get_super
 #define pte_rdprotect hv_pte_clear_readable
 #define pte_exprotect hv_pte_clear_executable
 #define pte_mkclean hv_pte_clear_dirty
@@ -207,6 +212,7 @@ static inline void __pte_clear(pte_t *ptep)
 #define pte_mkyoung hv_pte_set_accessed
 #define pte_mkwrite hv_pte_set_writable
 #define pte_mkhuge hv_pte_set_page
+#define pte_mksuper hv_pte_set_super
 
 #define pte_special(pte) 0
 #define pte_mkspecial(pte) (pte)
@@ -262,7 +268,7 @@ static inline int pte_none(pte_t pte)
 
 static inline unsigned long pte_pfn(pte_t pte)
 {
-	return hv_pte_get_pfn(pte);
+	return PFN_DOWN(hv_pte_get_pa(pte));
 }
 
 /* Set or get the remote cache cpu in a pgprot with remote caching. */
@@ -271,7 +277,7 @@ extern int get_remote_cache_cpu(pgprot_t prot);
 
 static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot)
 {
-	return hv_pte_set_pfn(prot, pfn);
+	return hv_pte_set_pa(prot, PFN_PHYS(pfn));
 }
 
 /* Support for priority mappings. */
@@ -313,7 +319,7 @@ extern void check_mm_caching(struct mm_struct *prev, struct mm_struct *next);
  */
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
-	return pfn_pte(hv_pte_get_pfn(pte), newprot);
+	return pfn_pte(pte_pfn(pte), newprot);
 }
 
 /*
@@ -336,13 +342,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
  */
 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
 
-#if defined(CONFIG_HIGHPTE)
-extern pte_t *pte_offset_map(pmd_t *, unsigned long address);
-#define pte_unmap(pte) kunmap_atomic(pte)
-#else
 #define pte_offset_map(dir, address) pte_offset_kernel(dir, address)
 #define pte_unmap(pte) do { } while (0)
-#endif
 
 /* Clear a non-executable kernel PTE and flush it from the TLB. */
 #define kpte_clear_flush(ptep, vaddr)		\
@@ -361,9 +362,6 @@ do {						\
 #define kern_addr_valid(addr)	(1)
 #endif /* CONFIG_FLATMEM */
 
-#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)		\
-		remap_pfn_range(vma, vaddr, pfn, size, prot)
-
 extern void vmalloc_sync_all(void);
 
 #endif /* !__ASSEMBLY__ */
@@ -411,6 +409,46 @@ static inline unsigned long pmd_index(unsigned long address)
 	return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
 }
 
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+					    unsigned long address,
+					    pmd_t *pmdp)
+{
+	return ptep_test_and_clear_young(vma, address, pmdp_ptep(pmdp));
+}
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+				      unsigned long address, pmd_t *pmdp)
+{
+	ptep_set_wrprotect(mm, address, pmdp_ptep(pmdp));
+}
+
+
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+				       unsigned long address,
+				       pmd_t *pmdp)
+{
+	return pte_pmd(ptep_get_and_clear(mm, address, pmdp_ptep(pmdp)));
+}
+
+static inline void __set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	set_pte(pmdp_ptep(pmdp), pmd_pte(pmdval));
+}
+
+#define set_pmd_at(mm, addr, pmdp, pmdval) __set_pmd(pmdp, pmdval)
+
+/* Create a pmd from a PTFN. */
+static inline pmd_t ptfn_pmd(unsigned long ptfn, pgprot_t prot)
+{
+	return pte_pmd(hv_pte_set_ptfn(prot, ptfn));
+}
+
+/* Return the page-table frame number (ptfn) that a pmd_t points at. */
+#define pmd_ptfn(pmd) hv_pte_get_ptfn(pmd_pte(pmd))
+
 /*
  * A given kernel pmd_t maps to a specific virtual address (either a
  * kernel huge page or a kernel pte_t table).  Since kernel pte_t
@@ -431,7 +469,48 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
  * OK for pte_lockptr(), since we just end up with potentially one
  * lock being used for several pte_t arrays.
  */
-#define pmd_page(pmd) pfn_to_page(HV_PTFN_TO_PFN(pmd_ptfn(pmd)))
+#define pmd_page(pmd) pfn_to_page(PFN_DOWN(HV_PTFN_TO_CPA(pmd_ptfn(pmd))))
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+	__pte_clear(pmdp_ptep(pmdp));
+}
+
+#define pmd_mknotpresent(pmd)	pte_pmd(pte_mknotpresent(pmd_pte(pmd)))
+#define pmd_young(pmd)		pte_young(pmd_pte(pmd))
+#define pmd_mkyoung(pmd)	pte_pmd(pte_mkyoung(pmd_pte(pmd)))
+#define pmd_mkold(pmd)		pte_pmd(pte_mkold(pmd_pte(pmd)))
+#define pmd_mkwrite(pmd)	pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_write(pmd)		pte_write(pmd_pte(pmd))
+#define pmd_wrprotect(pmd)	pte_pmd(pte_wrprotect(pmd_pte(pmd)))
+#define pmd_mkdirty(pmd)	pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+#define pmd_huge_page(pmd)	pte_huge(pmd_pte(pmd))
+#define pmd_mkhuge(pmd)		pte_pmd(pte_mkhuge(pmd_pte(pmd)))
+#define __HAVE_ARCH_PMD_WRITE
+
+#define pfn_pmd(pfn, pgprot)	pte_pmd(pfn_pte((pfn), (pgprot)))
+#define pmd_pfn(pmd)		pte_pfn(pmd_pte(pmd))
+#define mk_pmd(page, pgprot)	pfn_pmd(page_to_pfn(page), (pgprot))
+
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+	return pfn_pmd(pmd_pfn(pmd), newprot);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define has_transparent_hugepage() 1
+#define pmd_trans_huge pmd_huge_page
+
+static inline pmd_t pmd_mksplitting(pmd_t pmd)
+{
+	return pte_pmd(hv_pte_set_client2(pmd_pte(pmd)));
+}
+
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+	return hv_pte_get_client2(pmd_pte(pmd));
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 /*
  * The pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
@@ -449,17 +528,13 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
        return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
 }
 
-static inline int pmd_huge_page(pmd_t pmd)
-{
-	return pmd_val(pmd) & _PAGE_HUGE_PAGE;
-}
-
 #include <asm-generic/pgtable.h>
 
 /* Support /proc/NN/pgtable API. */
 struct seq_file;
 int arch_proc_pgtable_show(struct seq_file *m, struct mm_struct *mm,
-			   unsigned long vaddr, pte_t *ptep, void **datap);
+			   unsigned long vaddr, unsigned long pagesize,
+			   pte_t *ptep, void **datap);
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/tile/include/asm/pgtable_32.h b/arch/tile/include/asm/pgtable_32.h
index 9f98529761f..d26a4227903 100644
--- a/arch/tile/include/asm/pgtable_32.h
+++ b/arch/tile/include/asm/pgtable_32.h
@@ -20,11 +20,12 @@
  * The level-1 index is defined by the huge page size.  A PGD is composed
  * of PTRS_PER_PGD pgd_t's and is the top level of the page table.
  */
-#define PGDIR_SHIFT	HV_LOG2_PAGE_SIZE_LARGE
-#define PGDIR_SIZE	HV_PAGE_SIZE_LARGE
+#define PGDIR_SHIFT	HPAGE_SHIFT
+#define PGDIR_SIZE	HPAGE_SIZE
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
-#define PTRS_PER_PGD	(1 << (32 - PGDIR_SHIFT))
-#define SIZEOF_PGD	(PTRS_PER_PGD * sizeof(pgd_t))
+#define PTRS_PER_PGD	_HV_L1_ENTRIES(HPAGE_SHIFT)
+#define PGD_INDEX(va)	_HV_L1_INDEX(va, HPAGE_SHIFT)
+#define SIZEOF_PGD	_HV_L1_SIZE(HPAGE_SHIFT)
 
 /*
  * The level-2 index is defined by the difference between the huge
@@ -33,8 +34,9 @@
  * Note that the hypervisor docs use PTE for what we call pte_t, so
  * this nomenclature is somewhat confusing.
  */
-#define PTRS_PER_PTE (1 << (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL))
-#define SIZEOF_PTE	(PTRS_PER_PTE * sizeof(pte_t))
+#define PTRS_PER_PTE	_HV_L2_ENTRIES(HPAGE_SHIFT, PAGE_SHIFT)
+#define PTE_INDEX(va)	_HV_L2_INDEX(va, HPAGE_SHIFT, PAGE_SHIFT)
+#define SIZEOF_PTE	_HV_L2_SIZE(HPAGE_SHIFT, PAGE_SHIFT)
 
 #ifndef __ASSEMBLY__
 
@@ -53,17 +55,9 @@
 #define PKMAP_BASE   ((FIXADDR_BOOT_START - PAGE_SIZE*LAST_PKMAP) & PGDIR_MASK)
 
 #ifdef CONFIG_HIGHMEM
-# define __VMAPPING_END	(PKMAP_BASE & ~(HPAGE_SIZE-1))
+# define _VMALLOC_END	(PKMAP_BASE & ~(HPAGE_SIZE-1))
 #else
-# define __VMAPPING_END	(FIXADDR_START & ~(HPAGE_SIZE-1))
-#endif
-
-#ifdef CONFIG_HUGEVMAP
-#define HUGE_VMAP_END	__VMAPPING_END
-#define HUGE_VMAP_BASE	(HUGE_VMAP_END - CONFIG_NR_HUGE_VMAPS * HPAGE_SIZE)
-#define _VMALLOC_END	HUGE_VMAP_BASE
-#else
-#define _VMALLOC_END	__VMAPPING_END
+# define _VMALLOC_END	(FIXADDR_START & ~(HPAGE_SIZE-1))
 #endif
 
 /*
@@ -82,10 +76,12 @@ extern unsigned long VMALLOC_RESERVE /* = CONFIG_VMALLOC_RESERVE */;
 /* We have no pmd or pud since we are strictly a two-level page table */
 #include <asm-generic/pgtable-nopmd.h>
 
+static inline int pud_huge_page(pud_t pud)	{ return 0; }
+
 /* We don't define any pgds for these addresses. */
 static inline int pgd_addr_invalid(unsigned long addr)
 {
-	return addr >= MEM_HV_INTRPT;
+	return addr >= MEM_HV_START;
 }
 
 /*
@@ -111,24 +107,14 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 	return pte;
 }
 
-static inline void __set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-	set_pte(&pmdp->pud.pgd, pmdval.pud.pgd);
-}
-
-/* Create a pmd from a PTFN. */
-static inline pmd_t ptfn_pmd(unsigned long ptfn, pgprot_t prot)
-{
-	return (pmd_t){ { hv_pte_set_ptfn(prot, ptfn) } };
-}
-
-/* Return the page-table frame number (ptfn) that a pmd_t points at. */
-#define pmd_ptfn(pmd) hv_pte_get_ptfn((pmd).pud.pgd)
-
-static inline void pmd_clear(pmd_t *pmdp)
-{
-	__pte_clear(&pmdp->pud.pgd);
-}
+/*
+ * pmds are wrappers around pgds, which are the same as ptes.
+ * It's often convenient to "cast" back and forth and use the pte methods,
+ * which are the methods supplied by the hypervisor.
+ */
+#define pmd_pte(pmd) ((pmd).pud.pgd)
+#define pmdp_ptep(pmdp) (&(pmdp)->pud.pgd)
+#define pte_pmd(pte) ((pmd_t){ { (pte) } })
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/tile/include/asm/pgtable_64.h b/arch/tile/include/asm/pgtable_64.h
index fd80328523b..2c8a9cd102d 100644
--- a/arch/tile/include/asm/pgtable_64.h
+++ b/arch/tile/include/asm/pgtable_64.h
@@ -21,17 +21,19 @@
 #define PGDIR_SIZE	HV_L1_SPAN
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 #define PTRS_PER_PGD	HV_L0_ENTRIES
-#define SIZEOF_PGD	(PTRS_PER_PGD * sizeof(pgd_t))
+#define PGD_INDEX(va)	HV_L0_INDEX(va)
+#define SIZEOF_PGD	HV_L0_SIZE
 
 /*
  * The level-1 index is defined by the huge page size.  A PMD is composed
  * of PTRS_PER_PMD pgd_t's and is the middle level of the page table.
  */
-#define PMD_SHIFT	HV_LOG2_PAGE_SIZE_LARGE
-#define PMD_SIZE	HV_PAGE_SIZE_LARGE
+#define PMD_SHIFT	HPAGE_SHIFT
+#define PMD_SIZE	HPAGE_SIZE
 #define PMD_MASK	(~(PMD_SIZE-1))
-#define PTRS_PER_PMD	(1 << (PGDIR_SHIFT - PMD_SHIFT))
-#define SIZEOF_PMD	(PTRS_PER_PMD * sizeof(pmd_t))
+#define PTRS_PER_PMD	_HV_L1_ENTRIES(HPAGE_SHIFT)
+#define PMD_INDEX(va)	_HV_L1_INDEX(va, HPAGE_SHIFT)
+#define SIZEOF_PMD	_HV_L1_SIZE(HPAGE_SHIFT)
 
 /*
  * The level-2 index is defined by the difference between the huge
@@ -40,25 +42,34 @@
  * Note that the hypervisor docs use PTE for what we call pte_t, so
  * this nomenclature is somewhat confusing.
  */
-#define PTRS_PER_PTE (1 << (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL))
-#define SIZEOF_PTE	(PTRS_PER_PTE * sizeof(pte_t))
+#define PTRS_PER_PTE	_HV_L2_ENTRIES(HPAGE_SHIFT, PAGE_SHIFT)
+#define PTE_INDEX(va)	_HV_L2_INDEX(va, HPAGE_SHIFT, PAGE_SHIFT)
+#define SIZEOF_PTE	_HV_L2_SIZE(HPAGE_SHIFT, PAGE_SHIFT)
 
 /*
- * Align the vmalloc area to an L2 page table, and leave a guard page
- * at the beginning and end.  The vmalloc code also puts in an internal
+ * Align the vmalloc area to an L2 page table.  Omit guard pages at
+ * the beginning and end for simplicity (particularly in the per-cpu
+ * memory allocation code).  The vmalloc code puts in an internal
  * guard page between each allocation.
  */
-#define _VMALLOC_END	HUGE_VMAP_BASE
-#define VMALLOC_END	(_VMALLOC_END - PAGE_SIZE)
-#define VMALLOC_START	(_VMALLOC_START + PAGE_SIZE)
-
-#define HUGE_VMAP_END	(HUGE_VMAP_BASE + PGDIR_SIZE)
+#define _VMALLOC_END	MEM_SV_START
+#define VMALLOC_END	_VMALLOC_END
+#define VMALLOC_START	_VMALLOC_START
 
 #ifndef __ASSEMBLY__
 
 /* We have no pud since we are a three-level page table. */
 #include <asm-generic/pgtable-nopud.h>
 
+/*
+ * pmds are the same as pgds and ptes, so converting is a no-op.
+ */
+#define pmd_pte(pmd) (pmd)
+#define pmdp_ptep(pmdp) (pmdp)
+#define pte_pmd(pte) (pte)
+
+#define pud_pte(pud) ((pud).pgd)
+
 static inline int pud_none(pud_t pud)
 {
 	return pud_val(pud) == 0;
@@ -69,6 +80,11 @@ static inline int pud_present(pud_t pud)
 	return pud_val(pud) & _PAGE_PRESENT;
 }
 
+static inline int pud_huge_page(pud_t pud)
+{
+	return pud_val(pud) & _PAGE_HUGE_PAGE;
+}
+
 #define pmd_ERROR(e) \
 	pr_err("%s:%d: bad pmd 0x%016llx.\n", __FILE__, __LINE__, pmd_val(e))
 
@@ -85,6 +101,9 @@ static inline int pud_bad(pud_t pud)
 /* Return the page-table frame number (ptfn) that a pud_t points at. */
 #define pud_ptfn(pud) hv_pte_get_ptfn((pud).pgd)
 
+/* Return the page frame number (pfn) that a pud_t points at. */
+#define pud_pfn(pud) pte_pfn(pud_pte(pud))
+
 /*
  * A given kernel pud_t maps to a kernel pmd_t table at a specific
  * virtual address.  Since kernel pmd_t tables can be aligned at
@@ -98,7 +117,7 @@ static inline int pud_bad(pud_t pud)
  * A pud_t points to a pmd_t array.  Since we can have multiple per
  * page, we don't have a one-to-one mapping of pud_t's to pages.
  */
-#define pud_page(pud) pfn_to_page(HV_PTFN_TO_PFN(pud_ptfn(pud)))
+#define pud_page(pud) pfn_to_page(PFN_DOWN(HV_PTFN_TO_CPA(pud_ptfn(pud))))
 
 static inline unsigned long pud_index(unsigned long address)
 {
@@ -108,28 +127,6 @@ static inline unsigned long pud_index(unsigned long address)
 #define pmd_offset(pud, address) \
 	((pmd_t *)pud_page_vaddr(*(pud)) + pmd_index(address))
 
-static inline void __set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-	set_pte(pmdp, pmdval);
-}
-
-/* Create a pmd from a PTFN and pgprot. */
-static inline pmd_t ptfn_pmd(unsigned long ptfn, pgprot_t prot)
-{
-	return hv_pte_set_ptfn(prot, ptfn);
-}
-
-/* Return the page-table frame number (ptfn) that a pmd_t points at. */
-static inline unsigned long pmd_ptfn(pmd_t pmd)
-{
-	return hv_pte_get_ptfn(pmd);
-}
-
-static inline void pmd_clear(pmd_t *pmdp)
-{
-	__pte_clear(pmdp);
-}
-
 /* Normalize an address to having the correct high bits set. */
 #define pgd_addr_normalize pgd_addr_normalize
 static inline unsigned long pgd_addr_normalize(unsigned long addr)
@@ -141,8 +138,7 @@ static inline unsigned long pgd_addr_normalize(unsigned long addr)
 /* We don't define any pgds for these addresses. */
 static inline int pgd_addr_invalid(unsigned long addr)
 {
-	return addr >= MEM_HV_START ||
-		(addr > MEM_LOW_END && addr < MEM_HIGH_START);
+	return addr >= KERNEL_HIGH_VADDR || addr != pgd_addr_normalize(addr);
 }
 
 /*
diff --git a/arch/tile/include/asm/pmc.h b/arch/tile/include/asm/pmc.h
new file mode 100644
index 00000000000..7ae3956d900
--- /dev/null
+++ b/arch/tile/include/asm/pmc.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2014 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_PMC_H
+#define _ASM_TILE_PMC_H
+
+#include <linux/ptrace.h>
+
+#define TILE_BASE_COUNTERS	2
+
+/* Bitfields below are derived from SPR PERF_COUNT_CTL*/
+#ifndef __tilegx__
+/* PERF_COUNT_CTL on TILEPro */
+#define TILE_CTL_EXCL_USER	(1 << 7) /* exclude user level */
+#define TILE_CTL_EXCL_KERNEL	(1 << 8) /* exclude kernel level */
+#define TILE_CTL_EXCL_HV	(1 << 9) /* exclude hypervisor level */
+
+#define TILE_SEL_MASK		0x7f	/* 7 bits for event SEL,
+					COUNT_0_SEL */
+#define TILE_PLM_MASK		0x780	/* 4 bits priv level msks,
+					COUNT_0_MASK*/
+#define TILE_EVENT_MASK	(TILE_SEL_MASK | TILE_PLM_MASK)
+
+#else /* __tilegx__*/
+/* PERF_COUNT_CTL on TILEGx*/
+#define TILE_CTL_EXCL_USER	(1 << 10) /* exclude user level */
+#define TILE_CTL_EXCL_KERNEL	(1 << 11) /* exclude kernel level */
+#define TILE_CTL_EXCL_HV	(1 << 12) /* exclude hypervisor level */
+
+#define TILE_SEL_MASK		0x3f	/* 6 bits for event SEL,
+					COUNT_0_SEL*/
+#define TILE_BOX_MASK		0x1c0	/* 3 bits box msks,
+					COUNT_0_BOX */
+#define TILE_PLM_MASK		0x3c00	/* 4 bits priv level msks,
+					COUNT_0_MASK */
+#define TILE_EVENT_MASK	(TILE_SEL_MASK | TILE_BOX_MASK | TILE_PLM_MASK)
+#endif /* __tilegx__*/
+
+/* Takes register and fault number.  Returns error to disable the interrupt. */
+typedef int (*perf_irq_t)(struct pt_regs *, int);
+
+int userspace_perf_handler(struct pt_regs *regs, int fault);
+
+perf_irq_t reserve_pmc_hardware(perf_irq_t new_perf_irq);
+void release_pmc_hardware(void);
+
+unsigned long pmc_get_overflow(void);
+void pmc_ack_overflow(unsigned long status);
+
+void unmask_pmc_interrupts(void);
+void mask_pmc_interrupts(void);
+
+#endif /* _ASM_TILE_PMC_H */
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index 34c1e01ffb5..42323636c45 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -15,6 +15,8 @@
 #ifndef _ASM_TILE_PROCESSOR_H
 #define _ASM_TILE_PROCESSOR_H
 
+#include <arch/chip.h>
+
 #ifndef __ASSEMBLY__
 
 /*
@@ -25,7 +27,6 @@
 #include <asm/ptrace.h>
 #include <asm/percpu.h>
 
-#include <arch/chip.h>
 #include <arch/spr_def.h>
 
 struct task_struct;
@@ -76,6 +77,17 @@ struct async_tlb {
 
 #ifdef CONFIG_HARDWALL
 struct hardwall_info;
+struct hardwall_task {
+	/* Which hardwall is this task tied to? (or NULL if none) */
+	struct hardwall_info *info;
+	/* Chains this task into the list at info->task_head. */
+	struct list_head list;
+};
+#ifdef __tilepro__
+#define HARDWALL_TYPES 1   /* udn */
+#else
+#define HARDWALL_TYPES 3   /* udn, idn, and ipi */
+#endif
 #endif
 
 struct thread_struct {
@@ -99,47 +111,38 @@ struct thread_struct {
 	unsigned long long interrupt_mask;
 	/* User interrupt-control 0 state */
 	unsigned long intctrl_0;
-#if CHIP_HAS_PROC_STATUS_SPR()
+	/* Is this task currently doing a backtrace? */
+	bool in_backtrace;
 	/* Any other miscellaneous processor state bits */
 	unsigned long proc_status;
-#endif
 #if !CHIP_HAS_FIXED_INTVEC_BASE()
 	/* Interrupt base for PL0 interrupts */
 	unsigned long interrupt_vector_base;
 #endif
-#if CHIP_HAS_TILE_RTF_HWM()
 	/* Tile cache retry fifo high-water mark */
 	unsigned long tile_rtf_hwm;
-#endif
 #if CHIP_HAS_DSTREAM_PF()
 	/* Data stream prefetch control */
 	unsigned long dstream_pf;
 #endif
 #ifdef CONFIG_HARDWALL
-	/* Is this task tied to an activated hardwall? */
-	struct hardwall_info *hardwall;
-	/* Chains this task into the list at hardwall->list. */
-	struct list_head hardwall_list;
+	/* Hardwall information for various resources. */
+	struct hardwall_task hardwall[HARDWALL_TYPES];
 #endif
 #if CHIP_HAS_TILE_DMA()
 	/* Async DMA TLB fault information */
 	struct async_tlb dma_async_tlb;
 #endif
-#if CHIP_HAS_SN_PROC()
-	/* Was static network processor when we were switched out? */
-	int sn_proc_running;
-	/* Async SNI TLB fault information */
-	struct async_tlb sn_async_tlb;
-#endif
 };
 
 #endif /* !__ASSEMBLY__ */
 
 /*
  * Start with "sp" this many bytes below the top of the kernel stack.
- * This preserves the invariant that a called function may write to *sp.
+ * This allows us to be cache-aware when handling the initial save
+ * of the pt_regs value to the stack.
  */
-#define STACK_TOP_DELTA 8
+#define STACK_TOP_DELTA 64
 
 /*
  * When entering the kernel via a fault, start with the top of the
@@ -155,7 +158,7 @@ struct thread_struct {
 #ifndef __ASSEMBLY__
 
 #ifdef __tilegx__
-#define TASK_SIZE_MAX		(MEM_LOW_END + 1)
+#define TASK_SIZE_MAX		(_AC(1, UL) << (MAX_VA_WIDTH - 1))
 #else
 #define TASK_SIZE_MAX		PAGE_OFFSET
 #endif
@@ -169,10 +172,10 @@ struct thread_struct {
 #define TASK_SIZE		TASK_SIZE_MAX
 #endif
 
-/* We provide a minimal "vdso" a la x86; just the sigreturn code for now. */
-#define VDSO_BASE		(TASK_SIZE - PAGE_SIZE)
+#define VDSO_BASE	((unsigned long)current->active_mm->context.vdso_base)
+#define VDSO_SYM(x)	(VDSO_BASE + (unsigned long)(x))
 
-#define STACK_TOP		VDSO_BASE
+#define STACK_TOP		TASK_SIZE
 
 /* STACK_TOP_MAX is used temporarily in execve and should not check COMPAT. */
 #define STACK_TOP_MAX		TASK_SIZE_MAX
@@ -202,6 +205,7 @@ static inline void start_thread(struct pt_regs *regs,
 {
 	regs->pc = pc;
 	regs->sp = usp;
+	single_step_execve();
 }
 
 /* Free all resources held by a thread. */
@@ -210,35 +214,40 @@ static inline void release_thread(struct task_struct *dead_task)
 	/* Nothing for now */
 }
 
-/* Prepare to copy thread state - unlazy all lazy status. */
-#define prepare_to_copy(tsk)	do { } while (0)
-
-extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
-
 extern int do_work_pending(struct pt_regs *regs, u32 flags);
 
 
 /*
  * Return saved (kernel) PC of a blocked thread.
- * Only used in a printk() in kernel/sched.c, so don't work too hard.
+ * Only used in a printk() in kernel/sched/core.c, so don't work too hard.
  */
 #define thread_saved_pc(t)   ((t)->thread.pc)
 
 unsigned long get_wchan(struct task_struct *p);
 
 /* Return initial ksp value for given task. */
-#define task_ksp0(task) ((unsigned long)(task)->stack + THREAD_SIZE)
+#define task_ksp0(task) \
+	((unsigned long)(task)->stack + THREAD_SIZE - STACK_TOP_DELTA)
 
 /* Return some info about the user process TASK. */
-#define KSTK_TOP(task)	(task_ksp0(task) - STACK_TOP_DELTA)
 #define task_pt_regs(task) \
-  ((struct pt_regs *)(task_ksp0(task) - KSTK_PTREGS_GAP) - 1)
+	((struct pt_regs *)(task_ksp0(task) - KSTK_PTREGS_GAP) - 1)
+#define current_pt_regs()                                   \
+	((struct pt_regs *)((stack_pointer | (THREAD_SIZE - 1)) - \
+			    STACK_TOP_DELTA - (KSTK_PTREGS_GAP - 1)) - 1)
 #define task_sp(task)	(task_pt_regs(task)->sp)
 #define task_pc(task)	(task_pt_regs(task)->pc)
 /* Aliases for pc and sp (used in fs/proc/array.c) */
 #define KSTK_EIP(task)	task_pc(task)
 #define KSTK_ESP(task)	task_sp(task)
 
+/* Fine-grained unaligned JIT support */
+#define GET_UNALIGN_CTL(tsk, adr)	get_unalign_ctl((tsk), (adr))
+#define SET_UNALIGN_CTL(tsk, val)	set_unalign_ctl((tsk), (val))
+
+extern int get_unalign_ctl(struct task_struct *tsk, unsigned long adr);
+extern int set_unalign_ctl(struct task_struct *tsk, unsigned int val);
+
 /* Standard format for printing registers and other word-size data. */
 #ifdef __tilegx__
 # define REGFMT "0x%016lx"
@@ -267,7 +276,6 @@ extern char chip_model[64];
 /* Data on which physical memory controller corresponds to which NUMA node. */
 extern int node_controller[];
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 /* Does the heap allocator return hash-for-home pages by default? */
 extern int hash_default;
 
@@ -277,11 +285,6 @@ extern int kstack_hash;
 /* Does MAP_ANONYMOUS return hash-for-home pages by default? */
 #define uheap_hash hash_default
 
-#else
-#define hash_default 0
-#define kstack_hash 0
-#define uheap_hash 0
-#endif
 
 /* Are we using huge pages in the TLB for kernel data? */
 extern int kdata_huge;
@@ -329,7 +332,6 @@ extern int kdata_huge;
 
 /*
  * Provide symbolic constants for PLs.
- * Note that assembly code assumes that USER_PL is zero.
  */
 #define USER_PL 0
 #if CONFIG_KERNEL_PL == 2
@@ -338,20 +340,38 @@ extern int kdata_huge;
 #define KERNEL_PL CONFIG_KERNEL_PL
 
 /* SYSTEM_SAVE_K_0 holds the current cpu number ORed with ksp0. */
-#define CPU_LOG_MASK_VALUE 12
-#define CPU_MASK_VALUE ((1 << CPU_LOG_MASK_VALUE) - 1)
-#if CONFIG_NR_CPUS > CPU_MASK_VALUE
-# error Too many cpus!
+#ifdef __tilegx__
+#define CPU_SHIFT 48
+#if CHIP_VA_WIDTH() > CPU_SHIFT
+# error Too many VA bits!
 #endif
+#define MAX_CPU_ID ((1 << (64 - CPU_SHIFT)) - 1)
 #define raw_smp_processor_id() \
-	((int)__insn_mfspr(SPR_SYSTEM_SAVE_K_0) & CPU_MASK_VALUE)
+	((int)(__insn_mfspr(SPR_SYSTEM_SAVE_K_0) >> CPU_SHIFT))
 #define get_current_ksp0() \
-	(__insn_mfspr(SPR_SYSTEM_SAVE_K_0) & ~CPU_MASK_VALUE)
+	((unsigned long)(((long)__insn_mfspr(SPR_SYSTEM_SAVE_K_0) << \
+			  (64 - CPU_SHIFT)) >> (64 - CPU_SHIFT)))
+#define next_current_ksp0(task) ({ \
+	unsigned long __ksp0 = task_ksp0(task) & ((1UL << CPU_SHIFT) - 1); \
+	unsigned long __cpu = (long)raw_smp_processor_id() << CPU_SHIFT; \
+	__ksp0 | __cpu; \
+})
+#else
+#define LOG2_NR_CPU_IDS 6
+#define MAX_CPU_ID ((1 << LOG2_NR_CPU_IDS) - 1)
+#define raw_smp_processor_id() \
+	((int)__insn_mfspr(SPR_SYSTEM_SAVE_K_0) & MAX_CPU_ID)
+#define get_current_ksp0() \
+	(__insn_mfspr(SPR_SYSTEM_SAVE_K_0) & ~MAX_CPU_ID)
 #define next_current_ksp0(task) ({ \
 	unsigned long __ksp0 = task_ksp0(task); \
 	int __cpu = raw_smp_processor_id(); \
-	BUG_ON(__ksp0 & CPU_MASK_VALUE); \
+	BUG_ON(__ksp0 & MAX_CPU_ID); \
 	__ksp0 | __cpu; \
 })
+#endif
+#if CONFIG_NR_CPUS > (MAX_CPU_ID + 1)
+# error Too many cpus!
+#endif
 
 #endif /* _ASM_TILE_PROCESSOR_H */
diff --git a/arch/tile/include/asm/ptrace.h b/arch/tile/include/asm/ptrace.h
index c6cddd7e8d5..b9620c077ab 100644
--- a/arch/tile/include/asm/ptrace.h
+++ b/arch/tile/include/asm/ptrace.h
@@ -11,87 +11,20 @@
  *   NON INFRINGEMENT.  See the GNU General Public License for
  *   more details.
  */
-
 #ifndef _ASM_TILE_PTRACE_H
 #define _ASM_TILE_PTRACE_H
 
-#include <arch/chip.h>
-#include <arch/abi.h>
-
-/* These must match struct pt_regs, below. */
-#if CHIP_WORD_SIZE() == 32
-#define PTREGS_OFFSET_REG(n)    ((n)*4)
-#else
-#define PTREGS_OFFSET_REG(n)    ((n)*8)
-#endif
-#define PTREGS_OFFSET_BASE      0
-#define PTREGS_OFFSET_TP        PTREGS_OFFSET_REG(53)
-#define PTREGS_OFFSET_SP        PTREGS_OFFSET_REG(54)
-#define PTREGS_OFFSET_LR        PTREGS_OFFSET_REG(55)
-#define PTREGS_NR_GPRS          56
-#define PTREGS_OFFSET_PC        PTREGS_OFFSET_REG(56)
-#define PTREGS_OFFSET_EX1       PTREGS_OFFSET_REG(57)
-#define PTREGS_OFFSET_FAULTNUM  PTREGS_OFFSET_REG(58)
-#define PTREGS_OFFSET_ORIG_R0   PTREGS_OFFSET_REG(59)
-#define PTREGS_OFFSET_FLAGS     PTREGS_OFFSET_REG(60)
-#if CHIP_HAS_CMPEXCH()
-#define PTREGS_OFFSET_CMPEXCH   PTREGS_OFFSET_REG(61)
-#endif
-#define PTREGS_SIZE             PTREGS_OFFSET_REG(64)
+#include <linux/compiler.h>
 
 #ifndef __ASSEMBLY__
-
-#ifdef __KERNEL__
 /* Benefit from consistent use of "long" on all chips. */
 typedef unsigned long pt_reg_t;
-#else
-/* Provide appropriate length type to userspace regardless of -m32/-m64. */
-typedef uint_reg_t pt_reg_t;
-#endif
-
-/*
- * This struct defines the way the registers are stored on the stack during a
- * system call or exception.  "struct sigcontext" has the same shape.
- */
-struct pt_regs {
-	/* Saved main processor registers; 56..63 are special. */
-	/* tp, sp, and lr must immediately follow regs[] for aliasing. */
-	pt_reg_t regs[53];
-	pt_reg_t tp;		/* aliases regs[TREG_TP] */
-	pt_reg_t sp;		/* aliases regs[TREG_SP] */
-	pt_reg_t lr;		/* aliases regs[TREG_LR] */
-
-	/* Saved special registers. */
-	pt_reg_t pc;		/* stored in EX_CONTEXT_K_0 */
-	pt_reg_t ex1;		/* stored in EX_CONTEXT_K_1 (PL and ICS bit) */
-	pt_reg_t faultnum;	/* fault number (INT_SWINT_1 for syscall) */
-	pt_reg_t orig_r0;	/* r0 at syscall entry, else zero */
-	pt_reg_t flags;		/* flags (see below) */
-#if !CHIP_HAS_CMPEXCH()
-	pt_reg_t pad[3];
-#else
-	pt_reg_t cmpexch;	/* value of CMPEXCH_VALUE SPR at interrupt */
-	pt_reg_t pad[2];
 #endif
-};
-
-#endif /* __ASSEMBLY__ */
 
-#define PTRACE_GETREGS		12
-#define PTRACE_SETREGS		13
-#define PTRACE_GETFPREGS	14
-#define PTRACE_SETFPREGS	15
+#include <uapi/asm/ptrace.h>
 
-/* Support TILE-specific ptrace options, with events starting at 16. */
-#define PTRACE_O_TRACEMIGRATE	0x00010000
-#define PTRACE_EVENT_MIGRATE	16
-#ifdef __KERNEL__
 #define PTRACE_O_MASK_TILE	(PTRACE_O_TRACEMIGRATE)
-#define PT_TRACE_MIGRATE	0x00080000
-#define PT_TRACE_MASK_TILE	(PT_TRACE_MIGRATE)
-#endif
-
-#ifdef __KERNEL__
+#define PT_TRACE_MIGRATE	PT_EVENT_FLAG(PTRACE_EVENT_MIGRATE)
 
 /* Flag bits in pt_regs.flags */
 #define PT_FLAGS_DISABLE_IRQ    1  /* on return to kernel, disable irqs */
@@ -100,17 +33,20 @@ struct pt_regs {
 
 #ifndef __ASSEMBLY__
 
+#define regs_return_value(regs) ((regs)->regs[0])
 #define instruction_pointer(regs) ((regs)->pc)
 #define profile_pc(regs) instruction_pointer(regs)
+#define user_stack_pointer(regs) ((regs)->sp)
 
 /* Does the process account for user or for system time? */
-#define user_mode(regs) (EX1_PL((regs)->ex1) == USER_PL)
+#define user_mode(regs) (EX1_PL((regs)->ex1) < KERNEL_PL)
 
 /* Fill in a struct pt_regs with the current kernel registers. */
 struct pt_regs *get_pt_regs(struct pt_regs *);
 
 /* Trace the current syscall. */
-extern void do_syscall_trace(void);
+extern int do_syscall_trace_enter(struct pt_regs *regs);
+extern void do_syscall_trace_exit(struct pt_regs *regs);
 
 #define arch_has_single_step()	(1)
 
@@ -144,8 +80,7 @@ extern void single_step_execve(void);
 
 struct task_struct;
 
-extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
-			 int error_code);
+extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs);
 
 #ifdef __tilegx__
 /* We need this since sigval_t has a user pointer in it, for GETSIGINFO etc. */
@@ -159,6 +94,4 @@ extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 #define SINGLESTEP_STATE_TARGET_LB              2
 #define SINGLESTEP_STATE_TARGET_UB              7
 
-#endif /* !__KERNEL__ */
-
 #endif /* _ASM_TILE_PTRACE_H */
diff --git a/arch/tile/include/asm/sections.h b/arch/tile/include/asm/sections.h
index d062d463fca..5d5d3b739a6 100644
--- a/arch/tile/include/asm/sections.h
+++ b/arch/tile/include/asm/sections.h
@@ -25,16 +25,22 @@ extern char _sinitdata[], _einitdata[];
 /* Write-once data is writable only till the end of initialization. */
 extern char __w1data_begin[], __w1data_end[];
 
+extern char vdso_start[], vdso_end[];
+#ifdef CONFIG_COMPAT
+extern char vdso32_start[], vdso32_end[];
+#endif
 
 /* Not exactly sections, but PC comparison points in the code. */
 extern char __rt_sigreturn[], __rt_sigreturn_end[];
-#ifndef __tilegx__
+#ifdef __tilegx__
+extern char __start_unalign_asm_code[], __end_unalign_asm_code[];
+#else
 extern char sys_cmpxchg[], __sys_cmpxchg_end[];
 extern char __sys_cmpxchg_grab_lock[];
 extern char __start_atomic_asm_code[], __end_atomic_asm_code[];
 #endif
 
-/* Handle the discontiguity between _sdata and _stext. */
+/* Handle the discontiguity between _sdata and _text. */
 static inline int arch_is_kernel_data(unsigned long addr)
 {
 	return addr >= (unsigned long)_sdata &&
diff --git a/arch/tile/include/asm/setup.h b/arch/tile/include/asm/setup.h
index 7caf0f36b03..e98909033e5 100644
--- a/arch/tile/include/asm/setup.h
+++ b/arch/tile/include/asm/setup.h
@@ -11,26 +11,42 @@
  *   NON INFRINGEMENT.  See the GNU General Public License for
  *   more details.
  */
-
 #ifndef _ASM_TILE_SETUP_H
 #define _ASM_TILE_SETUP_H
 
-#define COMMAND_LINE_SIZE	2048
-
-#ifdef __KERNEL__
 
 #include <linux/pfn.h>
 #include <linux/init.h>
+#include <uapi/asm/setup.h>
 
 /*
  * Reserved space for vmalloc and iomap - defined in asm/page.h
  */
 #define MAXMEM_PFN	PFN_DOWN(MAXMEM)
 
+int tile_console_write(const char *buf, int count);
 void early_panic(const char *fmt, ...);
-void warn_early_printk(void);
-void __init disable_early_printk(void);
 
-#endif /* __KERNEL__ */
+/* Init-time routine to do tile-specific per-cpu setup. */
+void setup_cpu(int boot);
+
+/* User-level DMA management functions */
+void grant_dma_mpls(void);
+void restrict_dma_mpls(void);
+
+#ifdef CONFIG_HARDWALL
+/* User-level network management functions */
+void reset_network_state(void);
+struct task_struct;
+void hardwall_switch_tasks(struct task_struct *prev, struct task_struct *next);
+void hardwall_deactivate_all(struct task_struct *task);
+int hardwall_ipi_valid(int cpu);
+
+/* Hook hardwall code into changes in affinity. */
+#define arch_set_cpus_allowed(p, new_mask) do { \
+	if (!cpumask_equal(&p->cpus_allowed, new_mask)) \
+		hardwall_deactivate_all(p); \
+} while (0)
+#endif
 
 #endif /* _ASM_TILE_SETUP_H */
diff --git a/arch/tile/include/asm/signal.h b/arch/tile/include/asm/signal.h
index 1e5e49aad54..10e183de96d 100644
--- a/arch/tile/include/asm/signal.h
+++ b/arch/tile/include/asm/signal.h
@@ -11,19 +11,11 @@
  *   NON INFRINGEMENT.  See the GNU General Public License for
  *   more details.
  */
-
 #ifndef _ASM_TILE_SIGNAL_H
 #define _ASM_TILE_SIGNAL_H
 
-/* Do not notify a ptracer when this signal is handled. */
-#define SA_NOPTRACE 0x02000000u
-
-/* Used in earlier Tilera releases, so keeping for binary compatibility. */
-#define SA_RESTORER 0x04000000u
-
-#include <asm-generic/signal.h>
+#include <uapi/asm/signal.h>
 
-#if defined(__KERNEL__)
 #if !defined(__ASSEMBLY__)
 struct pt_regs;
 int restore_sigcontext(struct pt_regs *, struct sigcontext __user *);
@@ -34,6 +26,4 @@ void signal_fault(const char *type, struct pt_regs *,
 void trace_unhandled_signal(const char *type, struct pt_regs *regs,
 			    unsigned long address, int signo);
 #endif
-#endif
-
 #endif /* _ASM_TILE_SIGNAL_H */
diff --git a/arch/tile/include/asm/smp.h b/arch/tile/include/asm/smp.h
index 532124ae4b1..9a326b64f7a 100644
--- a/arch/tile/include/asm/smp.h
+++ b/arch/tile/include/asm/smp.h
@@ -43,10 +43,6 @@ void evaluate_message(int tag);
 /* Boot a secondary cpu */
 void online_secondary(void);
 
-/* Call a function on a specified set of CPUs (may include this one). */
-extern void on_each_cpu_mask(const struct cpumask *mask,
-			     void (*func)(void *), void *info, bool wait);
-
 /* Topology of the supervisor tile grid, and coordinates of boot processor */
 extern HV_Topology smp_topology;
 
@@ -91,9 +87,6 @@ void print_disabled_cpus(void);
 
 #else /* !CONFIG_SMP */
 
-#define on_each_cpu_mask(mask, func, info, wait)		\
-  do { if (cpumask_test_cpu(0, (mask))) func(info); } while (0)
-
 #define smp_master_cpu		0
 #define smp_height		1
 #define smp_width		1
@@ -108,10 +101,8 @@ void print_disabled_cpus(void);
 extern struct cpumask cpu_lotar_map;
 #define cpu_is_valid_lotar(cpu) cpumask_test_cpu((cpu), &cpu_lotar_map)
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 /* Which processors are used for hash-for-home mapping */
 extern struct cpumask hash_for_home_map;
-#endif
 
 /* Which cpus can have their cache flushed by hv_flush_remote(). */
 extern struct cpumask cpu_cacheable_map;
diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h
index a5e4208d34f..c0a77b38d39 100644
--- a/arch/tile/include/asm/spinlock_32.h
+++ b/arch/tile/include/asm/spinlock_32.h
@@ -19,7 +19,6 @@
 
 #include <linux/atomic.h>
 #include <asm/page.h>
-#include <asm/system.h>
 #include <linux/compiler.h>
 
 /*
diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h
index 72be5904e02..9a12b9c7e5d 100644
--- a/arch/tile/include/asm/spinlock_64.h
+++ b/arch/tile/include/asm/spinlock_64.h
@@ -27,7 +27,7 @@
  * Return the "current" portion of a ticket lock value,
  * i.e. the number that currently owns the lock.
  */
-static inline int arch_spin_current(u32 val)
+static inline u32 arch_spin_current(u32 val)
 {
 	return val >> __ARCH_SPIN_CURRENT_SHIFT;
 }
@@ -36,7 +36,7 @@ static inline int arch_spin_current(u32 val)
  * Return the "next" portion of a ticket lock value,
  * i.e. the number that the next task to try to acquire the lock will get.
  */
-static inline int arch_spin_next(u32 val)
+static inline u32 arch_spin_next(u32 val)
 {
 	return val & __ARCH_SPIN_NEXT_MASK;
 }
@@ -137,7 +137,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw)
 static inline void arch_write_unlock(arch_rwlock_t *rw)
 {
 	__insn_mf();
-	rw->lock = 0;
+	__insn_exch4(&rw->lock, 0);  /* Avoid waiting in the write buffer. */
 }
 
 static inline int arch_read_trylock(arch_rwlock_t *rw)
diff --git a/arch/tile/include/asm/stack.h b/arch/tile/include/asm/stack.h
index 4d97a2db932..0e9d382a2d4 100644
--- a/arch/tile/include/asm/stack.h
+++ b/arch/tile/include/asm/stack.h
@@ -25,7 +25,6 @@
 struct KBacktraceIterator {
 	BacktraceIterator it;
 	struct task_struct *task;     /* task we are backtracing */
-	pte_t *pgtable;		      /* page table for user space access */
 	int end;		      /* iteration complete. */
 	int new_context;              /* new context is starting */
 	int profile;                  /* profiling, so stop on async intrpt */
diff --git a/arch/tile/include/asm/string.h b/arch/tile/include/asm/string.h
index 7535cf1a30e..92b271bd9eb 100644
--- a/arch/tile/include/asm/string.h
+++ b/arch/tile/include/asm/string.h
@@ -21,8 +21,10 @@
 #define __HAVE_ARCH_MEMMOVE
 #define __HAVE_ARCH_STRCHR
 #define __HAVE_ARCH_STRLEN
+#define __HAVE_ARCH_STRNLEN
 
 extern __kernel_size_t strlen(const char *);
+extern __kernel_size_t strnlen(const char *, __kernel_size_t);
 extern char *strchr(const char *s, int c);
 extern void *memchr(const void *s, int c, size_t n);
 extern void *memset(void *, int, __kernel_size_t);
diff --git a/arch/tile/include/asm/switch_to.h b/arch/tile/include/asm/switch_to.h
new file mode 100644
index 00000000000..b8f888cbe6b
--- /dev/null
+++ b/arch/tile/include/asm/switch_to.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_SWITCH_TO_H
+#define _ASM_TILE_SWITCH_TO_H
+
+#include <arch/sim_def.h>
+
+/*
+ * switch_to(n) should switch tasks to task nr n, first
+ * checking that n isn't the current task, in which case it does nothing.
+ * The number of callee-saved registers saved on the kernel stack
+ * is defined here for use in copy_thread() and must agree with __switch_to().
+ */
+#define CALLEE_SAVED_FIRST_REG 30
+#define CALLEE_SAVED_REGS_COUNT 24   /* r30 to r52, plus an empty to align */
+
+#ifndef __ASSEMBLY__
+
+struct task_struct;
+
+/*
+ * Pause the DMA engine and static network before task switching.
+ */
+#define prepare_arch_switch(next) _prepare_arch_switch(next)
+void _prepare_arch_switch(struct task_struct *next);
+
+struct task_struct;
+#define switch_to(prev, next, last) ((last) = _switch_to((prev), (next)))
+extern struct task_struct *_switch_to(struct task_struct *prev,
+				      struct task_struct *next);
+
+/* Helper function for _switch_to(). */
+extern struct task_struct *__switch_to(struct task_struct *prev,
+				       struct task_struct *next,
+				       unsigned long new_system_save_k_0);
+
+/* Address that switched-away from tasks are at. */
+extern unsigned long get_switch_to_pc(void);
+
+/*
+ * Kernel threads can check to see if they need to migrate their
+ * stack whenever they return from a context switch; for user
+ * threads, we defer until they are returning to user-space.
+ */
+#define finish_arch_switch(prev) do {                                     \
+	if (unlikely((prev)->state == TASK_DEAD))                         \
+		__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT |       \
+			((prev)->pid << _SIM_CONTROL_OPERATOR_BITS));     \
+	__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH |             \
+		(current->pid << _SIM_CONTROL_OPERATOR_BITS));            \
+	if (current->mm == NULL && !kstack_hash &&                        \
+	    current_thread_info()->homecache_cpu != smp_processor_id())   \
+		homecache_migrate_kthread();                              \
+} while (0)
+
+/* Support function for forking a new task. */
+void ret_from_fork(void);
+
+/* Support function for forking a new kernel thread. */
+void ret_from_kernel_thread(void *fn, void *arg);
+
+/* Called from ret_from_xxx() when a new process starts up. */
+struct task_struct *sim_notify_fork(struct task_struct *prev);
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_TILE_SWITCH_TO_H */
diff --git a/arch/tile/include/asm/syscall.h b/arch/tile/include/asm/syscall.h
index d35e0dcb67b..9644b88f133 100644
--- a/arch/tile/include/asm/syscall.h
+++ b/arch/tile/include/asm/syscall.h
@@ -22,6 +22,12 @@
 #include <linux/err.h>
 #include <arch/abi.h>
 
+/* The array of function pointers for syscalls. */
+extern void *sys_call_table[];
+#ifdef CONFIG_COMPAT
+extern void *compat_sys_call_table[];
+#endif
+
 /*
  * Only the low 32 bits of orig_r0 are meaningful, so we return int.
  * This importantly ignores the high bits on 64-bit, so comparisons
diff --git a/arch/tile/include/asm/syscalls.h b/arch/tile/include/asm/syscalls.h
index 3b5507c31ea..07b298450ef 100644
--- a/arch/tile/include/asm/syscalls.h
+++ b/arch/tile/include/asm/syscalls.h
@@ -24,12 +24,6 @@
 #include <linux/types.h>
 #include <linux/compat.h>
 
-/* The array of function pointers for syscalls. */
-extern void *sys_call_table[];
-#ifdef CONFIG_COMPAT
-extern void *compat_sys_call_table[];
-#endif
-
 /*
  * Note that by convention, any syscall which requires the current
  * register set takes an additional "struct pt_regs *" pointer; a
@@ -43,15 +37,15 @@ long sys32_fadvise64(int fd, u32 offset_lo, u32 offset_hi,
 		     u32 len, int advice);
 int sys32_fadvise64_64(int fd, u32 offset_lo, u32 offset_hi,
 		       u32 len_lo, u32 len_hi, int advice);
-long sys_flush_cache(void);
+long sys_cacheflush(unsigned long addr, unsigned long len,
+		    unsigned long flags);
 #ifndef __tilegx__  /* No mmap() in the 32-bit kernel. */
 #define sys_mmap sys_mmap
 #endif
 
 #ifndef __tilegx__
 /* mm/fault.c */
-long sys_cmpxchg_badaddr(unsigned long address, struct pt_regs *);
-long _sys_cmpxchg_badaddr(unsigned long address);
+long sys_cmpxchg_badaddr(unsigned long address);
 #endif
 
 #ifdef CONFIG_COMPAT
@@ -62,14 +56,14 @@ long sys_truncate64(const char __user *path, loff_t length);
 long sys_ftruncate64(unsigned int fd, loff_t length);
 #endif
 
+/* Provide versions of standard syscalls that use current_pt_regs(). */
+long sys_rt_sigreturn(void);
+#define sys_rt_sigreturn sys_rt_sigreturn
+
 /* These are the intvec*.S trampolines. */
-long _sys_sigaltstack(const stack_t __user *, stack_t __user *);
 long _sys_rt_sigreturn(void);
 long _sys_clone(unsigned long clone_flags, unsigned long newsp,
 		void __user *parent_tid, void __user *child_tid);
-long _sys_execve(const char __user *filename,
-		 const char __user *const __user *argv,
-		 const char __user *const __user *envp);
 
 #include <asm-generic/syscalls.h>
 
diff --git a/arch/tile/include/asm/system.h b/arch/tile/include/asm/system.h
deleted file mode 100644
index 23d1842f483..00000000000
--- a/arch/tile/include/asm/system.h
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- */
-
-#ifndef _ASM_TILE_SYSTEM_H
-#define _ASM_TILE_SYSTEM_H
-
-#ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-#include <linux/irqflags.h>
-
-/* NOTE: we can't include <linux/ptrace.h> due to #include dependencies. */
-#include <asm/ptrace.h>
-
-#include <arch/chip.h>
-#include <arch/sim_def.h>
-#include <arch/spr_def.h>
-
-/*
- * read_barrier_depends - Flush all pending reads that subsequents reads
- * depend on.
- *
- * No data-dependent reads from memory-like regions are ever reordered
- * over this barrier.  All reads preceding this primitive are guaranteed
- * to access memory (but not necessarily other CPUs' caches) before any
- * reads following this primitive that depend on the data return by
- * any of the preceding reads.  This primitive is much lighter weight than
- * rmb() on most CPUs, and is never heavier weight than is
- * rmb().
- *
- * These ordering constraints are respected by both the local CPU
- * and the compiler.
- *
- * Ordering is not guaranteed by anything other than these primitives,
- * not even by data dependencies.  See the documentation for
- * memory_barrier() for examples and URLs to more information.
- *
- * For example, the following code would force ordering (the initial
- * value of "a" is zero, "b" is one, and "p" is "&a"):
- *
- * <programlisting>
- *	CPU 0				CPU 1
- *
- *	b = 2;
- *	memory_barrier();
- *	p = &b;				q = p;
- *					read_barrier_depends();
- *					d = *q;
- * </programlisting>
- *
- * because the read of "*q" depends on the read of "p" and these
- * two reads are separated by a read_barrier_depends().  However,
- * the following code, with the same initial values for "a" and "b":
- *
- * <programlisting>
- *	CPU 0				CPU 1
- *
- *	a = 2;
- *	memory_barrier();
- *	b = 3;				y = b;
- *					read_barrier_depends();
- *					x = a;
- * </programlisting>
- *
- * does not enforce ordering, since there is no data dependency between
- * the read of "a" and the read of "b".  Therefore, on some CPUs, such
- * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
- * in cases like this where there are no data dependencies.
- */
-
-#define read_barrier_depends()	do { } while (0)
-
-#define __sync()	__insn_mf()
-
-#if CHIP_HAS_SPLIT_CYCLE()
-#define get_cycles_low() __insn_mfspr(SPR_CYCLE_LOW)
-#else
-#define get_cycles_low() __insn_mfspr(SPR_CYCLE)   /* just get all 64 bits */
-#endif
-
-#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
-#include <hv/syscall_public.h>
-/*
- * Issue an uncacheable load to each memory controller, then
- * wait until those loads have completed.
- */
-static inline void __mb_incoherent(void)
-{
-	long clobber_r10;
-	asm volatile("swint2"
-		     : "=R10" (clobber_r10)
-		     : "R10" (HV_SYS_fence_incoherent)
-		     : "r0", "r1", "r2", "r3", "r4",
-		       "r5", "r6", "r7", "r8", "r9",
-		       "r11", "r12", "r13", "r14",
-		       "r15", "r16", "r17", "r18", "r19",
-		       "r20", "r21", "r22", "r23", "r24",
-		       "r25", "r26", "r27", "r28", "r29");
-}
-#endif
-
-/* Fence to guarantee visibility of stores to incoherent memory. */
-static inline void
-mb_incoherent(void)
-{
-	__insn_mf();
-
-#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
-	{
-#if CHIP_HAS_TILE_WRITE_PENDING()
-		const unsigned long WRITE_TIMEOUT_CYCLES = 400;
-		unsigned long start = get_cycles_low();
-		do {
-			if (__insn_mfspr(SPR_TILE_WRITE_PENDING) == 0)
-				return;
-		} while ((get_cycles_low() - start) < WRITE_TIMEOUT_CYCLES);
-#endif /* CHIP_HAS_TILE_WRITE_PENDING() */
-		(void) __mb_incoherent();
-	}
-#endif /* CHIP_HAS_MF_WAITS_FOR_VICTIMS() */
-}
-
-#define fast_wmb()	__sync()
-#define fast_rmb()	__sync()
-#define fast_mb()	__sync()
-#define fast_iob()	mb_incoherent()
-
-#define wmb()		fast_wmb()
-#define rmb()		fast_rmb()
-#define mb()		fast_mb()
-#define iob()		fast_iob()
-
-#ifdef CONFIG_SMP
-#define smp_mb()	mb()
-#define smp_rmb()	rmb()
-#define smp_wmb()	wmb()
-#define smp_read_barrier_depends()	read_barrier_depends()
-#else
-#define smp_mb()	barrier()
-#define smp_rmb()	barrier()
-#define smp_wmb()	barrier()
-#define smp_read_barrier_depends()	do { } while (0)
-#endif
-
-#define set_mb(var, value) \
-	do { var = value; mb(); } while (0)
-
-/*
- * Pause the DMA engine and static network before task switching.
- */
-#define prepare_arch_switch(next) _prepare_arch_switch(next)
-void _prepare_arch_switch(struct task_struct *next);
-
-
-/*
- * switch_to(n) should switch tasks to task nr n, first
- * checking that n isn't the current task, in which case it does nothing.
- * The number of callee-saved registers saved on the kernel stack
- * is defined here for use in copy_thread() and must agree with __switch_to().
- */
-#endif /* !__ASSEMBLY__ */
-#define CALLEE_SAVED_FIRST_REG 30
-#define CALLEE_SAVED_REGS_COUNT 24   /* r30 to r52, plus an empty to align */
-#ifndef __ASSEMBLY__
-struct task_struct;
-#define switch_to(prev, next, last) ((last) = _switch_to((prev), (next)))
-extern struct task_struct *_switch_to(struct task_struct *prev,
-				      struct task_struct *next);
-
-/* Helper function for _switch_to(). */
-extern struct task_struct *__switch_to(struct task_struct *prev,
-				       struct task_struct *next,
-				       unsigned long new_system_save_k_0);
-
-/* Address that switched-away from tasks are at. */
-extern unsigned long get_switch_to_pc(void);
-
-/*
- * On SMP systems, when the scheduler does migration-cost autodetection,
- * it needs a way to flush as much of the CPU's caches as possible:
- *
- * TODO: fill this in!
- */
-static inline void sched_cacheflush(void)
-{
-}
-
-#define arch_align_stack(x) (x)
-
-/*
- * Is the kernel doing fixups of unaligned accesses?  If <0, no kernel
- * intervention occurs and SIGBUS is delivered with no data address
- * info.  If 0, the kernel single-steps the instruction to discover
- * the data address to provide with the SIGBUS.  If 1, the kernel does
- * a fixup.
- */
-extern int unaligned_fixup;
-
-/* Is the kernel printing on each unaligned fixup? */
-extern int unaligned_printk;
-
-/* Number of unaligned fixups performed */
-extern unsigned int unaligned_fixup_count;
-
-/* Init-time routine to do tile-specific per-cpu setup. */
-void setup_cpu(int boot);
-
-/* User-level DMA management functions */
-void grant_dma_mpls(void);
-void restrict_dma_mpls(void);
-
-#ifdef CONFIG_HARDWALL
-/* User-level network management functions */
-void reset_network_state(void);
-void grant_network_mpls(void);
-void restrict_network_mpls(void);
-int hardwall_deactivate(struct task_struct *task);
-
-/* Hook hardwall code into changes in affinity. */
-#define arch_set_cpus_allowed(p, new_mask) do { \
-	if (p->thread.hardwall && !cpumask_equal(&p->cpus_allowed, new_mask)) \
-		hardwall_deactivate(p); \
-} while (0)
-#endif
-
-/*
- * Kernel threads can check to see if they need to migrate their
- * stack whenever they return from a context switch; for user
- * threads, we defer until they are returning to user-space.
- */
-#define finish_arch_switch(prev) do {                                     \
-	if (unlikely((prev)->state == TASK_DEAD))                         \
-		__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT |       \
-			((prev)->pid << _SIM_CONTROL_OPERATOR_BITS));     \
-	__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH |             \
-		(current->pid << _SIM_CONTROL_OPERATOR_BITS));            \
-	if (current->mm == NULL && !kstack_hash &&                        \
-	    current_thread_info()->homecache_cpu != smp_processor_id())   \
-		homecache_migrate_kthread();                              \
-} while (0)
-
-/* Support function for forking a new task. */
-void ret_from_fork(void);
-
-/* Called from ret_from_fork() when a new process starts up. */
-struct task_struct *sim_notify_fork(struct task_struct *prev);
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* _ASM_TILE_SYSTEM_H */
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index bc4f562bd45..48e4fd0f38e 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -39,6 +39,11 @@ struct thread_info {
 	struct restart_block	restart_block;
 	struct single_step_state *step_state;	/* single step state
 						   (if non-zero) */
+	int			align_ctl;	/* controls unaligned access */
+#ifdef __tilegx__
+	unsigned long		unalign_jit_tmp[4]; /* temp r0..r3 storage */
+	void __user		*unalign_jit_base; /* unalign fixup JIT base */
+#endif
 };
 
 /*
@@ -56,6 +61,7 @@ struct thread_info {
 		.fn = do_no_restart_syscall,	\
 	},					\
 	.step_state	= NULL,			\
+	.align_ctl	= 0,			\
 }
 
 #define init_thread_info	(init_thread_union.thread_info)
@@ -77,40 +83,36 @@ struct thread_info {
 
 #ifndef __ASSEMBLY__
 
+void arch_release_thread_info(struct thread_info *info);
+
 /* How to get the thread information struct from C. */
 register unsigned long stack_pointer __asm__("sp");
 
 #define current_thread_info() \
   ((struct thread_info *)(stack_pointer & -THREAD_SIZE))
 
-#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
-extern struct thread_info *alloc_thread_info_node(struct task_struct *task, int node);
-extern void free_thread_info(struct thread_info *info);
-
 /* Sit on a nap instruction until interrupted. */
 extern void smp_nap(void);
 
-/* Enable interrupts racelessly and nap forever: helper for cpu_idle(). */
+/* Enable interrupts racelessly and nap forever: helper for arch_cpu_idle(). */
 extern void _cpu_idle(void);
 
-/* Switch boot idle thread to a freshly-allocated stack and free old stack. */
-extern void cpu_idle_on_new_stack(struct thread_info *old_ti,
-				  unsigned long new_sp,
-				  unsigned long new_ss10);
-
 #else /* __ASSEMBLY__ */
 
-/* how to get the thread information struct from ASM */
+/*
+ * How to get the thread information struct from assembly.
+ * Note that we use different macros since different architectures
+ * have different semantics in their "mm" instruction and we would
+ * like to guarantee that the macro expands to exactly one instruction.
+ */
 #ifdef __tilegx__
-#define GET_THREAD_INFO(reg) move reg, sp; mm reg, zero, LOG2_THREAD_SIZE, 63
+#define EXTRACT_THREAD_INFO(reg) mm reg, zero, LOG2_THREAD_SIZE, 63
 #else
 #define GET_THREAD_INFO(reg) mm reg, sp, zero, LOG2_THREAD_SIZE, 31
 #endif
 
 #endif /* !__ASSEMBLY__ */
 
-#define PREEMPT_ACTIVE		0x10000000
-
 /*
  * Thread information flags that various assembly files may need to access.
  * Keep flags accessed frequently in low bits, particular since it makes
@@ -126,6 +128,8 @@ extern void cpu_idle_on_new_stack(struct thread_info *old_ti,
 #define TIF_SECCOMP		6	/* secure computing */
 #define TIF_MEMDIE		7	/* OOM killer at work */
 #define TIF_NOTIFY_RESUME	8	/* callback before returning to user */
+#define TIF_SYSCALL_TRACEPOINT	9	/* syscall tracepoint instrumentation */
+#define TIF_POLLING_NRFLAG	10	/* idle is polling for TIF_NEED_RESCHED */
 
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
@@ -136,12 +140,20 @@ extern void cpu_idle_on_new_stack(struct thread_info *old_ti,
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_MEMDIE		(1<<TIF_MEMDIE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
+#define _TIF_SYSCALL_TRACEPOINT	(1<<TIF_SYSCALL_TRACEPOINT)
+#define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 
 /* Work to do on any return to user space. */
 #define _TIF_ALLWORK_MASK \
   (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP|\
    _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME)
 
+/* Work to do at syscall entry. */
+#define _TIF_SYSCALL_ENTRY_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT)
+
+/* Work to do at syscall exit. */
+#define _TIF_SYSCALL_EXIT_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT)
+
 /*
  * Thread-synchronous status.
  *
@@ -152,18 +164,31 @@ extern void cpu_idle_on_new_stack(struct thread_info *old_ti,
 #ifdef __tilegx__
 #define TS_COMPAT		0x0001	/* 32-bit compatibility mode */
 #endif
-#define TS_POLLING		0x0004	/* in idle loop but not sleeping */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal */
 
-#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
-
 #ifndef __ASSEMBLY__
 #define HAVE_SET_RESTORE_SIGMASK	1
 static inline void set_restore_sigmask(void)
 {
 	struct thread_info *ti = current_thread_info();
 	ti->status |= TS_RESTORE_SIGMASK;
-	set_bit(TIF_SIGPENDING, &ti->flags);
+	WARN_ON(!test_bit(TIF_SIGPENDING, &ti->flags));
+}
+static inline void clear_restore_sigmask(void)
+{
+	current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
+}
+static inline bool test_restore_sigmask(void)
+{
+	return current_thread_info()->status & TS_RESTORE_SIGMASK;
+}
+static inline bool test_and_clear_restore_sigmask(void)
+{
+	struct thread_info *ti = current_thread_info();
+	if (!(ti->status & TS_RESTORE_SIGMASK))
+		return false;
+	ti->status &= ~TS_RESTORE_SIGMASK;
+	return true;
 }
 #endif	/* !__ASSEMBLY__ */
 
diff --git a/arch/tile/include/asm/timex.h b/arch/tile/include/asm/timex.h
index 29921f0b86d..dc987d53e2a 100644
--- a/arch/tile/include/asm/timex.h
+++ b/arch/tile/include/asm/timex.h
@@ -29,11 +29,13 @@ typedef unsigned long long cycles_t;
 
 #if CHIP_HAS_SPLIT_CYCLE()
 cycles_t get_cycles(void);
+#define get_cycles_low() __insn_mfspr(SPR_CYCLE_LOW)
 #else
 static inline cycles_t get_cycles(void)
 {
 	return __insn_mfspr(SPR_CYCLE);
 }
+#define get_cycles_low() __insn_mfspr(SPR_CYCLE)   /* just get all 64 bits */
 #endif
 
 cycles_t get_clock_rate(void);
diff --git a/arch/tile/include/asm/tlbflush.h b/arch/tile/include/asm/tlbflush.h
index 96199d214fb..dcf91b25a1e 100644
--- a/arch/tile/include/asm/tlbflush.h
+++ b/arch/tile/include/asm/tlbflush.h
@@ -38,16 +38,11 @@ DECLARE_PER_CPU(int, current_asid);
 /* The hypervisor tells us what ASIDs are available to us. */
 extern int min_asid, max_asid;
 
-static inline unsigned long hv_page_size(const struct vm_area_struct *vma)
-{
-	return (vma->vm_flags & VM_HUGETLB) ? HPAGE_SIZE : PAGE_SIZE;
-}
-
 /* Pass as vma pointer for non-executable mapping, if no vma available. */
-#define FLUSH_NONEXEC ((const struct vm_area_struct *)-1UL)
+#define FLUSH_NONEXEC ((struct vm_area_struct *)-1UL)
 
 /* Flush a single user page on this cpu. */
-static inline void local_flush_tlb_page(const struct vm_area_struct *vma,
+static inline void local_flush_tlb_page(struct vm_area_struct *vma,
 					unsigned long addr,
 					unsigned long page_size)
 {
@@ -60,7 +55,7 @@ static inline void local_flush_tlb_page(const struct vm_area_struct *vma,
 }
 
 /* Flush range of user pages on this cpu. */
-static inline void local_flush_tlb_pages(const struct vm_area_struct *vma,
+static inline void local_flush_tlb_pages(struct vm_area_struct *vma,
 					 unsigned long addr,
 					 unsigned long page_size,
 					 unsigned long len)
@@ -117,10 +112,10 @@ extern void flush_tlb_all(void);
 extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 extern void flush_tlb_current_task(void);
 extern void flush_tlb_mm(struct mm_struct *);
-extern void flush_tlb_page(const struct vm_area_struct *, unsigned long);
-extern void flush_tlb_page_mm(const struct vm_area_struct *,
+extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
+extern void flush_tlb_page_mm(struct vm_area_struct *,
 			      struct mm_struct *, unsigned long);
-extern void flush_tlb_range(const struct vm_area_struct *,
+extern void flush_tlb_range(struct vm_area_struct *,
 			    unsigned long start, unsigned long end);
 
 #define flush_tlb()     flush_tlb_current_task()
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index 6fdd0c86019..93831184423 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -44,66 +44,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
 /* For now, use numa node -1 for global allocation. */
 #define pcibus_to_node(bus)		((void)(bus), -1)
 
-/*
- * TILE architecture has many cores integrated in one processor, so we need
- * setup bigger balance_interval for both CPU/NODE scheduling domains to
- * reduce process scheduling costs.
- */
-
-/* sched_domains SD_CPU_INIT for TILE architecture */
-#define SD_CPU_INIT (struct sched_domain) {				\
-	.min_interval		= 4,					\
-	.max_interval		= 128,					\
-	.busy_factor		= 64,					\
-	.imbalance_pct		= 125,					\
-	.cache_nice_tries	= 1,					\
-	.busy_idx		= 2,					\
-	.idle_idx		= 1,					\
-	.newidle_idx		= 0,					\
-	.wake_idx		= 0,					\
-	.forkexec_idx		= 0,					\
-									\
-	.flags			= 1*SD_LOAD_BALANCE			\
-				| 1*SD_BALANCE_NEWIDLE			\
-				| 1*SD_BALANCE_EXEC			\
-				| 1*SD_BALANCE_FORK			\
-				| 0*SD_BALANCE_WAKE			\
-				| 0*SD_WAKE_AFFINE			\
-				| 0*SD_PREFER_LOCAL			\
-				| 0*SD_SHARE_CPUPOWER			\
-				| 0*SD_SHARE_PKG_RESOURCES		\
-				| 0*SD_SERIALIZE			\
-				,					\
-	.last_balance		= jiffies,				\
-	.balance_interval	= 32,					\
-}
-
-/* sched_domains SD_NODE_INIT for TILE architecture */
-#define SD_NODE_INIT (struct sched_domain) {				\
-	.min_interval		= 16,					\
-	.max_interval		= 512,					\
-	.busy_factor		= 32,					\
-	.imbalance_pct		= 125,					\
-	.cache_nice_tries	= 1,					\
-	.busy_idx		= 3,					\
-	.idle_idx		= 1,					\
-	.newidle_idx		= 2,					\
-	.wake_idx		= 1,					\
-	.flags			= 1*SD_LOAD_BALANCE			\
-				| 1*SD_BALANCE_NEWIDLE			\
-				| 1*SD_BALANCE_EXEC			\
-				| 1*SD_BALANCE_FORK			\
-				| 0*SD_BALANCE_WAKE			\
-				| 0*SD_WAKE_AFFINE			\
-				| 0*SD_PREFER_LOCAL			\
-				| 0*SD_SHARE_CPUPOWER			\
-				| 0*SD_SHARE_PKG_RESOURCES		\
-				| 1*SD_SERIALIZE			\
-				,					\
-	.last_balance		= jiffies,				\
-	.balance_interval	= 128,					\
-}
-
 /* By definition, we create nodes based on online memory. */
 #define node_has_online_mem(nid) 1
 
@@ -116,9 +56,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
 #define topology_core_id(cpu)                   (cpu)
 #define topology_core_cpumask(cpu)              ((void)(cpu), cpu_online_mask)
 #define topology_thread_cpumask(cpu)            cpumask_of(cpu)
-
-/* indicates that pointers to the topology struct cpumask maps are valid */
-#define arch_provides_topology_pointers         yes
 #endif
 
 #endif /* _ASM_TILE_TOPOLOGY_H */
diff --git a/arch/tile/include/asm/traps.h b/arch/tile/include/asm/traps.h
index 5f20f920f93..4b99a1c3aab 100644
--- a/arch/tile/include/asm/traps.h
+++ b/arch/tile/include/asm/traps.h
@@ -15,12 +15,13 @@
 #ifndef _ASM_TILE_TRAPS_H
 #define _ASM_TILE_TRAPS_H
 
+#ifndef __ASSEMBLY__
 #include <arch/chip.h>
 
 /* mm/fault.c */
 void do_page_fault(struct pt_regs *, int fault_num,
 		   unsigned long address, unsigned long write);
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
+#if CHIP_HAS_TILE_DMA()
 void do_async_page_fault(struct pt_regs *);
 #endif
 
@@ -64,7 +65,21 @@ void do_breakpoint(struct pt_regs *, int fault_num);
 
 
 #ifdef __tilegx__
+/* kernel/single_step.c */
 void gx_singlestep_handle(struct pt_regs *, int fault_num);
+
+/* kernel/intvec_64.S */
+void fill_ra_stack(void);
+
+/* Handle unalign data fixup. */
+extern void do_unaligned(struct pt_regs *regs, int vecnum);
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#ifdef __tilegx__
+/* 128 byte JIT per unalign fixup. */
+#define UNALIGN_JIT_SHIFT    7
 #endif
 
-#endif /* _ASM_TILE_SYSCALLS_H */
+#endif /* _ASM_TILE_TRAPS_H */
diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h
index ef34d2caa5b..b6cde3209b9 100644
--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -114,45 +114,79 @@ struct exception_table_entry {
 extern int fixup_exception(struct pt_regs *regs);
 
 /*
- * We return the __get_user_N function results in a structure,
- * thus in r0 and r1.  If "err" is zero, "val" is the result
- * of the read; otherwise, "err" is -EFAULT.
- *
- * We rarely need 8-byte values on a 32-bit architecture, but
- * we size the structure to accommodate.  In practice, for the
- * the smaller reads, we can zero the high word for free, and
- * the caller will ignore it by virtue of casting anyway.
+ * Support macros for __get_user().
+ *
+ * Implementation note: The "case 8" logic of casting to the type of
+ * the result of subtracting the value from itself is basically a way
+ * of keeping all integer types the same, but casting any pointers to
+ * ptrdiff_t, i.e. also an integer type.  This way there are no
+ * questionable casts seen by the compiler on an ILP32 platform.
+ *
+ * Note that __get_user() and __put_user() assume proper alignment.
  */
-struct __get_user {
-	unsigned long long val;
-	int err;
-};
 
-/*
- * FIXME: we should express these as inline extended assembler, since
- * they're fundamentally just a variable dereference and some
- * supporting exception_table gunk.  Note that (a la i386) we can
- * extend the copy_to_user and copy_from_user routines to call into
- * such extended assembler routines, though we will have to use a
- * different return code in that case (1, 2, or 4, rather than -EFAULT).
- */
-extern struct __get_user __get_user_1(const void __user *);
-extern struct __get_user __get_user_2(const void __user *);
-extern struct __get_user __get_user_4(const void __user *);
-extern struct __get_user __get_user_8(const void __user *);
-extern int __put_user_1(long, void __user *);
-extern int __put_user_2(long, void __user *);
-extern int __put_user_4(long, void __user *);
-extern int __put_user_8(long long, void __user *);
-
-/* Unimplemented routines to cause linker failures */
-extern struct __get_user __get_user_bad(void);
-extern int __put_user_bad(void);
+#ifdef __LP64__
+#define _ASM_PTR	".quad"
+#define _ASM_ALIGN	".align 8"
+#else
+#define _ASM_PTR	".long"
+#define _ASM_ALIGN	".align 4"
+#endif
+
+#define __get_user_asm(OP, x, ptr, ret)					\
+	asm volatile("1: {" #OP " %1, %2; movei %0, 0 }\n"		\
+		     ".pushsection .fixup,\"ax\"\n"			\
+		     "0: { movei %1, 0; movei %0, %3 }\n"		\
+		     "j 9f\n"						\
+		     ".section __ex_table,\"a\"\n"			\
+		     _ASM_ALIGN "\n"					\
+		     _ASM_PTR " 1b, 0b\n"				\
+		     ".popsection\n"					\
+		     "9:"						\
+		     : "=r" (ret), "=r" (x)				\
+		     : "r" (ptr), "i" (-EFAULT))
+
+#ifdef __tilegx__
+#define __get_user_1(x, ptr, ret) __get_user_asm(ld1u, x, ptr, ret)
+#define __get_user_2(x, ptr, ret) __get_user_asm(ld2u, x, ptr, ret)
+#define __get_user_4(x, ptr, ret) __get_user_asm(ld4s, x, ptr, ret)
+#define __get_user_8(x, ptr, ret) __get_user_asm(ld, x, ptr, ret)
+#else
+#define __get_user_1(x, ptr, ret) __get_user_asm(lb_u, x, ptr, ret)
+#define __get_user_2(x, ptr, ret) __get_user_asm(lh_u, x, ptr, ret)
+#define __get_user_4(x, ptr, ret) __get_user_asm(lw, x, ptr, ret)
+#ifdef __LITTLE_ENDIAN
+#define __lo32(a, b) a
+#define __hi32(a, b) b
+#else
+#define __lo32(a, b) b
+#define __hi32(a, b) a
+#endif
+#define __get_user_8(x, ptr, ret)					\
+	({								\
+		unsigned int __a, __b;					\
+		asm volatile("1: { lw %1, %3; addi %2, %3, 4 }\n"	\
+			     "2: { lw %2, %2; movei %0, 0 }\n"		\
+			     ".pushsection .fixup,\"ax\"\n"		\
+			     "0: { movei %1, 0; movei %2, 0 }\n"	\
+			     "{ movei %0, %4; j 9f }\n"			\
+			     ".section __ex_table,\"a\"\n"		\
+			     ".align 4\n"				\
+			     ".word 1b, 0b\n"				\
+			     ".word 2b, 0b\n"				\
+			     ".popsection\n"				\
+			     "9:"					\
+			     : "=r" (ret), "=r" (__a), "=&r" (__b)	\
+			     : "r" (ptr), "i" (-EFAULT));		\
+		(x) = (__typeof(x))(__typeof((x)-(x)))			\
+			(((u64)__hi32(__a, __b) << 32) |		\
+			 __lo32(__a, __b));				\
+	})
+#endif
+
+extern int __get_user_bad(void)
+  __attribute__((warning("sizeof __get_user argument not 1, 2, 4 or 8")));
 
-/*
- * Careful: we have to cast the result to the type of the pointer
- * for sign reasons.
- */
 /**
  * __get_user: - Get a simple variable from user space, with less checking.
  * @x:   Variable to store result.
@@ -174,30 +208,64 @@ extern int __put_user_bad(void);
  * function.
  */
 #define __get_user(x, ptr)						\
-({	struct __get_user __ret;					\
-	__typeof__(*(ptr)) const __user *__gu_addr = (ptr);		\
-	__chk_user_ptr(__gu_addr);					\
-	switch (sizeof(*(__gu_addr))) {					\
-	case 1:								\
-		__ret = __get_user_1(__gu_addr);			\
-		break;							\
-	case 2:								\
-		__ret = __get_user_2(__gu_addr);			\
-		break;							\
-	case 4:								\
-		__ret = __get_user_4(__gu_addr);			\
-		break;							\
-	case 8:								\
-		__ret = __get_user_8(__gu_addr);			\
-		break;							\
-	default:							\
-		__ret = __get_user_bad();				\
-		break;							\
-	}								\
-	(x) = (__typeof__(*__gu_addr)) (__typeof__(*__gu_addr - *__gu_addr)) \
-	  __ret.val;			                                \
-	__ret.err;							\
-})
+	({								\
+		int __ret;						\
+		__chk_user_ptr(ptr);					\
+		switch (sizeof(*(ptr))) {				\
+		case 1: __get_user_1(x, ptr, __ret); break;		\
+		case 2: __get_user_2(x, ptr, __ret); break;		\
+		case 4: __get_user_4(x, ptr, __ret); break;		\
+		case 8: __get_user_8(x, ptr, __ret); break;		\
+		default: __ret = __get_user_bad(); break;		\
+		}							\
+		__ret;							\
+	})
+
+/* Support macros for __put_user(). */
+
+#define __put_user_asm(OP, x, ptr, ret)			\
+	asm volatile("1: {" #OP " %1, %2; movei %0, 0 }\n"		\
+		     ".pushsection .fixup,\"ax\"\n"			\
+		     "0: { movei %0, %3; j 9f }\n"			\
+		     ".section __ex_table,\"a\"\n"			\
+		     _ASM_ALIGN "\n"					\
+		     _ASM_PTR " 1b, 0b\n"				\
+		     ".popsection\n"					\
+		     "9:"						\
+		     : "=r" (ret)					\
+		     : "r" (ptr), "r" (x), "i" (-EFAULT))
+
+#ifdef __tilegx__
+#define __put_user_1(x, ptr, ret) __put_user_asm(st1, x, ptr, ret)
+#define __put_user_2(x, ptr, ret) __put_user_asm(st2, x, ptr, ret)
+#define __put_user_4(x, ptr, ret) __put_user_asm(st4, x, ptr, ret)
+#define __put_user_8(x, ptr, ret) __put_user_asm(st, x, ptr, ret)
+#else
+#define __put_user_1(x, ptr, ret) __put_user_asm(sb, x, ptr, ret)
+#define __put_user_2(x, ptr, ret) __put_user_asm(sh, x, ptr, ret)
+#define __put_user_4(x, ptr, ret) __put_user_asm(sw, x, ptr, ret)
+#define __put_user_8(x, ptr, ret)					\
+	({								\
+		u64 __x = (__typeof((x)-(x)))(x);			\
+		int __lo = (int) __x, __hi = (int) (__x >> 32);		\
+		asm volatile("1: { sw %1, %2; addi %0, %1, 4 }\n"	\
+			     "2: { sw %0, %3; movei %0, 0 }\n"		\
+			     ".pushsection .fixup,\"ax\"\n"		\
+			     "0: { movei %0, %4; j 9f }\n"		\
+			     ".section __ex_table,\"a\"\n"		\
+			     ".align 4\n"				\
+			     ".word 1b, 0b\n"				\
+			     ".word 2b, 0b\n"				\
+			     ".popsection\n"				\
+			     "9:"					\
+			     : "=&r" (ret)				\
+			     : "r" (ptr), "r" (__lo32(__lo, __hi)),	\
+			     "r" (__hi32(__lo, __hi)), "i" (-EFAULT));	\
+	})
+#endif
+
+extern int __put_user_bad(void)
+  __attribute__((warning("sizeof __put_user argument not 1, 2, 4 or 8")));
 
 /**
  * __put_user: - Write a simple value into user space, with less checking.
@@ -217,39 +285,19 @@ extern int __put_user_bad(void);
  * function.
  *
  * Returns zero on success, or -EFAULT on error.
- *
- * Implementation note: The "case 8" logic of casting to the type of
- * the result of subtracting the value from itself is basically a way
- * of keeping all integer types the same, but casting any pointers to
- * ptrdiff_t, i.e. also an integer type.  This way there are no
- * questionable casts seen by the compiler on an ILP32 platform.
  */
 #define __put_user(x, ptr)						\
 ({									\
-	int __pu_err = 0;						\
-	__typeof__(*(ptr)) __user *__pu_addr = (ptr);			\
-	typeof(*__pu_addr) __pu_val = (x);				\
-	__chk_user_ptr(__pu_addr);					\
-	switch (sizeof(__pu_val)) {					\
-	case 1:								\
-		__pu_err = __put_user_1((long)__pu_val, __pu_addr);	\
-		break;							\
-	case 2:								\
-		__pu_err = __put_user_2((long)__pu_val, __pu_addr);	\
-		break;							\
-	case 4:								\
-		__pu_err = __put_user_4((long)__pu_val, __pu_addr);	\
-		break;							\
-	case 8:								\
-		__pu_err =						\
-		  __put_user_8((__typeof__(__pu_val - __pu_val))__pu_val,\
-			__pu_addr);					\
-		break;							\
-	default:							\
-		__pu_err = __put_user_bad();				\
-		break;							\
+	int __ret;							\
+	__chk_user_ptr(ptr);						\
+	switch (sizeof(*(ptr))) {					\
+	case 1: __put_user_1(x, ptr, __ret); break;			\
+	case 2: __put_user_2(x, ptr, __ret); break;			\
+	case 4: __put_user_4(x, ptr, __ret); break;			\
+	case 8: __put_user_8(x, ptr, __ret); break;			\
+	default: __ret = __put_user_bad(); break;			\
 	}								\
-	__pu_err;							\
+	__ret;								\
 })
 
 /*
@@ -353,7 +401,12 @@ _copy_from_user(void *to, const void __user *from, unsigned long n)
 	return n;
 }
 
-#ifdef CONFIG_DEBUG_COPY_FROM_USER
+#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS
+/*
+ * There are still unprovable places in the generic code as of 2.6.34, so this
+ * option is not really compatible with -Werror, which is more useful in
+ * general.
+ */
 extern void copy_from_user_overflow(void)
 	__compiletime_warning("copy_from_user() size is not provably correct");
 
@@ -378,7 +431,7 @@ static inline unsigned long __must_check copy_from_user(void *to,
 /**
  * __copy_in_user() - copy data within user space, with less checking.
  * @to:   Destination address, in user space.
- * @from: Source address, in kernel space.
+ * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
  * Context: User context only.  This function may sleep.
@@ -395,7 +448,7 @@ extern unsigned long __copy_in_user_inatomic(
 static inline unsigned long __must_check
 __copy_in_user(void __user *to, const void __user *from, unsigned long n)
 {
-	might_sleep();
+	might_fault();
 	return __copy_in_user_inatomic(to, from, n);
 }
 
@@ -520,37 +573,6 @@ static inline unsigned long __must_check flush_user(
 }
 
 /**
- * inv_user: - Invalidate a block of memory in user space from cache.
- * @mem:   Destination address, in user space.
- * @len:   Number of bytes to invalidate.
- *
- * Returns number of bytes that could not be invalidated.
- * On success, this will be zero.
- *
- * Note that on Tile64, the "inv" operation is in fact a
- * "flush and invalidate", so cache write-backs will occur prior
- * to the cache being marked invalid.
- */
-extern unsigned long inv_user_asm(void __user *mem, unsigned long len);
-static inline unsigned long __must_check __inv_user(
-	void __user *mem, unsigned long len)
-{
-	int retval;
-
-	might_fault();
-	retval = inv_user_asm(mem, len);
-	mb_incoherent();
-	return retval;
-}
-static inline unsigned long __must_check inv_user(
-	void __user *mem, unsigned long len)
-{
-	if (access_ok(VERIFY_WRITE, mem, len))
-		return __inv_user(mem, len);
-	return len;
-}
-
-/**
  * finv_user: - Flush-inval a block of memory in user space from cache.
  * @mem:   Destination address, in user space.
  * @len:   Number of bytes to invalidate.
diff --git a/arch/tile/include/asm/unaligned.h b/arch/tile/include/asm/unaligned.h
index 137e2de5b10..5a58a0d1144 100644
--- a/arch/tile/include/asm/unaligned.h
+++ b/arch/tile/include/asm/unaligned.h
@@ -15,10 +15,29 @@
 #ifndef _ASM_TILE_UNALIGNED_H
 #define _ASM_TILE_UNALIGNED_H
 
-#include <linux/unaligned/le_struct.h>
-#include <linux/unaligned/be_byteshift.h>
-#include <linux/unaligned/generic.h>
-#define get_unaligned	__get_unaligned_le
-#define put_unaligned	__put_unaligned_le
+/*
+ * We could implement faster get_unaligned_[be/le]64 using the ldna
+ * instruction on tilegx; however, we need to either copy all of the
+ * other generic functions to here (which is pretty ugly) or else
+ * modify both the generic code and other arch code to allow arch
+ * specific unaligned data access functions.  Given these functions
+ * are not often called, we'll stick with the generic version.
+ */
+#include <asm-generic/unaligned.h>
+
+/*
+ * Is the kernel doing fixups of unaligned accesses?  If <0, no kernel
+ * intervention occurs and SIGBUS is delivered with no data address
+ * info.  If 0, the kernel single-steps the instruction to discover
+ * the data address to provide with the SIGBUS.  If 1, the kernel does
+ * a fixup.
+ */
+extern int unaligned_fixup;
+
+/* Is the kernel printing on each unaligned fixup? */
+extern int unaligned_printk;
+
+/* Number of unaligned fixups performed */
+extern unsigned int unaligned_fixup_count;
 
 #endif /* _ASM_TILE_UNALIGNED_H */
diff --git a/arch/tile/include/asm/unistd.h b/arch/tile/include/asm/unistd.h
index f70bf1c541f..940831fe9e9 100644
--- a/arch/tile/include/asm/unistd.h
+++ b/arch/tile/include/asm/unistd.h
@@ -11,37 +11,10 @@
  *   NON INFRINGEMENT.  See the GNU General Public License for
  *   more details.
  */
-
-#if !defined(_ASM_TILE_UNISTD_H) || defined(__SYSCALL)
-#define _ASM_TILE_UNISTD_H
-
-#if !defined(__LP64__) || defined(__SYSCALL_COMPAT)
-/* Use the flavor of this syscall that matches the 32-bit API better. */
-#define __ARCH_WANT_SYNC_FILE_RANGE2
-#endif
-
-/* Use the standard ABI for syscalls. */
-#include <asm-generic/unistd.h>
-
-/* Additional Tilera-specific syscalls. */
-#define __NR_flush_cache	(__NR_arch_specific_syscall + 1)
-__SYSCALL(__NR_flush_cache, sys_flush_cache)
-
-#ifndef __tilegx__
-/* "Fast" syscalls provide atomic support for 32-bit chips. */
-#define __NR_FAST_cmpxchg	-1
-#define __NR_FAST_atomic_update	-2
-#define __NR_FAST_cmpxchg64	-3
-#define __NR_cmpxchg_badaddr	(__NR_arch_specific_syscall + 0)
-__SYSCALL(__NR_cmpxchg_badaddr, sys_cmpxchg_badaddr)
-#endif
-
-#ifdef __KERNEL__
 /* In compat mode, we use sys_llseek() for compat_sys_llseek(). */
 #ifdef CONFIG_COMPAT
 #define __ARCH_WANT_SYS_LLSEEK
 #endif
 #define __ARCH_WANT_SYS_NEWFSTATAT
-#endif
-
-#endif /* _ASM_TILE_UNISTD_H */
+#define __ARCH_WANT_SYS_CLONE
+#include <uapi/asm/unistd.h>
diff --git a/arch/tile/include/asm/vdso.h b/arch/tile/include/asm/vdso.h
new file mode 100644
index 00000000000..9f6a78d665f
--- /dev/null
+++ b/arch/tile/include/asm/vdso.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef __TILE_VDSO_H__
+#define __TILE_VDSO_H__
+
+#include <linux/types.h>
+
+/*
+ * Note about the vdso_data structure:
+ *
+ * NEVER USE THEM IN USERSPACE CODE DIRECTLY. The layout of the
+ * structure is supposed to be known only to the function in the vdso
+ * itself and may change without notice.
+ */
+
+struct vdso_data {
+	__u64 tz_update_count;  /* Timezone atomicity ctr             */
+	__u64 tb_update_count;  /* Timebase atomicity ctr             */
+	__u64 xtime_tod_stamp;  /* TOD clock for xtime                */
+	__u64 xtime_clock_sec;  /* Kernel time second                 */
+	__u64 xtime_clock_nsec; /* Kernel time nanosecond             */
+	__u64 wtom_clock_sec;   /* Wall to monotonic clock second     */
+	__u64 wtom_clock_nsec;  /* Wall to monotonic clock nanosecond */
+	__u32 mult;             /* Cycle to nanosecond multiplier     */
+	__u32 shift;            /* Cycle to nanosecond divisor (power of two) */
+	__u32 tz_minuteswest;   /* Minutes west of Greenwich          */
+	__u32 tz_dsttime;       /* Type of dst correction             */
+};
+
+extern struct vdso_data *vdso_data;
+
+/* __vdso_rt_sigreturn is defined with the addresses in the vdso page. */
+extern void __vdso_rt_sigreturn(void);
+
+extern int setup_vdso_pages(void);
+
+#endif /* __TILE_VDSO_H__ */
diff --git a/arch/tile/include/gxio/common.h b/arch/tile/include/gxio/common.h
new file mode 100644
index 00000000000..724595a24d0
--- /dev/null
+++ b/arch/tile/include/gxio/common.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _GXIO_COMMON_H_
+#define _GXIO_COMMON_H_
+
+/*
+ * Routines shared between the various GXIO device components.
+ */
+
+#include <hv/iorpc.h>
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/io.h>
+
+/* Define the standard gxio MMIO functions using kernel functions. */
+#define __gxio_mmio_read8(addr)		readb(addr)
+#define __gxio_mmio_read16(addr)	readw(addr)
+#define __gxio_mmio_read32(addr)	readl(addr)
+#define __gxio_mmio_read64(addr)	readq(addr)
+#define __gxio_mmio_write8(addr, val)	writeb((val), (addr))
+#define __gxio_mmio_write16(addr, val)	writew((val), (addr))
+#define __gxio_mmio_write32(addr, val)	writel((val), (addr))
+#define __gxio_mmio_write64(addr, val)	writeq((val), (addr))
+#define __gxio_mmio_read(addr)		__gxio_mmio_read64(addr)
+#define __gxio_mmio_write(addr, val)	__gxio_mmio_write64((addr), (val))
+
+#endif /* !_GXIO_COMMON_H_ */
diff --git a/arch/tile/include/gxio/dma_queue.h b/arch/tile/include/gxio/dma_queue.h
new file mode 100644
index 00000000000..b9e45e37649
--- /dev/null
+++ b/arch/tile/include/gxio/dma_queue.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _GXIO_DMA_QUEUE_H_
+#define _GXIO_DMA_QUEUE_H_
+
+/*
+ * DMA queue management APIs shared between TRIO and mPIPE.
+ */
+
+#include <gxio/common.h>
+
+/* The credit counter lives in the high 32 bits. */
+#define DMA_QUEUE_CREDIT_SHIFT 32
+
+/*
+ * State object that tracks a DMA queue's head and tail indices, as
+ * well as the number of commands posted and completed.  The
+ * structure is accessed via a thread-safe, lock-free algorithm.
+ */
+typedef struct {
+	/*
+	 * Address of a MPIPE_EDMA_POST_REGION_VAL_t,
+	 * TRIO_PUSH_DMA_REGION_VAL_t, or TRIO_PULL_DMA_REGION_VAL_t
+	 * register.  These register have identical encodings and provide
+	 * information about how many commands have been processed.
+	 */
+	void *post_region_addr;
+
+	/*
+	 * A lazily-updated count of how many edescs the hardware has
+	 * completed.
+	 */
+	uint64_t hw_complete_count __attribute__ ((aligned(64)));
+
+	/*
+	 * High 32 bits are a count of available egress command credits,
+	 * low 24 bits are the next egress "slot".
+	 */
+	int64_t credits_and_next_index;
+
+} __gxio_dma_queue_t;
+
+/* Initialize a dma queue. */
+extern void __gxio_dma_queue_init(__gxio_dma_queue_t *dma_queue,
+				  void *post_region_addr,
+				  unsigned int num_entries);
+
+/*
+ * Update the "credits_and_next_index" and "hw_complete_count" fields
+ * based on pending hardware completions.  Note that some other thread
+ * may have already done this and, importantly, may still be in the
+ * process of updating "credits_and_next_index".
+ */
+extern void __gxio_dma_queue_update_credits(__gxio_dma_queue_t *dma_queue);
+
+/* Wait for credits to become available. */
+extern int64_t __gxio_dma_queue_wait_for_credits(__gxio_dma_queue_t *dma_queue,
+						 int64_t modifier);
+
+/* Reserve slots in the queue, optionally waiting for slots to become
+ * available, and optionally returning a "completion_slot" suitable for
+ * direct comparison to "hw_complete_count".
+ */
+static inline int64_t __gxio_dma_queue_reserve(__gxio_dma_queue_t *dma_queue,
+					       unsigned int num, bool wait,
+					       bool completion)
+{
+	uint64_t slot;
+
+	/*
+	 * Try to reserve 'num' egress command slots.  We do this by
+	 * constructing a constant that subtracts N credits and adds N to
+	 * the index, and using fetchaddgez to only apply it if the credits
+	 * count doesn't go negative.
+	 */
+	int64_t modifier = (((int64_t)(-num)) << DMA_QUEUE_CREDIT_SHIFT) | num;
+	int64_t old =
+		__insn_fetchaddgez(&dma_queue->credits_and_next_index,
+				   modifier);
+
+	if (unlikely(old + modifier < 0)) {
+		/*
+		 * We're out of credits.  Try once to get more by checking for
+		 * completed egress commands.  If that fails, wait or fail.
+		 */
+		__gxio_dma_queue_update_credits(dma_queue);
+		old = __insn_fetchaddgez(&dma_queue->credits_and_next_index,
+					 modifier);
+		if (old + modifier < 0) {
+			if (wait)
+				old = __gxio_dma_queue_wait_for_credits
+					(dma_queue, modifier);
+			else
+				return GXIO_ERR_DMA_CREDITS;
+		}
+	}
+
+	/* The bottom 24 bits of old encode the "slot". */
+	slot = (old & 0xffffff);
+
+	if (completion) {
+		/*
+		 * A "completion_slot" is a "slot" which can be compared to
+		 * "hw_complete_count" at any time in the future.  To convert
+		 * "slot" into a "completion_slot", we access "hw_complete_count"
+		 * once (knowing that we have reserved a slot, and thus, it will
+		 * be "basically" accurate), and combine its high 40 bits with
+		 * the 24 bit "slot", and handle "wrapping" by adding "1 << 24"
+		 * if the result is LESS than "hw_complete_count".
+		 */
+		uint64_t complete;
+		complete = ACCESS_ONCE(dma_queue->hw_complete_count);
+		slot |= (complete & 0xffffffffff000000);
+		if (slot < complete)
+			slot += 0x1000000;
+	}
+
+	/*
+	 * If any of our slots mod 256 were equivalent to 0, go ahead and
+	 * collect some egress credits, and update "hw_complete_count", and
+	 * make sure the index doesn't overflow into the credits.
+	 */
+	if (unlikely(((old + num) & 0xff) < num)) {
+		__gxio_dma_queue_update_credits(dma_queue);
+
+		/* Make sure the index doesn't overflow into the credits. */
+#ifdef __BIG_ENDIAN__
+		*(((uint8_t *)&dma_queue->credits_and_next_index) + 4) = 0;
+#else
+		*(((uint8_t *)&dma_queue->credits_and_next_index) + 3) = 0;
+#endif
+	}
+
+	return slot;
+}
+
+/* Non-inlinable "__gxio_dma_queue_reserve(..., true)". */
+extern int64_t __gxio_dma_queue_reserve_aux(__gxio_dma_queue_t *dma_queue,
+					    unsigned int num, int wait);
+
+/* Check whether a particular "completion slot" has completed.
+ *
+ * Note that this function requires a "completion slot", and thus
+ * cannot be used with the result of any "reserve_fast" function.
+ */
+extern int __gxio_dma_queue_is_complete(__gxio_dma_queue_t *dma_queue,
+					int64_t completion_slot, int update);
+
+#endif /* !_GXIO_DMA_QUEUE_H_ */
diff --git a/arch/tile/include/gxio/iorpc_globals.h b/arch/tile/include/gxio/iorpc_globals.h
new file mode 100644
index 00000000000..52c721f8dad
--- /dev/null
+++ b/arch/tile/include/gxio/iorpc_globals.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#ifndef __IORPC_LINUX_RPC_H__
+#define __IORPC_LINUX_RPC_H__
+
+#include <hv/iorpc.h>
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/pgtable.h>
+
+#define IORPC_OP_ARM_POLLFD            IORPC_OPCODE(IORPC_FORMAT_KERNEL_POLLFD, 0x9000)
+#define IORPC_OP_CLOSE_POLLFD          IORPC_OPCODE(IORPC_FORMAT_KERNEL_POLLFD, 0x9001)
+#define IORPC_OP_GET_MMIO_BASE         IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8000)
+#define IORPC_OP_CHECK_MMIO_OFFSET     IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8001)
+
+int __iorpc_arm_pollfd(int fd, int pollfd_cookie);
+
+int __iorpc_close_pollfd(int fd, int pollfd_cookie);
+
+int __iorpc_get_mmio_base(int fd, HV_PTE *base);
+
+int __iorpc_check_mmio_offset(int fd, unsigned long offset, unsigned long size);
+
+#endif /* !__IORPC_LINUX_RPC_H__ */
diff --git a/arch/tile/include/gxio/iorpc_mpipe.h b/arch/tile/include/gxio/iorpc_mpipe.h
new file mode 100644
index 00000000000..4cda03de734
--- /dev/null
+++ b/arch/tile/include/gxio/iorpc_mpipe.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#ifndef __GXIO_MPIPE_LINUX_RPC_H__
+#define __GXIO_MPIPE_LINUX_RPC_H__
+
+#include <hv/iorpc.h>
+
+#include <hv/drv_mpipe_intf.h>
+#include <asm/page.h>
+#include <gxio/kiorpc.h>
+#include <gxio/mpipe.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/pgtable.h>
+
+#define GXIO_MPIPE_OP_ALLOC_BUFFER_STACKS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1200)
+#define GXIO_MPIPE_OP_INIT_BUFFER_STACK_AUX IORPC_OPCODE(IORPC_FORMAT_KERNEL_MEM, 0x1201)
+
+#define GXIO_MPIPE_OP_ALLOC_NOTIF_RINGS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1203)
+#define GXIO_MPIPE_OP_INIT_NOTIF_RING_AUX IORPC_OPCODE(IORPC_FORMAT_KERNEL_MEM, 0x1204)
+#define GXIO_MPIPE_OP_REQUEST_NOTIF_RING_INTERRUPT IORPC_OPCODE(IORPC_FORMAT_KERNEL_INTERRUPT, 0x1205)
+#define GXIO_MPIPE_OP_ENABLE_NOTIF_RING_INTERRUPT IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1206)
+#define GXIO_MPIPE_OP_ALLOC_NOTIF_GROUPS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1207)
+#define GXIO_MPIPE_OP_INIT_NOTIF_GROUP IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1208)
+#define GXIO_MPIPE_OP_ALLOC_BUCKETS    IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1209)
+#define GXIO_MPIPE_OP_INIT_BUCKET      IORPC_OPCODE(IORPC_FORMAT_NONE, 0x120a)
+#define GXIO_MPIPE_OP_ALLOC_EDMA_RINGS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x120b)
+#define GXIO_MPIPE_OP_INIT_EDMA_RING_AUX IORPC_OPCODE(IORPC_FORMAT_KERNEL_MEM, 0x120c)
+
+#define GXIO_MPIPE_OP_COMMIT_RULES     IORPC_OPCODE(IORPC_FORMAT_NONE, 0x120f)
+#define GXIO_MPIPE_OP_REGISTER_CLIENT_MEMORY IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x1210)
+#define GXIO_MPIPE_OP_LINK_OPEN_AUX    IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1211)
+#define GXIO_MPIPE_OP_LINK_CLOSE_AUX   IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1212)
+#define GXIO_MPIPE_OP_LINK_SET_ATTR_AUX IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1213)
+
+#define GXIO_MPIPE_OP_GET_TIMESTAMP_AUX IORPC_OPCODE(IORPC_FORMAT_NONE, 0x121e)
+#define GXIO_MPIPE_OP_SET_TIMESTAMP_AUX IORPC_OPCODE(IORPC_FORMAT_NONE, 0x121f)
+#define GXIO_MPIPE_OP_ADJUST_TIMESTAMP_AUX IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1220)
+#define GXIO_MPIPE_OP_CONFIG_EDMA_RING_BLKS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1221)
+#define GXIO_MPIPE_OP_ADJUST_TIMESTAMP_FREQ IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1222)
+#define GXIO_MPIPE_OP_ARM_POLLFD       IORPC_OPCODE(IORPC_FORMAT_KERNEL_POLLFD, 0x9000)
+#define GXIO_MPIPE_OP_CLOSE_POLLFD     IORPC_OPCODE(IORPC_FORMAT_KERNEL_POLLFD, 0x9001)
+#define GXIO_MPIPE_OP_GET_MMIO_BASE    IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8000)
+#define GXIO_MPIPE_OP_CHECK_MMIO_OFFSET IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8001)
+
+int gxio_mpipe_alloc_buffer_stacks(gxio_mpipe_context_t *context,
+				   unsigned int count, unsigned int first,
+				   unsigned int flags);
+
+int gxio_mpipe_init_buffer_stack_aux(gxio_mpipe_context_t *context,
+				     void *mem_va, size_t mem_size,
+				     unsigned int mem_flags, unsigned int stack,
+				     unsigned int buffer_size_enum);
+
+
+int gxio_mpipe_alloc_notif_rings(gxio_mpipe_context_t *context,
+				 unsigned int count, unsigned int first,
+				 unsigned int flags);
+
+int gxio_mpipe_init_notif_ring_aux(gxio_mpipe_context_t *context, void *mem_va,
+				   size_t mem_size, unsigned int mem_flags,
+				   unsigned int ring);
+
+int gxio_mpipe_request_notif_ring_interrupt(gxio_mpipe_context_t *context,
+					    int inter_x, int inter_y,
+					    int inter_ipi, int inter_event,
+					    unsigned int ring);
+
+int gxio_mpipe_enable_notif_ring_interrupt(gxio_mpipe_context_t *context,
+					   unsigned int ring);
+
+int gxio_mpipe_alloc_notif_groups(gxio_mpipe_context_t *context,
+				  unsigned int count, unsigned int first,
+				  unsigned int flags);
+
+int gxio_mpipe_init_notif_group(gxio_mpipe_context_t *context,
+				unsigned int group,
+				gxio_mpipe_notif_group_bits_t bits);
+
+int gxio_mpipe_alloc_buckets(gxio_mpipe_context_t *context, unsigned int count,
+			     unsigned int first, unsigned int flags);
+
+int gxio_mpipe_init_bucket(gxio_mpipe_context_t *context, unsigned int bucket,
+			   MPIPE_LBL_INIT_DAT_BSTS_TBL_t bucket_info);
+
+int gxio_mpipe_alloc_edma_rings(gxio_mpipe_context_t *context,
+				unsigned int count, unsigned int first,
+				unsigned int flags);
+
+int gxio_mpipe_init_edma_ring_aux(gxio_mpipe_context_t *context, void *mem_va,
+				  size_t mem_size, unsigned int mem_flags,
+				  unsigned int ring, unsigned int channel);
+
+
+int gxio_mpipe_commit_rules(gxio_mpipe_context_t *context, const void *blob,
+			    size_t blob_size);
+
+int gxio_mpipe_register_client_memory(gxio_mpipe_context_t *context,
+				      unsigned int iotlb, HV_PTE pte,
+				      unsigned int flags);
+
+int gxio_mpipe_link_open_aux(gxio_mpipe_context_t *context,
+			     _gxio_mpipe_link_name_t name, unsigned int flags);
+
+int gxio_mpipe_link_close_aux(gxio_mpipe_context_t *context, int mac);
+
+int gxio_mpipe_link_set_attr_aux(gxio_mpipe_context_t *context, int mac,
+				 uint32_t attr, int64_t val);
+
+int gxio_mpipe_get_timestamp_aux(gxio_mpipe_context_t *context, uint64_t *sec,
+				 uint64_t *nsec, uint64_t *cycles);
+
+int gxio_mpipe_set_timestamp_aux(gxio_mpipe_context_t *context, uint64_t sec,
+				 uint64_t nsec, uint64_t cycles);
+
+int gxio_mpipe_adjust_timestamp_aux(gxio_mpipe_context_t *context,
+				    int64_t nsec);
+
+int gxio_mpipe_adjust_timestamp_freq(gxio_mpipe_context_t *context,
+				     int32_t ppb);
+
+int gxio_mpipe_arm_pollfd(gxio_mpipe_context_t *context, int pollfd_cookie);
+
+int gxio_mpipe_close_pollfd(gxio_mpipe_context_t *context, int pollfd_cookie);
+
+int gxio_mpipe_get_mmio_base(gxio_mpipe_context_t *context, HV_PTE *base);
+
+int gxio_mpipe_check_mmio_offset(gxio_mpipe_context_t *context,
+				 unsigned long offset, unsigned long size);
+
+#endif /* !__GXIO_MPIPE_LINUX_RPC_H__ */
diff --git a/arch/tile/include/gxio/iorpc_mpipe_info.h b/arch/tile/include/gxio/iorpc_mpipe_info.h
new file mode 100644
index 00000000000..f0b04284468
--- /dev/null
+++ b/arch/tile/include/gxio/iorpc_mpipe_info.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#ifndef __GXIO_MPIPE_INFO_LINUX_RPC_H__
+#define __GXIO_MPIPE_INFO_LINUX_RPC_H__
+
+#include <hv/iorpc.h>
+
+#include <hv/drv_mpipe_intf.h>
+#include <asm/page.h>
+#include <gxio/kiorpc.h>
+#include <gxio/mpipe.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/pgtable.h>
+
+
+#define GXIO_MPIPE_INFO_OP_INSTANCE_AUX IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1250)
+#define GXIO_MPIPE_INFO_OP_ENUMERATE_AUX IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1251)
+#define GXIO_MPIPE_INFO_OP_GET_MMIO_BASE IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8000)
+#define GXIO_MPIPE_INFO_OP_CHECK_MMIO_OFFSET IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8001)
+
+
+int gxio_mpipe_info_instance_aux(gxio_mpipe_info_context_t *context,
+				 _gxio_mpipe_link_name_t name);
+
+int gxio_mpipe_info_enumerate_aux(gxio_mpipe_info_context_t *context,
+				  unsigned int idx,
+				  _gxio_mpipe_link_name_t *name,
+				  _gxio_mpipe_link_mac_t *mac);
+
+int gxio_mpipe_info_get_mmio_base(gxio_mpipe_info_context_t *context,
+				  HV_PTE *base);
+
+int gxio_mpipe_info_check_mmio_offset(gxio_mpipe_info_context_t *context,
+				      unsigned long offset, unsigned long size);
+
+#endif /* !__GXIO_MPIPE_INFO_LINUX_RPC_H__ */
diff --git a/arch/tile/include/gxio/iorpc_trio.h b/arch/tile/include/gxio/iorpc_trio.h
new file mode 100644
index 00000000000..376a4f77116
--- /dev/null
+++ b/arch/tile/include/gxio/iorpc_trio.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#ifndef __GXIO_TRIO_LINUX_RPC_H__
+#define __GXIO_TRIO_LINUX_RPC_H__
+
+#include <hv/iorpc.h>
+
+#include <hv/drv_trio_intf.h>
+#include <gxio/trio.h>
+#include <gxio/kiorpc.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/pgtable.h>
+
+#define GXIO_TRIO_OP_DEALLOC_ASID      IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1400)
+#define GXIO_TRIO_OP_ALLOC_ASIDS       IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1401)
+
+#define GXIO_TRIO_OP_ALLOC_MEMORY_MAPS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1404)
+
+#define GXIO_TRIO_OP_ALLOC_SCATTER_QUEUES IORPC_OPCODE(IORPC_FORMAT_NONE, 0x140e)
+#define GXIO_TRIO_OP_ALLOC_PIO_REGIONS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1412)
+
+#define GXIO_TRIO_OP_INIT_PIO_REGION_AUX IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1414)
+
+#define GXIO_TRIO_OP_INIT_MEMORY_MAP_MMU_AUX IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x141e)
+#define GXIO_TRIO_OP_GET_PORT_PROPERTY IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x141f)
+#define GXIO_TRIO_OP_CONFIG_LEGACY_INTR IORPC_OPCODE(IORPC_FORMAT_KERNEL_INTERRUPT, 0x1420)
+#define GXIO_TRIO_OP_CONFIG_MSI_INTR   IORPC_OPCODE(IORPC_FORMAT_KERNEL_INTERRUPT, 0x1421)
+
+#define GXIO_TRIO_OP_SET_MPS_MRS       IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x1423)
+#define GXIO_TRIO_OP_FORCE_RC_LINK_UP  IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x1424)
+#define GXIO_TRIO_OP_FORCE_EP_LINK_UP  IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x1425)
+#define GXIO_TRIO_OP_GET_MMIO_BASE     IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8000)
+#define GXIO_TRIO_OP_CHECK_MMIO_OFFSET IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8001)
+
+int gxio_trio_alloc_asids(gxio_trio_context_t *context, unsigned int count,
+			  unsigned int first, unsigned int flags);
+
+
+int gxio_trio_alloc_memory_maps(gxio_trio_context_t *context,
+				unsigned int count, unsigned int first,
+				unsigned int flags);
+
+
+int gxio_trio_alloc_scatter_queues(gxio_trio_context_t *context,
+				   unsigned int count, unsigned int first,
+				   unsigned int flags);
+
+int gxio_trio_alloc_pio_regions(gxio_trio_context_t *context,
+				unsigned int count, unsigned int first,
+				unsigned int flags);
+
+int gxio_trio_init_pio_region_aux(gxio_trio_context_t *context,
+				  unsigned int pio_region, unsigned int mac,
+				  uint32_t bus_address_hi, unsigned int flags);
+
+
+int gxio_trio_init_memory_map_mmu_aux(gxio_trio_context_t *context,
+				      unsigned int map, unsigned long va,
+				      uint64_t size, unsigned int asid,
+				      unsigned int mac, uint64_t bus_address,
+				      unsigned int node,
+				      unsigned int order_mode);
+
+int gxio_trio_get_port_property(gxio_trio_context_t *context,
+				struct pcie_trio_ports_property *trio_ports);
+
+int gxio_trio_config_legacy_intr(gxio_trio_context_t *context, int inter_x,
+				 int inter_y, int inter_ipi, int inter_event,
+				 unsigned int mac, unsigned int intx);
+
+int gxio_trio_config_msi_intr(gxio_trio_context_t *context, int inter_x,
+			      int inter_y, int inter_ipi, int inter_event,
+			      unsigned int mac, unsigned int mem_map,
+			      uint64_t mem_map_base, uint64_t mem_map_limit,
+			      unsigned int asid);
+
+
+int gxio_trio_set_mps_mrs(gxio_trio_context_t *context, uint16_t mps,
+			  uint16_t mrs, unsigned int mac);
+
+int gxio_trio_force_rc_link_up(gxio_trio_context_t *context, unsigned int mac);
+
+int gxio_trio_force_ep_link_up(gxio_trio_context_t *context, unsigned int mac);
+
+int gxio_trio_get_mmio_base(gxio_trio_context_t *context, HV_PTE *base);
+
+int gxio_trio_check_mmio_offset(gxio_trio_context_t *context,
+				unsigned long offset, unsigned long size);
+
+#endif /* !__GXIO_TRIO_LINUX_RPC_H__ */
diff --git a/arch/tile/include/gxio/iorpc_uart.h b/arch/tile/include/gxio/iorpc_uart.h
new file mode 100644
index 00000000000..55429d48ea5
--- /dev/null
+++ b/arch/tile/include/gxio/iorpc_uart.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#ifndef __GXIO_UART_LINUX_RPC_H__
+#define __GXIO_UART_LINUX_RPC_H__
+
+#include <hv/iorpc.h>
+
+#include <hv/drv_uart_intf.h>
+#include <gxio/uart.h>
+#include <gxio/kiorpc.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/pgtable.h>
+
+#define GXIO_UART_OP_CFG_INTERRUPT     IORPC_OPCODE(IORPC_FORMAT_KERNEL_INTERRUPT, 0x1900)
+#define GXIO_UART_OP_GET_MMIO_BASE     IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8000)
+#define GXIO_UART_OP_CHECK_MMIO_OFFSET IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8001)
+
+int gxio_uart_cfg_interrupt(gxio_uart_context_t *context, int inter_x,
+			    int inter_y, int inter_ipi, int inter_event);
+
+int gxio_uart_get_mmio_base(gxio_uart_context_t *context, HV_PTE *base);
+
+int gxio_uart_check_mmio_offset(gxio_uart_context_t *context,
+				unsigned long offset, unsigned long size);
+
+#endif /* !__GXIO_UART_LINUX_RPC_H__ */
diff --git a/arch/tile/include/gxio/iorpc_usb_host.h b/arch/tile/include/gxio/iorpc_usb_host.h
new file mode 100644
index 00000000000..79962a97de8
--- /dev/null
+++ b/arch/tile/include/gxio/iorpc_usb_host.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* This file is machine-generated; DO NOT EDIT! */
+#ifndef __GXIO_USB_HOST_LINUX_RPC_H__
+#define __GXIO_USB_HOST_LINUX_RPC_H__
+
+#include <hv/iorpc.h>
+
+#include <hv/drv_usb_host_intf.h>
+#include <asm/page.h>
+#include <gxio/kiorpc.h>
+#include <gxio/usb_host.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <asm/pgtable.h>
+
+#define GXIO_USB_HOST_OP_CFG_INTERRUPT IORPC_OPCODE(IORPC_FORMAT_KERNEL_INTERRUPT, 0x1800)
+#define GXIO_USB_HOST_OP_REGISTER_CLIENT_MEMORY IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x1801)
+#define GXIO_USB_HOST_OP_GET_MMIO_BASE IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8000)
+#define GXIO_USB_HOST_OP_CHECK_MMIO_OFFSET IORPC_OPCODE(IORPC_FORMAT_NONE_NOUSER, 0x8001)
+
+int gxio_usb_host_cfg_interrupt(gxio_usb_host_context_t *context, int inter_x,
+				int inter_y, int inter_ipi, int inter_event);
+
+int gxio_usb_host_register_client_memory(gxio_usb_host_context_t *context,
+					 HV_PTE pte, unsigned int flags);
+
+int gxio_usb_host_get_mmio_base(gxio_usb_host_context_t *context,
+				HV_PTE *base);
+
+int gxio_usb_host_check_mmio_offset(gxio_usb_host_context_t *context,
+				    unsigned long offset, unsigned long size);
+
+#endif /* !__GXIO_USB_HOST_LINUX_RPC_H__ */
diff --git a/arch/tile/include/gxio/kiorpc.h b/arch/tile/include/gxio/kiorpc.h
new file mode 100644
index 00000000000..ee5820979ff
--- /dev/null
+++ b/arch/tile/include/gxio/kiorpc.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * Support routines for kernel IORPC drivers.
+ */
+
+#ifndef _GXIO_KIORPC_H
+#define _GXIO_KIORPC_H
+
+#include <linux/types.h>
+#include <asm/page.h>
+#include <arch/chip.h>
+
+#if CHIP_HAS_MMIO()
+void __iomem *iorpc_ioremap(int hv_fd, resource_size_t offset,
+			    unsigned long size);
+#endif
+
+#endif /* _GXIO_KIORPC_H */
diff --git a/arch/tile/include/gxio/mpipe.h b/arch/tile/include/gxio/mpipe.h
new file mode 100644
index 00000000000..e37cf4f0cff
--- /dev/null
+++ b/arch/tile/include/gxio/mpipe.h
@@ -0,0 +1,1871 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _GXIO_MPIPE_H_
+#define _GXIO_MPIPE_H_
+
+/*
+ *
+ * An API for allocating, configuring, and manipulating mPIPE hardware
+ * resources.
+ */
+
+#include <gxio/common.h>
+#include <gxio/dma_queue.h>
+
+#include <linux/time.h>
+
+#include <arch/mpipe_def.h>
+#include <arch/mpipe_shm.h>
+
+#include <hv/drv_mpipe_intf.h>
+#include <hv/iorpc.h>
+
+/*
+ *
+ * The TILE-Gx mPIPE&tm; shim provides Ethernet connectivity, packet
+ * classification, and packet load balancing services.  The
+ * gxio_mpipe_ API, declared in <gxio/mpipe.h>, allows applications to
+ * allocate mPIPE IO channels, configure packet distribution
+ * parameters, and send and receive Ethernet packets.  The API is
+ * designed to be a minimal wrapper around the mPIPE hardware, making
+ * system calls only where necessary to preserve inter-process
+ * protection guarantees.
+ *
+ * The APIs described below allow the programmer to allocate and
+ * configure mPIPE resources.  As described below, the mPIPE is a
+ * single shared hardware device that provides partitionable resources
+ * that are shared between all applications in the system.  The
+ * gxio_mpipe_ API allows userspace code to make resource request
+ * calls to the hypervisor, which in turns keeps track of the
+ * resources in use by all applications, maintains protection
+ * guarantees, and resets resources upon application shutdown.
+ *
+ * We strongly recommend reading the mPIPE section of the IO Device
+ * Guide (UG404) before working with this API.  Most functions in the
+ * gxio_mpipe_ API are directly analogous to hardware interfaces and
+ * the documentation assumes that the reader understands those
+ * hardware interfaces.
+ *
+ * @section mpipe__ingress mPIPE Ingress Hardware Resources
+ *
+ * The mPIPE ingress hardware provides extensive hardware offload for
+ * tasks like packet header parsing, load balancing, and memory
+ * management.  This section provides a brief introduction to the
+ * hardware components and the gxio_mpipe_ calls used to manage them;
+ * see the IO Device Guide for a much more detailed description of the
+ * mPIPE's capabilities.
+ *
+ * When a packet arrives at one of the mPIPE's Ethernet MACs, it is
+ * assigned a channel number indicating which MAC received it.  It
+ * then proceeds through the following hardware pipeline:
+ *
+ * @subsection mpipe__classification Classification
+ *
+ * A set of classification processors run header parsing code on each
+ * incoming packet, extracting information including the destination
+ * MAC address, VLAN, Ethernet type, and five-tuple hash.  Some of
+ * this information is then used to choose which buffer stack will be
+ * used to hold the packet, and which bucket will be used by the load
+ * balancer to determine which application will receive the packet.
+ *
+ * The rules by which the buffer stack and bucket are chosen can be
+ * configured via the @ref gxio_mpipe_classifier API.  A given app can
+ * specify multiple rules, each one specifying a bucket range, and a
+ * set of buffer stacks, to be used for packets matching the rule.
+ * Each rule can optionally specify a restricted set of channels,
+ * VLANs, and/or dMACs, in which it is interested.  By default, a
+ * given rule starts out matching all channels associated with the
+ * mPIPE context's set of open links; all VLANs; and all dMACs.
+ * Subsequent restrictions can then be added.
+ *
+ * @subsection mpipe__load_balancing Load Balancing
+ *
+ * The mPIPE load balancer is responsible for choosing the NotifRing
+ * to which the packet will be delivered.  This decision is based on
+ * the bucket number indicated by the classification program.  In
+ * general, the bucket number is based on some number of low bits of
+ * the packet's flow hash (applications that aren't interested in flow
+ * hashing use a single bucket).  Each load balancer bucket keeps a
+ * record of the NotifRing to which packets directed to that bucket
+ * are currently being delivered.  Based on the bucket's load
+ * balancing mode (@ref gxio_mpipe_bucket_mode_t), the load balancer
+ * either forwards the packet to the previously assigned NotifRing or
+ * decides to choose a new NotifRing.  If a new NotifRing is required,
+ * the load balancer chooses the least loaded ring in the NotifGroup
+ * associated with the bucket.
+ *
+ * The load balancer is a shared resource.  Each application needs to
+ * explicitly allocate NotifRings, NotifGroups, and buckets, using
+ * gxio_mpipe_alloc_notif_rings(), gxio_mpipe_alloc_notif_groups(),
+ * and gxio_mpipe_alloc_buckets().  Then the application needs to
+ * configure them using gxio_mpipe_init_notif_ring() and
+ * gxio_mpipe_init_notif_group_and_buckets().
+ *
+ * @subsection mpipe__buffers Buffer Selection and Packet Delivery
+ *
+ * Once the load balancer has chosen the destination NotifRing, the
+ * mPIPE DMA engine pops at least one buffer off of the 'buffer stack'
+ * chosen by the classification program and DMAs the packet data into
+ * that buffer.  Each buffer stack provides a hardware-accelerated
+ * stack of data buffers with the same size.  If the packet data is
+ * larger than the buffers provided by the chosen buffer stack, the
+ * mPIPE hardware pops off multiple buffers and chains the packet data
+ * through a multi-buffer linked list.  Once the packet data is
+ * delivered to the buffer(s), the mPIPE hardware writes the
+ * ::gxio_mpipe_idesc_t metadata object (calculated by the classifier)
+ * into the NotifRing and increments the number of packets delivered
+ * to that ring.
+ *
+ * Applications can push buffers onto a buffer stack by calling
+ * gxio_mpipe_push_buffer() or by egressing a packet with the
+ * ::gxio_mpipe_edesc_t::hwb bit set, indicating that the egressed
+ * buffers should be returned to the stack.
+ *
+ * Applications can allocate and initialize buffer stacks with the
+ * gxio_mpipe_alloc_buffer_stacks() and gxio_mpipe_init_buffer_stack()
+ * APIs.
+ *
+ * The application must also register the memory pages that will hold
+ * packets.  This requires calling gxio_mpipe_register_page() for each
+ * memory page that will hold packets allocated by the application for
+ * a given buffer stack.  Since each buffer stack is limited to 16
+ * registered pages, it may be necessary to use huge pages, or even
+ * extremely huge pages, to hold all the buffers.
+ *
+ * @subsection mpipe__iqueue NotifRings
+ *
+ * Each NotifRing is a region of shared memory, allocated by the
+ * application, to which the mPIPE delivers packet descriptors
+ * (::gxio_mpipe_idesc_t).  The application can allocate them via
+ * gxio_mpipe_alloc_notif_rings().  The application can then either
+ * explicitly initialize them with gxio_mpipe_init_notif_ring() and
+ * then read from them manually, or can make use of the convenience
+ * wrappers provided by @ref gxio_mpipe_wrappers.
+ *
+ * @section mpipe__egress mPIPE Egress Hardware
+ *
+ * Applications use eDMA rings to queue packets for egress.  The
+ * application can allocate them via gxio_mpipe_alloc_edma_rings().
+ * The application can then either explicitly initialize them with
+ * gxio_mpipe_init_edma_ring() and then write to them manually, or
+ * can make use of the convenience wrappers provided by
+ * @ref gxio_mpipe_wrappers.
+ *
+ * @section gxio__shortcomings Plans for Future API Revisions
+ *
+ * The API defined here is only an initial version of the mPIPE API.
+ * Future plans include:
+ *
+ * - Higher level wrapper functions to provide common initialization
+ * patterns.  This should help users start writing mPIPE programs
+ * without having to learn the details of the hardware.
+ *
+ * - Support for reset and deallocation of resources, including
+ * cleanup upon application shutdown.
+ *
+ * - Support for calling these APIs in the BME.
+ *
+ * - Support for IO interrupts.
+ *
+ * - Clearer definitions of thread safety guarantees.
+ *
+ * @section gxio__mpipe_examples Examples
+ *
+ * See the following mPIPE example programs for more information about
+ * allocating mPIPE resources and using them in real applications:
+ *
+ * - @ref mpipe/ingress/app.c : Receiving packets.
+ *
+ * - @ref mpipe/forward/app.c : Forwarding packets.
+ *
+ * Note that there are several more examples.
+ */
+
+/* Flags that can be passed to resource allocation functions. */
+enum gxio_mpipe_alloc_flags_e {
+	/* Require an allocation to start at a specified resource index. */
+	GXIO_MPIPE_ALLOC_FIXED = HV_MPIPE_ALLOC_FIXED,
+};
+
+/* Flags that can be passed to memory registration functions. */
+enum gxio_mpipe_mem_flags_e {
+	/* Do not fill L3 when writing, and invalidate lines upon egress. */
+	GXIO_MPIPE_MEM_FLAG_NT_HINT = IORPC_MEM_BUFFER_FLAG_NT_HINT,
+
+	/* L3 cache fills should only populate IO cache ways. */
+	GXIO_MPIPE_MEM_FLAG_IO_PIN = IORPC_MEM_BUFFER_FLAG_IO_PIN,
+};
+
+/* An ingress packet descriptor.  When a packet arrives, the mPIPE
+ * hardware generates this structure and writes it into a NotifRing.
+ */
+typedef MPIPE_PDESC_t gxio_mpipe_idesc_t;
+
+/* An egress command descriptor.  Applications write this structure
+ * into eDMA rings and the hardware performs the indicated operation
+ * (normally involving egressing some bytes).  Note that egressing a
+ * single packet may involve multiple egress command descriptors.
+ */
+typedef MPIPE_EDMA_DESC_t gxio_mpipe_edesc_t;
+
+/*
+ * Max # of mpipe instances. 2 currently.
+ */
+#define GXIO_MPIPE_INSTANCE_MAX  HV_MPIPE_INSTANCE_MAX
+
+#define NR_MPIPE_MAX   GXIO_MPIPE_INSTANCE_MAX
+
+/* Get the "va" field from an "idesc".
+ *
+ * This is the address at which the ingress hardware copied the first
+ * byte of the packet.
+ *
+ * If the classifier detected a custom header, then this will point to
+ * the custom header, and gxio_mpipe_idesc_get_l2_start() will point
+ * to the actual L2 header.
+ *
+ * Note that this value may be misleading if "idesc->be" is set.
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline unsigned char *gxio_mpipe_idesc_get_va(gxio_mpipe_idesc_t *idesc)
+{
+	return (unsigned char *)(long)idesc->va;
+}
+
+/* Get the "xfer_size" from an "idesc".
+ *
+ * This is the actual number of packet bytes transferred into memory
+ * by the hardware.
+ *
+ * Note that this value may be misleading if "idesc->be" is set.
+ *
+ * @param idesc An ingress packet descriptor.
+ *
+ * ISSUE: Is this the best name for this?
+ * FIXME: Add more docs about chaining, clipping, etc.
+ */
+static inline unsigned int gxio_mpipe_idesc_get_xfer_size(gxio_mpipe_idesc_t
+							  *idesc)
+{
+	return idesc->l2_size;
+}
+
+/* Get the "l2_offset" from an "idesc".
+ *
+ * Extremely customized classifiers might not support this function.
+ *
+ * This is the number of bytes between the "va" and the L2 header.
+ *
+ * The L2 header consists of a destination mac address, a source mac
+ * address, and an initial ethertype.  Various initial ethertypes
+ * allow encoding extra information in the L2 header, often including
+ * a vlan, and/or a new ethertype.
+ *
+ * Note that the "l2_offset" will be non-zero if (and only if) the
+ * classifier processed a custom header for the packet.
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline uint8_t gxio_mpipe_idesc_get_l2_offset(gxio_mpipe_idesc_t *idesc)
+{
+	return (idesc->custom1 >> 32) & 0xFF;
+}
+
+/* Get the "l2_start" from an "idesc".
+ *
+ * This is simply gxio_mpipe_idesc_get_va() plus
+ * gxio_mpipe_idesc_get_l2_offset().
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline unsigned char *gxio_mpipe_idesc_get_l2_start(gxio_mpipe_idesc_t
+							   *idesc)
+{
+	unsigned char *va = gxio_mpipe_idesc_get_va(idesc);
+	return va + gxio_mpipe_idesc_get_l2_offset(idesc);
+}
+
+/* Get the "l2_length" from an "idesc".
+ *
+ * This is simply gxio_mpipe_idesc_get_xfer_size() minus
+ * gxio_mpipe_idesc_get_l2_offset().
+ *
+ * @param idesc An ingress packet descriptor.
+ */
+static inline unsigned int gxio_mpipe_idesc_get_l2_length(gxio_mpipe_idesc_t
+							  *idesc)
+{
+	unsigned int xfer_size = idesc->l2_size;
+	return xfer_size - gxio_mpipe_idesc_get_l2_offset(idesc);
+}
+
+/* A context object used to manage mPIPE hardware resources. */
+typedef struct {
+
+	/* File descriptor for calling up to Linux (and thus the HV). */
+	int fd;
+
+	/* Corresponding mpipe instance #. */
+	int instance;
+
+	/* The VA at which configuration registers are mapped. */
+	char *mmio_cfg_base;
+
+	/* The VA at which IDMA, EDMA, and buffer manager are mapped. */
+	char *mmio_fast_base;
+
+	/* The "initialized" buffer stacks. */
+	gxio_mpipe_rules_stacks_t __stacks;
+
+} gxio_mpipe_context_t;
+
+/* This is only used internally, but it's most easily made visible here. */
+typedef gxio_mpipe_context_t gxio_mpipe_info_context_t;
+
+/* Initialize an mPIPE context.
+ *
+ * This function allocates an mPIPE "service domain" and maps the MMIO
+ * registers into the caller's VA space.
+ *
+ * @param context Context object to be initialized.
+ * @param mpipe_instance Instance number of mPIPE shim to be controlled via
+ *  context.
+ */
+extern int gxio_mpipe_init(gxio_mpipe_context_t *context,
+			   unsigned int mpipe_instance);
+
+/* Destroy an mPIPE context.
+ *
+ * This function frees the mPIPE "service domain" and unmaps the MMIO
+ * registers from the caller's VA space.
+ *
+ * If a user process exits without calling this routine, the kernel
+ * will destroy the mPIPE context as part of process teardown.
+ *
+ * @param context Context object to be destroyed.
+ */
+extern int gxio_mpipe_destroy(gxio_mpipe_context_t *context);
+
+/*****************************************************************
+ *                         Buffer Stacks                          *
+ ******************************************************************/
+
+/* Allocate a set of buffer stacks.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of stacks required.
+ * @param first Index of first stack if ::GXIO_MPIPE_ALLOC_FIXED flag is set,
+ *   otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated buffer stack, or
+ * ::GXIO_MPIPE_ERR_NO_BUFFER_STACK if allocation failed.
+ */
+extern int gxio_mpipe_alloc_buffer_stacks(gxio_mpipe_context_t *context,
+					  unsigned int count,
+					  unsigned int first,
+					  unsigned int flags);
+
+/* Enum codes for buffer sizes supported by mPIPE. */
+typedef enum {
+	/* 128 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_128 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_128,
+	/* 256 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_256 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_256,
+	/* 512 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_512 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_512,
+	/* 1024 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_1024 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_1024,
+	/* 1664 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_1664 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_1664,
+	/* 4096 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_4096 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_4096,
+	/* 10368 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_10368 =
+		MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_10368,
+	/* 16384 byte packet data buffer. */
+	GXIO_MPIPE_BUFFER_SIZE_16384 = MPIPE_BSM_INIT_DAT_1__SIZE_VAL_BSZ_16384
+} gxio_mpipe_buffer_size_enum_t;
+
+/* Convert a buffer size in bytes into a buffer size enum. */
+extern gxio_mpipe_buffer_size_enum_t
+gxio_mpipe_buffer_size_to_buffer_size_enum(size_t size);
+
+/* Convert a buffer size enum into a buffer size in bytes. */
+extern size_t
+gxio_mpipe_buffer_size_enum_to_buffer_size(gxio_mpipe_buffer_size_enum_t
+					   buffer_size_enum);
+
+/* Calculate the number of bytes required to store a given number of
+ * buffers in the memory registered with a buffer stack via
+ * gxio_mpipe_init_buffer_stack().
+ */
+extern size_t gxio_mpipe_calc_buffer_stack_bytes(unsigned long buffers);
+
+/* Initialize a buffer stack.  This function binds a region of memory
+ * to be used by the hardware for storing buffer addresses pushed via
+ * gxio_mpipe_push_buffer() or as the result of sending a buffer out
+ * the egress with the 'push to stack when done' bit set.  Once this
+ * function returns, the memory region's contents may be arbitrarily
+ * modified by the hardware at any time and software should not access
+ * the memory region again.
+ *
+ * @param context An initialized mPIPE context.
+ * @param stack The buffer stack index.
+ * @param buffer_size_enum The size of each buffer in the buffer stack,
+ * as an enum.
+ * @param mem The address of the buffer stack.  This memory must be
+ * physically contiguous and aligned to a 64kB boundary.
+ * @param mem_size The size of the buffer stack, in bytes.
+ * @param mem_flags ::gxio_mpipe_mem_flags_e memory flags.
+ * @return Zero on success, ::GXIO_MPIPE_ERR_INVAL_BUFFER_SIZE if
+ * buffer_size_enum is invalid, ::GXIO_MPIPE_ERR_BAD_BUFFER_STACK if
+ * stack has not been allocated.
+ */
+extern int gxio_mpipe_init_buffer_stack(gxio_mpipe_context_t *context,
+					unsigned int stack,
+					gxio_mpipe_buffer_size_enum_t
+					buffer_size_enum, void *mem,
+					size_t mem_size,
+					unsigned int mem_flags);
+
+/* Push a buffer onto a previously initialized buffer stack.
+ *
+ * The size of the buffer being pushed must match the size that was
+ * registered with gxio_mpipe_init_buffer_stack().  All packet buffer
+ * addresses are 128-byte aligned; the low 7 bits of the specified
+ * buffer address will be ignored.
+ *
+ * @param context An initialized mPIPE context.
+ * @param stack The buffer stack index.
+ * @param buffer The buffer (the low seven bits are ignored).
+ */
+static inline void gxio_mpipe_push_buffer(gxio_mpipe_context_t *context,
+					  unsigned int stack, void *buffer)
+{
+	MPIPE_BSM_REGION_ADDR_t offset = { {0} };
+	MPIPE_BSM_REGION_VAL_t val = { {0} };
+
+	/*
+	 * The mmio_fast_base region starts at the IDMA region, so subtract
+	 * off that initial offset.
+	 */
+	offset.region =
+		MPIPE_MMIO_ADDR__REGION_VAL_BSM -
+		MPIPE_MMIO_ADDR__REGION_VAL_IDMA;
+	offset.stack = stack;
+
+#if __SIZEOF_POINTER__ == 4
+	val.va = ((ulong) buffer) >> MPIPE_BSM_REGION_VAL__VA_SHIFT;
+#else
+	val.va = ((long)buffer) >> MPIPE_BSM_REGION_VAL__VA_SHIFT;
+#endif
+
+	__gxio_mmio_write(context->mmio_fast_base + offset.word, val.word);
+}
+
+/* Pop a buffer off of a previously initialized buffer stack.
+ *
+ * @param context An initialized mPIPE context.
+ * @param stack The buffer stack index.
+ * @return The buffer, or NULL if the stack is empty.
+ */
+static inline void *gxio_mpipe_pop_buffer(gxio_mpipe_context_t *context,
+					  unsigned int stack)
+{
+	MPIPE_BSM_REGION_ADDR_t offset = { {0} };
+
+	/*
+	 * The mmio_fast_base region starts at the IDMA region, so subtract
+	 * off that initial offset.
+	 */
+	offset.region =
+		MPIPE_MMIO_ADDR__REGION_VAL_BSM -
+		MPIPE_MMIO_ADDR__REGION_VAL_IDMA;
+	offset.stack = stack;
+
+	while (1) {
+		/*
+		 * Case 1: val.c == ..._UNCHAINED, va is non-zero.
+		 * Case 2: val.c == ..._INVALID, va is zero.
+		 * Case 3: val.c == ..._NOT_RDY, va is zero.
+		 */
+		MPIPE_BSM_REGION_VAL_t val;
+		val.word =
+			__gxio_mmio_read(context->mmio_fast_base +
+					 offset.word);
+
+		/*
+		 * Handle case 1 and 2 by returning the buffer (or NULL).
+		 * Handle case 3 by waiting for the prefetch buffer to refill.
+		 */
+		if (val.c != MPIPE_EDMA_DESC_WORD1__C_VAL_NOT_RDY)
+			return (void *)((unsigned long)val.
+					va << MPIPE_BSM_REGION_VAL__VA_SHIFT);
+	}
+}
+
+/*****************************************************************
+ *                          NotifRings                            *
+ ******************************************************************/
+
+/* Allocate a set of NotifRings.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * Note that NotifRings are allocated in chunks, so allocating one at
+ * a time is much less efficient than allocating several at once.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of NotifRings required.
+ * @param first Index of first NotifRing if ::GXIO_MPIPE_ALLOC_FIXED flag
+ *   is set, otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated buffer NotifRing, or
+ * ::GXIO_MPIPE_ERR_NO_NOTIF_RING if allocation failed.
+ */
+extern int gxio_mpipe_alloc_notif_rings(gxio_mpipe_context_t *context,
+					unsigned int count, unsigned int first,
+					unsigned int flags);
+
+/* Initialize a NotifRing, using the given memory and size.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ring The NotifRing index.
+ * @param mem A physically contiguous region of memory to be filled
+ * with a ring of ::gxio_mpipe_idesc_t structures.
+ * @param mem_size Number of bytes in the ring.  Must be 128, 512,
+ * 2048, or 65536 * sizeof(gxio_mpipe_idesc_t).
+ * @param mem_flags ::gxio_mpipe_mem_flags_e memory flags.
+ *
+ * @return 0 on success, ::GXIO_MPIPE_ERR_BAD_NOTIF_RING or
+ * ::GXIO_ERR_INVAL_MEMORY_SIZE on failure.
+ */
+extern int gxio_mpipe_init_notif_ring(gxio_mpipe_context_t *context,
+				      unsigned int ring,
+				      void *mem, size_t mem_size,
+				      unsigned int mem_flags);
+
+/* Configure an interrupt to be sent to a tile on incoming NotifRing
+ *  traffic.  Once an interrupt is sent for a particular ring, no more
+ *  will be sent until gxio_mica_enable_notif_ring_interrupt() is called.
+ *
+ * @param context An initialized mPIPE context.
+ * @param x X coordinate of interrupt target tile.
+ * @param y Y coordinate of interrupt target tile.
+ * @param i Index of the IPI register which will receive the interrupt.
+ * @param e Specific event which will be set in the target IPI register when
+ * the interrupt occurs.
+ * @param ring The NotifRing index.
+ * @return Zero on success, GXIO_ERR_INVAL if params are out of range.
+ */
+extern int gxio_mpipe_request_notif_ring_interrupt(gxio_mpipe_context_t
+						   *context, int x, int y,
+						   int i, int e,
+						   unsigned int ring);
+
+/* Enable an interrupt on incoming NotifRing traffic.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ring The NotifRing index.
+ * @return Zero on success, GXIO_ERR_INVAL if params are out of range.
+ */
+extern int gxio_mpipe_enable_notif_ring_interrupt(gxio_mpipe_context_t
+						  *context, unsigned int ring);
+
+/* Map all of a client's memory via the given IOTLB.
+ * @param context An initialized mPIPE context.
+ * @param iotlb IOTLB index.
+ * @param pte Page table entry.
+ * @param flags Flags.
+ * @return Zero on success, or a negative error code.
+ */
+extern int gxio_mpipe_register_client_memory(gxio_mpipe_context_t *context,
+					     unsigned int iotlb, HV_PTE pte,
+					     unsigned int flags);
+
+/*****************************************************************
+ *                        Notif Groups                            *
+ ******************************************************************/
+
+/* Allocate a set of NotifGroups.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of NotifGroups required.
+ * @param first Index of first NotifGroup if ::GXIO_MPIPE_ALLOC_FIXED flag
+ *   is set, otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated buffer NotifGroup, or
+ * ::GXIO_MPIPE_ERR_NO_NOTIF_GROUP if allocation failed.
+ */
+extern int gxio_mpipe_alloc_notif_groups(gxio_mpipe_context_t *context,
+					 unsigned int count,
+					 unsigned int first,
+					 unsigned int flags);
+
+/* Add a NotifRing to a NotifGroup.  This only sets a bit in the
+ * application's 'group' object; the hardware NotifGroup can be
+ * initialized by passing 'group' to gxio_mpipe_init_notif_group() or
+ * gxio_mpipe_init_notif_group_and_buckets().
+ */
+static inline void
+gxio_mpipe_notif_group_add_ring(gxio_mpipe_notif_group_bits_t *bits, int ring)
+{
+	bits->ring_mask[ring / 64] |= (1ull << (ring % 64));
+}
+
+/* Set a particular NotifGroup bitmask.  Since the load balancer
+ * makes decisions based on both bucket and NotifGroup state, most
+ * applications should use gxio_mpipe_init_notif_group_and_buckets()
+ * rather than using this function to configure just a NotifGroup.
+ */
+extern int gxio_mpipe_init_notif_group(gxio_mpipe_context_t *context,
+				       unsigned int group,
+				       gxio_mpipe_notif_group_bits_t bits);
+
+/*****************************************************************
+ *                         Load Balancer                          *
+ ******************************************************************/
+
+/* Allocate a set of load balancer buckets.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * Note that buckets are allocated in chunks, so allocating one at
+ * a time is much less efficient than allocating several at once.
+ *
+ * Note that the buckets are actually divided into two sub-ranges, of
+ * different sizes, and different chunk sizes, and the range you get
+ * by default is determined by the size of the request.  Allocations
+ * cannot span the two sub-ranges.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of buckets required.
+ * @param first Index of first bucket if ::GXIO_MPIPE_ALLOC_FIXED flag is set,
+ *   otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated buffer bucket, or
+ * ::GXIO_MPIPE_ERR_NO_BUCKET if allocation failed.
+ */
+extern int gxio_mpipe_alloc_buckets(gxio_mpipe_context_t *context,
+				    unsigned int count, unsigned int first,
+				    unsigned int flags);
+
+/* The legal modes for gxio_mpipe_bucket_info_t and
+ * gxio_mpipe_init_notif_group_and_buckets().
+ *
+ * All modes except ::GXIO_MPIPE_BUCKET_ROUND_ROBIN expect that the user
+ * will allocate a power-of-two number of buckets and initialize them
+ * to the same mode.  The classifier program then uses the appropriate
+ * number of low bits from the incoming packet's flow hash to choose a
+ * load balancer bucket.  Based on that bucket's load balancing mode,
+ * reference count, and currently active NotifRing, the load balancer
+ * chooses the NotifRing to which the packet will be delivered.
+ */
+typedef enum {
+	/* All packets for a bucket go to the same NotifRing unless the
+	 * NotifRing gets full, in which case packets will be dropped.  If
+	 * the bucket reference count ever reaches zero, a new NotifRing may
+	 * be chosen.
+	 */
+	GXIO_MPIPE_BUCKET_DYNAMIC_FLOW_AFFINITY =
+		MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_DFA,
+
+	/* All packets for a bucket always go to the same NotifRing.
+	 */
+	GXIO_MPIPE_BUCKET_STATIC_FLOW_AFFINITY =
+		MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_FIXED,
+
+	/* All packets for a bucket go to the least full NotifRing in the
+	 * group, providing load balancing round robin behavior.
+	 */
+	GXIO_MPIPE_BUCKET_ROUND_ROBIN =
+		MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_ALWAYS_PICK,
+
+	/* All packets for a bucket go to the same NotifRing unless the
+	 * NotifRing gets full, at which point the bucket starts using the
+	 * least full NotifRing in the group.  If all NotifRings in the
+	 * group are full, packets will be dropped.
+	 */
+	GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY =
+		MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_STICKY,
+
+	/* All packets for a bucket go to the same NotifRing unless the
+	 * NotifRing gets full, or a random timer fires, at which point the
+	 * bucket starts using the least full NotifRing in the group.  If
+	 * all NotifRings in the group are full, packets will be dropped.
+	 * WARNING: This mode is BROKEN on chips with fewer than 64 tiles.
+	 */
+	GXIO_MPIPE_BUCKET_PREFER_FLOW_LOCALITY =
+		MPIPE_LBL_INIT_DAT_BSTS_TBL__MODE_VAL_STICKY_RAND,
+
+} gxio_mpipe_bucket_mode_t;
+
+/* Copy a set of bucket initialization values into the mPIPE
+ * hardware.  Since the load balancer makes decisions based on both
+ * bucket and NotifGroup state, most applications should use
+ * gxio_mpipe_init_notif_group_and_buckets() rather than using this
+ * function to configure a single bucket.
+ *
+ * @param context An initialized mPIPE context.
+ * @param bucket Bucket index to be initialized.
+ * @param bucket_info Initial reference count, NotifRing index, and mode.
+ * @return 0 on success, ::GXIO_MPIPE_ERR_BAD_BUCKET on failure.
+ */
+extern int gxio_mpipe_init_bucket(gxio_mpipe_context_t *context,
+				  unsigned int bucket,
+				  gxio_mpipe_bucket_info_t bucket_info);
+
+/* Initializes a group and range of buckets and range of rings such
+ * that the load balancer runs a particular load balancing function.
+ *
+ * First, the group is initialized with the given rings.
+ *
+ * Second, each bucket is initialized with the mode and group, and a
+ * ring chosen round-robin from the given rings.
+ *
+ * Normally, the classifier picks a bucket, and then the load balancer
+ * picks a ring, based on the bucket's mode, group, and current ring,
+ * possibly updating the bucket's ring.
+ *
+ * @param context An initialized mPIPE context.
+ * @param group The group.
+ * @param ring The first ring.
+ * @param num_rings The number of rings.
+ * @param bucket The first bucket.
+ * @param num_buckets The number of buckets.
+ * @param mode The load balancing mode.
+ *
+ * @return 0 on success, ::GXIO_MPIPE_ERR_BAD_BUCKET,
+ * ::GXIO_MPIPE_ERR_BAD_NOTIF_GROUP, or
+ * ::GXIO_MPIPE_ERR_BAD_NOTIF_RING on failure.
+ */
+extern int gxio_mpipe_init_notif_group_and_buckets(gxio_mpipe_context_t
+						   *context,
+						   unsigned int group,
+						   unsigned int ring,
+						   unsigned int num_rings,
+						   unsigned int bucket,
+						   unsigned int num_buckets,
+						   gxio_mpipe_bucket_mode_t
+						   mode);
+
+/* Return credits to a NotifRing and/or bucket.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ring The NotifRing index, or -1.
+ * @param bucket The bucket, or -1.
+ * @param count The number of credits to return.
+ */
+static inline void gxio_mpipe_credit(gxio_mpipe_context_t *context,
+				     int ring, int bucket, unsigned int count)
+{
+	/* NOTE: Fancy struct initialization would break "C89" header test. */
+
+	MPIPE_IDMA_RELEASE_REGION_ADDR_t offset = { {0} };
+	MPIPE_IDMA_RELEASE_REGION_VAL_t val = { {0} };
+
+	/*
+	 * The mmio_fast_base region starts at the IDMA region, so subtract
+	 * off that initial offset.
+	 */
+	offset.region =
+		MPIPE_MMIO_ADDR__REGION_VAL_IDMA -
+		MPIPE_MMIO_ADDR__REGION_VAL_IDMA;
+	offset.ring = ring;
+	offset.bucket = bucket;
+	offset.ring_enable = (ring >= 0);
+	offset.bucket_enable = (bucket >= 0);
+	val.count = count;
+
+	__gxio_mmio_write(context->mmio_fast_base + offset.word, val.word);
+}
+
+/*****************************************************************
+ *                         Egress Rings                           *
+ ******************************************************************/
+
+/* Allocate a set of eDMA rings.
+ *
+ * The return value is NOT interesting if count is zero.
+ *
+ * @param context An initialized mPIPE context.
+ * @param count Number of eDMA rings required.
+ * @param first Index of first eDMA ring if ::GXIO_MPIPE_ALLOC_FIXED flag
+ *   is set, otherwise ignored.
+ * @param flags Flag bits from ::gxio_mpipe_alloc_flags_e.
+ * @return Index of first allocated buffer eDMA ring, or
+ * ::GXIO_MPIPE_ERR_NO_EDMA_RING if allocation failed.
+ */
+extern int gxio_mpipe_alloc_edma_rings(gxio_mpipe_context_t *context,
+				       unsigned int count, unsigned int first,
+				       unsigned int flags);
+
+/* Initialize an eDMA ring, using the given memory and size.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ering The eDMA ring index.
+ * @param channel The channel to use.  This must be one of the channels
+ * associated with the context's set of open links.
+ * @param mem A physically contiguous region of memory to be filled
+ * with a ring of ::gxio_mpipe_edesc_t structures.
+ * @param mem_size Number of bytes in the ring.  Must be 512, 2048,
+ * 8192 or 65536, times 16 (i.e. sizeof(gxio_mpipe_edesc_t)).
+ * @param mem_flags ::gxio_mpipe_mem_flags_e memory flags.
+ *
+ * @return 0 on success, ::GXIO_MPIPE_ERR_BAD_EDMA_RING or
+ * ::GXIO_ERR_INVAL_MEMORY_SIZE on failure.
+ */
+extern int gxio_mpipe_init_edma_ring(gxio_mpipe_context_t *context,
+				     unsigned int ering, unsigned int channel,
+				     void *mem, size_t mem_size,
+				     unsigned int mem_flags);
+
+/* Set the "max_blks", "min_snf_blks", and "db" fields of
+ * ::MPIPE_EDMA_RG_INIT_DAT_THRESH_t for a given edma ring.
+ *
+ * The global pool of dynamic blocks will be automatically adjusted.
+ *
+ * This function should not be called after any egress has been done
+ * on the edma ring.
+ *
+ * Most applications should just use gxio_mpipe_equeue_set_snf_size().
+ *
+ * @param context An initialized mPIPE context.
+ * @param ering The eDMA ring index.
+ * @param max_blks The number of blocks to dedicate to the ring
+ * (normally min_snf_blks + 1).  Must be greater than min_snf_blocks.
+ * @param min_snf_blks The number of blocks which must be stored
+ * prior to starting to send the packet (normally 12).
+ * @param db Whether to allow use of dynamic blocks by the ring
+ * (normally 1).
+ *
+ * @return 0 on success, negative on error.
+ */
+extern int gxio_mpipe_config_edma_ring_blks(gxio_mpipe_context_t *context,
+					    unsigned int ering,
+					    unsigned int max_blks,
+					    unsigned int min_snf_blks,
+					    unsigned int db);
+
+/*****************************************************************
+ *                      Classifier Program                        *
+ ******************************************************************/
+
+/*
+ *
+ * Functions for loading or configuring the mPIPE classifier program.
+ *
+ * The mPIPE classification processors all run a special "classifier"
+ * program which, for each incoming packet, parses the packet headers,
+ * encodes some packet metadata in the "idesc", and either drops the
+ * packet, or picks a notif ring to handle the packet, and a buffer
+ * stack to contain the packet, usually based on the channel, VLAN,
+ * dMAC, flow hash, and packet size, under the guidance of the "rules"
+ * API described below.
+ *
+ * @section gxio_mpipe_classifier_default Default Classifier
+ *
+ * The MDE provides a simple "default" classifier program.  It is
+ * shipped as source in "$TILERA_ROOT/src/sys/mpipe/classifier.c",
+ * which serves as its official documentation.  It is shipped as a
+ * binary program in "$TILERA_ROOT/tile/boot/classifier", which is
+ * automatically included in bootroms created by "tile-monitor", and
+ * is automatically loaded by the hypervisor at boot time.
+ *
+ * The L2 analysis handles LLC packets, SNAP packets, and "VLAN
+ * wrappers" (keeping the outer VLAN).
+ *
+ * The L3 analysis handles IPv4 and IPv6, dropping packets with bad
+ * IPv4 header checksums, requesting computation of a TCP/UDP checksum
+ * if appropriate, and hashing the dest and src IP addresses, plus the
+ * ports for TCP/UDP packets, into the flow hash.  No special analysis
+ * is done for "fragmented" packets or "tunneling" protocols.  Thus,
+ * the first fragment of a fragmented TCP/UDP packet is hashed using
+ * src/dest IP address and ports and all subsequent fragments are only
+ * hashed according to src/dest IP address.
+ *
+ * The L3 analysis handles other packets too, hashing the dMAC
+ * smac into a flow hash.
+ *
+ * The channel, VLAN, and dMAC used to pick a "rule" (see the
+ * "rules" APIs below), which in turn is used to pick a buffer stack
+ * (based on the packet size) and a bucket (based on the flow hash).
+ *
+ * To receive traffic matching a particular (channel/VLAN/dMAC
+ * pattern, an application should allocate its own buffer stacks and
+ * load balancer buckets, and map traffic to those stacks and buckets,
+ * as decribed by the "rules" API below.
+ *
+ * Various packet metadata is encoded in the idesc.  The flow hash is
+ * four bytes at 0x0C.  The VLAN is two bytes at 0x10.  The ethtype is
+ * two bytes at 0x12.  The l3 start is one byte at 0x14.  The l4 start
+ * is one byte at 0x15 for IPv4 and IPv6 packets, and otherwise zero.
+ * The protocol is one byte at 0x16 for IPv4 and IPv6 packets, and
+ * otherwise zero.
+ *
+ * @section gxio_mpipe_classifier_custom Custom Classifiers.
+ *
+ * A custom classifier may be created using "tile-mpipe-cc" with a
+ * customized version of the default classifier sources.
+ *
+ * The custom classifier may be included in bootroms using the
+ * "--classifier" option to "tile-monitor", or loaded dynamically
+ * using gxio_mpipe_classifier_load_from_file().
+ *
+ * Be aware that "extreme" customizations may break the assumptions of
+ * the "rules" APIs described below, but simple customizations, such
+ * as adding new packet metadata, should be fine.
+ */
+
+/* A set of classifier rules, plus a context. */
+typedef struct {
+
+	/* The context. */
+	gxio_mpipe_context_t *context;
+
+	/* The actual rules. */
+	gxio_mpipe_rules_list_t list;
+
+} gxio_mpipe_rules_t;
+
+/* Initialize a classifier program rules list.
+ *
+ * This function can be called on a previously initialized rules list
+ * to discard any previously added rules.
+ *
+ * @param rules Rules list to initialize.
+ * @param context An initialized mPIPE context.
+ */
+extern void gxio_mpipe_rules_init(gxio_mpipe_rules_t *rules,
+				  gxio_mpipe_context_t *context);
+
+/* Begin a new rule on the indicated rules list.
+ *
+ * Note that an empty rule matches all packets, but an empty rule list
+ * matches no packets.
+ *
+ * @param rules Rules list to which new rule is appended.
+ * @param bucket First load balancer bucket to which packets will be
+ * delivered.
+ * @param num_buckets Number of buckets (must be a power of two) across
+ * which packets will be distributed based on the "flow hash".
+ * @param stacks Either NULL, to assign each packet to the smallest
+ * initialized buffer stack which does not induce chaining (and to
+ * drop packets which exceed the largest initialized buffer stack
+ * buffer size), or an array, with each entry indicating which buffer
+ * stack should be used for packets up to that size (with 255
+ * indicating that those packets should be dropped).
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int gxio_mpipe_rules_begin(gxio_mpipe_rules_t *rules,
+				  unsigned int bucket,
+				  unsigned int num_buckets,
+				  gxio_mpipe_rules_stacks_t *stacks);
+
+/* Set the headroom of the current rule.
+ *
+ * @param rules Rules list whose current rule will be modified.
+ * @param headroom The headroom.
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int gxio_mpipe_rules_set_headroom(gxio_mpipe_rules_t *rules,
+					 uint8_t headroom);
+
+/* Indicate that packets from a particular channel can be delivered
+ * to the buckets and buffer stacks associated with the current rule.
+ *
+ * Channels added must be associated with links opened by the mPIPE context
+ * used in gxio_mpipe_rules_init().  A rule with no channels is equivalent
+ * to a rule naming all such associated channels.
+ *
+ * @param rules Rules list whose current rule will be modified.
+ * @param channel The channel to add.
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int gxio_mpipe_rules_add_channel(gxio_mpipe_rules_t *rules,
+					unsigned int channel);
+
+/* Commit rules.
+ *
+ * The rules are sent to the hypervisor, where they are combined with
+ * the rules from other apps, and used to program the hardware classifier.
+ *
+ * Note that if this function returns an error, then the rules will NOT
+ * have been committed, even if the error is due to interactions with
+ * rules from another app.
+ *
+ * @param rules Rules list to commit.
+ * @return 0 on success, or a negative error code on failure.
+ */
+extern int gxio_mpipe_rules_commit(gxio_mpipe_rules_t *rules);
+
+/*****************************************************************
+ *                     Ingress Queue Wrapper                      *
+ ******************************************************************/
+
+/*
+ *
+ * Convenience functions for receiving packets from a NotifRing and
+ * sending packets via an eDMA ring.
+ *
+ * The mpipe ingress and egress hardware uses shared memory packet
+ * descriptors to describe packets that have arrived on ingress or
+ * are destined for egress.  These descriptors are stored in shared
+ * memory ring buffers and written or read by hardware as necessary.
+ * The gxio library provides wrapper functions that manage the head and
+ * tail pointers for these rings, allowing the user to easily read or
+ * write packet descriptors.
+ *
+ * The initialization interface for ingress and egress rings is quite
+ * similar.  For example, to create an ingress queue, the user passes
+ * a ::gxio_mpipe_iqueue_t state object, a ring number from
+ * gxio_mpipe_alloc_notif_rings(), and the address of memory to hold a
+ * ring buffer to the gxio_mpipe_iqueue_init() function.  The function
+ * returns success when the state object has been initialized and the
+ * hardware configured to deliver packets to the specified ring
+ * buffer.  Similarly, gxio_mpipe_equeue_init() takes a
+ * ::gxio_mpipe_equeue_t state object, a ring number from
+ * gxio_mpipe_alloc_edma_rings(), and a shared memory buffer.
+ *
+ * @section gxio_mpipe_iqueue Working with Ingress Queues
+ *
+ * Once initialized, the gxio_mpipe_iqueue_t API provides two flows
+ * for getting the ::gxio_mpipe_idesc_t packet descriptor associated
+ * with incoming packets.  The simplest is to call
+ * gxio_mpipe_iqueue_get() or gxio_mpipe_iqueue_try_get().  These
+ * functions copy the oldest packet descriptor out of the NotifRing and
+ * into a descriptor provided by the caller.  They also immediately
+ * inform the hardware that a descriptor has been processed.
+ *
+ * For applications with stringent performance requirements, higher
+ * efficiency can be achieved by avoiding the packet descriptor copy
+ * and processing multiple descriptors at once.  The
+ * gxio_mpipe_iqueue_peek() and gxio_mpipe_iqueue_try_peek() functions
+ * allow such optimizations.  These functions provide a pointer to the
+ * next valid ingress descriptor in the NotifRing's shared memory ring
+ * buffer, and a count of how many contiguous descriptors are ready to
+ * be processed.  The application can then process any number of those
+ * descriptors in place, calling gxio_mpipe_iqueue_consume() to inform
+ * the hardware after each one has been processed.
+ *
+ * @section gxio_mpipe_equeue Working with Egress Queues
+ *
+ * Similarly, the egress queue API provides a high-performance
+ * interface plus a simple wrapper for use in posting
+ * ::gxio_mpipe_edesc_t egress packet descriptors.  The simple
+ * version, gxio_mpipe_equeue_put(), allows the programmer to wait for
+ * an eDMA ring slot to become available and write a single descriptor
+ * into the ring.
+ *
+ * Alternatively, you can reserve slots in the eDMA ring using
+ * gxio_mpipe_equeue_reserve() or gxio_mpipe_equeue_try_reserve(), and
+ * then fill in each slot using gxio_mpipe_equeue_put_at().  This
+ * capability can be used to amortize the cost of reserving slots
+ * across several packets.  It also allows gather operations to be
+ * performed on a shared equeue, by ensuring that the edescs for all
+ * the fragments are all contiguous in the eDMA ring.
+ *
+ * The gxio_mpipe_equeue_reserve() and gxio_mpipe_equeue_try_reserve()
+ * functions return a 63-bit "completion slot", which is actually a
+ * sequence number, the low bits of which indicate the ring buffer
+ * index and the high bits the number of times the application has
+ * gone around the egress ring buffer.  The extra bits allow an
+ * application to check for egress completion by calling
+ * gxio_mpipe_equeue_is_complete() to see whether a particular 'slot'
+ * number has finished.  Given the maximum packet rates of the Gx
+ * processor, the 63-bit slot number will never wrap.
+ *
+ * In practice, most applications use the ::gxio_mpipe_edesc_t::hwb
+ * bit to indicate that the buffers containing egress packet data
+ * should be pushed onto a buffer stack when egress is complete.  Such
+ * applications generally do not need to know when an egress operation
+ * completes (since there is no need to free a buffer post-egress),
+ * and thus can use the optimized gxio_mpipe_equeue_reserve_fast() or
+ * gxio_mpipe_equeue_try_reserve_fast() functions, which return a 24
+ * bit "slot", instead of a 63-bit "completion slot".
+ *
+ * Once a slot has been "reserved", it MUST be filled.  If the
+ * application reserves a slot and then decides that it does not
+ * actually need it, it can set the ::gxio_mpipe_edesc_t::ns (no send)
+ * bit on the descriptor passed to gxio_mpipe_equeue_put_at() to
+ * indicate that no data should be sent.  This technique can also be
+ * used to drop an incoming packet, instead of forwarding it, since
+ * any buffer will still be pushed onto the buffer stack when the
+ * egress descriptor is processed.
+ */
+
+/* A convenient interface to a NotifRing, for use by a single thread.
+ */
+typedef struct {
+
+	/* The context. */
+	gxio_mpipe_context_t *context;
+
+	/* The actual NotifRing. */
+	gxio_mpipe_idesc_t *idescs;
+
+	/* The number of entries. */
+	unsigned long num_entries;
+
+	/* The number of entries minus one. */
+	unsigned long mask_num_entries;
+
+	/* The log2() of the number of entries. */
+	unsigned long log2_num_entries;
+
+	/* The next entry. */
+	unsigned int head;
+
+	/* The NotifRing id. */
+	unsigned int ring;
+
+#ifdef __BIG_ENDIAN__
+	/* The number of byteswapped entries. */
+	unsigned int swapped;
+#endif
+
+} gxio_mpipe_iqueue_t;
+
+/* Initialize an "iqueue".
+ *
+ * Takes the iqueue plus the same args as gxio_mpipe_init_notif_ring().
+ */
+extern int gxio_mpipe_iqueue_init(gxio_mpipe_iqueue_t *iqueue,
+				  gxio_mpipe_context_t *context,
+				  unsigned int ring,
+				  void *mem, size_t mem_size,
+				  unsigned int mem_flags);
+
+/* Advance over some old entries in an iqueue.
+ *
+ * Please see the documentation for gxio_mpipe_iqueue_consume().
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param count The number of entries to advance over.
+ */
+static inline void gxio_mpipe_iqueue_advance(gxio_mpipe_iqueue_t *iqueue,
+					     int count)
+{
+	/* Advance with proper wrap. */
+	int head = iqueue->head + count;
+	iqueue->head =
+		(head & iqueue->mask_num_entries) +
+		(head >> iqueue->log2_num_entries);
+
+#ifdef __BIG_ENDIAN__
+	/* HACK: Track swapped entries. */
+	iqueue->swapped -= count;
+#endif
+}
+
+/* Release the ring and bucket for an old entry in an iqueue.
+ *
+ * Releasing the ring allows more packets to be delivered to the ring.
+ *
+ * Releasing the bucket allows flows using the bucket to be moved to a
+ * new ring when using GXIO_MPIPE_BUCKET_DYNAMIC_FLOW_AFFINITY.
+ *
+ * This function is shorthand for "gxio_mpipe_credit(iqueue->context,
+ * iqueue->ring, idesc->bucket_id, 1)", and it may be more convenient
+ * to make that underlying call, using those values, instead of
+ * tracking the entire "idesc".
+ *
+ * If packet processing is deferred, optimal performance requires that
+ * the releasing be deferred as well.
+ *
+ * Please see the documentation for gxio_mpipe_iqueue_consume().
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param idesc The descriptor which was processed.
+ */
+static inline void gxio_mpipe_iqueue_release(gxio_mpipe_iqueue_t *iqueue,
+					     gxio_mpipe_idesc_t *idesc)
+{
+	gxio_mpipe_credit(iqueue->context, iqueue->ring, idesc->bucket_id, 1);
+}
+
+/* Consume a packet from an "iqueue".
+ *
+ * After processing packets peeked at via gxio_mpipe_iqueue_peek()
+ * or gxio_mpipe_iqueue_try_peek(), you must call this function, or
+ * gxio_mpipe_iqueue_advance() plus gxio_mpipe_iqueue_release(), to
+ * advance over those entries, and release their rings and buckets.
+ *
+ * You may call this function as each packet is processed, or you can
+ * wait until several packets have been processed.
+ *
+ * Note that if you are using a single bucket, and you are handling
+ * batches of N packets, then you can replace several calls to this
+ * function with calls to "gxio_mpipe_iqueue_advance(iqueue, N)" and
+ * "gxio_mpipe_credit(iqueue->context, iqueue->ring, bucket, N)".
+ *
+ * Note that if your classifier sets "idesc->nr", then you should
+ * explicitly call "gxio_mpipe_iqueue_advance(iqueue, idesc)" plus
+ * "gxio_mpipe_credit(iqueue->context, iqueue->ring, -1, 1)", to
+ * avoid incorrectly crediting the (unused) bucket.
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param idesc The descriptor which was processed.
+ */
+static inline void gxio_mpipe_iqueue_consume(gxio_mpipe_iqueue_t *iqueue,
+					     gxio_mpipe_idesc_t *idesc)
+{
+	gxio_mpipe_iqueue_advance(iqueue, 1);
+	gxio_mpipe_iqueue_release(iqueue, idesc);
+}
+
+/* Peek at the next packet(s) in an "iqueue", without waiting.
+ *
+ * If no packets are available, fills idesc_ref with NULL, and then
+ * returns ::GXIO_MPIPE_ERR_IQUEUE_EMPTY.  Otherwise, fills idesc_ref
+ * with the address of the next valid packet descriptor, and returns
+ * the maximum number of valid descriptors which can be processed.
+ * You may process fewer descriptors if desired.
+ *
+ * Call gxio_mpipe_iqueue_consume() on each packet once it has been
+ * processed (or dropped), to allow more packets to be delivered.
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param idesc_ref A pointer to a packet descriptor pointer.
+ * @return The (positive) number of packets which can be processed,
+ * or ::GXIO_MPIPE_ERR_IQUEUE_EMPTY if no packets are available.
+ */
+static inline int gxio_mpipe_iqueue_try_peek(gxio_mpipe_iqueue_t *iqueue,
+					     gxio_mpipe_idesc_t **idesc_ref)
+{
+	gxio_mpipe_idesc_t *next;
+
+	uint64_t head = iqueue->head;
+	uint64_t tail = __gxio_mmio_read(iqueue->idescs);
+
+	/* Available entries. */
+	uint64_t avail =
+		(tail >= head) ? (tail - head) : (iqueue->num_entries - head);
+
+	if (avail == 0) {
+		*idesc_ref = NULL;
+		return GXIO_MPIPE_ERR_IQUEUE_EMPTY;
+	}
+
+	next = &iqueue->idescs[head];
+
+	/* ISSUE: Is this helpful? */
+	__insn_prefetch(next);
+
+#ifdef __BIG_ENDIAN__
+	/* HACK: Swap new entries directly in memory. */
+	{
+		int i, j;
+		for (i = iqueue->swapped; i < avail; i++) {
+			for (j = 0; j < 8; j++)
+				next[i].words[j] =
+					__builtin_bswap64(next[i].words[j]);
+		}
+		iqueue->swapped = avail;
+	}
+#endif
+
+	*idesc_ref = next;
+
+	return avail;
+}
+
+/* Drop a packet by pushing its buffer (if appropriate).
+ *
+ * NOTE: The caller must still call gxio_mpipe_iqueue_consume() if idesc
+ * came from gxio_mpipe_iqueue_try_peek() or gxio_mpipe_iqueue_peek().
+ *
+ * @param iqueue An ingress queue initialized via gxio_mpipe_iqueue_init().
+ * @param idesc A packet descriptor.
+ */
+static inline void gxio_mpipe_iqueue_drop(gxio_mpipe_iqueue_t *iqueue,
+					  gxio_mpipe_idesc_t *idesc)
+{
+	/* FIXME: Handle "chaining" properly. */
+
+	if (!idesc->be) {
+		unsigned char *va = gxio_mpipe_idesc_get_va(idesc);
+		gxio_mpipe_push_buffer(iqueue->context, idesc->stack_idx, va);
+	}
+}
+
+/*****************************************************************
+ *                      Egress Queue Wrapper                      *
+ ******************************************************************/
+
+/* A convenient, thread-safe interface to an eDMA ring. */
+typedef struct {
+
+	/* State object for tracking head and tail pointers. */
+	__gxio_dma_queue_t dma_queue;
+
+	/* The ring entries. */
+	gxio_mpipe_edesc_t *edescs;
+
+	/* The number of entries minus one. */
+	unsigned long mask_num_entries;
+
+	/* The log2() of the number of entries. */
+	unsigned long log2_num_entries;
+
+	/* The context. */
+	gxio_mpipe_context_t *context;
+
+	/* The ering. */
+	unsigned int ering;
+
+	/* The channel. */
+	unsigned int channel;
+
+} gxio_mpipe_equeue_t;
+
+/* Initialize an "equeue".
+ *
+ * This function uses gxio_mpipe_init_edma_ring() to initialize the
+ * underlying edma_ring using the provided arguments.
+ *
+ * @param equeue An egress queue to be initialized.
+ * @param context An initialized mPIPE context.
+ * @param ering The eDMA ring index.
+ * @param channel The channel to use.  This must be one of the channels
+ * associated with the context's set of open links.
+ * @param mem A physically contiguous region of memory to be filled
+ * with a ring of ::gxio_mpipe_edesc_t structures.
+ * @param mem_size Number of bytes in the ring.  Must be 512, 2048,
+ * 8192 or 65536, times 16 (i.e. sizeof(gxio_mpipe_edesc_t)).
+ * @param mem_flags ::gxio_mpipe_mem_flags_e memory flags.
+ *
+ * @return 0 on success, ::GXIO_MPIPE_ERR_BAD_EDMA_RING or
+ * ::GXIO_ERR_INVAL_MEMORY_SIZE on failure.
+ */
+extern int gxio_mpipe_equeue_init(gxio_mpipe_equeue_t *equeue,
+				  gxio_mpipe_context_t *context,
+				  unsigned int ering,
+				  unsigned int channel,
+				  void *mem, unsigned int mem_size,
+				  unsigned int mem_flags);
+
+/* Reserve completion slots for edescs.
+ *
+ * Use gxio_mpipe_equeue_put_at() to actually populate the slots.
+ *
+ * This function is slower than gxio_mpipe_equeue_reserve_fast(), but
+ * returns a full 64 bit completion slot, which can be used with
+ * gxio_mpipe_equeue_is_complete().
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param num Number of slots to reserve (must be non-zero).
+ * @return The first reserved completion slot, or a negative error code.
+ */
+static inline int64_t gxio_mpipe_equeue_reserve(gxio_mpipe_equeue_t *equeue,
+						unsigned int num)
+{
+	return __gxio_dma_queue_reserve_aux(&equeue->dma_queue, num, true);
+}
+
+/* Reserve completion slots for edescs, if possible.
+ *
+ * Use gxio_mpipe_equeue_put_at() to actually populate the slots.
+ *
+ * This function is slower than gxio_mpipe_equeue_try_reserve_fast(),
+ * but returns a full 64 bit completion slot, which can be used with
+ * gxio_mpipe_equeue_is_complete().
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param num Number of slots to reserve (must be non-zero).
+ * @return The first reserved completion slot, or a negative error code.
+ */
+static inline int64_t gxio_mpipe_equeue_try_reserve(gxio_mpipe_equeue_t
+						    *equeue, unsigned int num)
+{
+	return __gxio_dma_queue_reserve_aux(&equeue->dma_queue, num, false);
+}
+
+/* Reserve slots for edescs.
+ *
+ * Use gxio_mpipe_equeue_put_at() to actually populate the slots.
+ *
+ * This function is faster than gxio_mpipe_equeue_reserve(), but
+ * returns a 24 bit slot (instead of a 64 bit completion slot), which
+ * thus cannot be used with gxio_mpipe_equeue_is_complete().
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param num Number of slots to reserve (should be non-zero).
+ * @return The first reserved slot, or a negative error code.
+ */
+static inline int64_t gxio_mpipe_equeue_reserve_fast(gxio_mpipe_equeue_t
+						     *equeue, unsigned int num)
+{
+	return __gxio_dma_queue_reserve(&equeue->dma_queue, num, true, false);
+}
+
+/* Reserve slots for edescs, if possible.
+ *
+ * Use gxio_mpipe_equeue_put_at() to actually populate the slots.
+ *
+ * This function is faster than gxio_mpipe_equeue_try_reserve(), but
+ * returns a 24 bit slot (instead of a 64 bit completion slot), which
+ * thus cannot be used with gxio_mpipe_equeue_is_complete().
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param num Number of slots to reserve (should be non-zero).
+ * @return The first reserved slot, or a negative error code.
+ */
+static inline int64_t gxio_mpipe_equeue_try_reserve_fast(gxio_mpipe_equeue_t
+							 *equeue,
+							 unsigned int num)
+{
+	return __gxio_dma_queue_reserve(&equeue->dma_queue, num, false, false);
+}
+
+/*
+ * HACK: This helper function tricks gcc 4.6 into avoiding saving
+ * a copy of "edesc->words[0]" on the stack for no obvious reason.
+ */
+
+static inline void gxio_mpipe_equeue_put_at_aux(gxio_mpipe_equeue_t *equeue,
+						uint_reg_t ew[2],
+						unsigned long slot)
+{
+	unsigned long edma_slot = slot & equeue->mask_num_entries;
+	gxio_mpipe_edesc_t *edesc_p = &equeue->edescs[edma_slot];
+
+	/*
+	 * ISSUE: Could set eDMA ring to be on generation 1 at start, which
+	 * would avoid the negation here, perhaps allowing "__insn_bfins()".
+	 */
+	ew[0] |= !((slot >> equeue->log2_num_entries) & 1);
+
+	/*
+	 * NOTE: We use "__gxio_mpipe_write()", plus the fact that the eDMA
+	 * queue alignment restrictions ensure that these two words are on
+	 * the same cacheline, to force proper ordering between the stores.
+	 */
+	__gxio_mmio_write64(&edesc_p->words[1], ew[1]);
+	__gxio_mmio_write64(&edesc_p->words[0], ew[0]);
+}
+
+/* Post an edesc to a given slot in an equeue.
+ *
+ * This function copies the supplied edesc into entry "slot mod N" in
+ * the underlying ring, setting the "gen" bit to the appropriate value
+ * based on "(slot mod N*2)", where "N" is the size of the ring.  Note
+ * that the higher bits of slot are unused, and thus, this function
+ * can handle "slots" as well as "completion slots".
+ *
+ * Normally this function is used to fill in slots reserved by
+ * gxio_mpipe_equeue_try_reserve(), gxio_mpipe_equeue_reserve(),
+ * gxio_mpipe_equeue_try_reserve_fast(), or
+ * gxio_mpipe_equeue_reserve_fast(),
+ *
+ * This function can also be used without "reserving" slots, if the
+ * application KNOWS that the ring can never overflow, for example, by
+ * pushing fewer buffers into the buffer stacks than there are total
+ * slots in the equeue, but this is NOT recommended.
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param edesc The egress descriptor to be posted.
+ * @param slot An egress slot (only the low bits are actually used).
+ */
+static inline void gxio_mpipe_equeue_put_at(gxio_mpipe_equeue_t *equeue,
+					    gxio_mpipe_edesc_t edesc,
+					    unsigned long slot)
+{
+	gxio_mpipe_equeue_put_at_aux(equeue, edesc.words, slot);
+}
+
+/* Post an edesc to the next slot in an equeue.
+ *
+ * This is a convenience wrapper around
+ * gxio_mpipe_equeue_reserve_fast() and gxio_mpipe_equeue_put_at().
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param edesc The egress descriptor to be posted.
+ * @return 0 on success.
+ */
+static inline int gxio_mpipe_equeue_put(gxio_mpipe_equeue_t *equeue,
+					gxio_mpipe_edesc_t edesc)
+{
+	int64_t slot = gxio_mpipe_equeue_reserve_fast(equeue, 1);
+	if (slot < 0)
+		return (int)slot;
+
+	gxio_mpipe_equeue_put_at(equeue, edesc, slot);
+
+	return 0;
+}
+
+/* Ask the mPIPE hardware to egress outstanding packets immediately.
+ *
+ * This call is not necessary, but may slightly reduce overall latency.
+ *
+ * Technically, you should flush all gxio_mpipe_equeue_put_at() writes
+ * to memory before calling this function, to ensure the descriptors
+ * are visible in memory before the mPIPE hardware actually looks for
+ * them.  But this should be very rare, and the only side effect would
+ * be increased latency, so it is up to the caller to decide whether
+ * or not to flush memory.
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ */
+static inline void gxio_mpipe_equeue_flush(gxio_mpipe_equeue_t *equeue)
+{
+	/* Use "ring_idx = 0" and "count = 0" to "wake up" the eDMA ring. */
+	MPIPE_EDMA_POST_REGION_VAL_t val = { {0} };
+	/* Flush the write buffers. */
+	__insn_flushwb();
+	__gxio_mmio_write(equeue->dma_queue.post_region_addr, val.word);
+}
+
+/* Determine if a given edesc has been completed.
+ *
+ * Note that this function requires a "completion slot", and thus may
+ * NOT be used with a "slot" from gxio_mpipe_equeue_reserve_fast() or
+ * gxio_mpipe_equeue_try_reserve_fast().
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param completion_slot The completion slot used by the edesc.
+ * @param update If true, and the desc does not appear to have completed
+ * yet, then update any software cache of the hardware completion counter,
+ * and check again.  This should normally be true.
+ * @return True iff the given edesc has been completed.
+ */
+static inline int gxio_mpipe_equeue_is_complete(gxio_mpipe_equeue_t *equeue,
+						int64_t completion_slot,
+						int update)
+{
+	return __gxio_dma_queue_is_complete(&equeue->dma_queue,
+					    completion_slot, update);
+}
+
+/* Set the snf (store and forward) size for an equeue.
+ *
+ * The snf size for an equeue defaults to 1536, and encodes the size
+ * of the largest packet for which egress is guaranteed to avoid
+ * transmission underruns and/or corrupt checksums under heavy load.
+ *
+ * The snf size affects a global resource pool which cannot support,
+ * for example, all 24 equeues each requesting an snf size of 8K.
+ *
+ * To ensure that jumbo packets can be egressed properly, the snf size
+ * should be set to the size of the largest possible packet, which
+ * will usually be limited by the size of the app's largest buffer.
+ *
+ * This is a convenience wrapper around
+ * gxio_mpipe_config_edma_ring_blks().
+ *
+ * This function should not be called after any egress has been done
+ * on the equeue.
+ *
+ * @param equeue An egress queue initialized via gxio_mpipe_equeue_init().
+ * @param size The snf size, in bytes.
+ * @return Zero on success, negative error otherwise.
+ */
+static inline int gxio_mpipe_equeue_set_snf_size(gxio_mpipe_equeue_t *equeue,
+						 size_t size)
+{
+	int blks = (size + 127) / 128;
+	return gxio_mpipe_config_edma_ring_blks(equeue->context, equeue->ering,
+						blks + 1, blks, 1);
+}
+
+/*****************************************************************
+ *                        Link Management                         *
+ ******************************************************************/
+
+/*
+ *
+ * Functions for manipulating and sensing the state and configuration
+ * of physical network links.
+ *
+ * @section gxio_mpipe_link_perm Link Permissions
+ *
+ * Opening a link (with gxio_mpipe_link_open()) requests a set of link
+ * permissions, which control what may be done with the link, and potentially
+ * what permissions may be granted to other processes.
+ *
+ * Data permission allows the process to receive packets from the link by
+ * specifying the link's channel number in mPIPE packet distribution rules,
+ * and to send packets to the link by using the link's channel number as
+ * the target for an eDMA ring.
+ *
+ * Stats permission allows the process to retrieve link attributes (such as
+ * the speeds it is capable of running at, or whether it is currently up), and
+ * to read and write certain statistics-related registers in the link's MAC.
+ *
+ * Control permission allows the process to retrieve and modify link attributes
+ * (so that it may, for example, bring the link up and take it down), and
+ * read and write many registers in the link's MAC and PHY.
+ *
+ * Any permission may be requested as shared, which allows other processes
+ * to also request shared permission, or exclusive, which prevents other
+ * processes from requesting it.  In keeping with GXIO's typical usage in
+ * an embedded environment, the defaults for all permissions are shared.
+ *
+ * Permissions are granted on a first-come, first-served basis, so if two
+ * applications request an exclusive permission on the same link, the one
+ * to run first will win.  Note, however, that some system components, like
+ * the kernel Ethernet driver, may get an opportunity to open links before
+ * any applications run.
+ *
+ * @section gxio_mpipe_link_names Link Names
+ *
+ * Link names are of the form gbe<em>number</em> (for Gigabit Ethernet),
+ * xgbe<em>number</em> (for 10 Gigabit Ethernet), loop<em>number</em> (for
+ * internal mPIPE loopback), or ilk<em>number</em>/<em>channel</em>
+ * (for Interlaken links); for instance, gbe0, xgbe1, loop3, and
+ * ilk0/12 are all possible link names.  The correspondence between
+ * the link name and an mPIPE instance number or mPIPE channel number is
+ * system-dependent; all links will not exist on all systems, and the set
+ * of numbers used for a particular link type may not start at zero and may
+ * not be contiguous.  Use gxio_mpipe_link_enumerate() to retrieve the set of
+ * links which exist on a system, and always use gxio_mpipe_link_instance()
+ * to determine which mPIPE controls a particular link.
+ *
+ * Note that in some cases, links may share hardware, such as PHYs, or
+ * internal mPIPE buffers; in these cases, only one of the links may be
+ * opened at a time.  This is especially common with xgbe and gbe ports,
+ * since each xgbe port uses 4 SERDES lanes, each of which may also be
+ * configured as one gbe port.
+ *
+ * @section gxio_mpipe_link_states Link States
+ *
+ * The mPIPE link management model revolves around three different states,
+ * which are maintained for each link:
+ *
+ * 1. The <em>current</em> link state: is the link up now, and if so, at
+ *    what speed?
+ *
+ * 2. The <em>desired</em> link state: what do we want the link state to be?
+ *    The system is always working to make this state the current state;
+ *    thus, if the desired state is up, and the link is down, we'll be
+ *    constantly trying to bring it up, automatically.
+ *
+ * 3. The <em>possible</em> link state: what speeds are valid for this
+ *    particular link?  Or, in other words, what are the capabilities of
+ *    the link hardware?
+ *
+ * These link states are not, strictly speaking, related to application
+ * state; they may be manipulated at any time, whether or not the link
+ * is currently being used for data transfer.  However, for convenience,
+ * gxio_mpipe_link_open() and gxio_mpipe_link_close() (or application exit)
+ * can affect the link state.  These implicit link management operations
+ * may be modified or disabled by the use of link open flags.
+ *
+ * From an application, you can use gxio_mpipe_link_get_attr()
+ * and gxio_mpipe_link_set_attr() to manipulate the link states.
+ * gxio_mpipe_link_get_attr() with ::GXIO_MPIPE_LINK_POSSIBLE_STATE
+ * gets you the possible link state.  gxio_mpipe_link_get_attr() with
+ * ::GXIO_MPIPE_LINK_CURRENT_STATE gets you the current link state.
+ * Finally, gxio_mpipe_link_set_attr() and gxio_mpipe_link_get_attr()
+ * with ::GXIO_MPIPE_LINK_DESIRED_STATE allow you to modify or retrieve
+ * the desired link state.
+ *
+ * If you want to manage a link from a part of your application which isn't
+ * involved in packet processing, you can use the ::GXIO_MPIPE_LINK_NO_DATA
+ * flags on a gxio_mpipe_link_open() call.  This opens the link, but does
+ * not request data permission, so it does not conflict with any exclusive
+ * permissions which may be held by other processes.  You can then can use
+ * gxio_mpipe_link_get_attr() and gxio_mpipe_link_set_attr() on this link
+ * object to bring up or take down the link.
+ *
+ * Some links support link state bits which support various loopback
+ * modes. ::GXIO_MPIPE_LINK_LOOP_MAC tests datapaths within the Tile
+ * Processor itself; ::GXIO_MPIPE_LINK_LOOP_PHY tests the datapath between
+ * the Tile Processor and the external physical layer interface chip; and
+ * ::GXIO_MPIPE_LINK_LOOP_EXT tests the entire network datapath with the
+ * aid of an external loopback connector.  In addition to enabling hardware
+ * testing, such configuration can be useful for software testing, as well.
+ *
+ * When LOOP_MAC or LOOP_PHY is enabled, packets transmitted on a channel
+ * will be received by that channel, instead of being emitted on the
+ * physical link, and packets received on the physical link will be ignored.
+ * Other than that, all standard GXIO operations work as you might expect.
+ * Note that loopback operation requires that the link be brought up using
+ * one or more of the GXIO_MPIPE_LINK_SPEED_xxx link state bits.
+ *
+ * Those familiar with previous versions of the MDE on TILEPro hardware
+ * will notice significant similarities between the NetIO link management
+ * model and the mPIPE link management model.  However, the NetIO model
+ * was developed in stages, and some of its features -- for instance,
+ * the default setting of certain flags -- were shaped by the need to be
+ * compatible with previous versions of NetIO.  Since the features provided
+ * by the mPIPE hardware and the mPIPE GXIO library are significantly
+ * different than those provided by NetIO, in some cases, we have made
+ * different choices in the mPIPE link management API.  Thus, please read
+ * this documentation carefully before assuming that mPIPE link management
+ * operations are exactly equivalent to their NetIO counterparts.
+ */
+
+/* An object used to manage mPIPE link state and resources. */
+typedef struct {
+	/* The overall mPIPE context. */
+	gxio_mpipe_context_t *context;
+
+	/* The channel number used by this link. */
+	uint8_t channel;
+
+	/* The MAC index used by this link. */
+	uint8_t mac;
+} gxio_mpipe_link_t;
+
+/* Translate a link name to the instance number of the mPIPE shim which is
+ *  connected to that link.  This call does not verify whether the link is
+ *  currently available, and does not reserve any link resources;
+ *  gxio_mpipe_link_open() must be called to perform those functions.
+ *
+ *  Typically applications will call this function to translate a link name
+ *  to an mPIPE instance number; call gxio_mpipe_init(), passing it that
+ *  instance number, to initialize the mPIPE shim; and then call
+ *  gxio_mpipe_link_open(), passing it the same link name plus the mPIPE
+ *  context, to configure the link.
+ *
+ * @param link_name Name of the link; see @ref gxio_mpipe_link_names.
+ * @return The mPIPE instance number which is associated with the named
+ *  link, or a negative error code (::GXIO_ERR_NO_DEVICE) if the link does
+ *  not exist.
+ */
+extern int gxio_mpipe_link_instance(const char *link_name);
+
+/* Retrieve one of this system's legal link names, and its MAC address.
+ *
+ * @param index Link name index.  If a system supports N legal link names,
+ *  then indices between 0 and N - 1, inclusive, each correspond to one of
+ *  those names.  Thus, to retrieve all of a system's legal link names,
+ *  call this function in a loop, starting with an index of zero, and
+ *  incrementing it once per iteration until -1 is returned.
+ * @param link_name Pointer to the buffer which will receive the retrieved
+ *  link name.  The buffer should contain space for at least
+ *  ::GXIO_MPIPE_LINK_NAME_LEN bytes; the returned name, including the
+ *  terminating null byte, will be no longer than that.
+ * @param link_name Pointer to the buffer which will receive the retrieved
+ *  MAC address.  The buffer should contain space for at least 6 bytes.
+ * @return Zero if a link name was successfully retrieved; -1 if one was
+ *  not.
+ */
+extern int gxio_mpipe_link_enumerate_mac(int index, char *link_name,
+					 uint8_t *mac_addr);
+
+/* Open an mPIPE link.
+ *
+ *  A link must be opened before it may be used to send or receive packets,
+ *  and before its state may be examined or changed.  Depending up on the
+ *  link's intended use, one or more link permissions may be requested via
+ *  the flags parameter; see @ref gxio_mpipe_link_perm.  In addition, flags
+ *  may request that the link's state be modified at open time.  See @ref
+ *  gxio_mpipe_link_states and @ref gxio_mpipe_link_open_flags for more detail.
+ *
+ * @param link A link state object, which will be initialized if this
+ *  function completes successfully.
+ * @param context An initialized mPIPE context.
+ * @param link_name Name of the link.
+ * @param flags Zero or more @ref gxio_mpipe_link_open_flags, ORed together.
+ * @return 0 if the link was successfully opened, or a negative error code.
+ *
+ */
+extern int gxio_mpipe_link_open(gxio_mpipe_link_t *link,
+				gxio_mpipe_context_t *context,
+				const char *link_name, unsigned int flags);
+
+/* Close an mPIPE link.
+ *
+ *  Closing a link makes it available for use by other processes.  Once
+ *  a link has been closed, packets may no longer be sent on or received
+ *  from the link, and its state may not be examined or changed.
+ *
+ * @param link A link state object, which will no longer be initialized
+ *  if this function completes successfully.
+ * @return 0 if the link was successfully closed, or a negative error code.
+ *
+ */
+extern int gxio_mpipe_link_close(gxio_mpipe_link_t *link);
+
+/* Return a link's channel number.
+ *
+ * @param link A properly initialized link state object.
+ * @return The channel number for the link.
+ */
+static inline int gxio_mpipe_link_channel(gxio_mpipe_link_t *link)
+{
+	return link->channel;
+}
+
+/* Set a link attribute.
+ *
+ * @param link A properly initialized link state object.
+ * @param attr An attribute from the set of @ref gxio_mpipe_link_attrs.
+ * @param val New value of the attribute.
+ * @return 0 if the attribute was successfully set, or a negative error
+ *  code.
+ */
+extern int gxio_mpipe_link_set_attr(gxio_mpipe_link_t *link, uint32_t attr,
+				    int64_t val);
+
+///////////////////////////////////////////////////////////////////
+//                             Timestamp                         //
+///////////////////////////////////////////////////////////////////
+
+/* Get the timestamp of mPIPE when this routine is called.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ts A timespec structure to store the current clock.
+ * @return If the call was successful, zero; otherwise, a negative error
+ *  code.
+ */
+extern int gxio_mpipe_get_timestamp(gxio_mpipe_context_t *context,
+				    struct timespec *ts);
+
+/* Set the timestamp of mPIPE.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ts A timespec structure to store the requested clock.
+ * @return If the call was successful, zero; otherwise, a negative error
+ *  code.
+ */
+extern int gxio_mpipe_set_timestamp(gxio_mpipe_context_t *context,
+				    const struct timespec *ts);
+
+/* Adjust the timestamp of mPIPE.
+ *
+ * @param context An initialized mPIPE context.
+ * @param delta A signed time offset to adjust, in nanoseconds.
+ * The absolute value of this parameter must be less than or
+ * equal to 1000000000.
+ * @return If the call was successful, zero; otherwise, a negative error
+ *  code.
+ */
+extern int gxio_mpipe_adjust_timestamp(gxio_mpipe_context_t *context,
+				       int64_t delta);
+
+/** Adjust the mPIPE timestamp clock frequency.
+ *
+ * @param context An initialized mPIPE context.
+ * @param ppb A 32-bit signed PPB (Parts Per Billion) value to adjust.
+ * The absolute value of ppb must be less than or equal to 1000000000.
+ * Values less than about 30000 will generally cause a GXIO_ERR_INVAL
+ * return due to the granularity of the hardware that converts reference
+ * clock cycles into seconds and nanoseconds.
+ * @return If the call was successful, zero; otherwise, a negative error
+ *  code.
+ */
+extern int gxio_mpipe_adjust_timestamp_freq(gxio_mpipe_context_t* context,
+                                            int32_t ppb);
+
+#endif /* !_GXIO_MPIPE_H_ */
diff --git a/arch/tile/include/gxio/trio.h b/arch/tile/include/gxio/trio.h
new file mode 100644
index 00000000000..df10a662cc2
--- /dev/null
+++ b/arch/tile/include/gxio/trio.h
@@ -0,0 +1,298 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/*
+ *
+ * An API for allocating, configuring, and manipulating TRIO hardware
+ * resources
+ */
+
+/*
+ *
+ * The TILE-Gx TRIO shim provides connections to external devices via
+ * PCIe or other transaction IO standards.  The gxio_trio_ API,
+ * declared in <gxio/trio.h>, allows applications to allocate and
+ * configure TRIO IO resources like DMA command rings, memory map
+ * windows, and device interrupts.  The following sections introduce
+ * the various components of the API.  We strongly recommend reading
+ * the TRIO section of the IO Device Guide (UG404) before working with
+ * this API.
+ *
+ * @section trio__ingress TRIO Ingress Hardware Resources
+ *
+ * The TRIO ingress hardware is responsible for examining incoming
+ * PCIe or StreamIO packets and choosing a processing mechanism based
+ * on the packets' bus address.  The gxio_trio_ API can be used to
+ * configure different handlers for different ranges of bus address
+ * space.  The user can configure "mapped memory" and "scatter queue"
+ * regions to match incoming packets within 4kB-aligned ranges of bus
+ * addresses.  Each range specifies a different set of mapping
+ * parameters to be applied when handling the ingress packet.  The
+ * following sections describe how to work with MapMem and scatter
+ * queue regions.
+ *
+ * @subsection trio__mapmem TRIO MapMem Regions
+ *
+ * TRIO mapped memory (or MapMem) regions allow the user to map
+ * incoming read and write requests directly to the application's
+ * memory space.  MapMem regions are allocated via
+ * gxio_trio_alloc_memory_maps().  Given an integer MapMem number,
+ * applications can use gxio_trio_init_memory_map() to specify the
+ * range of bus addresses that will match the region and the range of
+ * virtual addresses to which those packets will be applied.
+ *
+ * As with many other gxio APIs, the programmer must be sure to
+ * register memory pages that will be used with MapMem regions.  Pages
+ * can be registered with TRIO by allocating an ASID (address space
+ * identifier) and then using gxio_trio_register_page() to register up to
+ * 16 pages with the hardware.  The initialization functions for
+ * resources that require registered memory (MapMem, scatter queues,
+ * push DMA, and pull DMA) then take an 'asid' parameter in order to
+ * configure which set of registered pages is used by each resource.
+ *
+ * @subsection trio__scatter_queue TRIO Scatter Queues
+ *
+ * The TRIO shim's scatter queue regions allow users to dynamically
+ * map buffers from a large address space into a small range of bus
+ * addresses.  This is particularly helpful for PCIe endpoint devices,
+ * where the host generally limits the size of BARs to tens of
+ * megabytes.
+ *
+ * Each scatter queue consists of a memory map region, a queue of
+ * tile-side buffer VAs to be mapped to that region, and a bus-mapped
+ * "doorbell" register that the remote endpoint can write to trigger a
+ * dequeue of the current buffer VA, thus swapping in a new buffer.
+ * The VAs pushed onto a scatter queue must be 4kB aligned, so
+ * applications may need to use higher-level protocols to inform
+ * remote entities that they should apply some additional, sub-4kB
+ * offset when reading or writing the scatter queue region.  For more
+ * information, see the IO Device Guide (UG404).
+ *
+ * @section trio__egress TRIO Egress Hardware Resources
+ *
+ * The TRIO shim supports two mechanisms for egress packet generation:
+ * programmed IO (PIO) and push/pull DMA.  PIO allows applications to
+ * create MMIO mappings for PCIe or StreamIO address space, such that
+ * the application can generate word-sized read or write transactions
+ * by issuing load or store instructions.  Push and pull DMA are tuned
+ * for larger transactions; they use specialized hardware engines to
+ * transfer large blocks of data at line rate.
+ *
+ * @subsection trio__pio TRIO Programmed IO
+ *
+ * Programmed IO allows applications to create MMIO mappings for PCIe
+ * or StreamIO address space.  The hardware PIO regions support access
+ * to PCIe configuration, IO, and memory space, but the gxio_trio API
+ * only supports memory space accesses.  PIO regions are allocated
+ * with gxio_trio_alloc_pio_regions() and initialized via
+ * gxio_trio_init_pio_region().  Once a region is bound to a range of
+ * bus address via the initialization function, the application can
+ * use gxio_trio_map_pio_region() to create MMIO mappings from its VA
+ * space onto the range of bus addresses supported by the PIO region.
+ *
+ * @subsection trio_dma TRIO Push and Pull DMA
+ *
+ * The TRIO push and pull DMA engines allow users to copy blocks of
+ * data between application memory and the bus.  Push DMA generates
+ * write packets that copy from application memory to the bus and pull
+ * DMA generates read packets that copy from the bus into application
+ * memory.  The DMA engines are managed via an API that is very
+ * similar to the mPIPE eDMA interface.  For a detailed explanation of
+ * the eDMA queue API, see @ref gxio_mpipe_wrappers.
+ *
+ * Push and pull DMA queues are allocated via
+ * gxio_trio_alloc_push_dma_ring() / gxio_trio_alloc_pull_dma_ring().
+ * Once allocated, users generally use a ::gxio_trio_dma_queue_t
+ * object to manage the queue, providing easy wrappers for reserving
+ * command slots in the DMA command ring, filling those slots, and
+ * waiting for commands to complete.  DMA queues can be initialized
+ * via gxio_trio_init_push_dma_queue() or
+ * gxio_trio_init_pull_dma_queue().
+ *
+ * See @ref trio/push_dma/app.c for an example of how to use push DMA.
+ *
+ * @section trio_shortcomings Plans for Future API Revisions
+ *
+ * The simulation framework is incomplete.  Future features include:
+ *
+ * - Support for reset and deallocation of resources.
+ *
+ * - Support for pull DMA.
+ *
+ * - Support for interrupt regions and user-space interrupt delivery.
+ *
+ * - Support for getting BAR mappings and reserving regions of BAR
+ *   address space.
+ */
+#ifndef _GXIO_TRIO_H_
+#define _GXIO_TRIO_H_
+
+#include <linux/types.h>
+
+#include <gxio/common.h>
+#include <gxio/dma_queue.h>
+
+#include <arch/trio_constants.h>
+#include <arch/trio.h>
+#include <arch/trio_pcie_intfc.h>
+#include <arch/trio_pcie_rc.h>
+#include <arch/trio_shm.h>
+#include <hv/drv_trio_intf.h>
+#include <hv/iorpc.h>
+
+/* A context object used to manage TRIO hardware resources. */
+typedef struct {
+
+	/* File descriptor for calling up to Linux (and thus the HV). */
+	int fd;
+
+	/* The VA at which the MAC MMIO registers are mapped. */
+	char *mmio_base_mac;
+
+	/* The VA at which the PIO config space are mapped for each PCIe MAC.
+	   Gx36 has max 3 PCIe MACs per TRIO shim. */
+	char *mmio_base_pio_cfg[TILEGX_TRIO_PCIES];
+
+#ifdef USE_SHARED_PCIE_CONFIG_REGION
+	/* Index of the shared PIO region for PCI config access. */
+	int pio_cfg_index;
+#else
+	/* Index of the PIO region for PCI config access per MAC. */
+	int pio_cfg_index[TILEGX_TRIO_PCIES];
+#endif
+
+	/*  The VA at which the push DMA MMIO registers are mapped. */
+	char *mmio_push_dma[TRIO_NUM_PUSH_DMA_RINGS];
+
+	/*  The VA at which the pull DMA MMIO registers are mapped. */
+	char *mmio_pull_dma[TRIO_NUM_PUSH_DMA_RINGS];
+
+	/* Application space ID. */
+	unsigned int asid;
+
+} gxio_trio_context_t;
+
+/* Command descriptor for push or pull DMA. */
+typedef TRIO_DMA_DESC_t gxio_trio_dma_desc_t;
+
+/* A convenient, thread-safe interface to an eDMA ring. */
+typedef struct {
+
+	/* State object for tracking head and tail pointers. */
+	__gxio_dma_queue_t dma_queue;
+
+	/* The ring entries. */
+	gxio_trio_dma_desc_t *dma_descs;
+
+	/* The number of entries minus one. */
+	unsigned long mask_num_entries;
+
+	/* The log2() of the number of entries. */
+	unsigned int log2_num_entries;
+
+} gxio_trio_dma_queue_t;
+
+/* Initialize a TRIO context.
+ *
+ * This function allocates a TRIO "service domain" and maps the MMIO
+ * registers into the the caller's VA space.
+ *
+ * @param trio_index Which TRIO shim; Gx36 must pass 0.
+ * @param context Context object to be initialized.
+ */
+extern int gxio_trio_init(gxio_trio_context_t *context,
+			  unsigned int trio_index);
+
+/* This indicates that an ASID hasn't been allocated. */
+#define GXIO_ASID_NULL -1
+
+/* Ordering modes for map memory regions and scatter queue regions. */
+typedef enum gxio_trio_order_mode_e {
+	/* Writes are not ordered.  Reads always wait for previous writes. */
+	GXIO_TRIO_ORDER_MODE_UNORDERED =
+		TRIO_MAP_MEM_SETUP__ORDER_MODE_VAL_UNORDERED,
+	/* Both writes and reads wait for previous transactions to complete. */
+	GXIO_TRIO_ORDER_MODE_STRICT =
+		TRIO_MAP_MEM_SETUP__ORDER_MODE_VAL_STRICT,
+	/* Writes are ordered unless the incoming packet has the
+	   relaxed-ordering attributes set. */
+	GXIO_TRIO_ORDER_MODE_OBEY_PACKET =
+		TRIO_MAP_MEM_SETUP__ORDER_MODE_VAL_REL_ORD
+} gxio_trio_order_mode_t;
+
+/* Initialize a memory mapping region.
+ *
+ * @param context An initialized TRIO context.
+ * @param map A Memory map region allocated by gxio_trio_alloc_memory_map().
+ * @param target_mem VA of backing memory, should be registered via
+ *   gxio_trio_register_page() and aligned to 4kB.
+ * @param target_size Length of the memory mapping, must be a multiple
+ * of 4kB.
+ * @param asid ASID to be used for Tile-side address translation.
+ * @param mac MAC number.
+ * @param bus_address Bus address at which the mapping starts.
+ * @param order_mode Memory ordering mode for this mapping.
+ * @return Zero on success, else ::GXIO_TRIO_ERR_BAD_MEMORY_MAP,
+ * GXIO_TRIO_ERR_BAD_ASID, or ::GXIO_TRIO_ERR_BAD_BUS_RANGE.
+ */
+extern int gxio_trio_init_memory_map(gxio_trio_context_t *context,
+				     unsigned int map, void *target_mem,
+				     size_t target_size, unsigned int asid,
+				     unsigned int mac, uint64_t bus_address,
+				     gxio_trio_order_mode_t order_mode);
+
+/* Flags that can be passed to resource allocation functions. */
+enum gxio_trio_alloc_flags_e {
+	GXIO_TRIO_ALLOC_FIXED = HV_TRIO_ALLOC_FIXED,
+};
+
+/* Flags that can be passed to memory registration functions. */
+enum gxio_trio_mem_flags_e {
+	/* Do not fill L3 when writing, and invalidate lines upon egress. */
+	GXIO_TRIO_MEM_FLAG_NT_HINT = IORPC_MEM_BUFFER_FLAG_NT_HINT,
+
+	/* L3 cache fills should only populate IO cache ways. */
+	GXIO_TRIO_MEM_FLAG_IO_PIN = IORPC_MEM_BUFFER_FLAG_IO_PIN,
+};
+
+/* Flag indicating a request generator uses a special traffic
+    class. */
+#define GXIO_TRIO_FLAG_TRAFFIC_CLASS(N) HV_TRIO_FLAG_TC(N)
+
+/* Flag indicating a request generator uses a virtual function
+    number. */
+#define GXIO_TRIO_FLAG_VFUNC(N) HV_TRIO_FLAG_VFUNC(N)
+
+/*****************************************************************
+ *                       Memory Registration                      *
+ ******************************************************************/
+
+/* Allocate Application Space Identifiers (ASIDs).  Each ASID can
+ * register up to 16 page translations.  ASIDs are used by memory map
+ * regions, scatter queues, and DMA queues to translate application
+ * VAs into memory system PAs.
+ *
+ * @param context An initialized TRIO context.
+ * @param count Number of ASIDs required.
+ * @param first Index of first ASID if ::GXIO_TRIO_ALLOC_FIXED flag
+ *   is set, otherwise ignored.
+ * @param flags Flag bits, including bits from ::gxio_trio_alloc_flags_e.
+ * @return Index of first ASID, or ::GXIO_TRIO_ERR_NO_ASID if allocation
+ *   failed.
+ */
+extern int gxio_trio_alloc_asids(gxio_trio_context_t *context,
+				 unsigned int count, unsigned int first,
+				 unsigned int flags);
+
+#endif /* ! _GXIO_TRIO_H_ */
diff --git a/arch/tile/include/gxio/uart.h b/arch/tile/include/gxio/uart.h
new file mode 100644
index 00000000000..438ee7e46c7
--- /dev/null
+++ b/arch/tile/include/gxio/uart.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _GXIO_UART_H_
+#define _GXIO_UART_H_
+
+#include "common.h"
+
+#include <hv/drv_uart_intf.h>
+#include <hv/iorpc.h>
+
+/*
+ *
+ * An API for manipulating UART interface.
+ */
+
+/*
+ *
+ * The Rshim allows access to the processor's UART interface.
+ */
+
+/* A context object used to manage UART resources. */
+typedef struct {
+
+	/* File descriptor for calling up to the hypervisor. */
+	int fd;
+
+	/* The VA at which our MMIO registers are mapped. */
+	char *mmio_base;
+
+} gxio_uart_context_t;
+
+/* Request UART interrupts.
+ *
+ *  Request that interrupts be delivered to a tile when the UART's
+ *  Receive FIFO is written, or the Write FIFO is read.
+ *
+ * @param context Pointer to a properly initialized gxio_uart_context_t.
+ * @param bind_cpu_x X coordinate of CPU to which interrupt will be delivered.
+ * @param bind_cpu_y Y coordinate of CPU to which interrupt will be delivered.
+ * @param bind_interrupt IPI interrupt number.
+ * @param bind_event Sub-interrupt event bit number; a negative value can
+ *  disable the interrupt.
+ * @return Zero if all of the requested UART events were successfully
+ *  configured to interrupt.
+ */
+extern int gxio_uart_cfg_interrupt(gxio_uart_context_t *context,
+				   int bind_cpu_x,
+				   int bind_cpu_y,
+				   int bind_interrupt, int bind_event);
+
+/* Initialize a UART context.
+ *
+ *  A properly initialized context must be obtained before any of the other
+ *  gxio_uart routines may be used.
+ *
+ * @param context Pointer to a gxio_uart_context_t, which will be initialized
+ *  by this routine, if it succeeds.
+ * @param uart_index Index of the UART to use.
+ * @return Zero if the context was successfully initialized, else a
+ *  GXIO_ERR_xxx error code.
+ */
+extern int gxio_uart_init(gxio_uart_context_t *context, int uart_index);
+
+/* Destroy a UART context.
+ *
+ *  Once destroyed, a context may not be used with any gxio_uart routines
+ *  other than gxio_uart_init().  After this routine returns, no further
+ *  interrupts requested on this context will be delivered.  The state and
+ *  configuration of the pins which had been attached to this context are
+ *  unchanged by this operation.
+ *
+ * @param context Pointer to a gxio_uart_context_t.
+ * @return Zero if the context was successfully destroyed, else a
+ *  GXIO_ERR_xxx error code.
+ */
+extern int gxio_uart_destroy(gxio_uart_context_t *context);
+
+/* Write UART register.
+ * @param context Pointer to a gxio_uart_context_t.
+ * @param offset UART register offset.
+ * @param word Data will be wrote to UART reigister.
+ */
+extern void gxio_uart_write(gxio_uart_context_t *context, uint64_t offset,
+			    uint64_t word);
+
+/* Read UART register.
+ * @param context Pointer to a gxio_uart_context_t.
+ * @param offset UART register offset.
+ * @return Data read from UART register.
+ */
+extern uint64_t gxio_uart_read(gxio_uart_context_t *context, uint64_t offset);
+
+#endif /* _GXIO_UART_H_ */
diff --git a/arch/tile/include/gxio/usb_host.h b/arch/tile/include/gxio/usb_host.h
new file mode 100644
index 00000000000..93c9636d2dd
--- /dev/null
+++ b/arch/tile/include/gxio/usb_host.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#ifndef _GXIO_USB_H_
+#define _GXIO_USB_H_
+
+#include <gxio/common.h>
+
+#include <hv/drv_usb_host_intf.h>
+#include <hv/iorpc.h>
+
+/*
+ *
+ * An API for manipulating general-purpose I/O pins.
+ */
+
+/*
+ *
+ * The USB shim allows access to the processor's Universal Serial Bus
+ * connections.
+ */
+
+/* A context object used to manage USB hardware resources. */
+typedef struct {
+
+	/* File descriptor for calling up to the hypervisor. */
+	int fd;
+
+	/* The VA at which our MMIO registers are mapped. */
+	char *mmio_base;
+} gxio_usb_host_context_t;
+
+/* Initialize a USB context.
+ *
+ *  A properly initialized context must be obtained before any of the other
+ *  gxio_usb_host routines may be used.
+ *
+ * @param context Pointer to a gxio_usb_host_context_t, which will be
+ *  initialized by this routine, if it succeeds.
+ * @param usb_index Index of the USB shim to use.
+ * @param is_ehci Nonzero to use the EHCI interface; zero to use the OHCI
+ *  intereface.
+ * @return Zero if the context was successfully initialized, else a
+ *  GXIO_ERR_xxx error code.
+ */
+extern int gxio_usb_host_init(gxio_usb_host_context_t *context, int usb_index,
+			      int is_ehci);
+
+/* Destroy a USB context.
+ *
+ *  Once destroyed, a context may not be used with any gxio_usb_host routines
+ *  other than gxio_usb_host_init().  After this routine returns, no further
+ *  interrupts or signals requested on this context will be delivered.  The
+ *  state and configuration of the pins which had been attached to this
+ *  context are unchanged by this operation.
+ *
+ * @param context Pointer to a gxio_usb_host_context_t.
+ * @return Zero if the context was successfully destroyed, else a
+ *  GXIO_ERR_xxx error code.
+ */
+extern int gxio_usb_host_destroy(gxio_usb_host_context_t *context);
+
+/* Retrieve the address of the shim's MMIO registers.
+ *
+ * @param context Pointer to a properly initialized gxio_usb_host_context_t.
+ * @return The address of the shim's MMIO registers.
+ */
+extern void *gxio_usb_host_get_reg_start(gxio_usb_host_context_t *context);
+
+/* Retrieve the length of the shim's MMIO registers.
+ *
+ * @param context Pointer to a properly initialized gxio_usb_host_context_t.
+ * @return The length of the shim's MMIO registers.
+ */
+extern size_t gxio_usb_host_get_reg_len(gxio_usb_host_context_t *context);
+
+#endif /* _GXIO_USB_H_ */
diff --git a/arch/tile/include/hv/drv_mpipe_intf.h b/arch/tile/include/hv/drv_mpipe_intf.h
new file mode 100644
index 00000000000..c97e416dd96
--- /dev/null
+++ b/arch/tile/include/hv/drv_mpipe_intf.h
@@ -0,0 +1,605 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * Interface definitions for the mpipe driver.
+ */
+
+#ifndef _SYS_HV_DRV_MPIPE_INTF_H
+#define _SYS_HV_DRV_MPIPE_INTF_H
+
+#include <arch/mpipe.h>
+#include <arch/mpipe_constants.h>
+
+
+/** Number of mPIPE instances supported */
+#define HV_MPIPE_INSTANCE_MAX   (2)
+
+/** Number of buffer stacks (32). */
+#define HV_MPIPE_NUM_BUFFER_STACKS \
+  (MPIPE_MMIO_INIT_DAT_GX36_1__BUFFER_STACK_MASK_WIDTH)
+
+/** Number of NotifRings (256). */
+#define HV_MPIPE_NUM_NOTIF_RINGS (MPIPE_NUM_NOTIF_RINGS)
+
+/** Number of NotifGroups (32). */
+#define HV_MPIPE_NUM_NOTIF_GROUPS (MPIPE_NUM_NOTIF_GROUPS)
+
+/** Number of buckets (4160). */
+#define HV_MPIPE_NUM_BUCKETS (MPIPE_NUM_BUCKETS)
+
+/** Number of "lo" buckets (4096). */
+#define HV_MPIPE_NUM_LO_BUCKETS 4096
+
+/** Number of "hi" buckets (64). */
+#define HV_MPIPE_NUM_HI_BUCKETS \
+  (HV_MPIPE_NUM_BUCKETS - HV_MPIPE_NUM_LO_BUCKETS)
+
+/** Number of edma rings (24). */
+#define HV_MPIPE_NUM_EDMA_RINGS \
+  (MPIPE_MMIO_INIT_DAT_GX36_1__EDMA_POST_MASK_WIDTH)
+
+
+
+
+/** A flag bit indicating a fixed resource allocation. */
+#define HV_MPIPE_ALLOC_FIXED 0x01
+
+/** Offset for the config register MMIO region. */
+#define HV_MPIPE_CONFIG_MMIO_OFFSET \
+  (MPIPE_MMIO_ADDR__REGION_VAL_CFG << MPIPE_MMIO_ADDR__REGION_SHIFT)
+
+/** Size of the config register MMIO region. */
+#define HV_MPIPE_CONFIG_MMIO_SIZE (64 * 1024)
+
+/** Offset for the config register MMIO region. */
+#define HV_MPIPE_FAST_MMIO_OFFSET \
+  (MPIPE_MMIO_ADDR__REGION_VAL_IDMA << MPIPE_MMIO_ADDR__REGION_SHIFT)
+
+/** Size of the fast register MMIO region (IDMA, EDMA, buffer stack). */
+#define HV_MPIPE_FAST_MMIO_SIZE \
+  ((MPIPE_MMIO_ADDR__REGION_VAL_BSM + 1 - MPIPE_MMIO_ADDR__REGION_VAL_IDMA) \
+   << MPIPE_MMIO_ADDR__REGION_SHIFT)
+
+
+/*
+ * Each type of resource allocation comes in quantized chunks, where
+ * XXX_BITS is the number of chunks, and XXX_RES_PER_BIT is the number
+ * of resources in each chunk.
+ */
+
+/** Number of buffer stack chunks available (32). */
+#define HV_MPIPE_ALLOC_BUFFER_STACKS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_1__BUFFER_STACK_MASK_WIDTH
+
+/** Granularity of buffer stack allocation (1). */
+#define HV_MPIPE_ALLOC_BUFFER_STACKS_RES_PER_BIT \
+  (HV_MPIPE_NUM_BUFFER_STACKS / HV_MPIPE_ALLOC_BUFFER_STACKS_BITS)
+
+/** Number of NotifRing chunks available (32). */
+#define HV_MPIPE_ALLOC_NOTIF_RINGS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_0__NOTIF_RING_MASK_WIDTH
+
+/** Granularity of NotifRing allocation (8). */
+#define HV_MPIPE_ALLOC_NOTIF_RINGS_RES_PER_BIT \
+  (HV_MPIPE_NUM_NOTIF_RINGS / HV_MPIPE_ALLOC_NOTIF_RINGS_BITS)
+
+/** Number of NotifGroup chunks available (32). */
+#define HV_MPIPE_ALLOC_NOTIF_GROUPS_BITS \
+  HV_MPIPE_NUM_NOTIF_GROUPS
+
+/** Granularity of NotifGroup allocation (1). */
+#define HV_MPIPE_ALLOC_NOTIF_GROUPS_RES_PER_BIT \
+  (HV_MPIPE_NUM_NOTIF_GROUPS / HV_MPIPE_ALLOC_NOTIF_GROUPS_BITS)
+
+/** Number of lo bucket chunks available (16). */
+#define HV_MPIPE_ALLOC_LO_BUCKETS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_0__BUCKET_RELEASE_MASK_LO_WIDTH
+
+/** Granularity of lo bucket allocation (256). */
+#define HV_MPIPE_ALLOC_LO_BUCKETS_RES_PER_BIT \
+  (HV_MPIPE_NUM_LO_BUCKETS / HV_MPIPE_ALLOC_LO_BUCKETS_BITS)
+
+/** Number of hi bucket chunks available (16). */
+#define HV_MPIPE_ALLOC_HI_BUCKETS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_0__BUCKET_RELEASE_MASK_HI_WIDTH
+
+/** Granularity of hi bucket allocation (4). */
+#define HV_MPIPE_ALLOC_HI_BUCKETS_RES_PER_BIT \
+  (HV_MPIPE_NUM_HI_BUCKETS / HV_MPIPE_ALLOC_HI_BUCKETS_BITS)
+
+/** Number of eDMA ring chunks available (24). */
+#define HV_MPIPE_ALLOC_EDMA_RINGS_BITS \
+  MPIPE_MMIO_INIT_DAT_GX36_1__EDMA_POST_MASK_WIDTH
+
+/** Granularity of eDMA ring allocation (1). */
+#define HV_MPIPE_ALLOC_EDMA_RINGS_RES_PER_BIT \
+  (HV_MPIPE_NUM_EDMA_RINGS / HV_MPIPE_ALLOC_EDMA_RINGS_BITS)
+
+
+
+
+/** Bit vector encoding which NotifRings are in a NotifGroup. */
+typedef struct
+{
+  /** The actual bits. */
+  uint64_t ring_mask[4];
+
+} gxio_mpipe_notif_group_bits_t;
+
+
+/** Another name for MPIPE_LBL_INIT_DAT_BSTS_TBL_t. */
+typedef MPIPE_LBL_INIT_DAT_BSTS_TBL_t gxio_mpipe_bucket_info_t;
+
+
+
+/** Eight buffer stack ids. */
+typedef struct
+{
+  /** The stacks. */
+  uint8_t stacks[8];
+
+} gxio_mpipe_rules_stacks_t;
+
+
+/** A destination mac address. */
+typedef struct
+{
+  /** The octets. */
+  uint8_t octets[6];
+
+} gxio_mpipe_rules_dmac_t;
+
+
+/** A vlan. */
+typedef uint16_t gxio_mpipe_rules_vlan_t;
+
+
+
+/** Maximum number of characters in a link name. */
+#define GXIO_MPIPE_LINK_NAME_LEN  32
+
+
+/** Structure holding a link name.  Only needed, and only typedef'ed,
+ *  because the IORPC stub generator only handles types which are single
+ *  words coming before the parameter name. */
+typedef struct
+{
+  /** The name itself. */
+  char name[GXIO_MPIPE_LINK_NAME_LEN];
+}
+_gxio_mpipe_link_name_t;
+
+/** Maximum number of characters in a symbol name. */
+#define GXIO_MPIPE_SYMBOL_NAME_LEN  128
+
+
+/** Structure holding a symbol name.  Only needed, and only typedef'ed,
+ *  because the IORPC stub generator only handles types which are single
+ *  words coming before the parameter name. */
+typedef struct
+{
+  /** The name itself. */
+  char name[GXIO_MPIPE_SYMBOL_NAME_LEN];
+}
+_gxio_mpipe_symbol_name_t;
+
+
+/** Structure holding a MAC address. */
+typedef struct
+{
+  /** The address. */
+  uint8_t mac[6];
+}
+_gxio_mpipe_link_mac_t;
+
+
+
+/** Request shared data permission -- that is, the ability to send and
+ *  receive packets -- on the specified link.  Other processes may also
+ *  request shared data permission on the same link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_DATA, ::GXIO_MPIPE_LINK_NO_DATA,
+ *  or ::GXIO_MPIPE_LINK_EXCL_DATA may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_DATA is assumed.
+ */
+#define GXIO_MPIPE_LINK_DATA               0x00000001UL
+
+/** Do not request data permission on the specified link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_DATA, ::GXIO_MPIPE_LINK_NO_DATA,
+ *  or ::GXIO_MPIPE_LINK_EXCL_DATA may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_DATA is assumed.
+ */
+#define GXIO_MPIPE_LINK_NO_DATA            0x00000002UL
+
+/** Request exclusive data permission -- that is, the ability to send and
+ *  receive packets -- on the specified link.  No other processes may
+ *  request data permission on this link, and if any process already has
+ *  data permission on it, this open will fail.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_DATA, ::GXIO_MPIPE_LINK_NO_DATA,
+ *  or ::GXIO_MPIPE_LINK_EXCL_DATA may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_DATA is assumed.
+ */
+#define GXIO_MPIPE_LINK_EXCL_DATA          0x00000004UL
+
+/** Request shared stats permission -- that is, the ability to read and write
+ *  registers which contain link statistics, and to get link attributes --
+ *  on the specified link.  Other processes may also request shared stats
+ *  permission on the same link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_STATS, ::GXIO_MPIPE_LINK_NO_STATS,
+ *  or ::GXIO_MPIPE_LINK_EXCL_STATS may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_STATS is assumed.
+ */
+#define GXIO_MPIPE_LINK_STATS              0x00000008UL
+
+/** Do not request stats permission on the specified link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_STATS, ::GXIO_MPIPE_LINK_NO_STATS,
+ *  or ::GXIO_MPIPE_LINK_EXCL_STATS may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_STATS is assumed.
+ */
+#define GXIO_MPIPE_LINK_NO_STATS           0x00000010UL
+
+/** Request exclusive stats permission -- that is, the ability to read and
+ *  write registers which contain link statistics, and to get link
+ *  attributes -- on the specified link.  No other processes may request
+ *  stats permission on this link, and if any process already
+ *  has stats permission on it, this open will fail.
+ *
+ *  Requesting exclusive stats permission is normally a very bad idea, since
+ *  it prevents programs like mpipe-stat from providing information on this
+ *  link.  Applications should only do this if they use MAC statistics
+ *  registers, and cannot tolerate any of the clear-on-read registers being
+ *  reset by other statistics programs.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_STATS, ::GXIO_MPIPE_LINK_NO_STATS,
+ *  or ::GXIO_MPIPE_LINK_EXCL_STATS may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_STATS is assumed.
+ */
+#define GXIO_MPIPE_LINK_EXCL_STATS         0x00000020UL
+
+/** Request shared control permission -- that is, the ability to modify link
+ *  attributes, and read and write MAC and MDIO registers -- on the
+ *  specified link.  Other processes may also request shared control
+ *  permission on the same link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_CTL, ::GXIO_MPIPE_LINK_NO_CTL,
+ *  or ::GXIO_MPIPE_LINK_EXCL_CTL may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_CTL is assumed.
+ */
+#define GXIO_MPIPE_LINK_CTL                0x00000040UL
+
+/** Do not request control permission on the specified link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_CTL, ::GXIO_MPIPE_LINK_NO_CTL,
+ *  or ::GXIO_MPIPE_LINK_EXCL_CTL may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_CTL is assumed.
+ */
+#define GXIO_MPIPE_LINK_NO_CTL             0x00000080UL
+
+/** Request exclusive control permission -- that is, the ability to modify
+ *  link attributes, and read and write MAC and MDIO registers -- on the
+ *  specified link.  No other processes may request control permission on
+ *  this link, and if any process already has control permission on it,
+ *  this open will fail.
+ *
+ *  Requesting exclusive control permission is not always a good idea, since
+ *  it prevents programs like mpipe-link from configuring the link.
+ *
+ *  No more than one of ::GXIO_MPIPE_LINK_CTL, ::GXIO_MPIPE_LINK_NO_CTL,
+ *  or ::GXIO_MPIPE_LINK_EXCL_CTL may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_CTL is assumed.
+ */
+#define GXIO_MPIPE_LINK_EXCL_CTL           0x00000100UL
+
+/** Set the desired state of the link to up, allowing any speeds which are
+ *  supported by the link hardware, as part of this open operation; do not
+ *  change the desired state of the link when it is closed or the process
+ *  exits.  No more than one of ::GXIO_MPIPE_LINK_AUTO_UP,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN, ::GXIO_MPIPE_LINK_AUTO_DOWN, or
+ *  ::GXIO_MPIPE_LINK_AUTO_NONE may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_UP            0x00000200UL
+
+/** Set the desired state of the link to up, allowing any speeds which are
+ *  supported by the link hardware, as part of this open operation; when the
+ *  link is closed or this process exits, if no other process has the link
+ *  open, set the desired state of the link to down.  No more than one of
+ *  ::GXIO_MPIPE_LINK_AUTO_UP, ::GXIO_MPIPE_LINK_AUTO_UPDOWN,
+ *  ::GXIO_MPIPE_LINK_AUTO_DOWN, or ::GXIO_MPIPE_LINK_AUTO_NONE may be
+ *  specifed in a gxio_mpipe_link_open() call.  If none are specified,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_UPDOWN        0x00000400UL
+
+/** Do not change the desired state of the link as part of the open
+ *  operation; when the link is closed or this process exits, if no other
+ *  process has the link open, set the desired state of the link to down.
+ *  No more than one of ::GXIO_MPIPE_LINK_AUTO_UP,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN, ::GXIO_MPIPE_LINK_AUTO_DOWN, or
+ *  ::GXIO_MPIPE_LINK_AUTO_NONE may be specifed in a gxio_mpipe_link_open()
+ *  call.  If none are specified, ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_DOWN          0x00000800UL
+
+/** Do not change the desired state of the link as part of the open
+ *  operation; do not change the desired state of the link when it is
+ *  closed or the process exits.  No more than one of
+ *  ::GXIO_MPIPE_LINK_AUTO_UP, ::GXIO_MPIPE_LINK_AUTO_UPDOWN,
+ *  ::GXIO_MPIPE_LINK_AUTO_DOWN, or ::GXIO_MPIPE_LINK_AUTO_NONE may be
+ *  specifed in a gxio_mpipe_link_open() call.  If none are specified,
+ *  ::GXIO_MPIPE_LINK_AUTO_UPDOWN is assumed.
+ */
+#define GXIO_MPIPE_LINK_AUTO_NONE          0x00001000UL
+
+/** Request that this open call not complete until the network link is up.
+ *  The process will wait as long as necessary for this to happen;
+ *  applications which wish to abandon waiting for the link after a
+ *  specific time period should not specify this flag when opening a link,
+ *  but should instead call gxio_mpipe_link_wait() afterward.  The link
+ *  must be opened with stats permission.  Note that this flag by itself
+ *  does not change the desired link state; if other open flags or previous
+ *  link state changes have not requested a desired state of up, the open
+ *  call will never complete.  This flag is not available to kernel
+ *  clients.
+ */
+#define GXIO_MPIPE_LINK_WAIT               0x00002000UL
+
+
+/*
+ * Note: link attributes must fit in 24 bits, since we use the top 8 bits
+ * of the IORPC offset word for the channel number.
+ */
+
+/** Determine whether jumbo frames may be received.  If this attribute's
+ *  value value is nonzero, the MAC will accept frames of up to 10240 bytes.
+ *  If the value is zero, the MAC will only accept frames of up to 1544
+ *  bytes.  The default value is zero. */
+#define GXIO_MPIPE_LINK_RECEIVE_JUMBO      0x010000
+
+/** Determine whether to send pause frames on this link if the mPIPE packet
+ *  FIFO is nearly full.  If the value is zero, pause frames are not sent.
+ *  If the value is nonzero, it is the delay value which will be sent in any
+ *  pause frames which are output, in units of 512 bit times.
+ *
+ *  Bear in mind that in almost all circumstances, the mPIPE packet FIFO
+ *  will never fill up, since mPIPE will empty it as fast as or faster than
+ *  the incoming data rate, by either delivering or dropping packets.  The
+ *  only situation in which this is not true is if the memory and cache
+ *  subsystem is extremely heavily loaded, and mPIPE cannot perform DMA of
+ *  packet data to memory in a timely fashion.  In particular, pause frames
+ *  will <em>not</em> be sent if packets cannot be delivered because
+ *  NotifRings are full, buckets are full, or buffers are not available in
+ *  a buffer stack. */
+#define GXIO_MPIPE_LINK_SEND_PAUSE         0x020000
+
+/** Determine whether to suspend output on the receipt of pause frames.
+ *  If the value is nonzero, mPIPE shim will suspend output on the link's
+ *  channel when a pause frame is received.  If the value is zero, pause
+ *  frames will be ignored.  The default value is zero. */
+#define GXIO_MPIPE_LINK_RECEIVE_PAUSE      0x030000
+
+/** Interface MAC address.  The value is a 6-byte MAC address, in the least
+ *  significant 48 bits of the value; in other words, an address which would
+ *  be printed as '12:34:56:78:90:AB' in IEEE 802 canonical format would
+ *  be returned as 0x12345678ab.
+ *
+ *  Depending upon the overall system design, a MAC address may or may not
+ *  be available for each interface.  Note that the interface's MAC address
+ *  does not limit the packets received on its channel, although the
+ *  classifier's rules could be configured to do that.  Similarly, the MAC
+ *  address is not used when transmitting packets, although applications
+ *  could certainly decide to use the assigned address as a source MAC
+ *  address when doing so.  This attribute may only be retrieved with
+ *  gxio_mpipe_link_get_attr(); it may not be modified.
+ */
+#define GXIO_MPIPE_LINK_MAC                0x040000
+
+/** Determine whether to discard egress packets on link down. If this value
+ *  is nonzero, packets sent on this link while the link is down will be
+ *  discarded.  If this value is zero, no packets will be sent on this link
+ *  while it is down.  The default value is one. */
+#define GXIO_MPIPE_LINK_DISCARD_IF_DOWN    0x050000
+
+/** Possible link state.  The value is a combination of link state flags,
+ *  ORed together, that indicate link modes which are actually supported by
+ *  the hardware.  This attribute may only be retrieved with
+ *  gxio_mpipe_link_get_attr(); it may not be modified. */
+#define GXIO_MPIPE_LINK_POSSIBLE_STATE     0x060000
+
+/** Current link state.  The value is a combination of link state flags,
+ *  ORed together, that indicate the current state of the hardware.  If the
+ *  link is down, the value ANDed with ::GXIO_MPIPE_LINK_SPEED will be zero;
+ *  if the link is up, the value ANDed with ::GXIO_MPIPE_LINK_SPEED will
+ *  result in exactly one of the speed values, indicating the current speed.
+ *  This attribute may only be retrieved with gxio_mpipe_link_get_attr(); it
+ *  may not be modified. */
+#define GXIO_MPIPE_LINK_CURRENT_STATE      0x070000
+
+/** Desired link state. The value is a conbination of flags, which specify
+ *  the desired state for the link.  With gxio_mpipe_link_set_attr(), this
+ *  will, in the background, attempt to bring up the link using whichever of
+ *  the requested flags are reasonable, or take down the link if the flags
+ *  are zero.  The actual link up or down operation may happen after this
+ *  call completes.  If the link state changes in the future, the system
+ *  will continue to try to get back to the desired link state; for
+ *  instance, if the link is brought up successfully, and then the network
+ *  cable is disconnected, the link will go down.  However, the desired
+ *  state of the link is still up, so if the cable is reconnected, the link
+ *  will be brought up again.
+ *
+ *  With gxio_mpipe_link_set_attr(), this will indicate the desired state
+ *  for the link, as set with a previous gxio_mpipe_link_set_attr() call,
+ *  or implicitly by a gxio_mpipe_link_open() or link close operation.
+ *  This may not reflect the current state of the link; to get that, use
+ *  ::GXIO_MPIPE_LINK_CURRENT_STATE.
+ */
+#define GXIO_MPIPE_LINK_DESIRED_STATE      0x080000
+
+
+
+/** Link can run, should run, or is running at 10 Mbps. */
+#define GXIO_MPIPE_LINK_10M        0x0000000000000001UL
+
+/** Link can run, should run, or is running at 100 Mbps. */
+#define GXIO_MPIPE_LINK_100M       0x0000000000000002UL
+
+/** Link can run, should run, or is running at 1 Gbps. */
+#define GXIO_MPIPE_LINK_1G         0x0000000000000004UL
+
+/** Link can run, should run, or is running at 10 Gbps. */
+#define GXIO_MPIPE_LINK_10G        0x0000000000000008UL
+
+/** Link can run, should run, or is running at 20 Gbps. */
+#define GXIO_MPIPE_LINK_20G        0x0000000000000010UL
+
+/** Link can run, should run, or is running at 25 Gbps. */
+#define GXIO_MPIPE_LINK_25G        0x0000000000000020UL
+
+/** Link can run, should run, or is running at 50 Gbps. */
+#define GXIO_MPIPE_LINK_50G        0x0000000000000040UL
+
+/** Link should run at the highest speed supported by the link and by
+ *  the device connected to the link.  Only usable as a value for
+ *  the link's desired state; never returned as a value for the current
+ *  or possible states. */
+#define GXIO_MPIPE_LINK_ANYSPEED   0x0000000000000800UL
+
+/** All legal link speeds.  This value is provided for use in extracting
+ *  the speed-related subset of the link state flags; it is not intended
+ *  to be set directly as a value for one of the GXIO_MPIPE_LINK_xxx_STATE
+ *  attributes.  A link is up or is requested to be up if its current or
+ *  desired state, respectively, ANDED with this value, is nonzero. */
+#define GXIO_MPIPE_LINK_SPEED_MASK 0x0000000000000FFFUL
+
+/** Link can run, should run, or is running in MAC loopback mode.  This
+ *  loops transmitted packets back to the receiver, inside the Tile
+ *  Processor. */
+#define GXIO_MPIPE_LINK_LOOP_MAC   0x0000000000001000UL
+
+/** Link can run, should run, or is running in PHY loopback mode.  This
+ *  loops transmitted packets back to the receiver, inside the external
+ *  PHY chip. */
+#define GXIO_MPIPE_LINK_LOOP_PHY   0x0000000000002000UL
+
+/** Link can run, should run, or is running in external loopback mode.
+ *  This requires that an external loopback plug be installed on the
+ *  Ethernet port.  Note that only some links require that this be
+ *  configured via the gxio_mpipe_link routines; other links can do
+ *  external loopack with the plug and no special configuration. */
+#define GXIO_MPIPE_LINK_LOOP_EXT   0x0000000000004000UL
+
+/** All legal loopback types. */
+#define GXIO_MPIPE_LINK_LOOP_MASK  0x000000000000F000UL
+
+/** Link can run, should run, or is running in full-duplex mode.
+ *  If neither ::GXIO_MPIPE_LINK_FDX nor ::GXIO_MPIPE_LINK_HDX are
+ *  specified in a set of desired state flags, both are assumed. */
+#define GXIO_MPIPE_LINK_FDX        0x0000000000010000UL
+
+/** Link can run, should run, or is running in half-duplex mode.
+ *  If neither ::GXIO_MPIPE_LINK_FDX nor ::GXIO_MPIPE_LINK_HDX are
+ *  specified in a set of desired state flags, both are assumed. */
+#define GXIO_MPIPE_LINK_HDX        0x0000000000020000UL
+
+
+/** An individual rule. */
+typedef struct
+{
+  /** The total size. */
+  uint16_t size;
+
+  /** The priority. */
+  int16_t priority;
+
+  /** The "headroom" in each buffer. */
+  uint8_t headroom;
+
+  /** The "tailroom" in each buffer. */
+  uint8_t tailroom;
+
+  /** The "capacity" of the largest buffer. */
+  uint16_t capacity;
+
+  /** The mask for converting a flow hash into a bucket. */
+  uint16_t bucket_mask;
+
+  /** The offset for converting a flow hash into a bucket. */
+  uint16_t bucket_first;
+
+  /** The buffer stack ids. */
+  gxio_mpipe_rules_stacks_t stacks;
+
+  /** The actual channels. */
+  uint32_t channel_bits;
+
+  /** The number of dmacs. */
+  uint16_t num_dmacs;
+
+  /** The number of vlans. */
+  uint16_t num_vlans;
+
+  /** The actual dmacs and vlans. */
+  uint8_t dmacs_and_vlans[];
+
+} gxio_mpipe_rules_rule_t;
+
+
+/** A list of classifier rules. */
+typedef struct
+{
+  /** The offset to the end of the current rule. */
+  uint16_t tail;
+
+  /** The offset to the start of the current rule. */
+  uint16_t head;
+
+  /** The actual rules. */
+  uint8_t rules[4096 - 4];
+
+} gxio_mpipe_rules_list_t;
+
+
+
+
+/** mPIPE statistics structure. These counters include all relevant
+ *  events occurring on all links within the mPIPE shim. */
+typedef struct
+{
+  /** Number of ingress packets dropped for any reason. */
+  uint64_t ingress_drops;
+  /** Number of ingress packets dropped because a buffer stack was empty. */
+  uint64_t ingress_drops_no_buf;
+  /** Number of ingress packets dropped or truncated due to lack of space in
+   *  the iPkt buffer. */
+  uint64_t ingress_drops_ipkt;
+  /** Number of ingress packets dropped by the classifier or load balancer */
+  uint64_t ingress_drops_cls_lb;
+  /** Total number of ingress packets. */
+  uint64_t ingress_packets;
+  /** Total number of egress packets. */
+  uint64_t egress_packets;
+  /** Total number of ingress bytes. */
+  uint64_t ingress_bytes;
+  /** Total number of egress bytes. */
+  uint64_t egress_bytes;
+}
+gxio_mpipe_stats_t;
+
+
+#endif /* _SYS_HV_DRV_MPIPE_INTF_H */
diff --git a/arch/tile/include/hv/drv_trio_intf.h b/arch/tile/include/hv/drv_trio_intf.h
new file mode 100644
index 00000000000..237e04dee66
--- /dev/null
+++ b/arch/tile/include/hv/drv_trio_intf.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * Interface definitions for the trio driver.
+ */
+
+#ifndef _SYS_HV_DRV_TRIO_INTF_H
+#define _SYS_HV_DRV_TRIO_INTF_H
+
+#include <arch/trio.h>
+
+/** The vendor ID for all Tilera processors. */
+#define TILERA_VENDOR_ID 0x1a41
+
+/** The device ID for the Gx36 processor. */
+#define TILERA_GX36_DEV_ID 0x0200
+
+/** Device ID for our internal bridge when running as RC. */
+#define TILERA_GX36_RC_DEV_ID 0x2000
+
+/** Maximum number of TRIO interfaces. */
+#define TILEGX_NUM_TRIO         2
+
+/** Gx36 has max 3 PCIe MACs per TRIO interface. */
+#define TILEGX_TRIO_PCIES       3
+
+/** Specify port properties for a PCIe MAC. */
+struct pcie_port_property
+{
+  /** If true, the link can be configured in PCIe root complex mode. */
+  uint8_t allow_rc: 1;
+
+  /** If true, the link can be configured in PCIe endpoint mode. */
+  uint8_t allow_ep: 1;
+
+  /** If true, the link can be configured in StreamIO mode. */
+  uint8_t allow_sio: 1;
+
+  /** If true, the link is allowed to support 1-lane operation. Software
+   *  will not consider it an error if the link comes up as a x1 link. */
+  uint8_t allow_x1: 1;
+
+  /** If true, the link is allowed to support 2-lane operation. Software
+   *  will not consider it an error if the link comes up as a x2 link. */
+  uint8_t allow_x2: 1;
+
+  /** If true, the link is allowed to support 4-lane operation. Software
+   *  will not consider it an error if the link comes up as a x4 link. */
+  uint8_t allow_x4: 1;
+
+  /** If true, the link is allowed to support 8-lane operation. Software
+   *  will not consider it an error if the link comes up as a x8 link. */
+  uint8_t allow_x8: 1;
+
+  /** If true, this link is connected to a device which may or may not
+   *  be present. */
+  uint8_t removable: 1;
+
+};
+
+/** Configurations can be issued to configure a char stream interrupt. */
+typedef enum pcie_stream_intr_config_sel_e
+{
+  /** Interrupt configuration for memory map regions. */
+  MEM_MAP_SEL,
+
+  /** Interrupt configuration for push DMAs. */
+  PUSH_DMA_SEL,
+
+  /** Interrupt configuration for pull DMAs. */
+  PULL_DMA_SEL,
+}
+pcie_stream_intr_config_sel_t;
+
+
+/** The mmap file offset (PA) of the TRIO config region. */
+#define HV_TRIO_CONFIG_OFFSET                                        \
+  ((unsigned long long)TRIO_MMIO_ADDRESS_SPACE__REGION_VAL_CFG <<   \
+    TRIO_MMIO_ADDRESS_SPACE__REGION_SHIFT)
+
+/** The maximum size of the TRIO config region. */
+#define HV_TRIO_CONFIG_SIZE                                 \
+  (1ULL << TRIO_CFG_REGION_ADDR__REGION_SHIFT)
+
+/** Size of the config region mapped into client. We can't use
+ *  TRIO_MMIO_ADDRESS_SPACE__OFFSET_WIDTH because it
+ *  will require the kernel to allocate 4GB VA space
+ *  from the VMALLOC region which has a total range
+ *  of 4GB.
+ */
+#define HV_TRIO_CONFIG_IOREMAP_SIZE                            \
+  ((uint64_t) 1 << TRIO_CFG_REGION_ADDR__PROT_SHIFT)
+
+/** The mmap file offset (PA) of a scatter queue region. */
+#define HV_TRIO_SQ_OFFSET(queue)                                        \
+  (((unsigned long long)TRIO_MMIO_ADDRESS_SPACE__REGION_VAL_MAP_SQ <<   \
+    TRIO_MMIO_ADDRESS_SPACE__REGION_SHIFT) |                            \
+   ((queue) << TRIO_MAP_SQ_REGION_ADDR__SQ_SEL_SHIFT))
+
+/** The maximum size of a scatter queue region. */
+#define HV_TRIO_SQ_SIZE                                 \
+  (1ULL << TRIO_MAP_SQ_REGION_ADDR__SQ_SEL_SHIFT)
+
+
+/** The "hardware MMIO region" of the first PIO region. */
+#define HV_TRIO_FIRST_PIO_REGION 8
+
+/** The mmap file offset (PA) of a PIO region. */
+#define HV_TRIO_PIO_OFFSET(region)                           \
+  (((unsigned long long)(region) + HV_TRIO_FIRST_PIO_REGION) \
+   << TRIO_PIO_REGIONS_ADDR__REGION_SHIFT)
+
+/** The maximum size of a PIO region. */
+#define HV_TRIO_PIO_SIZE (1ULL << TRIO_PIO_REGIONS_ADDR__ADDR_WIDTH)
+
+
+/** The mmap file offset (PA) of a push DMA region. */
+#define HV_TRIO_PUSH_DMA_OFFSET(ring)                                   \
+  (((unsigned long long)TRIO_MMIO_ADDRESS_SPACE__REGION_VAL_PUSH_DMA << \
+    TRIO_MMIO_ADDRESS_SPACE__REGION_SHIFT) |                            \
+   ((ring) << TRIO_PUSH_DMA_REGION_ADDR__RING_SEL_SHIFT))
+
+/** The mmap file offset (PA) of a pull DMA region. */
+#define HV_TRIO_PULL_DMA_OFFSET(ring)                                   \
+  (((unsigned long long)TRIO_MMIO_ADDRESS_SPACE__REGION_VAL_PULL_DMA << \
+    TRIO_MMIO_ADDRESS_SPACE__REGION_SHIFT) |                            \
+   ((ring) << TRIO_PULL_DMA_REGION_ADDR__RING_SEL_SHIFT))
+
+/** The maximum size of a DMA region. */
+#define HV_TRIO_DMA_REGION_SIZE                         \
+  (1ULL << TRIO_PUSH_DMA_REGION_ADDR__RING_SEL_SHIFT)
+
+
+/** The mmap file offset (PA) of a Mem-Map interrupt region. */
+#define HV_TRIO_MEM_MAP_INTR_OFFSET(map)                                 \
+  (((unsigned long long)TRIO_MMIO_ADDRESS_SPACE__REGION_VAL_MAP_MEM <<   \
+    TRIO_MMIO_ADDRESS_SPACE__REGION_SHIFT) |                            \
+   ((map) << TRIO_MAP_MEM_REGION_ADDR__MAP_SEL_SHIFT))
+
+/** The maximum size of a Mem-Map interrupt region. */
+#define HV_TRIO_MEM_MAP_INTR_SIZE                                 \
+  (1ULL << TRIO_MAP_MEM_REGION_ADDR__MAP_SEL_SHIFT)
+
+
+/** A flag bit indicating a fixed resource allocation. */
+#define HV_TRIO_ALLOC_FIXED 0x01
+
+/** TRIO requires that all mappings have 4kB aligned start addresses. */
+#define HV_TRIO_PAGE_SHIFT 12
+
+/** TRIO requires that all mappings have 4kB aligned start addresses. */
+#define HV_TRIO_PAGE_SIZE (1ull << HV_TRIO_PAGE_SHIFT)
+
+
+/* Specify all PCIe port properties for a TRIO. */
+struct pcie_trio_ports_property
+{
+  struct pcie_port_property ports[TILEGX_TRIO_PCIES];
+
+  /** Set if this TRIO belongs to a Gx72 device. */
+  uint8_t is_gx72;
+};
+
+/* Flags indicating traffic class. */
+#define HV_TRIO_FLAG_TC_SHIFT 4
+#define HV_TRIO_FLAG_TC_RMASK 0xf
+#define HV_TRIO_FLAG_TC(N) \
+  ((((N) & HV_TRIO_FLAG_TC_RMASK) + 1) << HV_TRIO_FLAG_TC_SHIFT)
+
+/* Flags indicating virtual functions. */
+#define HV_TRIO_FLAG_VFUNC_SHIFT 8
+#define HV_TRIO_FLAG_VFUNC_RMASK 0xff
+#define HV_TRIO_FLAG_VFUNC(N) \
+  ((((N) & HV_TRIO_FLAG_VFUNC_RMASK) + 1) << HV_TRIO_FLAG_VFUNC_SHIFT)
+
+
+/* Flag indicating an ordered PIO region. */
+#define HV_TRIO_PIO_FLAG_ORDERED (1 << 16)
+
+/* Flags indicating special types of PIO regions. */
+#define HV_TRIO_PIO_FLAG_SPACE_SHIFT 17
+#define HV_TRIO_PIO_FLAG_SPACE_MASK (0x3 << HV_TRIO_PIO_FLAG_SPACE_SHIFT)
+#define HV_TRIO_PIO_FLAG_CONFIG_SPACE (0x1 << HV_TRIO_PIO_FLAG_SPACE_SHIFT)
+#define HV_TRIO_PIO_FLAG_IO_SPACE (0x2 << HV_TRIO_PIO_FLAG_SPACE_SHIFT)
+
+
+#endif /* _SYS_HV_DRV_TRIO_INTF_H */
diff --git a/arch/tile/include/hv/drv_uart_intf.h b/arch/tile/include/hv/drv_uart_intf.h
new file mode 100644
index 00000000000..f5379e2404f
--- /dev/null
+++ b/arch/tile/include/hv/drv_uart_intf.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * Interface definitions for the UART driver.
+ */
+
+#ifndef _SYS_HV_DRV_UART_INTF_H
+#define _SYS_HV_DRV_UART_INTF_H
+
+#include <arch/uart.h>
+
+/** Number of UART ports supported. */
+#define TILEGX_UART_NR        2
+
+/** The mmap file offset (PA) of the UART MMIO region. */
+#define HV_UART_MMIO_OFFSET   0
+
+/** The maximum size of the UARTs MMIO region (64K Bytes). */
+#define HV_UART_MMIO_SIZE     (1UL << 16)
+
+#endif /* _SYS_HV_DRV_UART_INTF_H */
diff --git a/arch/tile/include/hv/drv_usb_host_intf.h b/arch/tile/include/hv/drv_usb_host_intf.h
new file mode 100644
index 00000000000..24ce774a3f1
--- /dev/null
+++ b/arch/tile/include/hv/drv_usb_host_intf.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * Interface definitions for the USB host driver.
+ */
+
+#ifndef _SYS_HV_DRV_USB_HOST_INTF_H
+#define _SYS_HV_DRV_USB_HOST_INTF_H
+
+#include <arch/usb_host.h>
+
+
+/** Offset for the EHCI register MMIO region. */
+#define HV_USB_HOST_MMIO_OFFSET_EHCI ((uint64_t) USB_HOST_HCCAPBASE_REG)
+
+/** Offset for the OHCI register MMIO region. */
+#define HV_USB_HOST_MMIO_OFFSET_OHCI ((uint64_t) USB_HOST_OHCD_HC_REVISION_REG)
+
+/** Size of the register MMIO region.  This turns out to be the same for
+ *  both EHCI and OHCI. */
+#define HV_USB_HOST_MMIO_SIZE ((uint64_t) 0x1000)
+
+/** The number of service domains supported by the USB host shim. */
+#define HV_USB_HOST_NUM_SVC_DOM 1
+
+
+#endif /* _SYS_HV_DRV_USB_HOST_INTF_H */
diff --git a/arch/tile/include/hv/drv_xgbe_intf.h b/arch/tile/include/hv/drv_xgbe_intf.h
index f13188ac281..2a20b266d94 100644
--- a/arch/tile/include/hv/drv_xgbe_intf.h
+++ b/arch/tile/include/hv/drv_xgbe_intf.h
@@ -460,7 +460,7 @@ typedef void* lepp_comp_t;
  *  linux's "MAX_SKB_FRAGS", and presumably over-estimates by one, for
  *  our page size of exactly 65536.  We add one for a "body" fragment.
  */
-#define LEPP_MAX_FRAGS (65536 / HV_PAGE_SIZE_SMALL + 2 + 1)
+#define LEPP_MAX_FRAGS (65536 / HV_DEFAULT_PAGE_SIZE_SMALL + 2 + 1)
 
 /** Total number of bytes needed for an lepp_tso_cmd_t. */
 #define LEPP_TSO_CMD_SIZE(num_frags, header_size) \
diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
index 72ec1e972f1..dfcdeb61ba3 100644
--- a/arch/tile/include/hv/hypervisor.h
+++ b/arch/tile/include/hv/hypervisor.h
@@ -17,8 +17,8 @@
  * The hypervisor's public API.
  */
 
-#ifndef _TILE_HV_H
-#define _TILE_HV_H
+#ifndef _HV_HV_H
+#define _HV_HV_H
 
 #include <arch/chip.h>
 
@@ -42,25 +42,45 @@
  */
 #define HV_L1_SPAN (__HV_SIZE_ONE << HV_LOG2_L1_SPAN)
 
-/** The log2 of the size of small pages, in bytes. This value should
- * be verified at runtime by calling hv_sysconf(HV_SYSCONF_PAGE_SIZE_SMALL).
+/** The log2 of the initial size of small pages, in bytes.
+ * See HV_DEFAULT_PAGE_SIZE_SMALL.
  */
-#define HV_LOG2_PAGE_SIZE_SMALL 16
+#define HV_LOG2_DEFAULT_PAGE_SIZE_SMALL 16
 
-/** The size of small pages, in bytes. This value should be verified
+/** The initial size of small pages, in bytes. This value should be verified
  * at runtime by calling hv_sysconf(HV_SYSCONF_PAGE_SIZE_SMALL).
+ * It may also be modified when installing a new context.
  */
-#define HV_PAGE_SIZE_SMALL (__HV_SIZE_ONE << HV_LOG2_PAGE_SIZE_SMALL)
+#define HV_DEFAULT_PAGE_SIZE_SMALL \
+  (__HV_SIZE_ONE << HV_LOG2_DEFAULT_PAGE_SIZE_SMALL)
 
-/** The log2 of the size of large pages, in bytes. This value should be
- * verified at runtime by calling hv_sysconf(HV_SYSCONF_PAGE_SIZE_LARGE).
+/** The log2 of the initial size of large pages, in bytes.
+ * See HV_DEFAULT_PAGE_SIZE_LARGE.
  */
-#define HV_LOG2_PAGE_SIZE_LARGE 24
+#define HV_LOG2_DEFAULT_PAGE_SIZE_LARGE 24
 
-/** The size of large pages, in bytes. This value should be verified
+/** The initial size of large pages, in bytes. This value should be verified
  * at runtime by calling hv_sysconf(HV_SYSCONF_PAGE_SIZE_LARGE).
+ * It may also be modified when installing a new context.
  */
-#define HV_PAGE_SIZE_LARGE (__HV_SIZE_ONE << HV_LOG2_PAGE_SIZE_LARGE)
+#define HV_DEFAULT_PAGE_SIZE_LARGE \
+  (__HV_SIZE_ONE << HV_LOG2_DEFAULT_PAGE_SIZE_LARGE)
+
+#if CHIP_VA_WIDTH() > 32
+
+/** The log2 of the initial size of jumbo pages, in bytes.
+ * See HV_DEFAULT_PAGE_SIZE_JUMBO.
+ */
+#define HV_LOG2_DEFAULT_PAGE_SIZE_JUMBO 32
+
+/** The initial size of jumbo pages, in bytes. This value should
+ * be verified at runtime by calling hv_sysconf(HV_SYSCONF_PAGE_SIZE_JUMBO).
+ * It may also be modified when installing a new context.
+ */
+#define HV_DEFAULT_PAGE_SIZE_JUMBO \
+  (__HV_SIZE_ONE << HV_LOG2_DEFAULT_PAGE_SIZE_JUMBO)
+
+#endif
 
 /** The log2 of the granularity at which page tables must be aligned;
  *  in other words, the CPA for a page table must have this many zero
@@ -87,7 +107,22 @@
 #define HV_DISPATCH_ENTRY_SIZE 32
 
 /** Version of the hypervisor interface defined by this file */
-#define _HV_VERSION 11
+#define _HV_VERSION 13
+
+/** Last version of the hypervisor interface with old hv_init() ABI.
+ *
+ * The change from version 12 to version 13 corresponds to launching
+ * the client by default at PL2 instead of PL1 (corresponding to the
+ * hv itself running at PL3 instead of PL2).  To make this explicit,
+ * the hv_init() API was also extended so the client can report its
+ * desired PL, resulting in a more helpful failure diagnostic.  If you
+ * call hv_init() with _HV_VERSION_OLD_HV_INIT and omit the client_pl
+ * argument, the hypervisor will assume client_pl = 1.
+ *
+ * Note that this is a deprecated solution and we do not expect to
+ * support clients of the Tilera hypervisor running at PL1 indefinitely.
+ */
+#define _HV_VERSION_OLD_HV_INIT 12
 
 /* Index into hypervisor interface dispatch code blocks.
  *
@@ -280,8 +315,14 @@
 #define HV_DISPATCH_GET_IPI_PTE                   56
 #endif
 
+/** hv_set_pte_super_shift */
+#define HV_DISPATCH_SET_PTE_SUPER_SHIFT           57
+
+/** hv_console_set_ipi */
+#define HV_DISPATCH_CONSOLE_SET_IPI               63
+
 /** One more than the largest dispatch value */
-#define _HV_DISPATCH_END                          57
+#define _HV_DISPATCH_END                          64
 
 
 #ifndef __ASSEMBLER__
@@ -354,7 +395,11 @@ typedef int HV_Errno;
 #ifndef __ASSEMBLER__
 
 /** Pass HV_VERSION to hv_init to request this version of the interface. */
-typedef enum { HV_VERSION = _HV_VERSION } HV_VersionNumber;
+typedef enum {
+  HV_VERSION = _HV_VERSION,
+  HV_VERSION_OLD_HV_INIT = _HV_VERSION_OLD_HV_INIT,
+
+} HV_VersionNumber;
 
 /** Initializes the hypervisor.
  *
@@ -362,9 +407,11 @@ typedef enum { HV_VERSION = _HV_VERSION } HV_VersionNumber;
  * that this program expects, typically HV_VERSION.
  * @param chip_num Architecture number of the chip the client was built for.
  * @param chip_rev_num Revision number of the chip the client was built for.
+ * @param client_pl Privilege level the client is built for
+ *   (not required if interface_version_number == HV_VERSION_OLD_HV_INIT).
  */
 void hv_init(HV_VersionNumber interface_version_number,
-             int chip_num, int chip_rev_num);
+             int chip_num, int chip_rev_num, int client_pl);
 
 
 /** Queries we can make for hv_sysconf().
@@ -401,7 +448,18 @@ typedef enum {
    *  that the temperature has hit an upper limit and is no longer being
    *  accurately tracked.
    */
-  HV_SYSCONF_BOARD_TEMP      = 6
+  HV_SYSCONF_BOARD_TEMP      = 6,
+
+  /** Legal page size bitmask for hv_install_context().
+   * For example, if 16KB and 64KB small pages are supported,
+   * it would return "HV_CTX_PG_SM_16K | HV_CTX_PG_SM_64K".
+   */
+  HV_SYSCONF_VALID_PAGE_SIZES = 7,
+
+  /** The size of jumbo pages, in bytes.
+   * If no jumbo pages are available, zero will be returned.
+   */
+  HV_SYSCONF_PAGE_SIZE_JUMBO = 8,
 
 } HV_SysconfQuery;
 
@@ -474,14 +532,36 @@ typedef enum {
   HV_CONFSTR_SWITCH_CONTROL  = 14,
 
   /** Chip revision level. */
-  HV_CONFSTR_CHIP_REV        = 15
+  HV_CONFSTR_CHIP_REV        = 15,
+
+  /** CPU module part number. */
+  HV_CONFSTR_CPUMOD_PART_NUM = 16,
+
+  /** CPU module serial number. */
+  HV_CONFSTR_CPUMOD_SERIAL_NUM = 17,
+
+  /** CPU module revision level. */
+  HV_CONFSTR_CPUMOD_REV      = 18,
+
+  /** Human-readable CPU module description. */
+  HV_CONFSTR_CPUMOD_DESC     = 19,
+
+  /** Per-tile hypervisor statistics.  When this identifier is specified,
+   *  the hv_confstr call takes two extra arguments.  The first is the
+   *  HV_XY_TO_LOTAR of the target tile's coordinates.  The second is
+   *  a flag word.  The only current flag is the lowest bit, which means
+   *  "zero out the stats instead of retrieving them"; in this case the
+   *  buffer and buffer length are ignored. */
+  HV_CONFSTR_HV_STATS        = 20
 
 } HV_ConfstrQuery;
 
 /** Query a configuration string from the hypervisor.
  *
  * @param query Identifier for the specific string to be retrieved
- *        (HV_CONFSTR_xxx).
+ *        (HV_CONFSTR_xxx).  Some strings may require or permit extra
+ *        arguments to be appended which select specific objects to be
+ *        described; see the string descriptions above.
  * @param buf Buffer in which to place the string.
  * @param len Length of the buffer.
  * @return If query is valid, then the length of the corresponding string,
@@ -489,7 +569,7 @@ typedef enum {
  *        was truncated.  If query is invalid, HV_EINVAL.  If the specified
  *        buffer is not writable by the client, HV_EFAULT.
  */
-int hv_confstr(HV_ConfstrQuery query, HV_VirtAddr buf, int len);
+int hv_confstr(HV_ConfstrQuery query, HV_VirtAddr buf, int len, ...);
 
 /** Tile coordinate */
 typedef struct
@@ -513,6 +593,30 @@ typedef struct
  */
 int hv_get_ipi_pte(HV_Coord tile, int pl, HV_PTE* pte);
 
+/** Configure the console interrupt.
+ *
+ * When the console client interrupt is enabled, the hypervisor will
+ * deliver the specified IPI to the client in the following situations:
+ *
+ * - The console has at least one character available for input.
+ *
+ * - The console can accept new characters for output, and the last call
+ *   to hv_console_write() did not write all of the characters requested
+ *   by the client.
+ *
+ * Note that in some system configurations, console interrupt will not
+ * be available; clients should be prepared for this routine to fail and
+ * to fall back to periodic console polling in that case.
+ *
+ * @param ipi Index of the IPI register which will receive the interrupt.
+ * @param event IPI event number for console interrupt. If less than 0,
+ *        disable the console IPI interrupt.
+ * @param coord Tile to be targeted for console interrupt.
+ * @return 0 on success, otherwise, HV_EINVAL if illegal parameter,
+ *         HV_ENOTSUP if console interrupt are not available.
+ */
+int hv_console_set_ipi(int ipi, int event, HV_Coord coord);
+
 #else /* !CHIP_HAS_IPI() */
 
 /** A set of interrupts. */
@@ -649,6 +753,12 @@ void hv_set_rtc(HV_RTCTime time);
  *  new page table does not need to contain any mapping for the
  *  hv_install_context address itself.
  *
+ *  At most one HV_CTX_PG_SM_* flag may be specified in "flags";
+ *  if multiple flags are specified, HV_EINVAL is returned.
+ *  Specifying none of the flags results in using the default page size.
+ *  All cores participating in a given client must request the same
+ *  page size, or the results are undefined.
+ *
  * @param page_table Root of the page table.
  * @param access PTE providing info on how to read the page table.  This
  *   value must be consistent between multiple tiles sharing a page table,
@@ -667,8 +777,36 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
 #define HV_CTX_DIRECTIO     0x1   /**< Direct I/O requests are accepted from
                                        PL0. */
 
+#define HV_CTX_PG_SM_4K     0x10  /**< Use 4K small pages, if available. */
+#define HV_CTX_PG_SM_16K    0x20  /**< Use 16K small pages, if available. */
+#define HV_CTX_PG_SM_64K    0x40  /**< Use 64K small pages, if available. */
+#define HV_CTX_PG_SM_MASK   0xf0  /**< Mask of all possible small pages. */
+
 #ifndef __ASSEMBLER__
 
+
+/** Set the number of pages ganged together by HV_PTE_SUPER at a
+ * particular level of the page table.
+ *
+ * The current TILE-Gx hardware only supports powers of four
+ * (i.e. log2_count must be a multiple of two), and the requested
+ * "super" page size must be less than the span of the next level in
+ * the page table.  The largest size that can be requested is 64GB.
+ *
+ * The shift value is initially "0" for all page table levels,
+ * indicating that the HV_PTE_SUPER bit is effectively ignored.
+ *
+ * If you change the count from one non-zero value to another, the
+ * hypervisor will flush the entire TLB and TSB to avoid confusion.
+ *
+ * @param level Page table level (0, 1, or 2)
+ * @param log2_count Base-2 log of the number of pages to gang together,
+ * i.e. how much to shift left the base page size for the super page size.
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+int hv_set_pte_super_shift(int level, int log2_count);
+
+
 /** Value returned from hv_inquire_context(). */
 typedef struct
 {
@@ -1238,11 +1376,14 @@ HV_Errno hv_set_command_line(HV_VirtAddr buf, int length);
  * with the existing priority pages) or "red/black" (if they don't).
  * The bitmask provides information on which parts of the cache
  * have been used for pinned pages so far on this tile; if (1 << N)
- * appears in the bitmask, that indicates that a page has been marked
- * "priority" whose PFN equals N, mod 8.
+ * appears in the bitmask, that indicates that a 4KB region of the
+ * cache starting at (N * 4KB) is in use by a "priority" page.
+ * The portion of cache used by a particular page can be computed
+ * by taking the page's PA, modulo CHIP_L2_CACHE_SIZE(), and setting
+ * all the "4KB" bits corresponding to the actual page size.
  * @param bitmask A bitmap of priority page set values
  */
-void hv_set_caching(unsigned int bitmask);
+void hv_set_caching(unsigned long bitmask);
 
 
 /** Zero out a specified number of pages.
@@ -1851,12 +1992,12 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
 #define HV_PTE_INDEX_USER            10  /**< Page is user-accessible */
 #define HV_PTE_INDEX_ACCESSED        11  /**< Page has been accessed */
 #define HV_PTE_INDEX_DIRTY           12  /**< Page has been written */
-                                         /*   Bits 13-15 are reserved for
+                                         /*   Bits 13-14 are reserved for
                                               future use. */
+#define HV_PTE_INDEX_SUPER           15  /**< Pages ganged together for TLB */
 #define HV_PTE_INDEX_MODE            16  /**< Page mode; see HV_PTE_MODE_xxx */
 #define HV_PTE_MODE_BITS              3  /**< Number of bits in mode */
-                                         /*   Bit 19 is reserved for
-                                              future use. */
+#define HV_PTE_INDEX_CLIENT2         19  /**< Page client state 2 */
 #define HV_PTE_INDEX_LOTAR           20  /**< Page's LOTAR; must be high bits
                                               of word */
 #define HV_PTE_LOTAR_BITS            12  /**< Number of bits in a LOTAR */
@@ -1869,15 +2010,6 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
                                               of word */
 #define HV_PTE_PTFN_BITS             29  /**< Number of bits in a PTFN */
 
-/** Position of the PFN field within the PTE (subset of the PTFN). */
-#define HV_PTE_INDEX_PFN (HV_PTE_INDEX_PTFN + (HV_LOG2_PAGE_SIZE_SMALL - \
-                                               HV_LOG2_PAGE_TABLE_ALIGN))
-
-/** Length of the PFN field within the PTE (subset of the PTFN). */
-#define HV_PTE_INDEX_PFN_BITS (HV_PTE_INDEX_PTFN_BITS - \
-                               (HV_LOG2_PAGE_SIZE_SMALL - \
-                                HV_LOG2_PAGE_TABLE_ALIGN))
-
 /*
  * Legal values for the PTE's mode field
  */
@@ -1957,7 +2089,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
 
 /** Does this PTE map a page?
  *
- * If this bit is set in the level-1 page table, the entry should be
+ * If this bit is set in a level-0 page table, the entry should be
+ * interpreted as a level-2 page table entry mapping a jumbo page.
+ *
+ * If this bit is set in a level-1 page table, the entry should be
  * interpreted as a level-2 page table entry mapping a large page.
  *
  * This bit should not be modified by the client while PRESENT is set, as
@@ -1967,6 +2102,18 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  */
 #define HV_PTE_PAGE                  (__HV_PTE_ONE << HV_PTE_INDEX_PAGE)
 
+/** Does this PTE implicitly reference multiple pages?
+ *
+ * If this bit is set in the page table (either in the level-2 page table,
+ * or in a higher level page table in conjunction with the PAGE bit)
+ * then the PTE specifies a range of contiguous pages, not a single page.
+ * The hv_set_pte_super_shift() allows you to specify the count for
+ * each level of the page table.
+ *
+ * Note: this bit is not supported on TILEPro systems.
+ */
+#define HV_PTE_SUPER                 (__HV_PTE_ONE << HV_PTE_INDEX_SUPER)
+
 /** Is this a global (non-ASID) mapping?
  *
  * If this bit is set, the translations established by this PTE will
@@ -2046,6 +2193,13 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  */
 #define HV_PTE_CLIENT1               (__HV_PTE_ONE << HV_PTE_INDEX_CLIENT1)
 
+/** Client-private bit in PTE.
+ *
+ * This bit is guaranteed not to be inspected or modified by the
+ * hypervisor.
+ */
+#define HV_PTE_CLIENT2               (__HV_PTE_ONE << HV_PTE_INDEX_CLIENT2)
+
 /** Non-coherent (NC) bit in PTE.
  *
  * If this bit is set, the mapping that is set up will be non-coherent
@@ -2178,8 +2332,10 @@ hv_pte_clear_##name(HV_PTE pte)                                 \
  */
 _HV_BIT(present,         PRESENT)
 _HV_BIT(page,            PAGE)
+_HV_BIT(super,           SUPER)
 _HV_BIT(client0,         CLIENT0)
 _HV_BIT(client1,         CLIENT1)
+_HV_BIT(client2,         CLIENT2)
 _HV_BIT(migrating,       MIGRATING)
 _HV_BIT(nc,              NC)
 _HV_BIT(readable,        READABLE)
@@ -2222,40 +2378,11 @@ hv_pte_set_mode(HV_PTE pte, unsigned int val)
  *
  * This field contains the upper bits of the CPA (client physical
  * address) of the target page; the complete CPA is this field with
- * HV_LOG2_PAGE_SIZE_SMALL zero bits appended to it.
- *
- * For PTEs in a level-1 page table where the Page bit is set, the
- * CPA must be aligned modulo the large page size.
- */
-static __inline unsigned int
-hv_pte_get_pfn(const HV_PTE pte)
-{
-  return pte.val >> HV_PTE_INDEX_PFN;
-}
-
-
-/** Set the page frame number into a PTE.  See hv_pte_get_pfn. */
-static __inline HV_PTE
-hv_pte_set_pfn(HV_PTE pte, unsigned int val)
-{
-  /*
-   * Note that the use of "PTFN" in the next line is intentional; we
-   * don't want any garbage lower bits left in that field.
-   */
-  pte.val &= ~(((1ULL << HV_PTE_PTFN_BITS) - 1) << HV_PTE_INDEX_PTFN);
-  pte.val |= (__hv64) val << HV_PTE_INDEX_PFN;
-  return pte;
-}
-
-/** Get the page table frame number from the PTE.
+ * HV_LOG2_PAGE_TABLE_ALIGN zero bits appended to it.
  *
- * This field contains the upper bits of the CPA (client physical
- * address) of the target page table; the complete CPA is this field with
- * with HV_PAGE_TABLE_ALIGN zero bits appended to it.
- *
- * For PTEs in a level-1 page table when the Page bit is not set, the
- * CPA must be aligned modulo the sticter of HV_PAGE_TABLE_ALIGN and
- * the level-2 page table size.
+ * For all PTEs in the lowest-level page table, and for all PTEs with
+ * the Page bit set in all page tables, the CPA must be aligned modulo
+ * the relevant page size.
  */
 static __inline unsigned long
 hv_pte_get_ptfn(const HV_PTE pte)
@@ -2263,7 +2390,6 @@ hv_pte_get_ptfn(const HV_PTE pte)
   return pte.val >> HV_PTE_INDEX_PTFN;
 }
 
-
 /** Set the page table frame number into a PTE.  See hv_pte_get_ptfn. */
 static __inline HV_PTE
 hv_pte_set_ptfn(HV_PTE pte, unsigned long val)
@@ -2273,6 +2399,20 @@ hv_pte_set_ptfn(HV_PTE pte, unsigned long val)
   return pte;
 }
 
+/** Get the client physical address from the PTE.  See hv_pte_set_ptfn. */
+static __inline HV_PhysAddr
+hv_pte_get_pa(const HV_PTE pte)
+{
+  return (__hv64) hv_pte_get_ptfn(pte) << HV_LOG2_PAGE_TABLE_ALIGN;
+}
+
+/** Set the client physical address into a PTE.  See hv_pte_get_ptfn. */
+static __inline HV_PTE
+hv_pte_set_pa(HV_PTE pte, HV_PhysAddr pa)
+{
+  return hv_pte_set_ptfn(pte, pa >> HV_LOG2_PAGE_TABLE_ALIGN);
+}
+
 
 /** Get the remote tile caching this page.
  *
@@ -2308,28 +2448,20 @@ hv_pte_set_lotar(HV_PTE pte, unsigned int val)
 
 #endif  /* !__ASSEMBLER__ */
 
-/** Converts a client physical address to a pfn. */
-#define HV_CPA_TO_PFN(p) ((p) >> HV_LOG2_PAGE_SIZE_SMALL)
-
-/** Converts a pfn to a client physical address. */
-#define HV_PFN_TO_CPA(p) (((HV_PhysAddr)(p)) << HV_LOG2_PAGE_SIZE_SMALL)
-
 /** Converts a client physical address to a ptfn. */
 #define HV_CPA_TO_PTFN(p) ((p) >> HV_LOG2_PAGE_TABLE_ALIGN)
 
 /** Converts a ptfn to a client physical address. */
 #define HV_PTFN_TO_CPA(p) (((HV_PhysAddr)(p)) << HV_LOG2_PAGE_TABLE_ALIGN)
 
-/** Converts a ptfn to a pfn. */
-#define HV_PTFN_TO_PFN(p) \
-  ((p) >> (HV_LOG2_PAGE_SIZE_SMALL - HV_LOG2_PAGE_TABLE_ALIGN))
-
-/** Converts a pfn to a ptfn. */
-#define HV_PFN_TO_PTFN(p) \
-  ((p) << (HV_LOG2_PAGE_SIZE_SMALL - HV_LOG2_PAGE_TABLE_ALIGN))
-
 #if CHIP_VA_WIDTH() > 32
 
+/*
+ * Note that we currently do not allow customizing the page size
+ * of the L0 pages, but fix them at 4GB, so we do not use the
+ * "_HV_xxx" nomenclature for the L0 macros.
+ */
+
 /** Log number of HV_PTE entries in L0 page table */
 #define HV_LOG2_L0_ENTRIES (CHIP_VA_WIDTH() - HV_LOG2_L1_SPAN)
 
@@ -2359,69 +2491,104 @@ hv_pte_set_lotar(HV_PTE pte, unsigned int val)
 #endif /* CHIP_VA_WIDTH() > 32 */
 
 /** Log number of HV_PTE entries in L1 page table */
-#define HV_LOG2_L1_ENTRIES (HV_LOG2_L1_SPAN - HV_LOG2_PAGE_SIZE_LARGE)
+#define _HV_LOG2_L1_ENTRIES(log2_page_size_large) \
+  (HV_LOG2_L1_SPAN - log2_page_size_large)
 
 /** Number of HV_PTE entries in L1 page table */
-#define HV_L1_ENTRIES (1 << HV_LOG2_L1_ENTRIES)
+#define _HV_L1_ENTRIES(log2_page_size_large) \
+  (1 << _HV_LOG2_L1_ENTRIES(log2_page_size_large))
 
 /** Log size of L1 page table in bytes */
-#define HV_LOG2_L1_SIZE (HV_LOG2_PTE_SIZE + HV_LOG2_L1_ENTRIES)
+#define _HV_LOG2_L1_SIZE(log2_page_size_large) \
+  (HV_LOG2_PTE_SIZE + _HV_LOG2_L1_ENTRIES(log2_page_size_large))
 
 /** Size of L1 page table in bytes */
-#define HV_L1_SIZE (1 << HV_LOG2_L1_SIZE)
+#define _HV_L1_SIZE(log2_page_size_large) \
+  (1 << _HV_LOG2_L1_SIZE(log2_page_size_large))
 
 /** Log number of HV_PTE entries in level-2 page table */
-#define HV_LOG2_L2_ENTRIES (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL)
+#define _HV_LOG2_L2_ENTRIES(log2_page_size_large, log2_page_size_small) \
+  (log2_page_size_large - log2_page_size_small)
 
 /** Number of HV_PTE entries in level-2 page table */
-#define HV_L2_ENTRIES (1 << HV_LOG2_L2_ENTRIES)
+#define _HV_L2_ENTRIES(log2_page_size_large, log2_page_size_small) \
+  (1 << _HV_LOG2_L2_ENTRIES(log2_page_size_large, log2_page_size_small))
 
 /** Log size of level-2 page table in bytes */
-#define HV_LOG2_L2_SIZE (HV_LOG2_PTE_SIZE + HV_LOG2_L2_ENTRIES)
+#define _HV_LOG2_L2_SIZE(log2_page_size_large, log2_page_size_small) \
+  (HV_LOG2_PTE_SIZE + \
+   _HV_LOG2_L2_ENTRIES(log2_page_size_large, log2_page_size_small))
 
 /** Size of level-2 page table in bytes */
-#define HV_L2_SIZE (1 << HV_LOG2_L2_SIZE)
+#define _HV_L2_SIZE(log2_page_size_large, log2_page_size_small) \
+  (1 << _HV_LOG2_L2_SIZE(log2_page_size_large, log2_page_size_small))
 
 #ifdef __ASSEMBLER__
 
 #if CHIP_VA_WIDTH() > 32
 
 /** Index in L1 for a specific VA */
-#define HV_L1_INDEX(va) \
-  (((va) >> HV_LOG2_PAGE_SIZE_LARGE) & (HV_L1_ENTRIES - 1))
+#define _HV_L1_INDEX(va, log2_page_size_large) \
+  (((va) >> log2_page_size_large) & (_HV_L1_ENTRIES(log2_page_size_large) - 1))
 
 #else /* CHIP_VA_WIDTH() > 32 */
 
 /** Index in L1 for a specific VA */
-#define HV_L1_INDEX(va) \
-  (((va) >> HV_LOG2_PAGE_SIZE_LARGE))
+#define _HV_L1_INDEX(va, log2_page_size_large) \
+  (((va) >> log2_page_size_large))
 
 #endif /* CHIP_VA_WIDTH() > 32 */
 
 /** Index in level-2 page table for a specific VA */
-#define HV_L2_INDEX(va) \
-  (((va) >> HV_LOG2_PAGE_SIZE_SMALL) & (HV_L2_ENTRIES - 1))
+#define _HV_L2_INDEX(va, log2_page_size_large, log2_page_size_small) \
+  (((va) >> log2_page_size_small) & \
+   (_HV_L2_ENTRIES(log2_page_size_large, log2_page_size_small) - 1))
 
 #else /* __ASSEMBLER __ */
 
 #if CHIP_VA_WIDTH() > 32
 
 /** Index in L1 for a specific VA */
-#define HV_L1_INDEX(va) \
-  (((HV_VirtAddr)(va) >> HV_LOG2_PAGE_SIZE_LARGE) & (HV_L1_ENTRIES - 1))
+#define _HV_L1_INDEX(va, log2_page_size_large) \
+  (((HV_VirtAddr)(va) >> log2_page_size_large) & \
+   (_HV_L1_ENTRIES(log2_page_size_large) - 1))
 
 #else /* CHIP_VA_WIDTH() > 32 */
 
 /** Index in L1 for a specific VA */
-#define HV_L1_INDEX(va) \
-  (((HV_VirtAddr)(va) >> HV_LOG2_PAGE_SIZE_LARGE))
+#define _HV_L1_INDEX(va, log2_page_size_large) \
+  (((HV_VirtAddr)(va) >> log2_page_size_large))
 
 #endif /* CHIP_VA_WIDTH() > 32 */
 
 /** Index in level-2 page table for a specific VA */
-#define HV_L2_INDEX(va) \
-  (((HV_VirtAddr)(va) >> HV_LOG2_PAGE_SIZE_SMALL) & (HV_L2_ENTRIES - 1))
+#define _HV_L2_INDEX(va, log2_page_size_large, log2_page_size_small) \
+  (((HV_VirtAddr)(va) >> log2_page_size_small) & \
+   (_HV_L2_ENTRIES(log2_page_size_large, log2_page_size_small) - 1))
 
 #endif /* __ASSEMBLER __ */
 
-#endif /* _TILE_HV_H */
+/** Position of the PFN field within the PTE (subset of the PTFN). */
+#define _HV_PTE_INDEX_PFN(log2_page_size) \
+  (HV_PTE_INDEX_PTFN + (log2_page_size - HV_LOG2_PAGE_TABLE_ALIGN))
+
+/** Length of the PFN field within the PTE (subset of the PTFN). */
+#define _HV_PTE_INDEX_PFN_BITS(log2_page_size) \
+  (HV_PTE_INDEX_PTFN_BITS - (log2_page_size - HV_LOG2_PAGE_TABLE_ALIGN))
+
+/** Converts a client physical address to a pfn. */
+#define _HV_CPA_TO_PFN(p, log2_page_size) ((p) >> log2_page_size)
+
+/** Converts a pfn to a client physical address. */
+#define _HV_PFN_TO_CPA(p, log2_page_size) \
+  (((HV_PhysAddr)(p)) << log2_page_size)
+
+/** Converts a ptfn to a pfn. */
+#define _HV_PTFN_TO_PFN(p, log2_page_size) \
+  ((p) >> (log2_page_size - HV_LOG2_PAGE_TABLE_ALIGN))
+
+/** Converts a pfn to a ptfn. */
+#define _HV_PFN_TO_PTFN(p, log2_page_size) \
+  ((p) << (log2_page_size - HV_LOG2_PAGE_TABLE_ALIGN))
+
+#endif /* _HV_HV_H */
diff --git a/arch/tile/include/hv/iorpc.h b/arch/tile/include/hv/iorpc.h
new file mode 100644
index 00000000000..ddf1604482b
--- /dev/null
+++ b/arch/tile/include/hv/iorpc.h
@@ -0,0 +1,714 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#ifndef _HV_IORPC_H_
+#define _HV_IORPC_H_
+
+/**
+ *
+ * Error codes and struct definitions for the IO RPC library.
+ *
+ * The hypervisor's IO RPC component provides a convenient way for
+ * driver authors to proxy system calls between user space, linux, and
+ * the hypervisor driver.  The core of the system is a set of Python
+ * files that take ".idl" files as input and generates the following
+ * source code:
+ *
+ * - _rpc_call() routines for use in userspace IO libraries.  These
+ * routines take an argument list specified in the .idl file, pack the
+ * arguments in to a buffer, and read or write that buffer via the
+ * Linux iorpc driver.
+ *
+ * - dispatch_read() and dispatch_write() routines that hypervisor
+ * drivers can use to implement most of their dev_pread() and
+ * dev_pwrite() methods.  These routines decode the incoming parameter
+ * blob, permission check and translate parameters where appropriate,
+ * and then invoke a callback routine for whichever RPC call has
+ * arrived.  The driver simply implements the set of callback
+ * routines.
+ *
+ * The IO RPC system also includes the Linux 'iorpc' driver, which
+ * proxies calls between the userspace library and the hypervisor
+ * driver.  The Linux driver is almost entirely device agnostic; it
+ * watches for special flags indicating cases where a memory buffer
+ * address might need to be translated, etc.  As a result, driver
+ * writers can avoid many of the problem cases related to registering
+ * hardware resources like memory pages or interrupts.  However, the
+ * drivers must be careful to obey the conventions documented below in
+ * order to work properly with the generic Linux iorpc driver.
+ *
+ * @section iorpc_domains Service Domains
+ *
+ * All iorpc-based drivers must support a notion of service domains.
+ * A service domain is basically an application context - state
+ * indicating resources that are allocated to that particular app
+ * which it may access and (perhaps) other applications may not
+ * access.  Drivers can support any number of service domains they
+ * choose.  In some cases the design is limited by a number of service
+ * domains supported by the IO hardware; in other cases the service
+ * domains are a purely software concept and the driver chooses a
+ * maximum number of domains based on how much state memory it is
+ * willing to preallocate.
+ *
+ * For example, the mPIPE driver only supports as many service domains
+ * as are supported by the mPIPE hardware.  This limitation is
+ * required because the hardware implements its own MMIO protection
+ * scheme to allow large MMIO mappings while still protecting small
+ * register ranges within the page that should only be accessed by the
+ * hypervisor.
+ *
+ * In contrast, drivers with no hardware service domain limitations
+ * (for instance the TRIO shim) can implement an arbitrary number of
+ * service domains.  In these cases, each service domain is limited to
+ * a carefully restricted set of legal MMIO addresses if necessary to
+ * keep one application from corrupting another application's state.
+ *
+ * @section iorpc_conventions System Call Conventions
+ *
+ * The driver's open routine is responsible for allocating a new
+ * service domain for each hv_dev_open() call.  By convention, the
+ * return value from open() should be the service domain number on
+ * success, or GXIO_ERR_NO_SVC_DOM if no more service domains are
+ * available.
+ *
+ * The implementations of hv_dev_pread() and hv_dev_pwrite() are
+ * responsible for validating the devhdl value passed up by the
+ * client.  Since the device handle returned by hv_dev_open() should
+ * embed the positive service domain number, drivers should make sure
+ * that DRV_HDL2BITS(devhdl) is a legal service domain.  If the client
+ * passes an illegal service domain number, the routine should return
+ * GXIO_ERR_INVAL_SVC_DOM.  Once the service domain number has been
+ * validated, the driver can copy to/from the client buffer and call
+ * the dispatch_read() or dispatch_write() methods created by the RPC
+ * generator.
+ *
+ * The hv_dev_close() implementation should reset all service domain
+ * state and put the service domain back on a free list for
+ * reallocation by a future application.  In most cases, this will
+ * require executing a hardware reset or drain flow and denying any
+ * MMIO regions that were created for the service domain.
+ *
+ * @section iorpc_data Special Data Types
+ *
+ * The .idl file syntax allows the creation of syscalls with special
+ * parameters that require permission checks or translations as part
+ * of the system call path.  Because of limitations in the code
+ * generator, APIs are generally limited to just one of these special
+ * parameters per system call, and they are sometimes required to be
+ * the first or last parameter to the call.  Special parameters
+ * include:
+ *
+ * @subsection iorpc_mem_buffer MEM_BUFFER
+ *
+ * The MEM_BUFFER() datatype allows user space to "register" memory
+ * buffers with a device.  Registering memory accomplishes two tasks:
+ * Linux keeps track of all buffers that might be modified by a
+ * hardware device, and the hardware device drivers bind registered
+ * buffers to particular hardware resources like ingress NotifRings.
+ * The MEM_BUFFER() idl syntax can take extra flags like ALIGN_64KB,
+ * ALIGN_SELF_SIZE, and FLAGS indicating that memory buffers must have
+ * certain alignment or that the user should be able to pass a "memory
+ * flags" word specifying attributes like nt_hint or IO cache pinning.
+ * The parser will accept multiple MEM_BUFFER() flags.
+ *
+ * Implementations must obey the following conventions when
+ * registering memory buffers via the iorpc flow.  These rules are a
+ * result of the Linux driver implementation, which needs to keep
+ * track of how many times a particular page has been registered with
+ * the hardware so that it can release the page when all those
+ * registrations are cleared.
+ *
+ * - Memory registrations that refer to a resource which has already
+ * been bound must return GXIO_ERR_ALREADY_INIT.  Thus, it is an
+ * error to register memory twice without resetting (i.e. closing) the
+ * resource in between.  This convention keeps the Linux driver from
+ * having to track which particular devices a page is bound to.
+ *
+ * - At present, a memory registration is only cleared when the
+ * service domain is reset.  In this case, the Linux driver simply
+ * closes the HV device file handle and then decrements the reference
+ * counts of all pages that were previously registered with the
+ * device.
+ *
+ * - In the future, we may add a mechanism for unregistering memory.
+ * One possible implementation would require that the user specify
+ * which buffer is currently registered.  The HV would then verify
+ * that that page was actually the one currently mapped and return
+ * success or failure to Linux, which would then only decrement the
+ * page reference count if the addresses were mapped.  Another scheme
+ * might allow Linux to pass a token to the HV to be returned when the
+ * resource is unmapped.
+ *
+ * @subsection iorpc_interrupt INTERRUPT
+ *
+ * The INTERRUPT .idl datatype allows the client to bind hardware
+ * interrupts to a particular combination of IPI parameters - CPU, IPI
+ * PL, and event bit number.  This data is passed via a special
+ * datatype so that the Linux driver can validate the CPU and PL and
+ * the HV generic iorpc code can translate client CPUs to real CPUs.
+ *
+ * @subsection iorpc_pollfd_setup POLLFD_SETUP
+ *
+ * The POLLFD_SETUP .idl datatype allows the client to set up hardware
+ * interrupt bindings which are received by Linux but which are made
+ * visible to user processes as state transitions on a file descriptor;
+ * this allows user processes to use Linux primitives, such as poll(), to
+ * await particular hardware events.  This data is passed via a special
+ * datatype so that the Linux driver may recognize the pollable file
+ * descriptor and translate it to a set of interrupt target information,
+ * and so that the HV generic iorpc code can translate client CPUs to real
+ * CPUs.
+ *
+ * @subsection iorpc_pollfd POLLFD
+ *
+ * The POLLFD .idl datatype allows manipulation of hardware interrupt
+ * bindings set up via the POLLFD_SETUP datatype; common operations are
+ * resetting the state of the requested interrupt events, and unbinding any
+ * bound interrupts.  This data is passed via a special datatype so that
+ * the Linux driver may recognize the pollable file descriptor and
+ * translate it to an interrupt identifier previously supplied by the
+ * hypervisor as the result of an earlier pollfd_setup operation.
+ *
+ * @subsection iorpc_blob BLOB
+ *
+ * The BLOB .idl datatype allows the client to write an arbitrary
+ * length string of bytes up to the hypervisor driver.  This can be
+ * useful for passing up large, arbitrarily structured data like
+ * classifier programs.  The iorpc stack takes care of validating the
+ * buffer VA and CPA as the data passes up to the hypervisor.  Unlike
+ * MEM_BUFFER(), the buffer is not registered - Linux does not bump
+ * page refcounts and the HV driver should not reuse the buffer once
+ * the system call is complete.
+ *
+ * @section iorpc_translation Translating User Space Calls
+ *
+ * The ::iorpc_offset structure describes the formatting of the offset
+ * that is passed to pread() or pwrite() as part of the generated RPC code.
+ * When the user calls up to Linux, the rpc code fills in all the fields of
+ * the offset, including a 16-bit opcode, a 16 bit format indicator, and 32
+ * bits of user-specified "sub-offset".  The opcode indicates which syscall
+ * is being requested.  The format indicates whether there is a "prefix
+ * struct" at the start of the memory buffer passed to pwrite(), and if so
+ * what data is in that prefix struct.  These prefix structs are used to
+ * implement special datatypes like MEM_BUFFER() and INTERRUPT - we arrange
+ * to put data that needs translation and permission checks at the start of
+ * the buffer so that the Linux driver and generic portions of the HV iorpc
+ * code can easily access the data.  The 32 bits of user-specified
+ * "sub-offset" are most useful for pread() calls where the user needs to
+ * also pass in a few bits indicating which register to read, etc.
+ *
+ * The Linux iorpc driver watches for system calls that contain prefix
+ * structs so that it can translate parameters and bump reference
+ * counts as appropriate.  It does not (currently) have any knowledge
+ * of the per-device opcodes - it doesn't care what operation you're
+ * doing to mPIPE, so long as it can do all the generic book-keeping.
+ * The hv/iorpc.h header file defines all of the generic encoding bits
+ * needed to translate iorpc calls without knowing which particular
+ * opcode is being issued.
+ *
+ * @section iorpc_globals Global iorpc Calls
+ *
+ * Implementing mmap() required adding some special iorpc syscalls
+ * that are only called by the Linux driver, never by userspace.
+ * These include get_mmio_base() and check_mmio_offset().  These
+ * routines are described in globals.idl and must be included in every
+ * iorpc driver.  By providing these routines in every driver, Linux's
+ * mmap implementation can easily get the PTE bits it needs and
+ * validate the PA offset without needing to know the per-device
+ * opcodes to perform those tasks.
+ *
+ * @section iorpc_kernel Supporting gxio APIs in the Kernel
+ *
+ * The iorpc code generator also supports generation of kernel code
+ * implementing the gxio APIs.  This capability is currently used by
+ * the mPIPE network driver, and will likely be used by the TRIO root
+ * complex and endpoint drivers and perhaps an in-kernel crypto
+ * driver.  Each driver that wants to instantiate iorpc calls in the
+ * kernel needs to generate a kernel version of the generate rpc code
+ * and (probably) copy any related gxio source files into the kernel.
+ * The mPIPE driver provides a good example of this pattern.
+ */
+
+#ifdef __KERNEL__
+#include <linux/stddef.h>
+#else
+#include <stddef.h>
+#endif
+
+#if defined(__HV__)
+#include <hv/hypervisor.h>
+#elif defined(__KERNEL__)
+#include <hv/hypervisor.h>
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+
+/** Code indicating translation services required within the RPC path.
+ * These indicate whether there is a translatable struct at the start
+ * of the RPC buffer and what information that struct contains.
+ */
+enum iorpc_format_e
+{
+  /** No translation required, no prefix struct. */
+  IORPC_FORMAT_NONE,
+
+  /** No translation required, no prefix struct, no access to this
+   *  operation from user space. */
+  IORPC_FORMAT_NONE_NOUSER,
+
+  /** Prefix struct contains user VA and size. */
+  IORPC_FORMAT_USER_MEM,
+
+  /** Prefix struct contains CPA, size, and homing bits. */
+  IORPC_FORMAT_KERNEL_MEM,
+
+  /** Prefix struct contains interrupt. */
+  IORPC_FORMAT_KERNEL_INTERRUPT,
+
+  /** Prefix struct contains user-level interrupt. */
+  IORPC_FORMAT_USER_INTERRUPT,
+
+  /** Prefix struct contains pollfd_setup (interrupt information). */
+  IORPC_FORMAT_KERNEL_POLLFD_SETUP,
+
+  /** Prefix struct contains user-level pollfd_setup (file descriptor). */
+  IORPC_FORMAT_USER_POLLFD_SETUP,
+
+  /** Prefix struct contains pollfd (interrupt cookie). */
+  IORPC_FORMAT_KERNEL_POLLFD,
+
+  /** Prefix struct contains user-level pollfd (file descriptor). */
+  IORPC_FORMAT_USER_POLLFD,
+};
+
+
+/** Generate an opcode given format and code. */
+#define IORPC_OPCODE(FORMAT, CODE) (((FORMAT) << 16) | (CODE))
+
+/** The offset passed through the read() and write() system calls
+    combines an opcode with 32 bits of user-specified offset. */
+union iorpc_offset
+{
+#ifndef __BIG_ENDIAN__
+  uint64_t offset;              /**< All bits. */
+
+  struct
+  {
+    uint16_t code;              /**< RPC code. */
+    uint16_t format;            /**< iorpc_format_e */
+    uint32_t sub_offset;        /**< caller-specified offset. */
+  };
+
+  uint32_t opcode;              /**< Opcode combines code & format. */
+#else
+  uint64_t offset;              /**< All bits. */
+
+  struct
+  {
+    uint32_t sub_offset;        /**< caller-specified offset. */
+    uint16_t format;            /**< iorpc_format_e */
+    uint16_t code;              /**< RPC code. */
+  };
+
+  struct
+  {
+    uint32_t padding;
+    uint32_t opcode;              /**< Opcode combines code & format. */
+  };
+#endif
+};
+
+
+/** Homing and cache hinting bits that can be used by IO devices. */
+struct iorpc_mem_attr
+{
+  unsigned int lotar_x:4;       /**< lotar X bits (or Gx page_mask). */
+  unsigned int lotar_y:4;       /**< lotar Y bits (or Gx page_offset). */
+  unsigned int hfh:1;           /**< Uses hash-for-home. */
+  unsigned int nt_hint:1;       /**< Non-temporal hint. */
+  unsigned int io_pin:1;        /**< Only fill 'IO' cache ways. */
+};
+
+/** Set the nt_hint bit. */
+#define IORPC_MEM_BUFFER_FLAG_NT_HINT (1 << 0)
+
+/** Set the IO pin bit. */
+#define IORPC_MEM_BUFFER_FLAG_IO_PIN (1 << 1)
+
+
+/** A structure used to describe memory registration.  Different
+    protection levels describe memory differently, so this union
+    contains all the different possible descriptions.  As a request
+    moves up the call chain, each layer translates from one
+    description format to the next.  In particular, the Linux iorpc
+    driver translates user VAs into CPAs and homing parameters. */
+union iorpc_mem_buffer
+{
+  struct
+  {
+    uint64_t va;                /**< User virtual address. */
+    uint64_t size;              /**< Buffer size. */
+    unsigned int flags;         /**< nt_hint, IO pin. */
+  }
+  user;                         /**< Buffer as described by user apps. */
+
+  struct
+  {
+    unsigned long long cpa;     /**< Client physical address. */
+#if defined(__KERNEL__) || defined(__HV__)
+    size_t size;                /**< Buffer size. */
+    HV_PTE pte;                 /**< PTE describing memory homing. */
+#else
+    uint64_t size;
+    uint64_t pte;
+#endif
+    unsigned int flags;         /**< nt_hint, IO pin. */
+  }
+  kernel;                       /**< Buffer as described by kernel. */
+
+  struct
+  {
+    unsigned long long pa;      /**< Physical address. */
+    size_t size;                /**< Buffer size. */
+    struct iorpc_mem_attr attr;      /**< Homing and locality hint bits. */
+  }
+  hv;                           /**< Buffer parameters for HV driver. */
+};
+
+
+/** A structure used to describe interrupts.  The format differs slightly
+ *  for user and kernel interrupts.  As with the mem_buffer_t, translation
+ *  between the formats is done at each level. */
+union iorpc_interrupt
+{
+  struct
+  {
+    int cpu;   /**< CPU. */
+    int event; /**< evt_num */
+  }
+  user;        /**< Interrupt as described by user applications. */
+
+  struct
+  {
+    int x;     /**< X coord. */
+    int y;     /**< Y coord. */
+    int ipi;   /**< int_num */
+    int event; /**< evt_num */
+  }
+  kernel;      /**< Interrupt as described by the kernel. */
+
+};
+
+
+/** A structure used to describe interrupts used with poll().  The format
+ *  differs significantly for requests from user to kernel, and kernel to
+ *  hypervisor.  As with the mem_buffer_t, translation between the formats
+ *  is done at each level. */
+union iorpc_pollfd_setup
+{
+  struct
+  {
+    int fd;    /**< Pollable file descriptor. */
+  }
+  user;        /**< pollfd_setup as described by user applications. */
+
+  struct
+  {
+    int x;     /**< X coord. */
+    int y;     /**< Y coord. */
+    int ipi;   /**< int_num */
+    int event; /**< evt_num */
+  }
+  kernel;      /**< pollfd_setup as described by the kernel. */
+
+};
+
+
+/** A structure used to describe previously set up interrupts used with
+ *  poll().  The format differs significantly for requests from user to
+ *  kernel, and kernel to hypervisor.  As with the mem_buffer_t, translation
+ *  between the formats is done at each level. */
+union iorpc_pollfd
+{
+  struct
+  {
+    int fd;    /**< Pollable file descriptor. */
+  }
+  user;        /**< pollfd as described by user applications. */
+
+  struct
+  {
+    int cookie; /**< hv cookie returned by the pollfd_setup operation. */
+  }
+  kernel;      /**< pollfd as described by the kernel. */
+
+};
+
+
+/** The various iorpc devices use error codes from -1100 to -1299.
+ *
+ * This range is distinct from netio (-700 to -799), the hypervisor
+ * (-800 to -899), tilepci (-900 to -999), ilib (-1000 to -1099),
+ * gxcr (-1300 to -1399) and gxpci (-1400 to -1499).
+ */
+enum gxio_err_e {
+
+  /** Largest iorpc error number. */
+  GXIO_ERR_MAX = -1101,
+
+
+  /********************************************************/
+  /*                   Generic Error Codes                */
+  /********************************************************/
+
+  /** Bad RPC opcode - possible version incompatibility. */
+  GXIO_ERR_OPCODE = -1101,
+
+  /** Invalid parameter. */
+  GXIO_ERR_INVAL = -1102,
+
+  /** Memory buffer did not meet alignment requirements. */
+  GXIO_ERR_ALIGNMENT = -1103,
+
+  /** Memory buffers must be coherent and cacheable. */
+  GXIO_ERR_COHERENCE = -1104,
+
+  /** Resource already initialized. */
+  GXIO_ERR_ALREADY_INIT = -1105,
+
+  /** No service domains available. */
+  GXIO_ERR_NO_SVC_DOM = -1106,
+
+  /** Illegal service domain number. */
+  GXIO_ERR_INVAL_SVC_DOM = -1107,
+
+  /** Illegal MMIO address. */
+  GXIO_ERR_MMIO_ADDRESS = -1108,
+
+  /** Illegal interrupt binding. */
+  GXIO_ERR_INTERRUPT = -1109,
+
+  /** Unreasonable client memory. */
+  GXIO_ERR_CLIENT_MEMORY = -1110,
+
+  /** No more IOTLB entries. */
+  GXIO_ERR_IOTLB_ENTRY = -1111,
+
+  /** Invalid memory size. */
+  GXIO_ERR_INVAL_MEMORY_SIZE = -1112,
+
+  /** Unsupported operation. */
+  GXIO_ERR_UNSUPPORTED_OP = -1113,
+
+  /** Insufficient DMA credits. */
+  GXIO_ERR_DMA_CREDITS = -1114,
+
+  /** Operation timed out. */
+  GXIO_ERR_TIMEOUT = -1115,
+
+  /** No such device or object. */
+  GXIO_ERR_NO_DEVICE = -1116,
+
+  /** Device or resource busy. */
+  GXIO_ERR_BUSY = -1117,
+
+  /** I/O error. */
+  GXIO_ERR_IO = -1118,
+
+  /** Permissions error. */
+  GXIO_ERR_PERM = -1119,
+
+
+
+  /********************************************************/
+  /*                 Test Device Error Codes              */
+  /********************************************************/
+
+  /** Illegal register number. */
+  GXIO_TEST_ERR_REG_NUMBER = -1120,
+
+  /** Illegal buffer slot. */
+  GXIO_TEST_ERR_BUFFER_SLOT = -1121,
+
+
+  /********************************************************/
+  /*                    MPIPE Error Codes                 */
+  /********************************************************/
+
+
+  /** Invalid buffer size. */
+  GXIO_MPIPE_ERR_INVAL_BUFFER_SIZE = -1131,
+
+  /** Cannot allocate buffer stack. */
+  GXIO_MPIPE_ERR_NO_BUFFER_STACK = -1140,
+
+  /** Invalid buffer stack number. */
+  GXIO_MPIPE_ERR_BAD_BUFFER_STACK = -1141,
+
+  /** Cannot allocate NotifRing. */
+  GXIO_MPIPE_ERR_NO_NOTIF_RING = -1142,
+
+  /** Invalid NotifRing number. */
+  GXIO_MPIPE_ERR_BAD_NOTIF_RING = -1143,
+
+  /** Cannot allocate NotifGroup. */
+  GXIO_MPIPE_ERR_NO_NOTIF_GROUP = -1144,
+
+  /** Invalid NotifGroup number. */
+  GXIO_MPIPE_ERR_BAD_NOTIF_GROUP = -1145,
+
+  /** Cannot allocate bucket. */
+  GXIO_MPIPE_ERR_NO_BUCKET = -1146,
+
+  /** Invalid bucket number. */
+  GXIO_MPIPE_ERR_BAD_BUCKET = -1147,
+
+  /** Cannot allocate eDMA ring. */
+  GXIO_MPIPE_ERR_NO_EDMA_RING = -1148,
+
+  /** Invalid eDMA ring number. */
+  GXIO_MPIPE_ERR_BAD_EDMA_RING = -1149,
+
+  /** Invalid channel number. */
+  GXIO_MPIPE_ERR_BAD_CHANNEL = -1150,
+
+  /** Bad configuration. */
+  GXIO_MPIPE_ERR_BAD_CONFIG = -1151,
+
+  /** Empty iqueue. */
+  GXIO_MPIPE_ERR_IQUEUE_EMPTY = -1152,
+
+  /** Empty rules. */
+  GXIO_MPIPE_ERR_RULES_EMPTY = -1160,
+
+  /** Full rules. */
+  GXIO_MPIPE_ERR_RULES_FULL = -1161,
+
+  /** Corrupt rules. */
+  GXIO_MPIPE_ERR_RULES_CORRUPT = -1162,
+
+  /** Invalid rules. */
+  GXIO_MPIPE_ERR_RULES_INVALID = -1163,
+
+  /** Classifier is too big. */
+  GXIO_MPIPE_ERR_CLASSIFIER_TOO_BIG = -1170,
+
+  /** Classifier is too complex. */
+  GXIO_MPIPE_ERR_CLASSIFIER_TOO_COMPLEX = -1171,
+
+  /** Classifier has bad header. */
+  GXIO_MPIPE_ERR_CLASSIFIER_BAD_HEADER = -1172,
+
+  /** Classifier has bad contents. */
+  GXIO_MPIPE_ERR_CLASSIFIER_BAD_CONTENTS = -1173,
+
+  /** Classifier encountered invalid symbol. */
+  GXIO_MPIPE_ERR_CLASSIFIER_INVAL_SYMBOL = -1174,
+
+  /** Classifier encountered invalid bounds. */
+  GXIO_MPIPE_ERR_CLASSIFIER_INVAL_BOUNDS = -1175,
+
+  /** Classifier encountered invalid relocation. */
+  GXIO_MPIPE_ERR_CLASSIFIER_INVAL_RELOCATION = -1176,
+
+  /** Classifier encountered undefined symbol. */
+  GXIO_MPIPE_ERR_CLASSIFIER_UNDEF_SYMBOL = -1177,
+
+
+  /********************************************************/
+  /*                    TRIO  Error Codes                 */
+  /********************************************************/
+
+  /** Cannot allocate memory map region. */
+  GXIO_TRIO_ERR_NO_MEMORY_MAP = -1180,
+
+  /** Invalid memory map region number. */
+  GXIO_TRIO_ERR_BAD_MEMORY_MAP = -1181,
+
+  /** Cannot allocate scatter queue. */
+  GXIO_TRIO_ERR_NO_SCATTER_QUEUE = -1182,
+
+  /** Invalid scatter queue number. */
+  GXIO_TRIO_ERR_BAD_SCATTER_QUEUE = -1183,
+
+  /** Cannot allocate push DMA ring. */
+  GXIO_TRIO_ERR_NO_PUSH_DMA_RING = -1184,
+
+  /** Invalid push DMA ring index. */
+  GXIO_TRIO_ERR_BAD_PUSH_DMA_RING = -1185,
+
+  /** Cannot allocate pull DMA ring. */
+  GXIO_TRIO_ERR_NO_PULL_DMA_RING = -1186,
+
+  /** Invalid pull DMA ring index. */
+  GXIO_TRIO_ERR_BAD_PULL_DMA_RING = -1187,
+
+  /** Cannot allocate PIO region. */
+  GXIO_TRIO_ERR_NO_PIO = -1188,
+
+  /** Invalid PIO region index. */
+  GXIO_TRIO_ERR_BAD_PIO = -1189,
+
+  /** Cannot allocate ASID. */
+  GXIO_TRIO_ERR_NO_ASID = -1190,
+
+  /** Invalid ASID. */
+  GXIO_TRIO_ERR_BAD_ASID = -1191,
+
+
+  /********************************************************/
+  /*                    MICA Error Codes                  */
+  /********************************************************/
+
+  /** No such accelerator type. */
+  GXIO_MICA_ERR_BAD_ACCEL_TYPE = -1220,
+
+  /** Cannot allocate context. */
+  GXIO_MICA_ERR_NO_CONTEXT = -1221,
+
+  /** PKA command queue is full, can't add another command. */
+  GXIO_MICA_ERR_PKA_CMD_QUEUE_FULL = -1222,
+
+  /** PKA result queue is empty, can't get a result from the queue. */
+  GXIO_MICA_ERR_PKA_RESULT_QUEUE_EMPTY = -1223,
+
+  /********************************************************/
+  /*                    GPIO Error Codes                  */
+  /********************************************************/
+
+  /** Pin not available.  Either the physical pin does not exist, or
+   *  it is reserved by the hypervisor for system usage. */
+  GXIO_GPIO_ERR_PIN_UNAVAILABLE = -1240,
+
+  /** Pin busy.  The pin exists, and is available for use via GXIO, but
+   *  it has been attached by some other process or driver. */
+  GXIO_GPIO_ERR_PIN_BUSY = -1241,
+
+  /** Cannot access unattached pin.  One or more of the pins being
+   *  manipulated by this call are not attached to the requesting
+   *  context. */
+  GXIO_GPIO_ERR_PIN_UNATTACHED = -1242,
+
+  /** Invalid I/O mode for pin.  The wiring of the pin in the system
+   *  is such that the I/O mode or electrical control parameters
+   *  requested could cause damage. */
+  GXIO_GPIO_ERR_PIN_INVALID_MODE = -1243,
+
+  /** Smallest iorpc error number. */
+  GXIO_ERR_MIN = -1299
+};
+
+
+#endif /* !_HV_IORPC_H_ */
diff --git a/arch/tile/include/uapi/arch/Kbuild b/arch/tile/include/uapi/arch/Kbuild
new file mode 100644
index 00000000000..97dfbecec6b
--- /dev/null
+++ b/arch/tile/include/uapi/arch/Kbuild
@@ -0,0 +1,17 @@
+# UAPI Header export list
+header-y += abi.h
+header-y += chip.h
+header-y += chip_tilegx.h
+header-y += chip_tilepro.h
+header-y += icache.h
+header-y += interrupts.h
+header-y += interrupts_32.h
+header-y += interrupts_64.h
+header-y += opcode.h
+header-y += opcode_tilegx.h
+header-y += opcode_tilepro.h
+header-y += sim.h
+header-y += sim_def.h
+header-y += spr_def.h
+header-y += spr_def_32.h
+header-y += spr_def_64.h
diff --git a/arch/tile/include/arch/abi.h b/arch/tile/include/uapi/arch/abi.h
index c55a3d43264..c55a3d43264 100644
--- a/arch/tile/include/arch/abi.h
+++ b/arch/tile/include/uapi/arch/abi.h
diff --git a/arch/tile/include/arch/chip.h b/arch/tile/include/uapi/arch/chip.h
index 926d3db0e91..4c91f90b936 100644
--- a/arch/tile/include/arch/chip.h
+++ b/arch/tile/include/uapi/arch/chip.h
@@ -12,9 +12,7 @@
  *   more details.
  */
 
-#if __tile_chip__ == 0
-#include <arch/chip_tile64.h>
-#elif __tile_chip__ == 1
+#if __tile_chip__ == 1
 #include <arch/chip_tilepro.h>
 #elif defined(__tilegx__)
 #include <arch/chip_tilegx.h>
diff --git a/arch/tile/include/arch/chip_tilegx.h b/arch/tile/include/uapi/arch/chip_tilegx.h
index ea8e4f2c948..ea8e4f2c948 100644
--- a/arch/tile/include/arch/chip_tilegx.h
+++ b/arch/tile/include/uapi/arch/chip_tilegx.h
diff --git a/arch/tile/include/arch/chip_tilepro.h b/arch/tile/include/uapi/arch/chip_tilepro.h
index 70017699a74..70017699a74 100644
--- a/arch/tile/include/arch/chip_tilepro.h
+++ b/arch/tile/include/uapi/arch/chip_tilepro.h
diff --git a/arch/tile/include/arch/icache.h b/arch/tile/include/uapi/arch/icache.h
index 762eafa8a11..762eafa8a11 100644
--- a/arch/tile/include/arch/icache.h
+++ b/arch/tile/include/uapi/arch/icache.h
diff --git a/arch/tile/include/arch/interrupts.h b/arch/tile/include/uapi/arch/interrupts.h
index 20f8f07d2de..20f8f07d2de 100644
--- a/arch/tile/include/arch/interrupts.h
+++ b/arch/tile/include/uapi/arch/interrupts.h
diff --git a/arch/tile/include/uapi/arch/interrupts_32.h b/arch/tile/include/uapi/arch/interrupts_32.h
new file mode 100644
index 00000000000..2efe3f68b2d
--- /dev/null
+++ b/arch/tile/include/uapi/arch/interrupts_32.h
@@ -0,0 +1,309 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef __ARCH_INTERRUPTS_H__
+#define __ARCH_INTERRUPTS_H__
+
+#ifndef __KERNEL__
+/** Mask for an interrupt. */
+/* Note: must handle breaking interrupts into high and low words manually. */
+#define INT_MASK_LO(intno) (1 << (intno))
+#define INT_MASK_HI(intno) (1 << ((intno) - 32))
+
+#ifndef __ASSEMBLER__
+#define INT_MASK(intno) (1ULL << (intno))
+#endif
+#endif
+
+
+/** Where a given interrupt executes */
+#define INTERRUPT_VECTOR(i, pl) (0xFC000000 + ((pl) << 24) + ((i) << 8))
+
+/** Where to store a vector for a given interrupt. */
+#define USER_INTERRUPT_VECTOR(i) INTERRUPT_VECTOR(i, 0)
+
+/** The base address of user-level interrupts. */
+#define USER_INTERRUPT_VECTOR_BASE INTERRUPT_VECTOR(0, 0)
+
+
+/** Additional synthetic interrupt. */
+#define INT_BREAKPOINT (63)
+
+#define INT_ITLB_MISS    0
+#define INT_MEM_ERROR    1
+#define INT_ILL    2
+#define INT_GPV    3
+#define INT_SN_ACCESS    4
+#define INT_IDN_ACCESS    5
+#define INT_UDN_ACCESS    6
+#define INT_IDN_REFILL    7
+#define INT_UDN_REFILL    8
+#define INT_IDN_COMPLETE    9
+#define INT_UDN_COMPLETE   10
+#define INT_SWINT_3   11
+#define INT_SWINT_2   12
+#define INT_SWINT_1   13
+#define INT_SWINT_0   14
+#define INT_UNALIGN_DATA   15
+#define INT_DTLB_MISS   16
+#define INT_DTLB_ACCESS   17
+#define INT_DMATLB_MISS   18
+#define INT_DMATLB_ACCESS   19
+#define INT_SNITLB_MISS   20
+#define INT_SN_NOTIFY   21
+#define INT_SN_FIREWALL   22
+#define INT_IDN_FIREWALL   23
+#define INT_UDN_FIREWALL   24
+#define INT_TILE_TIMER   25
+#define INT_IDN_TIMER   26
+#define INT_UDN_TIMER   27
+#define INT_DMA_NOTIFY   28
+#define INT_IDN_CA   29
+#define INT_UDN_CA   30
+#define INT_IDN_AVAIL   31
+#define INT_UDN_AVAIL   32
+#define INT_PERF_COUNT   33
+#define INT_INTCTRL_3   34
+#define INT_INTCTRL_2   35
+#define INT_INTCTRL_1   36
+#define INT_INTCTRL_0   37
+#define INT_BOOT_ACCESS   38
+#define INT_WORLD_ACCESS   39
+#define INT_I_ASID   40
+#define INT_D_ASID   41
+#define INT_DMA_ASID   42
+#define INT_SNI_ASID   43
+#define INT_DMA_CPL   44
+#define INT_SN_CPL   45
+#define INT_DOUBLE_FAULT   46
+#define INT_SN_STATIC_ACCESS   47
+#define INT_AUX_PERF_COUNT   48
+
+#define NUM_INTERRUPTS 49
+
+#ifndef __ASSEMBLER__
+#define QUEUED_INTERRUPTS ( \
+    (1ULL << INT_MEM_ERROR) | \
+    (1ULL << INT_DMATLB_MISS) | \
+    (1ULL << INT_DMATLB_ACCESS) | \
+    (1ULL << INT_SNITLB_MISS) | \
+    (1ULL << INT_SN_NOTIFY) | \
+    (1ULL << INT_SN_FIREWALL) | \
+    (1ULL << INT_IDN_FIREWALL) | \
+    (1ULL << INT_UDN_FIREWALL) | \
+    (1ULL << INT_TILE_TIMER) | \
+    (1ULL << INT_IDN_TIMER) | \
+    (1ULL << INT_UDN_TIMER) | \
+    (1ULL << INT_DMA_NOTIFY) | \
+    (1ULL << INT_IDN_CA) | \
+    (1ULL << INT_UDN_CA) | \
+    (1ULL << INT_IDN_AVAIL) | \
+    (1ULL << INT_UDN_AVAIL) | \
+    (1ULL << INT_PERF_COUNT) | \
+    (1ULL << INT_INTCTRL_3) | \
+    (1ULL << INT_INTCTRL_2) | \
+    (1ULL << INT_INTCTRL_1) | \
+    (1ULL << INT_INTCTRL_0) | \
+    (1ULL << INT_BOOT_ACCESS) | \
+    (1ULL << INT_WORLD_ACCESS) | \
+    (1ULL << INT_I_ASID) | \
+    (1ULL << INT_D_ASID) | \
+    (1ULL << INT_DMA_ASID) | \
+    (1ULL << INT_SNI_ASID) | \
+    (1ULL << INT_DMA_CPL) | \
+    (1ULL << INT_SN_CPL) | \
+    (1ULL << INT_DOUBLE_FAULT) | \
+    (1ULL << INT_AUX_PERF_COUNT) | \
+    0)
+#define NONQUEUED_INTERRUPTS ( \
+    (1ULL << INT_ITLB_MISS) | \
+    (1ULL << INT_ILL) | \
+    (1ULL << INT_GPV) | \
+    (1ULL << INT_SN_ACCESS) | \
+    (1ULL << INT_IDN_ACCESS) | \
+    (1ULL << INT_UDN_ACCESS) | \
+    (1ULL << INT_IDN_REFILL) | \
+    (1ULL << INT_UDN_REFILL) | \
+    (1ULL << INT_IDN_COMPLETE) | \
+    (1ULL << INT_UDN_COMPLETE) | \
+    (1ULL << INT_SWINT_3) | \
+    (1ULL << INT_SWINT_2) | \
+    (1ULL << INT_SWINT_1) | \
+    (1ULL << INT_SWINT_0) | \
+    (1ULL << INT_UNALIGN_DATA) | \
+    (1ULL << INT_DTLB_MISS) | \
+    (1ULL << INT_DTLB_ACCESS) | \
+    (1ULL << INT_SN_STATIC_ACCESS) | \
+    0)
+#define CRITICAL_MASKED_INTERRUPTS ( \
+    (1ULL << INT_MEM_ERROR) | \
+    (1ULL << INT_DMATLB_MISS) | \
+    (1ULL << INT_DMATLB_ACCESS) | \
+    (1ULL << INT_SNITLB_MISS) | \
+    (1ULL << INT_SN_NOTIFY) | \
+    (1ULL << INT_SN_FIREWALL) | \
+    (1ULL << INT_IDN_FIREWALL) | \
+    (1ULL << INT_UDN_FIREWALL) | \
+    (1ULL << INT_TILE_TIMER) | \
+    (1ULL << INT_IDN_TIMER) | \
+    (1ULL << INT_UDN_TIMER) | \
+    (1ULL << INT_DMA_NOTIFY) | \
+    (1ULL << INT_IDN_CA) | \
+    (1ULL << INT_UDN_CA) | \
+    (1ULL << INT_IDN_AVAIL) | \
+    (1ULL << INT_UDN_AVAIL) | \
+    (1ULL << INT_PERF_COUNT) | \
+    (1ULL << INT_INTCTRL_3) | \
+    (1ULL << INT_INTCTRL_2) | \
+    (1ULL << INT_INTCTRL_1) | \
+    (1ULL << INT_INTCTRL_0) | \
+    (1ULL << INT_AUX_PERF_COUNT) | \
+    0)
+#define CRITICAL_UNMASKED_INTERRUPTS ( \
+    (1ULL << INT_ITLB_MISS) | \
+    (1ULL << INT_ILL) | \
+    (1ULL << INT_GPV) | \
+    (1ULL << INT_SN_ACCESS) | \
+    (1ULL << INT_IDN_ACCESS) | \
+    (1ULL << INT_UDN_ACCESS) | \
+    (1ULL << INT_IDN_REFILL) | \
+    (1ULL << INT_UDN_REFILL) | \
+    (1ULL << INT_IDN_COMPLETE) | \
+    (1ULL << INT_UDN_COMPLETE) | \
+    (1ULL << INT_SWINT_3) | \
+    (1ULL << INT_SWINT_2) | \
+    (1ULL << INT_SWINT_1) | \
+    (1ULL << INT_SWINT_0) | \
+    (1ULL << INT_UNALIGN_DATA) | \
+    (1ULL << INT_DTLB_MISS) | \
+    (1ULL << INT_DTLB_ACCESS) | \
+    (1ULL << INT_BOOT_ACCESS) | \
+    (1ULL << INT_WORLD_ACCESS) | \
+    (1ULL << INT_I_ASID) | \
+    (1ULL << INT_D_ASID) | \
+    (1ULL << INT_DMA_ASID) | \
+    (1ULL << INT_SNI_ASID) | \
+    (1ULL << INT_DMA_CPL) | \
+    (1ULL << INT_SN_CPL) | \
+    (1ULL << INT_DOUBLE_FAULT) | \
+    (1ULL << INT_SN_STATIC_ACCESS) | \
+    0)
+#define MASKABLE_INTERRUPTS ( \
+    (1ULL << INT_MEM_ERROR) | \
+    (1ULL << INT_IDN_REFILL) | \
+    (1ULL << INT_UDN_REFILL) | \
+    (1ULL << INT_IDN_COMPLETE) | \
+    (1ULL << INT_UDN_COMPLETE) | \
+    (1ULL << INT_DMATLB_MISS) | \
+    (1ULL << INT_DMATLB_ACCESS) | \
+    (1ULL << INT_SNITLB_MISS) | \
+    (1ULL << INT_SN_NOTIFY) | \
+    (1ULL << INT_SN_FIREWALL) | \
+    (1ULL << INT_IDN_FIREWALL) | \
+    (1ULL << INT_UDN_FIREWALL) | \
+    (1ULL << INT_TILE_TIMER) | \
+    (1ULL << INT_IDN_TIMER) | \
+    (1ULL << INT_UDN_TIMER) | \
+    (1ULL << INT_DMA_NOTIFY) | \
+    (1ULL << INT_IDN_CA) | \
+    (1ULL << INT_UDN_CA) | \
+    (1ULL << INT_IDN_AVAIL) | \
+    (1ULL << INT_UDN_AVAIL) | \
+    (1ULL << INT_PERF_COUNT) | \
+    (1ULL << INT_INTCTRL_3) | \
+    (1ULL << INT_INTCTRL_2) | \
+    (1ULL << INT_INTCTRL_1) | \
+    (1ULL << INT_INTCTRL_0) | \
+    (1ULL << INT_AUX_PERF_COUNT) | \
+    0)
+#define UNMASKABLE_INTERRUPTS ( \
+    (1ULL << INT_ITLB_MISS) | \
+    (1ULL << INT_ILL) | \
+    (1ULL << INT_GPV) | \
+    (1ULL << INT_SN_ACCESS) | \
+    (1ULL << INT_IDN_ACCESS) | \
+    (1ULL << INT_UDN_ACCESS) | \
+    (1ULL << INT_SWINT_3) | \
+    (1ULL << INT_SWINT_2) | \
+    (1ULL << INT_SWINT_1) | \
+    (1ULL << INT_SWINT_0) | \
+    (1ULL << INT_UNALIGN_DATA) | \
+    (1ULL << INT_DTLB_MISS) | \
+    (1ULL << INT_DTLB_ACCESS) | \
+    (1ULL << INT_BOOT_ACCESS) | \
+    (1ULL << INT_WORLD_ACCESS) | \
+    (1ULL << INT_I_ASID) | \
+    (1ULL << INT_D_ASID) | \
+    (1ULL << INT_DMA_ASID) | \
+    (1ULL << INT_SNI_ASID) | \
+    (1ULL << INT_DMA_CPL) | \
+    (1ULL << INT_SN_CPL) | \
+    (1ULL << INT_DOUBLE_FAULT) | \
+    (1ULL << INT_SN_STATIC_ACCESS) | \
+    0)
+#define SYNC_INTERRUPTS ( \
+    (1ULL << INT_ITLB_MISS) | \
+    (1ULL << INT_ILL) | \
+    (1ULL << INT_GPV) | \
+    (1ULL << INT_SN_ACCESS) | \
+    (1ULL << INT_IDN_ACCESS) | \
+    (1ULL << INT_UDN_ACCESS) | \
+    (1ULL << INT_IDN_REFILL) | \
+    (1ULL << INT_UDN_REFILL) | \
+    (1ULL << INT_IDN_COMPLETE) | \
+    (1ULL << INT_UDN_COMPLETE) | \
+    (1ULL << INT_SWINT_3) | \
+    (1ULL << INT_SWINT_2) | \
+    (1ULL << INT_SWINT_1) | \
+    (1ULL << INT_SWINT_0) | \
+    (1ULL << INT_UNALIGN_DATA) | \
+    (1ULL << INT_DTLB_MISS) | \
+    (1ULL << INT_DTLB_ACCESS) | \
+    (1ULL << INT_SN_STATIC_ACCESS) | \
+    0)
+#define NON_SYNC_INTERRUPTS ( \
+    (1ULL << INT_MEM_ERROR) | \
+    (1ULL << INT_DMATLB_MISS) | \
+    (1ULL << INT_DMATLB_ACCESS) | \
+    (1ULL << INT_SNITLB_MISS) | \
+    (1ULL << INT_SN_NOTIFY) | \
+    (1ULL << INT_SN_FIREWALL) | \
+    (1ULL << INT_IDN_FIREWALL) | \
+    (1ULL << INT_UDN_FIREWALL) | \
+    (1ULL << INT_TILE_TIMER) | \
+    (1ULL << INT_IDN_TIMER) | \
+    (1ULL << INT_UDN_TIMER) | \
+    (1ULL << INT_DMA_NOTIFY) | \
+    (1ULL << INT_IDN_CA) | \
+    (1ULL << INT_UDN_CA) | \
+    (1ULL << INT_IDN_AVAIL) | \
+    (1ULL << INT_UDN_AVAIL) | \
+    (1ULL << INT_PERF_COUNT) | \
+    (1ULL << INT_INTCTRL_3) | \
+    (1ULL << INT_INTCTRL_2) | \
+    (1ULL << INT_INTCTRL_1) | \
+    (1ULL << INT_INTCTRL_0) | \
+    (1ULL << INT_BOOT_ACCESS) | \
+    (1ULL << INT_WORLD_ACCESS) | \
+    (1ULL << INT_I_ASID) | \
+    (1ULL << INT_D_ASID) | \
+    (1ULL << INT_DMA_ASID) | \
+    (1ULL << INT_SNI_ASID) | \
+    (1ULL << INT_DMA_CPL) | \
+    (1ULL << INT_SN_CPL) | \
+    (1ULL << INT_DOUBLE_FAULT) | \
+    (1ULL << INT_AUX_PERF_COUNT) | \
+    0)
+#endif /* !__ASSEMBLER__ */
+#endif /* !__ARCH_INTERRUPTS_H__ */
diff --git a/arch/tile/include/uapi/arch/interrupts_64.h b/arch/tile/include/uapi/arch/interrupts_64.h
new file mode 100644
index 00000000000..13c9f918234
--- /dev/null
+++ b/arch/tile/include/uapi/arch/interrupts_64.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef __ARCH_INTERRUPTS_H__
+#define __ARCH_INTERRUPTS_H__
+
+#ifndef __KERNEL__
+/** Mask for an interrupt. */
+#ifdef __ASSEMBLER__
+/* Note: must handle breaking interrupts into high and low words manually. */
+#define INT_MASK(intno) (1 << (intno))
+#else
+#define INT_MASK(intno) (1ULL << (intno))
+#endif
+#endif
+
+
+/** Where a given interrupt executes */
+#define INTERRUPT_VECTOR(i, pl) (0xFC000000 + ((pl) << 24) + ((i) << 8))
+
+/** Where to store a vector for a given interrupt. */
+#define USER_INTERRUPT_VECTOR(i) INTERRUPT_VECTOR(i, 0)
+
+/** The base address of user-level interrupts. */
+#define USER_INTERRUPT_VECTOR_BASE INTERRUPT_VECTOR(0, 0)
+
+
+/** Additional synthetic interrupt. */
+#define INT_BREAKPOINT (63)
+
+#define INT_MEM_ERROR    0
+#define INT_SINGLE_STEP_3    1
+#define INT_SINGLE_STEP_2    2
+#define INT_SINGLE_STEP_1    3
+#define INT_SINGLE_STEP_0    4
+#define INT_IDN_COMPLETE    5
+#define INT_UDN_COMPLETE    6
+#define INT_ITLB_MISS    7
+#define INT_ILL    8
+#define INT_GPV    9
+#define INT_IDN_ACCESS   10
+#define INT_UDN_ACCESS   11
+#define INT_SWINT_3   12
+#define INT_SWINT_2   13
+#define INT_SWINT_1   14
+#define INT_SWINT_0   15
+#define INT_ILL_TRANS   16
+#define INT_UNALIGN_DATA   17
+#define INT_DTLB_MISS   18
+#define INT_DTLB_ACCESS   19
+#define INT_IDN_FIREWALL   20
+#define INT_UDN_FIREWALL   21
+#define INT_TILE_TIMER   22
+#define INT_AUX_TILE_TIMER   23
+#define INT_IDN_TIMER   24
+#define INT_UDN_TIMER   25
+#define INT_IDN_AVAIL   26
+#define INT_UDN_AVAIL   27
+#define INT_IPI_3   28
+#define INT_IPI_2   29
+#define INT_IPI_1   30
+#define INT_IPI_0   31
+#define INT_PERF_COUNT   32
+#define INT_AUX_PERF_COUNT   33
+#define INT_INTCTRL_3   34
+#define INT_INTCTRL_2   35
+#define INT_INTCTRL_1   36
+#define INT_INTCTRL_0   37
+#define INT_BOOT_ACCESS   38
+#define INT_WORLD_ACCESS   39
+#define INT_I_ASID   40
+#define INT_D_ASID   41
+#define INT_DOUBLE_FAULT   42
+
+#define NUM_INTERRUPTS 43
+
+#ifndef __ASSEMBLER__
+#define QUEUED_INTERRUPTS ( \
+    (1ULL << INT_MEM_ERROR) | \
+    (1ULL << INT_IDN_COMPLETE) | \
+    (1ULL << INT_UDN_COMPLETE) | \
+    (1ULL << INT_IDN_FIREWALL) | \
+    (1ULL << INT_UDN_FIREWALL) | \
+    (1ULL << INT_TILE_TIMER) | \
+    (1ULL << INT_AUX_TILE_TIMER) | \
+    (1ULL << INT_IDN_TIMER) | \
+    (1ULL << INT_UDN_TIMER) | \
+    (1ULL << INT_IDN_AVAIL) | \
+    (1ULL << INT_UDN_AVAIL) | \
+    (1ULL << INT_IPI_3) | \
+    (1ULL << INT_IPI_2) | \
+    (1ULL << INT_IPI_1) | \
+    (1ULL << INT_IPI_0) | \
+    (1ULL << INT_PERF_COUNT) | \
+    (1ULL << INT_AUX_PERF_COUNT) | \
+    (1ULL << INT_INTCTRL_3) | \
+    (1ULL << INT_INTCTRL_2) | \
+    (1ULL << INT_INTCTRL_1) | \
+    (1ULL << INT_INTCTRL_0) | \
+    (1ULL << INT_BOOT_ACCESS) | \
+    (1ULL << INT_WORLD_ACCESS) | \
+    (1ULL << INT_I_ASID) | \
+    (1ULL << INT_D_ASID) | \
+    (1ULL << INT_DOUBLE_FAULT) | \
+    0)
+#define NONQUEUED_INTERRUPTS ( \
+    (1ULL << INT_SINGLE_STEP_3) | \
+    (1ULL << INT_SINGLE_STEP_2) | \
+    (1ULL << INT_SINGLE_STEP_1) | \
+    (1ULL << INT_SINGLE_STEP_0) | \
+    (1ULL << INT_ITLB_MISS) | \
+    (1ULL << INT_ILL) | \
+    (1ULL << INT_GPV) | \
+    (1ULL << INT_IDN_ACCESS) | \
+    (1ULL << INT_UDN_ACCESS) | \
+    (1ULL << INT_SWINT_3) | \
+    (1ULL << INT_SWINT_2) | \
+    (1ULL << INT_SWINT_1) | \
+    (1ULL << INT_SWINT_0) | \
+    (1ULL << INT_ILL_TRANS) | \
+    (1ULL << INT_UNALIGN_DATA) | \
+    (1ULL << INT_DTLB_MISS) | \
+    (1ULL << INT_DTLB_ACCESS) | \
+    0)
+#define CRITICAL_MASKED_INTERRUPTS ( \
+    (1ULL << INT_MEM_ERROR) | \
+    (1ULL << INT_SINGLE_STEP_3) | \
+    (1ULL << INT_SINGLE_STEP_2) | \
+    (1ULL << INT_SINGLE_STEP_1) | \
+    (1ULL << INT_SINGLE_STEP_0) | \
+    (1ULL << INT_IDN_COMPLETE) | \
+    (1ULL << INT_UDN_COMPLETE) | \
+    (1ULL << INT_IDN_FIREWALL) | \
+    (1ULL << INT_UDN_FIREWALL) | \
+    (1ULL << INT_TILE_TIMER) | \
+    (1ULL << INT_AUX_TILE_TIMER) | \
+    (1ULL << INT_IDN_TIMER) | \
+    (1ULL << INT_UDN_TIMER) | \
+    (1ULL << INT_IDN_AVAIL) | \
+    (1ULL << INT_UDN_AVAIL) | \
+    (1ULL << INT_IPI_3) | \
+    (1ULL << INT_IPI_2) | \
+    (1ULL << INT_IPI_1) | \
+    (1ULL << INT_IPI_0) | \
+    (1ULL << INT_PERF_COUNT) | \
+    (1ULL << INT_AUX_PERF_COUNT) | \
+    (1ULL << INT_INTCTRL_3) | \
+    (1ULL << INT_INTCTRL_2) | \
+    (1ULL << INT_INTCTRL_1) | \
+    (1ULL << INT_INTCTRL_0) | \
+    0)
+#define CRITICAL_UNMASKED_INTERRUPTS ( \
+    (1ULL << INT_ITLB_MISS) | \
+    (1ULL << INT_ILL) | \
+    (1ULL << INT_GPV) | \
+    (1ULL << INT_IDN_ACCESS) | \
+    (1ULL << INT_UDN_ACCESS) | \
+    (1ULL << INT_SWINT_3) | \
+    (1ULL << INT_SWINT_2) | \
+    (1ULL << INT_SWINT_1) | \
+    (1ULL << INT_SWINT_0) | \
+    (1ULL << INT_ILL_TRANS) | \
+    (1ULL << INT_UNALIGN_DATA) | \
+    (1ULL << INT_DTLB_MISS) | \
+    (1ULL << INT_DTLB_ACCESS) | \
+    (1ULL << INT_BOOT_ACCESS) | \
+    (1ULL << INT_WORLD_ACCESS) | \
+    (1ULL << INT_I_ASID) | \
+    (1ULL << INT_D_ASID) | \
+    (1ULL << INT_DOUBLE_FAULT) | \
+    0)
+#define MASKABLE_INTERRUPTS ( \
+    (1ULL << INT_MEM_ERROR) | \
+    (1ULL << INT_SINGLE_STEP_3) | \
+    (1ULL << INT_SINGLE_STEP_2) | \
+    (1ULL << INT_SINGLE_STEP_1) | \
+    (1ULL << INT_SINGLE_STEP_0) | \
+    (1ULL << INT_IDN_COMPLETE) | \
+    (1ULL << INT_UDN_COMPLETE) | \
+    (1ULL << INT_IDN_FIREWALL) | \
+    (1ULL << INT_UDN_FIREWALL) | \
+    (1ULL << INT_TILE_TIMER) | \
+    (1ULL << INT_AUX_TILE_TIMER) | \
+    (1ULL << INT_IDN_TIMER) | \
+    (1ULL << INT_UDN_TIMER) | \
+    (1ULL << INT_IDN_AVAIL) | \
+    (1ULL << INT_UDN_AVAIL) | \
+    (1ULL << INT_IPI_3) | \
+    (1ULL << INT_IPI_2) | \
+    (1ULL << INT_IPI_1) | \
+    (1ULL << INT_IPI_0) | \
+    (1ULL << INT_PERF_COUNT) | \
+    (1ULL << INT_AUX_PERF_COUNT) | \
+    (1ULL << INT_INTCTRL_3) | \
+    (1ULL << INT_INTCTRL_2) | \
+    (1ULL << INT_INTCTRL_1) | \
+    (1ULL << INT_INTCTRL_0) | \
+    0)
+#define UNMASKABLE_INTERRUPTS ( \
+    (1ULL << INT_ITLB_MISS) | \
+    (1ULL << INT_ILL) | \
+    (1ULL << INT_GPV) | \
+    (1ULL << INT_IDN_ACCESS) | \
+    (1ULL << INT_UDN_ACCESS) | \
+    (1ULL << INT_SWINT_3) | \
+    (1ULL << INT_SWINT_2) | \
+    (1ULL << INT_SWINT_1) | \
+    (1ULL << INT_SWINT_0) | \
+    (1ULL << INT_ILL_TRANS) | \
+    (1ULL << INT_UNALIGN_DATA) | \
+    (1ULL << INT_DTLB_MISS) | \
+    (1ULL << INT_DTLB_ACCESS) | \
+    (1ULL << INT_BOOT_ACCESS) | \
+    (1ULL << INT_WORLD_ACCESS) | \
+    (1ULL << INT_I_ASID) | \
+    (1ULL << INT_D_ASID) | \
+    (1ULL << INT_DOUBLE_FAULT) | \
+    0)
+#define SYNC_INTERRUPTS ( \
+    (1ULL << INT_SINGLE_STEP_3) | \
+    (1ULL << INT_SINGLE_STEP_2) | \
+    (1ULL << INT_SINGLE_STEP_1) | \
+    (1ULL << INT_SINGLE_STEP_0) | \
+    (1ULL << INT_IDN_COMPLETE) | \
+    (1ULL << INT_UDN_COMPLETE) | \
+    (1ULL << INT_ITLB_MISS) | \
+    (1ULL << INT_ILL) | \
+    (1ULL << INT_GPV) | \
+    (1ULL << INT_IDN_ACCESS) | \
+    (1ULL << INT_UDN_ACCESS) | \
+    (1ULL << INT_SWINT_3) | \
+    (1ULL << INT_SWINT_2) | \
+    (1ULL << INT_SWINT_1) | \
+    (1ULL << INT_SWINT_0) | \
+    (1ULL << INT_ILL_TRANS) | \
+    (1ULL << INT_UNALIGN_DATA) | \
+    (1ULL << INT_DTLB_MISS) | \
+    (1ULL << INT_DTLB_ACCESS) | \
+    0)
+#define NON_SYNC_INTERRUPTS ( \
+    (1ULL << INT_MEM_ERROR) | \
+    (1ULL << INT_IDN_FIREWALL) | \
+    (1ULL << INT_UDN_FIREWALL) | \
+    (1ULL << INT_TILE_TIMER) | \
+    (1ULL << INT_AUX_TILE_TIMER) | \
+    (1ULL << INT_IDN_TIMER) | \
+    (1ULL << INT_UDN_TIMER) | \
+    (1ULL << INT_IDN_AVAIL) | \
+    (1ULL << INT_UDN_AVAIL) | \
+    (1ULL << INT_IPI_3) | \
+    (1ULL << INT_IPI_2) | \
+    (1ULL << INT_IPI_1) | \
+    (1ULL << INT_IPI_0) | \
+    (1ULL << INT_PERF_COUNT) | \
+    (1ULL << INT_AUX_PERF_COUNT) | \
+    (1ULL << INT_INTCTRL_3) | \
+    (1ULL << INT_INTCTRL_2) | \
+    (1ULL << INT_INTCTRL_1) | \
+    (1ULL << INT_INTCTRL_0) | \
+    (1ULL << INT_BOOT_ACCESS) | \
+    (1ULL << INT_WORLD_ACCESS) | \
+    (1ULL << INT_I_ASID) | \
+    (1ULL << INT_D_ASID) | \
+    (1ULL << INT_DOUBLE_FAULT) | \
+    0)
+#endif /* !__ASSEMBLER__ */
+#endif /* !__ARCH_INTERRUPTS_H__ */
diff --git a/arch/tile/include/arch/opcode.h b/arch/tile/include/uapi/arch/opcode.h
index 92d15229ece..92d15229ece 100644
--- a/arch/tile/include/arch/opcode.h
+++ b/arch/tile/include/uapi/arch/opcode.h
diff --git a/arch/tile/include/arch/opcode_tilegx.h b/arch/tile/include/uapi/arch/opcode_tilegx.h
index c14d02c8160..d76ff2db745 100644
--- a/arch/tile/include/arch/opcode_tilegx.h
+++ b/arch/tile/include/uapi/arch/opcode_tilegx.h
@@ -61,6 +61,7 @@ typedef tilegx_bundle_bits tile_bundle_bits;
 #define TILE_BUNDLE_ALIGNMENT_IN_BYTES TILEGX_BUNDLE_ALIGNMENT_IN_BYTES
 #define TILE_LOG2_BUNDLE_ALIGNMENT_IN_BYTES \
   TILEGX_LOG2_BUNDLE_ALIGNMENT_IN_BYTES
+#define TILE_BPT_BUNDLE TILEGX_BPT_BUNDLE
 
 /* 64-bit pattern for a { bpt ; nop } bundle. */
 #define TILEGX_BPT_BUNDLE 0x286a44ae51485000ULL
diff --git a/arch/tile/include/arch/opcode_tilepro.h b/arch/tile/include/uapi/arch/opcode_tilepro.h
index 71b763b8ce8..4451cff1a86 100644
--- a/arch/tile/include/arch/opcode_tilepro.h
+++ b/arch/tile/include/uapi/arch/opcode_tilepro.h
@@ -71,6 +71,7 @@ typedef tilepro_bundle_bits tile_bundle_bits;
 #define TILE_BUNDLE_ALIGNMENT_IN_BYTES TILEPRO_BUNDLE_ALIGNMENT_IN_BYTES
 #define TILE_LOG2_BUNDLE_ALIGNMENT_IN_BYTES \
   TILEPRO_LOG2_BUNDLE_ALIGNMENT_IN_BYTES
+#define TILE_BPT_BUNDLE TILEPRO_BPT_BUNDLE
 
 /* 64-bit pattern for a { bpt ; nop } bundle. */
 #define TILEPRO_BPT_BUNDLE 0x400b3cae70166000ULL
diff --git a/arch/tile/include/arch/sim.h b/arch/tile/include/uapi/arch/sim.h
index e54b7b0527f..e54b7b0527f 100644
--- a/arch/tile/include/arch/sim.h
+++ b/arch/tile/include/uapi/arch/sim.h
diff --git a/arch/tile/include/arch/sim_def.h b/arch/tile/include/uapi/arch/sim_def.h
index 4b44a2b6a09..4b44a2b6a09 100644
--- a/arch/tile/include/arch/sim_def.h
+++ b/arch/tile/include/uapi/arch/sim_def.h
diff --git a/arch/tile/include/uapi/arch/spr_def.h b/arch/tile/include/uapi/arch/spr_def.h
new file mode 100644
index 00000000000..c250c5adb1a
--- /dev/null
+++ b/arch/tile/include/uapi/arch/spr_def.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _UAPI__ARCH_SPR_DEF_H__
+#define _UAPI__ARCH_SPR_DEF_H__
+
+/* Include the proper base SPR definition file. */
+#ifdef __tilegx__
+#include <arch/spr_def_64.h>
+#else
+#include <arch/spr_def_32.h>
+#endif
+
+
+#endif /* _UAPI__ARCH_SPR_DEF_H__ */
diff --git a/arch/tile/include/arch/spr_def_32.h b/arch/tile/include/uapi/arch/spr_def_32.h
index bbc1f4c924e..78daa3146d2 100644
--- a/arch/tile/include/arch/spr_def_32.h
+++ b/arch/tile/include/uapi/arch/spr_def_32.h
@@ -14,8 +14,8 @@
 
 #ifndef __DOXYGEN__
 
-#ifndef __ARCH_SPR_DEF_H__
-#define __ARCH_SPR_DEF_H__
+#ifndef __ARCH_SPR_DEF_32_H__
+#define __ARCH_SPR_DEF_32_H__
 
 #define SPR_AUX_PERF_COUNT_0 0x6005
 #define SPR_AUX_PERF_COUNT_1 0x6006
@@ -65,6 +65,31 @@
 #define SPR_EX_CONTEXT_2_1__ICS_RMASK 0x1
 #define SPR_EX_CONTEXT_2_1__ICS_MASK  0x4
 #define SPR_FAIL 0x4e09
+#define SPR_IDN_AVAIL_EN 0x3e05
+#define SPR_IDN_CA_DATA 0x0b00
+#define SPR_IDN_DATA_AVAIL 0x0b03
+#define SPR_IDN_DEADLOCK_TIMEOUT 0x3406
+#define SPR_IDN_DEMUX_CA_COUNT 0x0a05
+#define SPR_IDN_DEMUX_COUNT_0 0x0a06
+#define SPR_IDN_DEMUX_COUNT_1 0x0a07
+#define SPR_IDN_DEMUX_CTL 0x0a08
+#define SPR_IDN_DEMUX_QUEUE_SEL 0x0a0a
+#define SPR_IDN_DEMUX_STATUS 0x0a0b
+#define SPR_IDN_DEMUX_WRITE_FIFO 0x0a0c
+#define SPR_IDN_DIRECTION_PROTECT 0x2e05
+#define SPR_IDN_PENDING 0x0a0e
+#define SPR_IDN_REFILL_EN 0x0e05
+#define SPR_IDN_SP_FIFO_DATA 0x0a0f
+#define SPR_IDN_SP_FIFO_SEL 0x0a10
+#define SPR_IDN_SP_FREEZE 0x0a11
+#define SPR_IDN_SP_FREEZE__SP_FRZ_MASK  0x1
+#define SPR_IDN_SP_FREEZE__DEMUX_FRZ_MASK  0x2
+#define SPR_IDN_SP_FREEZE__NON_DEST_EXT_MASK  0x4
+#define SPR_IDN_SP_STATE 0x0a12
+#define SPR_IDN_TAG_0 0x0a13
+#define SPR_IDN_TAG_1 0x0a14
+#define SPR_IDN_TAG_VALID 0x0a15
+#define SPR_IDN_TILE_COORD 0x0a16
 #define SPR_INTCTRL_0_STATUS 0x4a07
 #define SPR_INTCTRL_1_STATUS 0x4807
 #define SPR_INTCTRL_2_STATUS 0x4607
@@ -87,12 +112,36 @@
 #define SPR_INTERRUPT_MASK_SET_1_1 0x480e
 #define SPR_INTERRUPT_MASK_SET_2_0 0x460c
 #define SPR_INTERRUPT_MASK_SET_2_1 0x460d
+#define SPR_MPL_AUX_PERF_COUNT_SET_0 0x6000
+#define SPR_MPL_AUX_PERF_COUNT_SET_1 0x6001
+#define SPR_MPL_AUX_PERF_COUNT_SET_2 0x6002
 #define SPR_MPL_DMA_CPL_SET_0 0x5800
 #define SPR_MPL_DMA_CPL_SET_1 0x5801
 #define SPR_MPL_DMA_CPL_SET_2 0x5802
 #define SPR_MPL_DMA_NOTIFY_SET_0 0x3800
 #define SPR_MPL_DMA_NOTIFY_SET_1 0x3801
 #define SPR_MPL_DMA_NOTIFY_SET_2 0x3802
+#define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
+#define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
+#define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
+#define SPR_MPL_IDN_AVAIL_SET_0 0x3e00
+#define SPR_MPL_IDN_AVAIL_SET_1 0x3e01
+#define SPR_MPL_IDN_AVAIL_SET_2 0x3e02
+#define SPR_MPL_IDN_CA_SET_0 0x3a00
+#define SPR_MPL_IDN_CA_SET_1 0x3a01
+#define SPR_MPL_IDN_CA_SET_2 0x3a02
+#define SPR_MPL_IDN_COMPLETE_SET_0 0x1200
+#define SPR_MPL_IDN_COMPLETE_SET_1 0x1201
+#define SPR_MPL_IDN_COMPLETE_SET_2 0x1202
+#define SPR_MPL_IDN_FIREWALL_SET_0 0x2e00
+#define SPR_MPL_IDN_FIREWALL_SET_1 0x2e01
+#define SPR_MPL_IDN_FIREWALL_SET_2 0x2e02
+#define SPR_MPL_IDN_REFILL_SET_0 0x0e00
+#define SPR_MPL_IDN_REFILL_SET_1 0x0e01
+#define SPR_MPL_IDN_REFILL_SET_2 0x0e02
+#define SPR_MPL_IDN_TIMER_SET_0 0x3400
+#define SPR_MPL_IDN_TIMER_SET_1 0x3401
+#define SPR_MPL_IDN_TIMER_SET_2 0x3402
 #define SPR_MPL_INTCTRL_0_SET_0 0x4a00
 #define SPR_MPL_INTCTRL_0_SET_1 0x4a01
 #define SPR_MPL_INTCTRL_0_SET_2 0x4a02
@@ -102,6 +151,9 @@
 #define SPR_MPL_INTCTRL_2_SET_0 0x4600
 #define SPR_MPL_INTCTRL_2_SET_1 0x4601
 #define SPR_MPL_INTCTRL_2_SET_2 0x4602
+#define SPR_MPL_PERF_COUNT_SET_0 0x4200
+#define SPR_MPL_PERF_COUNT_SET_1 0x4201
+#define SPR_MPL_PERF_COUNT_SET_2 0x4202
 #define SPR_MPL_SN_ACCESS_SET_0 0x0800
 #define SPR_MPL_SN_ACCESS_SET_1 0x0801
 #define SPR_MPL_SN_ACCESS_SET_2 0x0802
@@ -148,8 +200,6 @@
 #define SPR_SIM_CONTROL 0x4e0c
 #define SPR_SNCTL 0x0805
 #define SPR_SNCTL__FRZFABRIC_MASK  0x1
-#define SPR_SNCTL__FRZPROC_MASK  0x2
-#define SPR_SNPC 0x080b
 #define SPR_SNSTATIC 0x080c
 #define SPR_SYSTEM_SAVE_0_0 0x4b00
 #define SPR_SYSTEM_SAVE_0_1 0x4b01
@@ -181,6 +231,7 @@
 #define SPR_UDN_DEMUX_STATUS 0x0c0d
 #define SPR_UDN_DEMUX_WRITE_FIFO 0x0c0e
 #define SPR_UDN_DIRECTION_PROTECT 0x3005
+#define SPR_UDN_PENDING 0x0c10
 #define SPR_UDN_REFILL_EN 0x1005
 #define SPR_UDN_SP_FIFO_DATA 0x0c11
 #define SPR_UDN_SP_FIFO_SEL 0x0c12
@@ -195,7 +246,10 @@
 #define SPR_UDN_TAG_3 0x0c18
 #define SPR_UDN_TAG_VALID 0x0c19
 #define SPR_UDN_TILE_COORD 0x0c1a
+#define SPR_WATCH_CTL 0x4209
+#define SPR_WATCH_MASK 0x420a
+#define SPR_WATCH_VAL 0x420b
 
-#endif /* !defined(__ARCH_SPR_DEF_H__) */
+#endif /* !defined(__ARCH_SPR_DEF_32_H__) */
 
 #endif /* !defined(__DOXYGEN__) */
diff --git a/arch/tile/include/arch/spr_def_64.h b/arch/tile/include/uapi/arch/spr_def_64.h
index cd3e5f95d5f..67a6c1751e3 100644
--- a/arch/tile/include/arch/spr_def_64.h
+++ b/arch/tile/include/uapi/arch/spr_def_64.h
@@ -14,8 +14,8 @@
 
 #ifndef __DOXYGEN__
 
-#ifndef __ARCH_SPR_DEF_H__
-#define __ARCH_SPR_DEF_H__
+#ifndef __ARCH_SPR_DEF_64_H__
+#define __ARCH_SPR_DEF_64_H__
 
 #define SPR_AUX_PERF_COUNT_0 0x2105
 #define SPR_AUX_PERF_COUNT_1 0x2106
@@ -52,6 +52,13 @@
 #define SPR_EX_CONTEXT_2_1__ICS_RMASK 0x1
 #define SPR_EX_CONTEXT_2_1__ICS_MASK  0x4
 #define SPR_FAIL 0x2707
+#define SPR_IDN_AVAIL_EN 0x1a05
+#define SPR_IDN_DATA_AVAIL 0x0a80
+#define SPR_IDN_DEADLOCK_TIMEOUT 0x1806
+#define SPR_IDN_DEMUX_COUNT_0 0x0a05
+#define SPR_IDN_DEMUX_COUNT_1 0x0a06
+#define SPR_IDN_DIRECTION_PROTECT 0x1405
+#define SPR_IDN_PENDING 0x0a08
 #define SPR_ILL_TRANS_REASON__I_STREAM_VA_RMASK 0x1
 #define SPR_INTCTRL_0_STATUS 0x2505
 #define SPR_INTCTRL_1_STATUS 0x2405
@@ -88,9 +95,27 @@
 #define SPR_IPI_MASK_SET_0 0x1f0a
 #define SPR_IPI_MASK_SET_1 0x1e0a
 #define SPR_IPI_MASK_SET_2 0x1d0a
+#define SPR_MPL_AUX_PERF_COUNT_SET_0 0x2100
+#define SPR_MPL_AUX_PERF_COUNT_SET_1 0x2101
+#define SPR_MPL_AUX_PERF_COUNT_SET_2 0x2102
 #define SPR_MPL_AUX_TILE_TIMER_SET_0 0x1700
 #define SPR_MPL_AUX_TILE_TIMER_SET_1 0x1701
 #define SPR_MPL_AUX_TILE_TIMER_SET_2 0x1702
+#define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
+#define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
+#define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
+#define SPR_MPL_IDN_AVAIL_SET_0 0x1a00
+#define SPR_MPL_IDN_AVAIL_SET_1 0x1a01
+#define SPR_MPL_IDN_AVAIL_SET_2 0x1a02
+#define SPR_MPL_IDN_COMPLETE_SET_0 0x0500
+#define SPR_MPL_IDN_COMPLETE_SET_1 0x0501
+#define SPR_MPL_IDN_COMPLETE_SET_2 0x0502
+#define SPR_MPL_IDN_FIREWALL_SET_0 0x1400
+#define SPR_MPL_IDN_FIREWALL_SET_1 0x1401
+#define SPR_MPL_IDN_FIREWALL_SET_2 0x1402
+#define SPR_MPL_IDN_TIMER_SET_0 0x1800
+#define SPR_MPL_IDN_TIMER_SET_1 0x1801
+#define SPR_MPL_IDN_TIMER_SET_2 0x1802
 #define SPR_MPL_INTCTRL_0_SET_0 0x2500
 #define SPR_MPL_INTCTRL_0_SET_1 0x2501
 #define SPR_MPL_INTCTRL_0_SET_2 0x2502
@@ -100,6 +125,21 @@
 #define SPR_MPL_INTCTRL_2_SET_0 0x2300
 #define SPR_MPL_INTCTRL_2_SET_1 0x2301
 #define SPR_MPL_INTCTRL_2_SET_2 0x2302
+#define SPR_MPL_IPI_0 0x1f04
+#define SPR_MPL_IPI_0_SET_0 0x1f00
+#define SPR_MPL_IPI_0_SET_1 0x1f01
+#define SPR_MPL_IPI_0_SET_2 0x1f02
+#define SPR_MPL_IPI_1 0x1e04
+#define SPR_MPL_IPI_1_SET_0 0x1e00
+#define SPR_MPL_IPI_1_SET_1 0x1e01
+#define SPR_MPL_IPI_1_SET_2 0x1e02
+#define SPR_MPL_IPI_2 0x1d04
+#define SPR_MPL_IPI_2_SET_0 0x1d00
+#define SPR_MPL_IPI_2_SET_1 0x1d01
+#define SPR_MPL_IPI_2_SET_2 0x1d02
+#define SPR_MPL_PERF_COUNT_SET_0 0x2000
+#define SPR_MPL_PERF_COUNT_SET_1 0x2001
+#define SPR_MPL_PERF_COUNT_SET_2 0x2002
 #define SPR_MPL_UDN_ACCESS_SET_0 0x0b00
 #define SPR_MPL_UDN_ACCESS_SET_1 0x0b01
 #define SPR_MPL_UDN_ACCESS_SET_2 0x0b02
@@ -167,7 +207,10 @@
 #define SPR_UDN_DEMUX_COUNT_2 0x0b07
 #define SPR_UDN_DEMUX_COUNT_3 0x0b08
 #define SPR_UDN_DIRECTION_PROTECT 0x1505
+#define SPR_UDN_PENDING 0x0b0a
+#define SPR_WATCH_MASK 0x200a
+#define SPR_WATCH_VAL 0x200b
 
-#endif /* !defined(__ARCH_SPR_DEF_H__) */
+#endif /* !defined(__ARCH_SPR_DEF_64_H__) */
 
 #endif /* !defined(__DOXYGEN__) */
diff --git a/arch/tile/include/uapi/asm/Kbuild b/arch/tile/include/uapi/asm/Kbuild
new file mode 100644
index 00000000000..c20db8e428b
--- /dev/null
+++ b/arch/tile/include/uapi/asm/Kbuild
@@ -0,0 +1,21 @@
+# UAPI Header export list
+include include/uapi/asm-generic/Kbuild.asm
+
+header-y += auxvec.h
+header-y += bitsperlong.h
+header-y += byteorder.h
+header-y += cachectl.h
+header-y += hardwall.h
+header-y += kvm_para.h
+header-y += mman.h
+header-y += ptrace.h
+header-y += setup.h
+header-y += sigcontext.h
+header-y += siginfo.h
+header-y += signal.h
+header-y += stat.h
+header-y += swab.h
+header-y += ucontext.h
+header-y += unistd.h
+
+generic-y += ucontext.h
diff --git a/arch/tile/include/asm/auxvec.h b/arch/tile/include/uapi/asm/auxvec.h
index 1d393edb064..c93e92709f1 100644
--- a/arch/tile/include/asm/auxvec.h
+++ b/arch/tile/include/uapi/asm/auxvec.h
@@ -15,6 +15,7 @@
 #ifndef _ASM_TILE_AUXVEC_H
 #define _ASM_TILE_AUXVEC_H
 
-/* No extensions to auxvec */
+/* The vDSO location. */
+#define AT_SYSINFO_EHDR         33
 
 #endif /* _ASM_TILE_AUXVEC_H */
diff --git a/arch/tile/include/asm/bitsperlong.h b/arch/tile/include/uapi/asm/bitsperlong.h
index 58c771f2af2..58c771f2af2 100644
--- a/arch/tile/include/asm/bitsperlong.h
+++ b/arch/tile/include/uapi/asm/bitsperlong.h
diff --git a/arch/tile/include/uapi/asm/byteorder.h b/arch/tile/include/uapi/asm/byteorder.h
new file mode 100644
index 00000000000..fb72ecf4921
--- /dev/null
+++ b/arch/tile/include/uapi/asm/byteorder.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#if defined (__BIG_ENDIAN__)
+#include <linux/byteorder/big_endian.h>
+#elif defined (__LITTLE_ENDIAN__)
+#include <linux/byteorder/little_endian.h>
+#else
+#error "__BIG_ENDIAN__ or __LITTLE_ENDIAN__ must be defined."
+#endif
diff --git a/arch/tile/include/uapi/asm/cachectl.h b/arch/tile/include/uapi/asm/cachectl.h
new file mode 100644
index 00000000000..572ddcad209
--- /dev/null
+++ b/arch/tile/include/uapi/asm/cachectl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_CACHECTL_H
+#define _ASM_TILE_CACHECTL_H
+
+/*
+ * Options for cacheflush system call.
+ *
+ * The ICACHE flush is performed on all cores currently running the
+ * current process's address space.  The intent is for user
+ * applications to be able to modify code, invoke the system call,
+ * then allow arbitrary other threads in the same address space to see
+ * the newly-modified code.  Passing a length of CHIP_L1I_CACHE_SIZE()
+ * or more invalidates the entire icache on all cores in the address
+ * spaces.  (Note: currently this option invalidates the entire icache
+ * regardless of the requested address and length, but we may choose
+ * to honor the arguments at some point.)
+ *
+ * Flush and invalidation of memory can normally be performed with the
+ * __insn_flush() and __insn_finv() instructions from userspace.
+ * The DCACHE option to the system call allows userspace
+ * to flush the entire L1+L2 data cache from the core.  In this case,
+ * the address and length arguments are not used.  The DCACHE flush is
+ * restricted to the current core, not all cores in the address space.
+ */
+#define	ICACHE	(1<<0)		/* invalidate L1 instruction cache */
+#define	DCACHE	(1<<1)		/* flush and invalidate data cache */
+#define	BCACHE	(ICACHE|DCACHE)	/* flush both caches               */
+
+#endif	/* _ASM_TILE_CACHECTL_H */
diff --git a/arch/tile/include/uapi/asm/hardwall.h b/arch/tile/include/uapi/asm/hardwall.h
new file mode 100644
index 00000000000..c2169d4f401
--- /dev/null
+++ b/arch/tile/include/uapi/asm/hardwall.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * Provide methods for access control of per-cpu resources like
+ * UDN, IDN, or IPI.
+ */
+
+#ifndef _UAPI_ASM_TILE_HARDWALL_H
+#define _UAPI_ASM_TILE_HARDWALL_H
+
+#include <arch/chip.h>
+#include <linux/ioctl.h>
+
+#define HARDWALL_IOCTL_BASE 0xa2
+
+/*
+ * The HARDWALL_CREATE() ioctl is a macro with a "size" argument.
+ * The resulting ioctl value is passed to the kernel in conjunction
+ * with a pointer to a standard kernel bitmask of cpus.
+ * For network resources (UDN or IDN) the bitmask must physically
+ * represent a rectangular configuration on the chip.
+ * The "size" is the number of bytes of cpu mask data.
+ */
+#define _HARDWALL_CREATE 1
+#define HARDWALL_CREATE(size) \
+  _IOC(_IOC_READ, HARDWALL_IOCTL_BASE, _HARDWALL_CREATE, (size))
+
+#define _HARDWALL_ACTIVATE 2
+#define HARDWALL_ACTIVATE \
+  _IO(HARDWALL_IOCTL_BASE, _HARDWALL_ACTIVATE)
+
+#define _HARDWALL_DEACTIVATE 3
+#define HARDWALL_DEACTIVATE \
+ _IO(HARDWALL_IOCTL_BASE, _HARDWALL_DEACTIVATE)
+
+#define _HARDWALL_GET_ID 4
+#define HARDWALL_GET_ID \
+ _IO(HARDWALL_IOCTL_BASE, _HARDWALL_GET_ID)
+
+
+#endif /* _UAPI_ASM_TILE_HARDWALL_H */
diff --git a/arch/tile/include/uapi/asm/kvm_para.h b/arch/tile/include/uapi/asm/kvm_para.h
new file mode 100644
index 00000000000..14fab8f0b95
--- /dev/null
+++ b/arch/tile/include/uapi/asm/kvm_para.h
@@ -0,0 +1 @@
+#include <asm-generic/kvm_para.h>
diff --git a/arch/tile/include/asm/mman.h b/arch/tile/include/uapi/asm/mman.h
index 81b8fc348d6..81b8fc348d6 100644
--- a/arch/tile/include/asm/mman.h
+++ b/arch/tile/include/uapi/asm/mman.h
diff --git a/arch/tile/include/uapi/asm/ptrace.h b/arch/tile/include/uapi/asm/ptrace.h
new file mode 100644
index 00000000000..7757e1985fb
--- /dev/null
+++ b/arch/tile/include/uapi/asm/ptrace.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _UAPI_ASM_TILE_PTRACE_H
+#define _UAPI_ASM_TILE_PTRACE_H
+
+#include <arch/chip.h>
+#include <arch/abi.h>
+
+/* These must match struct pt_regs, below. */
+#if CHIP_WORD_SIZE() == 32
+#define PTREGS_OFFSET_REG(n)    ((n)*4)
+#else
+#define PTREGS_OFFSET_REG(n)    ((n)*8)
+#endif
+#define PTREGS_OFFSET_BASE      0
+#define PTREGS_OFFSET_TP        PTREGS_OFFSET_REG(53)
+#define PTREGS_OFFSET_SP        PTREGS_OFFSET_REG(54)
+#define PTREGS_OFFSET_LR        PTREGS_OFFSET_REG(55)
+#define PTREGS_NR_GPRS          56
+#define PTREGS_OFFSET_PC        PTREGS_OFFSET_REG(56)
+#define PTREGS_OFFSET_EX1       PTREGS_OFFSET_REG(57)
+#define PTREGS_OFFSET_FAULTNUM  PTREGS_OFFSET_REG(58)
+#define PTREGS_OFFSET_ORIG_R0   PTREGS_OFFSET_REG(59)
+#define PTREGS_OFFSET_FLAGS     PTREGS_OFFSET_REG(60)
+#if CHIP_HAS_CMPEXCH()
+#define PTREGS_OFFSET_CMPEXCH   PTREGS_OFFSET_REG(61)
+#endif
+#define PTREGS_SIZE             PTREGS_OFFSET_REG(64)
+
+
+#ifndef __ASSEMBLY__
+
+#ifndef __KERNEL__
+/* Provide appropriate length type to userspace regardless of -m32/-m64. */
+typedef uint_reg_t pt_reg_t;
+#endif
+
+/*
+ * This struct defines the way the registers are stored on the stack during a
+ * system call or exception.  "struct sigcontext" has the same shape.
+ */
+struct pt_regs {
+	/* Saved main processor registers; 56..63 are special. */
+	/* tp, sp, and lr must immediately follow regs[] for aliasing. */
+	pt_reg_t regs[53];
+	pt_reg_t tp;		/* aliases regs[TREG_TP] */
+	pt_reg_t sp;		/* aliases regs[TREG_SP] */
+	pt_reg_t lr;		/* aliases regs[TREG_LR] */
+
+	/* Saved special registers. */
+	pt_reg_t pc;		/* stored in EX_CONTEXT_K_0 */
+	pt_reg_t ex1;		/* stored in EX_CONTEXT_K_1 (PL and ICS bit) */
+	pt_reg_t faultnum;	/* fault number (INT_SWINT_1 for syscall) */
+	pt_reg_t orig_r0;	/* r0 at syscall entry, else zero */
+	pt_reg_t flags;		/* flags (see below) */
+#if !CHIP_HAS_CMPEXCH()
+	pt_reg_t pad[3];
+#else
+	pt_reg_t cmpexch;	/* value of CMPEXCH_VALUE SPR at interrupt */
+	pt_reg_t pad[2];
+#endif
+};
+
+#endif /* __ASSEMBLY__ */
+
+#define PTRACE_GETREGS		12
+#define PTRACE_SETREGS		13
+#define PTRACE_GETFPREGS	14
+#define PTRACE_SETFPREGS	15
+
+/* Support TILE-specific ptrace options, with events starting at 16. */
+#define PTRACE_EVENT_MIGRATE	16
+#define PTRACE_O_TRACEMIGRATE	(1 << PTRACE_EVENT_MIGRATE)
+
+/*
+ * Flag bits in pt_regs.flags that are part of the ptrace API.
+ * We start our numbering higher up to avoid confusion with the
+ * non-ABI kernel-internal values that use the low 16 bits.
+ */
+#define PT_FLAGS_COMPAT		0x10000  /* process is an -m32 compat process */
+
+#endif /* _UAPI_ASM_TILE_PTRACE_H */
diff --git a/arch/tile/include/asm/hw_irq.h b/arch/tile/include/uapi/asm/setup.h
index 4fac5fbf333..e6f7da265ac 100644
--- a/arch/tile/include/asm/hw_irq.h
+++ b/arch/tile/include/uapi/asm/setup.h
@@ -12,7 +12,10 @@
  *   more details.
  */
 
-#ifndef _ASM_TILE_HW_IRQ_H
-#define _ASM_TILE_HW_IRQ_H
+#ifndef _UAPI_ASM_TILE_SETUP_H
+#define _UAPI_ASM_TILE_SETUP_H
 
-#endif /* _ASM_TILE_HW_IRQ_H */
+#define COMMAND_LINE_SIZE	2048
+
+
+#endif /* _UAPI_ASM_TILE_SETUP_H */
diff --git a/arch/tile/include/asm/sigcontext.h b/arch/tile/include/uapi/asm/sigcontext.h
index 6348e59d372..6348e59d372 100644
--- a/arch/tile/include/asm/sigcontext.h
+++ b/arch/tile/include/uapi/asm/sigcontext.h
diff --git a/arch/tile/include/asm/siginfo.h b/arch/tile/include/uapi/asm/siginfo.h
index 56d661bb010..56d661bb010 100644
--- a/arch/tile/include/asm/siginfo.h
+++ b/arch/tile/include/uapi/asm/siginfo.h
diff --git a/arch/tile/include/uapi/asm/signal.h b/arch/tile/include/uapi/asm/signal.h
new file mode 100644
index 00000000000..ef0d32d84a4
--- /dev/null
+++ b/arch/tile/include/uapi/asm/signal.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _UAPI_ASM_TILE_SIGNAL_H
+#define _UAPI_ASM_TILE_SIGNAL_H
+
+/* Do not notify a ptracer when this signal is handled. */
+#define SA_NOPTRACE 0x02000000u
+
+/* Used in earlier Tilera releases, so keeping for binary compatibility. */
+#define SA_RESTORER 0x04000000u
+
+#include <asm-generic/signal.h>
+
+
+#endif /* _UAPI_ASM_TILE_SIGNAL_H */
diff --git a/arch/tile/include/asm/stat.h b/arch/tile/include/uapi/asm/stat.h
index c0db34d56be..c0db34d56be 100644
--- a/arch/tile/include/asm/stat.h
+++ b/arch/tile/include/uapi/asm/stat.h
diff --git a/arch/tile/include/asm/swab.h b/arch/tile/include/uapi/asm/swab.h
index 7c37b38f6c8..7c37b38f6c8 100644
--- a/arch/tile/include/asm/swab.h
+++ b/arch/tile/include/uapi/asm/swab.h
diff --git a/arch/tile/include/uapi/asm/unistd.h b/arch/tile/include/uapi/asm/unistd.h
new file mode 100644
index 00000000000..3866397aaf5
--- /dev/null
+++ b/arch/tile/include/uapi/asm/unistd.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#if !defined(__LP64__) || defined(__SYSCALL_COMPAT)
+/* Use the flavor of this syscall that matches the 32-bit API better. */
+#define __ARCH_WANT_SYNC_FILE_RANGE2
+#endif
+
+/* Use the standard ABI for syscalls. */
+#include <asm-generic/unistd.h>
+
+#define NR_syscalls __NR_syscalls
+
+/* Additional Tilera-specific syscalls. */
+#define __NR_cacheflush	(__NR_arch_specific_syscall + 1)
+__SYSCALL(__NR_cacheflush, sys_cacheflush)
+
+#ifndef __tilegx__
+/* "Fast" syscalls provide atomic support for 32-bit chips. */
+#define __NR_FAST_cmpxchg	-1
+#define __NR_FAST_atomic_update	-2
+#define __NR_FAST_cmpxchg64	-3
+#define __NR_cmpxchg_badaddr	(__NR_arch_specific_syscall + 0)
+__SYSCALL(__NR_cmpxchg_badaddr, sys_cmpxchg_badaddr)
+#endif
diff --git a/arch/tile/kernel/Makefile b/arch/tile/kernel/Makefile
index b4dbc057baa..21f77bf68c6 100644
--- a/arch/tile/kernel/Makefile
+++ b/arch/tile/kernel/Makefile
@@ -3,16 +3,34 @@
 #
 
 extra-y := vmlinux.lds head_$(BITS).o
-obj-y := backtrace.o entry.o init_task.o irq.o messaging.o \
+obj-y := backtrace.o entry.o hvglue.o irq.o messaging.o \
 	pci-dma.o proc.o process.o ptrace.o reboot.o \
-	setup.o signal.o single_step.o stack.o sys.o sysfs.o time.o traps.o \
+	setup.o signal.o single_step.o stack.o sys.o \
+	sysfs.o time.o traps.o unaligned.o vdso.o \
 	intvec_$(BITS).o regs_$(BITS).o tile-desc_$(BITS).o
 
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_early_printk.o = -pg
+endif
+
 obj-$(CONFIG_HARDWALL)		+= hardwall.o
-obj-$(CONFIG_TILEGX)		+= futex_64.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_signal.o
 obj-$(CONFIG_SMP)		+= smpboot.o smp.o tlb.o
 obj-$(CONFIG_MODULES)		+= module.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
-obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel_$(BITS).o
+ifdef CONFIG_TILEGX
+obj-$(CONFIG_PCI)		+= pci_gx.o
+else
 obj-$(CONFIG_PCI)		+= pci.o
+endif
+obj-$(CONFIG_PERF_EVENTS)	+= perf_event.o
+obj-$(CONFIG_USE_PMC)		+= pmc.o
+obj-$(CONFIG_TILE_USB)		+= usb.o
+obj-$(CONFIG_TILE_HVGLUE_TRACE)	+= hvglue_trace.o
+obj-$(CONFIG_FUNCTION_TRACER)	+= ftrace.o mcount_64.o
+obj-$(CONFIG_KPROBES)		+= kprobes.o
+obj-$(CONFIG_KGDB)		+= kgdb.o
+
+obj-y				+= vdso/
diff --git a/arch/tile/kernel/asm-offsets.c b/arch/tile/kernel/asm-offsets.c
index 01ddf19cc36..375e7c321ee 100644
--- a/arch/tile/kernel/asm-offsets.c
+++ b/arch/tile/kernel/asm-offsets.c
@@ -14,13 +14,6 @@
  * Generates definitions from c-type structures used by assembly sources.
  */
 
-#include <linux/kbuild.h>
-#include <linux/thread_info.h>
-#include <linux/sched.h>
-#include <linux/hardirq.h>
-#include <linux/ptrace.h>
-#include <hv/hypervisor.h>
-
 /* Check for compatible compiler early in the build. */
 #ifdef CONFIG_TILEGX
 # ifndef __tilegx__
@@ -31,46 +24,61 @@
 # endif
 #else
 # ifdef __tilegx__
-#  error Can not build TILEPro/TILE64 configurations with tilegx compiler
+#  error Can not build TILEPro configurations with tilegx compiler
 # endif
 #endif
 
+#include <linux/kbuild.h>
+#include <linux/thread_info.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/ptrace.h>
+#include <hv/hypervisor.h>
+
 void foo(void)
 {
-	DEFINE(SINGLESTEP_STATE_BUFFER_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_BUFFER_OFFSET,
 	       offsetof(struct single_step_state, buffer));
-	DEFINE(SINGLESTEP_STATE_FLAGS_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_FLAGS_OFFSET,
 	       offsetof(struct single_step_state, flags));
-	DEFINE(SINGLESTEP_STATE_ORIG_PC_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_ORIG_PC_OFFSET,
 	       offsetof(struct single_step_state, orig_pc));
-	DEFINE(SINGLESTEP_STATE_NEXT_PC_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_NEXT_PC_OFFSET,
 	       offsetof(struct single_step_state, next_pc));
-	DEFINE(SINGLESTEP_STATE_BRANCH_NEXT_PC_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_BRANCH_NEXT_PC_OFFSET,
 	       offsetof(struct single_step_state, branch_next_pc));
-	DEFINE(SINGLESTEP_STATE_UPDATE_VALUE_OFFSET, \
+	DEFINE(SINGLESTEP_STATE_UPDATE_VALUE_OFFSET,
 	       offsetof(struct single_step_state, update_value));
 
-	DEFINE(THREAD_INFO_TASK_OFFSET, \
+	DEFINE(THREAD_INFO_TASK_OFFSET,
 	       offsetof(struct thread_info, task));
-	DEFINE(THREAD_INFO_FLAGS_OFFSET, \
+	DEFINE(THREAD_INFO_FLAGS_OFFSET,
 	       offsetof(struct thread_info, flags));
-	DEFINE(THREAD_INFO_STATUS_OFFSET, \
+	DEFINE(THREAD_INFO_STATUS_OFFSET,
 	       offsetof(struct thread_info, status));
-	DEFINE(THREAD_INFO_HOMECACHE_CPU_OFFSET, \
+	DEFINE(THREAD_INFO_HOMECACHE_CPU_OFFSET,
 	       offsetof(struct thread_info, homecache_cpu));
-	DEFINE(THREAD_INFO_STEP_STATE_OFFSET, \
+	DEFINE(THREAD_INFO_PREEMPT_COUNT_OFFSET,
+	       offsetof(struct thread_info, preempt_count));
+	DEFINE(THREAD_INFO_STEP_STATE_OFFSET,
 	       offsetof(struct thread_info, step_state));
+#ifdef __tilegx__
+	DEFINE(THREAD_INFO_UNALIGN_JIT_BASE_OFFSET,
+	       offsetof(struct thread_info, unalign_jit_base));
+	DEFINE(THREAD_INFO_UNALIGN_JIT_TMP_OFFSET,
+	       offsetof(struct thread_info, unalign_jit_tmp));
+#endif
 
 	DEFINE(TASK_STRUCT_THREAD_KSP_OFFSET,
 	       offsetof(struct task_struct, thread.ksp));
 	DEFINE(TASK_STRUCT_THREAD_PC_OFFSET,
 	       offsetof(struct task_struct, thread.pc));
 
-	DEFINE(HV_TOPOLOGY_WIDTH_OFFSET, \
+	DEFINE(HV_TOPOLOGY_WIDTH_OFFSET,
 	       offsetof(HV_Topology, width));
-	DEFINE(HV_TOPOLOGY_HEIGHT_OFFSET, \
+	DEFINE(HV_TOPOLOGY_HEIGHT_OFFSET,
 	       offsetof(HV_Topology, height));
 
-	DEFINE(IRQ_CPUSTAT_SYSCALL_COUNT_OFFSET, \
+	DEFINE(IRQ_CPUSTAT_SYSCALL_COUNT_OFFSET,
 	       offsetof(irq_cpustat_t, irq_syscall_count));
 }
diff --git a/arch/tile/kernel/backtrace.c b/arch/tile/kernel/backtrace.c
index 9092ce8aa6b..f8b74ca83b9 100644
--- a/arch/tile/kernel/backtrace.c
+++ b/arch/tile/kernel/backtrace.c
@@ -14,6 +14,7 @@
 
 #include <linux/kernel.h>
 #include <linux/string.h>
+#include <asm/byteorder.h>
 #include <asm/backtrace.h>
 #include <asm/tile-desc.h>
 #include <arch/abi.h>
@@ -336,8 +337,12 @@ static void find_caller_pc_and_caller_sp(CallerLocation *location,
 				bytes_to_prefetch / sizeof(tile_bundle_bits);
 		}
 
-		/* Decode the next bundle. */
-		bundle.bits = prefetched_bundles[next_bundle++];
+		/*
+		 * Decode the next bundle.
+		 * TILE always stores instruction bundles in little-endian
+		 * mode, even when the chip is running in big-endian mode.
+		 */
+		bundle.bits = le64_to_cpu(prefetched_bundles[next_bundle++]);
 		bundle.num_insns =
 			parse_insn_tile(bundle.bits, pc, bundle.insns);
 		num_info_ops = bt_get_info_ops(&bundle, info_operands);
diff --git a/arch/tile/kernel/compat.c b/arch/tile/kernel/compat.c
index bf5e9d70266..49120843ff9 100644
--- a/arch/tile/kernel/compat.c
+++ b/arch/tile/kernel/compat.c
@@ -16,7 +16,6 @@
 #define __SYSCALL_COMPAT
 
 #include <linux/compat.h>
-#include <linux/msg.h>
 #include <linux/syscalls.h>
 #include <linux/kdev_t.h>
 #include <linux/fs.h>
@@ -33,121 +32,69 @@
  * adapt the usual convention.
  */
 
-long compat_sys_truncate64(char __user *filename, u32 dummy, u32 low, u32 high)
+COMPAT_SYSCALL_DEFINE4(truncate64, char __user *, filename, u32, dummy,
+                       u32, low, u32, high)
 {
 	return sys_truncate(filename, ((loff_t)high << 32) | low);
 }
 
-long compat_sys_ftruncate64(unsigned int fd, u32 dummy, u32 low, u32 high)
+COMPAT_SYSCALL_DEFINE4(ftruncate64, unsigned int, fd, u32, dummy,
+                       u32, low, u32, high)
 {
 	return sys_ftruncate(fd, ((loff_t)high << 32) | low);
 }
 
-long compat_sys_pread64(unsigned int fd, char __user *ubuf, size_t count,
-			u32 dummy, u32 low, u32 high)
+COMPAT_SYSCALL_DEFINE6(pread64, unsigned int, fd, char __user *, ubuf,
+                       size_t, count, u32, dummy, u32, low, u32, high)
 {
 	return sys_pread64(fd, ubuf, count, ((loff_t)high << 32) | low);
 }
 
-long compat_sys_pwrite64(unsigned int fd, char __user *ubuf, size_t count,
-			 u32 dummy, u32 low, u32 high)
+COMPAT_SYSCALL_DEFINE6(pwrite64, unsigned int, fd, char __user *, ubuf,
+                       size_t, count, u32, dummy, u32, low, u32, high)
 {
 	return sys_pwrite64(fd, ubuf, count, ((loff_t)high << 32) | low);
 }
 
-long compat_sys_lookup_dcookie(u32 low, u32 high, char __user *buf, size_t len)
-{
-	return sys_lookup_dcookie(((loff_t)high << 32) | low, buf, len);
-}
-
-long compat_sys_sync_file_range2(int fd, unsigned int flags,
-				 u32 offset_lo, u32 offset_hi,
-				 u32 nbytes_lo, u32 nbytes_hi)
+COMPAT_SYSCALL_DEFINE6(sync_file_range2, int, fd, unsigned int, flags,
+                       u32, offset_lo, u32, offset_hi,
+                       u32, nbytes_lo, u32, nbytes_hi)
 {
 	return sys_sync_file_range(fd, ((loff_t)offset_hi << 32) | offset_lo,
 				   ((loff_t)nbytes_hi << 32) | nbytes_lo,
 				   flags);
 }
 
-long compat_sys_fallocate(int fd, int mode,
-			  u32 offset_lo, u32 offset_hi,
-			  u32 len_lo, u32 len_hi)
+COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode,
+                       u32, offset_lo, u32, offset_hi,
+                       u32, len_lo, u32, len_hi)
 {
 	return sys_fallocate(fd, mode, ((loff_t)offset_hi << 32) | offset_lo,
 			     ((loff_t)len_hi << 32) | len_lo);
 }
 
-
-
-long compat_sys_sched_rr_get_interval(compat_pid_t pid,
-				      struct compat_timespec __user *interval)
-{
-	struct timespec t;
-	int ret;
-	mm_segment_t old_fs = get_fs();
-
-	set_fs(KERNEL_DS);
-	ret = sys_sched_rr_get_interval(pid,
-					(struct timespec __force __user *)&t);
-	set_fs(old_fs);
-	if (put_compat_timespec(&t, interval))
-		return -EFAULT;
-	return ret;
-}
-
 /*
- * The usual compat_sys_msgsnd() and _msgrcv() seem to be assuming
- * some different calling convention than our normal 32-bit tile code.
+ * Avoid bug in generic sys_llseek() that specifies offset_high and
+ * offset_low as "unsigned long", thus making it possible to pass
+ * a sign-extended high 32 bits in offset_low.
  */
-
-/* Already defined in ipc/compat.c, but we need it here. */
-struct compat_msgbuf {
-	compat_long_t mtype;
-	char mtext[1];
-};
-
-long tile_compat_sys_msgsnd(int msqid,
-			    struct compat_msgbuf __user *msgp,
-			    size_t msgsz, int msgflg)
+COMPAT_SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned int, offset_high,
+		       unsigned int, offset_low, loff_t __user *, result,
+		       unsigned int, origin)
 {
-	compat_long_t mtype;
-
-	if (get_user(mtype, &msgp->mtype))
-		return -EFAULT;
-	return do_msgsnd(msqid, mtype, msgp->mtext, msgsz, msgflg);
-}
-
-long tile_compat_sys_msgrcv(int msqid,
-			    struct compat_msgbuf __user *msgp,
-			    size_t msgsz, long msgtyp, int msgflg)
-{
-	long err, mtype;
-
-	err =  do_msgrcv(msqid, &mtype, msgp->mtext, msgsz, msgtyp, msgflg);
-	if (err < 0)
-		goto out;
-
-	if (put_user(mtype, &msgp->mtype))
-		err = -EFAULT;
- out:
-	return err;
+	return sys_llseek(fd, offset_high, offset_low, result, origin);
 }
 
 /* Provide the compat syscall number to call mapping. */
 #undef __SYSCALL
 #define __SYSCALL(nr, call) [nr] = (call),
 
-/* The generic versions of these don't work for Tile. */
-#define compat_sys_msgrcv tile_compat_sys_msgrcv
-#define compat_sys_msgsnd tile_compat_sys_msgsnd
-
 /* See comments in sys.c */
 #define compat_sys_fadvise64_64 sys32_fadvise64_64
 #define compat_sys_readahead sys32_readahead
+#define sys_llseek compat_sys_llseek
 
-/* Call the trampolines to manage pt_regs where necessary. */
-#define compat_sys_execve _compat_sys_execve
-#define compat_sys_sigaltstack _compat_sys_sigaltstack
+/* Call the assembly trampolines where necessary. */
 #define compat_sys_rt_sigreturn _compat_sys_rt_sigreturn
 #define sys_clone _sys_clone
 
diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c
index 77763ccd5a7..19c04b5ce40 100644
--- a/arch/tile/kernel/compat_signal.c
+++ b/arch/tile/kernel/compat_signal.c
@@ -32,21 +32,9 @@
 #include <asm/ucontext.h>
 #include <asm/sigframe.h>
 #include <asm/syscalls.h>
+#include <asm/vdso.h>
 #include <arch/interrupts.h>
 
-struct compat_sigaction {
-	compat_uptr_t sa_handler;
-	compat_ulong_t sa_flags;
-	compat_uptr_t sa_restorer;
-	sigset_t sa_mask __packed;
-};
-
-struct compat_sigaltstack {
-	compat_uptr_t ss_sp;
-	int ss_flags;
-	compat_size_t ss_size;
-};
-
 struct compat_ucontext {
 	compat_ulong_t	  uc_flags;
 	compat_uptr_t     uc_link;
@@ -55,129 +43,13 @@ struct compat_ucontext {
 	sigset_t	  uc_sigmask;	/* mask last for extensibility */
 };
 
-#define COMPAT_SI_PAD_SIZE	((SI_MAX_SIZE - 3 * sizeof(int)) / sizeof(int))
-
-struct compat_siginfo {
-	int si_signo;
-	int si_errno;
-	int si_code;
-
-	union {
-		int _pad[COMPAT_SI_PAD_SIZE];
-
-		/* kill() */
-		struct {
-			unsigned int _pid;	/* sender's pid */
-			unsigned int _uid;	/* sender's uid */
-		} _kill;
-
-		/* POSIX.1b timers */
-		struct {
-			compat_timer_t _tid;	/* timer id */
-			int _overrun;		/* overrun count */
-			compat_sigval_t _sigval;	/* same as below */
-			int _sys_private;	/* not to be passed to user */
-			int _overrun_incr;	/* amount to add to overrun */
-		} _timer;
-
-		/* POSIX.1b signals */
-		struct {
-			unsigned int _pid;	/* sender's pid */
-			unsigned int _uid;	/* sender's uid */
-			compat_sigval_t _sigval;
-		} _rt;
-
-		/* SIGCHLD */
-		struct {
-			unsigned int _pid;	/* which child */
-			unsigned int _uid;	/* sender's uid */
-			int _status;		/* exit code */
-			compat_clock_t _utime;
-			compat_clock_t _stime;
-		} _sigchld;
-
-		/* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
-		struct {
-			unsigned int _addr;	/* faulting insn/memory ref. */
-#ifdef __ARCH_SI_TRAPNO
-			int _trapno;	/* TRAP # which caused the signal */
-#endif
-		} _sigfault;
-
-		/* SIGPOLL */
-		struct {
-			int _band;	/* POLL_IN, POLL_OUT, POLL_MSG */
-			int _fd;
-		} _sigpoll;
-	} _sifields;
-};
-
 struct compat_rt_sigframe {
 	unsigned char save_area[C_ABI_SAVE_AREA_SIZE]; /* caller save area */
 	struct compat_siginfo info;
 	struct compat_ucontext uc;
 };
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-long compat_sys_rt_sigaction(int sig, struct compat_sigaction __user *act,
-			     struct compat_sigaction __user *oact,
-			     size_t sigsetsize)
-{
-	struct k_sigaction new_sa, old_sa;
-	int ret = -EINVAL;
-
-	/* XXX: Don't preclude handling different sized sigset_t's.  */
-	if (sigsetsize != sizeof(sigset_t))
-		goto out;
-
-	if (act) {
-		compat_uptr_t handler, restorer;
-
-		if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
-		    __get_user(handler, &act->sa_handler) ||
-		    __get_user(new_sa.sa.sa_flags, &act->sa_flags) ||
-		    __get_user(restorer, &act->sa_restorer) ||
-		    __copy_from_user(&new_sa.sa.sa_mask, &act->sa_mask,
-				     sizeof(sigset_t)))
-			return -EFAULT;
-		new_sa.sa.sa_handler = compat_ptr(handler);
-		new_sa.sa.sa_restorer = compat_ptr(restorer);
-	}
-
-	ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
-
-	if (!ret && oact) {
-		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
-		    __put_user(ptr_to_compat(old_sa.sa.sa_handler),
-			       &oact->sa_handler) ||
-		    __put_user(ptr_to_compat(old_sa.sa.sa_restorer),
-			       &oact->sa_restorer) ||
-		    __put_user(old_sa.sa.sa_flags, &oact->sa_flags) ||
-		    __copy_to_user(&oact->sa_mask, &old_sa.sa.sa_mask,
-				   sizeof(sigset_t)))
-			return -EFAULT;
-	}
-out:
-	return ret;
-}
-
-long compat_sys_rt_sigqueueinfo(int pid, int sig,
-				struct compat_siginfo __user *uinfo)
-{
-	siginfo_t info;
-	int ret;
-	mm_segment_t old_fs = get_fs();
-
-	if (copy_siginfo_from_user32(&info, uinfo))
-		return -EFAULT;
-	set_fs(KERNEL_DS);
-	ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __force __user *)&info);
-	set_fs(old_fs);
-	return ret;
-}
-
-int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from)
+int copy_siginfo_to_user32(struct compat_siginfo __user *to, const siginfo_t *from)
 {
 	int err;
 
@@ -255,44 +127,10 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from)
 	return err;
 }
 
-long compat_sys_sigaltstack(const struct compat_sigaltstack __user *uss_ptr,
-			    struct compat_sigaltstack __user *uoss_ptr,
-			    struct pt_regs *regs)
-{
-	stack_t uss, uoss;
-	int ret;
-	mm_segment_t seg;
-
-	if (uss_ptr) {
-		u32 ptr;
-
-		memset(&uss, 0, sizeof(stack_t));
-		if (!access_ok(VERIFY_READ, uss_ptr, sizeof(*uss_ptr)) ||
-			    __get_user(ptr, &uss_ptr->ss_sp) ||
-			    __get_user(uss.ss_flags, &uss_ptr->ss_flags) ||
-			    __get_user(uss.ss_size, &uss_ptr->ss_size))
-			return -EFAULT;
-		uss.ss_sp = compat_ptr(ptr);
-	}
-	seg = get_fs();
-	set_fs(KERNEL_DS);
-	ret = do_sigaltstack(uss_ptr ? (stack_t __user __force *)&uss : NULL,
-			     (stack_t __user __force *)&uoss,
-			     (unsigned long)compat_ptr(regs->sp));
-	set_fs(seg);
-	if (ret >= 0 && uoss_ptr)  {
-		if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(*uoss_ptr)) ||
-		    __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
-		    __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
-		    __put_user(uoss.ss_size, &uoss_ptr->ss_size))
-			ret = -EFAULT;
-	}
-	return ret;
-}
-
 /* The assembly shim for this function arranges to ignore the return value. */
-long compat_sys_rt_sigreturn(struct pt_regs *regs)
+long compat_sys_rt_sigreturn(void)
 {
+	struct pt_regs *regs = current_pt_regs();
 	struct compat_rt_sigframe __user *frame =
 		(struct compat_rt_sigframe __user *) compat_ptr(regs->sp);
 	sigset_t set;
@@ -302,13 +140,12 @@ long compat_sys_rt_sigreturn(struct pt_regs *regs)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
 		goto badframe;
 
-	if (compat_sys_sigaltstack(&frame->uc.uc_stack, NULL, regs) != 0)
+	if (compat_restore_altstack(&frame->uc.uc_stack))
 		goto badframe;
 
 	return 0;
@@ -385,17 +222,13 @@ int compat_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	err |= __clear_user(&frame->save_area, sizeof(frame->save_area));
 	err |= __put_user(0, &frame->uc.uc_flags);
 	err |= __put_user(0, &frame->uc.uc_link);
-	err |= __put_user(ptr_to_compat((void *)(current->sas_ss_sp)),
-			  &frame->uc.uc_stack.ss_sp);
-	err |= __put_user(sas_ss_flags(regs->sp),
-			  &frame->uc.uc_stack.ss_flags);
-	err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+	err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp);
 	err |= setup_sigcontext(&frame->uc.uc_mcontext, regs);
 	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
 	if (err)
 		goto give_sigsegv;
 
-	restorer = VDSO_BASE;
+	restorer = VDSO_SYM(&__vdso_rt_sigreturn);
 	if (ka->sa.sa_flags & SA_RESTORER)
 		restorer = ptr_to_compat_reg(ka->sa.sa_restorer);
 
@@ -403,28 +236,17 @@ int compat_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	 * Set up registers for signal handler.
 	 * Registers that we don't modify keep the value they had from
 	 * user-space at the time we took the signal.
+	 * We always pass siginfo and mcontext, regardless of SA_SIGINFO,
+	 * since some things rely on this (e.g. glibc's debug/segfault.c).
 	 */
 	regs->pc = ptr_to_compat_reg(ka->sa.sa_handler);
 	regs->ex1 = PL_ICS_EX1(USER_PL, 1); /* set crit sec in handler */
 	regs->sp = ptr_to_compat_reg(frame);
 	regs->lr = restorer;
 	regs->regs[0] = (unsigned long) usig;
-
-	if (ka->sa.sa_flags & SA_SIGINFO) {
-		/* Need extra arguments, so mark to restore caller-saves. */
-		regs->regs[1] = ptr_to_compat_reg(&frame->info);
-		regs->regs[2] = ptr_to_compat_reg(&frame->uc);
-		regs->flags |= PT_FLAGS_CALLER_SAVES;
-	}
-
-	/*
-	 * Notify any tracer that was single-stepping it.
-	 * The tracer may want to single-step inside the
-	 * handler too.
-	 */
-	if (test_thread_flag(TIF_SINGLESTEP))
-		ptrace_notify(SIGTRAP);
-
+	regs->regs[1] = ptr_to_compat_reg(&frame->info);
+	regs->regs[2] = ptr_to_compat_reg(&frame->uc);
+	regs->flags |= PT_FLAGS_CALLER_SAVES;
 	return 0;
 
 give_sigsegv:
diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c
index 493a0e66d91..b608e00e7f6 100644
--- a/arch/tile/kernel/early_printk.c
+++ b/arch/tile/kernel/early_printk.c
@@ -16,41 +16,31 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/string.h>
+#include <linux/irqflags.h>
+#include <linux/printk.h>
 #include <asm/setup.h>
 #include <hv/hypervisor.h>
 
 static void early_hv_write(struct console *con, const char *s, unsigned n)
 {
-	hv_console_write((HV_VirtAddr) s, n);
+	tile_console_write(s, n);
+
+	/*
+	 * Convert NL to NLCR (close enough to CRNL) during early boot.
+	 * We assume newlines are at the ends of strings, which turns out
+	 * to be good enough for early boot console output.
+	 */
+	if (n && s[n-1] == '\n')
+		tile_console_write("\r", 1);
 }
 
 static struct console early_hv_console = {
 	.name =		"earlyhv",
 	.write =	early_hv_write,
-	.flags =	CON_PRINTBUFFER,
+	.flags =	CON_PRINTBUFFER | CON_BOOT,
 	.index =	-1,
 };
 
-/* Direct interface for emergencies */
-static struct console *early_console = &early_hv_console;
-static int early_console_initialized;
-static int early_console_complete;
-
-static void early_vprintk(const char *fmt, va_list ap)
-{
-	char buf[512];
-	int n = vscnprintf(buf, sizeof(buf), fmt, ap);
-	early_console->write(early_console, buf, n);
-}
-
-void early_printk(const char *fmt, ...)
-{
-	va_list ap;
-	va_start(ap, fmt);
-	early_vprintk(fmt, ap);
-	va_end(ap);
-}
-
 void early_panic(const char *fmt, ...)
 {
 	va_list ap;
@@ -58,52 +48,21 @@ void early_panic(const char *fmt, ...)
 	va_start(ap, fmt);
 	early_printk("Kernel panic - not syncing: ");
 	early_vprintk(fmt, ap);
-	early_console->write(early_console, "\n", 1);
+	early_printk("\n");
 	va_end(ap);
 	dump_stack();
 	hv_halt();
 }
 
-static int __initdata keep_early;
-
 static int __init setup_early_printk(char *str)
 {
-	if (early_console_initialized)
+	if (early_console)
 		return 1;
 
-	if (str != NULL && strncmp(str, "keep", 4) == 0)
-		keep_early = 1;
-
 	early_console = &early_hv_console;
-	early_console_initialized = 1;
 	register_console(early_console);
 
 	return 0;
 }
 
-void __init disable_early_printk(void)
-{
-	early_console_complete = 1;
-	if (!early_console_initialized || !early_console)
-		return;
-	if (!keep_early) {
-		early_printk("disabling early console\n");
-		unregister_console(early_console);
-		early_console_initialized = 0;
-	} else {
-		early_printk("keeping early console\n");
-	}
-}
-
-void warn_early_printk(void)
-{
-	if (early_console_complete || early_console_initialized)
-		return;
-	early_printk("\
-Machine shutting down before console output is fully initialized.\n\
-You may wish to reboot and add the option 'earlyprintk' to your\n\
-boot command line to see any diagnostic early console output.\n\
-");
-}
-
 early_param("earlyprintk", setup_early_printk);
diff --git a/arch/tile/kernel/entry.S b/arch/tile/kernel/entry.S
index 431e9ae6048..3d9175992a2 100644
--- a/arch/tile/kernel/entry.S
+++ b/arch/tile/kernel/entry.S
@@ -27,33 +27,6 @@ STD_ENTRY(current_text_addr)
 	{ move r0, lr; jrp lr }
 	STD_ENDPROC(current_text_addr)
 
-/*
- * Implement execve().  The i386 code has a note that forking from kernel
- * space results in no copy on write until the execve, so we should be
- * careful not to write to the stack here.
- */
-STD_ENTRY(kernel_execve)
-	moveli TREG_SYSCALL_NR_NAME, __NR_execve
-	swint1
-	jrp lr
-	STD_ENDPROC(kernel_execve)
-
-/*
- * We don't run this function directly, but instead copy it to a page
- * we map into every user process.  See vdso_setup().
- *
- * Note that libc has a copy of this function that it uses to compare
- * against the PC when a stack backtrace ends, so if this code is
- * changed, the libc implementation(s) should also be updated.
- */
-	.pushsection .data
-ENTRY(__rt_sigreturn)
-	moveli TREG_SYSCALL_NR_NAME,__NR_rt_sigreturn
-	swint1
-	ENDPROC(__rt_sigreturn)
-	ENTRY(__rt_sigreturn_end)
-	.popsection
-
 STD_ENTRY(dump_stack)
 	{ move r2, lr; lnk r1 }
 	{ move r4, r52; addli r1, r1, dump_stack - . }
@@ -68,23 +41,10 @@ STD_ENTRY(KBacktraceIterator_init_current)
 	jrp lr   /* keep backtracer happy */
 	STD_ENDPROC(KBacktraceIterator_init_current)
 
-/*
- * Reset our stack to r1/r2 (sp and ksp0+cpu respectively), then
- * free the old stack (passed in r0) and re-invoke cpu_idle().
- * We update sp and ksp0 simultaneously to avoid backtracer warnings.
- */
-STD_ENTRY(cpu_idle_on_new_stack)
-	{
-	 move sp, r1
-	 mtspr SPR_SYSTEM_SAVE_K_0, r2
-	}
-	jal free_thread_info
-	j cpu_idle
-	STD_ENDPROC(cpu_idle_on_new_stack)
-
 /* Loop forever on a nap during SMP boot. */
 STD_ENTRY(smp_nap)
 	nap
+	nop       /* avoid provoking the icache prefetch with a jump */
 	j smp_nap /* we are not architecturally guaranteed not to exit nap */
 	jrp lr    /* clue in the backtracer */
 	STD_ENDPROC(smp_nap)
@@ -99,11 +59,13 @@ STD_ENTRY(smp_nap)
  */
 STD_ENTRY(_cpu_idle)
 	movei r1, 1
+	IRQ_ENABLE_LOAD(r2, r3)
 	mtspr INTERRUPT_CRITICAL_SECTION, r1
-	IRQ_ENABLE(r2, r3)             /* unmask, but still with ICS set */
+	IRQ_ENABLE_APPLY(r2, r3)       /* unmask, but still with ICS set */
 	mtspr INTERRUPT_CRITICAL_SECTION, zero
 	.global _cpu_idle_nap
 _cpu_idle_nap:
 	nap
+	nop       /* avoid provoking the icache prefetch with a jump */
 	jrp lr
 	STD_ENDPROC(_cpu_idle)
diff --git a/arch/tile/kernel/ftrace.c b/arch/tile/kernel/ftrace.c
new file mode 100644
index 00000000000..8d52d83cc51
--- /dev/null
+++ b/arch/tile/kernel/ftrace.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx specific ftrace support
+ */
+
+#include <linux/ftrace.h>
+#include <linux/uaccess.h>
+
+#include <asm/cacheflush.h>
+#include <asm/ftrace.h>
+#include <asm/sections.h>
+
+#include <arch/opcode.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static inline tilegx_bundle_bits NOP(void)
+{
+	return create_UnaryOpcodeExtension_X0(FNOP_UNARY_OPCODE_X0) |
+		create_RRROpcodeExtension_X0(UNARY_RRR_0_OPCODE_X0) |
+		create_Opcode_X0(RRR_0_OPCODE_X0) |
+		create_UnaryOpcodeExtension_X1(NOP_UNARY_OPCODE_X1) |
+		create_RRROpcodeExtension_X1(UNARY_RRR_0_OPCODE_X1) |
+		create_Opcode_X1(RRR_0_OPCODE_X1);
+}
+
+static int machine_stopped __read_mostly;
+
+int ftrace_arch_code_modify_prepare(void)
+{
+	machine_stopped = 1;
+	return 0;
+}
+
+int ftrace_arch_code_modify_post_process(void)
+{
+	flush_icache_range(0, CHIP_L1I_CACHE_SIZE());
+	machine_stopped = 0;
+	return 0;
+}
+
+/*
+ * Put { move r10, lr; jal ftrace_caller } in a bundle, this lets dynamic
+ * tracer just add one cycle overhead to every kernel function when disabled.
+ */
+static unsigned long ftrace_gen_branch(unsigned long pc, unsigned long addr,
+				       bool link)
+{
+	tilegx_bundle_bits opcode_x0, opcode_x1;
+	long pcrel_by_instr = (addr - pc) >> TILEGX_LOG2_BUNDLE_SIZE_IN_BYTES;
+
+	if (link) {
+		/* opcode: jal addr */
+		opcode_x1 =
+			create_Opcode_X1(JUMP_OPCODE_X1) |
+			create_JumpOpcodeExtension_X1(JAL_JUMP_OPCODE_X1) |
+			create_JumpOff_X1(pcrel_by_instr);
+	} else {
+		/* opcode: j addr */
+		opcode_x1 =
+			create_Opcode_X1(JUMP_OPCODE_X1) |
+			create_JumpOpcodeExtension_X1(J_JUMP_OPCODE_X1) |
+			create_JumpOff_X1(pcrel_by_instr);
+	}
+
+	if (addr == FTRACE_ADDR) {
+		/* opcode: or r10, lr, zero */
+		opcode_x0 =
+			create_Dest_X0(10) |
+			create_SrcA_X0(TREG_LR) |
+			create_SrcB_X0(TREG_ZERO) |
+			create_RRROpcodeExtension_X0(OR_RRR_0_OPCODE_X0) |
+			create_Opcode_X0(RRR_0_OPCODE_X0);
+	} else {
+		/* opcode: fnop */
+		opcode_x0 =
+			create_UnaryOpcodeExtension_X0(FNOP_UNARY_OPCODE_X0) |
+			create_RRROpcodeExtension_X0(UNARY_RRR_0_OPCODE_X0) |
+			create_Opcode_X0(RRR_0_OPCODE_X0);
+	}
+
+	return opcode_x1 | opcode_x0;
+}
+
+static unsigned long ftrace_nop_replace(struct dyn_ftrace *rec)
+{
+	return NOP();
+}
+
+static unsigned long ftrace_call_replace(unsigned long pc, unsigned long addr)
+{
+	return ftrace_gen_branch(pc, addr, true);
+}
+
+static int ftrace_modify_code(unsigned long pc, unsigned long old,
+			      unsigned long new)
+{
+	unsigned long pc_wr;
+
+	/* Check if the address is in kernel text space and module space. */
+	if (!kernel_text_address(pc))
+		return -EINVAL;
+
+	/* Operate on writable kernel text mapping. */
+	pc_wr = pc - MEM_SV_START + PAGE_OFFSET;
+
+	if (probe_kernel_write((void *)pc_wr, &new, MCOUNT_INSN_SIZE))
+		return -EPERM;
+
+	smp_wmb();
+
+	if (!machine_stopped && num_online_cpus() > 1)
+		flush_icache_range(pc, pc + MCOUNT_INSN_SIZE);
+
+	return 0;
+}
+
+int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+	unsigned long pc, old;
+	unsigned long new;
+	int ret;
+
+	pc = (unsigned long)&ftrace_call;
+	memcpy(&old, &ftrace_call, MCOUNT_INSN_SIZE);
+	new = ftrace_call_replace(pc, (unsigned long)func);
+
+	ret = ftrace_modify_code(pc, old, new);
+
+	return ret;
+}
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long new, old;
+	unsigned long ip = rec->ip;
+
+	old = ftrace_nop_replace(rec);
+	new = ftrace_call_replace(ip, addr);
+
+	return ftrace_modify_code(rec->ip, old, new);
+}
+
+int ftrace_make_nop(struct module *mod,
+		    struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned long old;
+	unsigned long new;
+	int ret;
+
+	old = ftrace_call_replace(ip, addr);
+	new = ftrace_nop_replace(rec);
+	ret = ftrace_modify_code(ip, old, new);
+
+	return ret;
+}
+
+int __init ftrace_dyn_arch_init(void)
+{
+	return 0;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
+			   unsigned long frame_pointer)
+{
+	unsigned long return_hooker = (unsigned long) &return_to_handler;
+	struct ftrace_graph_ent trace;
+	unsigned long old;
+	int err;
+
+	if (unlikely(atomic_read(&current->tracing_graph_pause)))
+		return;
+
+	old = *parent;
+	*parent = return_hooker;
+
+	err = ftrace_push_return_trace(old, self_addr, &trace.depth,
+				       frame_pointer);
+	if (err == -EBUSY) {
+		*parent = old;
+		return;
+	}
+
+	trace.func = self_addr;
+
+	/* Only trace if the calling function expects to */
+	if (!ftrace_graph_entry(&trace)) {
+		current->curr_ret_stack--;
+		*parent = old;
+	}
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern unsigned long ftrace_graph_call;
+
+static int __ftrace_modify_caller(unsigned long *callsite,
+				  void (*func) (void), bool enable)
+{
+	unsigned long caller_fn = (unsigned long) func;
+	unsigned long pc = (unsigned long) callsite;
+	unsigned long branch = ftrace_gen_branch(pc, caller_fn, false);
+	unsigned long nop = NOP();
+	unsigned long old = enable ? nop : branch;
+	unsigned long new = enable ? branch : nop;
+
+	return ftrace_modify_code(pc, old, new);
+}
+
+static int ftrace_modify_graph_caller(bool enable)
+{
+	int ret;
+
+	ret = __ftrace_modify_caller(&ftrace_graph_call,
+				     ftrace_graph_caller,
+				     enable);
+
+	return ret;
+}
+
+int ftrace_enable_ftrace_graph_caller(void)
+{
+	return ftrace_modify_graph_caller(true);
+}
+
+int ftrace_disable_ftrace_graph_caller(void)
+{
+	return ftrace_modify_graph_caller(false);
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/tile/kernel/futex_64.S b/arch/tile/kernel/futex_64.S
deleted file mode 100644
index f465d1eda20..00000000000
--- a/arch/tile/kernel/futex_64.S
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright 2011 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- *
- * Atomically access user memory, but use MMU to avoid propagating
- * kernel exceptions.
- */
-
-#include <linux/linkage.h>
-#include <asm/errno.h>
-#include <asm/futex.h>
-#include <asm/page.h>
-#include <asm/processor.h>
-
-/*
- * Provide a set of atomic memory operations supporting <asm/futex.h>.
- *
- * r0: user address to manipulate
- * r1: new value to write, or for cmpxchg, old value to compare against
- * r2: (cmpxchg only) new value to write
- *
- * Return __get_user struct, r0 with value, r1 with error.
- */
-#define FUTEX_OP(name, ...) \
-STD_ENTRY(futex_##name)			\
-	__VA_ARGS__;			\
-	{				\
-	 move   r1, zero;		\
-	 jrp    lr			\
-	};				\
-	STD_ENDPROC(futex_##name);	\
-	.pushsection __ex_table,"a";	\
-	.quad 1b, get_user_fault;	\
-	.popsection
-
-	.pushsection .fixup,"ax"
-get_user_fault:
-	{ movei r1, -EFAULT; jrp lr }
-	ENDPROC(get_user_fault)
-	.popsection
-
-FUTEX_OP(cmpxchg, mtspr CMPEXCH_VALUE, r1; 1: cmpexch4 r0, r0, r2)
-FUTEX_OP(set, 1: exch4 r0, r0, r1)
-FUTEX_OP(add, 1: fetchadd4 r0, r0, r1)
-FUTEX_OP(or, 1: fetchor4 r0, r0, r1)
-FUTEX_OP(andn, nor r1, r1, zero; 1: fetchand4 r0, r0, r1)
diff --git a/arch/tile/kernel/hardwall.c b/arch/tile/kernel/hardwall.c
index 8c41891aab3..531f4c36535 100644
--- a/arch/tile/kernel/hardwall.c
+++ b/arch/tile/kernel/hardwall.c
@@ -33,59 +33,157 @@
 
 
 /*
- * This data structure tracks the rectangle data, etc., associated
- * one-to-one with a "struct file *" from opening HARDWALL_FILE.
+ * Implement a per-cpu "hardwall" resource class such as UDN or IPI.
+ * We use "hardwall" nomenclature throughout for historical reasons.
+ * The lock here controls access to the list data structure as well as
+ * to the items on the list.
+ */
+struct hardwall_type {
+	int index;
+	int is_xdn;
+	int is_idn;
+	int disabled;
+	const char *name;
+	struct list_head list;
+	spinlock_t lock;
+	struct proc_dir_entry *proc_dir;
+};
+
+enum hardwall_index {
+	HARDWALL_UDN = 0,
+#ifndef __tilepro__
+	HARDWALL_IDN = 1,
+	HARDWALL_IPI = 2,
+#endif
+	_HARDWALL_TYPES
+};
+
+static struct hardwall_type hardwall_types[] = {
+	{  /* user-space access to UDN */
+		0,
+		1,
+		0,
+		0,
+		"udn",
+		LIST_HEAD_INIT(hardwall_types[HARDWALL_UDN].list),
+		__SPIN_LOCK_UNLOCKED(hardwall_types[HARDWALL_UDN].lock),
+		NULL
+	},
+#ifndef __tilepro__
+	{  /* user-space access to IDN */
+		1,
+		1,
+		1,
+		1,  /* disabled pending hypervisor support */
+		"idn",
+		LIST_HEAD_INIT(hardwall_types[HARDWALL_IDN].list),
+		__SPIN_LOCK_UNLOCKED(hardwall_types[HARDWALL_IDN].lock),
+		NULL
+	},
+	{  /* access to user-space IPI */
+		2,
+		0,
+		0,
+		0,
+		"ipi",
+		LIST_HEAD_INIT(hardwall_types[HARDWALL_IPI].list),
+		__SPIN_LOCK_UNLOCKED(hardwall_types[HARDWALL_IPI].lock),
+		NULL
+	},
+#endif
+};
+
+/*
+ * This data structure tracks the cpu data, etc., associated
+ * one-to-one with a "struct file *" from opening a hardwall device file.
  * Note that the file's private data points back to this structure.
  */
 struct hardwall_info {
-	struct list_head list;             /* "rectangles" list */
+	struct list_head list;             /* for hardwall_types.list */
 	struct list_head task_head;        /* head of tasks in this hardwall */
-	struct cpumask cpumask;            /* cpus in the rectangle */
+	struct hardwall_type *type;        /* type of this resource */
+	struct cpumask cpumask;            /* cpus reserved */
+	int id;                            /* integer id for this hardwall */
+	int teardown_in_progress;          /* are we tearing this one down? */
+
+	/* Remaining fields only valid for user-network resources. */
 	int ulhc_x;                        /* upper left hand corner x coord */
 	int ulhc_y;                        /* upper left hand corner y coord */
 	int width;                         /* rectangle width */
 	int height;                        /* rectangle height */
-	int id;                            /* integer id for this hardwall */
-	int teardown_in_progress;          /* are we tearing this one down? */
+#if CHIP_HAS_REV1_XDN()
+	atomic_t xdn_pending_count;        /* cores in phase 1 of drain */
+#endif
 };
 
-/* Currently allocated hardwall rectangles */
-static LIST_HEAD(rectangles);
 
 /* /proc/tile/hardwall */
 static struct proc_dir_entry *hardwall_proc_dir;
 
 /* Functions to manage files in /proc/tile/hardwall. */
-static void hardwall_add_proc(struct hardwall_info *rect);
-static void hardwall_remove_proc(struct hardwall_info *rect);
-
-/*
- * Guard changes to the hardwall data structures.
- * This could be finer grained (e.g. one lock for the list of hardwall
- * rectangles, then separate embedded locks for each one's list of tasks),
- * but there are subtle correctness issues when trying to start with
- * a task's "hardwall" pointer and lock the correct rectangle's embedded
- * lock in the presence of a simultaneous deactivation, so it seems
- * easier to have a single lock, given that none of these data
- * structures are touched very frequently during normal operation.
- */
-static DEFINE_SPINLOCK(hardwall_lock);
+static void hardwall_add_proc(struct hardwall_info *);
+static void hardwall_remove_proc(struct hardwall_info *);
 
 /* Allow disabling UDN access. */
-static int udn_disabled;
 static int __init noudn(char *str)
 {
 	pr_info("User-space UDN access is disabled\n");
-	udn_disabled = 1;
+	hardwall_types[HARDWALL_UDN].disabled = 1;
 	return 0;
 }
 early_param("noudn", noudn);
 
+#ifndef __tilepro__
+/* Allow disabling IDN access. */
+static int __init noidn(char *str)
+{
+	pr_info("User-space IDN access is disabled\n");
+	hardwall_types[HARDWALL_IDN].disabled = 1;
+	return 0;
+}
+early_param("noidn", noidn);
+
+/* Allow disabling IPI access. */
+static int __init noipi(char *str)
+{
+	pr_info("User-space IPI access is disabled\n");
+	hardwall_types[HARDWALL_IPI].disabled = 1;
+	return 0;
+}
+early_param("noipi", noipi);
+#endif
+
 
 /*
- * Low-level primitives
+ * Low-level primitives for UDN/IDN
  */
 
+#ifdef __tilepro__
+#define mtspr_XDN(hwt, name, val) \
+	do { (void)(hwt); __insn_mtspr(SPR_UDN_##name, (val)); } while (0)
+#define mtspr_MPL_XDN(hwt, name, val) \
+	do { (void)(hwt); __insn_mtspr(SPR_MPL_UDN_##name, (val)); } while (0)
+#define mfspr_XDN(hwt, name) \
+	((void)(hwt), __insn_mfspr(SPR_UDN_##name))
+#else
+#define mtspr_XDN(hwt, name, val)					\
+	do {								\
+		if ((hwt)->is_idn)					\
+			__insn_mtspr(SPR_IDN_##name, (val));		\
+		else							\
+			__insn_mtspr(SPR_UDN_##name, (val));		\
+	} while (0)
+#define mtspr_MPL_XDN(hwt, name, val)					\
+	do {								\
+		if ((hwt)->is_idn)					\
+			__insn_mtspr(SPR_MPL_IDN_##name, (val));	\
+		else							\
+			__insn_mtspr(SPR_MPL_UDN_##name, (val));	\
+	} while (0)
+#define mfspr_XDN(hwt, name) \
+  ((hwt)->is_idn ? __insn_mfspr(SPR_IDN_##name) : __insn_mfspr(SPR_UDN_##name))
+#endif
+
 /* Set a CPU bit if the CPU is online. */
 #define cpu_online_set(cpu, dst) do { \
 	if (cpu_online(cpu))          \
@@ -101,7 +199,7 @@ static int contains(struct hardwall_info *r, int x, int y)
 }
 
 /* Compute the rectangle parameters and validate the cpumask. */
-static int setup_rectangle(struct hardwall_info *r, struct cpumask *mask)
+static int check_rectangle(struct hardwall_info *r, struct cpumask *mask)
 {
 	int x, y, cpu, ulhc, lrhc;
 
@@ -114,8 +212,6 @@ static int setup_rectangle(struct hardwall_info *r, struct cpumask *mask)
 	r->ulhc_y = cpu_y(ulhc);
 	r->width = cpu_x(lrhc) - r->ulhc_x + 1;
 	r->height = cpu_y(lrhc) - r->ulhc_y + 1;
-	cpumask_copy(&r->cpumask, mask);
-	r->id = ulhc;   /* The ulhc cpu id can be the hardwall id. */
 
 	/* Width and height must be positive */
 	if (r->width <= 0 || r->height <= 0)
@@ -128,7 +224,7 @@ static int setup_rectangle(struct hardwall_info *r, struct cpumask *mask)
 				return -EINVAL;
 
 	/*
-	 * Note that offline cpus can't be drained when this UDN
+	 * Note that offline cpus can't be drained when this user network
 	 * rectangle eventually closes.  We used to detect this
 	 * situation and print a warning, but it annoyed users and
 	 * they ignored it anyway, so now we just return without a
@@ -137,16 +233,6 @@ static int setup_rectangle(struct hardwall_info *r, struct cpumask *mask)
 	return 0;
 }
 
-/* Do the two given rectangles overlap on any cpu? */
-static int overlaps(struct hardwall_info *a, struct hardwall_info *b)
-{
-	return a->ulhc_x + a->width > b->ulhc_x &&    /* A not to the left */
-		b->ulhc_x + b->width > a->ulhc_x &&   /* B not to the left */
-		a->ulhc_y + a->height > b->ulhc_y &&  /* A not above */
-		b->ulhc_y + b->height > a->ulhc_y;    /* B not above */
-}
-
-
 /*
  * Hardware management of hardwall setup, teardown, trapping,
  * and enabling/disabling PL0 access to the networks.
@@ -157,26 +243,38 @@ enum direction_protect {
 	N_PROTECT = (1 << 0),
 	E_PROTECT = (1 << 1),
 	S_PROTECT = (1 << 2),
-	W_PROTECT = (1 << 3)
+	W_PROTECT = (1 << 3),
+	C_PROTECT = (1 << 4),
 };
 
-static void enable_firewall_interrupts(void)
+static inline int xdn_which_interrupt(struct hardwall_type *hwt)
 {
-	arch_local_irq_unmask_now(INT_UDN_FIREWALL);
+#ifndef __tilepro__
+	if (hwt->is_idn)
+		return INT_IDN_FIREWALL;
+#endif
+	return INT_UDN_FIREWALL;
 }
 
-static void disable_firewall_interrupts(void)
+static void enable_firewall_interrupts(struct hardwall_type *hwt)
 {
-	arch_local_irq_mask_now(INT_UDN_FIREWALL);
+	arch_local_irq_unmask_now(xdn_which_interrupt(hwt));
+}
+
+static void disable_firewall_interrupts(struct hardwall_type *hwt)
+{
+	arch_local_irq_mask_now(xdn_which_interrupt(hwt));
 }
 
 /* Set up hardwall on this cpu based on the passed hardwall_info. */
-static void hardwall_setup_ipi_func(void *info)
+static void hardwall_setup_func(void *info)
 {
 	struct hardwall_info *r = info;
-	int cpu = smp_processor_id();
-	int x = cpu % smp_width;
-	int y = cpu / smp_width;
+	struct hardwall_type *hwt = r->type;
+
+	int cpu = smp_processor_id();  /* on_each_cpu disables preemption */
+	int x = cpu_x(cpu);
+	int y = cpu_y(cpu);
 	int bits = 0;
 	if (x == r->ulhc_x)
 		bits |= W_PROTECT;
@@ -187,13 +285,12 @@ static void hardwall_setup_ipi_func(void *info)
 	if (y == r->ulhc_y + r->height - 1)
 		bits |= S_PROTECT;
 	BUG_ON(bits == 0);
-	__insn_mtspr(SPR_UDN_DIRECTION_PROTECT, bits);
-	enable_firewall_interrupts();
-
+	mtspr_XDN(hwt, DIRECTION_PROTECT, bits);
+	enable_firewall_interrupts(hwt);
 }
 
 /* Set up all cpus on edge of rectangle to enable/disable hardwall SPRs. */
-static void hardwall_setup(struct hardwall_info *r)
+static void hardwall_protect_rectangle(struct hardwall_info *r)
 {
 	int x, y, cpu, delta;
 	struct cpumask rect_cpus;
@@ -217,37 +314,50 @@ static void hardwall_setup(struct hardwall_info *r)
 	}
 
 	/* Then tell all the cpus to set up their protection SPR */
-	on_each_cpu_mask(&rect_cpus, hardwall_setup_ipi_func, r, 1);
+	on_each_cpu_mask(&rect_cpus, hardwall_setup_func, r, 1);
 }
 
+/* Entered from INT_xDN_FIREWALL interrupt vector with irqs disabled. */
 void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num)
 {
 	struct hardwall_info *rect;
+	struct hardwall_type *hwt;
 	struct task_struct *p;
 	struct siginfo info;
-	int x, y;
 	int cpu = smp_processor_id();
 	int found_processes;
-	unsigned long flags;
-
 	struct pt_regs *old_regs = set_irq_regs(regs);
+
 	irq_enter();
 
+	/* Figure out which network trapped. */
+	switch (fault_num) {
+#ifndef __tilepro__
+	case INT_IDN_FIREWALL:
+		hwt = &hardwall_types[HARDWALL_IDN];
+		break;
+#endif
+	case INT_UDN_FIREWALL:
+		hwt = &hardwall_types[HARDWALL_UDN];
+		break;
+	default:
+		BUG();
+	}
+	BUG_ON(hwt->disabled);
+
 	/* This tile trapped a network access; find the rectangle. */
-	x = cpu % smp_width;
-	y = cpu / smp_width;
-	spin_lock_irqsave(&hardwall_lock, flags);
-	list_for_each_entry(rect, &rectangles, list) {
-		if (contains(rect, x, y))
+	spin_lock(&hwt->lock);
+	list_for_each_entry(rect, &hwt->list, list) {
+		if (cpumask_test_cpu(cpu, &rect->cpumask))
 			break;
 	}
 
 	/*
 	 * It shouldn't be possible not to find this cpu on the
 	 * rectangle list, since only cpus in rectangles get hardwalled.
-	 * The hardwall is only removed after the UDN is drained.
+	 * The hardwall is only removed after the user network is drained.
 	 */
-	BUG_ON(&rect->list == &rectangles);
+	BUG_ON(&rect->list == &hwt->list);
 
 	/*
 	 * If we already started teardown on this hardwall, don't worry;
@@ -255,30 +365,32 @@ void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num)
 	 * to quiesce.
 	 */
 	if (rect->teardown_in_progress) {
-		pr_notice("cpu %d: detected hardwall violation %#lx"
+		pr_notice("cpu %d: detected %s hardwall violation %#lx"
 		       " while teardown already in progress\n",
-		       cpu, (long) __insn_mfspr(SPR_UDN_DIRECTION_PROTECT));
+			  cpu, hwt->name,
+			  (long)mfspr_XDN(hwt, DIRECTION_PROTECT));
 		goto done;
 	}
 
 	/*
 	 * Kill off any process that is activated in this rectangle.
 	 * We bypass security to deliver the signal, since it must be
-	 * one of the activated processes that generated the UDN
+	 * one of the activated processes that generated the user network
 	 * message that caused this trap, and all the activated
 	 * processes shared a single open file so are pretty tightly
 	 * bound together from a security point of view to begin with.
 	 */
 	rect->teardown_in_progress = 1;
 	wmb(); /* Ensure visibility of rectangle before notifying processes. */
-	pr_notice("cpu %d: detected hardwall violation %#lx...\n",
-	       cpu, (long) __insn_mfspr(SPR_UDN_DIRECTION_PROTECT));
+	pr_notice("cpu %d: detected %s hardwall violation %#lx...\n",
+		  cpu, hwt->name, (long)mfspr_XDN(hwt, DIRECTION_PROTECT));
 	info.si_signo = SIGILL;
 	info.si_errno = 0;
 	info.si_code = ILL_HARDWALL;
 	found_processes = 0;
-	list_for_each_entry(p, &rect->task_head, thread.hardwall_list) {
-		BUG_ON(p->thread.hardwall != rect);
+	list_for_each_entry(p, &rect->task_head,
+			    thread.hardwall[hwt->index].list) {
+		BUG_ON(p->thread.hardwall[hwt->index].info != rect);
 		if (!(p->flags & PF_EXITING)) {
 			found_processes = 1;
 			pr_notice("hardwall: killing %d\n", p->pid);
@@ -289,7 +401,7 @@ void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num)
 		pr_notice("hardwall: no associated processes!\n");
 
  done:
-	spin_unlock_irqrestore(&hardwall_lock, flags);
+	spin_unlock(&hwt->lock);
 
 	/*
 	 * We have to disable firewall interrupts now, or else when we
@@ -298,48 +410,87 @@ void __kprobes do_hardwall_trap(struct pt_regs* regs, int fault_num)
 	 * haven't yet drained the network, and that would allow packets
 	 * to cross out of the hardwall region.
 	 */
-	disable_firewall_interrupts();
+	disable_firewall_interrupts(hwt);
 
 	irq_exit();
 	set_irq_regs(old_regs);
 }
 
-/* Allow access from user space to the UDN. */
-void grant_network_mpls(void)
+/* Allow access from user space to the user network. */
+void grant_hardwall_mpls(struct hardwall_type *hwt)
 {
-	__insn_mtspr(SPR_MPL_UDN_ACCESS_SET_0, 1);
-	__insn_mtspr(SPR_MPL_UDN_AVAIL_SET_0, 1);
-	__insn_mtspr(SPR_MPL_UDN_COMPLETE_SET_0, 1);
-	__insn_mtspr(SPR_MPL_UDN_TIMER_SET_0, 1);
+#ifndef __tilepro__
+	if (!hwt->is_xdn) {
+		__insn_mtspr(SPR_MPL_IPI_0_SET_0, 1);
+		return;
+	}
+#endif
+	mtspr_MPL_XDN(hwt, ACCESS_SET_0, 1);
+	mtspr_MPL_XDN(hwt, AVAIL_SET_0, 1);
+	mtspr_MPL_XDN(hwt, COMPLETE_SET_0, 1);
+	mtspr_MPL_XDN(hwt, TIMER_SET_0, 1);
 #if !CHIP_HAS_REV1_XDN()
-	__insn_mtspr(SPR_MPL_UDN_REFILL_SET_0, 1);
-	__insn_mtspr(SPR_MPL_UDN_CA_SET_0, 1);
+	mtspr_MPL_XDN(hwt, REFILL_SET_0, 1);
+	mtspr_MPL_XDN(hwt, CA_SET_0, 1);
 #endif
 }
 
-/* Deny access from user space to the UDN. */
-void restrict_network_mpls(void)
+/* Deny access from user space to the user network. */
+void restrict_hardwall_mpls(struct hardwall_type *hwt)
 {
-	__insn_mtspr(SPR_MPL_UDN_ACCESS_SET_1, 1);
-	__insn_mtspr(SPR_MPL_UDN_AVAIL_SET_1, 1);
-	__insn_mtspr(SPR_MPL_UDN_COMPLETE_SET_1, 1);
-	__insn_mtspr(SPR_MPL_UDN_TIMER_SET_1, 1);
+#ifndef __tilepro__
+	if (!hwt->is_xdn) {
+		__insn_mtspr(SPR_MPL_IPI_0_SET_1, 1);
+		return;
+	}
+#endif
+	mtspr_MPL_XDN(hwt, ACCESS_SET_1, 1);
+	mtspr_MPL_XDN(hwt, AVAIL_SET_1, 1);
+	mtspr_MPL_XDN(hwt, COMPLETE_SET_1, 1);
+	mtspr_MPL_XDN(hwt, TIMER_SET_1, 1);
 #if !CHIP_HAS_REV1_XDN()
-	__insn_mtspr(SPR_MPL_UDN_REFILL_SET_1, 1);
-	__insn_mtspr(SPR_MPL_UDN_CA_SET_1, 1);
+	mtspr_MPL_XDN(hwt, REFILL_SET_1, 1);
+	mtspr_MPL_XDN(hwt, CA_SET_1, 1);
 #endif
 }
 
+/* Restrict or deny as necessary for the task we're switching to. */
+void hardwall_switch_tasks(struct task_struct *prev,
+			   struct task_struct *next)
+{
+	int i;
+	for (i = 0; i < HARDWALL_TYPES; ++i) {
+		if (prev->thread.hardwall[i].info != NULL) {
+			if (next->thread.hardwall[i].info == NULL)
+				restrict_hardwall_mpls(&hardwall_types[i]);
+		} else if (next->thread.hardwall[i].info != NULL) {
+			grant_hardwall_mpls(&hardwall_types[i]);
+		}
+	}
+}
+
+/* Does this task have the right to IPI the given cpu? */
+int hardwall_ipi_valid(int cpu)
+{
+#ifdef __tilegx__
+	struct hardwall_info *info =
+		current->thread.hardwall[HARDWALL_IPI].info;
+	return info && cpumask_test_cpu(cpu, &info->cpumask);
+#else
+	return 0;
+#endif
+}
 
 /*
- * Code to create, activate, deactivate, and destroy hardwall rectangles.
+ * Code to create, activate, deactivate, and destroy hardwall resources.
  */
 
-/* Create a hardwall for the given rectangle */
-static struct hardwall_info *hardwall_create(
-	size_t size, const unsigned char __user *bits)
+/* Create a hardwall for the given resource */
+static struct hardwall_info *hardwall_create(struct hardwall_type *hwt,
+					     size_t size,
+					     const unsigned char __user *bits)
 {
-	struct hardwall_info *iter, *rect;
+	struct hardwall_info *iter, *info;
 	struct cpumask mask;
 	unsigned long flags;
 	int rc;
@@ -370,55 +521,70 @@ static struct hardwall_info *hardwall_create(
 		}
 	}
 
-	/* Allocate a new rectangle optimistically. */
-	rect = kmalloc(sizeof(struct hardwall_info),
+	/* Allocate a new hardwall_info optimistically. */
+	info = kmalloc(sizeof(struct hardwall_info),
 			GFP_KERNEL | __GFP_ZERO);
-	if (rect == NULL)
+	if (info == NULL)
 		return ERR_PTR(-ENOMEM);
-	INIT_LIST_HEAD(&rect->task_head);
+	INIT_LIST_HEAD(&info->task_head);
+	info->type = hwt;
 
 	/* Compute the rectangle size and validate that it's plausible. */
-	rc = setup_rectangle(rect, &mask);
-	if (rc != 0) {
-		kfree(rect);
-		return ERR_PTR(rc);
+	cpumask_copy(&info->cpumask, &mask);
+	info->id = find_first_bit(cpumask_bits(&mask), nr_cpumask_bits);
+	if (hwt->is_xdn) {
+		rc = check_rectangle(info, &mask);
+		if (rc != 0) {
+			kfree(info);
+			return ERR_PTR(rc);
+		}
 	}
 
+	/*
+	 * Eliminate cpus that are not part of this Linux client.
+	 * Note that this allows for configurations that we might not want to
+	 * support, such as one client on every even cpu, another client on
+	 * every odd cpu.
+	 */
+	cpumask_and(&info->cpumask, &info->cpumask, cpu_online_mask);
+
 	/* Confirm it doesn't overlap and add it to the list. */
-	spin_lock_irqsave(&hardwall_lock, flags);
-	list_for_each_entry(iter, &rectangles, list) {
-		if (overlaps(iter, rect)) {
-			spin_unlock_irqrestore(&hardwall_lock, flags);
-			kfree(rect);
+	spin_lock_irqsave(&hwt->lock, flags);
+	list_for_each_entry(iter, &hwt->list, list) {
+		if (cpumask_intersects(&iter->cpumask, &info->cpumask)) {
+			spin_unlock_irqrestore(&hwt->lock, flags);
+			kfree(info);
 			return ERR_PTR(-EBUSY);
 		}
 	}
-	list_add_tail(&rect->list, &rectangles);
-	spin_unlock_irqrestore(&hardwall_lock, flags);
+	list_add_tail(&info->list, &hwt->list);
+	spin_unlock_irqrestore(&hwt->lock, flags);
 
 	/* Set up appropriate hardwalling on all affected cpus. */
-	hardwall_setup(rect);
+	if (hwt->is_xdn)
+		hardwall_protect_rectangle(info);
 
 	/* Create a /proc/tile/hardwall entry. */
-	hardwall_add_proc(rect);
+	hardwall_add_proc(info);
 
-	return rect;
+	return info;
 }
 
 /* Activate a given hardwall on this cpu for this process. */
-static int hardwall_activate(struct hardwall_info *rect)
+static int hardwall_activate(struct hardwall_info *info)
 {
-	int cpu, x, y;
+	int cpu;
 	unsigned long flags;
 	struct task_struct *p = current;
 	struct thread_struct *ts = &p->thread;
+	struct hardwall_type *hwt;
 
-	/* Require a rectangle. */
-	if (rect == NULL)
+	/* Require a hardwall. */
+	if (info == NULL)
 		return -ENODATA;
 
-	/* Not allowed to activate a rectangle that is being torn down. */
-	if (rect->teardown_in_progress)
+	/* Not allowed to activate a hardwall that is being torn down. */
+	if (info->teardown_in_progress)
 		return -EINVAL;
 
 	/*
@@ -428,78 +594,87 @@ static int hardwall_activate(struct hardwall_info *rect)
 	if (cpumask_weight(&p->cpus_allowed) != 1)
 		return -EPERM;
 
-	/* Make sure we are bound to a cpu in this rectangle. */
+	/* Make sure we are bound to a cpu assigned to this resource. */
 	cpu = smp_processor_id();
 	BUG_ON(cpumask_first(&p->cpus_allowed) != cpu);
-	x = cpu_x(cpu);
-	y = cpu_y(cpu);
-	if (!contains(rect, x, y))
+	if (!cpumask_test_cpu(cpu, &info->cpumask))
 		return -EINVAL;
 
 	/* If we are already bound to this hardwall, it's a no-op. */
-	if (ts->hardwall) {
-		BUG_ON(ts->hardwall != rect);
+	hwt = info->type;
+	if (ts->hardwall[hwt->index].info) {
+		BUG_ON(ts->hardwall[hwt->index].info != info);
 		return 0;
 	}
 
-	/* Success!  This process gets to use the user networks on this cpu. */
-	ts->hardwall = rect;
-	spin_lock_irqsave(&hardwall_lock, flags);
-	list_add(&ts->hardwall_list, &rect->task_head);
-	spin_unlock_irqrestore(&hardwall_lock, flags);
-	grant_network_mpls();
-	printk(KERN_DEBUG "Pid %d (%s) activated for hardwall: cpu %d\n",
-	       p->pid, p->comm, cpu);
+	/* Success!  This process gets to use the resource on this cpu. */
+	ts->hardwall[hwt->index].info = info;
+	spin_lock_irqsave(&hwt->lock, flags);
+	list_add(&ts->hardwall[hwt->index].list, &info->task_head);
+	spin_unlock_irqrestore(&hwt->lock, flags);
+	grant_hardwall_mpls(hwt);
+	printk(KERN_DEBUG "Pid %d (%s) activated for %s hardwall: cpu %d\n",
+	       p->pid, p->comm, hwt->name, cpu);
 	return 0;
 }
 
 /*
- * Deactivate a task's hardwall.  Must hold hardwall_lock.
- * This method may be called from free_task(), so we don't want to
+ * Deactivate a task's hardwall.  Must hold lock for hardwall_type.
+ * This method may be called from exit_thread(), so we don't want to
  * rely on too many fields of struct task_struct still being valid.
  * We assume the cpus_allowed, pid, and comm fields are still valid.
  */
-static void _hardwall_deactivate(struct task_struct *task)
+static void _hardwall_deactivate(struct hardwall_type *hwt,
+				 struct task_struct *task)
 {
 	struct thread_struct *ts = &task->thread;
 
 	if (cpumask_weight(&task->cpus_allowed) != 1) {
-		pr_err("pid %d (%s) releasing networks with"
+		pr_err("pid %d (%s) releasing %s hardwall with"
 		       " an affinity mask containing %d cpus!\n",
-		       task->pid, task->comm,
+		       task->pid, task->comm, hwt->name,
 		       cpumask_weight(&task->cpus_allowed));
 		BUG();
 	}
 
-	BUG_ON(ts->hardwall == NULL);
-	ts->hardwall = NULL;
-	list_del(&ts->hardwall_list);
+	BUG_ON(ts->hardwall[hwt->index].info == NULL);
+	ts->hardwall[hwt->index].info = NULL;
+	list_del(&ts->hardwall[hwt->index].list);
 	if (task == current)
-		restrict_network_mpls();
+		restrict_hardwall_mpls(hwt);
 }
 
 /* Deactivate a task's hardwall. */
-int hardwall_deactivate(struct task_struct *task)
+static int hardwall_deactivate(struct hardwall_type *hwt,
+			       struct task_struct *task)
 {
 	unsigned long flags;
 	int activated;
 
-	spin_lock_irqsave(&hardwall_lock, flags);
-	activated = (task->thread.hardwall != NULL);
+	spin_lock_irqsave(&hwt->lock, flags);
+	activated = (task->thread.hardwall[hwt->index].info != NULL);
 	if (activated)
-		_hardwall_deactivate(task);
-	spin_unlock_irqrestore(&hardwall_lock, flags);
+		_hardwall_deactivate(hwt, task);
+	spin_unlock_irqrestore(&hwt->lock, flags);
 
 	if (!activated)
 		return -EINVAL;
 
-	printk(KERN_DEBUG "Pid %d (%s) deactivated for hardwall: cpu %d\n",
-	       task->pid, task->comm, smp_processor_id());
+	printk(KERN_DEBUG "Pid %d (%s) deactivated for %s hardwall: cpu %d\n",
+	       task->pid, task->comm, hwt->name, raw_smp_processor_id());
 	return 0;
 }
 
-/* Stop a UDN switch before draining the network. */
-static void stop_udn_switch(void *ignored)
+void hardwall_deactivate_all(struct task_struct *task)
+{
+	int i;
+	for (i = 0; i < HARDWALL_TYPES; ++i)
+		if (task->thread.hardwall[i].info)
+			hardwall_deactivate(&hardwall_types[i], task);
+}
+
+/* Stop the switch before draining the network. */
+static void stop_xdn_switch(void *arg)
 {
 #if !CHIP_HAS_REV1_XDN()
 	/* Freeze the switch and the demux. */
@@ -507,13 +682,71 @@ static void stop_udn_switch(void *ignored)
 		     SPR_UDN_SP_FREEZE__SP_FRZ_MASK |
 		     SPR_UDN_SP_FREEZE__DEMUX_FRZ_MASK |
 		     SPR_UDN_SP_FREEZE__NON_DEST_EXT_MASK);
+#else
+	/*
+	 * Drop all packets bound for the core or off the edge.
+	 * We rely on the normal hardwall protection setup code
+	 * to have set the low four bits to trigger firewall interrupts,
+	 * and shift those bits up to trigger "drop on send" semantics,
+	 * plus adding "drop on send to core" for all switches.
+	 * In practice it seems the switches latch the DIRECTION_PROTECT
+	 * SPR so they won't start dropping if they're already
+	 * delivering the last message to the core, but it doesn't
+	 * hurt to enable it here.
+	 */
+	struct hardwall_type *hwt = arg;
+	unsigned long protect = mfspr_XDN(hwt, DIRECTION_PROTECT);
+	mtspr_XDN(hwt, DIRECTION_PROTECT, (protect | C_PROTECT) << 5);
 #endif
 }
 
+static void empty_xdn_demuxes(struct hardwall_type *hwt)
+{
+#ifndef __tilepro__
+	if (hwt->is_idn) {
+		while (__insn_mfspr(SPR_IDN_DATA_AVAIL) & (1 << 0))
+			(void) __tile_idn0_receive();
+		while (__insn_mfspr(SPR_IDN_DATA_AVAIL) & (1 << 1))
+			(void) __tile_idn1_receive();
+		return;
+	}
+#endif
+	while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 0))
+		(void) __tile_udn0_receive();
+	while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 1))
+		(void) __tile_udn1_receive();
+	while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 2))
+		(void) __tile_udn2_receive();
+	while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 3))
+		(void) __tile_udn3_receive();
+}
+
 /* Drain all the state from a stopped switch. */
-static void drain_udn_switch(void *ignored)
+static void drain_xdn_switch(void *arg)
 {
-#if !CHIP_HAS_REV1_XDN()
+	struct hardwall_info *info = arg;
+	struct hardwall_type *hwt = info->type;
+
+#if CHIP_HAS_REV1_XDN()
+	/*
+	 * The switches have been configured to drop any messages
+	 * destined for cores (or off the edge of the rectangle).
+	 * But the current message may continue to be delivered,
+	 * so we wait until all the cores have finished any pending
+	 * messages before we stop draining.
+	 */
+	int pending = mfspr_XDN(hwt, PENDING);
+	while (pending--) {
+		empty_xdn_demuxes(hwt);
+		if (hwt->is_idn)
+			__tile_idn_send(0);
+		else
+			__tile_udn_send(0);
+	}
+	atomic_dec(&info->xdn_pending_count);
+	while (atomic_read(&info->xdn_pending_count))
+		empty_xdn_demuxes(hwt);
+#else
 	int i;
 	int from_tile_words, ca_count;
 
@@ -533,15 +766,7 @@ static void drain_udn_switch(void *ignored)
 		(void) __insn_mfspr(SPR_UDN_DEMUX_WRITE_FIFO);
 
 	/* Empty out demuxes. */
-	while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 0))
-		(void) __tile_udn0_receive();
-	while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 1))
-		(void) __tile_udn1_receive();
-	while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 2))
-		(void) __tile_udn2_receive();
-	while (__insn_mfspr(SPR_UDN_DATA_AVAIL) & (1 << 3))
-		(void) __tile_udn3_receive();
-	BUG_ON((__insn_mfspr(SPR_UDN_DATA_AVAIL) & 0xF) != 0);
+	empty_xdn_demuxes(hwt);
 
 	/* Empty out catch all. */
 	ca_count = __insn_mfspr(SPR_UDN_DEMUX_CA_COUNT);
@@ -563,21 +788,25 @@ static void drain_udn_switch(void *ignored)
 #endif
 }
 
-/* Reset random UDN state registers at boot up and during hardwall teardown. */
-void reset_network_state(void)
+/* Reset random XDN state registers at boot up and during hardwall teardown. */
+static void reset_xdn_network_state(struct hardwall_type *hwt)
 {
-#if !CHIP_HAS_REV1_XDN()
-	/* Reset UDN coordinates to their standard value */
-	unsigned int cpu = smp_processor_id();
-	unsigned int x = cpu % smp_width;
-	unsigned int y = cpu / smp_width;
-#endif
-
-	if (udn_disabled)
+	if (hwt->disabled)
 		return;
 
+	/* Clear out other random registers so we have a clean slate. */
+	mtspr_XDN(hwt, DIRECTION_PROTECT, 0);
+	mtspr_XDN(hwt, AVAIL_EN, 0);
+	mtspr_XDN(hwt, DEADLOCK_TIMEOUT, 0);
+
 #if !CHIP_HAS_REV1_XDN()
-	__insn_mtspr(SPR_UDN_TILE_COORD, (x << 18) | (y << 7));
+	/* Reset UDN coordinates to their standard value */
+	{
+		unsigned int cpu = smp_processor_id();
+		unsigned int x = cpu_x(cpu);
+		unsigned int y = cpu_y(cpu);
+		__insn_mtspr(SPR_UDN_TILE_COORD, (x << 18) | (y << 7));
+	}
 
 	/* Set demux tags to predefined values and enable them. */
 	__insn_mtspr(SPR_UDN_TAG_VALID, 0xf);
@@ -585,56 +814,50 @@ void reset_network_state(void)
 	__insn_mtspr(SPR_UDN_TAG_1, (1 << 1));
 	__insn_mtspr(SPR_UDN_TAG_2, (1 << 2));
 	__insn_mtspr(SPR_UDN_TAG_3, (1 << 3));
-#endif
 
-	/* Clear out other random registers so we have a clean slate. */
-	__insn_mtspr(SPR_UDN_AVAIL_EN, 0);
-	__insn_mtspr(SPR_UDN_DEADLOCK_TIMEOUT, 0);
-#if !CHIP_HAS_REV1_XDN()
+	/* Set other rev0 random registers to a clean state. */
 	__insn_mtspr(SPR_UDN_REFILL_EN, 0);
 	__insn_mtspr(SPR_UDN_DEMUX_QUEUE_SEL, 0);
 	__insn_mtspr(SPR_UDN_SP_FIFO_SEL, 0);
-#endif
 
 	/* Start the switch and demux. */
-#if !CHIP_HAS_REV1_XDN()
 	__insn_mtspr(SPR_UDN_SP_FREEZE, 0);
 #endif
 }
 
-/* Restart a UDN switch after draining. */
-static void restart_udn_switch(void *ignored)
+void reset_network_state(void)
 {
-	reset_network_state();
-
-	/* Disable firewall interrupts. */
-	__insn_mtspr(SPR_UDN_DIRECTION_PROTECT, 0);
-	disable_firewall_interrupts();
+	reset_xdn_network_state(&hardwall_types[HARDWALL_UDN]);
+#ifndef __tilepro__
+	reset_xdn_network_state(&hardwall_types[HARDWALL_IDN]);
+#endif
 }
 
-/* Build a struct cpumask containing all valid tiles in bounding rectangle. */
-static void fill_mask(struct hardwall_info *r, struct cpumask *result)
+/* Restart an XDN switch after draining. */
+static void restart_xdn_switch(void *arg)
 {
-	int x, y, cpu;
+	struct hardwall_type *hwt = arg;
 
-	cpumask_clear(result);
+#if CHIP_HAS_REV1_XDN()
+	/* One last drain step to avoid races with injection and draining. */
+	empty_xdn_demuxes(hwt);
+#endif
 
-	cpu = r->ulhc_y * smp_width + r->ulhc_x;
-	for (y = 0; y < r->height; ++y, cpu += smp_width - r->width) {
-		for (x = 0; x < r->width; ++x, ++cpu)
-			cpu_online_set(cpu, result);
-	}
+	reset_xdn_network_state(hwt);
+
+	/* Disable firewall interrupts. */
+	disable_firewall_interrupts(hwt);
 }
 
 /* Last reference to a hardwall is gone, so clear the network. */
-static void hardwall_destroy(struct hardwall_info *rect)
+static void hardwall_destroy(struct hardwall_info *info)
 {
 	struct task_struct *task;
+	struct hardwall_type *hwt;
 	unsigned long flags;
-	struct cpumask mask;
 
-	/* Make sure this file actually represents a rectangle. */
-	if (rect == NULL)
+	/* Make sure this file actually represents a hardwall. */
+	if (info == NULL)
 		return;
 
 	/*
@@ -644,39 +867,53 @@ static void hardwall_destroy(struct hardwall_info *rect)
 	 * deactivate any remaining tasks before freeing the
 	 * hardwall_info object itself.
 	 */
-	spin_lock_irqsave(&hardwall_lock, flags);
-	list_for_each_entry(task, &rect->task_head, thread.hardwall_list)
-		_hardwall_deactivate(task);
-	spin_unlock_irqrestore(&hardwall_lock, flags);
-
-	/* Drain the UDN. */
-	printk(KERN_DEBUG "Clearing hardwall rectangle %dx%d %d,%d\n",
-	       rect->width, rect->height, rect->ulhc_x, rect->ulhc_y);
-	fill_mask(rect, &mask);
-	on_each_cpu_mask(&mask, stop_udn_switch, NULL, 1);
-	on_each_cpu_mask(&mask, drain_udn_switch, NULL, 1);
+	hwt = info->type;
+	info->teardown_in_progress = 1;
+	spin_lock_irqsave(&hwt->lock, flags);
+	list_for_each_entry(task, &info->task_head,
+			    thread.hardwall[hwt->index].list)
+		_hardwall_deactivate(hwt, task);
+	spin_unlock_irqrestore(&hwt->lock, flags);
+
+	if (hwt->is_xdn) {
+		/* Configure the switches for draining the user network. */
+		printk(KERN_DEBUG
+		       "Clearing %s hardwall rectangle %dx%d %d,%d\n",
+		       hwt->name, info->width, info->height,
+		       info->ulhc_x, info->ulhc_y);
+		on_each_cpu_mask(&info->cpumask, stop_xdn_switch, hwt, 1);
+
+		/* Drain the network. */
+#if CHIP_HAS_REV1_XDN()
+		atomic_set(&info->xdn_pending_count,
+			   cpumask_weight(&info->cpumask));
+		on_each_cpu_mask(&info->cpumask, drain_xdn_switch, info, 0);
+#else
+		on_each_cpu_mask(&info->cpumask, drain_xdn_switch, info, 1);
+#endif
 
-	/* Restart switch and disable firewall. */
-	on_each_cpu_mask(&mask, restart_udn_switch, NULL, 1);
+		/* Restart switch and disable firewall. */
+		on_each_cpu_mask(&info->cpumask, restart_xdn_switch, hwt, 1);
+	}
 
 	/* Remove the /proc/tile/hardwall entry. */
-	hardwall_remove_proc(rect);
-
-	/* Now free the rectangle from the list. */
-	spin_lock_irqsave(&hardwall_lock, flags);
-	BUG_ON(!list_empty(&rect->task_head));
-	list_del(&rect->list);
-	spin_unlock_irqrestore(&hardwall_lock, flags);
-	kfree(rect);
+	hardwall_remove_proc(info);
+
+	/* Now free the hardwall from the list. */
+	spin_lock_irqsave(&hwt->lock, flags);
+	BUG_ON(!list_empty(&info->task_head));
+	list_del(&info->list);
+	spin_unlock_irqrestore(&hwt->lock, flags);
+	kfree(info);
 }
 
 
 static int hardwall_proc_show(struct seq_file *sf, void *v)
 {
-	struct hardwall_info *rect = sf->private;
+	struct hardwall_info *info = sf->private;
 	char buf[256];
 
-	int rc = cpulist_scnprintf(buf, sizeof(buf), &rect->cpumask);
+	int rc = cpulist_scnprintf(buf, sizeof(buf), &info->cpumask);
 	buf[rc++] = '\n';
 	seq_write(sf, buf, rc);
 	return 0;
@@ -685,7 +922,7 @@ static int hardwall_proc_show(struct seq_file *sf, void *v)
 static int hardwall_proc_open(struct inode *inode,
 			      struct file *file)
 {
-	return single_open(file, hardwall_proc_show, PDE(inode)->data);
+	return single_open(file, hardwall_proc_show, PDE_DATA(inode));
 }
 
 static const struct file_operations hardwall_proc_fops = {
@@ -695,31 +932,45 @@ static const struct file_operations hardwall_proc_fops = {
 	.release	= single_release,
 };
 
-static void hardwall_add_proc(struct hardwall_info *rect)
+static void hardwall_add_proc(struct hardwall_info *info)
 {
 	char buf[64];
-	snprintf(buf, sizeof(buf), "%d", rect->id);
-	proc_create_data(buf, 0444, hardwall_proc_dir,
-			 &hardwall_proc_fops, rect);
+	snprintf(buf, sizeof(buf), "%d", info->id);
+	proc_create_data(buf, 0444, info->type->proc_dir,
+			 &hardwall_proc_fops, info);
 }
 
-static void hardwall_remove_proc(struct hardwall_info *rect)
+static void hardwall_remove_proc(struct hardwall_info *info)
 {
 	char buf[64];
-	snprintf(buf, sizeof(buf), "%d", rect->id);
-	remove_proc_entry(buf, hardwall_proc_dir);
+	snprintf(buf, sizeof(buf), "%d", info->id);
+	remove_proc_entry(buf, info->type->proc_dir);
 }
 
 int proc_pid_hardwall(struct task_struct *task, char *buffer)
 {
-	struct hardwall_info *rect = task->thread.hardwall;
-	return rect ? sprintf(buffer, "%d\n", rect->id) : 0;
+	int i;
+	int n = 0;
+	for (i = 0; i < HARDWALL_TYPES; ++i) {
+		struct hardwall_info *info = task->thread.hardwall[i].info;
+		if (info)
+			n += sprintf(&buffer[n], "%s: %d\n",
+				     info->type->name, info->id);
+	}
+	return n;
 }
 
 void proc_tile_hardwall_init(struct proc_dir_entry *root)
 {
-	if (!udn_disabled)
-		hardwall_proc_dir = proc_mkdir("hardwall", root);
+	int i;
+	for (i = 0; i < HARDWALL_TYPES; ++i) {
+		struct hardwall_type *hwt = &hardwall_types[i];
+		if (hwt->disabled)
+			continue;
+		if (hardwall_proc_dir == NULL)
+			hardwall_proc_dir = proc_mkdir("hardwall", root);
+		hwt->proc_dir = proc_mkdir(hwt->name, hardwall_proc_dir);
+	}
 }
 
 
@@ -729,34 +980,45 @@ void proc_tile_hardwall_init(struct proc_dir_entry *root)
 
 static long hardwall_ioctl(struct file *file, unsigned int a, unsigned long b)
 {
-	struct hardwall_info *rect = file->private_data;
+	struct hardwall_info *info = file->private_data;
+	int minor = iminor(file->f_mapping->host);
+	struct hardwall_type* hwt;
 
 	if (_IOC_TYPE(a) != HARDWALL_IOCTL_BASE)
 		return -EINVAL;
 
+	BUILD_BUG_ON(HARDWALL_TYPES != _HARDWALL_TYPES);
+	BUILD_BUG_ON(HARDWALL_TYPES !=
+		     sizeof(hardwall_types)/sizeof(hardwall_types[0]));
+
+	if (minor < 0 || minor >= HARDWALL_TYPES)
+		return -EINVAL;
+	hwt = &hardwall_types[minor];
+	WARN_ON(info && hwt != info->type);
+
 	switch (_IOC_NR(a)) {
 	case _HARDWALL_CREATE:
-		if (udn_disabled)
+		if (hwt->disabled)
 			return -ENOSYS;
-		if (rect != NULL)
+		if (info != NULL)
 			return -EALREADY;
-		rect = hardwall_create(_IOC_SIZE(a),
-					(const unsigned char __user *)b);
-		if (IS_ERR(rect))
-			return PTR_ERR(rect);
-		file->private_data = rect;
+		info = hardwall_create(hwt, _IOC_SIZE(a),
+				       (const unsigned char __user *)b);
+		if (IS_ERR(info))
+			return PTR_ERR(info);
+		file->private_data = info;
 		return 0;
 
 	case _HARDWALL_ACTIVATE:
-		return hardwall_activate(rect);
+		return hardwall_activate(info);
 
 	case _HARDWALL_DEACTIVATE:
-		if (current->thread.hardwall != rect)
+		if (current->thread.hardwall[hwt->index].info != info)
 			return -EINVAL;
-		return hardwall_deactivate(current);
+		return hardwall_deactivate(hwt, current);
 
 	case _HARDWALL_GET_ID:
-		return rect ? rect->id : -EINVAL;
+		return info ? info->id : -EINVAL;
 
 	default:
 		return -EINVAL;
@@ -775,26 +1037,28 @@ static long hardwall_compat_ioctl(struct file *file,
 /* The user process closed the file; revoke access to user networks. */
 static int hardwall_flush(struct file *file, fl_owner_t owner)
 {
-	struct hardwall_info *rect = file->private_data;
+	struct hardwall_info *info = file->private_data;
 	struct task_struct *task, *tmp;
 	unsigned long flags;
 
-	if (rect) {
+	if (info) {
 		/*
 		 * NOTE: if multiple threads are activated on this hardwall
 		 * file, the other threads will continue having access to the
-		 * UDN until they are context-switched out and back in again.
+		 * user network until they are context-switched out and back
+		 * in again.
 		 *
 		 * NOTE: A NULL files pointer means the task is being torn
 		 * down, so in that case we also deactivate it.
 		 */
-		spin_lock_irqsave(&hardwall_lock, flags);
-		list_for_each_entry_safe(task, tmp, &rect->task_head,
-					 thread.hardwall_list) {
+		struct hardwall_type *hwt = info->type;
+		spin_lock_irqsave(&hwt->lock, flags);
+		list_for_each_entry_safe(task, tmp, &info->task_head,
+					 thread.hardwall[hwt->index].list) {
 			if (task->files == owner || task->files == NULL)
-				_hardwall_deactivate(task);
+				_hardwall_deactivate(hwt, task);
 		}
-		spin_unlock_irqrestore(&hardwall_lock, flags);
+		spin_unlock_irqrestore(&hwt->lock, flags);
 	}
 
 	return 0;
@@ -824,11 +1088,11 @@ static int __init dev_hardwall_init(void)
 	int rc;
 	dev_t dev;
 
-	rc = alloc_chrdev_region(&dev, 0, 1, "hardwall");
+	rc = alloc_chrdev_region(&dev, 0, HARDWALL_TYPES, "hardwall");
 	if (rc < 0)
 		return rc;
 	cdev_init(&hardwall_dev, &dev_hardwall_fops);
-	rc = cdev_add(&hardwall_dev, dev, 1);
+	rc = cdev_add(&hardwall_dev, dev, HARDWALL_TYPES);
 	if (rc < 0)
 		return rc;
 
diff --git a/arch/tile/kernel/head_32.S b/arch/tile/kernel/head_32.S
index 1a39b7c1c87..8d5b40ff292 100644
--- a/arch/tile/kernel/head_32.S
+++ b/arch/tile/kernel/head_32.S
@@ -38,13 +38,13 @@ ENTRY(_start)
 	  movei r2, TILE_CHIP_REV
 	}
 	{
-	  moveli r0, _HV_VERSION
-	  jal hv_init
+	  moveli r0, _HV_VERSION_OLD_HV_INIT
+	  jal _hv_init
 	}
 	/* Get a reasonable default ASID in r0 */
 	{
 	  move r0, zero
-	  jal hv_inquire_asid
+	  jal _hv_inquire_asid
 	}
 	/* Install the default page table */
 	{
@@ -64,21 +64,21 @@ ENTRY(_start)
 	  auli r0, r0, ha16(swapper_pg_dir - PAGE_OFFSET)
 	}
 	{
-	  inv r6
+	  finv r6
 	  move r1, zero   /* high 32 bits of CPA is zero */
 	}
 	{
 	  moveli lr, lo16(1f)
-	  move r5, zero
+	  moveli r5, CTX_PAGE_FLAG
 	}
 	{
 	  auli lr, lr, ha16(1f)
-	  j hv_install_context
+	  j _hv_install_context
 	}
 1:
 
 	/* Get our processor number and save it away in SAVE_K_0. */
-	jal hv_inquire_topology
+	jal _hv_inquire_topology
 	mulll_uu r4, r1, r2        /* r1 == y, r2 == width */
 	add r4, r4, r0             /* r0 == x, so r4 == cpu == y*width + x */
 
@@ -86,7 +86,7 @@ ENTRY(_start)
 	/*
 	 * Load up our per-cpu offset.  When the first (master) tile
 	 * boots, this value is still zero, so we will load boot_pc
-	 * with start_kernel, and boot_sp with init_stack + THREAD_SIZE.
+	 * with start_kernel, and boot_sp at the top of init_stack.
 	 * The master tile initializes the per-cpu offset array, so that
 	 * when subsequent (secondary) tiles boot, they will instead load
 	 * from their per-cpu versions of boot_sp and boot_pc.
@@ -126,7 +126,6 @@ ENTRY(_start)
 	lw sp, r1
 	or r4, sp, r4
 	mtspr SPR_SYSTEM_SAVE_K_0, r4  /* save ksp0 + cpu */
-	addi sp, sp, -STACK_TOP_DELTA
 	{
 	  move lr, zero   /* stop backtraces in the called function */
 	  jr r0
@@ -141,11 +140,11 @@ ENTRY(empty_zero_page)
 
 	.macro PTE va, cpa, bits1, no_org=0
 	.ifeq \no_org
-	.org swapper_pg_dir + HV_L1_INDEX(\va) * HV_PTE_SIZE
+	.org swapper_pg_dir + PGD_INDEX(\va) * HV_PTE_SIZE
 	.endif
 	.word HV_PTE_PAGE | HV_PTE_DIRTY | HV_PTE_PRESENT | HV_PTE_ACCESSED | \
 	      (HV_PTE_MODE_CACHE_NO_L3 << HV_PTE_INDEX_MODE)
-	.word (\bits1) | (HV_CPA_TO_PFN(\cpa) << (HV_PTE_INDEX_PFN - 32))
+	.word (\bits1) | (HV_CPA_TO_PTFN(\cpa) << (HV_PTE_INDEX_PTFN - 32))
 	.endm
 
 __PAGE_ALIGNED_DATA
@@ -163,10 +162,10 @@ ENTRY(swapper_pg_dir)
 	.set addr, addr + PGDIR_SIZE
 	.endr
 
-	/* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */
-	PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
+	/* The true text VAs are mapped as VA = PA + MEM_SV_START */
+	PTE MEM_SV_START, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
 			      (1 << (HV_PTE_INDEX_EXECUTABLE - 32))
-	.org swapper_pg_dir + HV_L1_SIZE
+	.org swapper_pg_dir + PGDIR_SIZE
 	END(swapper_pg_dir)
 
 	/*
diff --git a/arch/tile/kernel/head_64.S b/arch/tile/kernel/head_64.S
index 6bc3a932fe4..bd0e12f283f 100644
--- a/arch/tile/kernel/head_64.S
+++ b/arch/tile/kernel/head_64.S
@@ -25,6 +25,15 @@
 #include <arch/chip.h>
 #include <arch/spr_def.h>
 
+/* Extract two 32-bit bit values that were read into one register. */
+#ifdef __BIG_ENDIAN__
+#define GET_FIRST_INT(rd, rs) shrsi rd, rs, 32
+#define GET_SECOND_INT(rd, rs) addxi rd, rs, 0
+#else
+#define GET_FIRST_INT(rd, rs) addxi rd, rs, 0
+#define GET_SECOND_INT(rd, rs) shrsi rd, rs, 32
+#endif
+
 /*
  * This module contains the entry code for kernel images. It performs the
  * minimal setup needed to call the generic C routines.
@@ -34,17 +43,23 @@
 ENTRY(_start)
 	/* Notify the hypervisor of what version of the API we want */
 	{
+#if KERNEL_PL == 1 && _HV_VERSION == 13
+	  /* Support older hypervisors by asking for API version 12. */
+	  movei r0, _HV_VERSION_OLD_HV_INIT
+#else
+	  movei r0, _HV_VERSION
+#endif
 	  movei r1, TILE_CHIP
-	  movei r2, TILE_CHIP_REV
 	}
 	{
-	  moveli r0, _HV_VERSION
-	  jal hv_init
+	  movei r2, TILE_CHIP_REV
+	  movei r3, KERNEL_PL
 	}
+	jal _hv_init
 	/* Get a reasonable default ASID in r0 */
 	{
 	  move r0, zero
-	  jal hv_inquire_asid
+	  jal _hv_inquire_asid
 	}
 
 	/*
@@ -55,7 +70,7 @@ ENTRY(_start)
 	 * other CPUs should see a properly-constructed page table.
 	 */
 	{
-	  v4int_l r2, zero, r0    /* ASID for hv_install_context */
+	  GET_FIRST_INT(r2, r0)    /* ASID for hv_install_context */
 	  moveli r4, hw1_last(swapper_pgprot - PAGE_OFFSET)
 	}
 	{
@@ -71,7 +86,7 @@ ENTRY(_start)
 	{
 	  /* After initializing swapper_pgprot, HV_PTE_GLOBAL is set. */
 	  bfextu r7, r1, HV_PTE_INDEX_GLOBAL, HV_PTE_INDEX_GLOBAL
-	  inv r4
+	  finv r4
 	}
 	bnez r7, .Lno_write
 	{
@@ -114,30 +129,25 @@ ENTRY(_start)
 	  shl16insli r0, r0, hw0(swapper_pg_dir - PAGE_OFFSET)
 	}
 	{
-	  move r3, zero
-	  j hv_install_context
+	  moveli r3, CTX_PAGE_FLAG
+	  j _hv_install_context
 	}
 1:
 
 	/* Install the interrupt base. */
-	moveli r0, hw2_last(MEM_SV_START)
-	shl16insli r0, r0, hw1(MEM_SV_START)
-	shl16insli r0, r0, hw0(MEM_SV_START)
+	moveli r0, hw2_last(intrpt_start)
+	shl16insli r0, r0, hw1(intrpt_start)
+	shl16insli r0, r0, hw0(intrpt_start)
 	mtspr SPR_INTERRUPT_VECTOR_BASE_K, r0
 
-	/*
-	 * Get our processor number and save it away in SAVE_K_0.
-	 * Extract stuff from the topology structure: r4 = y, r6 = x,
-	 * r5 = width.  FIXME: consider whether we want to just make these
-	 * 64-bit values (and if so fix smp_topology write below, too).
-	 */
-	jal hv_inquire_topology
+	/* Get our processor number and save it away in SAVE_K_0. */
+	jal _hv_inquire_topology
 	{
-	  v4int_l r5, zero, r1    /* r5 = width */
-	  shrui r4, r0, 32        /* r4 = y */
+	  GET_FIRST_INT(r5, r1)   /* r5 = width */
+	  GET_SECOND_INT(r4, r0)  /* r4 = y */
 	}
 	{
-	  v4int_l r6, zero, r0    /* r6 = x */
+	  GET_FIRST_INT(r6, r0)   /* r6 = x */
 	  mul_lu_lu r4, r4, r5
 	}
 	{
@@ -148,7 +158,7 @@ ENTRY(_start)
 	/*
 	 * Load up our per-cpu offset.  When the first (master) tile
 	 * boots, this value is still zero, so we will load boot_pc
-	 * with start_kernel, and boot_sp with init_stack + THREAD_SIZE.
+	 * with start_kernel, and boot_sp with at the top of init_stack.
 	 * The master tile initializes the per-cpu offset array, so that
 	 * when subsequent (secondary) tiles boot, they will instead load
 	 * from their per-cpu versions of boot_sp and boot_pc.
@@ -192,9 +202,9 @@ ENTRY(_start)
 	}
 	ld r0, r0
 	ld sp, r1
-	or r4, sp, r4
+	shli r4, r4, CPU_SHIFT
+	bfins r4, sp, 0, CPU_SHIFT-1
 	mtspr SPR_SYSTEM_SAVE_K_0, r4  /* save ksp0 + cpu */
-	addi sp, sp, -STACK_TOP_DELTA
 	{
 	  move lr, zero   /* stop backtraces in the called function */
 	  jr r0
@@ -210,19 +220,19 @@ ENTRY(empty_zero_page)
 	.macro PTE cpa, bits1
 	.quad HV_PTE_PAGE | HV_PTE_DIRTY | HV_PTE_PRESENT | HV_PTE_ACCESSED |\
 	      HV_PTE_GLOBAL | (HV_PTE_MODE_CACHE_NO_L3 << HV_PTE_INDEX_MODE) |\
-	      (\bits1) | (HV_CPA_TO_PFN(\cpa) << HV_PTE_INDEX_PFN)
+	      (\bits1) | (HV_CPA_TO_PTFN(\cpa) << HV_PTE_INDEX_PTFN)
 	.endm
 
 __PAGE_ALIGNED_DATA
 	.align PAGE_SIZE
 ENTRY(swapper_pg_dir)
-	.org swapper_pg_dir + HV_L0_INDEX(PAGE_OFFSET) * HV_PTE_SIZE
+	.org swapper_pg_dir + PGD_INDEX(PAGE_OFFSET) * HV_PTE_SIZE
 .Lsv_data_pmd:
 	.quad 0  /* PTE temp_data_pmd - PAGE_OFFSET, 0 */
-	.org swapper_pg_dir + HV_L0_INDEX(MEM_SV_START) * HV_PTE_SIZE
+	.org swapper_pg_dir + PGD_INDEX(MEM_SV_START) * HV_PTE_SIZE
 .Lsv_code_pmd:
 	.quad 0  /* PTE temp_code_pmd - PAGE_OFFSET, 0 */
-	.org swapper_pg_dir + HV_L0_SIZE
+	.org swapper_pg_dir + SIZEOF_PGD
 	END(swapper_pg_dir)
 
 	.align HV_PAGE_TABLE_ALIGN
@@ -233,11 +243,11 @@ ENTRY(temp_data_pmd)
 	 * permissions later.
 	 */
 	.set addr, 0
-	.rept HV_L1_ENTRIES
+	.rept PTRS_PER_PMD
 	PTE addr, HV_PTE_READABLE | HV_PTE_WRITABLE
-	.set addr, addr + HV_PAGE_SIZE_LARGE
+	.set addr, addr + HPAGE_SIZE
 	.endr
-	.org temp_data_pmd + HV_L1_SIZE
+	.org temp_data_pmd + SIZEOF_PMD
 	END(temp_data_pmd)
 
 	.align HV_PAGE_TABLE_ALIGN
@@ -248,11 +258,11 @@ ENTRY(temp_code_pmd)
 	 * permissions later.
 	 */
 	.set addr, 0
-	.rept HV_L1_ENTRIES
+	.rept PTRS_PER_PMD
 	PTE addr, HV_PTE_READABLE | HV_PTE_EXECUTABLE
-	.set addr, addr + HV_PAGE_SIZE_LARGE
+	.set addr, addr + HPAGE_SIZE
 	.endr
-	.org temp_code_pmd + HV_L1_SIZE
+	.org temp_code_pmd + SIZEOF_PMD
 	END(temp_code_pmd)
 
 	/*
diff --git a/arch/tile/kernel/hvglue.S b/arch/tile/kernel/hvglue.S
new file mode 100644
index 00000000000..2ab45662239
--- /dev/null
+++ b/arch/tile/kernel/hvglue.S
@@ -0,0 +1,74 @@
+/* Hypervisor call vector addresses; see <hv/hypervisor.h> */
+.macro gensym sym, val, size
+.org \val
+.global _\sym
+.type _\sym,function
+_\sym:
+.size _\sym,\size
+#ifndef CONFIG_TILE_HVGLUE_TRACE
+.globl \sym
+.set \sym,_\sym
+#endif
+.endm
+
+.section .hvglue,"x",@nobits
+.align 8
+gensym hv_init, 0x20, 32
+gensym hv_install_context, 0x40, 32
+gensym hv_sysconf, 0x60, 32
+gensym hv_get_rtc, 0x80, 32
+gensym hv_set_rtc, 0xa0, 32
+gensym hv_flush_asid, 0xc0, 32
+gensym hv_flush_page, 0xe0, 32
+gensym hv_flush_pages, 0x100, 32
+gensym hv_restart, 0x120, 32
+gensym hv_halt, 0x140, 32
+gensym hv_power_off, 0x160, 32
+gensym hv_inquire_physical, 0x180, 32
+gensym hv_inquire_memory_controller, 0x1a0, 32
+gensym hv_inquire_virtual, 0x1c0, 32
+gensym hv_inquire_asid, 0x1e0, 32
+gensym hv_nanosleep, 0x200, 32
+gensym hv_console_read_if_ready, 0x220, 32
+gensym hv_console_write, 0x240, 32
+gensym hv_downcall_dispatch, 0x260, 32
+gensym hv_inquire_topology, 0x280, 32
+gensym hv_fs_findfile, 0x2a0, 32
+gensym hv_fs_fstat, 0x2c0, 32
+gensym hv_fs_pread, 0x2e0, 32
+gensym hv_physaddr_read64, 0x300, 32
+gensym hv_physaddr_write64, 0x320, 32
+gensym hv_get_command_line, 0x340, 32
+gensym hv_set_caching, 0x360, 32
+gensym hv_bzero_page, 0x380, 32
+gensym hv_register_message_state, 0x3a0, 32
+gensym hv_send_message, 0x3c0, 32
+gensym hv_receive_message, 0x3e0, 32
+gensym hv_inquire_context, 0x400, 32
+gensym hv_start_all_tiles, 0x420, 32
+gensym hv_dev_open, 0x440, 32
+gensym hv_dev_close, 0x460, 32
+gensym hv_dev_pread, 0x480, 32
+gensym hv_dev_pwrite, 0x4a0, 32
+gensym hv_dev_poll, 0x4c0, 32
+gensym hv_dev_poll_cancel, 0x4e0, 32
+gensym hv_dev_preada, 0x500, 32
+gensym hv_dev_pwritea, 0x520, 32
+gensym hv_flush_remote, 0x540, 32
+gensym hv_console_putc, 0x560, 32
+gensym hv_inquire_tiles, 0x580, 32
+gensym hv_confstr, 0x5a0, 32
+gensym hv_reexec, 0x5c0, 32
+gensym hv_set_command_line, 0x5e0, 32
+gensym hv_clear_intr, 0x600, 32
+gensym hv_enable_intr, 0x620, 32
+gensym hv_disable_intr, 0x640, 32
+gensym hv_raise_intr, 0x660, 32
+gensym hv_trigger_ipi, 0x680, 32
+gensym hv_store_mapping, 0x6a0, 32
+gensym hv_inquire_realpa, 0x6c0, 32
+gensym hv_flush_all, 0x6e0, 32
+gensym hv_get_ipi_pte, 0x700, 32
+gensym hv_set_pte_super_shift, 0x720, 32
+gensym hv_console_set_ipi, 0x7e0, 32
+gensym hv_glue_internals, 0x800, 30720
diff --git a/arch/tile/kernel/hvglue.lds b/arch/tile/kernel/hvglue.lds
deleted file mode 100644
index 2b7cd0a659a..00000000000
--- a/arch/tile/kernel/hvglue.lds
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Hypervisor call vector addresses; see <hv/hypervisor.h> */
-hv_init = TEXT_OFFSET + 0x10020;
-hv_install_context = TEXT_OFFSET + 0x10040;
-hv_sysconf = TEXT_OFFSET + 0x10060;
-hv_get_rtc = TEXT_OFFSET + 0x10080;
-hv_set_rtc = TEXT_OFFSET + 0x100a0;
-hv_flush_asid = TEXT_OFFSET + 0x100c0;
-hv_flush_page = TEXT_OFFSET + 0x100e0;
-hv_flush_pages = TEXT_OFFSET + 0x10100;
-hv_restart = TEXT_OFFSET + 0x10120;
-hv_halt = TEXT_OFFSET + 0x10140;
-hv_power_off = TEXT_OFFSET + 0x10160;
-hv_inquire_physical = TEXT_OFFSET + 0x10180;
-hv_inquire_memory_controller = TEXT_OFFSET + 0x101a0;
-hv_inquire_virtual = TEXT_OFFSET + 0x101c0;
-hv_inquire_asid = TEXT_OFFSET + 0x101e0;
-hv_nanosleep = TEXT_OFFSET + 0x10200;
-hv_console_read_if_ready = TEXT_OFFSET + 0x10220;
-hv_console_write = TEXT_OFFSET + 0x10240;
-hv_downcall_dispatch = TEXT_OFFSET + 0x10260;
-hv_inquire_topology = TEXT_OFFSET + 0x10280;
-hv_fs_findfile = TEXT_OFFSET + 0x102a0;
-hv_fs_fstat = TEXT_OFFSET + 0x102c0;
-hv_fs_pread = TEXT_OFFSET + 0x102e0;
-hv_physaddr_read64 = TEXT_OFFSET + 0x10300;
-hv_physaddr_write64 = TEXT_OFFSET + 0x10320;
-hv_get_command_line = TEXT_OFFSET + 0x10340;
-hv_set_caching = TEXT_OFFSET + 0x10360;
-hv_bzero_page = TEXT_OFFSET + 0x10380;
-hv_register_message_state = TEXT_OFFSET + 0x103a0;
-hv_send_message = TEXT_OFFSET + 0x103c0;
-hv_receive_message = TEXT_OFFSET + 0x103e0;
-hv_inquire_context = TEXT_OFFSET + 0x10400;
-hv_start_all_tiles = TEXT_OFFSET + 0x10420;
-hv_dev_open = TEXT_OFFSET + 0x10440;
-hv_dev_close = TEXT_OFFSET + 0x10460;
-hv_dev_pread = TEXT_OFFSET + 0x10480;
-hv_dev_pwrite = TEXT_OFFSET + 0x104a0;
-hv_dev_poll = TEXT_OFFSET + 0x104c0;
-hv_dev_poll_cancel = TEXT_OFFSET + 0x104e0;
-hv_dev_preada = TEXT_OFFSET + 0x10500;
-hv_dev_pwritea = TEXT_OFFSET + 0x10520;
-hv_flush_remote = TEXT_OFFSET + 0x10540;
-hv_console_putc = TEXT_OFFSET + 0x10560;
-hv_inquire_tiles = TEXT_OFFSET + 0x10580;
-hv_confstr = TEXT_OFFSET + 0x105a0;
-hv_reexec = TEXT_OFFSET + 0x105c0;
-hv_set_command_line = TEXT_OFFSET + 0x105e0;
-hv_clear_intr = TEXT_OFFSET + 0x10600;
-hv_enable_intr = TEXT_OFFSET + 0x10620;
-hv_disable_intr = TEXT_OFFSET + 0x10640;
-hv_raise_intr = TEXT_OFFSET + 0x10660;
-hv_trigger_ipi = TEXT_OFFSET + 0x10680;
-hv_store_mapping = TEXT_OFFSET + 0x106a0;
-hv_inquire_realpa = TEXT_OFFSET + 0x106c0;
-hv_flush_all = TEXT_OFFSET + 0x106e0;
-hv_get_ipi_pte = TEXT_OFFSET + 0x10700;
-hv_glue_internals = TEXT_OFFSET + 0x10720;
diff --git a/arch/tile/kernel/hvglue_trace.c b/arch/tile/kernel/hvglue_trace.c
new file mode 100644
index 00000000000..85c74ad2931
--- /dev/null
+++ b/arch/tile/kernel/hvglue_trace.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/*
+ * Pull in the hypervisor header so we declare all the ABI functions
+ * with the underscore versions, then undef the names so that we can
+ * provide our own wrapper versions.
+ */
+#define hv_init _hv_init
+#define hv_install_context _hv_install_context
+#define hv_sysconf _hv_sysconf
+#define hv_get_rtc _hv_get_rtc
+#define hv_set_rtc _hv_set_rtc
+#define hv_flush_asid _hv_flush_asid
+#define hv_flush_page _hv_flush_page
+#define hv_flush_pages _hv_flush_pages
+#define hv_restart _hv_restart
+#define hv_halt _hv_halt
+#define hv_power_off _hv_power_off
+#define hv_inquire_physical _hv_inquire_physical
+#define hv_inquire_memory_controller _hv_inquire_memory_controller
+#define hv_inquire_virtual _hv_inquire_virtual
+#define hv_inquire_asid _hv_inquire_asid
+#define hv_nanosleep _hv_nanosleep
+#define hv_console_read_if_ready _hv_console_read_if_ready
+#define hv_console_write _hv_console_write
+#define hv_downcall_dispatch _hv_downcall_dispatch
+#define hv_inquire_topology _hv_inquire_topology
+#define hv_fs_findfile _hv_fs_findfile
+#define hv_fs_fstat _hv_fs_fstat
+#define hv_fs_pread _hv_fs_pread
+#define hv_physaddr_read64 _hv_physaddr_read64
+#define hv_physaddr_write64 _hv_physaddr_write64
+#define hv_get_command_line _hv_get_command_line
+#define hv_set_caching _hv_set_caching
+#define hv_bzero_page _hv_bzero_page
+#define hv_register_message_state _hv_register_message_state
+#define hv_send_message _hv_send_message
+#define hv_receive_message _hv_receive_message
+#define hv_inquire_context _hv_inquire_context
+#define hv_start_all_tiles _hv_start_all_tiles
+#define hv_dev_open _hv_dev_open
+#define hv_dev_close _hv_dev_close
+#define hv_dev_pread _hv_dev_pread
+#define hv_dev_pwrite _hv_dev_pwrite
+#define hv_dev_poll _hv_dev_poll
+#define hv_dev_poll_cancel _hv_dev_poll_cancel
+#define hv_dev_preada _hv_dev_preada
+#define hv_dev_pwritea _hv_dev_pwritea
+#define hv_flush_remote _hv_flush_remote
+#define hv_console_putc _hv_console_putc
+#define hv_inquire_tiles _hv_inquire_tiles
+#define hv_confstr _hv_confstr
+#define hv_reexec _hv_reexec
+#define hv_set_command_line _hv_set_command_line
+#define hv_clear_intr _hv_clear_intr
+#define hv_enable_intr _hv_enable_intr
+#define hv_disable_intr _hv_disable_intr
+#define hv_raise_intr _hv_raise_intr
+#define hv_trigger_ipi _hv_trigger_ipi
+#define hv_store_mapping _hv_store_mapping
+#define hv_inquire_realpa _hv_inquire_realpa
+#define hv_flush_all _hv_flush_all
+#define hv_get_ipi_pte _hv_get_ipi_pte
+#define hv_set_pte_super_shift _hv_set_pte_super_shift
+#define hv_console_set_ipi _hv_console_set_ipi
+#include <hv/hypervisor.h>
+#undef hv_init
+#undef hv_install_context
+#undef hv_sysconf
+#undef hv_get_rtc
+#undef hv_set_rtc
+#undef hv_flush_asid
+#undef hv_flush_page
+#undef hv_flush_pages
+#undef hv_restart
+#undef hv_halt
+#undef hv_power_off
+#undef hv_inquire_physical
+#undef hv_inquire_memory_controller
+#undef hv_inquire_virtual
+#undef hv_inquire_asid
+#undef hv_nanosleep
+#undef hv_console_read_if_ready
+#undef hv_console_write
+#undef hv_downcall_dispatch
+#undef hv_inquire_topology
+#undef hv_fs_findfile
+#undef hv_fs_fstat
+#undef hv_fs_pread
+#undef hv_physaddr_read64
+#undef hv_physaddr_write64
+#undef hv_get_command_line
+#undef hv_set_caching
+#undef hv_bzero_page
+#undef hv_register_message_state
+#undef hv_send_message
+#undef hv_receive_message
+#undef hv_inquire_context
+#undef hv_start_all_tiles
+#undef hv_dev_open
+#undef hv_dev_close
+#undef hv_dev_pread
+#undef hv_dev_pwrite
+#undef hv_dev_poll
+#undef hv_dev_poll_cancel
+#undef hv_dev_preada
+#undef hv_dev_pwritea
+#undef hv_flush_remote
+#undef hv_console_putc
+#undef hv_inquire_tiles
+#undef hv_confstr
+#undef hv_reexec
+#undef hv_set_command_line
+#undef hv_clear_intr
+#undef hv_enable_intr
+#undef hv_disable_intr
+#undef hv_raise_intr
+#undef hv_trigger_ipi
+#undef hv_store_mapping
+#undef hv_inquire_realpa
+#undef hv_flush_all
+#undef hv_get_ipi_pte
+#undef hv_set_pte_super_shift
+#undef hv_console_set_ipi
+
+/*
+ * Provide macros based on <linux/syscalls.h> to provide a wrapper
+ * function that invokes the same function with an underscore prefix.
+ * We can't use the existing __SC_xxx macros because we need to
+ * support up to nine arguments rather than up to six, and also this
+ * way the file stands alone from possible changes in the
+ * implementation of <linux/syscalls.h>.
+ */
+#define HV_WRAP0(type, name)					\
+	type name(void);					\
+	type name(void)						\
+	{							\
+		return _##name();				\
+	}
+#define __HV_DECL1(t1, a1)	t1 a1
+#define __HV_DECL2(t2, a2, ...) t2 a2, __HV_DECL1(__VA_ARGS__)
+#define __HV_DECL3(t3, a3, ...) t3 a3, __HV_DECL2(__VA_ARGS__)
+#define __HV_DECL4(t4, a4, ...) t4 a4, __HV_DECL3(__VA_ARGS__)
+#define __HV_DECL5(t5, a5, ...) t5 a5, __HV_DECL4(__VA_ARGS__)
+#define __HV_DECL6(t6, a6, ...) t6 a6, __HV_DECL5(__VA_ARGS__)
+#define __HV_DECL7(t7, a7, ...) t7 a7, __HV_DECL6(__VA_ARGS__)
+#define __HV_DECL8(t8, a8, ...) t8 a8, __HV_DECL7(__VA_ARGS__)
+#define __HV_DECL9(t9, a9, ...) t9 a9, __HV_DECL8(__VA_ARGS__)
+#define __HV_PASS1(t1, a1)	a1
+#define __HV_PASS2(t2, a2, ...) a2, __HV_PASS1(__VA_ARGS__)
+#define __HV_PASS3(t3, a3, ...) a3, __HV_PASS2(__VA_ARGS__)
+#define __HV_PASS4(t4, a4, ...) a4, __HV_PASS3(__VA_ARGS__)
+#define __HV_PASS5(t5, a5, ...) a5, __HV_PASS4(__VA_ARGS__)
+#define __HV_PASS6(t6, a6, ...) a6, __HV_PASS5(__VA_ARGS__)
+#define __HV_PASS7(t7, a7, ...) a7, __HV_PASS6(__VA_ARGS__)
+#define __HV_PASS8(t8, a8, ...) a8, __HV_PASS7(__VA_ARGS__)
+#define __HV_PASS9(t9, a9, ...) a9, __HV_PASS8(__VA_ARGS__)
+#define HV_WRAPx(x, type, name, ...)				\
+	type name(__HV_DECL##x(__VA_ARGS__));			\
+	type name(__HV_DECL##x(__VA_ARGS__))			\
+	{							\
+		return _##name(__HV_PASS##x(__VA_ARGS__));	\
+	}
+#define HV_WRAP1(type, name, ...) HV_WRAPx(1, type, name, __VA_ARGS__)
+#define HV_WRAP2(type, name, ...) HV_WRAPx(2, type, name, __VA_ARGS__)
+#define HV_WRAP3(type, name, ...) HV_WRAPx(3, type, name, __VA_ARGS__)
+#define HV_WRAP4(type, name, ...) HV_WRAPx(4, type, name, __VA_ARGS__)
+#define HV_WRAP5(type, name, ...) HV_WRAPx(5, type, name, __VA_ARGS__)
+#define HV_WRAP6(type, name, ...) HV_WRAPx(6, type, name, __VA_ARGS__)
+#define HV_WRAP7(type, name, ...) HV_WRAPx(7, type, name, __VA_ARGS__)
+#define HV_WRAP8(type, name, ...) HV_WRAPx(8, type, name, __VA_ARGS__)
+#define HV_WRAP9(type, name, ...) HV_WRAPx(9, type, name, __VA_ARGS__)
+
+/* List all the hypervisor API functions. */
+HV_WRAP4(void, hv_init, HV_VersionNumber, interface_version_number,
+	 int, chip_num, int, chip_rev_num, int, client_pl)
+HV_WRAP1(long, hv_sysconf, HV_SysconfQuery, query)
+HV_WRAP3(int, hv_confstr, HV_ConfstrQuery, query, HV_VirtAddr, buf, int, len)
+#if CHIP_HAS_IPI()
+HV_WRAP3(int, hv_get_ipi_pte, HV_Coord, tile, int, pl, HV_PTE*, pte)
+HV_WRAP3(int, hv_console_set_ipi, int, ipi, int, event, HV_Coord, coord);
+#else
+HV_WRAP1(void, hv_enable_intr, HV_IntrMask, enab_mask)
+HV_WRAP1(void, hv_disable_intr, HV_IntrMask, disab_mask)
+HV_WRAP1(void, hv_clear_intr, HV_IntrMask, clear_mask)
+HV_WRAP1(void, hv_raise_intr, HV_IntrMask, raise_mask)
+HV_WRAP2(HV_Errno, hv_trigger_ipi, HV_Coord, tile, int, interrupt)
+#endif /* !CHIP_HAS_IPI() */
+HV_WRAP3(int, hv_store_mapping, HV_VirtAddr, va, unsigned int, len,
+	 HV_PhysAddr, pa)
+HV_WRAP2(HV_PhysAddr, hv_inquire_realpa, HV_PhysAddr, cpa, unsigned int, len)
+HV_WRAP0(HV_RTCTime, hv_get_rtc)
+HV_WRAP1(void, hv_set_rtc, HV_RTCTime, time)
+HV_WRAP4(int, hv_install_context, HV_PhysAddr, page_table, HV_PTE, access,
+	 HV_ASID, asid, __hv32, flags)
+HV_WRAP2(int, hv_set_pte_super_shift, int, level, int, log2_count)
+HV_WRAP0(HV_Context, hv_inquire_context)
+HV_WRAP1(int, hv_flush_asid, HV_ASID, asid)
+HV_WRAP2(int, hv_flush_page, HV_VirtAddr, address, HV_PageSize, page_size)
+HV_WRAP3(int, hv_flush_pages, HV_VirtAddr, start, HV_PageSize, page_size,
+	 unsigned long, size)
+HV_WRAP1(int, hv_flush_all, int, preserve_global)
+HV_WRAP2(void, hv_restart, HV_VirtAddr, cmd, HV_VirtAddr, args)
+HV_WRAP0(void, hv_halt)
+HV_WRAP0(void, hv_power_off)
+HV_WRAP1(int, hv_reexec, HV_PhysAddr, entry)
+HV_WRAP0(HV_Topology, hv_inquire_topology)
+HV_WRAP3(HV_Errno, hv_inquire_tiles, HV_InqTileSet, set, HV_VirtAddr, cpumask,
+	 int, length)
+HV_WRAP1(HV_PhysAddrRange, hv_inquire_physical, int, idx)
+HV_WRAP2(HV_MemoryControllerInfo, hv_inquire_memory_controller, HV_Coord, coord,
+	 int, controller)
+HV_WRAP1(HV_VirtAddrRange, hv_inquire_virtual, int, idx)
+HV_WRAP1(HV_ASIDRange, hv_inquire_asid, int, idx)
+HV_WRAP1(void, hv_nanosleep, int, nanosecs)
+HV_WRAP0(int, hv_console_read_if_ready)
+HV_WRAP1(void, hv_console_putc, int, byte)
+HV_WRAP2(int, hv_console_write, HV_VirtAddr, bytes, int, len)
+HV_WRAP0(void, hv_downcall_dispatch)
+HV_WRAP1(int, hv_fs_findfile, HV_VirtAddr, filename)
+HV_WRAP1(HV_FS_StatInfo, hv_fs_fstat, int, inode)
+HV_WRAP4(int, hv_fs_pread, int, inode, HV_VirtAddr, buf,
+	 int, length, int, offset)
+HV_WRAP2(unsigned long long, hv_physaddr_read64, HV_PhysAddr, addr,
+	 HV_PTE, access)
+HV_WRAP3(void, hv_physaddr_write64, HV_PhysAddr, addr, HV_PTE, access,
+	 unsigned long long, val)
+HV_WRAP2(int, hv_get_command_line, HV_VirtAddr, buf, int, length)
+HV_WRAP2(HV_Errno, hv_set_command_line, HV_VirtAddr, buf, int, length)
+HV_WRAP1(void, hv_set_caching, unsigned long, bitmask)
+HV_WRAP2(void, hv_bzero_page, HV_VirtAddr, va, unsigned int, size)
+HV_WRAP1(HV_Errno, hv_register_message_state, HV_MsgState*, msgstate)
+HV_WRAP4(int, hv_send_message, HV_Recipient *, recips, int, nrecip,
+	 HV_VirtAddr, buf, int, buflen)
+HV_WRAP3(HV_RcvMsgInfo, hv_receive_message, HV_MsgState, msgstate,
+	 HV_VirtAddr, buf, int, buflen)
+HV_WRAP0(void, hv_start_all_tiles)
+HV_WRAP2(int, hv_dev_open, HV_VirtAddr, name, __hv32, flags)
+HV_WRAP1(int, hv_dev_close, int, devhdl)
+HV_WRAP5(int, hv_dev_pread, int, devhdl, __hv32, flags, HV_VirtAddr, va,
+	 __hv32, len, __hv64, offset)
+HV_WRAP5(int, hv_dev_pwrite, int, devhdl, __hv32, flags, HV_VirtAddr, va,
+	 __hv32, len, __hv64, offset)
+HV_WRAP3(int, hv_dev_poll, int, devhdl, __hv32, events, HV_IntArg, intarg)
+HV_WRAP1(int, hv_dev_poll_cancel, int, devhdl)
+HV_WRAP6(int, hv_dev_preada, int, devhdl, __hv32, flags, __hv32, sgl_len,
+	 HV_SGL *, sglp, __hv64, offset, HV_IntArg, intarg)
+HV_WRAP6(int, hv_dev_pwritea, int, devhdl, __hv32, flags, __hv32, sgl_len,
+	 HV_SGL *, sglp, __hv64, offset, HV_IntArg, intarg)
+HV_WRAP9(int, hv_flush_remote, HV_PhysAddr, cache_pa,
+	 unsigned long, cache_control, unsigned long*, cache_cpumask,
+	 HV_VirtAddr, tlb_va, unsigned long, tlb_length,
+	 unsigned long, tlb_pgsize, unsigned long*, tlb_cpumask,
+	 HV_Remote_ASID*, asids, int, asidcount)
diff --git a/arch/tile/kernel/init_task.c b/arch/tile/kernel/init_task.c
deleted file mode 100644
index 928b3187066..00000000000
--- a/arch/tile/kernel/init_task.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- */
-
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/init_task.h>
-#include <linux/mqueue.h>
-#include <linux/module.h>
-#include <linux/start_kernel.h>
-#include <linux/uaccess.h>
-
-static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
-static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-
-/*
- * Initial thread structure.
- *
- * We need to make sure that this is THREAD_SIZE aligned due to the
- * way process stacks are handled. This is done by having a special
- * "init_task" linker map entry..
- */
-union thread_union init_thread_union __init_task_data = {
-	INIT_THREAD_INFO(init_task)
-};
-
-/*
- * Initial task structure.
- *
- * All other task structs will be allocated on slabs in fork.c
- */
-struct task_struct init_task = INIT_TASK(init_task);
-EXPORT_SYMBOL(init_task);
-
-/*
- * per-CPU stack and boot info.
- */
-DEFINE_PER_CPU(unsigned long, boot_sp) =
-	(unsigned long)init_stack + THREAD_SIZE;
-
-#ifdef CONFIG_SMP
-DEFINE_PER_CPU(unsigned long, boot_pc) = (unsigned long)start_kernel;
-#else
-/*
- * The variable must be __initdata since it references __init code.
- * With CONFIG_SMP it is per-cpu data, which is exempt from validation.
- */
-unsigned long __initdata boot_pc = (unsigned long)start_kernel;
-#endif
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index aecc8ed5f39..cdbda45a4e4 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -28,20 +28,10 @@
 #include <arch/interrupts.h>
 #include <arch/spr_def.h>
 
-#ifdef CONFIG_PREEMPT
-# error "No support for kernel preemption currently"
-#endif
-
 #define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)
 
 #define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR)
 
-#if !CHIP_HAS_WH64()
-	/* By making this an empty macro, we can use wh64 in the code. */
-	.macro  wh64 reg
-	.endm
-#endif
-
 	.macro  push_reg reg, ptr=sp, delta=-4
 	{
 	 sw     \ptr, \reg
@@ -189,7 +179,7 @@ intvec_\vecname:
 	 * point sp at the top aligned address on the actual stack page.
 	 */
 	mfspr   r0, SPR_SYSTEM_SAVE_K_0
-	mm      r0, r0, zero, LOG2_THREAD_SIZE, 31
+	mm      r0, r0, zero, LOG2_NR_CPU_IDS, 31
 
 0:
 	/*
@@ -207,6 +197,9 @@ intvec_\vecname:
 	 *    cache line 1: r14...r29
 	 *    cache line 0: 2 x frame, r0..r13
 	 */
+#if STACK_TOP_DELTA != 64
+#error STACK_TOP_DELTA must be 64 for assumptions here and in task_pt_regs()
+#endif
 	andi    r0, r0, -64
 
 	/*
@@ -320,24 +313,20 @@ intvec_\vecname:
 	 movei  r3, 0
 	}
 	.else
-	.ifc \c_routine, op_handle_perf_interrupt
+	.ifc \c_routine, handle_perf_interrupt
 	{
 	 mfspr  r2, PERF_COUNT_STS
 	 movei  r3, -1   /* not used, but set for consistency */
 	}
 	.else
-#if CHIP_HAS_AUX_PERF_COUNTERS()
-	.ifc \c_routine, op_handle_aux_perf_interrupt
+	.ifc \c_routine, handle_perf_interrupt
 	{
 	 mfspr  r2, AUX_PERF_COUNT_STS
 	 movei  r3, -1   /* not used, but set for consistency */
 	}
 	.else
-#endif
 	movei   r3, 0
-#if CHIP_HAS_AUX_PERF_COUNTERS()
 	.endif
-#endif
 	.endif
 	.endif
 	.endif
@@ -354,7 +343,7 @@ intvec_\vecname:
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	.pushsection .text.intvec_feedback,"ax"
 	.org    (\vecnum << 5)
-	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
 	jrp     lr
 	.popsection
 #endif
@@ -468,7 +457,7 @@ intvec_\vecname:
 	}
 	{
 	 auli   r21, r21, ha16(__per_cpu_offset)
-	 mm     r20, r20, zero, 0, LOG2_THREAD_SIZE-1
+	 mm     r20, r20, zero, 0, LOG2_NR_CPU_IDS-1
 	}
 	s2a     r20, r20, r21
 	lw      tp, r20
@@ -562,7 +551,6 @@ intvec_\vecname:
 	.endif
 	mtspr   INTERRUPT_CRITICAL_SECTION, zero
 
-#if CHIP_HAS_WH64()
 	/*
 	 * Prepare the first 256 stack bytes to be rapidly accessible
 	 * without having to fetch the background data.  We don't really
@@ -583,7 +571,6 @@ intvec_\vecname:
 	 addi   r52, r52, -64
 	}
 	wh64    r52
-#endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 	.ifnc \function,handle_nmi
@@ -762,7 +749,7 @@ intvec_\vecname:
 	.macro  dc_dispatch vecnum, vecname
 	.org    (\vecnum << 8)
 intvec_\vecname:
-	j       hv_downcall_dispatch
+	j       _hv_downcall_dispatch
 	ENDPROC(intvec_\vecname)
 	.endm
 
@@ -799,6 +786,10 @@ handle_interrupt:
  * This routine takes a boolean in r30 indicating if this is an NMI.
  * If so, we also expect a boolean in r31 indicating whether to
  * re-enable the oprofile interrupts.
+ *
+ * Note that .Lresume_userspace is jumped to directly in several
+ * places, and we need to make sure r30 is set correctly in those
+ * callers as well.
  */
 STD_ENTRY(interrupt_return)
 	/* If we're resuming to kernel space, don't check thread flags. */
@@ -808,17 +799,37 @@ STD_ENTRY(interrupt_return)
 	}
 	lw      r29, r29
 	andi    r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	bzt     r29, .Lresume_userspace
+
+#ifdef CONFIG_PREEMPT
+	/* Returning to kernel space. Check if we need preemption. */
+	GET_THREAD_INFO(r29)
+	addli   r28, r29, THREAD_INFO_FLAGS_OFFSET
 	{
-	 bzt    r29, .Lresume_userspace
-	 PTREGS_PTR(r29, PTREGS_OFFSET_PC)
+	 lw     r28, r28
+	 addli  r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
 	}
+	{
+	 andi   r28, r28, _TIF_NEED_RESCHED
+	 lw     r29, r29
+	}
+	bzt     r28, 1f
+	bnz     r29, 1f
+	/* Disable interrupts explicitly for preemption. */
+	IRQ_DISABLE(r20,r21)
+	TRACE_IRQS_OFF
+	jal     preempt_schedule_irq
+	FEEDBACK_REENTER(interrupt_return)
+1:
+#endif
 
 	/* If we're resuming to _cpu_idle_nap, bump PC forward by 8. */
 	{
-	 lw     r28, r29
+	 PTREGS_PTR(r29, PTREGS_OFFSET_PC)
 	 moveli r27, lo16(_cpu_idle_nap)
 	}
 	{
+	 lw     r28, r29
 	 auli   r27, r27, ha16(_cpu_idle_nap)
 	}
 	{
@@ -835,6 +846,18 @@ STD_ENTRY(interrupt_return)
 	FEEDBACK_REENTER(interrupt_return)
 
 	/*
+	 * Use r33 to hold whether we have already loaded the callee-saves
+	 * into ptregs.  We don't want to do it twice in this loop, since
+	 * then we'd clobber whatever changes are made by ptrace, etc.
+	 * Get base of stack in r32.
+	 */
+	{
+	 GET_THREAD_INFO(r32)
+	 movei  r33, 0
+	}
+
+.Lretry_work_pending:
+	/*
 	 * Disable interrupts so as to make sure we don't
 	 * miss an interrupt that sets any of the thread flags (like
 	 * need_resched or sigpending) between sampling and the iret.
@@ -844,9 +867,6 @@ STD_ENTRY(interrupt_return)
 	IRQ_DISABLE(r20, r21)
 	TRACE_IRQS_OFF  /* Note: clobbers registers r0-r29 */
 
-	/* Get base of stack in r32; note r30/31 are used as arguments here. */
-	GET_THREAD_INFO(r32)
-
 
 	/* Check to see if there is any work to do before returning to user. */
 	{
@@ -862,16 +882,18 @@ STD_ENTRY(interrupt_return)
 
 	/*
 	 * Make sure we have all the registers saved for signal
-	 * handling or single-step.  Call out to C code to figure out
-	 * exactly what we need to do for each flag bit, then if
-	 * necessary, reload the flags and recheck.
+	 * handling, notify-resume, or single-step.  Call out to C
+	 * code to figure out exactly what we need to do for each flag bit,
+	 * then if necessary, reload the flags and recheck.
 	 */
-	push_extra_callee_saves r0
 	{
 	 PTREGS_PTR(r0, PTREGS_OFFSET_BASE)
-	 jal    do_work_pending
+	 bnz    r33, 1f
 	}
-	bnz     r0, .Lresume_userspace
+	push_extra_callee_saves r0
+	movei   r33, 1
+1:	jal     do_work_pending
+	bnz     r0, .Lretry_work_pending
 
 	/*
 	 * In the NMI case we
@@ -924,6 +946,13 @@ STD_ENTRY(interrupt_return)
 	bzt     r30, .Lrestore_regs
 3:
 
+	/* We are relying on INT_PERF_COUNT at 33, and AUX_PERF_COUNT at 48 */
+	{
+	 moveli r0, lo16(1 << (INT_PERF_COUNT - 32))
+	 bz     r31, .Lrestore_regs
+	}
+	auli    r0, r0, ha16(1 << (INT_AUX_PERF_COUNT - 32))
+	mtspr   SPR_INTERRUPT_MASK_RESET_K_1, r0
 
 	/*
 	 * We now commit to returning from this interrupt, since we will be
@@ -1149,6 +1178,10 @@ handle_nmi:
 	 PTREGS_PTR(r0, PTREGS_OFFSET_BASE)
 	}
 	FEEDBACK_REENTER(handle_nmi)
+	{
+	 movei  r30, 1
+	 seq    r31, r0, zero
+	}
 	j       interrupt_return
 	STD_ENDPROC(handle_nmi)
 
@@ -1176,15 +1209,20 @@ handle_syscall:
 	add     r20, r20, tp
 	lw      r21, r20
 	addi    r21, r21, 1
-	sw      r20, r21
+	{
+	 sw     r20, r21
+	 GET_THREAD_INFO(r31)
+	}
 
 	/* Trace syscalls, if requested. */
-	GET_THREAD_INFO(r31)
 	addi	r31, r31, THREAD_INFO_FLAGS_OFFSET
 	lw	r30, r31
 	andi    r30, r30, _TIF_SYSCALL_TRACE
 	bzt	r30, .Lrestore_syscall_regs
-	jal	do_syscall_trace
+	{
+	 PTREGS_PTR(r0, PTREGS_OFFSET_BASE)
+	 jal    do_syscall_trace_enter
+	}
 	FEEDBACK_REENTER(handle_syscall)
 
 	/*
@@ -1235,9 +1273,15 @@ handle_syscall:
 	lw	r30, r31
 	andi    r30, r30, _TIF_SYSCALL_TRACE
 	bzt     r30, 1f
-	jal	do_syscall_trace
+	{
+	 PTREGS_PTR(r0, PTREGS_OFFSET_BASE)
+	 jal    do_syscall_trace_exit
+	}
 	FEEDBACK_REENTER(handle_syscall)
-1:	j       .Lresume_userspace   /* jump into middle of interrupt_return */
+1:	{
+	 movei  r30, 0               /* not an NMI */
+	 j      .Lresume_userspace   /* jump into middle of interrupt_return */
+	}
 
 .Linvalid_syscall:
 	/* Report an invalid syscall back to the user program */
@@ -1246,7 +1290,10 @@ handle_syscall:
 	 movei  r28, -ENOSYS
 	}
 	sw      r29, r28
-	j       .Lresume_userspace   /* jump into middle of interrupt_return */
+	{
+	 movei  r30, 0               /* not an NMI */
+	 j      .Lresume_userspace   /* jump into middle of interrupt_return */
+	}
 	STD_ENDPROC(handle_syscall)
 
 	/* Return the address for oprofile to suppress in backtraces. */
@@ -1262,9 +1309,27 @@ STD_ENTRY(ret_from_fork)
 	jal     sim_notify_fork
 	jal     schedule_tail
 	FEEDBACK_REENTER(ret_from_fork)
-	j       .Lresume_userspace   /* jump into middle of interrupt_return */
+	{
+	 movei  r30, 0               /* not an NMI */
+	 j      .Lresume_userspace   /* jump into middle of interrupt_return */
+	}
 	STD_ENDPROC(ret_from_fork)
 
+STD_ENTRY(ret_from_kernel_thread)
+	jal     sim_notify_fork
+	jal     schedule_tail
+	FEEDBACK_REENTER(ret_from_fork)
+	{
+	 move   r0, r31
+	 jalr   r30
+	}
+	FEEDBACK_REENTER(ret_from_kernel_thread)
+	{
+	 movei  r30, 0               /* not an NMI */
+	 j      .Lresume_userspace   /* jump into middle of interrupt_return */
+	}
+	STD_ENDPROC(ret_from_kernel_thread)
+
 	/*
 	 * Code for ill interrupt.
 	 */
@@ -1349,7 +1414,10 @@ handle_ill:
 3:
 	/* set PC and continue */
 	lw      r26, r24
-	sw      r28, r26
+	{
+	 sw     r28, r26
+	 GET_THREAD_INFO(r0)
+	}
 
 	/*
 	 * Clear TIF_SINGLESTEP to prevent recursion if we execute an ill.
@@ -1357,7 +1425,6 @@ handle_ill:
 	 * need to clear it here and can't really impose on all other arches.
 	 * So what's another write between friends?
 	 */
-	GET_THREAD_INFO(r0)
 
 	addi    r1, r0, THREAD_INFO_FLAGS_OFFSET
 	{
@@ -1371,12 +1438,14 @@ handle_ill:
 	{
 	 lw     r0, r0          /* indirect thru thread_info to get task_info*/
 	 addi   r1, sp, C_ABI_SAVE_AREA_SIZE  /* put ptregs pointer into r1 */
-	 move   r2, zero        /* load error code into r2 */
 	}
 
 	jal     send_sigtrap    /* issue a SIGTRAP */
 	FEEDBACK_REENTER(handle_ill)
-	j       .Lresume_userspace   /* jump into middle of interrupt_return */
+	{
+	 movei  r30, 0               /* not an NMI */
+	 j      .Lresume_userspace   /* jump into middle of interrupt_return */
+	}
 
 .Ldispatch_normal_ill:
 	{
@@ -1406,15 +1475,6 @@ STD_ENTRY_LOCAL(bad_intr)
 	panic   "Unhandled interrupt %#x: PC %#lx"
 	STD_ENDPROC(bad_intr)
 
-/* Put address of pt_regs in reg and jump. */
-#define PTREGS_SYSCALL(x, reg)                          \
-	STD_ENTRY(_##x);                                \
-	{                                               \
-	 PTREGS_PTR(reg, PTREGS_OFFSET_BASE);           \
-	 j      x                                       \
-	};                                              \
-	STD_ENDPROC(_##x)
-
 /*
  * Special-case sigreturn to not write r0 to the stack on return.
  * This is technically more efficient, but it also avoids difficulties
@@ -1430,12 +1490,9 @@ STD_ENTRY_LOCAL(bad_intr)
 	};                                              \
 	STD_ENDPROC(_##x)
 
-PTREGS_SYSCALL(sys_execve, r3)
-PTREGS_SYSCALL(sys_sigaltstack, r2)
 PTREGS_SYSCALL_SIGRETURN(sys_rt_sigreturn, r0)
-PTREGS_SYSCALL(sys_cmpxchg_badaddr, r1)
 
-/* Save additional callee-saves to pt_regs, put address in r4 and jump. */
+/* Save additional callee-saves to pt_regs and jump to standard function. */
 STD_ENTRY(_sys_clone)
 	push_extra_callee_saves r4
 	j       sys_clone
@@ -1478,12 +1535,10 @@ STD_ENTRY(_sys_clone)
 	__HEAD
 	.align 64
 	/* Align much later jump on the start of a cache line. */
-#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
 	nop
 #if PAGE_SIZE >= 0x10000
 	nop
 #endif
-#endif
 ENTRY(sys_cmpxchg)
 
 	/*
@@ -1517,45 +1572,6 @@ ENTRY(sys_cmpxchg)
 # error Code here assumes PAGE_OFFSET can be loaded with just hi16()
 #endif
 
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	{
-	 /* Check for unaligned input. */
-	 bnz    sp, .Lcmpxchg_badaddr
-	 mm     r25, r0, zero, 3, PAGE_SHIFT-1
-	}
-	{
-	 crc32_32 r25, zero, r25
-	 moveli r21, lo16(atomic_lock_ptr)
-	}
-	{
-	 auli   r21, r21, ha16(atomic_lock_ptr)
-	 auli   r23, zero, hi16(PAGE_OFFSET)  /* hugepage-aligned */
-	}
-	{
-	 shri	r20, r25, 32 - ATOMIC_HASH_L1_SHIFT
-	 slt_u  r23, r0, r23
-	 lw	r26, r0  /* see comment in the "#else" for the "lw r26". */
-	}
-	{
-	 s2a    r21, r20, r21
-	 bbns   r23, .Lcmpxchg_badaddr
-	}
-	{
-	 lw     r21, r21
-	 seqi	r23, TREG_SYSCALL_NR_NAME, __NR_FAST_cmpxchg64
-	 andi	r25, r25, ATOMIC_HASH_L2_SIZE - 1
-	}
-	{
-	 /* Branch away at this point if we're doing a 64-bit cmpxchg. */
-	 bbs    r23, .Lcmpxchg64
-	 andi   r23, r0, 7       /* Precompute alignment for cmpxchg64. */
-	}
-	{
-	 s2a	ATOMIC_LOCK_REG_NAME, r25, r21
-	 j      .Lcmpxchg32_tns   /* see comment in the #else for the jump. */
-	}
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 	{
 	 /* Check for unaligned input. */
 	 bnz    sp, .Lcmpxchg_badaddr
@@ -1569,7 +1585,7 @@ ENTRY(sys_cmpxchg)
 	  * Because of C pointer arithmetic, we want to compute this:
 	  *
 	  * ((char*)atomic_locks +
-	  *  (((r0 >> 3) & (1 << (ATOMIC_HASH_SIZE - 1))) << 2))
+	  *  (((r0 >> 3) & ((1 << ATOMIC_HASH_SHIFT) - 1)) << 2))
 	  *
 	  * Instead of two shifts we just ">> 1", and use 'mm'
 	  * to ignore the low and high bits we don't want.
@@ -1580,12 +1596,9 @@ ENTRY(sys_cmpxchg)
 
 	 /*
 	  * Ensure that the TLB is loaded before we take out the lock.
-	  * On tilepro, this will start fetching the value all the way
-	  * into our L1 as well (and if it gets modified before we
-	  * grab the lock, it will be invalidated from our cache
-	  * before we reload it).  On tile64, we'll start fetching it
-	  * into our L1 if we're the home, and if we're not, we'll
-	  * still at least start fetching it into the home's L2.
+	  * This will start fetching the value all the way into our L1
+	  * as well (and if it gets modified before we grab the lock,
+	  * it will be invalidated from our cache before we reload it).
 	  */
 	 lw	r26, r0
 	}
@@ -1628,8 +1641,6 @@ ENTRY(sys_cmpxchg)
 	 j      .Lcmpxchg32_tns
 	}
 
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 /* Symbol for do_page_fault_ics() to use to compare against the PC. */
 .global __sys_cmpxchg_grab_lock
 __sys_cmpxchg_grab_lock:
@@ -1767,9 +1778,6 @@ __sys_cmpxchg_grab_lock:
 	.align 64
 .Lcmpxchg64:
 	{
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	 s2a	ATOMIC_LOCK_REG_NAME, r25, r21
-#endif
 	 bzt     r23, .Lcmpxchg64_tns
 	}
 	j       .Lcmpxchg_badaddr
@@ -1835,11 +1843,12 @@ int_unalign:
 	push_extra_callee_saves r0
 	j       do_trap
 
-/* Include .intrpt1 array of interrupt vectors */
-	.section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+	.section ".intrpt", "ax"
 
-#define op_handle_perf_interrupt bad_intr
-#define op_handle_aux_perf_interrupt bad_intr
+#ifndef CONFIG_USE_PMC
+#define handle_perf_interrupt bad_intr
+#endif
 
 #ifndef CONFIG_HARDWALL
 #define do_hardwall_trap bad_intr
@@ -1880,7 +1889,7 @@ int_unalign:
 	int_hand     INT_IDN_AVAIL, IDN_AVAIL, bad_intr
 	int_hand     INT_UDN_AVAIL, UDN_AVAIL, bad_intr
 	int_hand     INT_PERF_COUNT, PERF_COUNT, \
-		     op_handle_perf_interrupt, handle_nmi
+		     handle_perf_interrupt, handle_nmi
 	int_hand     INT_INTCTRL_3, INTCTRL_3, bad_intr
 #if CONFIG_KERNEL_PL == 2
 	dc_dispatch  INT_INTCTRL_2, INTCTRL_2
@@ -1904,10 +1913,8 @@ int_unalign:
 		     do_page_fault
 	int_hand     INT_SN_CPL, SN_CPL, bad_intr
 	int_hand     INT_DOUBLE_FAULT, DOUBLE_FAULT, do_trap
-#if CHIP_HAS_AUX_PERF_COUNTERS()
 	int_hand     INT_AUX_PERF_COUNT, AUX_PERF_COUNT, \
-		     op_handle_aux_perf_interrupt, handle_nmi
-#endif
+		     handle_perf_interrupt, handle_nmi
 
 	/* Synthetic interrupt delivered only by the simulator */
 	int_hand     INT_BREAKPOINT, BREAKPOINT, do_breakpoint
diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
index 79c93e10ba2..5b67efcecab 100644
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -17,24 +17,33 @@
 #include <linux/linkage.h>
 #include <linux/errno.h>
 #include <linux/unistd.h>
+#include <linux/init.h>
 #include <asm/ptrace.h>
 #include <asm/thread_info.h>
 #include <asm/irqflags.h>
 #include <asm/asm-offsets.h>
 #include <asm/types.h>
+#include <asm/traps.h>
+#include <asm/signal.h>
 #include <hv/hypervisor.h>
 #include <arch/abi.h>
 #include <arch/interrupts.h>
 #include <arch/spr_def.h>
 
-#ifdef CONFIG_PREEMPT
-# error "No support for kernel preemption currently"
-#endif
-
 #define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)
 
 #define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR)
 
+#if CONFIG_KERNEL_PL == 1 || CONFIG_KERNEL_PL == 2
+/*
+ * Set "result" non-zero if ex1 holds the PL of the kernel
+ * (with or without ICS being set).  Note this works only
+ * because we never find the PL at level 3.
+ */
+# define IS_KERNEL_EX1(result, ex1) andi result, ex1, CONFIG_KERNEL_PL
+#else
+# error Recode IS_KERNEL_EX1 for CONFIG_KERNEL_PL
+#endif
 
 	.macro  push_reg reg, ptr=sp, delta=-8
 	{
@@ -97,6 +106,185 @@
 	}
 	.endm
 
+	/*
+	 * Unalign data exception fast handling: In order to handle
+	 * unaligned data access, a fast JIT version is generated and stored
+	 * in a specific area in user space. We first need to do a quick poke
+	 * to see if the JIT is available. We use certain bits in the fault
+	 * PC (3 to 9 is used for 16KB page size) as index to address the JIT
+	 * code area. The first 64bit word is the fault PC, and the 2nd one is
+	 * the fault bundle itself. If these 2 words both match, then we
+	 * directly "iret" to JIT code. If not, a slow path is invoked to
+	 * generate new JIT code. Note: the current JIT code WILL be
+	 * overwritten if it existed. So, ideally we can handle 128 unalign
+	 * fixups via JIT. For lookup efficiency and to effectively support
+	 * tight loops with multiple unaligned reference, a simple
+	 * direct-mapped cache is used.
+	 *
+	 * SPR_EX_CONTEXT_K_0 is modified to return to JIT code.
+	 * SPR_EX_CONTEXT_K_1 has ICS set.
+	 * SPR_EX_CONTEXT_0_0 is setup to user program's next PC.
+	 * SPR_EX_CONTEXT_0_1 = 0.
+	 */
+	.macro int_hand_unalign_fast  vecnum, vecname
+	.org  (\vecnum << 8)
+intvec_\vecname:
+	/* Put r3 in SPR_SYSTEM_SAVE_K_1.  */
+	mtspr   SPR_SYSTEM_SAVE_K_1, r3
+
+	mfspr   r3, SPR_EX_CONTEXT_K_1
+	/*
+	 * Examine if exception comes from user without ICS set.
+	 * If not, just go directly to the slow path.
+	 */
+	bnez    r3, hand_unalign_slow_nonuser
+
+	mfspr   r3, SPR_SYSTEM_SAVE_K_0
+
+	/* Get &thread_info->unalign_jit_tmp[0] in r3. */
+	bfexts  r3, r3, 0, CPU_SHIFT-1
+	mm      r3, zero, LOG2_THREAD_SIZE, 63
+	addli   r3, r3, THREAD_INFO_UNALIGN_JIT_TMP_OFFSET
+
+	/*
+	 * Save r0, r1, r2 into thread_info array r3 points to
+	 * from low to high memory in order.
+	 */
+	st_add  r3, r0, 8
+	st_add  r3, r1, 8
+	{
+	 st_add r3, r2, 8
+	 andi   r2, sp, 7
+	}
+
+	/* Save stored r3 value so we can revert it on a page fault. */
+	mfspr   r1, SPR_SYSTEM_SAVE_K_1
+	st      r3, r1
+
+	{
+	 /* Generate a SIGBUS if sp is not 8-byte aligned. */
+	 bnez   r2, hand_unalign_slow_badsp
+	}
+
+	/*
+	 * Get the thread_info in r0; load r1 with pc. Set the low bit of sp
+	 * as an indicator to the page fault code in case we fault.
+	 */
+	{
+	 ori    sp, sp, 1
+	 mfspr  r1, SPR_EX_CONTEXT_K_0
+	}
+
+	/* Add the jit_info offset in thread_info; extract r1 [3:9] into r2. */
+	{
+	 addli  r0, r3, THREAD_INFO_UNALIGN_JIT_BASE_OFFSET - \
+	  (THREAD_INFO_UNALIGN_JIT_TMP_OFFSET + (3 * 8))
+	 bfextu r2, r1, 3, (2 + PAGE_SHIFT - UNALIGN_JIT_SHIFT)
+	}
+
+	/* Load the jit_info; multiply r2 by 128. */
+	{
+	 ld     r0, r0
+	 shli   r2, r2, UNALIGN_JIT_SHIFT
+	}
+
+	/*
+	 * If r0 is NULL, the JIT page is not mapped, so go to slow path;
+	 * add offset r2 to r0 at the same time.
+	 */
+	{
+	 beqz   r0, hand_unalign_slow
+	 add    r2, r0, r2
+	}
+
+        /*
+	 * We are loading from userspace (both the JIT info PC and
+	 * instruction word, and the instruction word we executed)
+	 * and since either could fault while holding the interrupt
+	 * critical section, we must tag this region and check it in
+	 * do_page_fault() to handle it properly.
+	 */
+ENTRY(__start_unalign_asm_code)
+
+	/* Load first word of JIT in r0 and increment r2 by 8. */
+	ld_add  r0, r2, 8
+
+	/*
+	 * Compare the PC with the 1st word in JIT; load the fault bundle
+	 * into r1.
+	 */
+	{
+	 cmpeq  r0, r0, r1
+	 ld     r1, r1
+	}
+
+	/* Go to slow path if PC doesn't match. */
+	beqz    r0, hand_unalign_slow
+
+	/*
+	 * Load the 2nd word of JIT, which is supposed to be the fault
+	 * bundle for a cache hit. Increment r2; after this bundle r2 will
+	 * point to the potential start of the JIT code we want to run.
+	 */
+	ld_add  r0, r2, 8
+
+	/* No further accesses to userspace are done after this point. */
+ENTRY(__end_unalign_asm_code)
+
+	/* Compare the real bundle with what is saved in the JIT area. */
+	{
+	 cmpeq  r0, r1, r0
+	 mtspr  SPR_EX_CONTEXT_0_1, zero
+	}
+
+	/* Go to slow path if the fault bundle does not match. */
+	beqz    r0, hand_unalign_slow
+
+	/*
+	 * A cache hit is found.
+	 * r2 points to start of JIT code (3rd word).
+	 * r0 is the fault pc.
+	 * r1 is the fault bundle.
+	 * Reset the low bit of sp.
+	 */
+	{
+	 mfspr  r0, SPR_EX_CONTEXT_K_0
+	 andi   sp, sp, ~1
+	}
+
+	/* Write r2 into EX_CONTEXT_K_0 and increment PC. */
+	{
+	 mtspr  SPR_EX_CONTEXT_K_0, r2
+	 addi   r0, r0, 8
+	}
+
+	/*
+	 * Set ICS on kernel EX_CONTEXT_K_1 in order to "iret" to
+	 * user with ICS set. This way, if the JIT fixup causes another
+	 * unalign exception (which shouldn't be possible) the user
+	 * process will be terminated with SIGBUS. Also, our fixup will
+	 * run without interleaving with external interrupts.
+	 * Each fixup is at most 14 bundles, so it won't hold ICS for long.
+	 */
+	{
+	 movei  r1, PL_ICS_EX1(USER_PL, 1)
+	 mtspr  SPR_EX_CONTEXT_0_0, r0
+	}
+
+	{
+	 mtspr  SPR_EX_CONTEXT_K_1, r1
+	 addi   r3, r3, -(3 * 8)
+	}
+
+	/* Restore r0..r3. */
+	ld_add  r0, r3, 8
+	ld_add  r1, r3, 8
+	ld_add  r2, r3, 8
+	ld      r3, r3
+
+	iret
+	ENDPROC(intvec_\vecname)
+	.endm
 
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	.pushsection .text.intvec_feedback,"ax"
@@ -117,15 +305,21 @@ intvec_feedback:
 	 * The "processing" argument specifies the code for processing
 	 * the interrupt. Defaults to "handle_interrupt".
 	 */
-	.macro  int_hand vecnum, vecname, c_routine, processing=handle_interrupt
-	.org    (\vecnum << 8)
+	.macro __int_hand vecnum, vecname, c_routine,processing=handle_interrupt
 intvec_\vecname:
 	/* Temporarily save a register so we have somewhere to work. */
 
 	mtspr   SPR_SYSTEM_SAVE_K_1, r0
 	mfspr   r0, SPR_EX_CONTEXT_K_1
 
-	andi    r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	/*
+	 * The unalign data fastpath code sets the low bit in sp to
+	 * force us to reset it here on fault.
+	 */
+	{
+	 blbs   sp, 2f
+	 IS_KERNEL_EX1(r0, r0)
+	}
 
 	.ifc    \vecnum, INT_DOUBLE_FAULT
 	/*
@@ -175,15 +369,15 @@ intvec_\vecname:
 	}
 	.endif
 
-
+2:
 	/*
-	 * SYSTEM_SAVE_K_0 holds the cpu number in the low bits, and
-	 * the current stack top in the higher bits.  So we recover
-	 * our stack top by just masking off the low bits, then
+	 * SYSTEM_SAVE_K_0 holds the cpu number in the high bits, and
+	 * the current stack top in the lower bits.  So we recover
+	 * our starting stack value by sign-extending the low bits, then
 	 * point sp at the top aligned address on the actual stack page.
 	 */
 	mfspr   r0, SPR_SYSTEM_SAVE_K_0
-	mm      r0, zero, LOG2_THREAD_SIZE, 63
+	bfexts  r0, r0, 0, CPU_SHIFT-1
 
 0:
 	/*
@@ -205,6 +399,9 @@ intvec_\vecname:
 	 *    cache line 1: r6...r13
 	 *    cache line 0: 2 x frame, r0..r5
 	 */
+#if STACK_TOP_DELTA != 64
+#error STACK_TOP_DELTA must be 64 for assumptions here and in task_pt_regs()
+#endif
 	andi    r0, r0, -64
 
 	/*
@@ -219,7 +416,9 @@ intvec_\vecname:
 	 * This routine saves just the first four registers, plus the
 	 * stack context so we can do proper backtracing right away,
 	 * and defers to handle_interrupt to save the rest.
-	 * The backtracer needs pc, ex1, lr, sp, r52, and faultnum.
+	 * The backtracer needs pc, ex1, lr, sp, r52, and faultnum,
+	 * and needs sp set to its final location at the bottom of
+	 * the stack frame.
 	 */
 	addli   r0, r0, PTREGS_OFFSET_LR - (PTREGS_SIZE + KSTK_PTREGS_GAP)
 	wh64    r0   /* cache line 7 */
@@ -302,7 +501,7 @@ intvec_\vecname:
 	mfspr   r3, SPR_SYSTEM_SAVE_K_2   /* info about page fault */
 	.else
 	.ifc \vecnum, INT_ILL_TRANS
-	mfspr   r2, ILL_TRANS_REASON
+	mfspr   r2, ILL_VA_PC
 	.else
 	.ifc \vecnum, INT_DOUBLE_FAULT
 	mfspr   r2, SPR_SYSTEM_SAVE_K_2   /* double fault info from HV */
@@ -310,14 +509,12 @@ intvec_\vecname:
 	.ifc \c_routine, do_trap
 	mfspr   r2, GPV_REASON
 	.else
-	.ifc \c_routine, op_handle_perf_interrupt
+	.ifc \c_routine, handle_perf_interrupt
 	mfspr   r2, PERF_COUNT_STS
-#if CHIP_HAS_AUX_PERF_COUNTERS()
 	.else
-	.ifc \c_routine, op_handle_aux_perf_interrupt
+	.ifc \c_routine, handle_perf_interrupt
 	mfspr   r2, AUX_PERF_COUNT_STS
 	.endif
-#endif
 	.endif
 	.endif
 	.endif
@@ -336,7 +533,7 @@ intvec_\vecname:
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	.pushsection .text.intvec_feedback,"ax"
 	.org    (\vecnum << 5)
-	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
 	jrp     lr
 	.popsection
 #endif
@@ -449,31 +646,15 @@ intvec_\vecname:
 	push_reg r5, r52
 	st      r52, r4
 
-	/* Load tp with our per-cpu offset. */
-#ifdef CONFIG_SMP
-	{
-	 mfspr  r20, SPR_SYSTEM_SAVE_K_0
-	 moveli r21, hw2_last(__per_cpu_offset)
-	}
-	{
-	 shl16insli r21, r21, hw1(__per_cpu_offset)
-	 bfextu r20, r20, 0, LOG2_THREAD_SIZE-1
-	}
-	shl16insli r21, r21, hw0(__per_cpu_offset)
-	shl3add r20, r20, r21
-	ld      tp, r20
-#else
-	move    tp, zero
-#endif
-
 	/*
 	 * If we will be returning to the kernel, we will need to
 	 * reset the interrupt masks to the state they had before.
-	 * Set DISABLE_IRQ in flags iff we came from PL1 with irqs disabled.
+	 * Set DISABLE_IRQ in flags iff we came from kernel pl with
+	 * irqs disabled.
 	 */
 	mfspr   r32, SPR_EX_CONTEXT_K_1
 	{
-	 andi   r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	 IS_KERNEL_EX1(r22, r22)
 	 PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS)
 	}
 	beqzt   r32, 1f       /* zero if from user space */
@@ -488,6 +669,44 @@ intvec_\vecname:
 	.endif
 	st      r21, r32
 
+	/*
+	 * we've captured enough state to the stack (including in
+	 * particular our EX_CONTEXT state) that we can now release
+	 * the interrupt critical section and replace it with our
+	 * standard "interrupts disabled" mask value.  This allows
+	 * synchronous interrupts (and profile interrupts) to punch
+	 * through from this point onwards.
+	 *
+	 * It's important that no code before this point touch memory
+	 * other than our own stack (to keep the invariant that this
+	 * is all that gets touched under ICS), and that no code after
+	 * this point reference any interrupt-specific SPR, in particular
+	 * the EX_CONTEXT_K_ values.
+	 */
+	.ifc \function,handle_nmi
+	IRQ_DISABLE_ALL(r20)
+	.else
+	IRQ_DISABLE(r20, r21)
+	.endif
+	mtspr   INTERRUPT_CRITICAL_SECTION, zero
+
+	/* Load tp with our per-cpu offset. */
+#ifdef CONFIG_SMP
+	{
+	 mfspr  r20, SPR_SYSTEM_SAVE_K_0
+	 moveli r21, hw2_last(__per_cpu_offset)
+	}
+	{
+	 shl16insli r21, r21, hw1(__per_cpu_offset)
+	 bfextu r20, r20, CPU_SHIFT, 63
+	}
+	shl16insli r21, r21, hw0(__per_cpu_offset)
+	shl3add r20, r20, r21
+	ld      tp, r20
+#else
+	move    tp, zero
+#endif
+
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	/*
 	 * Notify the feedback routines that we were in the
@@ -512,21 +731,6 @@ intvec_\vecname:
 #endif
 
 	/*
-	 * we've captured enough state to the stack (including in
-	 * particular our EX_CONTEXT state) that we can now release
-	 * the interrupt critical section and replace it with our
-	 * standard "interrupts disabled" mask value.  This allows
-	 * synchronous interrupts (and profile interrupts) to punch
-	 * through from this point onwards.
-	 */
-	.ifc \function,handle_nmi
-	IRQ_DISABLE_ALL(r20)
-	.else
-	IRQ_DISABLE(r20, r21)
-	.endif
-	mtspr   INTERRUPT_CRITICAL_SECTION, zero
-
-	/*
 	 * Prepare the first 256 stack bytes to be rapidly accessible
 	 * without having to fetch the background data.
 	 */
@@ -576,7 +780,7 @@ intvec_\vecname:
 	.macro  dc_dispatch vecnum, vecname
 	.org    (\vecnum << 8)
 intvec_\vecname:
-	j       hv_downcall_dispatch
+	j       _hv_downcall_dispatch
 	ENDPROC(intvec_\vecname)
 	.endm
 
@@ -605,6 +809,10 @@ handle_interrupt:
  * This routine takes a boolean in r30 indicating if this is an NMI.
  * If so, we also expect a boolean in r31 indicating whether to
  * re-enable the oprofile interrupts.
+ *
+ * Note that .Lresume_userspace is jumped to directly in several
+ * places, and we need to make sure r30 is set correctly in those
+ * callers as well.
  */
 STD_ENTRY(interrupt_return)
 	/* If we're resuming to kernel space, don't check thread flags. */
@@ -613,14 +821,39 @@ STD_ENTRY(interrupt_return)
 	 PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
 	}
 	ld      r29, r29
-	andi    r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	IS_KERNEL_EX1(r29, r29)
 	{
 	 beqzt  r29, .Lresume_userspace
-	 PTREGS_PTR(r29, PTREGS_OFFSET_PC)
+	 move   r29, sp
 	}
 
+#ifdef CONFIG_PREEMPT
+	/* Returning to kernel space. Check if we need preemption. */
+	EXTRACT_THREAD_INFO(r29)
+	addli   r28, r29, THREAD_INFO_FLAGS_OFFSET
+	{
+	 ld     r28, r28
+	 addli  r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
+	}
+	{
+	 andi   r28, r28, _TIF_NEED_RESCHED
+	 ld4s   r29, r29
+	}
+	beqzt   r28, 1f
+	bnez    r29, 1f
+	/* Disable interrupts explicitly for preemption. */
+	IRQ_DISABLE(r20,r21)
+	TRACE_IRQS_OFF
+	jal     preempt_schedule_irq
+	FEEDBACK_REENTER(interrupt_return)
+1:
+#endif
+
 	/* If we're resuming to _cpu_idle_nap, bump PC forward by 8. */
-	moveli  r27, hw2_last(_cpu_idle_nap)
+	{
+	 moveli r27, hw2_last(_cpu_idle_nap)
+	 PTREGS_PTR(r29, PTREGS_OFFSET_PC)
+	}
 	{
 	 ld     r28, r29
 	 shl16insli r27, r27, hw1(_cpu_idle_nap)
@@ -642,6 +875,20 @@ STD_ENTRY(interrupt_return)
 	FEEDBACK_REENTER(interrupt_return)
 
 	/*
+	 * Use r33 to hold whether we have already loaded the callee-saves
+	 * into ptregs.  We don't want to do it twice in this loop, since
+	 * then we'd clobber whatever changes are made by ptrace, etc.
+	 */
+	{
+	 movei  r33, 0
+	 move   r32, sp
+	}
+
+	/* Get base of stack in r32. */
+	EXTRACT_THREAD_INFO(r32)
+
+.Lretry_work_pending:
+	/*
 	 * Disable interrupts so as to make sure we don't
 	 * miss an interrupt that sets any of the thread flags (like
 	 * need_resched or sigpending) between sampling and the iret.
@@ -651,9 +898,6 @@ STD_ENTRY(interrupt_return)
 	IRQ_DISABLE(r20, r21)
 	TRACE_IRQS_OFF  /* Note: clobbers registers r0-r29 */
 
-	/* Get base of stack in r32; note r30/31 are used as arguments here. */
-	GET_THREAD_INFO(r32)
-
 
 	/* Check to see if there is any work to do before returning to user. */
 	{
@@ -669,16 +913,18 @@ STD_ENTRY(interrupt_return)
 
 	/*
 	 * Make sure we have all the registers saved for signal
-	 * handling or single-step.  Call out to C code to figure out
+	 * handling or notify-resume.  Call out to C code to figure out
 	 * exactly what we need to do for each flag bit, then if
 	 * necessary, reload the flags and recheck.
 	 */
-	push_extra_callee_saves r0
 	{
 	 PTREGS_PTR(r0, PTREGS_OFFSET_BASE)
-	 jal    do_work_pending
+	 bnez   r33, 1f
 	}
-	bnez    r0, .Lresume_userspace
+	push_extra_callee_saves r0
+	movei   r33, 1
+1:	jal     do_work_pending
+	bnez    r0, .Lretry_work_pending
 
 	/*
 	 * In the NMI case we
@@ -702,7 +948,7 @@ STD_ENTRY(interrupt_return)
 	 PTREGS_PTR(r32, PTREGS_OFFSET_FLAGS)
 	}
 	{
-	 andi   r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK
+	 IS_KERNEL_EX1(r0, r0)
 	 ld     r32, r32
 	}
 	bnez    r0, 1f
@@ -718,12 +964,22 @@ STD_ENTRY(interrupt_return)
 	beqzt   r30, .Lrestore_regs
 	j       3f
 2:	TRACE_IRQS_ON
+	IRQ_ENABLE_LOAD(r20, r21)
 	movei   r0, 1
 	mtspr   INTERRUPT_CRITICAL_SECTION, r0
-	IRQ_ENABLE(r20, r21)
+	IRQ_ENABLE_APPLY(r20, r21)
 	beqzt   r30, .Lrestore_regs
 3:
 
+#if INT_PERF_COUNT + 1 != INT_AUX_PERF_COUNT
+# error Bad interrupt assumption
+#endif
+	{
+	 movei  r0, 3   /* two adjacent bits for the PERF_COUNT mask */
+	 beqz   r31, .Lrestore_regs
+	}
+	shli    r0, r0, INT_PERF_COUNT
+	mtspr   SPR_INTERRUPT_MASK_RESET_K, r0
 
 	/*
 	 * We now commit to returning from this interrupt, since we will be
@@ -737,7 +993,6 @@ STD_ENTRY(interrupt_return)
 	 * that will save some cycles if this turns out to be a syscall.
 	 */
 .Lrestore_regs:
-	FEEDBACK_REENTER(interrupt_return)   /* called from elsewhere */
 
 	/*
 	 * Rotate so we have one high bit and one low bit to test.
@@ -773,7 +1028,7 @@ STD_ENTRY(interrupt_return)
 	pop_reg r21, sp, PTREGS_OFFSET_REG(31) - PTREGS_OFFSET_PC
 	{
 	 mtspr  SPR_EX_CONTEXT_K_1, lr
-	 andi   lr, lr, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	 IS_KERNEL_EX1(lr, lr)
 	}
 	{
 	 mtspr  SPR_EX_CONTEXT_K_0, r21
@@ -941,7 +1196,7 @@ handle_nmi:
 	FEEDBACK_REENTER(handle_nmi)
 	{
 	 movei  r30, 1
-	 move   r31, r0
+	 cmpeq  r31, r0, zero
 	}
 	j       interrupt_return
 	STD_ENDPROC(handle_nmi)
@@ -963,19 +1218,30 @@ handle_syscall:
 	shl16insli r20, r20, hw0(irq_stat + IRQ_CPUSTAT_SYSCALL_COUNT_OFFSET)
 	add     r20, r20, tp
 	ld4s    r21, r20
-	addi    r21, r21, 1
-	st4     r20, r21
+	{
+	 addi   r21, r21, 1
+	 move   r31, sp
+	}
+	{
+	 st4    r20, r21
+	 EXTRACT_THREAD_INFO(r31)
+	}
 
 	/* Trace syscalls, if requested. */
-	GET_THREAD_INFO(r31)
 	addi	r31, r31, THREAD_INFO_FLAGS_OFFSET
-	ld	r30, r31
-	andi    r30, r30, _TIF_SYSCALL_TRACE
+	{
+	 ld     r30, r31
+	 moveli r32, _TIF_SYSCALL_ENTRY_WORK
+	}
+	and     r30, r30, r32
 	{
 	 addi   r30, r31, THREAD_INFO_STATUS_OFFSET - THREAD_INFO_FLAGS_OFFSET
 	 beqzt	r30, .Lrestore_syscall_regs
 	}
-	jal	do_syscall_trace
+	{
+	 PTREGS_PTR(r0, PTREGS_OFFSET_BASE)
+	 jal    do_syscall_trace_enter
+	}
 	FEEDBACK_REENTER(handle_syscall)
 
 	/*
@@ -1004,7 +1270,9 @@ handle_syscall:
 	/* Ensure that the syscall number is within the legal range. */
 	{
 	 moveli r20, hw2(sys_call_table)
+#ifdef CONFIG_COMPAT
 	 blbs   r30, .Lcompat_syscall
+#endif
 	}
 	{
 	 cmpltu r21, TREG_SYSCALL_NR_NAME, r21
@@ -1038,13 +1306,37 @@ handle_syscall:
 	FEEDBACK_REENTER(handle_syscall)
 
 	/* Do syscall trace again, if requested. */
-	ld	r30, r31
-	andi    r30, r30, _TIF_SYSCALL_TRACE
-	beqzt	r30, 1f
-	jal	do_syscall_trace
+	{
+	 ld      r30, r31
+	 moveli  r32, _TIF_SYSCALL_EXIT_WORK
+	}
+	and      r0, r30, r32
+	{
+	 andi    r0, r30, _TIF_SINGLESTEP
+	 beqzt   r0, 1f
+	}
+	{
+	 PTREGS_PTR(r0, PTREGS_OFFSET_BASE)
+	 jal    do_syscall_trace_exit
+	}
 	FEEDBACK_REENTER(handle_syscall)
-1:	j       .Lresume_userspace   /* jump into middle of interrupt_return */
+	andi    r0, r30, _TIF_SINGLESTEP
+
+1:	beqzt	r0, 2f
+
+	/* Single stepping -- notify ptrace. */
+	{
+	 movei   r0, SIGTRAP
+	 jal     ptrace_notify
+	}
+	FEEDBACK_REENTER(handle_syscall)
+
+2:	{
+	 movei  r30, 0               /* not an NMI */
+	 j      .Lresume_userspace   /* jump into middle of interrupt_return */
+	}
 
+#ifdef CONFIG_COMPAT
 .Lcompat_syscall:
 	/*
 	 * Load the base of the compat syscall table in r20, and
@@ -1069,6 +1361,7 @@ handle_syscall:
 	{ move r15, r4; addxi r4, r4, 0 }
 	{ move r16, r5; addxi r5, r5, 0 }
 	j .Lload_syscall_pointer
+#endif
 
 .Linvalid_syscall:
 	/* Report an invalid syscall back to the user program */
@@ -1077,7 +1370,10 @@ handle_syscall:
 	 movei  r28, -ENOSYS
 	}
 	st      r29, r28
-	j       .Lresume_userspace   /* jump into middle of interrupt_return */
+	{
+	 movei  r30, 0               /* not an NMI */
+	 j      .Lresume_userspace   /* jump into middle of interrupt_return */
+	}
 	STD_ENDPROC(handle_syscall)
 
 	/* Return the address for oprofile to suppress in backtraces. */
@@ -1093,9 +1389,27 @@ STD_ENTRY(ret_from_fork)
 	jal     sim_notify_fork
 	jal     schedule_tail
 	FEEDBACK_REENTER(ret_from_fork)
-	j       .Lresume_userspace
+	{
+	 movei  r30, 0               /* not an NMI */
+	 j      .Lresume_userspace   /* jump into middle of interrupt_return */
+	}
 	STD_ENDPROC(ret_from_fork)
 
+STD_ENTRY(ret_from_kernel_thread)
+	jal     sim_notify_fork
+	jal     schedule_tail
+	FEEDBACK_REENTER(ret_from_fork)
+	{
+	 move   r0, r31
+	 jalr   r30
+	}
+	FEEDBACK_REENTER(ret_from_kernel_thread)
+	{
+	 movei  r30, 0               /* not an NMI */
+	 j      .Lresume_userspace   /* jump into middle of interrupt_return */
+	}
+	STD_ENDPROC(ret_from_kernel_thread)
+
 /* Various stub interrupt handlers and syscall handlers */
 
 STD_ENTRY_LOCAL(_kernel_double_fault)
@@ -1112,15 +1426,6 @@ STD_ENTRY_LOCAL(bad_intr)
 	panic   "Unhandled interrupt %#x: PC %#lx"
 	STD_ENDPROC(bad_intr)
 
-/* Put address of pt_regs in reg and jump. */
-#define PTREGS_SYSCALL(x, reg)                          \
-	STD_ENTRY(_##x);                                \
-	{                                               \
-	 PTREGS_PTR(reg, PTREGS_OFFSET_BASE);           \
-	 j      x                                       \
-	};                                              \
-	STD_ENDPROC(_##x)
-
 /*
  * Special-case sigreturn to not write r0 to the stack on return.
  * This is technically more efficient, but it also avoids difficulties
@@ -1136,37 +1441,74 @@ STD_ENTRY_LOCAL(bad_intr)
 	};                                              \
 	STD_ENDPROC(_##x)
 
-PTREGS_SYSCALL(sys_execve, r3)
-PTREGS_SYSCALL(sys_sigaltstack, r2)
 PTREGS_SYSCALL_SIGRETURN(sys_rt_sigreturn, r0)
 #ifdef CONFIG_COMPAT
-PTREGS_SYSCALL(compat_sys_execve, r3)
-PTREGS_SYSCALL(compat_sys_sigaltstack, r2)
 PTREGS_SYSCALL_SIGRETURN(compat_sys_rt_sigreturn, r0)
 #endif
 
-/* Save additional callee-saves to pt_regs, put address in r4 and jump. */
+/* Save additional callee-saves to pt_regs and jump to standard function. */
 STD_ENTRY(_sys_clone)
 	push_extra_callee_saves r4
 	j       sys_clone
 	STD_ENDPROC(_sys_clone)
 
-/* The single-step support may need to read all the registers. */
+	/*
+	 * Recover r3, r2, r1 and r0 here saved by unalign fast vector.
+	 * The vector area limit is 32 bundles, so we handle the reload here.
+	 * r0, r1, r2 are in thread_info from low to high memory in order.
+	 * r3 points to location the original r3 was saved.
+	 * We put this code in the __HEAD section so it can be reached
+	 * via a conditional branch from the fast path.
+	 */
+	__HEAD
+hand_unalign_slow:
+	andi    sp, sp, ~1
+hand_unalign_slow_badsp:
+	addi    r3, r3, -(3 * 8)
+	ld_add  r0, r3, 8
+	ld_add  r1, r3, 8
+	ld      r2, r3
+hand_unalign_slow_nonuser:
+	mfspr   r3, SPR_SYSTEM_SAVE_K_1
+	__int_hand     INT_UNALIGN_DATA, UNALIGN_DATA_SLOW, int_unalign
+
+/* The unaligned data support needs to read all the registers. */
 int_unalign:
 	push_extra_callee_saves r0
-	j       do_trap
+	j       do_unaligned
+ENDPROC(hand_unalign_slow)
 
-/* Include .intrpt1 array of interrupt vectors */
-	.section ".intrpt1", "ax"
+/* Fill the return address stack with nonzero entries. */
+STD_ENTRY(fill_ra_stack)
+	{
+	 move	r0, lr
+	 jal	1f
+	}
+1:	jal	2f
+2:	jal	3f
+3:	jal	4f
+4:	jrp	r0
+	STD_ENDPROC(fill_ra_stack)
 
-#define op_handle_perf_interrupt bad_intr
-#define op_handle_aux_perf_interrupt bad_intr
+	.macro int_hand  vecnum, vecname, c_routine, processing=handle_interrupt
+	.org   (\vecnum << 8)
+		__int_hand   \vecnum, \vecname, \c_routine, \processing
+	.endm
+
+/* Include .intrpt array of interrupt vectors */
+	.section ".intrpt", "ax"
+	.global intrpt_start
+intrpt_start:
+
+#ifndef CONFIG_USE_PMC
+#define handle_perf_interrupt bad_intr
+#endif
 
 #ifndef CONFIG_HARDWALL
 #define do_hardwall_trap bad_intr
 #endif
 
-	int_hand     INT_MEM_ERROR, MEM_ERROR, bad_intr
+	int_hand     INT_MEM_ERROR, MEM_ERROR, do_trap
 	int_hand     INT_SINGLE_STEP_3, SINGLE_STEP_3, bad_intr
 #if CONFIG_KERNEL_PL == 2
 	int_hand     INT_SINGLE_STEP_2, SINGLE_STEP_2, gx_singlestep_handle
@@ -1188,10 +1530,10 @@ int_unalign:
 	int_hand     INT_SWINT_1, SWINT_1, SYSCALL, handle_syscall
 	int_hand     INT_SWINT_0, SWINT_0, do_trap
 	int_hand     INT_ILL_TRANS, ILL_TRANS, do_trap
-	int_hand     INT_UNALIGN_DATA, UNALIGN_DATA, int_unalign
+	int_hand_unalign_fast INT_UNALIGN_DATA, UNALIGN_DATA
 	int_hand     INT_DTLB_MISS, DTLB_MISS, do_page_fault
 	int_hand     INT_DTLB_ACCESS, DTLB_ACCESS, do_page_fault
-	int_hand     INT_IDN_FIREWALL, IDN_FIREWALL, bad_intr
+	int_hand     INT_IDN_FIREWALL, IDN_FIREWALL, do_hardwall_trap
 	int_hand     INT_UDN_FIREWALL, UDN_FIREWALL, do_hardwall_trap
 	int_hand     INT_TILE_TIMER, TILE_TIMER, do_timer_interrupt
 	int_hand     INT_IDN_TIMER, IDN_TIMER, bad_intr
@@ -1208,9 +1550,9 @@ int_unalign:
 #endif
 	int_hand     INT_IPI_0, IPI_0, bad_intr
 	int_hand     INT_PERF_COUNT, PERF_COUNT, \
-		     op_handle_perf_interrupt, handle_nmi
+		     handle_perf_interrupt, handle_nmi
 	int_hand     INT_AUX_PERF_COUNT, AUX_PERF_COUNT, \
-		     op_handle_perf_interrupt, handle_nmi
+		     handle_perf_interrupt, handle_nmi
 	int_hand     INT_INTCTRL_3, INTCTRL_3, bad_intr
 #if CONFIG_KERNEL_PL == 2
 	dc_dispatch  INT_INTCTRL_2, INTCTRL_2
diff --git a/arch/tile/kernel/irq.c b/arch/tile/kernel/irq.c
index 02e62806501..637f2ffaa5f 100644
--- a/arch/tile/kernel/irq.c
+++ b/arch/tile/kernel/irq.c
@@ -21,6 +21,7 @@
 #include <hv/drv_pcie_rc_intf.h>
 #include <arch/spr_def.h>
 #include <asm/traps.h>
+#include <linux/perf_event.h>
 
 /* Bit-flag stored in irq_desc->chip_data to indicate HW-cleared irqs. */
 #define IS_HW_CLEARED 1
@@ -53,12 +54,6 @@ static DEFINE_PER_CPU(unsigned long, irq_disable_mask)
  */
 static DEFINE_PER_CPU(int, irq_depth);
 
-/* State for allocating IRQs on Gx. */
-#if CHIP_HAS_IPI()
-static unsigned long available_irqs = ~(1UL << IRQ_RESCHEDULE);
-static DEFINE_SPINLOCK(available_irqs_lock);
-#endif
-
 #if CHIP_HAS_IPI()
 /* Use SPRs to manipulate device interrupts. */
 #define mask_irqs(irq_mask) __insn_mtspr(SPR_IPI_MASK_SET_K, irq_mask)
@@ -73,7 +68,8 @@ static DEFINE_SPINLOCK(available_irqs_lock);
 
 /*
  * The interrupt handling path, implemented in terms of HV interrupt
- * emulation on TILE64 and TILEPro, and IPI hardware on TILE-Gx.
+ * emulation on TILEPro, and IPI hardware on TILE-Gx.
+ * Entered with interrupts disabled.
  */
 void tile_dev_intr(struct pt_regs *regs, int intnum)
 {
@@ -220,7 +216,7 @@ void __init init_IRQ(void)
 	ipi_init();
 }
 
-void __cpuinit setup_irq_regs(void)
+void setup_irq_regs(void)
 {
 	/* Enable interrupt delivery. */
 	unmask_irqs(~0UL);
@@ -233,7 +229,7 @@ void tile_irq_activate(unsigned int irq, int tile_irq_type)
 {
 	/*
 	 * We use handle_level_irq() by default because the pending
-	 * interrupt vector (whether modeled by the HV on TILE64 and
+	 * interrupt vector (whether modeled by the HV on
 	 * TILEPro or implemented in hardware on TILE-Gx) has
 	 * level-style semantics for each bit.  An interrupt fires
 	 * whenever a bit is high, not just at edges.
@@ -259,37 +255,27 @@ void ack_bad_irq(unsigned int irq)
 }
 
 /*
- * Generic, controller-independent functions:
+ * /proc/interrupts printing:
  */
-
-#if CHIP_HAS_IPI()
-int create_irq(void)
+int arch_show_interrupts(struct seq_file *p, int prec)
 {
-	unsigned long flags;
-	int result;
-
-	spin_lock_irqsave(&available_irqs_lock, flags);
-	if (available_irqs == 0)
-		result = -ENOMEM;
-	else {
-		result = __ffs(available_irqs);
-		available_irqs &= ~(1UL << result);
-		dynamic_irq_init(result);
-	}
-	spin_unlock_irqrestore(&available_irqs_lock, flags);
+#ifdef CONFIG_PERF_EVENTS
+	int i;
 
-	return result;
+	seq_printf(p, "%*s: ", prec, "PMI");
+
+	for_each_online_cpu(i)
+		seq_printf(p, "%10llu ", per_cpu(perf_irqs, i));
+	seq_puts(p, "  perf_events\n");
+#endif
+	return 0;
 }
-EXPORT_SYMBOL(create_irq);
 
-void destroy_irq(unsigned int irq)
+#if CHIP_HAS_IPI()
+int arch_setup_hwirq(unsigned int irq, int node)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&available_irqs_lock, flags);
-	available_irqs |= (1UL << irq);
-	dynamic_irq_cleanup(irq);
-	spin_unlock_irqrestore(&available_irqs_lock, flags);
+	return irq >= NR_IRQS ? -EINVAL : 0;
 }
-EXPORT_SYMBOL(destroy_irq);
+
+void arch_teardown_hwirq(unsigned int irq) { }
 #endif
diff --git a/arch/tile/kernel/kgdb.c b/arch/tile/kernel/kgdb.c
new file mode 100644
index 00000000000..4cd88381a83
--- /dev/null
+++ b/arch/tile/kernel/kgdb.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx KGDB support.
+ */
+
+#include <linux/ptrace.h>
+#include <linux/kgdb.h>
+#include <linux/kdebug.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <asm/cacheflush.h>
+
+static tile_bundle_bits singlestep_insn = TILEGX_BPT_BUNDLE | DIE_SSTEPBP;
+static unsigned long stepped_addr;
+static tile_bundle_bits stepped_instr;
+
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
+	{ "r0", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[0])},
+	{ "r1", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[1])},
+	{ "r2", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[2])},
+	{ "r3", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[3])},
+	{ "r4", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[4])},
+	{ "r5", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[5])},
+	{ "r6", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[6])},
+	{ "r7", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[7])},
+	{ "r8", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[8])},
+	{ "r9", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[9])},
+	{ "r10", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[10])},
+	{ "r11", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[11])},
+	{ "r12", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[12])},
+	{ "r13", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[13])},
+	{ "r14", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[14])},
+	{ "r15", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[15])},
+	{ "r16", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[16])},
+	{ "r17", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[17])},
+	{ "r18", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[18])},
+	{ "r19", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[19])},
+	{ "r20", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[20])},
+	{ "r21", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[21])},
+	{ "r22", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[22])},
+	{ "r23", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[23])},
+	{ "r24", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[24])},
+	{ "r25", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[25])},
+	{ "r26", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[26])},
+	{ "r27", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[27])},
+	{ "r28", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[28])},
+	{ "r29", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[29])},
+	{ "r30", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[30])},
+	{ "r31", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[31])},
+	{ "r32", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[32])},
+	{ "r33", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[33])},
+	{ "r34", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[34])},
+	{ "r35", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[35])},
+	{ "r36", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[36])},
+	{ "r37", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[37])},
+	{ "r38", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[38])},
+	{ "r39", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[39])},
+	{ "r40", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[40])},
+	{ "r41", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[41])},
+	{ "r42", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[42])},
+	{ "r43", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[43])},
+	{ "r44", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[44])},
+	{ "r45", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[45])},
+	{ "r46", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[46])},
+	{ "r47", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[47])},
+	{ "r48", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[48])},
+	{ "r49", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[49])},
+	{ "r50", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[50])},
+	{ "r51", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[51])},
+	{ "r52", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[52])},
+	{ "tp", GDB_SIZEOF_REG, offsetof(struct pt_regs, tp)},
+	{ "sp", GDB_SIZEOF_REG, offsetof(struct pt_regs, sp)},
+	{ "lr", GDB_SIZEOF_REG, offsetof(struct pt_regs, lr)},
+	{ "sn", GDB_SIZEOF_REG, -1},
+	{ "idn0", GDB_SIZEOF_REG, -1},
+	{ "idn1", GDB_SIZEOF_REG, -1},
+	{ "udn0", GDB_SIZEOF_REG, -1},
+	{ "udn1", GDB_SIZEOF_REG, -1},
+	{ "udn2", GDB_SIZEOF_REG, -1},
+	{ "udn3", GDB_SIZEOF_REG, -1},
+	{ "zero", GDB_SIZEOF_REG, -1},
+	{ "pc", GDB_SIZEOF_REG, offsetof(struct pt_regs, pc)},
+	{ "faultnum", GDB_SIZEOF_REG, offsetof(struct pt_regs, faultnum)},
+};
+
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (regno >= DBG_MAX_REG_NUM || regno < 0)
+		return NULL;
+
+	if (dbg_reg_def[regno].offset != -1)
+		memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
+		       dbg_reg_def[regno].size);
+	else
+		memset(mem, 0, dbg_reg_def[regno].size);
+	return dbg_reg_def[regno].name;
+}
+
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (regno >= DBG_MAX_REG_NUM || regno < 0)
+		return -EINVAL;
+
+	if (dbg_reg_def[regno].offset != -1)
+		memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
+		       dbg_reg_def[regno].size);
+	return 0;
+}
+
+/*
+ * Similar to pt_regs_to_gdb_regs() except that process is sleeping and so
+ * we may not be able to get all the info.
+ */
+void
+sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *task)
+{
+	int reg;
+	struct pt_regs *thread_regs;
+	unsigned long *ptr = gdb_regs;
+
+	if (task == NULL)
+		return;
+
+	/* Initialize to zero. */
+	memset(gdb_regs, 0, NUMREGBYTES);
+
+	thread_regs = task_pt_regs(task);
+	for (reg = 0; reg <= TREG_LAST_GPR; reg++)
+		*(ptr++) = thread_regs->regs[reg];
+
+	gdb_regs[TILEGX_PC_REGNUM] = thread_regs->pc;
+	gdb_regs[TILEGX_FAULTNUM_REGNUM] = thread_regs->faultnum;
+}
+
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
+{
+	regs->pc = pc;
+}
+
+static void kgdb_call_nmi_hook(void *ignored)
+{
+	kgdb_nmicallback(raw_smp_processor_id(), NULL);
+}
+
+void kgdb_roundup_cpus(unsigned long flags)
+{
+	local_irq_enable();
+	smp_call_function(kgdb_call_nmi_hook, NULL, 0);
+	local_irq_disable();
+}
+
+/*
+ * Convert a kernel address to the writable kernel text mapping.
+ */
+static unsigned long writable_address(unsigned long addr)
+{
+	unsigned long ret = 0;
+
+	if (core_kernel_text(addr))
+		ret = addr - MEM_SV_START + PAGE_OFFSET;
+	else if (is_module_text_address(addr))
+		ret = addr;
+	else
+		pr_err("Unknown virtual address 0x%lx\n", addr);
+
+	return ret;
+}
+
+/*
+ * Calculate the new address for after a step.
+ */
+static unsigned long get_step_address(struct pt_regs *regs)
+{
+	int src_reg;
+	int jump_off;
+	int br_off;
+	unsigned long addr;
+	unsigned int opcode;
+	tile_bundle_bits bundle;
+
+	/* Move to the next instruction by default. */
+	addr = regs->pc + TILEGX_BUNDLE_SIZE_IN_BYTES;
+	bundle = *(unsigned long *)instruction_pointer(regs);
+
+	/* 0: X mode, Otherwise: Y mode. */
+	if (bundle & TILEGX_BUNDLE_MODE_MASK) {
+		if (get_Opcode_Y1(bundle) == RRR_1_OPCODE_Y1 &&
+		    get_RRROpcodeExtension_Y1(bundle) ==
+		    UNARY_RRR_1_OPCODE_Y1) {
+			opcode = get_UnaryOpcodeExtension_Y1(bundle);
+
+			switch (opcode) {
+			case JALR_UNARY_OPCODE_Y1:
+			case JALRP_UNARY_OPCODE_Y1:
+			case JR_UNARY_OPCODE_Y1:
+			case JRP_UNARY_OPCODE_Y1:
+				src_reg = get_SrcA_Y1(bundle);
+				dbg_get_reg(src_reg, &addr, regs);
+				break;
+			}
+		}
+	} else if (get_Opcode_X1(bundle) == RRR_0_OPCODE_X1) {
+		if (get_RRROpcodeExtension_X1(bundle) ==
+		    UNARY_RRR_0_OPCODE_X1) {
+			opcode = get_UnaryOpcodeExtension_X1(bundle);
+
+			switch (opcode) {
+			case JALR_UNARY_OPCODE_X1:
+			case JALRP_UNARY_OPCODE_X1:
+			case JR_UNARY_OPCODE_X1:
+			case JRP_UNARY_OPCODE_X1:
+				src_reg = get_SrcA_X1(bundle);
+				dbg_get_reg(src_reg, &addr, regs);
+				break;
+			}
+		}
+	} else if (get_Opcode_X1(bundle) == JUMP_OPCODE_X1) {
+		opcode = get_JumpOpcodeExtension_X1(bundle);
+
+		switch (opcode) {
+		case JAL_JUMP_OPCODE_X1:
+		case J_JUMP_OPCODE_X1:
+			jump_off = sign_extend(get_JumpOff_X1(bundle), 27);
+			addr = regs->pc +
+				(jump_off << TILEGX_LOG2_BUNDLE_SIZE_IN_BYTES);
+			break;
+		}
+	} else if (get_Opcode_X1(bundle) == BRANCH_OPCODE_X1) {
+		br_off = 0;
+		opcode = get_BrType_X1(bundle);
+
+		switch (opcode) {
+		case BEQZT_BRANCH_OPCODE_X1:
+		case BEQZ_BRANCH_OPCODE_X1:
+			if (get_SrcA_X1(bundle) == 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BGEZT_BRANCH_OPCODE_X1:
+		case BGEZ_BRANCH_OPCODE_X1:
+			if (get_SrcA_X1(bundle) >= 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BGTZT_BRANCH_OPCODE_X1:
+		case BGTZ_BRANCH_OPCODE_X1:
+			if (get_SrcA_X1(bundle) > 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BLBCT_BRANCH_OPCODE_X1:
+		case BLBC_BRANCH_OPCODE_X1:
+			if (!(get_SrcA_X1(bundle) & 1))
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BLBST_BRANCH_OPCODE_X1:
+		case BLBS_BRANCH_OPCODE_X1:
+			if (get_SrcA_X1(bundle) & 1)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BLEZT_BRANCH_OPCODE_X1:
+		case BLEZ_BRANCH_OPCODE_X1:
+			if (get_SrcA_X1(bundle) <= 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BLTZT_BRANCH_OPCODE_X1:
+		case BLTZ_BRANCH_OPCODE_X1:
+			if (get_SrcA_X1(bundle) < 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		case BNEZT_BRANCH_OPCODE_X1:
+		case BNEZ_BRANCH_OPCODE_X1:
+			if (get_SrcA_X1(bundle) != 0)
+				br_off = get_BrOff_X1(bundle);
+			break;
+		}
+
+		if (br_off != 0) {
+			br_off = sign_extend(br_off, 17);
+			addr = regs->pc +
+				(br_off << TILEGX_LOG2_BUNDLE_SIZE_IN_BYTES);
+		}
+	}
+
+	return addr;
+}
+
+/*
+ * Replace the next instruction after the current instruction with a
+ * breakpoint instruction.
+ */
+static void do_single_step(struct pt_regs *regs)
+{
+	unsigned long addr_wr;
+
+	/* Determine where the target instruction will send us to. */
+	stepped_addr = get_step_address(regs);
+	probe_kernel_read((char *)&stepped_instr, (char *)stepped_addr,
+			  BREAK_INSTR_SIZE);
+
+	addr_wr = writable_address(stepped_addr);
+	probe_kernel_write((char *)addr_wr, (char *)&singlestep_insn,
+			   BREAK_INSTR_SIZE);
+	smp_wmb();
+	flush_icache_range(stepped_addr, stepped_addr + BREAK_INSTR_SIZE);
+}
+
+static void undo_single_step(struct pt_regs *regs)
+{
+	unsigned long addr_wr;
+
+	if (stepped_instr == 0)
+		return;
+
+	addr_wr = writable_address(stepped_addr);
+	probe_kernel_write((char *)addr_wr, (char *)&stepped_instr,
+			   BREAK_INSTR_SIZE);
+	stepped_instr = 0;
+	smp_wmb();
+	flush_icache_range(stepped_addr, stepped_addr + BREAK_INSTR_SIZE);
+}
+
+/*
+ * Calls linux_debug_hook before the kernel dies. If KGDB is enabled,
+ * then try to fall into the debugger.
+ */
+static int
+kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
+{
+	int ret;
+	unsigned long flags;
+	struct die_args *args = (struct die_args *)ptr;
+	struct pt_regs *regs = args->regs;
+
+#ifdef CONFIG_KPROBES
+	/*
+	 * Return immediately if the kprobes fault notifier has set
+	 * DIE_PAGE_FAULT.
+	 */
+	if (cmd == DIE_PAGE_FAULT)
+		return NOTIFY_DONE;
+#endif /* CONFIG_KPROBES */
+
+	switch (cmd) {
+	case DIE_BREAK:
+	case DIE_COMPILED_BPT:
+		break;
+	case DIE_SSTEPBP:
+		local_irq_save(flags);
+		kgdb_handle_exception(0, SIGTRAP, 0, regs);
+		local_irq_restore(flags);
+		return NOTIFY_STOP;
+	default:
+		/* Userspace events, ignore. */
+		if (user_mode(regs))
+			return NOTIFY_DONE;
+	}
+
+	local_irq_save(flags);
+	ret = kgdb_handle_exception(args->trapnr, args->signr, args->err, regs);
+	local_irq_restore(flags);
+	if (ret)
+		return NOTIFY_DONE;
+
+	return NOTIFY_STOP;
+}
+
+static struct notifier_block kgdb_notifier = {
+	.notifier_call = kgdb_notify,
+};
+
+/*
+ * kgdb_arch_handle_exception - Handle architecture specific GDB packets.
+ * @vector: The error vector of the exception that happened.
+ * @signo: The signal number of the exception that happened.
+ * @err_code: The error code of the exception that happened.
+ * @remcom_in_buffer: The buffer of the packet we have read.
+ * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into.
+ * @regs: The &struct pt_regs of the current process.
+ *
+ * This function MUST handle the 'c' and 's' command packets,
+ * as well packets to set / remove a hardware breakpoint, if used.
+ * If there are additional packets which the hardware needs to handle,
+ * they are handled here. The code should return -1 if it wants to
+ * process more packets, and a %0 or %1 if it wants to exit from the
+ * kgdb callback.
+ */
+int kgdb_arch_handle_exception(int vector, int signo, int err_code,
+			       char *remcom_in_buffer, char *remcom_out_buffer,
+			       struct pt_regs *regs)
+{
+	char *ptr;
+	unsigned long address;
+
+	/* Undo any stepping we may have done. */
+	undo_single_step(regs);
+
+	switch (remcom_in_buffer[0]) {
+	case 'c':
+	case 's':
+	case 'D':
+	case 'k':
+		/*
+		 * Try to read optional parameter, pc unchanged if no parm.
+		 * If this was a compiled-in breakpoint, we need to move
+		 * to the next instruction or we will just breakpoint
+		 * over and over again.
+		 */
+		ptr = &remcom_in_buffer[1];
+		if (kgdb_hex2long(&ptr, &address))
+			regs->pc = address;
+		else if (*(unsigned long *)regs->pc == compiled_bpt)
+			regs->pc += BREAK_INSTR_SIZE;
+
+		if (remcom_in_buffer[0] == 's') {
+			do_single_step(regs);
+			kgdb_single_step = 1;
+			atomic_set(&kgdb_cpu_doing_single_step,
+				   raw_smp_processor_id());
+		} else
+			atomic_set(&kgdb_cpu_doing_single_step, -1);
+
+		return 0;
+	}
+
+	return -1; /* this means that we do not want to exit from the handler */
+}
+
+struct kgdb_arch arch_kgdb_ops;
+
+/*
+ * kgdb_arch_init - Perform any architecture specific initalization.
+ *
+ * This function will handle the initalization of any architecture
+ * specific callbacks.
+ */
+int kgdb_arch_init(void)
+{
+	tile_bundle_bits bundle = TILEGX_BPT_BUNDLE;
+
+	memcpy(arch_kgdb_ops.gdb_bpt_instr, &bundle, BREAK_INSTR_SIZE);
+	return register_die_notifier(&kgdb_notifier);
+}
+
+/*
+ * kgdb_arch_exit - Perform any architecture specific uninitalization.
+ *
+ * This function will handle the uninitalization of any architecture
+ * specific callbacks, for dynamic registration and unregistration.
+ */
+void kgdb_arch_exit(void)
+{
+	unregister_die_notifier(&kgdb_notifier);
+}
+
+int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+	int err;
+	unsigned long addr_wr = writable_address(bpt->bpt_addr);
+
+	if (addr_wr == 0)
+		return -1;
+
+	err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+				BREAK_INSTR_SIZE);
+	if (err)
+		return err;
+
+	err = probe_kernel_write((char *)addr_wr, arch_kgdb_ops.gdb_bpt_instr,
+				 BREAK_INSTR_SIZE);
+	smp_wmb();
+	flush_icache_range((unsigned long)bpt->bpt_addr,
+			   (unsigned long)bpt->bpt_addr + BREAK_INSTR_SIZE);
+	return err;
+}
+
+int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+	int err;
+	unsigned long addr_wr = writable_address(bpt->bpt_addr);
+
+	if (addr_wr == 0)
+		return -1;
+
+	err = probe_kernel_write((char *)addr_wr, (char *)bpt->saved_instr,
+				 BREAK_INSTR_SIZE);
+	smp_wmb();
+	flush_icache_range((unsigned long)bpt->bpt_addr,
+			   (unsigned long)bpt->bpt_addr + BREAK_INSTR_SIZE);
+	return err;
+}
diff --git a/arch/tile/kernel/kprobes.c b/arch/tile/kernel/kprobes.c
new file mode 100644
index 00000000000..27cdcacbe81
--- /dev/null
+++ b/arch/tile/kernel/kprobes.c
@@ -0,0 +1,528 @@
+/*
+ * arch/tile/kernel/kprobes.c
+ * Kprobes on TILE-Gx
+ *
+ * Some portions copied from the MIPS version.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright 2006 Sony Corp.
+ * Copyright 2010 Cavium Networks
+ *
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+
+#include <arch/opcode.h>
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+tile_bundle_bits breakpoint_insn = TILEGX_BPT_BUNDLE;
+tile_bundle_bits breakpoint2_insn = TILEGX_BPT_BUNDLE | DIE_SSTEPBP;
+
+/*
+ * Check whether instruction is branch or jump, or if executing it
+ * has different results depending on where it is executed (e.g. lnk).
+ */
+static int __kprobes insn_has_control(kprobe_opcode_t insn)
+{
+	if (get_Mode(insn) != 0) {   /* Y-format bundle */
+		if (get_Opcode_Y1(insn) != RRR_1_OPCODE_Y1 ||
+		    get_RRROpcodeExtension_Y1(insn) != UNARY_RRR_1_OPCODE_Y1)
+			return 0;
+
+		switch (get_UnaryOpcodeExtension_Y1(insn)) {
+		case JALRP_UNARY_OPCODE_Y1:
+		case JALR_UNARY_OPCODE_Y1:
+		case JRP_UNARY_OPCODE_Y1:
+		case JR_UNARY_OPCODE_Y1:
+		case LNK_UNARY_OPCODE_Y1:
+			return 1;
+		default:
+			return 0;
+		}
+	}
+
+	switch (get_Opcode_X1(insn)) {
+	case BRANCH_OPCODE_X1:	/* branch instructions */
+	case JUMP_OPCODE_X1:	/* jump instructions: j and jal */
+		return 1;
+
+	case RRR_0_OPCODE_X1:   /* other jump instructions */
+		if (get_RRROpcodeExtension_X1(insn) != UNARY_RRR_0_OPCODE_X1)
+			return 0;
+		switch (get_UnaryOpcodeExtension_X1(insn)) {
+		case JALRP_UNARY_OPCODE_X1:
+		case JALR_UNARY_OPCODE_X1:
+		case JRP_UNARY_OPCODE_X1:
+		case JR_UNARY_OPCODE_X1:
+		case LNK_UNARY_OPCODE_X1:
+			return 1;
+		default:
+			return 0;
+		}
+	default:
+		return 0;
+	}
+}
+
+int __kprobes arch_prepare_kprobe(struct kprobe *p)
+{
+	unsigned long addr = (unsigned long)p->addr;
+
+	if (addr & (sizeof(kprobe_opcode_t) - 1))
+		return -EINVAL;
+
+	if (insn_has_control(*p->addr)) {
+		pr_notice("Kprobes for control instructions are not "
+			  "supported\n");
+		return -EINVAL;
+	}
+
+	/* insn: must be on special executable page on tile. */
+	p->ainsn.insn = get_insn_slot();
+	if (!p->ainsn.insn)
+		return -ENOMEM;
+
+	/*
+	 * In the kprobe->ainsn.insn[] array we store the original
+	 * instruction at index zero and a break trap instruction at
+	 * index one.
+	 */
+	memcpy(&p->ainsn.insn[0], p->addr, sizeof(kprobe_opcode_t));
+	p->ainsn.insn[1] = breakpoint2_insn;
+	p->opcode = *p->addr;
+
+	return 0;
+}
+
+void __kprobes arch_arm_kprobe(struct kprobe *p)
+{
+	unsigned long addr_wr;
+
+	/* Operate on writable kernel text mapping. */
+	addr_wr = (unsigned long)p->addr - MEM_SV_START + PAGE_OFFSET;
+
+	if (probe_kernel_write((void *)addr_wr, &breakpoint_insn,
+		sizeof(breakpoint_insn)))
+		pr_err("%s: failed to enable kprobe\n", __func__);
+
+	smp_wmb();
+	flush_insn_slot(p);
+}
+
+void __kprobes arch_disarm_kprobe(struct kprobe *kp)
+{
+	unsigned long addr_wr;
+
+	/* Operate on writable kernel text mapping. */
+	addr_wr = (unsigned long)kp->addr - MEM_SV_START + PAGE_OFFSET;
+
+	if (probe_kernel_write((void *)addr_wr, &kp->opcode,
+		sizeof(kp->opcode)))
+		pr_err("%s: failed to enable kprobe\n", __func__);
+
+	smp_wmb();
+	flush_insn_slot(kp);
+}
+
+void __kprobes arch_remove_kprobe(struct kprobe *p)
+{
+	if (p->ainsn.insn) {
+		free_insn_slot(p->ainsn.insn, 0);
+		p->ainsn.insn = NULL;
+	}
+}
+
+static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+	kcb->prev_kprobe.kp = kprobe_running();
+	kcb->prev_kprobe.status = kcb->kprobe_status;
+	kcb->prev_kprobe.saved_pc = kcb->kprobe_saved_pc;
+}
+
+static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+	__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
+	kcb->kprobe_status = kcb->prev_kprobe.status;
+	kcb->kprobe_saved_pc = kcb->prev_kprobe.saved_pc;
+}
+
+static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+			struct kprobe_ctlblk *kcb)
+{
+	__this_cpu_write(current_kprobe, p);
+	kcb->kprobe_saved_pc = regs->pc;
+}
+
+static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+{
+	/* Single step inline if the instruction is a break. */
+	if (p->opcode == breakpoint_insn ||
+	    p->opcode == breakpoint2_insn)
+		regs->pc = (unsigned long)p->addr;
+	else
+		regs->pc = (unsigned long)&p->ainsn.insn[0];
+}
+
+static int __kprobes kprobe_handler(struct pt_regs *regs)
+{
+	struct kprobe *p;
+	int ret = 0;
+	kprobe_opcode_t *addr;
+	struct kprobe_ctlblk *kcb;
+
+	addr = (kprobe_opcode_t *)regs->pc;
+
+	/*
+	 * We don't want to be preempted for the entire
+	 * duration of kprobe processing.
+	 */
+	preempt_disable();
+	kcb = get_kprobe_ctlblk();
+
+	/* Check we're not actually recursing. */
+	if (kprobe_running()) {
+		p = get_kprobe(addr);
+		if (p) {
+			if (kcb->kprobe_status == KPROBE_HIT_SS &&
+			    p->ainsn.insn[0] == breakpoint_insn) {
+				goto no_kprobe;
+			}
+			/*
+			 * We have reentered the kprobe_handler(), since
+			 * another probe was hit while within the handler.
+			 * We here save the original kprobes variables and
+			 * just single step on the instruction of the new probe
+			 * without calling any user handlers.
+			 */
+			save_previous_kprobe(kcb);
+			set_current_kprobe(p, regs, kcb);
+			kprobes_inc_nmissed_count(p);
+			prepare_singlestep(p, regs);
+			kcb->kprobe_status = KPROBE_REENTER;
+			return 1;
+		} else {
+			if (*addr != breakpoint_insn) {
+				/*
+				 * The breakpoint instruction was removed by
+				 * another cpu right after we hit, no further
+				 * handling of this interrupt is appropriate.
+				 */
+				ret = 1;
+				goto no_kprobe;
+			}
+			p = __this_cpu_read(current_kprobe);
+			if (p->break_handler && p->break_handler(p, regs))
+				goto ss_probe;
+		}
+		goto no_kprobe;
+	}
+
+	p = get_kprobe(addr);
+	if (!p) {
+		if (*addr != breakpoint_insn) {
+			/*
+			 * The breakpoint instruction was removed right
+			 * after we hit it.  Another cpu has removed
+			 * either a probepoint or a debugger breakpoint
+			 * at this address.  In either case, no further
+			 * handling of this interrupt is appropriate.
+			 */
+			ret = 1;
+		}
+		/* Not one of ours: let kernel handle it. */
+		goto no_kprobe;
+	}
+
+	set_current_kprobe(p, regs, kcb);
+	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+	if (p->pre_handler && p->pre_handler(p, regs)) {
+		/* Handler has already set things up, so skip ss setup. */
+		return 1;
+	}
+
+ss_probe:
+	prepare_singlestep(p, regs);
+	kcb->kprobe_status = KPROBE_HIT_SS;
+	return 1;
+
+no_kprobe:
+	preempt_enable_no_resched();
+	return ret;
+}
+
+/*
+ * Called after single-stepping.  p->addr is the address of the
+ * instruction that has been replaced by the breakpoint. To avoid the
+ * SMP problems that can occur when we temporarily put back the
+ * original opcode to single-step, we single-stepped a copy of the
+ * instruction.  The address of this copy is p->ainsn.insn.
+ *
+ * This function prepares to return from the post-single-step
+ * breakpoint trap.
+ */
+static void __kprobes resume_execution(struct kprobe *p,
+				       struct pt_regs *regs,
+				       struct kprobe_ctlblk *kcb)
+{
+	unsigned long orig_pc = kcb->kprobe_saved_pc;
+	regs->pc = orig_pc + 8;
+}
+
+static inline int post_kprobe_handler(struct pt_regs *regs)
+{
+	struct kprobe *cur = kprobe_running();
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (!cur)
+		return 0;
+
+	if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+		kcb->kprobe_status = KPROBE_HIT_SSDONE;
+		cur->post_handler(cur, regs, 0);
+	}
+
+	resume_execution(cur, regs, kcb);
+
+	/* Restore back the original saved kprobes variables and continue. */
+	if (kcb->kprobe_status == KPROBE_REENTER) {
+		restore_previous_kprobe(kcb);
+		goto out;
+	}
+	reset_current_kprobe();
+out:
+	preempt_enable_no_resched();
+
+	return 1;
+}
+
+static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+	struct kprobe *cur = kprobe_running();
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
+		return 1;
+
+	if (kcb->kprobe_status & KPROBE_HIT_SS) {
+		/*
+		 * We are here because the instruction being single
+		 * stepped caused a page fault. We reset the current
+		 * kprobe and the ip points back to the probe address
+		 * and allow the page fault handler to continue as a
+		 * normal page fault.
+		 */
+		resume_execution(cur, regs, kcb);
+		reset_current_kprobe();
+		preempt_enable_no_resched();
+	}
+	return 0;
+}
+
+/*
+ * Wrapper routine for handling exceptions.
+ */
+int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+				       unsigned long val, void *data)
+{
+	struct die_args *args = (struct die_args *)data;
+	int ret = NOTIFY_DONE;
+
+	switch (val) {
+	case DIE_BREAK:
+		if (kprobe_handler(args->regs))
+			ret = NOTIFY_STOP;
+		break;
+	case DIE_SSTEPBP:
+		if (post_kprobe_handler(args->regs))
+			ret = NOTIFY_STOP;
+		break;
+	case DIE_PAGE_FAULT:
+		/* kprobe_running() needs smp_processor_id(). */
+		preempt_disable();
+
+		if (kprobe_running()
+		    && kprobe_fault_handler(args->regs, args->trapnr))
+			ret = NOTIFY_STOP;
+		preempt_enable();
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct jprobe *jp = container_of(p, struct jprobe, kp);
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	kcb->jprobe_saved_regs = *regs;
+	kcb->jprobe_saved_sp = regs->sp;
+
+	memcpy(kcb->jprobes_stack, (void *)kcb->jprobe_saved_sp,
+	       MIN_JPROBES_STACK_SIZE(kcb->jprobe_saved_sp));
+
+	regs->pc = (unsigned long)(jp->entry);
+
+	return 1;
+}
+
+/* Defined in the inline asm below. */
+void jprobe_return_end(void);
+
+void __kprobes jprobe_return(void)
+{
+	asm volatile(
+		"bpt\n\t"
+		".globl jprobe_return_end\n"
+		"jprobe_return_end:\n");
+}
+
+int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	if (regs->pc >= (unsigned long)jprobe_return &&
+	    regs->pc <= (unsigned long)jprobe_return_end) {
+		*regs = kcb->jprobe_saved_regs;
+		memcpy((void *)kcb->jprobe_saved_sp, kcb->jprobes_stack,
+		       MIN_JPROBES_STACK_SIZE(kcb->jprobe_saved_sp));
+		preempt_enable_no_resched();
+
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Function return probe trampoline:
+ * - init_kprobes() establishes a probepoint here
+ * - When the probed function returns, this probe causes the
+ *   handlers to fire
+ */
+static void __used kretprobe_trampoline_holder(void)
+{
+	asm volatile(
+		"nop\n\t"
+		".global kretprobe_trampoline\n"
+		"kretprobe_trampoline:\n\t"
+		"nop\n\t"
+		: : : "memory");
+}
+
+void kretprobe_trampoline(void);
+
+void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
+				      struct pt_regs *regs)
+{
+	ri->ret_addr = (kprobe_opcode_t *) regs->lr;
+
+	/* Replace the return addr with trampoline addr */
+	regs->lr = (unsigned long)kretprobe_trampoline;
+}
+
+/*
+ * Called when the probe at kretprobe trampoline is hit.
+ */
+static int __kprobes trampoline_probe_handler(struct kprobe *p,
+						struct pt_regs *regs)
+{
+	struct kretprobe_instance *ri = NULL;
+	struct hlist_head *head, empty_rp;
+	struct hlist_node *tmp;
+	unsigned long flags, orig_ret_address = 0;
+	unsigned long trampoline_address = (unsigned long)kretprobe_trampoline;
+
+	INIT_HLIST_HEAD(&empty_rp);
+	kretprobe_hash_lock(current, &head, &flags);
+
+	/*
+	 * It is possible to have multiple instances associated with a given
+	 * task either because multiple functions in the call path have
+	 * a return probe installed on them, and/or more than one return
+	 * return probe was registered for a target function.
+	 *
+	 * We can handle this because:
+	 *     - instances are always inserted at the head of the list
+	 *     - when multiple return probes are registered for the same
+	 *       function, the first instance's ret_addr will point to the
+	 *       real return address, and all the rest will point to
+	 *       kretprobe_trampoline
+	 */
+	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+		if (ri->task != current)
+			/* another task is sharing our hash bucket */
+			continue;
+
+		if (ri->rp && ri->rp->handler)
+			ri->rp->handler(ri, regs);
+
+		orig_ret_address = (unsigned long)ri->ret_addr;
+		recycle_rp_inst(ri, &empty_rp);
+
+		if (orig_ret_address != trampoline_address) {
+			/*
+			 * This is the real return address. Any other
+			 * instances associated with this task are for
+			 * other calls deeper on the call stack
+			 */
+			break;
+		}
+	}
+
+	kretprobe_assert(ri, orig_ret_address, trampoline_address);
+	instruction_pointer(regs) = orig_ret_address;
+
+	reset_current_kprobe();
+	kretprobe_hash_unlock(current, &flags);
+	preempt_enable_no_resched();
+
+	hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
+		hlist_del(&ri->hlist);
+		kfree(ri);
+	}
+	/*
+	 * By returning a non-zero value, we are telling
+	 * kprobe_handler() that we don't want the post_handler
+	 * to run (and have re-enabled preemption)
+	 */
+	return 1;
+}
+
+int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+{
+	if (p->addr == (kprobe_opcode_t *)kretprobe_trampoline)
+		return 1;
+
+	return 0;
+}
+
+static struct kprobe trampoline_p = {
+	.addr = (kprobe_opcode_t *)kretprobe_trampoline,
+	.pre_handler = trampoline_probe_handler
+};
+
+int __init arch_init_kprobes(void)
+{
+	register_kprobe(&trampoline_p);
+	return 0;
+}
diff --git a/arch/tile/kernel/machine_kexec.c b/arch/tile/kernel/machine_kexec.c
index 6255f2eab11..f0b54a93471 100644
--- a/arch/tile/kernel/machine_kexec.c
+++ b/arch/tile/kernel/machine_kexec.c
@@ -31,6 +31,8 @@
 #include <asm/pgalloc.h>
 #include <asm/cacheflush.h>
 #include <asm/checksum.h>
+#include <asm/tlbflush.h>
+#include <asm/homecache.h>
 #include <hv/hypervisor.h>
 
 
@@ -222,11 +224,22 @@ struct page *kimage_alloc_pages_arch(gfp_t gfp_mask, unsigned int order)
 	return alloc_pages_node(0, gfp_mask, order);
 }
 
+/*
+ * Address range in which pa=va mapping is set in setup_quasi_va_is_pa().
+ * For tilepro, PAGE_OFFSET is used since this is the largest possbile value
+ * for tilepro, while for tilegx, we limit it to entire middle level page
+ * table which we assume has been allocated and is undoubtedly large enough.
+ */
+#ifndef __tilegx__
+#define	QUASI_VA_IS_PA_ADDR_RANGE PAGE_OFFSET
+#else
+#define	QUASI_VA_IS_PA_ADDR_RANGE PGDIR_SIZE
+#endif
+
 static void setup_quasi_va_is_pa(void)
 {
-	HV_PTE *pgtable;
 	HV_PTE pte;
-	int i;
+	unsigned long i;
 
 	/*
 	 * Flush our TLB to prevent conflicts between the previous contents
@@ -234,16 +247,22 @@ static void setup_quasi_va_is_pa(void)
 	 */
 	local_flush_tlb_all();
 
-	/* setup VA is PA, at least up to PAGE_OFFSET */
-
-	pgtable = (HV_PTE *)current->mm->pgd;
+	/*
+	 * setup VA is PA, at least up to QUASI_VA_IS_PA_ADDR_RANGE.
+	 * Note here we assume that level-1 page table is defined by
+	 * HPAGE_SIZE.
+	 */
 	pte = hv_pte(_PAGE_KERNEL | _PAGE_HUGE_PAGE);
 	pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);
-
-	for (i = 0; i < pgd_index(PAGE_OFFSET); i++) {
+	for (i = 0; i < (QUASI_VA_IS_PA_ADDR_RANGE >> HPAGE_SHIFT); i++) {
+		unsigned long vaddr = i << HPAGE_SHIFT;
+		pgd_t *pgd = pgd_offset(current->mm, vaddr);
+		pud_t *pud = pud_offset(pgd, vaddr);
+		pte_t *ptep = (pte_t *) pmd_offset(pud, vaddr);
 		unsigned long pfn = i << (HPAGE_SHIFT - PAGE_SHIFT);
+
 		if (pfn_valid(pfn))
-			__set_pte(&pgtable[i], pfn_pte(pfn, pte));
+			__set_pte(ptep, pfn_pte(pfn, pte));
 	}
 }
 
@@ -251,6 +270,7 @@ static void setup_quasi_va_is_pa(void)
 void machine_kexec(struct kimage *image)
 {
 	void *reboot_code_buffer;
+	pte_t *ptep;
 	void (*rnk)(unsigned long, void *, unsigned long)
 		__noreturn;
 
@@ -266,8 +286,10 @@ void machine_kexec(struct kimage *image)
 	 */
 	homecache_change_page_home(image->control_code_page, 0,
 				   smp_processor_id());
-	reboot_code_buffer = vmap(&image->control_code_page, 1, 0,
-				  __pgprot(_PAGE_KERNEL | _PAGE_EXECUTABLE));
+	reboot_code_buffer = page_address(image->control_code_page);
+	BUG_ON(reboot_code_buffer == NULL);
+	ptep = virt_to_pte(NULL, (unsigned long)reboot_code_buffer);
+	__set_pte(ptep, pte_mkexec(*ptep));
 	memcpy(reboot_code_buffer, relocate_new_kernel,
 	       relocate_new_kernel_size);
 	__flush_icache_range(
diff --git a/arch/tile/kernel/mcount_64.S b/arch/tile/kernel/mcount_64.S
new file mode 100644
index 00000000000..70d7bb0c4d8
--- /dev/null
+++ b/arch/tile/kernel/mcount_64.S
@@ -0,0 +1,224 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * TILE-Gx specific __mcount support
+ */
+
+#include <linux/linkage.h>
+#include <asm/ftrace.h>
+
+#define REGSIZE 8
+
+	.text
+	.global __mcount
+
+	.macro	MCOUNT_SAVE_REGS
+	addli	sp, sp, -REGSIZE
+	{
+	 st     sp, lr
+	 addli	r29, sp, - (12 * REGSIZE)
+	}
+	{
+	 addli	sp, sp, - (13 * REGSIZE)
+	 st     r29, sp
+	}
+	addli	r29, r29, REGSIZE
+	{ st	r29, r0; addli	r29, r29, REGSIZE }
+	{ st	r29, r1; addli	r29, r29, REGSIZE }
+	{ st	r29, r2; addli	r29, r29, REGSIZE }
+	{ st	r29, r3; addli	r29, r29, REGSIZE }
+	{ st	r29, r4; addli	r29, r29, REGSIZE }
+	{ st	r29, r5; addli	r29, r29, REGSIZE }
+	{ st	r29, r6; addli	r29, r29, REGSIZE }
+	{ st	r29, r7; addli	r29, r29, REGSIZE }
+	{ st	r29, r8; addli	r29, r29, REGSIZE }
+	{ st	r29, r9; addli	r29, r29, REGSIZE }
+	{ st	r29, r10; addli	r29, r29, REGSIZE }
+	.endm
+
+	.macro	MCOUNT_RESTORE_REGS
+	addli	r29, sp, (2 * REGSIZE)
+	{ ld	r0, r29; addli	r29, r29, REGSIZE }
+	{ ld	r1, r29; addli	r29, r29, REGSIZE }
+	{ ld	r2, r29; addli	r29, r29, REGSIZE }
+	{ ld	r3, r29; addli	r29, r29, REGSIZE }
+	{ ld	r4, r29; addli	r29, r29, REGSIZE }
+	{ ld	r5, r29; addli	r29, r29, REGSIZE }
+	{ ld	r6, r29; addli	r29, r29, REGSIZE }
+	{ ld	r7, r29; addli	r29, r29, REGSIZE }
+	{ ld	r8, r29; addli	r29, r29, REGSIZE }
+	{ ld	r9, r29; addli	r29, r29, REGSIZE }
+	{ ld	r10, r29; addli	lr, sp, (13 * REGSIZE) }
+	{ ld	lr, lr;  addli	sp, sp, (14 * REGSIZE) }
+	.endm
+
+	.macro  RETURN_BACK
+	{ move	r12, lr; move	lr, r10 }
+	jrp	r12
+	.endm
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+	.align	64
+STD_ENTRY(__mcount)
+__mcount:
+	j	ftrace_stub
+STD_ENDPROC(__mcount)
+
+	.align	64
+STD_ENTRY(ftrace_caller)
+	moveli	r11, hw2_last(function_trace_stop)
+	{ shl16insli	r11, r11, hw1(function_trace_stop); move r12, lr }
+	{ shl16insli	r11, r11, hw0(function_trace_stop); move lr, r10 }
+	ld	r11, r11
+	beqz	r11, 1f
+	jrp	r12
+
+1:
+	{ move	r10, lr; move	lr, r12 }
+	MCOUNT_SAVE_REGS
+
+	/* arg1: self return address */
+	/* arg2: parent's return address */
+	{ move	r0, lr; move	r1, r10 }
+
+	.global	ftrace_call
+ftrace_call:
+	/*
+	 * a placeholder for the call to a real tracing function, i.e.
+	 * ftrace_trace_function()
+	 */
+	nop
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	.global	ftrace_graph_call
+ftrace_graph_call:
+	/*
+	 * a placeholder for the call to a real tracing function, i.e.
+	 * ftrace_graph_caller()
+	 */
+	nop
+#endif
+	MCOUNT_RESTORE_REGS
+	.global	ftrace_stub
+ftrace_stub:
+	RETURN_BACK
+STD_ENDPROC(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+	.align	64
+STD_ENTRY(__mcount)
+	moveli	r11, hw2_last(function_trace_stop)
+	{ shl16insli	r11, r11, hw1(function_trace_stop); move r12, lr }
+	{ shl16insli	r11, r11, hw0(function_trace_stop); move lr, r10 }
+	ld	r11, r11
+	beqz	r11, 1f
+	jrp	r12
+
+1:
+	{ move	r10, lr; move	lr, r12 }
+	{
+	 moveli	r11, hw2_last(ftrace_trace_function)
+	 moveli	r13, hw2_last(ftrace_stub)
+	}
+	{
+	 shl16insli	r11, r11, hw1(ftrace_trace_function)
+	 shl16insli	r13, r13, hw1(ftrace_stub)
+	}
+	{
+	 shl16insli	r11, r11, hw0(ftrace_trace_function)
+	 shl16insli	r13, r13, hw0(ftrace_stub)
+	}
+
+	ld	r11, r11
+	sub	r14, r13, r11
+	bnez	r14, static_trace
+
+#ifdef	CONFIG_FUNCTION_GRAPH_TRACER
+	moveli	r15, hw2_last(ftrace_graph_return)
+	shl16insli	r15, r15, hw1(ftrace_graph_return)
+	shl16insli	r15, r15, hw0(ftrace_graph_return)
+	ld	r15, r15
+	sub	r15, r15, r13
+	bnez	r15, ftrace_graph_caller
+
+	{
+	 moveli	r16, hw2_last(ftrace_graph_entry)
+	 moveli	r17, hw2_last(ftrace_graph_entry_stub)
+	}
+	{
+	 shl16insli	r16, r16, hw1(ftrace_graph_entry)
+	 shl16insli	r17, r17, hw1(ftrace_graph_entry_stub)
+	}
+	{
+	 shl16insli	r16, r16, hw0(ftrace_graph_entry)
+	 shl16insli	r17, r17, hw0(ftrace_graph_entry_stub)
+	}
+	ld	r16, r16
+	sub	r17, r16, r17
+	bnez	r17, ftrace_graph_caller
+
+#endif
+	RETURN_BACK
+
+static_trace:
+	MCOUNT_SAVE_REGS
+
+	/* arg1: self return address */
+	/* arg2: parent's return address */
+	{ move	r0, lr; move	r1, r10 }
+
+	/* call ftrace_trace_function() */
+	jalr	r11
+
+	MCOUNT_RESTORE_REGS
+
+	.global ftrace_stub
+ftrace_stub:
+	RETURN_BACK
+STD_ENDPROC(__mcount)
+
+#endif	/* ! CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+STD_ENTRY(ftrace_graph_caller)
+ftrace_graph_caller:
+#ifndef CONFIG_DYNAMIC_FTRACE
+	MCOUNT_SAVE_REGS
+#endif
+
+	/* arg1: Get the location of the parent's return address */
+	addi	r0, sp, 12 * REGSIZE
+	/* arg2: Get self return address */
+	move	r1, lr
+
+	jal prepare_ftrace_return
+
+	MCOUNT_RESTORE_REGS
+	RETURN_BACK
+STD_ENDPROC(ftrace_graph_caller)
+
+	.global return_to_handler
+return_to_handler:
+	MCOUNT_SAVE_REGS
+
+	jal	ftrace_return_to_handler
+	/* restore the real parent address */
+	move	r11, r0
+
+	MCOUNT_RESTORE_REGS
+	jr	r11
+
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/tile/kernel/messaging.c b/arch/tile/kernel/messaging.c
index 0858ee6b520..7867266f971 100644
--- a/arch/tile/kernel/messaging.c
+++ b/arch/tile/kernel/messaging.c
@@ -25,7 +25,7 @@
 /* All messages are stored here */
 static DEFINE_PER_CPU(HV_MsgState, msg_state);
 
-void __cpuinit init_messaging(void)
+void init_messaging(void)
 {
 	/* Allocate storage for messages in kernel space */
 	HV_MsgState *state = &__get_cpu_var(msg_state);
@@ -68,8 +68,8 @@ void hv_message_intr(struct pt_regs *regs, int intnum)
 #endif
 
 	while (1) {
-		rmi = hv_receive_message(__get_cpu_var(msg_state),
-					 (HV_VirtAddr) message,
+		HV_MsgState *state = this_cpu_ptr(&msg_state);
+		rmi = hv_receive_message(*state, (HV_VirtAddr) message,
 					 sizeof(message));
 		if (rmi.msglen == 0)
 			break;
diff --git a/arch/tile/kernel/module.c b/arch/tile/kernel/module.c
index b90ab992567..4918d91bc3a 100644
--- a/arch/tile/kernel/module.c
+++ b/arch/tile/kernel/module.c
@@ -24,16 +24,6 @@
 #include <asm/homecache.h>
 #include <arch/opcode.h>
 
-#ifdef __tilegx__
-# define Elf_Rela Elf64_Rela
-# define ELF_R_SYM ELF64_R_SYM
-# define ELF_R_TYPE ELF64_R_TYPE
-#else
-# define Elf_Rela Elf32_Rela
-# define ELF_R_SYM ELF32_R_SYM
-# define ELF_R_TYPE ELF32_R_TYPE
-#endif
-
 #ifdef MODULE_DEBUG
 #define DEBUGP printk
 #else
@@ -52,8 +42,6 @@ void *module_alloc(unsigned long size)
 	int i = 0;
 	int npages;
 
-	if (size == 0)
-		return NULL;
 	npages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
 	pages = kmalloc(npages * sizeof(struct page *), GFP_KERNEL);
 	if (pages == NULL)
@@ -67,6 +55,8 @@ void *module_alloc(unsigned long size)
 	area = __get_vm_area(size, VM_ALLOC, MEM_MODULE_START, MEM_MODULE_END);
 	if (!area)
 		goto error;
+	area->nr_pages = npages;
+	area->pages = pages;
 
 	if (map_vm_area(area, prot_rwx, &pages)) {
 		vunmap(area->addr);
@@ -157,7 +147,17 @@ int apply_relocate_add(Elf_Shdr *sechdrs,
 
 		switch (ELF_R_TYPE(rel[i].r_info)) {
 
-#define MUNGE(func) (*location = ((*location & ~func(-1)) | func(value)))
+#ifdef __LITTLE_ENDIAN
+# define MUNGE(func) \
+	(*location = ((*location & ~func(-1)) | func(value)))
+#else
+/*
+ * Instructions are always little-endian, so when we read them as data,
+ * we have to swap them around before and after modifying them.
+ */
+# define MUNGE(func) \
+	(*location = swab64((swab64(*location) & ~func(-1)) | func(value)))
+#endif
 
 #ifndef __tilegx__
 		case R_TILE_32:
diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index b3ed19f8779..09b58703ac2 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -14,6 +14,7 @@
 
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
+#include <linux/swiotlb.h>
 #include <linux/vmalloc.h>
 #include <linux/export.h>
 #include <asm/tlbflush.h>
@@ -22,16 +23,22 @@
 /* Generic DMA mapping functions: */
 
 /*
- * Allocate what Linux calls "coherent" memory, which for us just
- * means uncached.
+ * Allocate what Linux calls "coherent" memory.  On TILEPro this is
+ * uncached memory; on TILE-Gx it is hash-for-home memory.
  */
-void *dma_alloc_coherent(struct device *dev,
-			 size_t size,
-			 dma_addr_t *dma_handle,
-			 gfp_t gfp)
+#ifdef __tilepro__
+#define PAGE_HOME_DMA PAGE_HOME_UNCACHED
+#else
+#define PAGE_HOME_DMA PAGE_HOME_HASH
+#endif
+
+static void *tile_dma_alloc_coherent(struct device *dev, size_t size,
+				     dma_addr_t *dma_handle, gfp_t gfp,
+				     struct dma_attrs *attrs)
 {
-	u64 dma_mask = dev->coherent_dma_mask ?: DMA_BIT_MASK(32);
-	int node = dev_to_node(dev);
+	u64 dma_mask = (dev && dev->coherent_dma_mask) ?
+		dev->coherent_dma_mask : DMA_BIT_MASK(32);
+	int node = dev ? dev_to_node(dev) : 0;
 	int order = get_order(size);
 	struct page *pg;
 	dma_addr_t addr;
@@ -39,39 +46,42 @@ void *dma_alloc_coherent(struct device *dev,
 	gfp |= __GFP_ZERO;
 
 	/*
-	 * By forcing NUMA node 0 for 32-bit masks we ensure that the
-	 * high 32 bits of the resulting PA will be zero.  If the mask
-	 * size is, e.g., 24, we may still not be able to guarantee a
-	 * suitable memory address, in which case we will return NULL.
-	 * But such devices are uncommon.
+	 * If the mask specifies that the memory be in the first 4 GB, then
+	 * we force the allocation to come from the DMA zone.  We also
+	 * force the node to 0 since that's the only node where the DMA
+	 * zone isn't empty.  If the mask size is smaller than 32 bits, we
+	 * may still not be able to guarantee a suitable memory address, in
+	 * which case we will return NULL.  But such devices are uncommon.
 	 */
-	if (dma_mask <= DMA_BIT_MASK(32))
+	if (dma_mask <= DMA_BIT_MASK(32)) {
+		gfp |= GFP_DMA;
 		node = 0;
+	}
 
-	pg = homecache_alloc_pages_node(node, gfp, order, PAGE_HOME_UNCACHED);
+	pg = homecache_alloc_pages_node(node, gfp, order, PAGE_HOME_DMA);
 	if (pg == NULL)
 		return NULL;
 
 	addr = page_to_phys(pg);
 	if (addr + size > dma_mask) {
-		homecache_free_pages(addr, order);
+		__homecache_free_pages(pg, order);
 		return NULL;
 	}
 
 	*dma_handle = addr;
+
 	return page_address(pg);
 }
-EXPORT_SYMBOL(dma_alloc_coherent);
 
 /*
- * Free memory that was allocated with dma_alloc_coherent.
+ * Free memory that was allocated with tile_dma_alloc_coherent.
  */
-void dma_free_coherent(struct device *dev, size_t size,
-		  void *vaddr, dma_addr_t dma_handle)
+static void tile_dma_free_coherent(struct device *dev, size_t size,
+				   void *vaddr, dma_addr_t dma_handle,
+				   struct dma_attrs *attrs)
 {
 	homecache_free_pages((unsigned long)vaddr, get_order(size));
 }
-EXPORT_SYMBOL(dma_free_coherent);
 
 /*
  * The map routines "map" the specified address range for DMA
@@ -87,52 +97,285 @@ EXPORT_SYMBOL(dma_free_coherent);
  * can count on nothing having been touched.
  */
 
-/* Flush a PA range from cache page by page. */
-static void __dma_map_pa_range(dma_addr_t dma_addr, size_t size)
+/* Set up a single page for DMA access. */
+static void __dma_prep_page(struct page *page, unsigned long offset,
+			    size_t size, enum dma_data_direction direction)
+{
+	/*
+	 * Flush the page from cache if necessary.
+	 * On tilegx, data is delivered to hash-for-home L3; on tilepro,
+	 * data is delivered direct to memory.
+	 *
+	 * NOTE: If we were just doing DMA_TO_DEVICE we could optimize
+	 * this to be a "flush" not a "finv" and keep some of the
+	 * state in cache across the DMA operation, but it doesn't seem
+	 * worth creating the necessary flush_buffer_xxx() infrastructure.
+	 */
+	int home = page_home(page);
+	switch (home) {
+	case PAGE_HOME_HASH:
+#ifdef __tilegx__
+		return;
+#endif
+		break;
+	case PAGE_HOME_UNCACHED:
+#ifdef __tilepro__
+		return;
+#endif
+		break;
+	case PAGE_HOME_IMMUTABLE:
+		/* Should be going to the device only. */
+		BUG_ON(direction == DMA_FROM_DEVICE ||
+		       direction == DMA_BIDIRECTIONAL);
+		return;
+	case PAGE_HOME_INCOHERENT:
+		/* Incoherent anyway, so no need to work hard here. */
+		return;
+	default:
+		BUG_ON(home < 0 || home >= NR_CPUS);
+		break;
+	}
+	homecache_finv_page(page);
+
+#ifdef DEBUG_ALIGNMENT
+	/* Warn if the region isn't cacheline aligned. */
+	if (offset & (L2_CACHE_BYTES - 1) || (size & (L2_CACHE_BYTES - 1)))
+		pr_warn("Unaligned DMA to non-hfh memory: PA %#llx/%#lx\n",
+			PFN_PHYS(page_to_pfn(page)) + offset, size);
+#endif
+}
+
+/* Make the page ready to be read by the core. */
+static void __dma_complete_page(struct page *page, unsigned long offset,
+				size_t size, enum dma_data_direction direction)
+{
+#ifdef __tilegx__
+	switch (page_home(page)) {
+	case PAGE_HOME_HASH:
+		/* I/O device delivered data the way the cpu wanted it. */
+		break;
+	case PAGE_HOME_INCOHERENT:
+		/* Incoherent anyway, so no need to work hard here. */
+		break;
+	case PAGE_HOME_IMMUTABLE:
+		/* Extra read-only copies are not a problem. */
+		break;
+	default:
+		/* Flush the bogus hash-for-home I/O entries to memory. */
+		homecache_finv_map_page(page, PAGE_HOME_HASH);
+		break;
+	}
+#endif
+}
+
+static void __dma_prep_pa_range(dma_addr_t dma_addr, size_t size,
+				enum dma_data_direction direction)
 {
 	struct page *page = pfn_to_page(PFN_DOWN(dma_addr));
-	size_t bytesleft = PAGE_SIZE - (dma_addr & (PAGE_SIZE - 1));
+	unsigned long offset = dma_addr & (PAGE_SIZE - 1);
+	size_t bytes = min(size, (size_t)(PAGE_SIZE - offset));
+
+	while (size != 0) {
+		__dma_prep_page(page, offset, bytes, direction);
+		size -= bytes;
+		++page;
+		offset = 0;
+		bytes = min((size_t)PAGE_SIZE, size);
+	}
+}
 
-	while ((ssize_t)size > 0) {
-		/* Flush the page. */
-		homecache_flush_cache(page++, 0);
+static void __dma_complete_pa_range(dma_addr_t dma_addr, size_t size,
+				    enum dma_data_direction direction)
+{
+	struct page *page = pfn_to_page(PFN_DOWN(dma_addr));
+	unsigned long offset = dma_addr & (PAGE_SIZE - 1);
+	size_t bytes = min(size, (size_t)(PAGE_SIZE - offset));
+
+	while (size != 0) {
+		__dma_complete_page(page, offset, bytes, direction);
+		size -= bytes;
+		++page;
+		offset = 0;
+		bytes = min((size_t)PAGE_SIZE, size);
+	}
+}
+
+static int tile_dma_map_sg(struct device *dev, struct scatterlist *sglist,
+			   int nents, enum dma_data_direction direction,
+			   struct dma_attrs *attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+	BUG_ON(!valid_dma_direction(direction));
+
+	WARN_ON(nents == 0 || sglist->length == 0);
 
-		/* Figure out if we need to continue on the next page. */
-		size -= bytesleft;
-		bytesleft = PAGE_SIZE;
+	for_each_sg(sglist, sg, nents, i) {
+		sg->dma_address = sg_phys(sg);
+		__dma_prep_pa_range(sg->dma_address, sg->length, direction);
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+		sg->dma_length = sg->length;
+#endif
 	}
+
+	return nents;
 }
 
-/*
- * dma_map_single can be passed any memory address, and there appear
- * to be no alignment constraints.
- *
- * There is a chance that the start of the buffer will share a cache
- * line with some other data that has been touched in the meantime.
- */
-dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
-	       enum dma_data_direction direction)
+static void tile_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
+			      int nents, enum dma_data_direction direction,
+			      struct dma_attrs *attrs)
 {
-	dma_addr_t dma_addr = __pa(ptr);
+	struct scatterlist *sg;
+	int i;
 
 	BUG_ON(!valid_dma_direction(direction));
-	WARN_ON(size == 0);
+	for_each_sg(sglist, sg, nents, i) {
+		sg->dma_address = sg_phys(sg);
+		__dma_complete_pa_range(sg->dma_address, sg->length,
+					direction);
+	}
+}
+
+static dma_addr_t tile_dma_map_page(struct device *dev, struct page *page,
+				    unsigned long offset, size_t size,
+				    enum dma_data_direction direction,
+				    struct dma_attrs *attrs)
+{
+	BUG_ON(!valid_dma_direction(direction));
+
+	BUG_ON(offset + size > PAGE_SIZE);
+	__dma_prep_page(page, offset, size, direction);
+
+	return page_to_pa(page) + offset;
+}
+
+static void tile_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
+				size_t size, enum dma_data_direction direction,
+				struct dma_attrs *attrs)
+{
+	BUG_ON(!valid_dma_direction(direction));
+
+	__dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
+			    dma_address & (PAGE_SIZE - 1), size, direction);
+}
+
+static void tile_dma_sync_single_for_cpu(struct device *dev,
+					 dma_addr_t dma_handle,
+					 size_t size,
+					 enum dma_data_direction direction)
+{
+	BUG_ON(!valid_dma_direction(direction));
 
-	__dma_map_pa_range(dma_addr, size);
+	__dma_complete_pa_range(dma_handle, size, direction);
+}
 
-	return dma_addr;
+static void tile_dma_sync_single_for_device(struct device *dev,
+					    dma_addr_t dma_handle, size_t size,
+					    enum dma_data_direction direction)
+{
+	__dma_prep_pa_range(dma_handle, size, direction);
 }
-EXPORT_SYMBOL(dma_map_single);
 
-void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
-		 enum dma_data_direction direction)
+static void tile_dma_sync_sg_for_cpu(struct device *dev,
+				     struct scatterlist *sglist, int nelems,
+				     enum dma_data_direction direction)
 {
+	struct scatterlist *sg;
+	int i;
+
 	BUG_ON(!valid_dma_direction(direction));
+	WARN_ON(nelems == 0 || sglist->length == 0);
+
+	for_each_sg(sglist, sg, nelems, i) {
+		dma_sync_single_for_cpu(dev, sg->dma_address,
+					sg_dma_len(sg), direction);
+	}
 }
-EXPORT_SYMBOL(dma_unmap_single);
 
-int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
-	   enum dma_data_direction direction)
+static void tile_dma_sync_sg_for_device(struct device *dev,
+					struct scatterlist *sglist, int nelems,
+					enum dma_data_direction direction)
+{
+	struct scatterlist *sg;
+	int i;
+
+	BUG_ON(!valid_dma_direction(direction));
+	WARN_ON(nelems == 0 || sglist->length == 0);
+
+	for_each_sg(sglist, sg, nelems, i) {
+		dma_sync_single_for_device(dev, sg->dma_address,
+					   sg_dma_len(sg), direction);
+	}
+}
+
+static inline int
+tile_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return 0;
+}
+
+static inline int
+tile_dma_supported(struct device *dev, u64 mask)
+{
+	return 1;
+}
+
+static struct dma_map_ops tile_default_dma_map_ops = {
+	.alloc = tile_dma_alloc_coherent,
+	.free = tile_dma_free_coherent,
+	.map_page = tile_dma_map_page,
+	.unmap_page = tile_dma_unmap_page,
+	.map_sg = tile_dma_map_sg,
+	.unmap_sg = tile_dma_unmap_sg,
+	.sync_single_for_cpu = tile_dma_sync_single_for_cpu,
+	.sync_single_for_device = tile_dma_sync_single_for_device,
+	.sync_sg_for_cpu = tile_dma_sync_sg_for_cpu,
+	.sync_sg_for_device = tile_dma_sync_sg_for_device,
+	.mapping_error = tile_dma_mapping_error,
+	.dma_supported = tile_dma_supported
+};
+
+struct dma_map_ops *tile_dma_map_ops = &tile_default_dma_map_ops;
+EXPORT_SYMBOL(tile_dma_map_ops);
+
+/* Generic PCI DMA mapping functions */
+
+static void *tile_pci_dma_alloc_coherent(struct device *dev, size_t size,
+					 dma_addr_t *dma_handle, gfp_t gfp,
+					 struct dma_attrs *attrs)
+{
+	int node = dev_to_node(dev);
+	int order = get_order(size);
+	struct page *pg;
+	dma_addr_t addr;
+
+	gfp |= __GFP_ZERO;
+
+	pg = homecache_alloc_pages_node(node, gfp, order, PAGE_HOME_DMA);
+	if (pg == NULL)
+		return NULL;
+
+	addr = page_to_phys(pg);
+
+	*dma_handle = addr + get_dma_offset(dev);
+
+	return page_address(pg);
+}
+
+/*
+ * Free memory that was allocated with tile_pci_dma_alloc_coherent.
+ */
+static void tile_pci_dma_free_coherent(struct device *dev, size_t size,
+				       void *vaddr, dma_addr_t dma_handle,
+				       struct dma_attrs *attrs)
+{
+	homecache_free_pages((unsigned long)vaddr, get_order(size));
+}
+
+static int tile_pci_dma_map_sg(struct device *dev, struct scatterlist *sglist,
+			       int nents, enum dma_data_direction direction,
+			       struct dma_attrs *attrs)
 {
 	struct scatterlist *sg;
 	int i;
@@ -143,73 +386,103 @@ int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
 
 	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
-		__dma_map_pa_range(sg->dma_address, sg->length);
+		__dma_prep_pa_range(sg->dma_address, sg->length, direction);
+
+		sg->dma_address = sg->dma_address + get_dma_offset(dev);
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+		sg->dma_length = sg->length;
+#endif
 	}
 
 	return nents;
 }
-EXPORT_SYMBOL(dma_map_sg);
 
-void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
-	     enum dma_data_direction direction)
+static void tile_pci_dma_unmap_sg(struct device *dev,
+				  struct scatterlist *sglist, int nents,
+				  enum dma_data_direction direction,
+				  struct dma_attrs *attrs)
 {
+	struct scatterlist *sg;
+	int i;
+
 	BUG_ON(!valid_dma_direction(direction));
+	for_each_sg(sglist, sg, nents, i) {
+		sg->dma_address = sg_phys(sg);
+		__dma_complete_pa_range(sg->dma_address, sg->length,
+					direction);
+	}
 }
-EXPORT_SYMBOL(dma_unmap_sg);
 
-dma_addr_t dma_map_page(struct device *dev, struct page *page,
-			unsigned long offset, size_t size,
-			enum dma_data_direction direction)
+static dma_addr_t tile_pci_dma_map_page(struct device *dev, struct page *page,
+					unsigned long offset, size_t size,
+					enum dma_data_direction direction,
+					struct dma_attrs *attrs)
 {
 	BUG_ON(!valid_dma_direction(direction));
 
 	BUG_ON(offset + size > PAGE_SIZE);
-	homecache_flush_cache(page, 0);
+	__dma_prep_page(page, offset, size, direction);
 
-	return page_to_pa(page) + offset;
+	return page_to_pa(page) + offset + get_dma_offset(dev);
 }
-EXPORT_SYMBOL(dma_map_page);
 
-void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
-	       enum dma_data_direction direction)
+static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
+				    size_t size,
+				    enum dma_data_direction direction,
+				    struct dma_attrs *attrs)
 {
 	BUG_ON(!valid_dma_direction(direction));
+
+	dma_address -= get_dma_offset(dev);
+
+	__dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
+			    dma_address & (PAGE_SIZE - 1), size, direction);
 }
-EXPORT_SYMBOL(dma_unmap_page);
 
-void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
-			     size_t size, enum dma_data_direction direction)
+static void tile_pci_dma_sync_single_for_cpu(struct device *dev,
+					     dma_addr_t dma_handle,
+					     size_t size,
+					     enum dma_data_direction direction)
 {
 	BUG_ON(!valid_dma_direction(direction));
+
+	dma_handle -= get_dma_offset(dev);
+
+	__dma_complete_pa_range(dma_handle, size, direction);
 }
-EXPORT_SYMBOL(dma_sync_single_for_cpu);
 
-void dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
-				size_t size, enum dma_data_direction direction)
+static void tile_pci_dma_sync_single_for_device(struct device *dev,
+						dma_addr_t dma_handle,
+						size_t size,
+						enum dma_data_direction
+						direction)
 {
-	unsigned long start = PFN_DOWN(dma_handle);
-	unsigned long end = PFN_DOWN(dma_handle + size - 1);
-	unsigned long i;
+	dma_handle -= get_dma_offset(dev);
 
-	BUG_ON(!valid_dma_direction(direction));
-	for (i = start; i <= end; ++i)
-		homecache_flush_cache(pfn_to_page(i), 0);
+	__dma_prep_pa_range(dma_handle, size, direction);
 }
-EXPORT_SYMBOL(dma_sync_single_for_device);
 
-void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
-		    enum dma_data_direction direction)
+static void tile_pci_dma_sync_sg_for_cpu(struct device *dev,
+					 struct scatterlist *sglist,
+					 int nelems,
+					 enum dma_data_direction direction)
 {
+	struct scatterlist *sg;
+	int i;
+
 	BUG_ON(!valid_dma_direction(direction));
-	WARN_ON(nelems == 0 || sg[0].length == 0);
+	WARN_ON(nelems == 0 || sglist->length == 0);
+
+	for_each_sg(sglist, sg, nelems, i) {
+		dma_sync_single_for_cpu(dev, sg->dma_address,
+					sg_dma_len(sg), direction);
+	}
 }
-EXPORT_SYMBOL(dma_sync_sg_for_cpu);
 
-/*
- * Flush and invalidate cache for scatterlist.
- */
-void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
-			    int nelems, enum dma_data_direction direction)
+static void tile_pci_dma_sync_sg_for_device(struct device *dev,
+					    struct scatterlist *sglist,
+					    int nelems,
+					    enum dma_data_direction direction)
 {
 	struct scatterlist *sg;
 	int i;
@@ -222,31 +495,136 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
 					   sg_dma_len(sg), direction);
 	}
 }
-EXPORT_SYMBOL(dma_sync_sg_for_device);
 
-void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
-				   unsigned long offset, size_t size,
-				   enum dma_data_direction direction)
+static inline int
+tile_pci_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return 0;
+}
+
+static inline int
+tile_pci_dma_supported(struct device *dev, u64 mask)
+{
+	return 1;
+}
+
+static struct dma_map_ops tile_pci_default_dma_map_ops = {
+	.alloc = tile_pci_dma_alloc_coherent,
+	.free = tile_pci_dma_free_coherent,
+	.map_page = tile_pci_dma_map_page,
+	.unmap_page = tile_pci_dma_unmap_page,
+	.map_sg = tile_pci_dma_map_sg,
+	.unmap_sg = tile_pci_dma_unmap_sg,
+	.sync_single_for_cpu = tile_pci_dma_sync_single_for_cpu,
+	.sync_single_for_device = tile_pci_dma_sync_single_for_device,
+	.sync_sg_for_cpu = tile_pci_dma_sync_sg_for_cpu,
+	.sync_sg_for_device = tile_pci_dma_sync_sg_for_device,
+	.mapping_error = tile_pci_dma_mapping_error,
+	.dma_supported = tile_pci_dma_supported
+};
+
+struct dma_map_ops *gx_pci_dma_map_ops = &tile_pci_default_dma_map_ops;
+EXPORT_SYMBOL(gx_pci_dma_map_ops);
+
+/* PCI DMA mapping functions for legacy PCI devices */
+
+#ifdef CONFIG_SWIOTLB
+static void *tile_swiotlb_alloc_coherent(struct device *dev, size_t size,
+					 dma_addr_t *dma_handle, gfp_t gfp,
+					 struct dma_attrs *attrs)
 {
-	dma_sync_single_for_cpu(dev, dma_handle + offset, size, direction);
+	gfp |= GFP_DMA;
+	return swiotlb_alloc_coherent(dev, size, dma_handle, gfp);
 }
-EXPORT_SYMBOL(dma_sync_single_range_for_cpu);
 
-void dma_sync_single_range_for_device(struct device *dev,
-				      dma_addr_t dma_handle,
-				      unsigned long offset, size_t size,
-				      enum dma_data_direction direction)
+static void tile_swiotlb_free_coherent(struct device *dev, size_t size,
+				       void *vaddr, dma_addr_t dma_addr,
+				       struct dma_attrs *attrs)
 {
-	dma_sync_single_for_device(dev, dma_handle + offset, size, direction);
+	swiotlb_free_coherent(dev, size, vaddr, dma_addr);
+}
+
+static struct dma_map_ops pci_swiotlb_dma_ops = {
+	.alloc = tile_swiotlb_alloc_coherent,
+	.free = tile_swiotlb_free_coherent,
+	.map_page = swiotlb_map_page,
+	.unmap_page = swiotlb_unmap_page,
+	.map_sg = swiotlb_map_sg_attrs,
+	.unmap_sg = swiotlb_unmap_sg_attrs,
+	.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+	.sync_single_for_device = swiotlb_sync_single_for_device,
+	.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+	.sync_sg_for_device = swiotlb_sync_sg_for_device,
+	.dma_supported = swiotlb_dma_supported,
+	.mapping_error = swiotlb_dma_mapping_error,
+};
+
+static struct dma_map_ops pci_hybrid_dma_ops = {
+	.alloc = tile_swiotlb_alloc_coherent,
+	.free = tile_swiotlb_free_coherent,
+	.map_page = tile_pci_dma_map_page,
+	.unmap_page = tile_pci_dma_unmap_page,
+	.map_sg = tile_pci_dma_map_sg,
+	.unmap_sg = tile_pci_dma_unmap_sg,
+	.sync_single_for_cpu = tile_pci_dma_sync_single_for_cpu,
+	.sync_single_for_device = tile_pci_dma_sync_single_for_device,
+	.sync_sg_for_cpu = tile_pci_dma_sync_sg_for_cpu,
+	.sync_sg_for_device = tile_pci_dma_sync_sg_for_device,
+	.mapping_error = tile_pci_dma_mapping_error,
+	.dma_supported = tile_pci_dma_supported
+};
+
+struct dma_map_ops *gx_legacy_pci_dma_map_ops = &pci_swiotlb_dma_ops;
+struct dma_map_ops *gx_hybrid_pci_dma_map_ops = &pci_hybrid_dma_ops;
+#else
+struct dma_map_ops *gx_legacy_pci_dma_map_ops;
+struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
+#endif
+EXPORT_SYMBOL(gx_legacy_pci_dma_map_ops);
+EXPORT_SYMBOL(gx_hybrid_pci_dma_map_ops);
+
+#ifdef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
+int dma_set_coherent_mask(struct device *dev, u64 mask)
+{
+	struct dma_map_ops *dma_ops = get_dma_ops(dev);
+
+	/*
+	 * For PCI devices with 64-bit DMA addressing capability, promote
+	 * the dma_ops to full capability for both streams and consistent
+	 * memory access. For 32-bit capable devices, limit the consistent 
+	 * memory DMA range to max_direct_dma_addr.
+	 */
+	if (dma_ops == gx_pci_dma_map_ops ||
+	    dma_ops == gx_hybrid_pci_dma_map_ops ||
+	    dma_ops == gx_legacy_pci_dma_map_ops) {
+		if (mask == DMA_BIT_MASK(64))
+			set_dma_ops(dev, gx_pci_dma_map_ops);
+		else if (mask > dev->archdata.max_direct_dma_addr)
+			mask = dev->archdata.max_direct_dma_addr;
+	}
+
+	if (!dma_supported(dev, mask))
+		return -EIO;
+	dev->coherent_dma_mask = mask;
+	return 0;
 }
-EXPORT_SYMBOL(dma_sync_single_range_for_device);
+EXPORT_SYMBOL(dma_set_coherent_mask);
+#endif
 
+#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
 /*
- * dma_alloc_noncoherent() returns non-cacheable memory, so there's no
- * need to do any flushing here.
+ * The generic dma_get_required_mask() uses the highest physical address
+ * (max_pfn) to provide the hint to the PCI drivers regarding 32-bit or
+ * 64-bit DMA configuration. Since TILEGx has I/O TLB/MMU, allowing the
+ * DMAs to use the full 64-bit PCI address space and not limited by
+ * the physical memory space, we always let the PCI devices use
+ * 64-bit DMA if they have that capability, by returning the 64-bit
+ * DMA mask here. The device driver has the option to use 32-bit DMA if
+ * the device is not capable of 64-bit DMA.
  */
-void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
-		    enum dma_data_direction direction)
+u64 dma_get_required_mask(struct device *dev)
 {
+	return DMA_BIT_MASK(64);
 }
-EXPORT_SYMBOL(dma_cache_sync);
+EXPORT_SYMBOL_GPL(dma_get_required_mask);
+#endif
diff --git a/arch/tile/kernel/pci.c b/arch/tile/kernel/pci.c
index a1bb59eecc1..1f80a88c75a 100644
--- a/arch/tile/kernel/pci.c
+++ b/arch/tile/kernel/pci.c
@@ -20,7 +20,6 @@
 #include <linux/capability.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
-#include <linux/bootmem.h>
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/uaccess.h>
@@ -52,6 +51,8 @@
  *
  */
 
+static int pci_probe = 1;
+
 /*
  * This flag tells if the platform is TILEmpower that needs
  * special configuration for the PLX switch chip.
@@ -81,7 +82,7 @@ EXPORT_SYMBOL(pcibios_align_resource);
  * controller_id is the controller number, config type is 0 or 1 for
  * config0 or config1 operations.
  */
-static int __devinit tile_pcie_open(int controller_id, int config_type)
+static int tile_pcie_open(int controller_id, int config_type)
 {
 	char filename[32];
 	int fd;
@@ -97,8 +98,7 @@ static int __devinit tile_pcie_open(int controller_id, int config_type)
 /*
  * Get the IRQ numbers from the HV and set up the handlers for them.
  */
-static int __devinit tile_init_irqs(int controller_id,
-				 struct pci_controller *controller)
+static int tile_init_irqs(int controller_id, struct pci_controller *controller)
 {
 	char filename[32];
 	int fd;
@@ -141,10 +141,15 @@ static int __devinit tile_init_irqs(int controller_id,
  *
  * Returns the number of controllers discovered.
  */
-int __devinit tile_pci_init(void)
+int __init tile_pci_init(void)
 {
 	int i;
 
+	if (!pci_probe) {
+		pr_info("PCI: disabled by boot argument\n");
+		return 0;
+	}
+
 	pr_info("PCI: Searching for controllers...\n");
 
 	/* Re-init number of PCIe controllers to support hot-plug feature. */
@@ -193,7 +198,6 @@ int __devinit tile_pci_init(void)
 			controller->hv_cfg_fd[0] = hv_cfg_fd0;
 			controller->hv_cfg_fd[1] = hv_cfg_fd1;
 			controller->hv_mem_fd = hv_mem_fd;
-			controller->first_busno = 0;
 			controller->last_busno = 0xff;
 			controller->ops = &tile_cfg_ops;
 
@@ -237,7 +241,7 @@ static int tile_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
 }
 
 
-static void __devinit fixup_read_and_payload_sizes(void)
+static void fixup_read_and_payload_sizes(void)
 {
 	struct pci_dev *dev = NULL;
 	int smallest_max_payload = 0x1; /* Tile maxes out at 256 bytes. */
@@ -245,39 +249,20 @@ static void __devinit fixup_read_and_payload_sizes(void)
 	u16 new_values;
 
 	/* Scan for the smallest maximum payload size. */
-	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-		int pcie_caps_offset;
-		u32 devcap;
-		int max_payload;
-
-		pcie_caps_offset = pci_find_capability(dev, PCI_CAP_ID_EXP);
-		if (pcie_caps_offset == 0)
+	for_each_pci_dev(dev) {
+		if (!pci_is_pcie(dev))
 			continue;
 
-		pci_read_config_dword(dev, pcie_caps_offset + PCI_EXP_DEVCAP,
-				      &devcap);
-		max_payload = devcap & PCI_EXP_DEVCAP_PAYLOAD;
-		if (max_payload < smallest_max_payload)
-			smallest_max_payload = max_payload;
+		if (dev->pcie_mpss < smallest_max_payload)
+			smallest_max_payload = dev->pcie_mpss;
 	}
 
 	/* Now, set the max_payload_size for all devices to that value. */
 	new_values = (max_read_size << 12) | (smallest_max_payload << 5);
-	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-		int pcie_caps_offset;
-		u16 devctl;
-
-		pcie_caps_offset = pci_find_capability(dev, PCI_CAP_ID_EXP);
-		if (pcie_caps_offset == 0)
-			continue;
-
-		pci_read_config_word(dev, pcie_caps_offset + PCI_EXP_DEVCTL,
-				     &devctl);
-		devctl &= ~(PCI_EXP_DEVCTL_PAYLOAD | PCI_EXP_DEVCTL_READRQ);
-		devctl |= new_values;
-		pci_write_config_word(dev, pcie_caps_offset + PCI_EXP_DEVCTL,
-				      devctl);
-	}
+	for_each_pci_dev(dev)
+		pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL,
+				PCI_EXP_DEVCTL_PAYLOAD | PCI_EXP_DEVCTL_READRQ,
+				new_values);
 }
 
 
@@ -287,7 +272,7 @@ static void __devinit fixup_read_and_payload_sizes(void)
  * The controllers have been set up by the time we get here, by a call to
  * tile_pci_init.
  */
-int __devinit pcibios_init(void)
+int __init pcibios_init(void)
 {
 	int i;
 
@@ -298,7 +283,7 @@ int __devinit pcibios_init(void)
 	 * known to require at least 20ms here, but we use a more
 	 * conservative value.
 	 */
-	mdelay(250);
+	msleep(250);
 
 	/* Scan all of the recorded PCI controllers.  */
 	for (i = 0; i < TILE_NUM_PCIE; i++) {
@@ -310,6 +295,7 @@ int __devinit pcibios_init(void)
 		if (pci_scan_flags[i] == 0 && controllers[i].ops != NULL) {
 			struct pci_controller *controller = &controllers[i];
 			struct pci_bus *bus;
+			LIST_HEAD(resources);
 
 			if (tile_init_irqs(i, controller)) {
 				pr_err("PCI: Could not initialize IRQs\n");
@@ -318,18 +304,12 @@ int __devinit pcibios_init(void)
 
 			pr_info("PCI: initializing controller #%d\n", i);
 
-			/*
-			 * This comes from the generic Linux PCI driver.
-			 *
-			 * It reads the PCI tree for this bus into the Linux
-			 * data structures.
-			 *
-			 * This is inlined in linux/pci.h and calls into
-			 * pci_scan_bus_parented() in probe.c.
-			 */
-			bus = pci_scan_bus(0, controller->ops, controller);
+			pci_add_resource(&resources, &ioport_resource);
+			pci_add_resource(&resources, &iomem_resource);
+			bus = pci_scan_root_bus(NULL, 0, controller->ops,
+						controller, &resources);
 			controller->root_bus = bus;
-			controller->last_busno = bus->subordinate;
+			controller->last_busno = bus->busn_res.end;
 		}
 	}
 
@@ -390,7 +370,7 @@ subsys_initcall(pcibios_init);
 /*
  * No bus fixups needed.
  */
-void __devinit pcibios_fixup_bus(struct pci_bus *bus)
+void pcibios_fixup_bus(struct pci_bus *bus)
 {
 	/* Nothing needs to be done. */
 }
@@ -400,25 +380,17 @@ void pcibios_set_master(struct pci_dev *dev)
 	/* No special bus mastering setup handling. */
 }
 
-/*
- * This can be called from the generic PCI layer, but doesn't need to
- * do anything.
- */
-char __devinit *pcibios_setup(char *str)
+/* Process any "pci=" kernel boot arguments. */
+char *__init pcibios_setup(char *str)
 {
-	/* Nothing needs to be done. */
+	if (!strcmp(str, "off")) {
+		pci_probe = 0;
+		return NULL;
+	}
 	return str;
 }
 
 /*
- * This is called from the generic Linux layer.
- */
-void __devinit pcibios_update_irq(struct pci_dev *dev, int irq)
-{
-	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
-}
-
-/*
  * Enable memory and/or address decoding, as appropriate, for the
  * device described by the 'dev' struct.
  *
@@ -487,11 +459,8 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
  * specified bus & slot.
  */
 
-static int __devinit tile_cfg_read(struct pci_bus *bus,
-				   unsigned int devfn,
-				   int offset,
-				   int size,
-				   u32 *val)
+static int tile_cfg_read(struct pci_bus *bus, unsigned int devfn, int offset,
+			 int size, u32 *val)
 {
 	struct pci_controller *controller = bus->sysdata;
 	int busnum = bus->number & 0xff;
@@ -533,11 +502,8 @@ static int __devinit tile_cfg_read(struct pci_bus *bus,
  * See tile_cfg_read() for relevant comments.
  * Note that "val" is the value to write, not a pointer to that value.
  */
-static int __devinit tile_cfg_write(struct pci_bus *bus,
-				    unsigned int devfn,
-				    int offset,
-				    int size,
-				    u32 val)
+static int tile_cfg_write(struct pci_bus *bus, unsigned int devfn, int offset,
+			  int size, u32 val)
 {
 	struct pci_controller *controller = bus->sysdata;
 	int busnum = bus->number & 0xff;
diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
new file mode 100644
index 00000000000..e39f9c54280
--- /dev/null
+++ b/arch/tile/kernel/pci_gx.c
@@ -0,0 +1,1610 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mmzone.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/capability.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/irq.h>
+#include <linux/msi.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+#include <linux/ctype.h>
+
+#include <asm/processor.h>
+#include <asm/sections.h>
+#include <asm/byteorder.h>
+
+#include <gxio/iorpc_globals.h>
+#include <gxio/kiorpc.h>
+#include <gxio/trio.h>
+#include <gxio/iorpc_trio.h>
+#include <hv/drv_trio_intf.h>
+
+#include <arch/sim.h>
+
+/*
+ * This file containes the routines to search for PCI buses,
+ * enumerate the buses, and configure any attached devices.
+ */
+
+#define DEBUG_PCI_CFG	0
+
+#if DEBUG_PCI_CFG
+#define TRACE_CFG_WR(size, val, bus, dev, func, offset) \
+	pr_info("CFG WR %d-byte VAL %#x to bus %d dev %d func %d addr %u\n", \
+		size, val, bus, dev, func, offset & 0xFFF);
+#define TRACE_CFG_RD(size, val, bus, dev, func, offset) \
+	pr_info("CFG RD %d-byte VAL %#x from bus %d dev %d func %d addr %u\n", \
+		size, val, bus, dev, func, offset & 0xFFF);
+#else
+#define TRACE_CFG_WR(...)
+#define TRACE_CFG_RD(...)
+#endif
+
+static int pci_probe = 1;
+
+/* Information on the PCIe RC ports configuration. */
+static int pcie_rc[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];
+
+/*
+ * On some platforms with one or more Gx endpoint ports, we need to
+ * delay the PCIe RC port probe for a few seconds to work around
+ * a HW PCIe link-training bug. The exact delay is specified with
+ * a kernel boot argument in the form of "pcie_rc_delay=T,P,S",
+ * where T is the TRIO instance number, P is the port number and S is
+ * the delay in seconds. If the argument is specified, but the delay is
+ * not provided, the value will be DEFAULT_RC_DELAY.
+ */
+static int rc_delay[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];
+
+/* Default number of seconds that the PCIe RC port probe can be delayed. */
+#define DEFAULT_RC_DELAY	10
+
+/* The PCI I/O space size in each PCI domain. */
+#define IO_SPACE_SIZE		0x10000
+
+/* Provide shorter versions of some very long constant names. */
+#define AUTO_CONFIG_RC	\
+	TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC
+#define AUTO_CONFIG_RC_G1	\
+	TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC_G1
+#define AUTO_CONFIG_EP	\
+	TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT
+#define AUTO_CONFIG_EP_G1	\
+	TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT_G1
+
+/* Array of the PCIe ports configuration info obtained from the BIB. */
+struct pcie_trio_ports_property pcie_ports[TILEGX_NUM_TRIO];
+
+/* Number of configured TRIO instances. */
+int num_trio_shims;
+
+/* All drivers share the TRIO contexts defined here. */
+gxio_trio_context_t trio_contexts[TILEGX_NUM_TRIO];
+
+/* Pointer to an array of PCIe RC controllers. */
+struct pci_controller pci_controllers[TILEGX_NUM_TRIO * TILEGX_TRIO_PCIES];
+int num_rc_controllers;
+
+static struct pci_ops tile_cfg_ops;
+
+/* Mask of CPUs that should receive PCIe interrupts. */
+static struct cpumask intr_cpus_map;
+
+/* We don't need to worry about the alignment of resources. */
+resource_size_t pcibios_align_resource(void *data, const struct resource *res,
+				       resource_size_t size,
+				       resource_size_t align)
+{
+	return res->start;
+}
+EXPORT_SYMBOL(pcibios_align_resource);
+
+/*
+ * Pick a CPU to receive and handle the PCIe interrupts, based on the IRQ #.
+ * For now, we simply send interrupts to non-dataplane CPUs.
+ * We may implement methods to allow user to specify the target CPUs,
+ * e.g. via boot arguments.
+ */
+static int tile_irq_cpu(int irq)
+{
+	unsigned int count;
+	int i = 0;
+	int cpu;
+
+	count = cpumask_weight(&intr_cpus_map);
+	if (unlikely(count == 0)) {
+		pr_warning("intr_cpus_map empty, interrupts will be"
+			   " delievered to dataplane tiles\n");
+		return irq % (smp_height * smp_width);
+	}
+
+	count = irq % count;
+	for_each_cpu(cpu, &intr_cpus_map) {
+		if (i++ == count)
+			break;
+	}
+	return cpu;
+}
+
+/* Open a file descriptor to the TRIO shim. */
+static int tile_pcie_open(int trio_index)
+{
+	gxio_trio_context_t *context = &trio_contexts[trio_index];
+	int ret;
+	int mac;
+
+	/* This opens a file descriptor to the TRIO shim. */
+	ret = gxio_trio_init(context, trio_index);
+	if (ret < 0)
+		goto gxio_trio_init_failure;
+
+	/* Allocate an ASID for the kernel. */
+	ret = gxio_trio_alloc_asids(context, 1, 0, 0);
+	if (ret < 0) {
+		pr_err("PCI: ASID alloc failure on TRIO %d, give up\n",
+			trio_index);
+		goto asid_alloc_failure;
+	}
+
+	context->asid = ret;
+
+#ifdef USE_SHARED_PCIE_CONFIG_REGION
+	/*
+	 * Alloc a PIO region for config access, shared by all MACs per TRIO.
+	 * This shouldn't fail since the kernel is supposed to the first
+	 * client of the TRIO's PIO regions.
+	 */
+	ret = gxio_trio_alloc_pio_regions(context, 1, 0, 0);
+	if (ret < 0) {
+		pr_err("PCI: CFG PIO alloc failure on TRIO %d, give up\n",
+			trio_index);
+		goto pio_alloc_failure;
+	}
+
+	context->pio_cfg_index = ret;
+
+	/*
+	 * For PIO CFG, the bus_address_hi parameter is 0. The mac parameter
+	 * is also 0 because it is specified in PIO_REGION_SETUP_CFG_ADDR.
+	 */
+	ret = gxio_trio_init_pio_region_aux(context, context->pio_cfg_index,
+		0, 0, HV_TRIO_PIO_FLAG_CONFIG_SPACE);
+	if (ret < 0) {
+		pr_err("PCI: CFG PIO init failure on TRIO %d, give up\n",
+			trio_index);
+		goto pio_alloc_failure;
+	}
+#endif
+
+	/* Get the properties of the PCIe ports on this TRIO instance. */
+	ret = gxio_trio_get_port_property(context, &pcie_ports[trio_index]);
+	if (ret < 0) {
+		pr_err("PCI: PCIE_GET_PORT_PROPERTY failure, error %d,"
+		       " on TRIO %d\n", ret, trio_index);
+		goto get_port_property_failure;
+	}
+
+	context->mmio_base_mac =
+		iorpc_ioremap(context->fd, 0, HV_TRIO_CONFIG_IOREMAP_SIZE);
+	if (context->mmio_base_mac == NULL) {
+		pr_err("PCI: TRIO config space mapping failure, error %d,"
+		       " on TRIO %d\n", ret, trio_index);
+		ret = -ENOMEM;
+
+		goto trio_mmio_mapping_failure;
+	}
+
+	/* Check the port strap state which will override the BIB setting. */
+	for (mac = 0; mac < TILEGX_TRIO_PCIES; mac++) {
+		TRIO_PCIE_INTFC_PORT_CONFIG_t port_config;
+		unsigned int reg_offset;
+
+		/* Ignore ports that are not specified in the BIB. */
+		if (!pcie_ports[trio_index].ports[mac].allow_rc &&
+		    !pcie_ports[trio_index].ports[mac].allow_ep)
+			continue;
+
+		reg_offset =
+			(TRIO_PCIE_INTFC_PORT_CONFIG <<
+				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+			(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
+				TRIO_CFG_REGION_ADDR__INTFC_SHIFT) |
+			(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+
+		port_config.word =
+			__gxio_mmio_read(context->mmio_base_mac + reg_offset);
+
+		if (port_config.strap_state != AUTO_CONFIG_RC &&
+		    port_config.strap_state != AUTO_CONFIG_RC_G1) {
+			/*
+			 * If this is really intended to be an EP port, record
+			 * it so that the endpoint driver will know about it.
+			 */
+			if (port_config.strap_state == AUTO_CONFIG_EP ||
+			    port_config.strap_state == AUTO_CONFIG_EP_G1)
+				pcie_ports[trio_index].ports[mac].allow_ep = 1;
+		}
+	}
+
+	return ret;
+
+trio_mmio_mapping_failure:
+get_port_property_failure:
+asid_alloc_failure:
+#ifdef USE_SHARED_PCIE_CONFIG_REGION
+pio_alloc_failure:
+#endif
+	hv_dev_close(context->fd);
+gxio_trio_init_failure:
+	context->fd = -1;
+
+	return ret;
+}
+
+static int __init tile_trio_init(void)
+{
+	int i;
+
+	/* We loop over all the TRIO shims. */
+	for (i = 0; i < TILEGX_NUM_TRIO; i++) {
+		if (tile_pcie_open(i) < 0)
+			continue;
+		num_trio_shims++;
+	}
+
+	return 0;
+}
+postcore_initcall(tile_trio_init);
+
+static void tilegx_legacy_irq_ack(struct irq_data *d)
+{
+	__insn_mtspr(SPR_IPI_EVENT_RESET_K, 1UL << d->irq);
+}
+
+static void tilegx_legacy_irq_mask(struct irq_data *d)
+{
+	__insn_mtspr(SPR_IPI_MASK_SET_K, 1UL << d->irq);
+}
+
+static void tilegx_legacy_irq_unmask(struct irq_data *d)
+{
+	__insn_mtspr(SPR_IPI_MASK_RESET_K, 1UL << d->irq);
+}
+
+static struct irq_chip tilegx_legacy_irq_chip = {
+	.name			= "tilegx_legacy_irq",
+	.irq_ack		= tilegx_legacy_irq_ack,
+	.irq_mask		= tilegx_legacy_irq_mask,
+	.irq_unmask		= tilegx_legacy_irq_unmask,
+
+	/* TBD: support set_affinity. */
+};
+
+/*
+ * This is a wrapper function of the kernel level-trigger interrupt
+ * handler handle_level_irq() for PCI legacy interrupts. The TRIO
+ * is configured such that only INTx Assert interrupts are proxied
+ * to Linux which just calls handle_level_irq() after clearing the
+ * MAC INTx Assert status bit associated with this interrupt.
+ */
+static void trio_handle_level_irq(unsigned int irq, struct irq_desc *desc)
+{
+	struct pci_controller *controller = irq_desc_get_handler_data(desc);
+	gxio_trio_context_t *trio_context = controller->trio;
+	uint64_t intx = (uint64_t)irq_desc_get_chip_data(desc);
+	int mac = controller->mac;
+	unsigned int reg_offset;
+	uint64_t level_mask;
+
+	handle_level_irq(irq, desc);
+
+	/*
+	 * Clear the INTx Level status, otherwise future interrupts are
+	 * not sent.
+	 */
+	reg_offset = (TRIO_PCIE_INTFC_MAC_INT_STS <<
+		TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+		(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
+		TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
+		(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+
+	level_mask = TRIO_PCIE_INTFC_MAC_INT_STS__INT_LEVEL_MASK << intx;
+
+	__gxio_mmio_write(trio_context->mmio_base_mac + reg_offset, level_mask);
+}
+
+/*
+ * Create kernel irqs and set up the handlers for the legacy interrupts.
+ * Also some minimum initialization for the MSI support.
+ */
+static int tile_init_irqs(struct pci_controller *controller)
+{
+	int i;
+	int j;
+	int irq;
+	int result;
+
+	cpumask_copy(&intr_cpus_map, cpu_online_mask);
+
+
+	for (i = 0; i < 4; i++) {
+		gxio_trio_context_t *context = controller->trio;
+		int cpu;
+
+		/* Ask the kernel to allocate an IRQ. */
+		irq = irq_alloc_hwirq(-1);
+		if (!irq) {
+			pr_err("PCI: no free irq vectors, failed for %d\n", i);
+			goto free_irqs;
+		}
+		controller->irq_intx_table[i] = irq;
+
+		/* Distribute the 4 IRQs to different tiles. */
+		cpu = tile_irq_cpu(irq);
+
+		/* Configure the TRIO intr binding for this IRQ. */
+		result = gxio_trio_config_legacy_intr(context, cpu_x(cpu),
+						      cpu_y(cpu), KERNEL_PL,
+						      irq, controller->mac, i);
+		if (result < 0) {
+			pr_err("PCI: MAC intx config failed for %d\n", i);
+
+			goto free_irqs;
+		}
+
+		/* Register the IRQ handler with the kernel. */
+		irq_set_chip_and_handler(irq, &tilegx_legacy_irq_chip,
+					trio_handle_level_irq);
+		irq_set_chip_data(irq, (void *)(uint64_t)i);
+		irq_set_handler_data(irq, controller);
+	}
+
+	return 0;
+
+free_irqs:
+	for (j = 0; j < i; j++)
+		irq_free_hwirq(controller->irq_intx_table[j]);
+
+	return -1;
+}
+
+/*
+ * Return 1 if the port is strapped to operate in RC mode.
+ */
+static int
+strapped_for_rc(gxio_trio_context_t *trio_context, int mac)
+{
+	TRIO_PCIE_INTFC_PORT_CONFIG_t port_config;
+	unsigned int reg_offset;
+
+	/* Check the port configuration. */
+	reg_offset =
+		(TRIO_PCIE_INTFC_PORT_CONFIG <<
+			TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+		(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
+			TRIO_CFG_REGION_ADDR__INTFC_SHIFT) |
+		(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+	port_config.word =
+		__gxio_mmio_read(trio_context->mmio_base_mac + reg_offset);
+
+	if (port_config.strap_state == AUTO_CONFIG_RC ||
+	    port_config.strap_state == AUTO_CONFIG_RC_G1)
+		return 1;
+	else
+		return 0;
+}
+
+/*
+ * Find valid controllers and fill in pci_controller structs for each
+ * of them.
+ *
+ * Return the number of controllers discovered.
+ */
+int __init tile_pci_init(void)
+{
+	int ctl_index = 0;
+	int i, j;
+
+	if (!pci_probe) {
+		pr_info("PCI: disabled by boot argument\n");
+		return 0;
+	}
+
+	pr_info("PCI: Searching for controllers...\n");
+
+	if (num_trio_shims == 0 || sim_is_simulator())
+		return 0;
+
+	/*
+	 * Now determine which PCIe ports are configured to operate in RC
+	 * mode. There is a differece in the port configuration capability
+	 * between the Gx36 and Gx72 devices.
+	 *
+	 * The Gx36 has configuration capability for each of the 3 PCIe
+	 * interfaces (disable, auto endpoint, auto RC, etc.).
+	 * On the Gx72, you can only select one of the 3 PCIe interfaces per
+	 * TRIO to train automatically. Further, the allowable training modes
+	 * are reduced to four options (auto endpoint, auto RC, stream x1,
+	 * stream x4).
+	 *
+	 * For Gx36 ports, it must be allowed to be in RC mode by the
+	 * Board Information Block, and the hardware strapping pins must be
+	 * set to RC mode.
+	 *
+	 * For Gx72 ports, the port will operate in RC mode if either of the
+	 * following is true:
+	 * 1. It is allowed to be in RC mode by the Board Information Block,
+	 *    and the BIB doesn't allow the EP mode.
+	 * 2. It is allowed to be in either the RC or the EP mode by the BIB,
+	 *    and the hardware strapping pin is set to RC mode.
+	 */
+	for (i = 0; i < TILEGX_NUM_TRIO; i++) {
+		gxio_trio_context_t *context = &trio_contexts[i];
+
+		if (context->fd < 0)
+			continue;
+
+		for (j = 0; j < TILEGX_TRIO_PCIES; j++) {
+			int is_rc = 0;
+
+			if (pcie_ports[i].is_gx72 &&
+			    pcie_ports[i].ports[j].allow_rc) {
+				if (!pcie_ports[i].ports[j].allow_ep ||
+				    strapped_for_rc(context, j))
+					is_rc = 1;
+			} else if (pcie_ports[i].ports[j].allow_rc &&
+				   strapped_for_rc(context, j)) {
+				is_rc = 1;
+			}
+			if (is_rc) {
+				pcie_rc[i][j] = 1;
+				num_rc_controllers++;
+			}
+		}
+	}
+
+	/* Return if no PCIe ports are configured to operate in RC mode. */
+	if (num_rc_controllers == 0)
+		return 0;
+
+	/* Set the TRIO pointer and MAC index for each PCIe RC port. */
+	for (i = 0; i < TILEGX_NUM_TRIO; i++) {
+		for (j = 0; j < TILEGX_TRIO_PCIES; j++) {
+			if (pcie_rc[i][j]) {
+				pci_controllers[ctl_index].trio =
+					&trio_contexts[i];
+				pci_controllers[ctl_index].mac = j;
+				pci_controllers[ctl_index].trio_index = i;
+				ctl_index++;
+				if (ctl_index == num_rc_controllers)
+					goto out;
+			}
+		}
+	}
+
+out:
+	/* Configure each PCIe RC port. */
+	for (i = 0; i < num_rc_controllers; i++) {
+
+		/* Configure the PCIe MAC to run in RC mode. */
+		struct pci_controller *controller = &pci_controllers[i];
+
+		controller->index = i;
+		controller->ops = &tile_cfg_ops;
+
+		controller->io_space.start = PCIBIOS_MIN_IO +
+			(i * IO_SPACE_SIZE);
+		controller->io_space.end = controller->io_space.start +
+			IO_SPACE_SIZE - 1;
+		BUG_ON(controller->io_space.end > IO_SPACE_LIMIT);
+		controller->io_space.flags = IORESOURCE_IO;
+		snprintf(controller->io_space_name,
+			 sizeof(controller->io_space_name),
+			 "PCI I/O domain %d", i);
+		controller->io_space.name = controller->io_space_name;
+
+		/*
+		 * The PCI memory resource is located above the PA space.
+		 * For every host bridge, the BAR window or the MMIO aperture
+		 * is in range [3GB, 4GB - 1] of a 4GB space beyond the
+		 * PA space.
+		 */
+		controller->mem_offset = TILE_PCI_MEM_START +
+			(i * TILE_PCI_BAR_WINDOW_TOP);
+		controller->mem_space.start = controller->mem_offset +
+			TILE_PCI_BAR_WINDOW_TOP - TILE_PCI_BAR_WINDOW_SIZE;
+		controller->mem_space.end = controller->mem_offset +
+			TILE_PCI_BAR_WINDOW_TOP - 1;
+		controller->mem_space.flags = IORESOURCE_MEM;
+		snprintf(controller->mem_space_name,
+			 sizeof(controller->mem_space_name),
+			 "PCI mem domain %d", i);
+		controller->mem_space.name = controller->mem_space_name;
+	}
+
+	return num_rc_controllers;
+}
+
+/*
+ * (pin - 1) converts from the PCI standard's [1:4] convention to
+ * a normal [0:3] range.
+ */
+static int tile_map_irq(const struct pci_dev *dev, u8 device, u8 pin)
+{
+	struct pci_controller *controller =
+		(struct pci_controller *)dev->sysdata;
+	return controller->irq_intx_table[pin - 1];
+}
+
+static void fixup_read_and_payload_sizes(struct pci_controller *controller)
+{
+	gxio_trio_context_t *trio_context = controller->trio;
+	struct pci_bus *root_bus = controller->root_bus;
+	TRIO_PCIE_RC_DEVICE_CONTROL_t dev_control;
+	TRIO_PCIE_RC_DEVICE_CAP_t rc_dev_cap;
+	unsigned int reg_offset;
+	struct pci_bus *child;
+	int mac;
+	int err;
+
+	mac = controller->mac;
+
+	/* Set our max read request size to be 4KB. */
+	reg_offset =
+		(TRIO_PCIE_RC_DEVICE_CONTROL <<
+			TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+		(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_STANDARD <<
+			TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
+		(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+
+	dev_control.word = __gxio_mmio_read32(trio_context->mmio_base_mac +
+					      reg_offset);
+	dev_control.max_read_req_sz = 5;
+	__gxio_mmio_write32(trio_context->mmio_base_mac + reg_offset,
+			    dev_control.word);
+
+	/*
+	 * Set the max payload size supported by this Gx PCIe MAC.
+	 * Though Gx PCIe supports Max Payload Size of up to 1024 bytes,
+	 * experiments have shown that setting MPS to 256 yields the
+	 * best performance.
+	 */
+	reg_offset =
+		(TRIO_PCIE_RC_DEVICE_CAP <<
+			TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+		(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_STANDARD <<
+			TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
+		(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+
+	rc_dev_cap.word = __gxio_mmio_read32(trio_context->mmio_base_mac +
+					     reg_offset);
+	rc_dev_cap.mps_sup = 1;
+	__gxio_mmio_write32(trio_context->mmio_base_mac + reg_offset,
+			    rc_dev_cap.word);
+
+	/* Configure PCI Express MPS setting. */
+	list_for_each_entry(child, &root_bus->children, node)
+		pcie_bus_configure_settings(child);
+
+	/*
+	 * Set the mac_config register in trio based on the MPS/MRS of the link.
+	 */
+	reg_offset =
+		(TRIO_PCIE_RC_DEVICE_CONTROL <<
+			TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+		(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_STANDARD <<
+			TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
+		(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+
+	dev_control.word = __gxio_mmio_read32(trio_context->mmio_base_mac +
+						reg_offset);
+
+	err = gxio_trio_set_mps_mrs(trio_context,
+				    dev_control.max_payload_size,
+				    dev_control.max_read_req_sz,
+				    mac);
+	if (err < 0) {
+		pr_err("PCI: PCIE_CONFIGURE_MAC_MPS_MRS failure, "
+			"MAC %d on TRIO %d\n",
+			mac, controller->trio_index);
+	}
+}
+
+static int setup_pcie_rc_delay(char *str)
+{
+	unsigned long delay = 0;
+	unsigned long trio_index;
+	unsigned long mac;
+
+	if (str == NULL || !isdigit(*str))
+		return -EINVAL;
+	trio_index = simple_strtoul(str, (char **)&str, 10);
+	if (trio_index >= TILEGX_NUM_TRIO)
+		return -EINVAL;
+
+	if (*str != ',')
+		return -EINVAL;
+
+	str++;
+	if (!isdigit(*str))
+		return -EINVAL;
+	mac = simple_strtoul(str, (char **)&str, 10);
+	if (mac >= TILEGX_TRIO_PCIES)
+		return -EINVAL;
+
+	if (*str != '\0') {
+		if (*str != ',')
+			return -EINVAL;
+
+		str++;
+		if (!isdigit(*str))
+			return -EINVAL;
+		delay = simple_strtoul(str, (char **)&str, 10);
+	}
+
+	rc_delay[trio_index][mac] = delay ? : DEFAULT_RC_DELAY;
+	return 0;
+}
+early_param("pcie_rc_delay", setup_pcie_rc_delay);
+
+/* PCI initialization entry point, called by subsys_initcall. */
+int __init pcibios_init(void)
+{
+	resource_size_t offset;
+	LIST_HEAD(resources);
+	int next_busno;
+	int i;
+
+	tile_pci_init();
+
+	if (num_rc_controllers == 0)
+		return 0;
+
+	/*
+	 * Delay a bit in case devices aren't ready.  Some devices are
+	 * known to require at least 20ms here, but we use a more
+	 * conservative value.
+	 */
+	msleep(250);
+
+	/* Scan all of the recorded PCI controllers.  */
+	for (next_busno = 0, i = 0; i < num_rc_controllers; i++) {
+		struct pci_controller *controller = &pci_controllers[i];
+		gxio_trio_context_t *trio_context = controller->trio;
+		TRIO_PCIE_INTFC_PORT_STATUS_t port_status;
+		TRIO_PCIE_INTFC_TX_FIFO_CTL_t tx_fifo_ctl;
+		struct pci_bus *bus;
+		unsigned int reg_offset;
+		unsigned int class_code_revision;
+		int trio_index;
+		int mac;
+		int ret;
+
+		if (trio_context->fd < 0)
+			continue;
+
+		trio_index = controller->trio_index;
+		mac = controller->mac;
+
+		/*
+		 * Check for PCIe link-up status to decide if we need
+		 * to force the link to come up.
+		 */
+		reg_offset =
+			(TRIO_PCIE_INTFC_PORT_STATUS <<
+				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+			(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
+				TRIO_CFG_REGION_ADDR__INTFC_SHIFT) |
+			(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+
+		port_status.word =
+			__gxio_mmio_read(trio_context->mmio_base_mac +
+					 reg_offset);
+		if (!port_status.dl_up) {
+			if (rc_delay[trio_index][mac]) {
+				pr_info("Delaying PCIe RC TRIO init %d sec"
+					" on MAC %d on TRIO %d\n",
+					rc_delay[trio_index][mac], mac,
+					trio_index);
+				msleep(rc_delay[trio_index][mac] * 1000);
+			}
+			ret = gxio_trio_force_rc_link_up(trio_context, mac);
+			if (ret < 0)
+				pr_err("PCI: PCIE_FORCE_LINK_UP failure, "
+					"MAC %d on TRIO %d\n", mac, trio_index);
+		}
+
+		pr_info("PCI: Found PCI controller #%d on TRIO %d MAC %d\n", i,
+			trio_index, controller->mac);
+
+		/* Delay the bus probe if needed. */
+		if (rc_delay[trio_index][mac]) {
+			pr_info("Delaying PCIe RC bus enumerating %d sec"
+				" on MAC %d on TRIO %d\n",
+				rc_delay[trio_index][mac], mac,
+				trio_index);
+			msleep(rc_delay[trio_index][mac] * 1000);
+		} else {
+			/*
+			 * Wait a bit here because some EP devices
+			 * take longer to come up.
+			 */
+			msleep(1000);
+		}
+
+		/* Check for PCIe link-up status again. */
+		port_status.word =
+			__gxio_mmio_read(trio_context->mmio_base_mac +
+					 reg_offset);
+		if (!port_status.dl_up) {
+			if (pcie_ports[trio_index].ports[mac].removable) {
+				pr_info("PCI: link is down, MAC %d on TRIO %d\n",
+					mac, trio_index);
+				pr_info("This is expected if no PCIe card"
+					" is connected to this link\n");
+			} else
+				pr_err("PCI: link is down, MAC %d on TRIO %d\n",
+					mac, trio_index);
+			continue;
+		}
+
+		/*
+		 * Ensure that the link can come out of L1 power down state.
+		 * Strictly speaking, this is needed only in the case of
+		 * heavy RC-initiated DMAs.
+		 */
+		reg_offset =
+			(TRIO_PCIE_INTFC_TX_FIFO_CTL <<
+				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+			(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
+				TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
+			(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+		tx_fifo_ctl.word =
+			__gxio_mmio_read(trio_context->mmio_base_mac +
+					 reg_offset);
+		tx_fifo_ctl.min_p_credits = 0;
+		__gxio_mmio_write(trio_context->mmio_base_mac + reg_offset,
+				  tx_fifo_ctl.word);
+
+		/*
+		 * Change the device ID so that Linux bus crawl doesn't confuse
+		 * the internal bridge with any Tilera endpoints.
+		 */
+		reg_offset =
+			(TRIO_PCIE_RC_DEVICE_ID_VEN_ID <<
+				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+			(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_STANDARD <<
+				TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
+			(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+
+		__gxio_mmio_write32(trio_context->mmio_base_mac + reg_offset,
+				    (TILERA_GX36_RC_DEV_ID <<
+				    TRIO_PCIE_RC_DEVICE_ID_VEN_ID__DEV_ID_SHIFT) |
+				    TILERA_VENDOR_ID);
+
+		/* Set the internal P2P bridge class code. */
+		reg_offset =
+			(TRIO_PCIE_RC_REVISION_ID <<
+				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+			(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_STANDARD <<
+				TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
+			(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+
+		class_code_revision =
+			__gxio_mmio_read32(trio_context->mmio_base_mac +
+					   reg_offset);
+		class_code_revision = (class_code_revision & 0xff) |
+			(PCI_CLASS_BRIDGE_PCI << 16);
+
+		__gxio_mmio_write32(trio_context->mmio_base_mac +
+				    reg_offset, class_code_revision);
+
+#ifdef USE_SHARED_PCIE_CONFIG_REGION
+
+		/* Map in the MMIO space for the PIO region. */
+		offset = HV_TRIO_PIO_OFFSET(trio_context->pio_cfg_index) |
+			(((unsigned long long)mac) <<
+			TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT);
+
+#else
+
+		/* Alloc a PIO region for PCI config access per MAC. */
+		ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0);
+		if (ret < 0) {
+			pr_err("PCI: PCI CFG PIO alloc failure for mac %d "
+				"on TRIO %d, give up\n", mac, trio_index);
+
+			continue;
+		}
+
+		trio_context->pio_cfg_index[mac] = ret;
+
+		/* For PIO CFG, the bus_address_hi parameter is 0. */
+		ret = gxio_trio_init_pio_region_aux(trio_context,
+			trio_context->pio_cfg_index[mac],
+			mac, 0, HV_TRIO_PIO_FLAG_CONFIG_SPACE);
+		if (ret < 0) {
+			pr_err("PCI: PCI CFG PIO init failure for mac %d "
+				"on TRIO %d, give up\n", mac, trio_index);
+
+			continue;
+		}
+
+		offset = HV_TRIO_PIO_OFFSET(trio_context->pio_cfg_index[mac]) |
+			(((unsigned long long)mac) <<
+			TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT);
+
+#endif
+
+		/*
+		 * To save VMALLOC space, we take advantage of the fact that
+		 * bit 29 in the PIO CFG address format is reserved 0. With
+		 * TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT being 30,
+		 * this cuts VMALLOC space usage from 1GB to 512MB per mac.
+		 */
+		trio_context->mmio_base_pio_cfg[mac] =
+			iorpc_ioremap(trio_context->fd, offset, (1UL <<
+			(TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT - 1)));
+		if (trio_context->mmio_base_pio_cfg[mac] == NULL) {
+			pr_err("PCI: PIO map failure for mac %d on TRIO %d\n",
+				mac, trio_index);
+
+			continue;
+		}
+
+		/* Initialize the PCIe interrupts. */
+		if (tile_init_irqs(controller)) {
+			pr_err("PCI: IRQs init failure for mac %d on TRIO %d\n",
+				mac, trio_index);
+
+			continue;
+		}
+
+		/*
+		 * The PCI memory resource is located above the PA space.
+		 * The memory range for the PCI root bus should not overlap
+		 * with the physical RAM.
+		 */
+		pci_add_resource_offset(&resources, &controller->mem_space,
+					controller->mem_offset);
+		pci_add_resource(&resources, &controller->io_space);
+		controller->first_busno = next_busno;
+		bus = pci_scan_root_bus(NULL, next_busno, controller->ops,
+					controller, &resources);
+		controller->root_bus = bus;
+		next_busno = bus->busn_res.end + 1;
+	}
+
+	/* Do machine dependent PCI interrupt routing */
+	pci_fixup_irqs(pci_common_swizzle, tile_map_irq);
+
+	/*
+	 * This comes from the generic Linux PCI driver.
+	 *
+	 * It allocates all of the resources (I/O memory, etc)
+	 * associated with the devices read in above.
+	 */
+	pci_assign_unassigned_resources();
+
+	/* Record the I/O resources in the PCI controller structure. */
+	for (i = 0; i < num_rc_controllers; i++) {
+		struct pci_controller *controller = &pci_controllers[i];
+		gxio_trio_context_t *trio_context = controller->trio;
+		struct pci_bus *root_bus = pci_controllers[i].root_bus;
+		int ret;
+		int j;
+
+		/*
+		 * Skip controllers that are not properly initialized or
+		 * have down links.
+		 */
+		if (root_bus == NULL)
+			continue;
+
+		/* Configure the max_payload_size values for this domain. */
+		fixup_read_and_payload_sizes(controller);
+
+		/* Alloc a PIO region for PCI memory access for each RC port. */
+		ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0);
+		if (ret < 0) {
+			pr_err("PCI: MEM PIO alloc failure on TRIO %d mac %d, "
+			       "give up\n", controller->trio_index,
+			       controller->mac);
+
+			continue;
+		}
+
+		controller->pio_mem_index = ret;
+
+		/*
+		 * For PIO MEM, the bus_address_hi parameter is hard-coded 0
+		 * because we always assign 32-bit PCI bus BAR ranges.
+		 */
+		ret = gxio_trio_init_pio_region_aux(trio_context,
+						    controller->pio_mem_index,
+						    controller->mac,
+						    0,
+						    0);
+		if (ret < 0) {
+			pr_err("PCI: MEM PIO init failure on TRIO %d mac %d, "
+			       "give up\n", controller->trio_index,
+			       controller->mac);
+
+			continue;
+		}
+
+#ifdef CONFIG_TILE_PCI_IO
+		/*
+		 * Alloc a PIO region for PCI I/O space access for each RC port.
+		 */
+		ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0);
+		if (ret < 0) {
+			pr_err("PCI: I/O PIO alloc failure on TRIO %d mac %d, "
+			       "give up\n", controller->trio_index,
+			       controller->mac);
+
+			continue;
+		}
+
+		controller->pio_io_index = ret;
+
+		/*
+		 * For PIO IO, the bus_address_hi parameter is hard-coded 0
+		 * because PCI I/O address space is 32-bit.
+		 */
+		ret = gxio_trio_init_pio_region_aux(trio_context,
+						    controller->pio_io_index,
+						    controller->mac,
+						    0,
+						    HV_TRIO_PIO_FLAG_IO_SPACE);
+		if (ret < 0) {
+			pr_err("PCI: I/O PIO init failure on TRIO %d mac %d, "
+			       "give up\n", controller->trio_index,
+			       controller->mac);
+
+			continue;
+		}
+#endif
+
+		/*
+		 * Configure a Mem-Map region for each memory controller so
+		 * that Linux can map all of its PA space to the PCI bus.
+		 * Use the IOMMU to handle hash-for-home memory.
+		 */
+		for_each_online_node(j) {
+			unsigned long start_pfn = node_start_pfn[j];
+			unsigned long end_pfn = node_end_pfn[j];
+			unsigned long nr_pages = end_pfn - start_pfn;
+
+			ret = gxio_trio_alloc_memory_maps(trio_context, 1, 0,
+							  0);
+			if (ret < 0) {
+				pr_err("PCI: Mem-Map alloc failure on TRIO %d "
+				       "mac %d for MC %d, give up\n",
+				       controller->trio_index,
+				       controller->mac, j);
+
+				goto alloc_mem_map_failed;
+			}
+
+			controller->mem_maps[j] = ret;
+
+			/*
+			 * Initialize the Mem-Map and the I/O MMU so that all
+			 * the physical memory can be accessed by the endpoint
+			 * devices. The base bus address is set to the base CPA
+			 * of this memory controller plus an offset (see pci.h).
+			 * The region's base VA is set to the base CPA. The
+			 * I/O MMU table essentially translates the CPA to
+			 * the real PA. Implicitly, for node 0, we create
+			 * a separate Mem-Map region that serves as the inbound
+			 * window for legacy 32-bit devices. This is a direct
+			 * map of the low 4GB CPA space.
+			 */
+			ret = gxio_trio_init_memory_map_mmu_aux(trio_context,
+				controller->mem_maps[j],
+				start_pfn << PAGE_SHIFT,
+				nr_pages << PAGE_SHIFT,
+				trio_context->asid,
+				controller->mac,
+				(start_pfn << PAGE_SHIFT) +
+				TILE_PCI_MEM_MAP_BASE_OFFSET,
+				j,
+				GXIO_TRIO_ORDER_MODE_UNORDERED);
+			if (ret < 0) {
+				pr_err("PCI: Mem-Map init failure on TRIO %d "
+				       "mac %d for MC %d, give up\n",
+				       controller->trio_index,
+				       controller->mac, j);
+
+				goto alloc_mem_map_failed;
+			}
+			continue;
+
+alloc_mem_map_failed:
+			break;
+		}
+	}
+
+	return 0;
+}
+subsys_initcall(pcibios_init);
+
+/* No bus fixups needed. */
+void pcibios_fixup_bus(struct pci_bus *bus)
+{
+}
+
+/* Process any "pci=" kernel boot arguments. */
+char *__init pcibios_setup(char *str)
+{
+	if (!strcmp(str, "off")) {
+		pci_probe = 0;
+		return NULL;
+	}
+	return str;
+}
+
+/*
+ * Called for each device after PCI setup is done.
+ * We initialize the PCI device capabilities conservatively, assuming that
+ * all devices can only address the 32-bit DMA space. The exception here is
+ * that the device dma_offset is set to the value that matches the 64-bit
+ * capable devices. This is OK because dma_offset is not used by legacy
+ * dma_ops, nor by the hybrid dma_ops's streaming DMAs, which are 64-bit ops.
+ * This implementation matches the kernel design of setting PCI devices'
+ * coherent_dma_mask to 0xffffffffull by default, allowing the device drivers
+ * to skip calling pci_set_consistent_dma_mask(DMA_BIT_MASK(32)).
+ */
+static void pcibios_fixup_final(struct pci_dev *pdev)
+{
+	set_dma_ops(&pdev->dev, gx_legacy_pci_dma_map_ops);
+	set_dma_offset(&pdev->dev, TILE_PCI_MEM_MAP_BASE_OFFSET);
+	pdev->dev.archdata.max_direct_dma_addr =
+		TILE_PCI_MAX_DIRECT_DMA_ADDRESS;
+	pdev->dev.coherent_dma_mask = TILE_PCI_MAX_DIRECT_DMA_ADDRESS;
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pcibios_fixup_final);
+
+/* Map a PCI MMIO bus address into VA space. */
+void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
+{
+	struct pci_controller *controller = NULL;
+	resource_size_t bar_start;
+	resource_size_t bar_end;
+	resource_size_t offset;
+	resource_size_t start;
+	resource_size_t end;
+	int trio_fd;
+	int i;
+
+	start = phys_addr;
+	end = phys_addr + size - 1;
+
+	/*
+	 * By searching phys_addr in each controller's mem_space, we can
+	 * determine the controller that should accept the PCI memory access.
+	 */
+	for (i = 0; i < num_rc_controllers; i++) {
+		/*
+		 * Skip controllers that are not properly initialized or
+		 * have down links.
+		 */
+		if (pci_controllers[i].root_bus == NULL)
+			continue;
+
+		bar_start = pci_controllers[i].mem_space.start;
+		bar_end = pci_controllers[i].mem_space.end;
+
+		if ((start >= bar_start) && (end <= bar_end)) {
+			controller = &pci_controllers[i];
+			break;
+		}
+	}
+
+	if (controller == NULL)
+		return NULL;
+
+	trio_fd = controller->trio->fd;
+
+	/* Convert the resource start to the bus address offset. */
+	start = phys_addr - controller->mem_offset;
+
+	offset = HV_TRIO_PIO_OFFSET(controller->pio_mem_index) + start;
+
+	/* We need to keep the PCI bus address's in-page offset in the VA. */
+	return iorpc_ioremap(trio_fd, offset, size) +
+		(start & (PAGE_SIZE - 1));
+}
+EXPORT_SYMBOL(ioremap);
+
+#ifdef CONFIG_TILE_PCI_IO
+/* Map a PCI I/O address into VA space. */
+void __iomem *ioport_map(unsigned long port, unsigned int size)
+{
+	struct pci_controller *controller = NULL;
+	resource_size_t bar_start;
+	resource_size_t bar_end;
+	resource_size_t offset;
+	resource_size_t start;
+	resource_size_t end;
+	int trio_fd;
+	int i;
+
+	start = port;
+	end = port + size - 1;
+
+	/*
+	 * By searching the port in each controller's io_space, we can
+	 * determine the controller that should accept the PCI I/O access.
+	 */
+	for (i = 0; i < num_rc_controllers; i++) {
+		/*
+		 * Skip controllers that are not properly initialized or
+		 * have down links.
+		 */
+		if (pci_controllers[i].root_bus == NULL)
+			continue;
+
+		bar_start = pci_controllers[i].io_space.start;
+		bar_end = pci_controllers[i].io_space.end;
+
+		if ((start >= bar_start) && (end <= bar_end)) {
+			controller = &pci_controllers[i];
+			break;
+		}
+	}
+
+	if (controller == NULL)
+		return NULL;
+
+	trio_fd = controller->trio->fd;
+
+	/* Convert the resource start to the bus address offset. */
+	port -= controller->io_space.start;
+
+	offset = HV_TRIO_PIO_OFFSET(controller->pio_io_index) + port;
+
+	/* We need to keep the PCI bus address's in-page offset in the VA. */
+	return iorpc_ioremap(trio_fd, offset, size) + (port & (PAGE_SIZE - 1));
+}
+EXPORT_SYMBOL(ioport_map);
+
+void ioport_unmap(void __iomem *addr)
+{
+	iounmap(addr);
+}
+EXPORT_SYMBOL(ioport_unmap);
+#endif
+
+void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
+{
+	iounmap(addr);
+}
+EXPORT_SYMBOL(pci_iounmap);
+
+/****************************************************************
+ *
+ * Tile PCI config space read/write routines
+ *
+ ****************************************************************/
+
+/*
+ * These are the normal read and write ops
+ * These are expanded with macros from  pci_bus_read_config_byte() etc.
+ *
+ * devfn is the combined PCI device & function.
+ *
+ * offset is in bytes, from the start of config space for the
+ * specified bus & device.
+ */
+static int tile_cfg_read(struct pci_bus *bus, unsigned int devfn, int offset,
+			 int size, u32 *val)
+{
+	struct pci_controller *controller = bus->sysdata;
+	gxio_trio_context_t *trio_context = controller->trio;
+	int busnum = bus->number & 0xff;
+	int device = PCI_SLOT(devfn);
+	int function = PCI_FUNC(devfn);
+	int config_type = 1;
+	TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR_t cfg_addr;
+	void *mmio_addr;
+
+	/*
+	 * Map all accesses to the local device on root bus into the
+	 * MMIO space of the MAC. Accesses to the downstream devices
+	 * go to the PIO space.
+	 */
+	if (pci_is_root_bus(bus)) {
+		if (device == 0) {
+			/*
+			 * This is the internal downstream P2P bridge,
+			 * access directly.
+			 */
+			unsigned int reg_offset;
+
+			reg_offset = ((offset & 0xFFF) <<
+				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+				(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_PROTECTED
+				<< TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
+				(controller->mac <<
+					TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+
+			mmio_addr = trio_context->mmio_base_mac + reg_offset;
+
+			goto valid_device;
+
+		} else {
+			/*
+			 * We fake an empty device for (device > 0),
+			 * since there is only one device on bus 0.
+			 */
+			goto invalid_device;
+		}
+	}
+
+	/*
+	 * Accesses to the directly attached device have to be
+	 * sent as type-0 configs.
+	 */
+	if (busnum == (controller->first_busno + 1)) {
+		/*
+		 * There is only one device off of our built-in P2P bridge.
+		 */
+		if (device != 0)
+			goto invalid_device;
+
+		config_type = 0;
+	}
+
+	cfg_addr.word = 0;
+	cfg_addr.reg_addr = (offset & 0xFFF);
+	cfg_addr.fn = function;
+	cfg_addr.dev = device;
+	cfg_addr.bus = busnum;
+	cfg_addr.type = config_type;
+
+	/*
+	 * Note that we don't set the mac field in cfg_addr because the
+	 * mapping is per port.
+	 */
+	mmio_addr = trio_context->mmio_base_pio_cfg[controller->mac] +
+		cfg_addr.word;
+
+valid_device:
+
+	switch (size) {
+	case 4:
+		*val = __gxio_mmio_read32(mmio_addr);
+		break;
+
+	case 2:
+		*val = __gxio_mmio_read16(mmio_addr);
+		break;
+
+	case 1:
+		*val = __gxio_mmio_read8(mmio_addr);
+		break;
+
+	default:
+		return PCIBIOS_FUNC_NOT_SUPPORTED;
+	}
+
+	TRACE_CFG_RD(size, *val, busnum, device, function, offset);
+
+	return 0;
+
+invalid_device:
+
+	switch (size) {
+	case 4:
+		*val = 0xFFFFFFFF;
+		break;
+
+	case 2:
+		*val = 0xFFFF;
+		break;
+
+	case 1:
+		*val = 0xFF;
+		break;
+
+	default:
+		return PCIBIOS_FUNC_NOT_SUPPORTED;
+	}
+
+	return 0;
+}
+
+
+/*
+ * See tile_cfg_read() for relevent comments.
+ * Note that "val" is the value to write, not a pointer to that value.
+ */
+static int tile_cfg_write(struct pci_bus *bus, unsigned int devfn, int offset,
+			  int size, u32 val)
+{
+	struct pci_controller *controller = bus->sysdata;
+	gxio_trio_context_t *trio_context = controller->trio;
+	int busnum = bus->number & 0xff;
+	int device = PCI_SLOT(devfn);
+	int function = PCI_FUNC(devfn);
+	int config_type = 1;
+	TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR_t cfg_addr;
+	void *mmio_addr;
+	u32 val_32 = (u32)val;
+	u16 val_16 = (u16)val;
+	u8 val_8 = (u8)val;
+
+	/*
+	 * Map all accesses to the local device on root bus into the
+	 * MMIO space of the MAC. Accesses to the downstream devices
+	 * go to the PIO space.
+	 */
+	if (pci_is_root_bus(bus)) {
+		if (device == 0) {
+			/*
+			 * This is the internal downstream P2P bridge,
+			 * access directly.
+			 */
+			unsigned int reg_offset;
+
+			reg_offset = ((offset & 0xFFF) <<
+				TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+				(TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_PROTECTED
+				<< TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
+				(controller->mac <<
+					TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+
+			mmio_addr = trio_context->mmio_base_mac + reg_offset;
+
+			goto valid_device;
+
+		} else {
+			/*
+			 * We fake an empty device for (device > 0),
+			 * since there is only one device on bus 0.
+			 */
+			goto invalid_device;
+		}
+	}
+
+	/*
+	 * Accesses to the directly attached device have to be
+	 * sent as type-0 configs.
+	 */
+	if (busnum == (controller->first_busno + 1)) {
+		/*
+		 * There is only one device off of our built-in P2P bridge.
+		 */
+		if (device != 0)
+			goto invalid_device;
+
+		config_type = 0;
+	}
+
+	cfg_addr.word = 0;
+	cfg_addr.reg_addr = (offset & 0xFFF);
+	cfg_addr.fn = function;
+	cfg_addr.dev = device;
+	cfg_addr.bus = busnum;
+	cfg_addr.type = config_type;
+
+	/*
+	 * Note that we don't set the mac field in cfg_addr because the
+	 * mapping is per port.
+	 */
+	mmio_addr = trio_context->mmio_base_pio_cfg[controller->mac] +
+			cfg_addr.word;
+
+valid_device:
+
+	switch (size) {
+	case 4:
+		__gxio_mmio_write32(mmio_addr, val_32);
+		TRACE_CFG_WR(size, val_32, busnum, device, function, offset);
+		break;
+
+	case 2:
+		__gxio_mmio_write16(mmio_addr, val_16);
+		TRACE_CFG_WR(size, val_16, busnum, device, function, offset);
+		break;
+
+	case 1:
+		__gxio_mmio_write8(mmio_addr, val_8);
+		TRACE_CFG_WR(size, val_8, busnum, device, function, offset);
+		break;
+
+	default:
+		return PCIBIOS_FUNC_NOT_SUPPORTED;
+	}
+
+invalid_device:
+
+	return 0;
+}
+
+
+static struct pci_ops tile_cfg_ops = {
+	.read =         tile_cfg_read,
+	.write =        tile_cfg_write,
+};
+
+
+/* MSI support starts here. */
+static unsigned int tilegx_msi_startup(struct irq_data *d)
+{
+	if (d->msi_desc)
+		unmask_msi_irq(d);
+
+	return 0;
+}
+
+static void tilegx_msi_ack(struct irq_data *d)
+{
+	__insn_mtspr(SPR_IPI_EVENT_RESET_K, 1UL << d->irq);
+}
+
+static void tilegx_msi_mask(struct irq_data *d)
+{
+	mask_msi_irq(d);
+	__insn_mtspr(SPR_IPI_MASK_SET_K, 1UL << d->irq);
+}
+
+static void tilegx_msi_unmask(struct irq_data *d)
+{
+	__insn_mtspr(SPR_IPI_MASK_RESET_K, 1UL << d->irq);
+	unmask_msi_irq(d);
+}
+
+static struct irq_chip tilegx_msi_chip = {
+	.name			= "tilegx_msi",
+	.irq_startup		= tilegx_msi_startup,
+	.irq_ack		= tilegx_msi_ack,
+	.irq_mask		= tilegx_msi_mask,
+	.irq_unmask		= tilegx_msi_unmask,
+
+	/* TBD: support set_affinity. */
+};
+
+int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
+{
+	struct pci_controller *controller;
+	gxio_trio_context_t *trio_context;
+	struct msi_msg msg;
+	int default_irq;
+	uint64_t mem_map_base;
+	uint64_t mem_map_limit;
+	u64 msi_addr;
+	int mem_map;
+	int cpu;
+	int irq;
+	int ret;
+
+	irq = irq_alloc_hwirq(-1);
+	if (!irq)
+		return -ENOSPC;
+
+	/*
+	 * Since we use a 64-bit Mem-Map to accept the MSI write, we fail
+	 * devices that are not capable of generating a 64-bit message address.
+	 * These devices will fall back to using the legacy interrupts.
+	 * Most PCIe endpoint devices do support 64-bit message addressing.
+	 */
+	if (desc->msi_attrib.is_64 == 0) {
+		dev_printk(KERN_INFO, &pdev->dev,
+			"64-bit MSI message address not supported, "
+			"falling back to legacy interrupts.\n");
+
+		ret = -ENOMEM;
+		goto is_64_failure;
+	}
+
+	default_irq = desc->msi_attrib.default_irq;
+	controller = irq_get_handler_data(default_irq);
+
+	BUG_ON(!controller);
+
+	trio_context = controller->trio;
+
+	/*
+	 * Allocate a scatter-queue that will accept the MSI write and
+	 * trigger the TILE-side interrupts. We use the scatter-queue regions
+	 * before the mem map regions, because the latter are needed by more
+	 * applications.
+	 */
+	mem_map = gxio_trio_alloc_scatter_queues(trio_context, 1, 0, 0);
+	if (mem_map >= 0) {
+		TRIO_MAP_SQ_DOORBELL_FMT_t doorbell_template = {{
+			.pop = 0,
+			.doorbell = 1,
+		}};
+
+		mem_map += TRIO_NUM_MAP_MEM_REGIONS;
+		mem_map_base = MEM_MAP_INTR_REGIONS_BASE +
+			mem_map * MEM_MAP_INTR_REGION_SIZE;
+		mem_map_limit = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 1;
+
+		msi_addr = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 8;
+		msg.data = (unsigned int)doorbell_template.word;
+	} else {
+		/* SQ regions are out, allocate from map mem regions. */
+		mem_map = gxio_trio_alloc_memory_maps(trio_context, 1, 0, 0);
+		if (mem_map < 0) {
+			dev_printk(KERN_INFO, &pdev->dev,
+				"%s Mem-Map alloc failure. "
+				"Failed to initialize MSI interrupts. "
+				"Falling back to legacy interrupts.\n",
+				desc->msi_attrib.is_msix ? "MSI-X" : "MSI");
+			ret = -ENOMEM;
+			goto msi_mem_map_alloc_failure;
+		}
+
+		mem_map_base = MEM_MAP_INTR_REGIONS_BASE +
+			mem_map * MEM_MAP_INTR_REGION_SIZE;
+		mem_map_limit = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 1;
+
+		msi_addr = mem_map_base + TRIO_MAP_MEM_REG_INT3 -
+			TRIO_MAP_MEM_REG_INT0;
+
+		msg.data = mem_map;
+	}
+
+	/* We try to distribute different IRQs to different tiles. */
+	cpu = tile_irq_cpu(irq);
+
+	/*
+	 * Now call up to the HV to configure the MSI interrupt and
+	 * set up the IPI binding.
+	 */
+	ret = gxio_trio_config_msi_intr(trio_context, cpu_x(cpu), cpu_y(cpu),
+					KERNEL_PL, irq, controller->mac,
+					mem_map, mem_map_base, mem_map_limit,
+					trio_context->asid);
+	if (ret < 0) {
+		dev_printk(KERN_INFO, &pdev->dev, "HV MSI config failed.\n");
+
+		goto hv_msi_config_failure;
+	}
+
+	irq_set_msi_desc(irq, desc);
+
+	msg.address_hi = msi_addr >> 32;
+	msg.address_lo = msi_addr & 0xffffffff;
+
+	write_msi_msg(irq, &msg);
+	irq_set_chip_and_handler(irq, &tilegx_msi_chip, handle_level_irq);
+	irq_set_handler_data(irq, controller);
+
+	return 0;
+
+hv_msi_config_failure:
+	/* Free mem-map */
+msi_mem_map_alloc_failure:
+is_64_failure:
+	irq_free_hwirq(irq);
+	return ret;
+}
+
+void arch_teardown_msi_irq(unsigned int irq)
+{
+	irq_free_hwirq(irq);
+}
diff --git a/arch/tile/kernel/perf_event.c b/arch/tile/kernel/perf_event.c
new file mode 100644
index 00000000000..2bf6c9c135c
--- /dev/null
+++ b/arch/tile/kernel/perf_event.c
@@ -0,0 +1,1005 @@
+/*
+ * Copyright 2014 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ *
+ * Perf_events support for Tile processor.
+ *
+ * This code is based upon the x86 perf event
+ * code, which is:
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ *  Copyright (C) 2009 Google, Inc., Stephane Eranian
+ */
+
+#include <linux/kprobes.h>
+#include <linux/kernel.h>
+#include <linux/kdebug.h>
+#include <linux/mutex.h>
+#include <linux/bitmap.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/perf_event.h>
+#include <linux/atomic.h>
+#include <asm/traps.h>
+#include <asm/stack.h>
+#include <asm/pmc.h>
+#include <hv/hypervisor.h>
+
+#define TILE_MAX_COUNTERS	4
+
+#define PERF_COUNT_0_IDX	0
+#define PERF_COUNT_1_IDX	1
+#define AUX_PERF_COUNT_0_IDX	2
+#define AUX_PERF_COUNT_1_IDX	3
+
+struct cpu_hw_events {
+	int			n_events;
+	struct perf_event	*events[TILE_MAX_COUNTERS]; /* counter order */
+	struct perf_event	*event_list[TILE_MAX_COUNTERS]; /* enabled
+								order */
+	int			assign[TILE_MAX_COUNTERS];
+	unsigned long		active_mask[BITS_TO_LONGS(TILE_MAX_COUNTERS)];
+	unsigned long		used_mask;
+};
+
+/* TILE arch specific performance monitor unit */
+struct tile_pmu {
+	const char	*name;
+	int		version;
+	const int	*hw_events;	/* generic hw events table */
+	/* generic hw cache events table */
+	const int	(*cache_events)[PERF_COUNT_HW_CACHE_MAX]
+				       [PERF_COUNT_HW_CACHE_OP_MAX]
+				       [PERF_COUNT_HW_CACHE_RESULT_MAX];
+	int		(*map_hw_event)(u64);	 /*method used to map
+						  hw events */
+	int		(*map_cache_event)(u64); /*method used to map
+						  cache events */
+
+	u64		max_period;		/* max sampling period */
+	u64		cntval_mask;		/* counter width mask */
+	int		cntval_bits;		/* counter width */
+	int		max_events;		/* max generic hw events
+						in map */
+	int		num_counters;		/* number base + aux counters */
+	int		num_base_counters;	/* number base counters */
+};
+
+DEFINE_PER_CPU(u64, perf_irqs);
+static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
+
+#define TILE_OP_UNSUPP		(-1)
+
+#ifndef __tilegx__
+/* TILEPro hardware events map */
+static const int tile_hw_event_map[] = {
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0x01, /* ONE */
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x06, /* MP_BUNDLE_RETIRED */
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= TILE_OP_UNSUPP,
+	[PERF_COUNT_HW_CACHE_MISSES]		= TILE_OP_UNSUPP,
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x16, /*
+					  MP_CONDITIONAL_BRANCH_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x14, /*
+					  MP_CONDITIONAL_BRANCH_MISSPREDICT */
+	[PERF_COUNT_HW_BUS_CYCLES]		= TILE_OP_UNSUPP,
+};
+#else
+/* TILEGx hardware events map */
+static const int tile_hw_event_map[] = {
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0x181, /* ONE */
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0xdb, /* INSTRUCTION_BUNDLE */
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= TILE_OP_UNSUPP,
+	[PERF_COUNT_HW_CACHE_MISSES]		= TILE_OP_UNSUPP,
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0xd9, /*
+						COND_BRANCH_PRED_CORRECT */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0xda, /*
+						COND_BRANCH_PRED_INCORRECT */
+	[PERF_COUNT_HW_BUS_CYCLES]		= TILE_OP_UNSUPP,
+};
+#endif
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Generalized hw caching related hw_event table, filled
+ * in on a per model basis. A value of -1 means
+ * 'not supported', any other value means the
+ * raw hw_event ID.
+ */
+#ifndef __tilegx__
+/* TILEPro hardware cache event map */
+static const int tile_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
+				     [PERF_COUNT_HW_CACHE_OP_MAX]
+				     [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = 0x21, /* RD_MISS */
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = 0x22, /* WR_MISS */
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x12, /* MP_ICACHE_HIT_ISSUED */
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x1d, /* TLB_CNT */
+		[C(RESULT_MISS)] = 0x20, /* TLB_EXCEPTION */
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x13, /* MP_ITLB_HIT_ISSUED */
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+},
+};
+#else
+/* TILEGx hardware events map */
+static const int tile_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
+				     [PERF_COUNT_HW_CACHE_OP_MAX]
+				     [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+	/*
+	 * Like some other architectures (e.g. ARM), the performance
+	 * counters don't differentiate between read and write
+	 * accesses/misses, so this isn't strictly correct, but it's the
+	 * best we can do. Writes and reads get combined.
+	 */
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = 0x44, /* RD_MISS */
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = 0x45, /* WR_MISS */
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = 0x40, /* TLB_CNT */
+		[C(RESULT_MISS)] = 0x43, /* TLB_EXCEPTION */
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = 0x40, /* TLB_CNT */
+		[C(RESULT_MISS)] = 0x43, /* TLB_EXCEPTION */
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = 0xd4, /* ITLB_MISS_INT */
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = 0xd4, /* ITLB_MISS_INT */
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
+		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
+	},
+},
+};
+#endif
+
+static atomic_t tile_active_events;
+static DEFINE_MUTEX(perf_intr_reserve_mutex);
+
+static int tile_map_hw_event(u64 config);
+static int tile_map_cache_event(u64 config);
+
+static int tile_pmu_handle_irq(struct pt_regs *regs, int fault);
+
+/*
+ * To avoid new_raw_count getting larger then pre_raw_count
+ * in tile_perf_event_update(), we limit the value of max_period to 2^31 - 1.
+ */
+static const struct tile_pmu tilepmu = {
+#ifndef __tilegx__
+	.name = "tilepro",
+#else
+	.name = "tilegx",
+#endif
+	.max_events = ARRAY_SIZE(tile_hw_event_map),
+	.map_hw_event = tile_map_hw_event,
+	.hw_events = tile_hw_event_map,
+	.map_cache_event = tile_map_cache_event,
+	.cache_events = &tile_cache_event_map,
+	.cntval_bits = 32,
+	.cntval_mask = (1ULL << 32) - 1,
+	.max_period = (1ULL << 31) - 1,
+	.num_counters = TILE_MAX_COUNTERS,
+	.num_base_counters = TILE_BASE_COUNTERS,
+};
+
+static const struct tile_pmu *tile_pmu __read_mostly;
+
+/*
+ * Check whether perf event is enabled.
+ */
+int tile_perf_enabled(void)
+{
+	return atomic_read(&tile_active_events) != 0;
+}
+
+/*
+ * Read Performance Counters.
+ */
+static inline u64 read_counter(int idx)
+{
+	u64 val = 0;
+
+	/* __insn_mfspr() only takes an immediate argument */
+	switch (idx) {
+	case PERF_COUNT_0_IDX:
+		val = __insn_mfspr(SPR_PERF_COUNT_0);
+		break;
+	case PERF_COUNT_1_IDX:
+		val = __insn_mfspr(SPR_PERF_COUNT_1);
+		break;
+	case AUX_PERF_COUNT_0_IDX:
+		val = __insn_mfspr(SPR_AUX_PERF_COUNT_0);
+		break;
+	case AUX_PERF_COUNT_1_IDX:
+		val = __insn_mfspr(SPR_AUX_PERF_COUNT_1);
+		break;
+	default:
+		WARN_ON_ONCE(idx > AUX_PERF_COUNT_1_IDX ||
+				idx < PERF_COUNT_0_IDX);
+	}
+
+	return val;
+}
+
+/*
+ * Write Performance Counters.
+ */
+static inline void write_counter(int idx, u64 value)
+{
+	/* __insn_mtspr() only takes an immediate argument */
+	switch (idx) {
+	case PERF_COUNT_0_IDX:
+		__insn_mtspr(SPR_PERF_COUNT_0, value);
+		break;
+	case PERF_COUNT_1_IDX:
+		__insn_mtspr(SPR_PERF_COUNT_1, value);
+		break;
+	case AUX_PERF_COUNT_0_IDX:
+		__insn_mtspr(SPR_AUX_PERF_COUNT_0, value);
+		break;
+	case AUX_PERF_COUNT_1_IDX:
+		__insn_mtspr(SPR_AUX_PERF_COUNT_1, value);
+		break;
+	default:
+		WARN_ON_ONCE(idx > AUX_PERF_COUNT_1_IDX ||
+				idx < PERF_COUNT_0_IDX);
+	}
+}
+
+/*
+ * Enable performance event by setting
+ * Performance Counter Control registers.
+ */
+static inline void tile_pmu_enable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned long cfg, mask;
+	int shift, idx = hwc->idx;
+
+	/*
+	 * prevent early activation from tile_pmu_start() in hw_perf_enable
+	 */
+
+	if (WARN_ON_ONCE(idx == -1))
+		return;
+
+	if (idx < tile_pmu->num_base_counters)
+		cfg = __insn_mfspr(SPR_PERF_COUNT_CTL);
+	else
+		cfg = __insn_mfspr(SPR_AUX_PERF_COUNT_CTL);
+
+	switch (idx) {
+	case PERF_COUNT_0_IDX:
+	case AUX_PERF_COUNT_0_IDX:
+		mask = TILE_EVENT_MASK;
+		shift = 0;
+		break;
+	case PERF_COUNT_1_IDX:
+	case AUX_PERF_COUNT_1_IDX:
+		mask = TILE_EVENT_MASK << 16;
+		shift = 16;
+		break;
+	default:
+		WARN_ON_ONCE(idx < PERF_COUNT_0_IDX ||
+			idx > AUX_PERF_COUNT_1_IDX);
+		return;
+	}
+
+	/* Clear mask bits to enable the event. */
+	cfg &= ~mask;
+	cfg |= hwc->config << shift;
+
+	if (idx < tile_pmu->num_base_counters)
+		__insn_mtspr(SPR_PERF_COUNT_CTL, cfg);
+	else
+		__insn_mtspr(SPR_AUX_PERF_COUNT_CTL, cfg);
+}
+
+/*
+ * Disable performance event by clearing
+ * Performance Counter Control registers.
+ */
+static inline void tile_pmu_disable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned long cfg, mask;
+	int idx = hwc->idx;
+
+	if (idx == -1)
+		return;
+
+	if (idx < tile_pmu->num_base_counters)
+		cfg = __insn_mfspr(SPR_PERF_COUNT_CTL);
+	else
+		cfg = __insn_mfspr(SPR_AUX_PERF_COUNT_CTL);
+
+	switch (idx) {
+	case PERF_COUNT_0_IDX:
+	case AUX_PERF_COUNT_0_IDX:
+		mask = TILE_PLM_MASK;
+		break;
+	case PERF_COUNT_1_IDX:
+	case AUX_PERF_COUNT_1_IDX:
+		mask = TILE_PLM_MASK << 16;
+		break;
+	default:
+		WARN_ON_ONCE(idx < PERF_COUNT_0_IDX ||
+			idx > AUX_PERF_COUNT_1_IDX);
+		return;
+	}
+
+	/* Set mask bits to disable the event. */
+	cfg |= mask;
+
+	if (idx < tile_pmu->num_base_counters)
+		__insn_mtspr(SPR_PERF_COUNT_CTL, cfg);
+	else
+		__insn_mtspr(SPR_AUX_PERF_COUNT_CTL, cfg);
+}
+
+/*
+ * Propagate event elapsed time into the generic event.
+ * Can only be executed on the CPU where the event is active.
+ * Returns the delta events processed.
+ */
+static u64 tile_perf_event_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int shift = 64 - tile_pmu->cntval_bits;
+	u64 prev_raw_count, new_raw_count;
+	u64 oldval;
+	int idx = hwc->idx;
+	u64 delta;
+
+	/*
+	 * Careful: an NMI might modify the previous event value.
+	 *
+	 * Our tactic to handle this is to first atomically read and
+	 * exchange a new raw count - then add that new-prev delta
+	 * count to the generic event atomically:
+	 */
+again:
+	prev_raw_count = local64_read(&hwc->prev_count);
+	new_raw_count = read_counter(idx);
+
+	oldval = local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+				 new_raw_count);
+	if (oldval != prev_raw_count)
+		goto again;
+
+	/*
+	 * Now we have the new raw value and have updated the prev
+	 * timestamp already. We can now calculate the elapsed delta
+	 * (event-)time and add that to the generic event.
+	 *
+	 * Careful, not all hw sign-extends above the physical width
+	 * of the count.
+	 */
+	delta = (new_raw_count << shift) - (prev_raw_count << shift);
+	delta >>= shift;
+
+	local64_add(delta, &event->count);
+	local64_sub(delta, &hwc->period_left);
+
+	return new_raw_count;
+}
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the event disabled in hw:
+ */
+static int tile_event_set_period(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+	s64 left = local64_read(&hwc->period_left);
+	s64 period = hwc->sample_period;
+	int ret = 0;
+
+	/*
+	 * If we are way outside a reasonable range then just skip forward:
+	 */
+	if (unlikely(left <= -period)) {
+		left = period;
+		local64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		local64_set(&hwc->period_left, left);
+		hwc->last_period = period;
+		ret = 1;
+	}
+	if (left > tile_pmu->max_period)
+		left = tile_pmu->max_period;
+
+	/*
+	 * The hw event starts counting from this event offset,
+	 * mark it to be able to extra future deltas:
+	 */
+	local64_set(&hwc->prev_count, (u64)-left);
+
+	write_counter(idx, (u64)(-left) & tile_pmu->cntval_mask);
+
+	perf_event_update_userpage(event);
+
+	return ret;
+}
+
+/*
+ * Stop the event but do not release the PMU counter
+ */
+static void tile_pmu_stop(struct perf_event *event, int flags)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	if (__test_and_clear_bit(idx, cpuc->active_mask)) {
+		tile_pmu_disable_event(event);
+		cpuc->events[hwc->idx] = NULL;
+		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+		hwc->state |= PERF_HES_STOPPED;
+	}
+
+	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+		/*
+		 * Drain the remaining delta count out of a event
+		 * that we are disabling:
+		 */
+		tile_perf_event_update(event);
+		hwc->state |= PERF_HES_UPTODATE;
+	}
+}
+
+/*
+ * Start an event (without re-assigning counter)
+ */
+static void tile_pmu_start(struct perf_event *event, int flags)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int idx = event->hw.idx;
+
+	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+		return;
+
+	if (WARN_ON_ONCE(idx == -1))
+		return;
+
+	if (flags & PERF_EF_RELOAD) {
+		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+		tile_event_set_period(event);
+	}
+
+	event->hw.state = 0;
+
+	cpuc->events[idx] = event;
+	__set_bit(idx, cpuc->active_mask);
+
+	unmask_pmc_interrupts();
+
+	tile_pmu_enable_event(event);
+
+	perf_event_update_userpage(event);
+}
+
+/*
+ * Add a single event to the PMU.
+ *
+ * The event is added to the group of enabled events
+ * but only if it can be scehduled with existing events.
+ */
+static int tile_pmu_add(struct perf_event *event, int flags)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc;
+	unsigned long mask;
+	int b, max_cnt;
+
+	hwc = &event->hw;
+
+	/*
+	 * We are full.
+	 */
+	if (cpuc->n_events == tile_pmu->num_counters)
+		return -ENOSPC;
+
+	cpuc->event_list[cpuc->n_events] = event;
+	cpuc->n_events++;
+
+	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+	if (!(flags & PERF_EF_START))
+		hwc->state |= PERF_HES_ARCH;
+
+	/*
+	 * Find first empty counter.
+	 */
+	max_cnt = tile_pmu->num_counters;
+	mask = ~cpuc->used_mask;
+
+	/* Find next free counter. */
+	b = find_next_bit(&mask, max_cnt, 0);
+
+	/* Should not happen. */
+	if (WARN_ON_ONCE(b == max_cnt))
+		return -ENOSPC;
+
+	/*
+	 * Assign counter to event.
+	 */
+	event->hw.idx = b;
+	__set_bit(b, &cpuc->used_mask);
+
+	/*
+	 * Start if requested.
+	 */
+	if (flags & PERF_EF_START)
+		tile_pmu_start(event, PERF_EF_RELOAD);
+
+	return 0;
+}
+
+/*
+ * Delete a single event from the PMU.
+ *
+ * The event is deleted from the group of enabled events.
+ * If it is the last event, disable PMU interrupt.
+ */
+static void tile_pmu_del(struct perf_event *event, int flags)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int i;
+
+	/*
+	 * Remove event from list, compact list if necessary.
+	 */
+	for (i = 0; i < cpuc->n_events; i++) {
+		if (cpuc->event_list[i] == event) {
+			while (++i < cpuc->n_events)
+				cpuc->event_list[i-1] = cpuc->event_list[i];
+			--cpuc->n_events;
+			cpuc->events[event->hw.idx] = NULL;
+			__clear_bit(event->hw.idx, &cpuc->used_mask);
+			tile_pmu_stop(event, PERF_EF_UPDATE);
+			break;
+		}
+	}
+	/*
+	 * If there are no events left, then mask PMU interrupt.
+	 */
+	if (cpuc->n_events == 0)
+		mask_pmc_interrupts();
+	perf_event_update_userpage(event);
+}
+
+/*
+ * Propagate event elapsed time into the event.
+ */
+static inline void tile_pmu_read(struct perf_event *event)
+{
+	tile_perf_event_update(event);
+}
+
+/*
+ * Map generic events to Tile PMU.
+ */
+static int tile_map_hw_event(u64 config)
+{
+	if (config >= tile_pmu->max_events)
+		return -EINVAL;
+	return tile_pmu->hw_events[config];
+}
+
+/*
+ * Map generic hardware cache events to Tile PMU.
+ */
+static int tile_map_cache_event(u64 config)
+{
+	unsigned int cache_type, cache_op, cache_result;
+	int code;
+
+	if (!tile_pmu->cache_events)
+		return -ENOENT;
+
+	cache_type = (config >>  0) & 0xff;
+	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+		return -EINVAL;
+
+	cache_op = (config >>  8) & 0xff;
+	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+		return -EINVAL;
+
+	cache_result = (config >> 16) & 0xff;
+	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		return -EINVAL;
+
+	code = (*tile_pmu->cache_events)[cache_type][cache_op][cache_result];
+	if (code == TILE_OP_UNSUPP)
+		return -EINVAL;
+
+	return code;
+}
+
+static void tile_event_destroy(struct perf_event *event)
+{
+	if (atomic_dec_return(&tile_active_events) == 0)
+		release_pmc_hardware();
+}
+
+static int __tile_event_init(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	struct hw_perf_event *hwc = &event->hw;
+	int code;
+
+	switch (attr->type) {
+	case PERF_TYPE_HARDWARE:
+		code = tile_pmu->map_hw_event(attr->config);
+		break;
+	case PERF_TYPE_HW_CACHE:
+		code = tile_pmu->map_cache_event(attr->config);
+		break;
+	case PERF_TYPE_RAW:
+		code = attr->config & TILE_EVENT_MASK;
+		break;
+	default:
+		/* Should not happen. */
+		return -EOPNOTSUPP;
+	}
+
+	if (code < 0)
+		return code;
+
+	hwc->config = code;
+	hwc->idx = -1;
+
+	if (attr->exclude_user)
+		hwc->config |= TILE_CTL_EXCL_USER;
+
+	if (attr->exclude_kernel)
+		hwc->config |= TILE_CTL_EXCL_KERNEL;
+
+	if (attr->exclude_hv)
+		hwc->config |= TILE_CTL_EXCL_HV;
+
+	if (!hwc->sample_period) {
+		hwc->sample_period = tile_pmu->max_period;
+		hwc->last_period = hwc->sample_period;
+		local64_set(&hwc->period_left, hwc->sample_period);
+	}
+	event->destroy = tile_event_destroy;
+	return 0;
+}
+
+static int tile_event_init(struct perf_event *event)
+{
+	int err = 0;
+	perf_irq_t old_irq_handler = NULL;
+
+	if (atomic_inc_return(&tile_active_events) == 1)
+		old_irq_handler = reserve_pmc_hardware(tile_pmu_handle_irq);
+
+	if (old_irq_handler) {
+		pr_warn("PMC hardware busy (reserved by oprofile)\n");
+
+		atomic_dec(&tile_active_events);
+		return -EBUSY;
+	}
+
+	switch (event->attr.type) {
+	case PERF_TYPE_RAW:
+	case PERF_TYPE_HARDWARE:
+	case PERF_TYPE_HW_CACHE:
+		break;
+
+	default:
+		return -ENOENT;
+	}
+
+	err = __tile_event_init(event);
+	if (err) {
+		if (event->destroy)
+			event->destroy(event);
+	}
+	return err;
+}
+
+static struct pmu tilera_pmu = {
+	.event_init	= tile_event_init,
+	.add		= tile_pmu_add,
+	.del		= tile_pmu_del,
+
+	.start		= tile_pmu_start,
+	.stop		= tile_pmu_stop,
+
+	.read		= tile_pmu_read,
+};
+
+/*
+ * PMU's IRQ handler, PMU has 2 interrupts, they share the same handler.
+ */
+int tile_pmu_handle_irq(struct pt_regs *regs, int fault)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct perf_event *event;
+	struct hw_perf_event *hwc;
+	u64 val;
+	unsigned long status;
+	int bit;
+
+	__get_cpu_var(perf_irqs)++;
+
+	if (!atomic_read(&tile_active_events))
+		return 0;
+
+	status = pmc_get_overflow();
+	pmc_ack_overflow(status);
+
+	for_each_set_bit(bit, &status, tile_pmu->num_counters) {
+
+		event = cpuc->events[bit];
+
+		if (!event)
+			continue;
+
+		if (!test_bit(bit, cpuc->active_mask))
+			continue;
+
+		hwc = &event->hw;
+
+		val = tile_perf_event_update(event);
+		if (val & (1ULL << (tile_pmu->cntval_bits - 1)))
+			continue;
+
+		perf_sample_data_init(&data, 0, event->hw.last_period);
+		if (!tile_event_set_period(event))
+			continue;
+
+		if (perf_event_overflow(event, &data, regs))
+			tile_pmu_stop(event, 0);
+	}
+
+	return 0;
+}
+
+static bool __init supported_pmu(void)
+{
+	tile_pmu = &tilepmu;
+	return true;
+}
+
+int __init init_hw_perf_events(void)
+{
+	supported_pmu();
+	perf_pmu_register(&tilera_pmu, "cpu", PERF_TYPE_RAW);
+	return 0;
+}
+arch_initcall(init_hw_perf_events);
+
+/* Callchain handling code. */
+
+/*
+ * Tile specific backtracing code for perf_events.
+ */
+static inline void perf_callchain(struct perf_callchain_entry *entry,
+		    struct pt_regs *regs)
+{
+	struct KBacktraceIterator kbt;
+	unsigned int i;
+
+	/*
+	 * Get the address just after the "jalr" instruction that
+	 * jumps to the handler for a syscall.  When we find this
+	 * address in a backtrace, we silently ignore it, which gives
+	 * us a one-step backtrace connection from the sys_xxx()
+	 * function in the kernel to the xxx() function in libc.
+	 * Otherwise, we lose the ability to properly attribute time
+	 * from the libc calls to the kernel implementations, since
+	 * oprofile only considers PCs from backtraces a pair at a time.
+	 */
+	unsigned long handle_syscall_pc = handle_syscall_link_address();
+
+	KBacktraceIterator_init(&kbt, NULL, regs);
+	kbt.profile = 1;
+
+	/*
+	 * The sample for the pc is already recorded.  Now we are adding the
+	 * address of the callsites on the stack.  Our iterator starts
+	 * with the frame of the (already sampled) call site.  If our
+	 * iterator contained a "return address" field, we could have just
+	 * used it and wouldn't have needed to skip the first
+	 * frame.  That's in effect what the arm and x86 versions do.
+	 * Instead we peel off the first iteration to get the equivalent
+	 * behavior.
+	 */
+
+	if (KBacktraceIterator_end(&kbt))
+		return;
+	KBacktraceIterator_next(&kbt);
+
+	/*
+	 * Set stack depth to 16 for user and kernel space respectively, that
+	 * is, total 32 stack frames.
+	 */
+	for (i = 0; i < 16; ++i) {
+		unsigned long pc;
+		if (KBacktraceIterator_end(&kbt))
+			break;
+		pc = kbt.it.pc;
+		if (pc != handle_syscall_pc)
+			perf_callchain_store(entry, pc);
+		KBacktraceIterator_next(&kbt);
+	}
+}
+
+void perf_callchain_user(struct perf_callchain_entry *entry,
+		    struct pt_regs *regs)
+{
+	perf_callchain(entry, regs);
+}
+
+void perf_callchain_kernel(struct perf_callchain_entry *entry,
+		      struct pt_regs *regs)
+{
+	perf_callchain(entry, regs);
+}
diff --git a/arch/tile/kernel/pmc.c b/arch/tile/kernel/pmc.c
new file mode 100644
index 00000000000..db62cc34b95
--- /dev/null
+++ b/arch/tile/kernel/pmc.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2014 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/atomic.h>
+#include <linux/interrupt.h>
+
+#include <asm/processor.h>
+#include <asm/pmc.h>
+
+perf_irq_t perf_irq = NULL;
+int handle_perf_interrupt(struct pt_regs *regs, int fault)
+{
+	int retval;
+
+	if (!perf_irq)
+		panic("Unexpected PERF_COUNT interrupt %d\n", fault);
+
+	nmi_enter();
+	retval = perf_irq(regs, fault);
+	nmi_exit();
+	return retval;
+}
+
+/* Reserve PMC hardware if it is available. */
+perf_irq_t reserve_pmc_hardware(perf_irq_t new_perf_irq)
+{
+	return cmpxchg(&perf_irq, NULL, new_perf_irq);
+}
+EXPORT_SYMBOL(reserve_pmc_hardware);
+
+/* Release PMC hardware. */
+void release_pmc_hardware(void)
+{
+	perf_irq = NULL;
+}
+EXPORT_SYMBOL(release_pmc_hardware);
+
+
+/*
+ * Get current overflow status of each performance counter,
+ * and auxiliary performance counter.
+ */
+unsigned long
+pmc_get_overflow(void)
+{
+	unsigned long status;
+
+	/*
+	 * merge base+aux into a single vector
+	 */
+	status = __insn_mfspr(SPR_PERF_COUNT_STS);
+	status |= __insn_mfspr(SPR_AUX_PERF_COUNT_STS) << TILE_BASE_COUNTERS;
+	return status;
+}
+
+/*
+ * Clear the status bit for the corresponding counter, if written
+ * with a one.
+ */
+void
+pmc_ack_overflow(unsigned long status)
+{
+	/*
+	 * clear overflow status by writing ones
+	 */
+	__insn_mtspr(SPR_PERF_COUNT_STS, status);
+	__insn_mtspr(SPR_AUX_PERF_COUNT_STS, status >> TILE_BASE_COUNTERS);
+}
+
+/*
+ * The perf count interrupts are masked and unmasked explicitly,
+ * and only here.  The normal irq_enable() does not enable them,
+ * and irq_disable() does not disable them.  That lets these
+ * routines drive the perf count interrupts orthogonally.
+ *
+ * We also mask the perf count interrupts on entry to the perf count
+ * interrupt handler in assembly code, and by default unmask them
+ * again (with interrupt critical section protection) just before
+ * returning from the interrupt.  If the perf count handler returns
+ * a non-zero error code, then we don't re-enable them before returning.
+ *
+ * For Pro, we rely on both interrupts being in the same word to update
+ * them atomically so we never have one enabled and one disabled.
+ */
+
+#if CHIP_HAS_SPLIT_INTR_MASK()
+# if INT_PERF_COUNT < 32 || INT_AUX_PERF_COUNT < 32
+#  error Fix assumptions about which word PERF_COUNT interrupts are in
+# endif
+#endif
+
+static inline unsigned long long pmc_mask(void)
+{
+	unsigned long long mask = 1ULL << INT_PERF_COUNT;
+	mask |= 1ULL << INT_AUX_PERF_COUNT;
+	return mask;
+}
+
+void unmask_pmc_interrupts(void)
+{
+	interrupt_mask_reset_mask(pmc_mask());
+}
+
+void mask_pmc_interrupts(void)
+{
+	interrupt_mask_set_mask(pmc_mask());
+}
diff --git a/arch/tile/kernel/proc.c b/arch/tile/kernel/proc.c
index 62d820833c6..6829a950864 100644
--- a/arch/tile/kernel/proc.c
+++ b/arch/tile/kernel/proc.c
@@ -22,7 +22,9 @@
 #include <linux/proc_fs.h>
 #include <linux/sysctl.h>
 #include <linux/hardirq.h>
+#include <linux/hugetlb.h>
 #include <linux/mman.h>
+#include <asm/unaligned.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/sections.h>
@@ -111,8 +113,7 @@ arch_initcall(proc_tile_init);
  * Support /proc/sys/tile directory
  */
 
-#ifndef __tilegx__  /* FIXME: GX: no support for unaligned access yet */
-static ctl_table unaligned_subtable[] = {
+static struct ctl_table unaligned_subtable[] = {
 	{
 		.procname	= "enabled",
 		.data		= &unaligned_fixup,
@@ -137,7 +138,7 @@ static ctl_table unaligned_subtable[] = {
 	{}
 };
 
-static ctl_table unaligned_table[] = {
+static struct ctl_table unaligned_table[] = {
 	{
 		.procname	= "unaligned_fixup",
 		.mode		= 0555,
@@ -145,7 +146,6 @@ static ctl_table unaligned_table[] = {
 	},
 	{}
 };
-#endif
 
 static struct ctl_path tile_path[] = {
 	{ .procname = "tile" },
@@ -154,9 +154,7 @@ static struct ctl_path tile_path[] = {
 
 static int __init proc_sys_tile_init(void)
 {
-#ifndef __tilegx__  /* FIXME: GX: no support for unaligned access yet */
 	register_sysctl_paths(tile_path, unaligned_table);
-#endif
 	return 0;
 }
 
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 4c1ac6e5347..16ed5894875 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -27,24 +27,25 @@
 #include <linux/kernel.h>
 #include <linux/tracehook.h>
 #include <linux/signal.h>
-#include <asm/system.h>
 #include <asm/stack.h>
+#include <asm/switch_to.h>
 #include <asm/homecache.h>
 #include <asm/syscalls.h>
 #include <asm/traps.h>
+#include <asm/setup.h>
+#include <asm/uaccess.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
 #include <arch/chip.h>
 #include <arch/abi.h>
-
+#include <arch/sim_def.h>
 
 /*
  * Use the (x86) "idle=poll" option to prefer low latency when leaving the
  * idle loop over low power while in the idle loop, e.g. if we have
  * one thread per core and we want to get threads out of futex waits fast.
  */
-static int no_idle_nap;
 static int __init idle_setup(char *str)
 {
 	if (!str)
@@ -52,105 +53,28 @@ static int __init idle_setup(char *str)
 
 	if (!strcmp(str, "poll")) {
 		pr_info("using polling idle threads.\n");
-		no_idle_nap = 1;
-	} else if (!strcmp(str, "halt"))
-		no_idle_nap = 0;
-	else
-		return -1;
-
-	return 0;
-}
-early_param("idle", idle_setup);
-
-/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
- */
-void cpu_idle(void)
-{
-	int cpu = smp_processor_id();
-
-
-	current_thread_info()->status |= TS_POLLING;
-
-	if (no_idle_nap) {
-		while (1) {
-			while (!need_resched())
-				cpu_relax();
-			schedule();
-		}
-	}
-
-	/* endless idle loop with no priority at all */
-	while (1) {
-		tick_nohz_idle_enter();
-		rcu_idle_enter();
-		while (!need_resched()) {
-			if (cpu_is_offline(cpu))
-				BUG();  /* no HOTPLUG_CPU */
-
-			local_irq_disable();
-			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
-			current_thread_info()->status &= ~TS_POLLING;
-			/*
-			 * TS_POLLING-cleared state must be visible before we
-			 * test NEED_RESCHED:
-			 */
-			smp_mb();
-
-			if (!need_resched())
-				_cpu_idle();
-			else
-				local_irq_enable();
-			current_thread_info()->status |= TS_POLLING;
-		}
-		rcu_idle_exit();
-		tick_nohz_idle_exit();
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
+		cpu_idle_poll_ctrl(true);
+		return 0;
+	} else if (!strcmp(str, "halt")) {
+		return 0;
 	}
+	return -1;
 }
+early_param("idle", idle_setup);
 
-struct thread_info *alloc_thread_info_node(struct task_struct *task, int node)
+void arch_cpu_idle(void)
 {
-	struct page *page;
-	gfp_t flags = GFP_KERNEL;
-
-#ifdef CONFIG_DEBUG_STACK_USAGE
-	flags |= __GFP_ZERO;
-#endif
-
-	page = alloc_pages_node(node, flags, THREAD_SIZE_ORDER);
-	if (!page)
-		return NULL;
-
-	return (struct thread_info *)page_address(page);
+	__get_cpu_var(irq_stat).idle_timestamp = jiffies;
+	_cpu_idle();
 }
 
 /*
- * Free a thread_info node, and all of its derivative
- * data structures.
+ * Release a thread_info structure
  */
-void free_thread_info(struct thread_info *info)
+void arch_release_thread_info(struct thread_info *info)
 {
 	struct single_step_state *step_state = info->step_state;
 
-#ifdef CONFIG_HARDWALL
-	/*
-	 * We free a thread_info from the context of the task that has
-	 * been scheduled next, so the original task is already dead.
-	 * Calling deactivate here just frees up the data structures.
-	 * If the task we're freeing held the last reference to a
-	 * hardwall fd, it would have been released prior to this point
-	 * anyway via exit_files(), and "hardwall" would be NULL by now.
-	 */
-	if (info->task->thread.hardwall)
-		hardwall_deactivate(info->task);
-#endif
-
 	if (step_state) {
 
 		/*
@@ -169,25 +93,54 @@ void free_thread_info(struct thread_info *info)
 		 */
 		kfree(step_state);
 	}
-
-	free_pages((unsigned long)info, THREAD_SIZE_ORDER);
 }
 
 static void save_arch_state(struct thread_struct *t);
 
 int copy_thread(unsigned long clone_flags, unsigned long sp,
-		unsigned long stack_size,
-		struct task_struct *p, struct pt_regs *regs)
+		unsigned long arg, struct task_struct *p)
 {
-	struct pt_regs *childregs;
+	struct pt_regs *childregs = task_pt_regs(p);
 	unsigned long ksp;
+	unsigned long *callee_regs;
+
+	/*
+	 * Set up the stack and stack pointer appropriately for the
+	 * new child to find itself woken up in __switch_to().
+	 * The callee-saved registers must be on the stack to be read;
+	 * the new task will then jump to assembly support to handle
+	 * calling schedule_tail(), etc., and (for userspace tasks)
+	 * returning to the context set up in the pt_regs.
+	 */
+	ksp = (unsigned long) childregs;
+	ksp -= C_ABI_SAVE_AREA_SIZE;   /* interrupt-entry save area */
+	((long *)ksp)[0] = ((long *)ksp)[1] = 0;
+	ksp -= CALLEE_SAVED_REGS_COUNT * sizeof(unsigned long);
+	callee_regs = (unsigned long *)ksp;
+	ksp -= C_ABI_SAVE_AREA_SIZE;   /* __switch_to() save area */
+	((long *)ksp)[0] = ((long *)ksp)[1] = 0;
+	p->thread.ksp = ksp;
+
+	/* Record the pid of the task that created this one. */
+	p->thread.creator_pid = current->pid;
+
+	if (unlikely(p->flags & PF_KTHREAD)) {
+		/* kernel thread */
+		memset(childregs, 0, sizeof(struct pt_regs));
+		memset(&callee_regs[2], 0,
+		       (CALLEE_SAVED_REGS_COUNT - 2) * sizeof(unsigned long));
+		callee_regs[0] = sp;   /* r30 = function */
+		callee_regs[1] = arg;  /* r31 = arg */
+		childregs->ex1 = PL_ICS_EX1(KERNEL_PL, 0);
+		p->thread.pc = (unsigned long) ret_from_kernel_thread;
+		return 0;
+	}
 
 	/*
-	 * When creating a new kernel thread we pass sp as zero.
-	 * Assign it to a reasonable value now that we have the stack.
+	 * Start new thread in ret_from_fork so it schedules properly
+	 * and then return from interrupt like the parent.
 	 */
-	if (sp == 0 && regs->ex1 == PL_ICS_EX1(KERNEL_PL, 0))
-		sp = KSTK_TOP(p);
+	p->thread.pc = (unsigned long) ret_from_fork;
 
 	/*
 	 * Do not clone step state from the parent; each thread
@@ -195,52 +148,35 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	 */
 	task_thread_info(p)->step_state = NULL;
 
+#ifdef __tilegx__
 	/*
-	 * Start new thread in ret_from_fork so it schedules properly
-	 * and then return from interrupt like the parent.
+	 * Do not clone unalign jit fixup from the parent; each thread
+	 * must allocate its own on demand.
 	 */
-	p->thread.pc = (unsigned long) ret_from_fork;
-
-	/* Save user stack top pointer so we can ID the stack vm area later. */
-	p->thread.usp0 = sp;
-
-	/* Record the pid of the process that created this one. */
-	p->thread.creator_pid = current->pid;
+	task_thread_info(p)->unalign_jit_base = NULL;
+#endif
 
 	/*
 	 * Copy the registers onto the kernel stack so the
 	 * return-from-interrupt code will reload it into registers.
 	 */
-	childregs = task_pt_regs(p);
-	*childregs = *regs;
+	*childregs = *current_pt_regs();
 	childregs->regs[0] = 0;         /* return value is zero */
-	childregs->sp = sp;  /* override with new user stack pointer */
+	if (sp)
+		childregs->sp = sp;  /* override with new user stack pointer */
+	memcpy(callee_regs, &childregs->regs[CALLEE_SAVED_FIRST_REG],
+	       CALLEE_SAVED_REGS_COUNT * sizeof(unsigned long));
+
+	/* Save user stack top pointer so we can ID the stack vm area later. */
+	p->thread.usp0 = childregs->sp;
 
 	/*
 	 * If CLONE_SETTLS is set, set "tp" in the new task to "r4",
 	 * which is passed in as arg #5 to sys_clone().
 	 */
 	if (clone_flags & CLONE_SETTLS)
-		childregs->tp = regs->regs[4];
+		childregs->tp = childregs->regs[4];
 
-	/*
-	 * Copy the callee-saved registers from the passed pt_regs struct
-	 * into the context-switch callee-saved registers area.
-	 * This way when we start the interrupt-return sequence, the
-	 * callee-save registers will be correctly in registers, which
-	 * is how we assume the compiler leaves them as we start doing
-	 * the normal return-from-interrupt path after calling C code.
-	 * Zero out the C ABI save area to mark the top of the stack.
-	 */
-	ksp = (unsigned long) childregs;
-	ksp -= C_ABI_SAVE_AREA_SIZE;   /* interrupt-entry save area */
-	((long *)ksp)[0] = ((long *)ksp)[1] = 0;
-	ksp -= CALLEE_SAVED_REGS_COUNT * sizeof(unsigned long);
-	memcpy((void *)ksp, &regs->regs[CALLEE_SAVED_FIRST_REG],
-	       CALLEE_SAVED_REGS_COUNT * sizeof(unsigned long));
-	ksp -= C_ABI_SAVE_AREA_SIZE;   /* __switch_to() save area */
-	((long *)ksp)[0] = ((long *)ksp)[1] = 0;
-	p->thread.ksp = ksp;
 
 #if CHIP_HAS_TILE_DMA()
 	/*
@@ -251,20 +187,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	memset(&p->thread.dma_async_tlb, 0, sizeof(struct async_tlb));
 #endif
 
-#if CHIP_HAS_SN_PROC()
-	/* Likewise, the new thread is not running static processor code. */
-	p->thread.sn_proc_running = 0;
-	memset(&p->thread.sn_async_tlb, 0, sizeof(struct async_tlb));
-#endif
-
-#if CHIP_HAS_PROC_STATUS_SPR()
 	/* New thread has its miscellaneous processor state bits clear. */
 	p->thread.proc_status = 0;
-#endif
 
 #ifdef CONFIG_HARDWALL
 	/* New thread does not own any networks. */
-	p->thread.hardwall = NULL;
+	memset(&p->thread.hardwall[0], 0,
+	       sizeof(struct hardwall_task) * HARDWALL_TYPES);
 #endif
 
 
@@ -277,19 +206,32 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	return 0;
 }
 
+int set_unalign_ctl(struct task_struct *tsk, unsigned int val)
+{
+	task_thread_info(tsk)->align_ctl = val;
+	return 0;
+}
+
+int get_unalign_ctl(struct task_struct *tsk, unsigned long adr)
+{
+	return put_user(task_thread_info(tsk)->align_ctl,
+			(unsigned int __user *)adr);
+}
+
+static struct task_struct corrupt_current = { .comm = "<corrupt>" };
+
 /*
  * Return "current" if it looks plausible, or else a pointer to a dummy.
  * This can be helpful if we are just trying to emit a clean panic.
  */
 struct task_struct *validate_current(void)
 {
-	static struct task_struct corrupt = { .comm = "<corrupt>" };
 	struct task_struct *tsk = current;
 	if (unlikely((unsigned long)tsk < PAGE_OFFSET ||
-		     (void *)tsk > high_memory ||
+		     (high_memory && (void *)tsk > high_memory) ||
 		     ((unsigned long)tsk & (__alignof__(*tsk) - 1)) != 0)) {
 		pr_err("Corrupt 'current' %p (sp %#lx)\n", tsk, stack_pointer);
-		tsk = &corrupt;
+		tsk = &corrupt_current;
 	}
 	return tsk;
 }
@@ -428,15 +370,11 @@ static void save_arch_state(struct thread_struct *t)
 	t->system_save[2] = __insn_mfspr(SPR_SYSTEM_SAVE_0_2);
 	t->system_save[3] = __insn_mfspr(SPR_SYSTEM_SAVE_0_3);
 	t->intctrl_0 = __insn_mfspr(SPR_INTCTRL_0_STATUS);
-#if CHIP_HAS_PROC_STATUS_SPR()
 	t->proc_status = __insn_mfspr(SPR_PROC_STATUS);
-#endif
 #if !CHIP_HAS_FIXED_INTVEC_BASE()
 	t->interrupt_vector_base = __insn_mfspr(SPR_INTERRUPT_VECTOR_BASE_0);
 #endif
-#if CHIP_HAS_TILE_RTF_HWM()
 	t->tile_rtf_hwm = __insn_mfspr(SPR_TILE_RTF_HWM);
-#endif
 #if CHIP_HAS_DSTREAM_PF()
 	t->dstream_pf = __insn_mfspr(SPR_DSTREAM_PF);
 #endif
@@ -457,15 +395,11 @@ static void restore_arch_state(const struct thread_struct *t)
 	__insn_mtspr(SPR_SYSTEM_SAVE_0_2, t->system_save[2]);
 	__insn_mtspr(SPR_SYSTEM_SAVE_0_3, t->system_save[3]);
 	__insn_mtspr(SPR_INTCTRL_0_STATUS, t->intctrl_0);
-#if CHIP_HAS_PROC_STATUS_SPR()
 	__insn_mtspr(SPR_PROC_STATUS, t->proc_status);
-#endif
 #if !CHIP_HAS_FIXED_INTVEC_BASE()
 	__insn_mtspr(SPR_INTERRUPT_VECTOR_BASE_0, t->interrupt_vector_base);
 #endif
-#if CHIP_HAS_TILE_RTF_HWM()
 	__insn_mtspr(SPR_TILE_RTF_HWM, t->tile_rtf_hwm);
-#endif
 #if CHIP_HAS_DSTREAM_PF()
 	__insn_mtspr(SPR_DSTREAM_PF, t->dstream_pf);
 #endif
@@ -474,26 +408,11 @@ static void restore_arch_state(const struct thread_struct *t)
 
 void _prepare_arch_switch(struct task_struct *next)
 {
-#if CHIP_HAS_SN_PROC()
-	int snctl;
-#endif
 #if CHIP_HAS_TILE_DMA()
 	struct tile_dma_state *dma = &current->thread.tile_dma_state;
 	if (dma->enabled)
 		save_tile_dma_state(dma);
 #endif
-#if CHIP_HAS_SN_PROC()
-	/*
-	 * Suspend the static network processor if it was running.
-	 * We do not suspend the fabric itself, just like we don't
-	 * try to suspend the UDN.
-	 */
-	snctl = __insn_mfspr(SPR_SNCTL);
-	current->thread.sn_proc_running =
-		(snctl & SPR_SNCTL__FRZPROC_MASK) == 0;
-	if (current->thread.sn_proc_running)
-		__insn_mtspr(SPR_SNCTL, snctl | SPR_SNCTL__FRZPROC_MASK);
-#endif
 }
 
 
@@ -521,25 +440,9 @@ struct task_struct *__sched _switch_to(struct task_struct *prev,
 	/* Restore other arch state. */
 	restore_arch_state(&next->thread);
 
-#if CHIP_HAS_SN_PROC()
-	/*
-	 * Restart static network processor in the new process
-	 * if it was running before.
-	 */
-	if (next->thread.sn_proc_running) {
-		int snctl = __insn_mfspr(SPR_SNCTL);
-		__insn_mtspr(SPR_SNCTL, snctl & ~SPR_SNCTL__FRZPROC_MASK);
-	}
-#endif
-
 #ifdef CONFIG_HARDWALL
 	/* Enable or disable access to the network registers appropriately. */
-	if (prev->thread.hardwall != NULL) {
-		if (next->thread.hardwall == NULL)
-			restrict_network_mpls();
-	} else if (next->thread.hardwall != NULL) {
-		grant_network_mpls();
-	}
+	hardwall_switch_tasks(prev, next);
 #endif
 
 	/*
@@ -567,11 +470,18 @@ struct task_struct *__sched _switch_to(struct task_struct *prev,
  */
 int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
 {
+	/* If we enter in kernel mode, do nothing and exit the caller loop. */
+	if (!user_mode(regs))
+		return 0;
+
+	/* Enable interrupts; they are disabled again on return to caller. */
+	local_irq_enable();
+
 	if (thread_info_flags & _TIF_NEED_RESCHED) {
 		schedule();
 		return 1;
 	}
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
+#if CHIP_HAS_TILE_DMA()
 	if (thread_info_flags & _TIF_ASYNC_TLB) {
 		do_async_page_fault(regs);
 		return 1;
@@ -584,74 +494,15 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
-		if (current->replacement_session_keyring)
-			key_replace_session_keyring();
 		return 1;
 	}
 	if (thread_info_flags & _TIF_SINGLESTEP) {
-		if ((regs->ex1 & SPR_EX_CONTEXT_1_1__PL_MASK) == 0)
-			single_step_once(regs);
+		single_step_once(regs);
 		return 0;
 	}
 	panic("work_pending: bad flags %#x\n", thread_info_flags);
 }
 
-/* Note there is an implicit fifth argument if (clone_flags & CLONE_SETTLS). */
-SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
-		void __user *, parent_tidptr, void __user *, child_tidptr,
-		struct pt_regs *, regs)
-{
-	if (!newsp)
-		newsp = regs->sp;
-	return do_fork(clone_flags, newsp, regs, 0,
-		       parent_tidptr, child_tidptr);
-}
-
-/*
- * sys_execve() executes a new program.
- */
-SYSCALL_DEFINE4(execve, const char __user *, path,
-		const char __user *const __user *, argv,
-		const char __user *const __user *, envp,
-		struct pt_regs *, regs)
-{
-	long error;
-	char *filename;
-
-	filename = getname(path);
-	error = PTR_ERR(filename);
-	if (IS_ERR(filename))
-		goto out;
-	error = do_execve(filename, argv, envp, regs);
-	putname(filename);
-	if (error == 0)
-		single_step_execve();
-out:
-	return error;
-}
-
-#ifdef CONFIG_COMPAT
-long compat_sys_execve(const char __user *path,
-		       compat_uptr_t __user *argv,
-		       compat_uptr_t __user *envp,
-		       struct pt_regs *regs)
-{
-	long error;
-	char *filename;
-
-	filename = getname(path);
-	error = PTR_ERR(filename);
-	if (IS_ERR(filename))
-		goto out;
-	error = compat_do_execve(filename, argv, envp, regs);
-	putname(filename);
-	if (error == 0)
-		single_step_execve();
-out:
-	return error;
-}
-#endif
-
 unsigned long get_wchan(struct task_struct *p)
 {
 	struct KBacktraceIterator kbt;
@@ -669,37 +520,6 @@ unsigned long get_wchan(struct task_struct *p)
 	return 0;
 }
 
-/*
- * We pass in lr as zero (cleared in kernel_thread) and the caller
- * part of the backtrace ABI on the stack also zeroed (in copy_thread)
- * so that backtraces will stop with this function.
- * Note that we don't use r0, since copy_thread() clears it.
- */
-static void start_kernel_thread(int dummy, int (*fn)(int), int arg)
-{
-	do_exit(fn(arg));
-}
-
-/*
- * Create a kernel thread
- */
-int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
-{
-	struct pt_regs regs;
-
-	memset(&regs, 0, sizeof(regs));
-	regs.ex1 = PL_ICS_EX1(KERNEL_PL, 0);  /* run at kernel PL, no ICS */
-	regs.pc = (long) start_kernel_thread;
-	regs.flags = PT_FLAGS_CALLER_SAVES;   /* need to restore r1 and r2 */
-	regs.regs[1] = (long) fn;             /* function pointer */
-	regs.regs[2] = (long) arg;            /* parameter register */
-
-	/* Ok, create the new process.. */
-	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs,
-		       0, NULL, NULL);
-}
-EXPORT_SYMBOL(kernel_thread);
-
 /* Flush thread state. */
 void flush_thread(void)
 {
@@ -711,7 +531,15 @@ void flush_thread(void)
  */
 void exit_thread(void)
 {
-	/* Nothing */
+#ifdef CONFIG_HARDWALL
+	/*
+	 * Remove the task from the list of tasks that are associated
+	 * with any live hardwalls.  (If the task that is exiting held
+	 * the last reference to a hardwall fd, it would already have
+	 * been released and deactivated at this point.)
+	 */
+	hardwall_deactivate_all(current);
+#endif
 }
 
 void show_regs(struct pt_regs *regs)
@@ -720,24 +548,24 @@ void show_regs(struct pt_regs *regs)
 	int i;
 
 	pr_err("\n");
-	pr_err(" Pid: %d, comm: %20s, CPU: %d\n",
-	       tsk->pid, tsk->comm, smp_processor_id());
+	if (tsk != &corrupt_current)
+		show_regs_print_info(KERN_ERR);
 #ifdef __tilegx__
-	for (i = 0; i < 51; i += 3)
+	for (i = 0; i < 17; i++)
 		pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT" r%-2d: "REGFMT"\n",
-		       i, regs->regs[i], i+1, regs->regs[i+1],
-		       i+2, regs->regs[i+2]);
-	pr_err(" r51: "REGFMT" r52: "REGFMT" tp : "REGFMT"\n",
-	       regs->regs[51], regs->regs[52], regs->tp);
+		       i, regs->regs[i], i+18, regs->regs[i+18],
+		       i+36, regs->regs[i+36]);
+	pr_err(" r17: "REGFMT" r35: "REGFMT" tp : "REGFMT"\n",
+	       regs->regs[17], regs->regs[35], regs->tp);
 	pr_err(" sp : "REGFMT" lr : "REGFMT"\n", regs->sp, regs->lr);
 #else
-	for (i = 0; i < 52; i += 4)
+	for (i = 0; i < 13; i++)
 		pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT
 		       " r%-2d: "REGFMT" r%-2d: "REGFMT"\n",
-		       i, regs->regs[i], i+1, regs->regs[i+1],
-		       i+2, regs->regs[i+2], i+3, regs->regs[i+3]);
-	pr_err(" r52: "REGFMT" tp : "REGFMT" sp : "REGFMT" lr : "REGFMT"\n",
-	       regs->regs[52], regs->tp, regs->sp, regs->lr);
+		       i, regs->regs[i], i+14, regs->regs[i+14],
+		       i+27, regs->regs[i+27], i+40, regs->regs[i+40]);
+	pr_err(" r13: "REGFMT" tp : "REGFMT" sp : "REGFMT" lr : "REGFMT"\n",
+	       regs->regs[13], regs->tp, regs->sp, regs->lr);
 #endif
 	pr_err(" pc : "REGFMT" ex1: %ld     faultnum: %ld\n",
 	       regs->pc, regs->ex1, regs->faultnum);
diff --git a/arch/tile/kernel/ptrace.c b/arch/tile/kernel/ptrace.c
index e92e40527d6..de98c6ddf13 100644
--- a/arch/tile/kernel/ptrace.c
+++ b/arch/tile/kernel/ptrace.c
@@ -19,7 +19,14 @@
 #include <linux/kprobes.h>
 #include <linux/compat.h>
 #include <linux/uaccess.h>
+#include <linux/regset.h>
+#include <linux/elf.h>
+#include <linux/tracehook.h>
 #include <asm/traps.h>
+#include <arch/chip.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
 
 void user_enable_single_step(struct task_struct *child)
 {
@@ -45,6 +52,100 @@ void ptrace_disable(struct task_struct *child)
 	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 }
 
+/*
+ * Get registers from task and ready the result for userspace.
+ * Note that we localize the API issues to getregs() and putregs() at
+ * some cost in performance, e.g. we need a full pt_regs copy for
+ * PEEKUSR, and two copies for POKEUSR.  But in general we expect
+ * GETREGS/PUTREGS to be the API of choice anyway.
+ */
+static char *getregs(struct task_struct *child, struct pt_regs *uregs)
+{
+	*uregs = *task_pt_regs(child);
+
+	/* Set up flags ABI bits. */
+	uregs->flags = 0;
+#ifdef CONFIG_COMPAT
+	if (task_thread_info(child)->status & TS_COMPAT)
+		uregs->flags |= PT_FLAGS_COMPAT;
+#endif
+
+	return (char *)uregs;
+}
+
+/* Put registers back to task. */
+static void putregs(struct task_struct *child, struct pt_regs *uregs)
+{
+	struct pt_regs *regs = task_pt_regs(child);
+
+	/* Don't allow overwriting the kernel-internal flags word. */
+	uregs->flags = regs->flags;
+
+	/* Only allow setting the ICS bit in the ex1 word. */
+	uregs->ex1 = PL_ICS_EX1(USER_PL, EX1_ICS(uregs->ex1));
+
+	*regs = *uregs;
+}
+
+enum tile_regset {
+	REGSET_GPR,
+};
+
+static int tile_gpr_get(struct task_struct *target,
+			  const struct user_regset *regset,
+			  unsigned int pos, unsigned int count,
+			  void *kbuf, void __user *ubuf)
+{
+	struct pt_regs regs;
+
+	getregs(target, &regs);
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &regs, 0,
+				   sizeof(regs));
+}
+
+static int tile_gpr_set(struct task_struct *target,
+			  const struct user_regset *regset,
+			  unsigned int pos, unsigned int count,
+			  const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+	struct pt_regs regs;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &regs, 0,
+				 sizeof(regs));
+	if (ret)
+		return ret;
+
+	putregs(target, &regs);
+
+	return 0;
+}
+
+static const struct user_regset tile_user_regset[] = {
+	[REGSET_GPR] = {
+		.core_note_type = NT_PRSTATUS,
+		.n = ELF_NGREG,
+		.size = sizeof(elf_greg_t),
+		.align = sizeof(elf_greg_t),
+		.get = tile_gpr_get,
+		.set = tile_gpr_set,
+	},
+};
+
+static const struct user_regset_view tile_user_regset_view = {
+	.name = CHIP_ARCH_NAME,
+	.e_machine = ELF_ARCH,
+	.ei_osabi = ELF_OSABI,
+	.regsets = tile_user_regset,
+	.n = ARRAY_SIZE(tile_user_regset),
+};
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task)
+{
+	return &tile_user_regset_view;
+}
+
 long arch_ptrace(struct task_struct *child, long request,
 		 unsigned long addr, unsigned long data)
 {
@@ -53,14 +154,13 @@ long arch_ptrace(struct task_struct *child, long request,
 	long ret = -EIO;
 	char *childreg;
 	struct pt_regs copyregs;
-	int ex1_offset;
 
 	switch (request) {
 
 	case PTRACE_PEEKUSR:  /* Read register from pt_regs. */
 		if (addr >= PTREGS_SIZE)
 			break;
-		childreg = (char *)task_pt_regs(child) + addr;
+		childreg = getregs(child, &copyregs) + addr;
 #ifdef CONFIG_COMPAT
 		if (is_compat_task()) {
 			if (addr & (sizeof(compat_long_t)-1))
@@ -79,17 +179,7 @@ long arch_ptrace(struct task_struct *child, long request,
 	case PTRACE_POKEUSR:  /* Write register in pt_regs. */
 		if (addr >= PTREGS_SIZE)
 			break;
-		childreg = (char *)task_pt_regs(child) + addr;
-
-		/* Guard against overwrites of the privilege level. */
-		ex1_offset = PTREGS_OFFSET_EX1;
-#if defined(CONFIG_COMPAT) && defined(__BIG_ENDIAN)
-		if (is_compat_task())   /* point at low word */
-			ex1_offset += sizeof(compat_long_t);
-#endif
-		if (addr == ex1_offset)
-			data = PL_ICS_EX1(USER_PL, EX1_ICS(data));
-
+		childreg = getregs(child, &copyregs) + addr;
 #ifdef CONFIG_COMPAT
 		if (is_compat_task()) {
 			if (addr & (sizeof(compat_long_t)-1))
@@ -102,24 +192,20 @@ long arch_ptrace(struct task_struct *child, long request,
 				break;
 			*(long *)childreg = data;
 		}
+		putregs(child, &copyregs);
 		ret = 0;
 		break;
 
 	case PTRACE_GETREGS:  /* Get all registers from the child. */
-		if (copy_to_user(datap, task_pt_regs(child),
-				 sizeof(struct pt_regs)) == 0) {
-			ret = 0;
-		}
+		ret = copy_regset_to_user(child, &tile_user_regset_view,
+					  REGSET_GPR, 0,
+					  sizeof(struct pt_regs), datap);
 		break;
 
 	case PTRACE_SETREGS:  /* Set all registers in the child. */
-		if (copy_from_user(&copyregs, datap,
-				   sizeof(struct pt_regs)) == 0) {
-			copyregs.ex1 =
-				PL_ICS_EX1(USER_PL, EX1_ICS(copyregs.ex1));
-			*task_pt_regs(child) = copyregs;
-			ret = 0;
-		}
+		ret = copy_regset_from_user(child, &tile_user_regset_view,
+					    REGSET_GPR, 0,
+					    sizeof(struct pt_regs), datap);
 		break;
 
 	case PTRACE_GETFPREGS:  /* Get the child FPU state. */
@@ -128,12 +214,16 @@ long arch_ptrace(struct task_struct *child, long request,
 
 	case PTRACE_SETOPTIONS:
 		/* Support TILE-specific ptrace options. */
-		child->ptrace &= ~PT_TRACE_MASK_TILE;
+		BUILD_BUG_ON(PTRACE_O_MASK_TILE & PTRACE_O_MASK);
 		tmp = data & PTRACE_O_MASK_TILE;
 		data &= ~PTRACE_O_MASK_TILE;
 		ret = ptrace_request(child, request, addr, data);
-		if (tmp & PTRACE_O_TRACEMIGRATE)
-			child->ptrace |= PT_TRACE_MIGRATE;
+		if (ret == 0) {
+			unsigned int flags = child->ptrace;
+			flags &= ~(PTRACE_O_MASK_TILE << PT_OPT_FLAG_SHIFT);
+			flags |= (tmp << PT_OPT_FLAG_SHIFT);
+			child->ptrace = flags;
+		}
 		break;
 
 	default:
@@ -160,32 +250,44 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 }
 #endif
 
-void do_syscall_trace(void)
+int do_syscall_trace_enter(struct pt_regs *regs)
 {
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
-		return;
+	if (test_thread_flag(TIF_SYSCALL_TRACE)) {
+		if (tracehook_report_syscall_entry(regs))
+			regs->regs[TREG_SYSCALL_NR] = -1;
+	}
 
-	if (!(current->ptrace & PT_PTRACED))
-		return;
+	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
+		trace_sys_enter(regs, regs->regs[TREG_SYSCALL_NR]);
 
-	/*
-	 * The 0x80 provides a way for the tracing parent to distinguish
-	 * between a syscall stop and SIGTRAP delivery
-	 */
-	ptrace_notify(SIGTRAP|((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
+	return regs->regs[TREG_SYSCALL_NR];
+}
+
+void do_syscall_trace_exit(struct pt_regs *regs)
+{
+	long errno;
 
 	/*
-	 * this isn't the same as continuing with a signal, but it will do
-	 * for normal use.  strace only continues with a signal if the
-	 * stopping signal is not SIGTRAP.  -brl
+	 * The standard tile calling convention returns the value (or negative
+	 * errno) in r0, and zero (or positive errno) in r1.
+	 * It saves a couple of cycles on the hot path to do this work in
+	 * registers only as we return, rather than updating the in-memory
+	 * struct ptregs.
 	 */
-	if (current->exit_code) {
-		send_sig(current->exit_code, current, 1);
-		current->exit_code = 0;
-	}
+	errno = (long) regs->regs[0];
+	if (errno < 0 && errno > -4096)
+		regs->regs[1] = -errno;
+	else
+		regs->regs[1] = 0;
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE))
+		tracehook_report_syscall_exit(regs, 0);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
+		trace_sys_exit(regs, regs->regs[0]);
 }
 
-void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
+void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs)
 {
 	struct siginfo info;
 
@@ -201,5 +303,5 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
 /* Handle synthetic interrupt delivered only by the simulator. */
 void __kprobes do_breakpoint(struct pt_regs* regs, int fault_num)
 {
-	send_sigtrap(current, regs, fault_num);
+	send_sigtrap(current, regs);
 }
diff --git a/arch/tile/kernel/reboot.c b/arch/tile/kernel/reboot.c
index baa3d905fee..6c5d2c070a1 100644
--- a/arch/tile/kernel/reboot.c
+++ b/arch/tile/kernel/reboot.c
@@ -16,6 +16,7 @@
 #include <linux/reboot.h>
 #include <linux/smp.h>
 #include <linux/pm.h>
+#include <linux/export.h>
 #include <asm/page.h>
 #include <asm/setup.h>
 #include <hv/hypervisor.h>
@@ -26,7 +27,6 @@
 
 void machine_halt(void)
 {
-	warn_early_printk();
 	arch_local_irq_disable_all();
 	smp_send_stop();
 	hv_halt();
@@ -34,7 +34,6 @@ void machine_halt(void)
 
 void machine_power_off(void)
 {
-	warn_early_printk();
 	arch_local_irq_disable_all();
 	smp_send_stop();
 	hv_power_off();
@@ -49,3 +48,4 @@ void machine_restart(char *cmd)
 
 /* No interesting distinction to be made here. */
 void (*pm_power_off)(void) = NULL;
+EXPORT_SYMBOL(pm_power_off);
diff --git a/arch/tile/kernel/regs_32.S b/arch/tile/kernel/regs_32.S
index caa13101c26..542cae17a93 100644
--- a/arch/tile/kernel/regs_32.S
+++ b/arch/tile/kernel/regs_32.S
@@ -13,14 +13,14 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/system.h>
 #include <asm/ptrace.h>
 #include <asm/asm-offsets.h>
 #include <arch/spr_def.h>
 #include <asm/processor.h>
+#include <asm/switch_to.h>
 
 /*
- * See <asm/system.h>; called with prev and next task_struct pointers.
+ * See <asm/switch_to.h>; called with prev and next task_struct pointers.
  * "prev" is returned in r0 for _switch_to and also for ret_from_fork.
  *
  * We want to save pc/sp in "prev", and get the new pc/sp from "next".
@@ -39,7 +39,7 @@
  */
 
 #if CALLEE_SAVED_REGS_COUNT != 24
-# error Mismatch between <asm/system.h> and kernel/entry.S
+# error Mismatch between <asm/switch_to.h> and kernel/entry.S
 #endif
 #define FRAME_SIZE ((2 + CALLEE_SAVED_REGS_COUNT) * 4)
 
diff --git a/arch/tile/kernel/regs_64.S b/arch/tile/kernel/regs_64.S
index f748c1e8528..bbffcc6f340 100644
--- a/arch/tile/kernel/regs_64.S
+++ b/arch/tile/kernel/regs_64.S
@@ -13,14 +13,14 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/system.h>
 #include <asm/ptrace.h>
 #include <asm/asm-offsets.h>
 #include <arch/spr_def.h>
 #include <asm/processor.h>
+#include <asm/switch_to.h>
 
 /*
- * See <asm/system.h>; called with prev and next task_struct pointers.
+ * See <asm/switch_to.h>; called with prev and next task_struct pointers.
  * "prev" is returned in r0 for _switch_to and also for ret_from_fork.
  *
  * We want to save pc/sp in "prev", and get the new pc/sp from "next".
@@ -39,7 +39,7 @@
  */
 
 #if CALLEE_SAVED_REGS_COUNT != 24
-# error Mismatch between <asm/system.h> and kernel/entry.S
+# error Mismatch between <asm/switch_to.h> and kernel/entry.S
 #endif
 #define FRAME_SIZE ((2 + CALLEE_SAVED_REGS_COUNT) * 8)
 
diff --git a/arch/tile/kernel/relocate_kernel.S b/arch/tile/kernel/relocate_kernel_32.S
index 010b418515f..e44fbcf8cbd 100644
--- a/arch/tile/kernel/relocate_kernel.S
+++ b/arch/tile/kernel/relocate_kernel_32.S
@@ -20,15 +20,6 @@
 #include <asm/page.h>
 #include <hv/hypervisor.h>
 
-#define ___hvb	MEM_SV_INTRPT + HV_GLUE_START_CPA
-
-#define ___hv_dispatch(f) (___hvb + (HV_DISPATCH_ENTRY_SIZE * f))
-
-#define ___hv_console_putc ___hv_dispatch(HV_DISPATCH_CONSOLE_PUTC)
-#define ___hv_halt         ___hv_dispatch(HV_DISPATCH_HALT)
-#define ___hv_reexec       ___hv_dispatch(HV_DISPATCH_REEXEC)
-#define ___hv_flush_remote ___hv_dispatch(HV_DISPATCH_FLUSH_REMOTE)
-
 #undef RELOCATE_NEW_KERNEL_VERBOSE
 
 STD_ENTRY(relocate_new_kernel)
@@ -43,8 +34,8 @@ STD_ENTRY(relocate_new_kernel)
 	addi	sp, sp, -8
 	/* we now have a stack (whether we need one or not) */
 
-	moveli	r40, lo16(___hv_console_putc)
-	auli	r40, r40, ha16(___hv_console_putc)
+	moveli	r40, lo16(hv_console_putc)
+	auli	r40, r40, ha16(hv_console_putc)
 
 #ifdef RELOCATE_NEW_KERNEL_VERBOSE
 	moveli	r0, 'r'
@@ -86,7 +77,6 @@ STD_ENTRY(relocate_new_kernel)
 	move	r30, sp
 	addi	sp, sp, -8
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	/*
 	 * On TILEPro, we need to flush all tiles' caches, since we may
 	 * have been doing hash-for-home caching there.  Note that we
@@ -114,15 +104,14 @@ STD_ENTRY(relocate_new_kernel)
 	}
 	{
 	 move	r8, zero	 /* asids */
-	 moveli	r20, lo16(___hv_flush_remote)
+	 moveli	r20, lo16(hv_flush_remote)
 	}
 	{
 	 move	r9, zero	 /* asidcount */
-	 auli	r20, r20, ha16(___hv_flush_remote)
+	 auli	r20, r20, ha16(hv_flush_remote)
 	}
 
 	jalr	r20
-#endif
 
 	/* r33 is destination pointer, default to zero */
 
@@ -175,8 +164,8 @@ STD_ENTRY(relocate_new_kernel)
 	move	r0, r32
 	moveli	r1, 0		/* arg to hv_reexec is 64 bits */
 
-	moveli	r41, lo16(___hv_reexec)
-	auli	r41, r41, ha16(___hv_reexec)
+	moveli	r41, lo16(hv_reexec)
+	auli	r41, r41, ha16(hv_reexec)
 
 	jalr	r41
 
@@ -267,8 +256,8 @@ STD_ENTRY(relocate_new_kernel)
 	moveli	r0, '\n'
 	jalr	r40
 .Lhalt:
-	moveli	r41, lo16(___hv_halt)
-	auli	r41, r41, ha16(___hv_halt)
+	moveli	r41, lo16(hv_halt)
+	auli	r41, r41, ha16(hv_halt)
 
 	jalr	r41
 	STD_ENDPROC(relocate_new_kernel)
diff --git a/arch/tile/kernel/relocate_kernel_64.S b/arch/tile/kernel/relocate_kernel_64.S
new file mode 100644
index 00000000000..d9d8cf6176e
--- /dev/null
+++ b/arch/tile/kernel/relocate_kernel_64.S
@@ -0,0 +1,263 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * copy new kernel into place and then call hv_reexec
+ *
+ */
+
+#include <linux/linkage.h>
+#include <arch/chip.h>
+#include <asm/page.h>
+#include <hv/hypervisor.h>
+
+#undef RELOCATE_NEW_KERNEL_VERBOSE
+
+STD_ENTRY(relocate_new_kernel)
+
+	move	r30, r0		/* page list */
+	move	r31, r1		/* address of page we are on */
+	move	r32, r2		/* start address of new kernel */
+
+	shrui	r1, r1, PAGE_SHIFT
+	addi	r1, r1, 1
+	shli	sp, r1, PAGE_SHIFT
+	addi	sp, sp, -8
+	/* we now have a stack (whether we need one or not) */
+
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r40, hw2_last(hv_console_putc)
+	shl16insli r40, r40, hw1(hv_console_putc)
+	shl16insli r40, r40, hw0(hv_console_putc)
+
+	moveli	r0, 'r'
+	jalr	r40
+
+	moveli	r0, '_'
+	jalr	r40
+
+	moveli	r0, 'n'
+	jalr	r40
+
+	moveli	r0, '_'
+	jalr	r40
+
+	moveli	r0, 'k'
+	jalr	r40
+
+	moveli	r0, '\n'
+	jalr	r40
+#endif
+
+	/*
+	 * Throughout this code r30 is pointer to the element of page
+	 * list we are working on.
+	 *
+	 * Normally we get to the next element of the page list by
+	 * incrementing r30 by eight.  The exception is if the element
+	 * on the page list is an IND_INDIRECTION in which case we use
+	 * the element with the low bits masked off as the new value
+	 * of r30.
+	 *
+	 * To get this started, we need the value passed to us (which
+	 * will always be an IND_INDIRECTION) in memory somewhere with
+	 * r30 pointing at it.  To do that, we push the value passed
+	 * to us on the stack and make r30 point to it.
+	 */
+
+	st	sp, r30
+	move	r30, sp
+	addi	sp, sp, -16
+
+	/*
+	 * On TILE-GX, we need to flush all tiles' caches, since we may
+	 * have been doing hash-for-home caching there.  Note that we
+	 * must do this _after_ we're completely done modifying any memory
+	 * other than our output buffer (which we know is locally cached).
+	 * We want the caches to be fully clean when we do the reexec,
+	 * because the hypervisor is going to do this flush again at that
+	 * point, and we don't want that second flush to overwrite any memory.
+	 */
+	{
+	 move	r0, zero	 /* cache_pa */
+	 moveli	r1, hw2_last(HV_FLUSH_EVICT_L2)
+	}
+	{
+	 shl16insli	r1, r1, hw1(HV_FLUSH_EVICT_L2)
+	 movei	r2, -1		 /* cache_cpumask; -1 means all client tiles */
+	}
+	{
+	 shl16insli	r1, r1, hw0(HV_FLUSH_EVICT_L2)  /* cache_control */
+	 move	r3, zero	 /* tlb_va */
+	}
+	{
+	 move	r4, zero	 /* tlb_length */
+	 move	r5, zero	 /* tlb_pgsize */
+	}
+	{
+	 move	r6, zero	 /* tlb_cpumask */
+	 move	r7, zero	 /* asids */
+	}
+	{
+	 moveli	r20, hw2_last(hv_flush_remote)
+	 move	r8, zero	 /* asidcount */
+	}
+	shl16insli	r20, r20, hw1(hv_flush_remote)
+	shl16insli	r20, r20, hw0(hv_flush_remote)
+
+	jalr	r20
+
+	/* r33 is destination pointer, default to zero */
+
+	moveli	r33, 0
+
+.Lloop:	ld	r10, r30
+
+	andi	r9, r10, 0xf	/* low 4 bits tell us what type it is */
+	xor	r10, r10, r9	/* r10 is now value with low 4 bits stripped */
+
+	cmpeqi	r0, r9, 0x1	/* IND_DESTINATION */
+	beqzt	r0, .Ltry2
+
+	move	r33, r10
+
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r0, 'd'
+	jalr	r40
+#endif
+
+	addi	r30, r30, 8
+	j	.Lloop
+
+.Ltry2:
+	cmpeqi	r0, r9, 0x2	/* IND_INDIRECTION */
+	beqzt	r0, .Ltry4
+
+	move	r30, r10
+
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r0, 'i'
+	jalr	r40
+#endif
+
+	j	.Lloop
+
+.Ltry4:
+	cmpeqi	r0, r9, 0x4	/* IND_DONE */
+	beqzt	r0, .Ltry8
+
+	mf
+
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r0, 'D'
+	jalr	r40
+	moveli	r0, '\n'
+	jalr	r40
+#endif
+
+	move	r0, r32
+
+	moveli	r41, hw2_last(hv_reexec)
+	shl16insli	r41, r41, hw1(hv_reexec)
+	shl16insli	r41, r41, hw0(hv_reexec)
+
+	jalr	r41
+
+	/* we should not get here */
+
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r0, '?'
+	jalr	r40
+	moveli	r0, '\n'
+	jalr	r40
+#endif
+
+	j	.Lhalt
+
+.Ltry8:	cmpeqi	r0, r9, 0x8	/* IND_SOURCE */
+	beqz	r0, .Lerr	/* unknown type */
+
+	/* copy page at r10 to page at r33 */
+
+	move	r11, r33
+
+	moveli	r0, hw2_last(PAGE_SIZE)
+	shl16insli	r0, r0, hw1(PAGE_SIZE)
+	shl16insli	r0, r0, hw0(PAGE_SIZE)
+	add	r33, r33, r0
+
+	/* copy word at r10 to word at r11 until r11 equals r33 */
+
+	/* We know page size must be multiple of 8, so we can unroll
+	 * 8 times safely without any edge case checking.
+	 *
+	 * Issue a flush of the destination every 8 words to avoid
+	 * incoherence when starting the new kernel.  (Now this is
+	 * just good paranoia because the hv_reexec call will also
+	 * take care of this.)
+	 */
+
+1:
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0; addi	r11, r11, 8 }
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0; addi	r11, r11, 8 }
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0; addi	r11, r11, 8 }
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0; addi	r11, r11, 8 }
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0; addi	r11, r11, 8 }
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0; addi	r11, r11, 8 }
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0; addi	r11, r11, 8 }
+	{ ld	r0, r10; addi	r10, r10, 8 }
+	{ st	r11, r0 }
+	{ flush r11    ; addi	r11, r11, 8 }
+
+	cmpeq	r0, r33, r11
+	beqzt	r0, 1b
+
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r0, 's'
+	jalr	r40
+#endif
+
+	addi	r30, r30, 8
+	j	.Lloop
+
+
+.Lerr:
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r0, 'e'
+	jalr	r40
+	moveli	r0, 'r'
+	jalr	r40
+	moveli	r0, 'r'
+	jalr	r40
+	moveli	r0, '\n'
+	jalr	r40
+#endif
+.Lhalt:
+	moveli r41, hw2_last(hv_halt)
+	shl16insli r41, r41, hw1(hv_halt)
+	shl16insli r41, r41, hw0(hv_halt)
+
+	jalr	r41
+	STD_ENDPROC(relocate_new_kernel)
+
+	.section .rodata,"a"
+
+	.globl relocate_new_kernel_size
+relocate_new_kernel_size:
+	.long .Lend_relocate_new_kernel - relocate_new_kernel
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 5f85d8b34db..112ababa9e5 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -23,11 +23,15 @@
 #include <linux/irq.h>
 #include <linux/kexec.h>
 #include <linux/pci.h>
+#include <linux/swiotlb.h>
 #include <linux/initrd.h>
 #include <linux/io.h>
 #include <linux/highmem.h>
 #include <linux/smp.h>
 #include <linux/timex.h>
+#include <linux/hugetlb.h>
+#include <linux/start_kernel.h>
+#include <linux/screen_info.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
@@ -46,24 +50,41 @@ static inline int ABS(int x) { return x >= 0 ? x : -x; }
 /* Chip information */
 char chip_model[64] __write_once;
 
+#ifdef CONFIG_VT
+struct screen_info screen_info;
+#endif
+
 struct pglist_data node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
-/* We only create bootmem data on node 0. */
-static bootmem_data_t __initdata node0_bdata;
-
 /* Information on the NUMA nodes that we compute early */
-unsigned long __cpuinitdata node_start_pfn[MAX_NUMNODES];
-unsigned long __cpuinitdata node_end_pfn[MAX_NUMNODES];
+unsigned long node_start_pfn[MAX_NUMNODES];
+unsigned long node_end_pfn[MAX_NUMNODES];
 unsigned long __initdata node_memmap_pfn[MAX_NUMNODES];
 unsigned long __initdata node_percpu_pfn[MAX_NUMNODES];
 unsigned long __initdata node_free_pfn[MAX_NUMNODES];
 
 static unsigned long __initdata node_percpu[MAX_NUMNODES];
 
+/*
+ * per-CPU stack and boot info.
+ */
+DEFINE_PER_CPU(unsigned long, boot_sp) =
+	(unsigned long)init_stack + THREAD_SIZE;
+
+#ifdef CONFIG_SMP
+DEFINE_PER_CPU(unsigned long, boot_pc) = (unsigned long)start_kernel;
+#else
+/*
+ * The variable must be __initdata since it references __init code.
+ * With CONFIG_SMP it is per-cpu data, which is exempt from validation.
+ */
+unsigned long __initdata boot_pc = (unsigned long)start_kernel;
+#endif
+
 #ifdef CONFIG_HIGHMEM
 /* Page frame index of end of lowmem on each controller. */
-unsigned long __cpuinitdata node_lowmem_end_pfn[MAX_NUMNODES];
+unsigned long node_lowmem_end_pfn[MAX_NUMNODES];
 
 /* Number of pages that can be mapped into lowmem. */
 static unsigned long __initdata mappable_physpages;
@@ -94,7 +115,7 @@ static unsigned int __initdata maxnodemem_pfn[MAX_NUMNODES] = {
 };
 static nodemask_t __initdata isolnodes;
 
-#ifdef CONFIG_PCI
+#if defined(CONFIG_PCI) && !defined(__tilegx__)
 enum { DEFAULT_PCI_RESERVE_MB = 64 };
 static unsigned int __initdata pci_reserve_mb = DEFAULT_PCI_RESERVE_MB;
 unsigned long __initdata pci_reserve_start_pfn = -1U;
@@ -103,13 +124,11 @@ unsigned long __initdata pci_reserve_end_pfn = -1U;
 
 static int __init setup_maxmem(char *str)
 {
-	long maxmem_mb;
-	if (str == NULL || strict_strtol(str, 0, &maxmem_mb) != 0 ||
-	    maxmem_mb == 0)
+	unsigned long long maxmem;
+	if (str == NULL || (maxmem = memparse(str, NULL)) == 0)
 		return -EINVAL;
 
-	maxmem_pfn = (maxmem_mb >> (HPAGE_SHIFT - 20)) <<
-		(HPAGE_SHIFT - PAGE_SHIFT);
+	maxmem_pfn = (maxmem >> HPAGE_SHIFT) << (HPAGE_SHIFT - PAGE_SHIFT);
 	pr_info("Forcing RAM used to no more than %dMB\n",
 	       maxmem_pfn >> (20 - PAGE_SHIFT));
 	return 0;
@@ -119,14 +138,15 @@ early_param("maxmem", setup_maxmem);
 static int __init setup_maxnodemem(char *str)
 {
 	char *endp;
-	long maxnodemem_mb, node;
+	unsigned long long maxnodemem;
+	long node;
 
 	node = str ? simple_strtoul(str, &endp, 0) : INT_MAX;
-	if (node >= MAX_NUMNODES || *endp != ':' ||
-	    strict_strtol(endp+1, 0, &maxnodemem_mb) != 0)
+	if (node >= MAX_NUMNODES || *endp != ':')
 		return -EINVAL;
 
-	maxnodemem_pfn[node] = (maxnodemem_mb >> (HPAGE_SHIFT - 20)) <<
+	maxnodemem = memparse(endp+1, NULL);
+	maxnodemem_pfn[node] = (maxnodemem >> HPAGE_SHIFT) <<
 		(HPAGE_SHIFT - PAGE_SHIFT);
 	pr_info("Forcing RAM used on node %ld to no more than %dMB\n",
 	       node, maxnodemem_pfn[node] >> (20 - PAGE_SHIFT));
@@ -134,6 +154,65 @@ static int __init setup_maxnodemem(char *str)
 }
 early_param("maxnodemem", setup_maxnodemem);
 
+struct memmap_entry {
+	u64 addr;	/* start of memory segment */
+	u64 size;	/* size of memory segment */
+};
+static struct memmap_entry memmap_map[64];
+static int memmap_nr;
+
+static void add_memmap_region(u64 addr, u64 size)
+{
+	if (memmap_nr >= ARRAY_SIZE(memmap_map)) {
+		pr_err("Ooops! Too many entries in the memory map!\n");
+		return;
+	}
+	memmap_map[memmap_nr].addr = addr;
+	memmap_map[memmap_nr].size = size;
+	memmap_nr++;
+}
+
+static int __init setup_memmap(char *p)
+{
+	char *oldp;
+	u64 start_at, mem_size;
+
+	if (!p)
+		return -EINVAL;
+
+	if (!strncmp(p, "exactmap", 8)) {
+		pr_err("\"memmap=exactmap\" not valid on tile\n");
+		return 0;
+	}
+
+	oldp = p;
+	mem_size = memparse(p, &p);
+	if (p == oldp)
+		return -EINVAL;
+
+	if (*p == '@') {
+		pr_err("\"memmap=nn@ss\" (force RAM) invalid on tile\n");
+	} else if (*p == '#') {
+		pr_err("\"memmap=nn#ss\" (force ACPI data) invalid on tile\n");
+	} else if (*p == '$') {
+		start_at = memparse(p+1, &p);
+		add_memmap_region(start_at, mem_size);
+	} else {
+		if (mem_size == 0)
+			return -EINVAL;
+		maxmem_pfn = (mem_size >> HPAGE_SHIFT) <<
+			(HPAGE_SHIFT - PAGE_SHIFT);
+	}
+	return *p == '\0' ? 0 : -EINVAL;
+}
+early_param("memmap", setup_memmap);
+
+static int __init setup_mem(char *str)
+{
+	return setup_maxmem(str);
+}
+early_param("mem", setup_mem);  /* compatibility with x86 */
+
 static int __init setup_isolnodes(char *str)
 {
 	char buf[MAX_NUMNODES * 5];
@@ -146,18 +225,15 @@ static int __init setup_isolnodes(char *str)
 }
 early_param("isolnodes", setup_isolnodes);
 
-#ifdef CONFIG_PCI
+#if defined(CONFIG_PCI) && !defined(__tilegx__)
 static int __init setup_pci_reserve(char* str)
 {
-	unsigned long mb;
-
-	if (str == NULL || strict_strtoul(str, 0, &mb) != 0 ||
-	    mb > 3 * 1024)
+	if (str == NULL || kstrtouint(str, 0, &pci_reserve_mb) != 0 ||
+	    pci_reserve_mb > 3 * 1024)
 		return -EINVAL;
 
-	pci_reserve_mb = mb;
 	pr_info("Reserving %dMB for PCIE root complex mappings\n",
-	       pci_reserve_mb);
+		pci_reserve_mb);
 	return 0;
 }
 early_param("pci_reserve", setup_pci_reserve);
@@ -189,7 +265,7 @@ early_param("vmalloc", parse_vmalloc);
 /*
  * Determine for each controller where its lowmem is mapped and how much of
  * it is mapped there.  On controller zero, the first few megabytes are
- * already mapped in as code at MEM_SV_INTRPT, so in principle we could
+ * already mapped in as code at MEM_SV_START, so in principle we could
  * start our data mappings higher up, but for now we don't bother, to avoid
  * additional confusion.
  *
@@ -270,7 +346,7 @@ static void *__init setup_pa_va_mapping(void)
  * This is up to 4 mappings for lowmem, one mapping per memory
  * controller, plus one for our text segment.
  */
-static void __cpuinit store_permanent_mappings(void)
+static void store_permanent_mappings(void)
 {
 	int i;
 
@@ -287,8 +363,8 @@ static void __cpuinit store_permanent_mappings(void)
 		hv_store_mapping(addr, pages << PAGE_SHIFT, pa);
 	}
 
-	hv_store_mapping((HV_VirtAddr)_stext,
-			 (uint32_t)(_einittext - _stext), 0);
+	hv_store_mapping((HV_VirtAddr)_text,
+			 (uint32_t)(_einittext - _text), 0);
 }
 
 /*
@@ -309,6 +385,7 @@ static void __init setup_memory(void)
 #if defined(CONFIG_HIGHMEM) || defined(__tilegx__)
 	long lowmem_pages;
 #endif
+	unsigned long physpages = 0;
 
 	/* We are using a char to hold the cpu_2_node[] mapping */
 	BUILD_BUG_ON(MAX_NUMNODES > 127);
@@ -368,8 +445,8 @@ static void __init setup_memory(void)
 				continue;
 			}
 		}
-		if (num_physpages + PFN_DOWN(range.size) > maxmem_pfn) {
-			int max_size = maxmem_pfn - num_physpages;
+		if (physpages + PFN_DOWN(range.size) > maxmem_pfn) {
+			int max_size = maxmem_pfn - physpages;
 			if (max_size > 0) {
 				pr_err("Maxmem reduced node %d to %d pages\n",
 				       i, max_size);
@@ -397,7 +474,7 @@ static void __init setup_memory(void)
 			continue;
 		}
 #endif
-#ifdef CONFIG_PCI
+#if defined(CONFIG_PCI) && !defined(__tilegx__)
 		/*
 		 * Blocks that overlap the pci reserved region must
 		 * have enough space to hold the maximum percpu data
@@ -426,7 +503,7 @@ static void __init setup_memory(void)
 		node_start_pfn[i] = start;
 		node_end_pfn[i] = end;
 		node_controller[i] = range.controller;
-		num_physpages += size;
+		physpages += size;
 		max_pfn = end;
 
 		/* Mark node as online */
@@ -445,7 +522,7 @@ static void __init setup_memory(void)
 	 * we're willing to use at 8 million pages (32GB of 4KB pages).
 	 */
 	cap = 8 * 1024 * 1024;  /* 8 million pages */
-	if (num_physpages > cap) {
+	if (physpages > cap) {
 		int num_nodes = num_online_nodes();
 		int cap_each = cap / num_nodes;
 		unsigned long dropped_pages = 0;
@@ -456,10 +533,10 @@ static void __init setup_memory(void)
 				node_end_pfn[i] = node_start_pfn[i] + cap_each;
 			}
 		}
-		num_physpages -= dropped_pages;
+		physpages -= dropped_pages;
 		pr_warning("Only using %ldMB memory;"
 		       " ignoring %ldMB.\n",
-		       num_physpages >> (20 - PAGE_SHIFT),
+		       physpages >> (20 - PAGE_SHIFT),
 		       dropped_pages >> (20 - PAGE_SHIFT));
 		pr_warning("Consider using a larger page size.\n");
 	}
@@ -477,7 +554,7 @@ static void __init setup_memory(void)
 
 	lowmem_pages = (mappable_physpages > MAXMEM_PFN) ?
 		MAXMEM_PFN : mappable_physpages;
-	highmem_pages = (long) (num_physpages - lowmem_pages);
+	highmem_pages = (long) (physpages - lowmem_pages);
 
 	pr_notice("%ldMB HIGHMEM available.\n",
 	       pages_to_mb(highmem_pages > 0 ? highmem_pages : 0));
@@ -494,7 +571,6 @@ static void __init setup_memory(void)
 		pr_warning("Use a HIGHMEM enabled kernel.\n");
 		max_low_pfn = MAXMEM_PFN;
 		max_pfn = MAXMEM_PFN;
-		num_physpages = MAXMEM_PFN;
 		node_end_pfn[0] = MAXMEM_PFN;
 	} else {
 		pr_notice("%ldMB memory available.\n",
@@ -519,41 +595,125 @@ static void __init setup_memory(void)
 #endif
 }
 
-static void __init setup_bootmem_allocator(void)
+/*
+ * On 32-bit machines, we only put bootmem on the low controller,
+ * since PAs > 4GB can't be used in bootmem.  In principle one could
+ * imagine, e.g., multiple 1 GB controllers all of which could support
+ * bootmem, but in practice using controllers this small isn't a
+ * particularly interesting scenario, so we just keep it simple and
+ * use only the first controller for bootmem on 32-bit machines.
+ */
+static inline int node_has_bootmem(int nid)
+{
+#ifdef CONFIG_64BIT
+	return 1;
+#else
+	return nid == 0;
+#endif
+}
+
+static inline unsigned long alloc_bootmem_pfn(int nid,
+					      unsigned long size,
+					      unsigned long goal)
+{
+	void *kva = __alloc_bootmem_node(NODE_DATA(nid), size,
+					 PAGE_SIZE, goal);
+	unsigned long pfn = kaddr_to_pfn(kva);
+	BUG_ON(goal && PFN_PHYS(pfn) != goal);
+	return pfn;
+}
+
+static void __init setup_bootmem_allocator_node(int i)
 {
-	unsigned long bootmap_size, first_alloc_pfn, last_alloc_pfn;
+	unsigned long start, end, mapsize, mapstart;
+
+	if (node_has_bootmem(i)) {
+		NODE_DATA(i)->bdata = &bootmem_node_data[i];
+	} else {
+		/* Share controller zero's bdata for now. */
+		NODE_DATA(i)->bdata = &bootmem_node_data[0];
+		return;
+	}
 
-	/* Provide a node 0 bdata. */
-	NODE_DATA(0)->bdata = &node0_bdata;
+	/* Skip up to after the bss in node 0. */
+	start = (i == 0) ? min_low_pfn : node_start_pfn[i];
 
-#ifdef CONFIG_PCI
-	/* Don't let boot memory alias the PCI region. */
-	last_alloc_pfn = min(max_low_pfn, pci_reserve_start_pfn);
+	/* Only lowmem, if we're a HIGHMEM build. */
+#ifdef CONFIG_HIGHMEM
+	end = node_lowmem_end_pfn[i];
 #else
-	last_alloc_pfn = max_low_pfn;
+	end = node_end_pfn[i];
 #endif
 
-	/*
-	 * Initialize the boot-time allocator (with low memory only):
-	 * The first argument says where to put the bitmap, and the
-	 * second says where the end of allocatable memory is.
-	 */
-	bootmap_size = init_bootmem(min_low_pfn, last_alloc_pfn);
+	/* No memory here. */
+	if (end == start)
+		return;
+
+	/* Figure out where the bootmem bitmap is located. */
+	mapsize = bootmem_bootmap_pages(end - start);
+	if (i == 0) {
+		/* Use some space right before the heap on node 0. */
+		mapstart = start;
+		start += mapsize;
+	} else {
+		/* Allocate bitmap on node 0 to avoid page table issues. */
+		mapstart = alloc_bootmem_pfn(0, PFN_PHYS(mapsize), 0);
+	}
+
+	/* Initialize a node. */
+	init_bootmem_node(NODE_DATA(i), mapstart, start, end);
 
+	/* Free all the space back into the allocator. */
+	free_bootmem(PFN_PHYS(start), PFN_PHYS(end - start));
+
+#if defined(CONFIG_PCI) && !defined(__tilegx__)
 	/*
-	 * Let the bootmem allocator use all the space we've given it
-	 * except for its own bitmap.
+	 * Throw away any memory aliased by the PCI region.
 	 */
-	first_alloc_pfn = min_low_pfn + PFN_UP(bootmap_size);
-	if (first_alloc_pfn >= last_alloc_pfn)
-		early_panic("Not enough memory on controller 0 for bootmem\n");
+	if (pci_reserve_start_pfn < end && pci_reserve_end_pfn > start) {
+		start = max(pci_reserve_start_pfn, start);
+		end = min(pci_reserve_end_pfn, end);
+		reserve_bootmem(PFN_PHYS(start), PFN_PHYS(end - start),
+				BOOTMEM_EXCLUSIVE);
+	}
+#endif
+}
+
+static void __init setup_bootmem_allocator(void)
+{
+	int i;
+	for (i = 0; i < MAX_NUMNODES; ++i)
+		setup_bootmem_allocator_node(i);
+
+	/* Reserve any memory excluded by "memmap" arguments. */
+	for (i = 0; i < memmap_nr; ++i) {
+		struct memmap_entry *m = &memmap_map[i];
+		reserve_bootmem(m->addr, m->size, BOOTMEM_DEFAULT);
+	}
 
-	free_bootmem(PFN_PHYS(first_alloc_pfn),
-		     PFN_PHYS(last_alloc_pfn - first_alloc_pfn));
+#ifdef CONFIG_BLK_DEV_INITRD
+	if (initrd_start) {
+		/* Make sure the initrd memory region is not modified. */
+		if (reserve_bootmem(initrd_start, initrd_end - initrd_start,
+				    BOOTMEM_EXCLUSIVE)) {
+			pr_crit("The initrd memory region has been polluted. Disabling it.\n");
+			initrd_start = 0;
+			initrd_end = 0;
+		} else {
+			/*
+			 * Translate initrd_start & initrd_end from PA to VA for
+			 * future access.
+			 */
+			initrd_start += PAGE_OFFSET;
+			initrd_end += PAGE_OFFSET;
+		}
+	}
+#endif
 
 #ifdef CONFIG_KEXEC
 	if (crashk_res.start != crashk_res.end)
-		reserve_bootmem(crashk_res.start, resource_size(&crashk_res), 0);
+		reserve_bootmem(crashk_res.start, resource_size(&crashk_res),
+				BOOTMEM_DEFAULT);
 #endif
 }
 
@@ -580,19 +740,13 @@ static int __init percpu_size(void)
 	return size;
 }
 
-static inline unsigned long alloc_bootmem_pfn(int size, unsigned long goal)
-{
-	void *kva = __alloc_bootmem(size, PAGE_SIZE, goal);
-	unsigned long pfn = kaddr_to_pfn(kva);
-	BUG_ON(goal && PFN_PHYS(pfn) != goal);
-	return pfn;
-}
-
 static void __init zone_sizes_init(void)
 {
 	unsigned long zones_size[MAX_NR_ZONES] = { 0 };
 	int size = percpu_size();
 	int num_cpus = smp_height * smp_width;
+	const unsigned long dma_end = (1UL << (32 - PAGE_SHIFT));
+
 	int i;
 
 	for (i = 0; i < num_cpus; ++i)
@@ -625,21 +779,22 @@ static void __init zone_sizes_init(void)
 		 * though, there'll be no lowmem, so we just alloc_bootmem
 		 * the memmap.  There will be no percpu memory either.
 		 */
-		if (__pfn_to_highbits(start) == 0) {
-			/* In low PAs, allocate via bootmem. */
+		if (i != 0 && cpu_isset(i, isolnodes)) {
+			node_memmap_pfn[i] =
+				alloc_bootmem_pfn(0, memmap_size, 0);
+			BUG_ON(node_percpu[i] != 0);
+		} else if (node_has_bootmem(start)) {
 			unsigned long goal = 0;
 			node_memmap_pfn[i] =
-				alloc_bootmem_pfn(memmap_size, goal);
+				alloc_bootmem_pfn(i, memmap_size, 0);
 			if (kdata_huge)
 				goal = PFN_PHYS(lowmem_end) - node_percpu[i];
 			if (node_percpu[i])
 				node_percpu_pfn[i] =
-				    alloc_bootmem_pfn(node_percpu[i], goal);
-		} else if (cpu_isset(i, isolnodes)) {
-			node_memmap_pfn[i] = alloc_bootmem_pfn(memmap_size, 0);
-			BUG_ON(node_percpu[i] != 0);
+					alloc_bootmem_pfn(i, node_percpu[i],
+							  goal);
 		} else {
-			/* In high PAs, just reserve some pages. */
+			/* In non-bootmem zones, just reserve some pages. */
 			node_memmap_pfn[i] = node_free_pfn[i];
 			node_free_pfn[i] += PFN_UP(memmap_size);
 			if (!kdata_huge) {
@@ -663,23 +818,24 @@ static void __init zone_sizes_init(void)
 		zones_size[ZONE_NORMAL] = end - start;
 #endif
 
-		/*
-		 * Everyone shares node 0's bootmem allocator, but
-		 * we use alloc_remap(), above, to put the actual
-		 * struct page array on the individual controllers,
-		 * which is most of the data that we actually care about.
-		 * We can't place bootmem allocators on the other
-		 * controllers since the bootmem allocator can only
-		 * operate on 32-bit physical addresses.
-		 */
-		NODE_DATA(i)->bdata = NODE_DATA(0)->bdata;
+		if (start < dma_end) {
+			zones_size[ZONE_DMA] = min(zones_size[ZONE_NORMAL],
+						   dma_end - start);
+			zones_size[ZONE_NORMAL] -= zones_size[ZONE_DMA];
+		} else {
+			zones_size[ZONE_DMA] = 0;
+		}
+
+		/* Take zone metadata from controller 0 if we're isolnode. */
+		if (node_isset(i, isolnodes))
+			NODE_DATA(i)->bdata = &bootmem_node_data[0];
 
 		free_area_init_node(i, zones_size, start, NULL);
 		printk(KERN_DEBUG "  Normal zone: %ld per-cpu pages\n",
 		       PFN_UP(node_percpu[i]));
 
 		/* Track the type of memory on each node */
-		if (zones_size[ZONE_NORMAL])
+		if (zones_size[ZONE_NORMAL] || zones_size[ZONE_DMA])
 			node_set_state(i, N_NORMAL_MEMORY);
 #ifdef CONFIG_HIGHMEM
 		if (end != start)
@@ -855,13 +1011,29 @@ subsys_initcall(topology_init);
 
 #endif /* CONFIG_NUMA */
 
+/*
+ * Initialize hugepage support on this cpu.  We do this on all cores
+ * early in boot: before argument parsing for the boot cpu, and after
+ * argument parsing but before the init functions run on the secondaries.
+ * So the values we set up here in the hypervisor may be overridden on
+ * the boot cpu as arguments are parsed.
+ */
+static void init_super_pages(void)
+{
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+	int i;
+	for (i = 0; i < HUGE_SHIFT_ENTRIES; ++i)
+		hv_set_pte_super_shift(i, huge_shift[i]);
+#endif
+}
+
 /**
  * setup_cpu() - Do all necessary per-cpu, tile-specific initialization.
  * @boot: Is this the boot cpu?
  *
  * Called from setup_arch() on the boot cpu, or online_secondary().
  */
-void __cpuinit setup_cpu(int boot)
+void setup_cpu(int boot)
 {
 	/* The boot cpu sets up its permanent mappings much earlier. */
 	if (!boot)
@@ -872,9 +1044,6 @@ void __cpuinit setup_cpu(int boot)
 	arch_local_irq_unmask(INT_DMATLB_MISS);
 	arch_local_irq_unmask(INT_DMATLB_ACCESS);
 #endif
-#if CHIP_HAS_SN_PROC()
-	arch_local_irq_unmask(INT_SNITLB_MISS);
-#endif
 #ifdef __tilegx__
 	arch_local_irq_unmask(INT_SINGLE_STEP_K);
 #endif
@@ -889,10 +1058,6 @@ void __cpuinit setup_cpu(int boot)
 	/* Static network is not restricted. */
 	__insn_mtspr(SPR_MPL_SN_ACCESS_SET_0, 1);
 #endif
-#if CHIP_HAS_SN_PROC()
-	__insn_mtspr(SPR_MPL_SN_NOTIFY_SET_0, 1);
-	__insn_mtspr(SPR_MPL_SN_CPL_SET_0, 1);
-#endif
 
 	/*
 	 * Set the MPL for interrupt control 0 & 1 to the corresponding
@@ -909,12 +1074,14 @@ void __cpuinit setup_cpu(int boot)
 	/* Reset the network state on this cpu. */
 	reset_network_state();
 #endif
+
+	init_super_pages();
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 
 static int __initdata set_initramfs_file;
-static char __initdata initramfs_file[128] = "initramfs.cpio.gz";
+static char __initdata initramfs_file[128] = "initramfs";
 
 static int __init setup_initramfs_file(char *str)
 {
@@ -928,9 +1095,9 @@ static int __init setup_initramfs_file(char *str)
 early_param("initramfs_file", setup_initramfs_file);
 
 /*
- * We look for an additional "initramfs.cpio.gz" file in the hvfs.
- * If there is one, we allocate some memory for it and it will be
- * unpacked to the initramfs after any built-in initramfs_data.
+ * We look for a file called "initramfs" in the hvfs.  If there is one, we
+ * allocate some memory for it and it will be unpacked to the initramfs.
+ * If it's compressed, the initd code will uncompress it first.
  */
 static void __init load_hv_initrd(void)
 {
@@ -938,12 +1105,22 @@ static void __init load_hv_initrd(void)
 	int fd, rc;
 	void *initrd;
 
+	/* If initrd has already been set, skip initramfs file in hvfs. */
+	if (initrd_start)
+		return;
+
 	fd = hv_fs_findfile((HV_VirtAddr) initramfs_file);
 	if (fd == HV_ENOENT) {
-		if (set_initramfs_file)
+		if (set_initramfs_file) {
 			pr_warning("No such hvfs initramfs file '%s'\n",
 				   initramfs_file);
-		return;
+			return;
+		} else {
+			/* Try old backwards-compatible name. */
+			fd = hv_fs_findfile((HV_VirtAddr)"initramfs.cpio.gz");
+			if (fd == HV_ENOENT)
+				return;
+		}
 	}
 	BUG_ON(fd < 0);
 	stat = hv_fs_fstat(fd);
@@ -970,6 +1147,25 @@ void __init free_initrd_mem(unsigned long begin, unsigned long end)
 	free_bootmem(__pa(begin), end - begin);
 }
 
+static int __init setup_initrd(char *str)
+{
+	char *endp;
+	unsigned long initrd_size;
+
+	initrd_size = str ? simple_strtoul(str, &endp, 0) : 0;
+	if (initrd_size == 0 || *endp != '@')
+		return -EINVAL;
+
+	initrd_start = simple_strtoul(endp+1, &endp, 0);
+	if (initrd_start == 0)
+		return -EINVAL;
+
+	initrd_end = initrd_start + initrd_size;
+
+	return 0;
+}
+early_param("initrd", setup_initrd);
+
 #else
 static inline void load_hv_initrd(void) {}
 #endif /* CONFIG_BLK_DEV_INITRD */
@@ -1037,7 +1233,7 @@ static void __init validate_va(void)
 #ifndef __tilegx__   /* FIXME: GX: probably some validation relevant here */
 	/*
 	 * Similarly, make sure we're only using allowed VAs.
-	 * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT,
+	 * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_START,
 	 * and 0 .. KERNEL_HIGH_VADDR.
 	 * In addition, make sure we CAN'T use the end of memory, since
 	 * we use the last chunk of each pgd for the pgd_list.
@@ -1052,7 +1248,7 @@ static void __init validate_va(void)
 		if (range.size == 0)
 			break;
 		if (range.start <= MEM_USER_INTRPT &&
-		    range.start + range.size >= MEM_HV_INTRPT)
+		    range.start + range.size >= MEM_HV_START)
 			user_kernel_ok = 1;
 		if (range.start == 0)
 			max_va = range.size;
@@ -1070,8 +1266,7 @@ static void __init validate_va(void)
 	if ((long)VMALLOC_START >= 0)
 		early_panic(
 			"Linux VMALLOC region below the 2GB line (%#lx)!\n"
-			"Reconfigure the kernel with fewer NR_HUGE_VMAPS\n"
-			"or smaller VMALLOC_RESERVE.\n",
+			"Reconfigure the kernel with smaller VMALLOC_RESERVE.\n",
 			VMALLOC_START);
 #endif
 }
@@ -1086,7 +1281,6 @@ static void __init validate_va(void)
 struct cpumask __write_once cpu_lotar_map;
 EXPORT_SYMBOL(cpu_lotar_map);
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 /*
  * hash_for_home_map lists all the tiles that hash-for-home data
  * will be cached on.  Note that this may includes tiles that are not
@@ -1096,11 +1290,10 @@ EXPORT_SYMBOL(cpu_lotar_map);
  */
 struct cpumask hash_for_home_map;
 EXPORT_SYMBOL(hash_for_home_map);
-#endif
 
 /*
  * cpu_cacheable_map lists all the cpus whose caches the hypervisor can
- * flush on our behalf.  It is set to cpu_possible_map OR'ed with
+ * flush on our behalf.  It is set to cpu_possible_mask OR'ed with
  * hash_for_home_map, and it is what should be passed to
  * hv_flush_remote() to flush all caches.  Note that if there are
  * dedicated hypervisor driver tiles that have authorized use of their
@@ -1186,20 +1379,16 @@ static void __init setup_cpu_maps(void)
 			      sizeof(cpu_lotar_map));
 	if (rc < 0) {
 		pr_err("warning: no HV_INQ_TILES_LOTAR; using AVAIL\n");
-		cpu_lotar_map = cpu_possible_map;
+		cpu_lotar_map = *cpu_possible_mask;
 	}
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	/* Retrieve set of CPUs used for hash-for-home caching */
 	rc = hv_inquire_tiles(HV_INQ_TILES_HFH_CACHE,
 			      (HV_VirtAddr) hash_for_home_map.bits,
 			      sizeof(hash_for_home_map));
 	if (rc < 0)
 		early_panic("hv_inquire_tiles(HFH_CACHE) failed: rc %d\n", rc);
-	cpumask_or(&cpu_cacheable_map, &cpu_possible_map, &hash_for_home_map);
-#else
-	cpu_cacheable_map = cpu_possible_map;
-#endif
+	cpumask_or(&cpu_cacheable_map, cpu_possible_mask, &hash_for_home_map);
 }
 
 
@@ -1259,7 +1448,7 @@ void __init setup_arch(char **cmdline_p)
 	setup_cpu_maps();
 
 
-#ifdef CONFIG_PCI
+#if defined(CONFIG_PCI) && !defined(__tilegx__)
 	/*
 	 * Initialize the PCI structures.  This is done before memory
 	 * setup so that we know whether or not a pci_reserve region
@@ -1288,6 +1477,10 @@ void __init setup_arch(char **cmdline_p)
 	 * any memory using the bootmem allocator.
 	 */
 
+#ifdef CONFIG_SWIOTLB
+	swiotlb_init(0);
+#endif
+
 	paging_init();
 	setup_numa_mapping();
 	zone_sizes_init();
@@ -1390,26 +1583,26 @@ void __init setup_per_cpu_areas(void)
 		for (i = 0; i < size; i += PAGE_SIZE, ++pfn, ++pg) {
 
 			/* Update the vmalloc mapping and page home. */
-			pte_t *ptep =
-				virt_to_pte(NULL, (unsigned long)ptr + i);
+			unsigned long addr = (unsigned long)ptr + i;
+			pte_t *ptep = virt_to_kpte(addr);
 			pte_t pte = *ptep;
 			BUG_ON(pfn != pte_pfn(pte));
 			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_TILE_L3);
 			pte = set_remote_cache_cpu(pte, cpu);
-			set_pte(ptep, pte);
+			set_pte_at(&init_mm, addr, ptep, pte);
 
 			/* Update the lowmem mapping for consistency. */
 			lowmem_va = (unsigned long)pfn_to_kaddr(pfn);
-			ptep = virt_to_pte(NULL, lowmem_va);
+			ptep = virt_to_kpte(lowmem_va);
 			if (pte_huge(*ptep)) {
 				printk(KERN_DEBUG "early shatter of huge page"
 				       " at %#lx\n", lowmem_va);
 				shatter_pmd((pmd_t *)ptep);
-				ptep = virt_to_pte(NULL, lowmem_va);
+				ptep = virt_to_kpte(lowmem_va);
 				BUG_ON(pte_huge(*ptep));
 			}
 			BUG_ON(pfn != pte_pfn(*ptep));
-			set_pte(ptep, pte);
+			set_pte_at(&init_mm, lowmem_va, ptep, pte);
 		}
 	}
 
@@ -1438,16 +1631,17 @@ static struct resource code_resource = {
 };
 
 /*
- * We reserve all resources above 4GB so that PCI won't try to put
- * mappings above 4GB; the standard allows that for some devices but
- * the probing code trunates values to 32 bits.
+ * On Pro, we reserve all resources above 4GB so that PCI won't try to put
+ * mappings above 4GB.
  */
-#ifdef CONFIG_PCI
+#if defined(CONFIG_PCI) && !defined(__tilegx__)
 static struct resource* __init
 insert_non_bus_resource(void)
 {
 	struct resource *res =
 		kzalloc(sizeof(struct resource), GFP_ATOMIC);
+	if (!res)
+		return NULL;
 	res->name = "Non-Bus Physical Address Space";
 	res->start = (1ULL << 32);
 	res->end = -1LL;
@@ -1461,11 +1655,13 @@ insert_non_bus_resource(void)
 #endif
 
 static struct resource* __init
-insert_ram_resource(u64 start_pfn, u64 end_pfn)
+insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved)
 {
 	struct resource *res =
 		kzalloc(sizeof(struct resource), GFP_ATOMIC);
-	res->name = "System RAM";
+	if (!res)
+		return NULL;
+	res->name = reserved ? "Reserved" : "System RAM";
 	res->start = start_pfn << PAGE_SHIFT;
 	res->end = (end_pfn << PAGE_SHIFT) - 1;
 	res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
@@ -1485,10 +1681,9 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn)
 static int __init request_standard_resources(void)
 {
 	int i;
-	enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+	enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
 
-	iomem_resource.end = -1LL;
-#ifdef CONFIG_PCI
+#if defined(CONFIG_PCI) && !defined(__tilegx__)
 	insert_non_bus_resource();
 #endif
 
@@ -1496,16 +1691,16 @@ static int __init request_standard_resources(void)
 		u64 start_pfn = node_start_pfn[i];
 		u64 end_pfn = node_end_pfn[i];
 
-#ifdef CONFIG_PCI
+#if defined(CONFIG_PCI) && !defined(__tilegx__)
 		if (start_pfn <= pci_reserve_start_pfn &&
 		    end_pfn > pci_reserve_start_pfn) {
 			if (end_pfn > pci_reserve_end_pfn)
 				insert_ram_resource(pci_reserve_end_pfn,
-						     end_pfn);
+						    end_pfn, 0);
 			end_pfn = pci_reserve_start_pfn;
 		}
 #endif
-		insert_ram_resource(start_pfn, end_pfn);
+		insert_ram_resource(start_pfn, end_pfn, 0);
 	}
 
 	code_resource.start = __pa(_text - CODE_DELTA);
@@ -1516,6 +1711,13 @@ static int __init request_standard_resources(void)
 	insert_resource(&iomem_resource, &code_resource);
 	insert_resource(&iomem_resource, &data_resource);
 
+	/* Mark any "memmap" regions busy for the resource manager. */
+	for (i = 0; i < memmap_nr; ++i) {
+		struct memmap_entry *m = &memmap_map[i];
+		insert_ram_resource(PFN_DOWN(m->addr),
+				    PFN_UP(m->addr + m->size - 1), 1);
+	}
+
 #ifdef CONFIG_KEXEC
 	insert_resource(&iomem_resource, &crashk_res);
 #endif
diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c
index f79d4b88c74..d1d026f0126 100644
--- a/arch/tile/kernel/signal.c
+++ b/arch/tile/kernel/signal.c
@@ -33,19 +33,11 @@
 #include <asm/ucontext.h>
 #include <asm/sigframe.h>
 #include <asm/syscalls.h>
+#include <asm/vdso.h>
 #include <arch/interrupts.h>
 
 #define DEBUG_SIG 0
 
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-SYSCALL_DEFINE3(sigaltstack, const stack_t __user *, uss,
-		stack_t __user *, uoss, struct pt_regs *, regs)
-{
-	return do_sigaltstack(uss, uoss, regs->sp);
-}
-
-
 /*
  * Do a signal return; undo the signal stack.
  */
@@ -85,8 +77,9 @@ void signal_fault(const char *type, struct pt_regs *regs,
 }
 
 /* The assembly shim for this function arranges to ignore the return value. */
-SYSCALL_DEFINE1(rt_sigreturn, struct pt_regs *, regs)
+SYSCALL_DEFINE0(rt_sigreturn)
 {
+	struct pt_regs *regs = current_pt_regs();
 	struct rt_sigframe __user *frame =
 		(struct rt_sigframe __user *)(regs->sp);
 	sigset_t set;
@@ -96,13 +89,12 @@ SYSCALL_DEFINE1(rt_sigreturn, struct pt_regs *, regs)
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
 
-	sigdelsetmask(&set, ~_BLOCKABLE);
 	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
 		goto badframe;
 
-	if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
+	if (restore_altstack(&frame->uc.uc_stack))
 		goto badframe;
 
 	return 0;
@@ -193,17 +185,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	err |= __clear_user(&frame->save_area, sizeof(frame->save_area));
 	err |= __put_user(0, &frame->uc.uc_flags);
 	err |= __put_user(NULL, &frame->uc.uc_link);
-	err |= __put_user((void __user *)(current->sas_ss_sp),
-			  &frame->uc.uc_stack.ss_sp);
-	err |= __put_user(sas_ss_flags(regs->sp),
-			  &frame->uc.uc_stack.ss_flags);
-	err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+	err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
 	err |= setup_sigcontext(&frame->uc.uc_mcontext, regs);
 	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
 	if (err)
 		goto give_sigsegv;
 
-	restorer = VDSO_BASE;
+	restorer = VDSO_SYM(&__vdso_rt_sigreturn);
 	if (ka->sa.sa_flags & SA_RESTORER)
 		restorer = (unsigned long) ka->sa.sa_restorer;
 
@@ -222,15 +210,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	regs->regs[1] = (unsigned long) &frame->info;
 	regs->regs[2] = (unsigned long) &frame->uc;
 	regs->flags |= PT_FLAGS_CALLER_SAVES;
-
-	/*
-	 * Notify any tracer that was single-stepping it.
-	 * The tracer may want to single-step inside the
-	 * handler too.
-	 */
-	if (test_thread_flag(TIF_SINGLESTEP))
-		ptrace_notify(SIGTRAP);
-
 	return 0;
 
 give_sigsegv:
@@ -242,10 +221,11 @@ give_sigsegv:
  * OK, we're invoking a handler
  */
 
-static int handle_signal(unsigned long sig, siginfo_t *info,
-			 struct k_sigaction *ka, sigset_t *oldset,
+static void handle_signal(unsigned long sig, siginfo_t *info,
+			 struct k_sigaction *ka,
 			 struct pt_regs *regs)
 {
+	sigset_t *oldset = sigmask_to_save();
 	int ret;
 
 	/* Are we from a system call? */
@@ -278,15 +258,10 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
 	else
 #endif
 		ret = setup_rt_frame(sig, ka, info, oldset, regs);
-	if (ret == 0) {
-		/* This code is only called from system calls or from
-		 * the work_pending path in the return-to-user code, and
-		 * either way we can re-enable interrupts unconditionally.
-		 */
-		block_sigmask(ka, sig);
-	}
-
-	return ret;
+	if (ret)
+		return;
+	signal_delivered(sig, info, ka, regs,
+			test_thread_flag(TIF_SINGLESTEP));
 }
 
 /*
@@ -299,7 +274,6 @@ void do_signal(struct pt_regs *regs)
 	siginfo_t info;
 	int signr;
 	struct k_sigaction ka;
-	sigset_t *oldset;
 
 	/*
 	 * i386 will check if we're coming from kernel mode and bail out
@@ -308,24 +282,10 @@ void do_signal(struct pt_regs *regs)
 	 * helpful, we can reinstate the check on "!user_mode(regs)".
 	 */
 
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee! Actually deliver the signal.  */
-		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
-			/*
-			 * A signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TS_RESTORE_SIGMASK flag.
-			 */
-			current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		}
-
+		handle_signal(signr, &info, &ka, regs);
 		goto done;
 	}
 
@@ -350,10 +310,7 @@ void do_signal(struct pt_regs *regs)
 	}
 
 	/* If there's no signal to deliver, just put the saved sigmask back. */
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
-		current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-	}
+	restore_saved_sigmask();
 
 done:
 	/* Avoid double syscall restart if there are nested signals. */
@@ -364,14 +321,13 @@ int show_unhandled_signals = 1;
 
 static int __init crashinfo(char *str)
 {
-	unsigned long val;
 	const char *word;
 
 	if (*str == '\0')
-		val = 2;
-	else if (*str != '=' || strict_strtoul(++str, 0, &val) != 0)
+		show_unhandled_signals = 2;
+	else if (*str != '=' || kstrtoint(++str, 0, &show_unhandled_signals) != 0)
 		return 0;
-	show_unhandled_signals = val;
+
 	switch (show_unhandled_signals) {
 	case 0:
 		word = "No";
diff --git a/arch/tile/kernel/single_step.c b/arch/tile/kernel/single_step.c
index b7a87950408..de07fa7d131 100644
--- a/arch/tile/kernel/single_step.c
+++ b/arch/tile/kernel/single_step.c
@@ -12,40 +12,30 @@
  *   more details.
  *
  * A code-rewriter that enables instruction single-stepping.
- * Derived from iLib's single-stepping code.
  */
 
-#ifndef __tilegx__   /* Hardware support for single step unavailable. */
-
-/* These functions are only used on the TILE platform */
+#include <linux/smp.h>
+#include <linux/ptrace.h>
 #include <linux/slab.h>
 #include <linux/thread_info.h>
 #include <linux/uaccess.h>
 #include <linux/mman.h>
 #include <linux/types.h>
 #include <linux/err.h>
+#include <linux/prctl.h>
 #include <asm/cacheflush.h>
+#include <asm/traps.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
 #include <arch/abi.h>
+#include <arch/spr_def.h>
 #include <arch/opcode.h>
 
-#define signExtend17(val) sign_extend((val), 17)
-#define TILE_X1_MASK (0xffffffffULL << 31)
-
-int unaligned_printk;
 
-static int __init setup_unaligned_printk(char *str)
-{
-	long val;
-	if (strict_strtol(str, 0, &val) != 0)
-		return 0;
-	unaligned_printk = val;
-	pr_info("Printk for each unaligned data accesses is %s\n",
-		unaligned_printk ? "enabled" : "disabled");
-	return 1;
-}
-__setup("unaligned_printk=", setup_unaligned_printk);
+#ifndef __tilegx__   /* Hardware support for single step unavailable. */
 
-unsigned int unaligned_fixup_count;
+#define signExtend17(val) sign_extend((val), 17)
+#define TILE_X1_MASK (0xffffffffULL << 31)
 
 enum mem_op {
 	MEMOP_NONE,
@@ -55,12 +45,13 @@ enum mem_op {
 	MEMOP_STORE_POSTINCR
 };
 
-static inline tile_bundle_bits set_BrOff_X1(tile_bundle_bits n, s32 offset)
+static inline tilepro_bundle_bits set_BrOff_X1(tilepro_bundle_bits n,
+	s32 offset)
 {
-	tile_bundle_bits result;
+	tilepro_bundle_bits result;
 
 	/* mask out the old offset */
-	tile_bundle_bits mask = create_BrOff_X1(-1);
+	tilepro_bundle_bits mask = create_BrOff_X1(-1);
 	result = n & (~mask);
 
 	/* or in the new offset */
@@ -69,10 +60,11 @@ static inline tile_bundle_bits set_BrOff_X1(tile_bundle_bits n, s32 offset)
 	return result;
 }
 
-static inline tile_bundle_bits move_X1(tile_bundle_bits n, int dest, int src)
+static inline tilepro_bundle_bits move_X1(tilepro_bundle_bits n, int dest,
+	int src)
 {
-	tile_bundle_bits result;
-	tile_bundle_bits op;
+	tilepro_bundle_bits result;
+	tilepro_bundle_bits op;
 
 	result = n & (~TILE_X1_MASK);
 
@@ -86,13 +78,13 @@ static inline tile_bundle_bits move_X1(tile_bundle_bits n, int dest, int src)
 	return result;
 }
 
-static inline tile_bundle_bits nop_X1(tile_bundle_bits n)
+static inline tilepro_bundle_bits nop_X1(tilepro_bundle_bits n)
 {
 	return move_X1(n, TREG_ZERO, TREG_ZERO);
 }
 
-static inline tile_bundle_bits addi_X1(
-	tile_bundle_bits n, int dest, int src, int imm)
+static inline tilepro_bundle_bits addi_X1(
+	tilepro_bundle_bits n, int dest, int src, int imm)
 {
 	n &= ~TILE_X1_MASK;
 
@@ -106,15 +98,26 @@ static inline tile_bundle_bits addi_X1(
 	return n;
 }
 
-static tile_bundle_bits rewrite_load_store_unaligned(
+static tilepro_bundle_bits rewrite_load_store_unaligned(
 	struct single_step_state *state,
-	tile_bundle_bits bundle,
+	tilepro_bundle_bits bundle,
 	struct pt_regs *regs,
 	enum mem_op mem_op,
 	int size, int sign_ext)
 {
 	unsigned char __user *addr;
 	int val_reg, addr_reg, err, val;
+	int align_ctl;
+
+	align_ctl = unaligned_fixup;
+	switch (task_thread_info(current)->align_ctl) {
+	case PR_UNALIGN_NOPRINT:
+		align_ctl = 1;
+		break;
+	case PR_UNALIGN_SIGBUS:
+		align_ctl = 0;
+		break;
+	}
 
 	/* Get address and value registers */
 	if (bundle & TILEPRO_BUNDLE_Y_ENCODING_MASK) {
@@ -152,9 +155,25 @@ static tile_bundle_bits rewrite_load_store_unaligned(
 	if (((unsigned long)addr % size) == 0)
 		return bundle;
 
-#ifndef __LITTLE_ENDIAN
-# error We assume little-endian representation with copy_xx_user size 2 here
-#endif
+	/*
+	 * Return SIGBUS with the unaligned address, if requested.
+	 * Note that we return SIGBUS even for completely invalid addresses
+	 * as long as they are in fact unaligned; this matches what the
+	 * tilepro hardware would be doing, if it could provide us with the
+	 * actual bad address in an SPR, which it doesn't.
+	 */
+	if (align_ctl == 0) {
+		siginfo_t info = {
+			.si_signo = SIGBUS,
+			.si_code = BUS_ADRALN,
+			.si_addr = addr
+		};
+		trace_unhandled_signal("unaligned trap", regs,
+				       (unsigned long)addr, SIGBUS);
+		force_sig_info(info.si_signo, &info, current);
+		return (tilepro_bundle_bits) 0;
+	}
+
 	/* Handle unaligned load/store */
 	if (mem_op == MEMOP_LOAD || mem_op == MEMOP_LOAD_POSTINCR) {
 		unsigned short val_16;
@@ -175,32 +194,31 @@ static tile_bundle_bits rewrite_load_store_unaligned(
 			state->update = 1;
 		}
 	} else {
+		unsigned short val_16;
 		val = (val_reg == TREG_ZERO) ? 0 : regs->regs[val_reg];
-		err = copy_to_user(addr, &val, size);
+		switch (size) {
+		case 2:
+			val_16 = val;
+			err = copy_to_user(addr, &val_16, sizeof(val_16));
+			break;
+		case 4:
+			err = copy_to_user(addr, &val, sizeof(val));
+			break;
+		default:
+			BUG();
+		}
 	}
 
 	if (err) {
 		siginfo_t info = {
-			.si_signo = SIGSEGV,
-			.si_code = SEGV_MAPERR,
-			.si_addr = addr
-		};
-		trace_unhandled_signal("segfault", regs,
-				       (unsigned long)addr, SIGSEGV);
-		force_sig_info(info.si_signo, &info, current);
-		return (tile_bundle_bits) 0;
-	}
-
-	if (unaligned_fixup == 0) {
-		siginfo_t info = {
 			.si_signo = SIGBUS,
 			.si_code = BUS_ADRALN,
 			.si_addr = addr
 		};
-		trace_unhandled_signal("unaligned trap", regs,
+		trace_unhandled_signal("bad address for unaligned fixup", regs,
 				       (unsigned long)addr, SIGBUS);
 		force_sig_info(info.si_signo, &info, current);
-		return (tile_bundle_bits) 0;
+		return (tilepro_bundle_bits) 0;
 	}
 
 	if (unaligned_printk || unaligned_fixup_count == 0) {
@@ -269,7 +287,7 @@ void single_step_execve(void)
 	ti->step_state = NULL;
 }
 
-/**
+/*
  * single_step_once() - entry point when single stepping has been triggered.
  * @regs: The machine register state
  *
@@ -288,20 +306,31 @@ void single_step_execve(void)
  */
 void single_step_once(struct pt_regs *regs)
 {
-	extern tile_bundle_bits __single_step_ill_insn;
-	extern tile_bundle_bits __single_step_j_insn;
-	extern tile_bundle_bits __single_step_addli_insn;
-	extern tile_bundle_bits __single_step_auli_insn;
+	extern tilepro_bundle_bits __single_step_ill_insn;
+	extern tilepro_bundle_bits __single_step_j_insn;
+	extern tilepro_bundle_bits __single_step_addli_insn;
+	extern tilepro_bundle_bits __single_step_auli_insn;
 	struct thread_info *info = (void *)current_thread_info();
 	struct single_step_state *state = info->step_state;
 	int is_single_step = test_ti_thread_flag(info, TIF_SINGLESTEP);
-	tile_bundle_bits __user *buffer, *pc;
-	tile_bundle_bits bundle;
+	tilepro_bundle_bits __user *buffer, *pc;
+	tilepro_bundle_bits bundle;
 	int temp_reg;
 	int target_reg = TREG_LR;
 	int err;
 	enum mem_op mem_op = MEMOP_NONE;
 	int size = 0, sign_ext = 0;  /* happy compiler */
+	int align_ctl;
+
+	align_ctl = unaligned_fixup;
+	switch (task_thread_info(current)->align_ctl) {
+	case PR_UNALIGN_NOPRINT:
+		align_ctl = 1;
+		break;
+	case PR_UNALIGN_SIGBUS:
+		align_ctl = 0;
+		break;
+	}
 
 	asm(
 "    .pushsection .rodata.single_step\n"
@@ -338,12 +367,10 @@ void single_step_once(struct pt_regs *regs)
 		}
 
 		/* allocate a cache line of writable, executable memory */
-		down_write(&current->mm->mmap_sem);
-		buffer = (void __user *) do_mmap(NULL, 0, 64,
+		buffer = (void __user *) vm_mmap(NULL, 0, 64,
 					  PROT_EXEC | PROT_READ | PROT_WRITE,
 					  MAP_PRIVATE | MAP_ANONYMOUS,
 					  0);
-		up_write(&current->mm->mmap_sem);
 
 		if (IS_ERR((void __force *)buffer)) {
 			kfree(state);
@@ -376,7 +403,7 @@ void single_step_once(struct pt_regs *regs)
 	if (regs->faultnum == INT_SWINT_1)
 		regs->pc -= 8;
 
-	pc = (tile_bundle_bits __user *)(regs->pc);
+	pc = (tilepro_bundle_bits __user *)(regs->pc);
 	if (get_user(bundle, pc) != 0) {
 		pr_err("Couldn't read instruction at %p trying to step\n", pc);
 		return;
@@ -519,7 +546,6 @@ void single_step_once(struct pt_regs *regs)
 			}
 			break;
 
-#if CHIP_HAS_WH64()
 		/* postincrement operations */
 		case IMM_0_OPCODE_X1:
 			switch (get_ImmOpcodeExtension_X1(bundle)) {
@@ -554,7 +580,6 @@ void single_step_once(struct pt_regs *regs)
 				break;
 			}
 			break;
-#endif /* CHIP_HAS_WH64() */
 		}
 
 		if (state->update) {
@@ -613,9 +638,9 @@ void single_step_once(struct pt_regs *regs)
 
 	/*
 	 * Check if we need to rewrite an unaligned load/store.
-	 * Returning zero is a special value meaning we need to SIGSEGV.
+	 * Returning zero is a special value meaning we generated a signal.
 	 */
-	if (mem_op != MEMOP_NONE && unaligned_fixup >= 0) {
+	if (mem_op != MEMOP_NONE && align_ctl >= 0) {
 		bundle = rewrite_load_store_unaligned(state, bundle, regs,
 						      mem_op, size, sign_ext);
 		if (bundle == 0)
@@ -654,9 +679,9 @@ void single_step_once(struct pt_regs *regs)
 		}
 
 		/* End with a jump back to the next instruction */
-		delta = ((regs->pc + TILE_BUNDLE_SIZE_IN_BYTES) -
+		delta = ((regs->pc + TILEPRO_BUNDLE_SIZE_IN_BYTES) -
 			(unsigned long)buffer) >>
-			TILE_LOG2_BUNDLE_ALIGNMENT_IN_BYTES;
+			TILEPRO_LOG2_BUNDLE_ALIGNMENT_IN_BYTES;
 		bundle = __single_step_j_insn;
 		bundle |= create_JOffLong_X1(delta);
 		err |= __put_user(bundle, buffer++);
@@ -684,9 +709,6 @@ void single_step_once(struct pt_regs *regs)
 }
 
 #else
-#include <linux/smp.h>
-#include <linux/ptrace.h>
-#include <arch/spr_def.h>
 
 static DEFINE_PER_CPU(unsigned long, ss_saved_pc);
 
@@ -729,10 +751,10 @@ void gx_singlestep_handle(struct pt_regs *regs, int fault_num)
 	} else if ((*ss_pc != regs->pc) ||
 		   (!(control & SPR_SINGLE_STEP_CONTROL_1__CANCELED_MASK))) {
 
-		ptrace_notify(SIGTRAP);
 		control |= SPR_SINGLE_STEP_CONTROL_1__CANCELED_MASK;
 		control |= SPR_SINGLE_STEP_CONTROL_1__INHIBIT_MASK;
 		__insn_mtspr(SPR_SINGLE_STEP_CONTROL_K, control);
+		send_sigtrap(current, regs);
 	}
 }
 
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index c52224d5ed4..01e8ab29f43 100644
--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -20,8 +20,13 @@
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <asm/cacheflush.h>
+#include <asm/homecache.h>
 
-HV_Topology smp_topology __write_once;
+/*
+ * We write to width and height with a single store in head_NN.S,
+ * so make the variable aligned to "long".
+ */
+HV_Topology smp_topology __write_once __aligned(sizeof(long));
 EXPORT_SYMBOL(smp_topology);
 
 #if CHIP_HAS_IPI()
@@ -87,25 +92,6 @@ void send_IPI_allbutself(int tag)
 	send_IPI_many(&mask, tag);
 }
 
-
-/*
- * Provide smp_call_function_mask, but also run function locally
- * if specified in the mask.
- */
-void on_each_cpu_mask(const struct cpumask *mask, void (*func)(void *),
-		      void *info, bool wait)
-{
-	int cpu = get_cpu();
-	smp_call_function_many(mask, func, info, wait);
-	if (cpumask_test_cpu(cpu, mask)) {
-		local_irq_disable();
-		func(info);
-		local_irq_enable();
-	}
-	put_cpu();
-}
-
-
 /*
  * Functions related to starting/stopping cpus.
  */
@@ -119,10 +105,10 @@ static void smp_start_cpu_interrupt(void)
 /* Handler to stop the current cpu. */
 static void smp_stop_cpu_interrupt(void)
 {
-	set_cpu_online(smp_processor_id(), 0);
 	arch_local_irq_disable_all();
+	set_cpu_online(smp_processor_id(), 0);
 	for (;;)
-		asm("nap");
+		asm("nap; nop");
 }
 
 /* This function calls the 'stop' function on all other CPUs in the system. */
@@ -132,6 +118,12 @@ void smp_send_stop(void)
 	send_IPI_allbutself(MSG_TAG_STOP_CPU);
 }
 
+/* On panic, just wait; we may get an smp_send_stop() later on. */
+void panic_smp_self_stop(void)
+{
+	while (1)
+		asm("nap; nop");
+}
 
 /*
  * Dispatch code called from hv_message_intr() for HV_MSG_TILE hv messages.
@@ -180,9 +172,16 @@ static void ipi_flush_icache_range(void *info)
 void flush_icache_range(unsigned long start, unsigned long end)
 {
 	struct ipi_flush flush = { start, end };
-	preempt_disable();
-	on_each_cpu(ipi_flush_icache_range, &flush, 1);
-	preempt_enable();
+
+	/* If invoked with irqs disabled, we can not issue IPIs. */
+	if (irqs_disabled())
+		flush_remote(0, HV_FLUSH_EVICT_L1I, NULL, 0, 0, 0,
+			NULL, NULL, 0);
+	else {
+		preempt_disable();
+		on_each_cpu(ipi_flush_icache_range, &flush, 1);
+		preempt_enable();
+	}
 }
 
 
@@ -216,7 +215,7 @@ void __init ipi_init(void)
 		if (hv_get_ipi_pte(tile, KERNEL_PL, &pte) != 0)
 			panic("Failed to initialize IPI for cpu %d\n", cpu);
 
-		offset = hv_pte_get_pfn(pte) << PAGE_SHIFT;
+		offset = PFN_PHYS(pte_pfn(pte));
 		ipi_mappings[cpu] = ioremap_prot(offset, PAGE_SIZE, pte);
 	}
 #endif
diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c
index b949edcec20..732e9d13866 100644
--- a/arch/tile/kernel/smpboot.c
+++ b/arch/tile/kernel/smpboot.c
@@ -133,22 +133,24 @@ static __init int reset_init_affinity(void)
 }
 late_initcall(reset_init_affinity);
 
-static struct cpumask cpu_started __cpuinitdata;
+static struct cpumask cpu_started;
 
 /*
  * Activate a secondary processor.  Very minimal; don't add anything
  * to this path without knowing what you're doing, since SMP booting
  * is pretty fragile.
  */
-static void __cpuinit start_secondary(void)
+static void start_secondary(void)
 {
-	int cpuid = smp_processor_id();
+	int cpuid;
+
+	preempt_disable();
+
+	cpuid = smp_processor_id();
 
 	/* Set our thread pointer appropriately. */
 	set_my_cpu_offset(__per_cpu_offset[cpuid]);
 
-	preempt_disable();
-
 	/*
 	 * In large machines even this will slow us down, since we
 	 * will be contending for for the printk spinlock.
@@ -183,7 +185,7 @@ static void __cpuinit start_secondary(void)
 /*
  * Bring a secondary processor online.
  */
-void __cpuinit online_secondary(void)
+void online_secondary(void)
 {
 	/*
 	 * low-memory mappings have been cleared, flush them from
@@ -196,17 +198,9 @@ void __cpuinit online_secondary(void)
 	/* This must be done before setting cpu_online_mask */
 	wmb();
 
-	/*
-	 * We need to hold call_lock, so there is no inconsistency
-	 * between the time smp_call_function() determines number of
-	 * IPI recipients, and the time when the determination is made
-	 * for which cpus receive the IPI. Holding this
-	 * lock helps us to not include this cpu in a currently in progress
-	 * smp_call_function().
-	 */
-	ipi_call_lock();
+	notify_cpu_starting(smp_processor_id());
+
 	set_cpu_online(smp_processor_id(), 1);
-	ipi_call_unlock();
 	__get_cpu_var(cpu_state) = CPU_ONLINE;
 
 	/* Set up tile-specific state for this cpu. */
@@ -215,12 +209,10 @@ void __cpuinit online_secondary(void)
 	/* Set up tile-timer clock-event device on this cpu */
 	setup_tile_timer();
 
-	preempt_enable();
-
-	cpu_idle();
+	cpu_startup_entry(CPUHP_ONLINE);
 }
 
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	/* Wait 5s total for all CPUs for them to come online */
 	static int timeout;
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index 37ee4d037e0..c93977a6211 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -21,12 +21,16 @@
 #include <linux/stacktrace.h>
 #include <linux/uaccess.h>
 #include <linux/mmzone.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/string.h>
 #include <asm/backtrace.h>
 #include <asm/page.h>
-#include <asm/tlbflush.h>
 #include <asm/ucontext.h>
+#include <asm/switch_to.h>
 #include <asm/sigframe.h>
 #include <asm/stack.h>
+#include <asm/vdso.h>
 #include <arch/abi.h>
 #include <arch/interrupts.h>
 
@@ -44,72 +48,23 @@ static int in_kernel_stack(struct KBacktraceIterator *kbt, unsigned long sp)
 	return sp >= kstack_base && sp < kstack_base + THREAD_SIZE;
 }
 
-/* Is address valid for reading? */
-static int valid_address(struct KBacktraceIterator *kbt, unsigned long address)
-{
-	HV_PTE *l1_pgtable = kbt->pgtable;
-	HV_PTE *l2_pgtable;
-	unsigned long pfn;
-	HV_PTE pte;
-	struct page *page;
-
-	if (l1_pgtable == NULL)
-		return 0;	/* can't read user space in other tasks */
-
-#ifdef CONFIG_64BIT
-	/* Find the real l1_pgtable by looking in the l0_pgtable. */
-	pte = l1_pgtable[HV_L0_INDEX(address)];
-	if (!hv_pte_get_present(pte))
-		return 0;
-	pfn = hv_pte_get_pfn(pte);
-	if (pte_huge(pte)) {
-		if (!pfn_valid(pfn)) {
-			pr_err("L0 huge page has bad pfn %#lx\n", pfn);
-			return 0;
-		}
-		return hv_pte_get_present(pte) && hv_pte_get_readable(pte);
-	}
-	page = pfn_to_page(pfn);
-	BUG_ON(PageHighMem(page));  /* No HIGHMEM on 64-bit. */
-	l1_pgtable = (HV_PTE *)pfn_to_kaddr(pfn);
-#endif
-	pte = l1_pgtable[HV_L1_INDEX(address)];
-	if (!hv_pte_get_present(pte))
-		return 0;
-	pfn = hv_pte_get_pfn(pte);
-	if (pte_huge(pte)) {
-		if (!pfn_valid(pfn)) {
-			pr_err("huge page has bad pfn %#lx\n", pfn);
-			return 0;
-		}
-		return hv_pte_get_present(pte) && hv_pte_get_readable(pte);
-	}
-
-	page = pfn_to_page(pfn);
-	if (PageHighMem(page)) {
-		pr_err("L2 page table not in LOWMEM (%#llx)\n",
-		       HV_PFN_TO_CPA(pfn));
-		return 0;
-	}
-	l2_pgtable = (HV_PTE *)pfn_to_kaddr(pfn);
-	pte = l2_pgtable[HV_L2_INDEX(address)];
-	return hv_pte_get_present(pte) && hv_pte_get_readable(pte);
-}
-
 /* Callback for backtracer; basically a glorified memcpy */
 static bool read_memory_func(void *result, unsigned long address,
 			     unsigned int size, void *vkbt)
 {
 	int retval;
 	struct KBacktraceIterator *kbt = (struct KBacktraceIterator *)vkbt;
+
+	if (address == 0)
+		return 0;
 	if (__kernel_text_address(address)) {
 		/* OK to read kernel code. */
 	} else if (address >= PAGE_OFFSET) {
 		/* We only tolerate kernel-space reads of this task's stack */
 		if (!in_kernel_stack(kbt, address))
 			return 0;
-	} else if (!valid_address(kbt, address)) {
-		return 0;	/* invalid user-space address */
+	} else if (!kbt->is_current) {
+		return 0;	/* can't read from other user address spaces */
 	}
 	pagefault_disable();
 	retval = __copy_from_user_inatomic(result,
@@ -127,6 +82,8 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
 	unsigned long sp = kbt->it.sp;
 	struct pt_regs *p;
 
+	if (sp % sizeof(long) != 0)
+		return NULL;
 	if (!in_kernel_stack(kbt, sp))
 		return NULL;
 	if (!in_kernel_stack(kbt, sp + C_ABI_SAVE_AREA_SIZE + PTREGS_SIZE-1))
@@ -147,9 +104,8 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
 	    p->sp >= sp) {
 		if (kbt->verbose)
 			pr_err("  <%s while in kernel mode>\n", fault);
-	} else if (EX1_PL(p->ex1) == USER_PL &&
-	    p->pc < PAGE_OFFSET &&
-	    p->sp < PAGE_OFFSET) {
+	} else if (user_mode(p) &&
+		   p->sp < PAGE_OFFSET && p->sp != 0) {
 		if (kbt->verbose)
 			pr_err("  <%s while in user mode>\n", fault);
 	} else if (kbt->verbose) {
@@ -157,7 +113,7 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
 		       p->pc, p->sp, p->ex1);
 		p = NULL;
 	}
-	if (!kbt->profile || (INT_MASK(p->faultnum) & QUEUED_INTERRUPTS) == 0)
+	if (!kbt->profile || ((1ULL << p->faultnum) & QUEUED_INTERRUPTS) == 0)
 		return p;
 	return NULL;
 }
@@ -165,31 +121,31 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
 /* Is the pc pointing to a sigreturn trampoline? */
 static int is_sigreturn(unsigned long pc)
 {
-	return (pc == VDSO_BASE);
+	return current->mm && (pc == VDSO_SYM(&__vdso_rt_sigreturn));
 }
 
 /* Return a pt_regs pointer for a valid signal handler frame */
-static struct pt_regs *valid_sigframe(struct KBacktraceIterator* kbt)
+static struct pt_regs *valid_sigframe(struct KBacktraceIterator* kbt,
+				      struct rt_sigframe* kframe)
 {
 	BacktraceIterator *b = &kbt->it;
 
-	if (b->pc == VDSO_BASE) {
-		struct rt_sigframe *frame;
-		unsigned long sigframe_top =
-			b->sp + sizeof(struct rt_sigframe) - 1;
-		if (!valid_address(kbt, b->sp) ||
-		    !valid_address(kbt, sigframe_top)) {
-			if (kbt->verbose)
-				pr_err("  (odd signal: sp %#lx?)\n",
-				       (unsigned long)(b->sp));
+	if (is_sigreturn(b->pc) && b->sp < PAGE_OFFSET &&
+	    b->sp % sizeof(long) == 0) {
+		int retval;
+		pagefault_disable();
+		retval = __copy_from_user_inatomic(
+			kframe, (void __user __force *)b->sp,
+			sizeof(*kframe));
+		pagefault_enable();
+		if (retval != 0 ||
+		    (unsigned int)(kframe->info.si_signo) >= _NSIG)
 			return NULL;
-		}
-		frame = (struct rt_sigframe *)b->sp;
 		if (kbt->verbose) {
 			pr_err("  <received signal %d>\n",
-			       frame->info.si_signo);
+			       kframe->info.si_signo);
 		}
-		return (struct pt_regs *)&frame->uc.uc_mcontext;
+		return (struct pt_regs *)&kframe->uc.uc_mcontext;
 	}
 	return NULL;
 }
@@ -202,10 +158,11 @@ static int KBacktraceIterator_is_sigreturn(struct KBacktraceIterator *kbt)
 static int KBacktraceIterator_restart(struct KBacktraceIterator *kbt)
 {
 	struct pt_regs *p;
+	struct rt_sigframe kframe;
 
 	p = valid_fault_handler(kbt);
 	if (p == NULL)
-		p = valid_sigframe(kbt);
+		p = valid_sigframe(kbt, &kframe);
 	if (p == NULL)
 		return 0;
 	backtrace_init(&kbt->it, read_memory_func, kbt,
@@ -239,21 +196,21 @@ static int KBacktraceIterator_next_item_inclusive(
  */
 static void validate_stack(struct pt_regs *regs)
 {
-	int cpu = smp_processor_id();
+	int cpu = raw_smp_processor_id();
 	unsigned long ksp0 = get_current_ksp0();
-	unsigned long ksp0_base = ksp0 - THREAD_SIZE;
+	unsigned long ksp0_base = ksp0 & -THREAD_SIZE;
 	unsigned long sp = stack_pointer;
 
 	if (EX1_PL(regs->ex1) == KERNEL_PL && regs->sp >= ksp0) {
-		pr_err("WARNING: cpu %d: kernel stack page %#lx underrun!\n"
+		pr_err("WARNING: cpu %d: kernel stack %#lx..%#lx underrun!\n"
 		       "  sp %#lx (%#lx in caller), caller pc %#lx, lr %#lx\n",
-		       cpu, ksp0_base, sp, regs->sp, regs->pc, regs->lr);
+		       cpu, ksp0_base, ksp0, sp, regs->sp, regs->pc, regs->lr);
 	}
 
 	else if (sp < ksp0_base + sizeof(struct thread_info)) {
-		pr_err("WARNING: cpu %d: kernel stack page %#lx overrun!\n"
+		pr_err("WARNING: cpu %d: kernel stack %#lx..%#lx overrun!\n"
 		       "  sp %#lx (%#lx in caller), caller pc %#lx, lr %#lx\n",
-		       cpu, ksp0_base, sp, regs->sp, regs->pc, regs->lr);
+		       cpu, ksp0_base, ksp0, sp, regs->sp, regs->pc, regs->lr);
 	}
 }
 
@@ -265,41 +222,19 @@ void KBacktraceIterator_init(struct KBacktraceIterator *kbt,
 
 	/*
 	 * Set up callback information.  We grab the kernel stack base
-	 * so we will allow reads of that address range, and if we're
-	 * asking about the current process we grab the page table
-	 * so we can check user accesses before trying to read them.
-	 * We flush the TLB to avoid any weird skew issues.
+	 * so we will allow reads of that address range.
 	 */
-	is_current = (t == NULL);
+	is_current = (t == NULL || t == current);
 	kbt->is_current = is_current;
 	if (is_current)
 		t = validate_current();
 	kbt->task = t;
-	kbt->pgtable = NULL;
 	kbt->verbose = 0;   /* override in caller if desired */
 	kbt->profile = 0;   /* override in caller if desired */
 	kbt->end = KBT_ONGOING;
-	kbt->new_context = 0;
-	if (is_current) {
-		HV_PhysAddr pgdir_pa = hv_inquire_context().page_table;
-		if (pgdir_pa == (unsigned long)swapper_pg_dir - PAGE_OFFSET) {
-			/*
-			 * Not just an optimization: this also allows
-			 * this to work at all before va/pa mappings
-			 * are set up.
-			 */
-			kbt->pgtable = swapper_pg_dir;
-		} else {
-			struct page *page = pfn_to_page(PFN_DOWN(pgdir_pa));
-			if (!PageHighMem(page))
-				kbt->pgtable = __va(pgdir_pa);
-			else
-				pr_err("page table not in LOWMEM"
-				       " (%#llx)\n", pgdir_pa);
-		}
-		local_flush_tlb_all();
+	kbt->new_context = 1;
+	if (is_current)
 		validate_stack(regs);
-	}
 
 	if (regs == NULL) {
 		if (is_current || t->state == TASK_RUNNING) {
@@ -345,6 +280,95 @@ void KBacktraceIterator_next(struct KBacktraceIterator *kbt)
 }
 EXPORT_SYMBOL(KBacktraceIterator_next);
 
+static void describe_addr(struct KBacktraceIterator *kbt,
+			  unsigned long address,
+			  int have_mmap_sem, char *buf, size_t bufsize)
+{
+	struct vm_area_struct *vma;
+	size_t namelen, remaining;
+	unsigned long size, offset, adjust;
+	char *p, *modname;
+	const char *name;
+	int rc;
+
+	/*
+	 * Look one byte back for every caller frame (i.e. those that
+	 * aren't a new context) so we look up symbol data for the
+	 * call itself, not the following instruction, which may be on
+	 * a different line (or in a different function).
+	 */
+	adjust = !kbt->new_context;
+	address -= adjust;
+
+	if (address >= PAGE_OFFSET) {
+		/* Handle kernel symbols. */
+		BUG_ON(bufsize < KSYM_NAME_LEN);
+		name = kallsyms_lookup(address, &size, &offset,
+				       &modname, buf);
+		if (name == NULL) {
+			buf[0] = '\0';
+			return;
+		}
+		namelen = strlen(buf);
+		remaining = (bufsize - 1) - namelen;
+		p = buf + namelen;
+		rc = snprintf(p, remaining, "+%#lx/%#lx ",
+			      offset + adjust, size);
+		if (modname && rc < remaining)
+			snprintf(p + rc, remaining - rc, "[%s] ", modname);
+		buf[bufsize-1] = '\0';
+		return;
+	}
+
+	/* If we don't have the mmap_sem, we can't show any more info. */
+	buf[0] = '\0';
+	if (!have_mmap_sem)
+		return;
+
+	/* Find vma info. */
+	vma = find_vma(kbt->task->mm, address);
+	if (vma == NULL || address < vma->vm_start) {
+		snprintf(buf, bufsize, "[unmapped address] ");
+		return;
+	}
+
+	if (vma->vm_file) {
+		p = d_path(&vma->vm_file->f_path, buf, bufsize);
+		if (IS_ERR(p))
+			p = "?";
+		name = kbasename(p);
+	} else {
+		name = "anon";
+	}
+
+	/* Generate a string description of the vma info. */
+	namelen = strlen(name);
+	remaining = (bufsize - 1) - namelen;
+	memmove(buf, name, namelen);
+	snprintf(buf + namelen, remaining, "[%lx+%lx] ",
+		 vma->vm_start, vma->vm_end - vma->vm_start);
+}
+
+/*
+ * Avoid possible crash recursion during backtrace.  If it happens, it
+ * makes it easy to lose the actual root cause of the failure, so we
+ * put a simple guard on all the backtrace loops.
+ */
+static bool start_backtrace(void)
+{
+	if (current->thread.in_backtrace) {
+		pr_err("Backtrace requested while in backtrace!\n");
+		return false;
+	}
+	current->thread.in_backtrace = true;
+	return true;
+}
+
+static void end_backtrace(void)
+{
+	current->thread.in_backtrace = false;
+}
+
 /*
  * This method wraps the backtracer's more generic support.
  * It is only invoked from the architecture-specific code; show_stack()
@@ -353,7 +377,10 @@ EXPORT_SYMBOL(KBacktraceIterator_next);
 void tile_show_stack(struct KBacktraceIterator *kbt, int headers)
 {
 	int i;
+	int have_mmap_sem = 0;
 
+	if (!start_backtrace())
+		return;
 	if (headers) {
 		/*
 		 * Add a blank line since if we are called from panic(),
@@ -364,36 +391,21 @@ void tile_show_stack(struct KBacktraceIterator *kbt, int headers)
 		pr_err("Starting stack dump of tid %d, pid %d (%s)"
 		       " on cpu %d at cycle %lld\n",
 		       kbt->task->pid, kbt->task->tgid, kbt->task->comm,
-		       smp_processor_id(), get_cycles());
+		       raw_smp_processor_id(), get_cycles());
 	}
 	kbt->verbose = 1;
 	i = 0;
 	for (; !KBacktraceIterator_end(kbt); KBacktraceIterator_next(kbt)) {
-		char *modname;
-		const char *name;
-		unsigned long address = kbt->it.pc;
-		unsigned long offset, size;
 		char namebuf[KSYM_NAME_LEN+100];
+		unsigned long address = kbt->it.pc;
 
-		if (address >= PAGE_OFFSET)
-			name = kallsyms_lookup(address, &size, &offset,
-					       &modname, namebuf);
-		else
-			name = NULL;
-
-		if (!name)
-			namebuf[0] = '\0';
-		else {
-			size_t namelen = strlen(namebuf);
-			size_t remaining = (sizeof(namebuf) - 1) - namelen;
-			char *p = namebuf + namelen;
-			int rc = snprintf(p, remaining, "+%#lx/%#lx ",
-					  offset, size);
-			if (modname && rc < remaining)
-				snprintf(p + rc, remaining - rc,
-					 "[%s] ", modname);
-			namebuf[sizeof(namebuf)-1] = '\0';
-		}
+		/* Try to acquire the mmap_sem as we pass into userspace. */
+		if (address < PAGE_OFFSET && !have_mmap_sem && kbt->task->mm)
+			have_mmap_sem =
+				down_read_trylock(&kbt->task->mm->mmap_sem);
+
+		describe_addr(kbt, address, have_mmap_sem,
+			      namebuf, sizeof(namebuf));
 
 		pr_err("  frame %d: 0x%lx %s(sp 0x%lx)\n",
 		       i++, address, namebuf, (unsigned long)(kbt->it.sp));
@@ -408,6 +420,9 @@ void tile_show_stack(struct KBacktraceIterator *kbt, int headers)
 		pr_err("Stack dump stopped; next frame identical to this one\n");
 	if (headers)
 		pr_err("Stack dump complete\n");
+	if (have_mmap_sem)
+		up_read(&kbt->task->mm->mmap_sem);
+	end_backtrace();
 }
 EXPORT_SYMBOL(tile_show_stack);
 
@@ -448,7 +463,7 @@ void _KBacktraceIterator_init_current(struct KBacktraceIterator *kbt, ulong pc,
 				regs_to_pt_regs(&regs, pc, lr, sp, r52));
 }
 
-/* This is called only from kernel/sched.c, with esp == NULL */
+/* This is called only from kernel/sched/core.c, with esp == NULL */
 void show_stack(struct task_struct *task, unsigned long *esp)
 {
 	struct KBacktraceIterator kbt;
@@ -469,6 +484,8 @@ void save_stack_trace_tsk(struct task_struct *task, struct stack_trace *trace)
 	int skip = trace->skip;
 	int i = 0;
 
+	if (!start_backtrace())
+		goto done;
 	if (task == NULL || task == current)
 		KBacktraceIterator_init_current(&kbt);
 	else
@@ -482,6 +499,8 @@ void save_stack_trace_tsk(struct task_struct *task, struct stack_trace *trace)
 			break;
 		trace->entries[i++] = kbt.it.pc;
 	}
+	end_backtrace();
+done:
 	trace->nr_entries = i;
 }
 EXPORT_SYMBOL(save_stack_trace_tsk);
@@ -490,6 +509,7 @@ void save_stack_trace(struct stack_trace *trace)
 {
 	save_stack_trace_tsk(NULL, trace);
 }
+EXPORT_SYMBOL_GPL(save_stack_trace);
 
 #endif
 
diff --git a/arch/tile/kernel/sys.c b/arch/tile/kernel/sys.c
index cb44ba7ccd2..38debe70606 100644
--- a/arch/tile/kernel/sys.c
+++ b/arch/tile/kernel/sys.c
@@ -32,11 +32,19 @@
 #include <asm/syscalls.h>
 #include <asm/pgtable.h>
 #include <asm/homecache.h>
+#include <asm/cachectl.h>
 #include <arch/chip.h>
 
-SYSCALL_DEFINE0(flush_cache)
+SYSCALL_DEFINE3(cacheflush, unsigned long, addr, unsigned long, len,
+		unsigned long, flags)
 {
-	homecache_evict(cpumask_of(smp_processor_id()));
+	/* DCACHE is not particularly effective if not bound to one cpu. */
+	if (flags & DCACHE)
+		homecache_evict(cpumask_of(raw_smp_processor_id()));
+
+	if (flags & ICACHE)
+		flush_remote(0, HV_FLUSH_EVICT_L1I, mm_cpumask(current->mm),
+			     0, 0, 0, NULL, NULL, 0);
 	return 0;
 }
 
@@ -100,14 +108,10 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
 #define sys_readahead sys32_readahead
 #endif
 
-/* Call the trampolines to manage pt_regs where necessary. */
-#define sys_execve _sys_execve
-#define sys_sigaltstack _sys_sigaltstack
+/* Call the assembly trampolines where necessary. */
+#undef sys_rt_sigreturn
 #define sys_rt_sigreturn _sys_rt_sigreturn
 #define sys_clone _sys_clone
-#ifndef __tilegx__
-#define sys_cmpxchg_badaddr _sys_cmpxchg_badaddr
-#endif
 
 /*
  * Note that we can't include <linux/unistd.h> here since the header
diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c
index 71ae728e9d0..a3ed12f8f83 100644
--- a/arch/tile/kernel/sysfs.c
+++ b/arch/tile/kernel/sysfs.c
@@ -93,6 +93,10 @@ HV_CONF_ATTR(mezz_part,		HV_CONFSTR_MEZZ_PART_NUM)
 HV_CONF_ATTR(mezz_serial,	HV_CONFSTR_MEZZ_SERIAL_NUM)
 HV_CONF_ATTR(mezz_revision,	HV_CONFSTR_MEZZ_REV)
 HV_CONF_ATTR(mezz_description,	HV_CONFSTR_MEZZ_DESC)
+HV_CONF_ATTR(cpumod_part,	HV_CONFSTR_CPUMOD_PART_NUM)
+HV_CONF_ATTR(cpumod_serial,	HV_CONFSTR_CPUMOD_SERIAL_NUM)
+HV_CONF_ATTR(cpumod_revision,	HV_CONFSTR_CPUMOD_REV)
+HV_CONF_ATTR(cpumod_description,HV_CONFSTR_CPUMOD_DESC)
 HV_CONF_ATTR(switch_control,	HV_CONFSTR_SWITCH_CONTROL)
 
 static struct attribute *board_attrs[] = {
@@ -104,6 +108,10 @@ static struct attribute *board_attrs[] = {
 	&dev_attr_mezz_serial.attr,
 	&dev_attr_mezz_revision.attr,
 	&dev_attr_mezz_description.attr,
+	&dev_attr_cpumod_part.attr,
+	&dev_attr_cpumod_serial.attr,
+	&dev_attr_cpumod_revision.attr,
+	&dev_attr_cpumod_description.attr,
 	&dev_attr_switch_control.attr,
 	NULL
 };
@@ -149,6 +157,67 @@ hvconfig_bin_read(struct file *filp, struct kobject *kobj,
 	return count;
 }
 
+static ssize_t hv_stats_show(struct device *dev,
+			     struct device_attribute *attr,
+			     char *page)
+{
+	int cpu = dev->id;
+	long lotar = HV_XY_TO_LOTAR(cpu_x(cpu), cpu_y(cpu));
+
+	ssize_t n = hv_confstr(HV_CONFSTR_HV_STATS,
+			       (unsigned long)page, PAGE_SIZE - 1,
+			       lotar, 0);
+	n = n < 0 ? 0 : min(n, (ssize_t)PAGE_SIZE - 1);
+	page[n] = '\0';
+	return n;
+}
+
+static ssize_t hv_stats_store(struct device *dev,
+			      struct device_attribute *attr,
+			      const char *page,
+			      size_t count)
+{
+	int cpu = dev->id;
+	long lotar = HV_XY_TO_LOTAR(cpu_x(cpu), cpu_y(cpu));
+
+	ssize_t n = hv_confstr(HV_CONFSTR_HV_STATS, 0, 0, lotar, 1);
+	return n < 0 ? n : count;
+}
+
+static DEVICE_ATTR(hv_stats, 0644, hv_stats_show, hv_stats_store);
+
+static int hv_stats_device_add(struct device *dev, struct subsys_interface *sif)
+{
+	int err, cpu = dev->id;
+
+	if (!cpu_online(cpu))
+		return 0;
+
+	err = sysfs_create_file(&dev->kobj, &dev_attr_hv_stats.attr);
+
+	return err;
+}
+
+static int hv_stats_device_remove(struct device *dev,
+				  struct subsys_interface *sif)
+{
+	int cpu = dev->id;
+
+	if (!cpu_online(cpu))
+		return 0;
+
+	sysfs_remove_file(&dev->kobj, &dev_attr_hv_stats.attr);
+	return 0;
+}
+
+
+static struct subsys_interface hv_stats_interface = {
+	.name			= "hv_stats",
+	.subsys			= &cpu_subsys,
+	.add_dev		= hv_stats_device_add,
+	.remove_dev		= hv_stats_device_remove,
+};
+
 static int __init create_sysfs_entries(void)
 {
 	int err = 0;
@@ -180,6 +249,21 @@ static int __init create_sysfs_entries(void)
 		err = sysfs_create_bin_file(hypervisor_kobj, &hvconfig_bin);
 	}
 
+	if (!err) {
+		/*
+		 * Don't bother adding the hv_stats files on each CPU if
+		 * our hypervisor doesn't supply statistics.
+		 */
+		int cpu = raw_smp_processor_id();
+		long lotar = HV_XY_TO_LOTAR(cpu_x(cpu), cpu_y(cpu));
+		char dummy;
+		ssize_t n = hv_confstr(HV_CONFSTR_HV_STATS,
+				       (unsigned long) &dummy, 1,
+				       lotar, 0);
+		if (n >= 0)
+			err = subsys_interface_register(&hv_stats_interface);
+	}
+
 	return err;
 }
 subsys_initcall(create_sysfs_entries);
diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index f6f50f2a5e3..462dcd0c170 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -23,8 +23,10 @@
 #include <linux/smp.h>
 #include <linux/delay.h>
 #include <linux/module.h>
+#include <linux/timekeeper_internal.h>
 #include <asm/irq_regs.h>
 #include <asm/traps.h>
+#include <asm/vdso.h>
 #include <hv/hypervisor.h>
 #include <arch/interrupts.h>
 #include <arch/spr_def.h>
@@ -110,7 +112,6 @@ void __init time_init(void)
 	setup_tile_timer();
 }
 
-
 /*
  * Define the tile timer clock event device.  The timer is driven by
  * the TILE_TIMER_CONTROL register, which consists of a 31-bit down
@@ -159,7 +160,7 @@ static DEFINE_PER_CPU(struct clock_event_device, tile_timer) = {
 	.set_mode = tile_timer_set_mode,
 };
 
-void __cpuinit setup_tile_timer(void)
+void setup_tile_timer(void)
 {
 	struct clock_event_device *evt = &__get_cpu_var(tile_timer);
 
@@ -230,6 +231,52 @@ int setup_profiling_timer(unsigned int multiplier)
  */
 cycles_t ns2cycles(unsigned long nsecs)
 {
-	struct clock_event_device *dev = &__get_cpu_var(tile_timer);
-	return ((u64)nsecs * dev->mult) >> dev->shift;
+	/*
+	 * We do not have to disable preemption here as each core has the same
+	 * clock frequency.
+	 */
+	struct clock_event_device *dev = &__raw_get_cpu_var(tile_timer);
+
+	/*
+	 * as in clocksource.h and x86's timer.h, we split the calculation
+	 * into 2 parts to avoid unecessary overflow of the intermediate
+	 * value. This will not lead to any loss of precision.
+	 */
+	u64 quot = (u64)nsecs >> dev->shift;
+	u64 rem  = (u64)nsecs & ((1ULL << dev->shift) - 1);
+	return quot * dev->mult + ((rem * dev->mult) >> dev->shift);
+}
+
+void update_vsyscall_tz(void)
+{
+	/* Userspace gettimeofday will spin while this value is odd. */
+	++vdso_data->tz_update_count;
+	smp_wmb();
+	vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
+	vdso_data->tz_dsttime = sys_tz.tz_dsttime;
+	smp_wmb();
+	++vdso_data->tz_update_count;
+}
+
+void update_vsyscall(struct timekeeper *tk)
+{
+	struct timespec wall_time = tk_xtime(tk);
+	struct timespec *wtm = &tk->wall_to_monotonic;
+	struct clocksource *clock = tk->clock;
+
+	if (clock != &cycle_counter_cs)
+		return;
+
+	/* Userspace gettimeofday will spin while this value is odd. */
+	++vdso_data->tb_update_count;
+	smp_wmb();
+	vdso_data->xtime_tod_stamp = clock->cycle_last;
+	vdso_data->xtime_clock_sec = wall_time.tv_sec;
+	vdso_data->xtime_clock_nsec = wall_time.tv_nsec;
+	vdso_data->wtom_clock_sec = wtm->tv_sec;
+	vdso_data->wtom_clock_nsec = wtm->tv_nsec;
+	vdso_data->mult = clock->mult;
+	vdso_data->shift = clock->shift;
+	smp_wmb();
+	++vdso_data->tb_update_count;
 }
diff --git a/arch/tile/kernel/tlb.c b/arch/tile/kernel/tlb.c
index a5f241c24ca..f23b5351567 100644
--- a/arch/tile/kernel/tlb.c
+++ b/arch/tile/kernel/tlb.c
@@ -15,6 +15,7 @@
 
 #include <linux/cpumask.h>
 #include <linux/module.h>
+#include <linux/hugetlb.h>
 #include <asm/tlbflush.h>
 #include <asm/homecache.h>
 #include <hv/hypervisor.h>
@@ -49,25 +50,25 @@ void flush_tlb_current_task(void)
 	flush_tlb_mm(current->mm);
 }
 
-void flush_tlb_page_mm(const struct vm_area_struct *vma, struct mm_struct *mm,
+void flush_tlb_page_mm(struct vm_area_struct *vma, struct mm_struct *mm,
 		       unsigned long va)
 {
-	unsigned long size = hv_page_size(vma);
+	unsigned long size = vma_kernel_pagesize(vma);
 	int cache = (vma->vm_flags & VM_EXEC) ? HV_FLUSH_EVICT_L1I : 0;
 	flush_remote(0, cache, mm_cpumask(mm),
 		     va, size, size, mm_cpumask(mm), NULL, 0);
 }
 
-void flush_tlb_page(const struct vm_area_struct *vma, unsigned long va)
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
 {
 	flush_tlb_page_mm(vma, vma->vm_mm, va);
 }
 EXPORT_SYMBOL(flush_tlb_page);
 
-void flush_tlb_range(const struct vm_area_struct *vma,
+void flush_tlb_range(struct vm_area_struct *vma,
 		     unsigned long start, unsigned long end)
 {
-	unsigned long size = hv_page_size(vma);
+	unsigned long size = vma_kernel_pagesize(vma);
 	struct mm_struct *mm = vma->vm_mm;
 	int cache = (vma->vm_flags & VM_EXEC) ? HV_FLUSH_EVICT_L1I : 0;
 	flush_remote(0, cache, mm_cpumask(mm), start, end - start, size,
@@ -90,8 +91,14 @@ void flush_tlb_all(void)
 	}
 }
 
+/*
+ * Callers need to flush the L1I themselves if necessary, e.g. for
+ * kernel module unload.  Otherwise we assume callers are not using
+ * executable pgprot_t's.  Using EVICT_L1I means that dataplane cpus
+ * will get an unnecessary interrupt otherwise.
+ */
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	flush_remote(0, HV_FLUSH_EVICT_L1I, cpu_online_mask,
+	flush_remote(0, 0, NULL,
 		     start, end - start, PAGE_SIZE, cpu_online_mask, NULL, 0);
 }
diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
index 4f47b8a356d..f3ceb6308e4 100644
--- a/arch/tile/kernel/traps.c
+++ b/arch/tile/kernel/traps.c
@@ -15,12 +15,14 @@
 #include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
+#include <linux/kdebug.h>
 #include <linux/module.h>
 #include <linux/reboot.h>
 #include <linux/uaccess.h>
 #include <linux/ptrace.h>
 #include <asm/stack.h>
 #include <asm/traps.h>
+#include <asm/setup.h>
 
 #include <arch/interrupts.h>
 #include <arch/spr_def.h>
@@ -28,7 +30,7 @@
 
 void __init trap_init(void)
 {
-	/* Nothing needed here since we link code at .intrpt1 */
+	/* Nothing needed here since we link code at .intrpt */
 }
 
 int unaligned_fixup = 1;
@@ -40,10 +42,9 @@ static int __init setup_unaligned_fixup(char *str)
 	 * will still parse the instruction, then fire a SIGBUS with
 	 * the correct address from inside the single_step code.
 	 */
-	long val;
-	if (strict_strtol(str, 0, &val) != 0)
+	if (kstrtoint(str, 0, &unaligned_fixup) != 0)
 		return 0;
-	unaligned_fixup = val;
+
 	pr_info("Fixups for unaligned data accesses are %s\n",
 	       unaligned_fixup >= 0 ?
 	       (unaligned_fixup ? "enabled" : "disabled") :
@@ -99,13 +100,7 @@ static int retry_gpv(unsigned int gpv_reason)
 
 #endif /* CHIP_HAS_TILE_DMA() */
 
-#ifdef __tilegx__
-#define bundle_bits tilegx_bundle_bits
-#else
-#define bundle_bits tile_bundle_bits
-#endif
-
-extern bundle_bits bpt_code;
+extern tile_bundle_bits bpt_code;
 
 asm(".pushsection .rodata.bpt_code,\"a\";"
     ".align 8;"
@@ -113,7 +108,7 @@ asm(".pushsection .rodata.bpt_code,\"a\";"
     ".size bpt_code,.-bpt_code;"
     ".popsection");
 
-static int special_ill(bundle_bits bundle, int *sigp, int *codep)
+static int special_ill(tile_bundle_bits bundle, int *sigp, int *codep)
 {
 	int sig, code, maxcode;
 
@@ -194,34 +189,119 @@ static int special_ill(bundle_bits bundle, int *sigp, int *codep)
 	return 1;
 }
 
+static const char *const int_name[] = {
+	[INT_MEM_ERROR] = "Memory error",
+	[INT_ILL] = "Illegal instruction",
+	[INT_GPV] = "General protection violation",
+	[INT_UDN_ACCESS] = "UDN access",
+	[INT_IDN_ACCESS] = "IDN access",
+#if CHIP_HAS_SN()
+	[INT_SN_ACCESS] = "SN access",
+#endif
+	[INT_SWINT_3] = "Software interrupt 3",
+	[INT_SWINT_2] = "Software interrupt 2",
+	[INT_SWINT_0] = "Software interrupt 0",
+	[INT_UNALIGN_DATA] = "Unaligned data",
+	[INT_DOUBLE_FAULT] = "Double fault",
+#ifdef __tilegx__
+	[INT_ILL_TRANS] = "Illegal virtual address",
+#endif
+};
+
+static int do_bpt(struct pt_regs *regs)
+{
+	unsigned long bundle, bcode, bpt;
+
+	bundle = *(unsigned long *)instruction_pointer(regs);
+
+	/*
+	 * bpt shoule be { bpt; nop }, which is 0x286a44ae51485000ULL.
+	 * we encode the unused least significant bits for other purpose.
+	 */
+	bpt = bundle & ~((1ULL << 12) - 1);
+	if (bpt != TILE_BPT_BUNDLE)
+		return 0;
+
+	bcode = bundle & ((1ULL << 12) - 1);
+	/*
+	 * notify the kprobe handlers, if instruction is likely to
+	 * pertain to them.
+	 */
+	switch (bcode) {
+	/* breakpoint_insn */
+	case 0:
+		notify_die(DIE_BREAK, "debug", regs, bundle,
+			INT_ILL, SIGTRAP);
+		break;
+	/* compiled_bpt */
+	case DIE_COMPILED_BPT:
+		notify_die(DIE_COMPILED_BPT, "debug", regs, bundle,
+			INT_ILL, SIGTRAP);
+		break;
+	/* breakpoint2_insn */
+	case DIE_SSTEPBP:
+		notify_die(DIE_SSTEPBP, "single_step", regs, bundle,
+			INT_ILL, SIGTRAP);
+		break;
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
 void __kprobes do_trap(struct pt_regs *regs, int fault_num,
 		       unsigned long reason)
 {
 	siginfo_t info = { 0 };
 	int signo, code;
-	unsigned long address;
-	bundle_bits instr;
+	unsigned long address = 0;
+	tile_bundle_bits instr;
+	int is_kernel = !user_mode(regs);
+
+	/* Handle breakpoints, etc. */
+	if (is_kernel && fault_num == INT_ILL && do_bpt(regs))
+		return;
 
-	/* Re-enable interrupts. */
-	local_irq_enable();
+	/* Re-enable interrupts, if they were previously enabled. */
+	if (!(regs->flags & PT_FLAGS_DISABLE_IRQ))
+		local_irq_enable();
 
 	/*
 	 * If it hits in kernel mode and we can't fix it up, just exit the
 	 * current process and hope for the best.
 	 */
-	if (!user_mode(regs)) {
-		if (fixup_exception(regs))  /* only UNALIGN_DATA in practice */
+	if (is_kernel) {
+		const char *name;
+		char buf[100];
+		if (fixup_exception(regs))  /* ILL_TRANS or UNALIGN_DATA */
 			return;
-		pr_alert("Kernel took bad trap %d at PC %#lx\n",
-		       fault_num, regs->pc);
+		if (fault_num >= 0 &&
+		    fault_num < sizeof(int_name)/sizeof(int_name[0]) &&
+		    int_name[fault_num] != NULL)
+			name = int_name[fault_num];
+		else
+			name = "Unknown interrupt";
 		if (fault_num == INT_GPV)
-			pr_alert("GPV_REASON is %#lx\n", reason);
+			snprintf(buf, sizeof(buf), "; GPV_REASON %#lx", reason);
+#ifdef __tilegx__
+		else if (fault_num == INT_ILL_TRANS)
+			snprintf(buf, sizeof(buf), "; address %#lx", reason);
+#endif
+		else
+			buf[0] = '\0';
+		pr_alert("Kernel took bad trap %d (%s) at PC %#lx%s\n",
+			 fault_num, name, regs->pc, buf);
 		show_regs(regs);
 		do_exit(SIGKILL);  /* FIXME: implement i386 die() */
 		return;
 	}
 
 	switch (fault_num) {
+	case INT_MEM_ERROR:
+		signo = SIGBUS;
+		code = BUS_OBJERR;
+		break;
 	case INT_ILL:
 		if (copy_from_user(&instr, (void __user *)regs->pc,
 				   sizeof(instr))) {
@@ -288,14 +368,15 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num,
 		address = regs->pc;
 		break;
 #ifdef __tilegx__
-	case INT_ILL_TRANS:
+	case INT_ILL_TRANS: {
+		/* Avoid a hardware erratum with the return address stack. */
+		fill_ra_stack();
+
 		signo = SIGSEGV;
+		address = reason;
 		code = SEGV_MAPERR;
-		if (reason & SPR_ILL_TRANS_REASON__I_STREAM_VA_RMASK)
-			address = regs->pc;
-		else
-			address = 0;  /* FIXME: GX: single-step for address */
 		break;
+	}
 #endif
 	default:
 		panic("Unexpected do_trap interrupt number %d", fault_num);
@@ -307,7 +388,8 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num,
 	info.si_addr = (void __user *)address;
 	if (signo == SIGILL)
 		info.si_trapno = fault_num;
-	trace_unhandled_signal("trap", regs, address, signo);
+	if (signo != SIGTRAP)
+		trace_unhandled_signal("trap", regs, address, signo);
 	force_sig_info(signo, &info, current);
 }
 
diff --git a/arch/tile/kernel/unaligned.c b/arch/tile/kernel/unaligned.c
new file mode 100644
index 00000000000..c02ea2a45f6
--- /dev/null
+++ b/arch/tile/kernel/unaligned.c
@@ -0,0 +1,1598 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * A code-rewriter that handles unaligned exception.
+ */
+
+#include <linux/smp.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/uaccess.h>
+#include <linux/mman.h>
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/compat.h>
+#include <linux/prctl.h>
+#include <asm/cacheflush.h>
+#include <asm/traps.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+#include <arch/abi.h>
+#include <arch/spr_def.h>
+#include <arch/opcode.h>
+
+
+/*
+ * This file handles unaligned exception for tile-Gx. The tilepro's unaligned
+ * exception is supported out of single_step.c
+ */
+
+int unaligned_printk;
+
+static int __init setup_unaligned_printk(char *str)
+{
+	long val;
+	if (kstrtol(str, 0, &val) != 0)
+		return 0;
+	unaligned_printk = val;
+	pr_info("Printk for each unaligned data accesses is %s\n",
+		unaligned_printk ? "enabled" : "disabled");
+	return 1;
+}
+__setup("unaligned_printk=", setup_unaligned_printk);
+
+unsigned int unaligned_fixup_count;
+
+#ifdef __tilegx__
+
+/*
+ * Unalign data jit fixup code fragement. Reserved space is 128 bytes.
+ * The 1st 64-bit word saves fault PC address, 2nd word is the fault
+ * instruction bundle followed by 14 JIT bundles.
+ */
+
+struct unaligned_jit_fragment {
+	unsigned long       pc;
+	tilegx_bundle_bits  bundle;
+	tilegx_bundle_bits  insn[14];
+};
+
+/*
+ * Check if a nop or fnop at bundle's pipeline X0.
+ */
+
+static bool is_bundle_x0_nop(tilegx_bundle_bits bundle)
+{
+	return (((get_UnaryOpcodeExtension_X0(bundle) ==
+		  NOP_UNARY_OPCODE_X0) &&
+		 (get_RRROpcodeExtension_X0(bundle) ==
+		  UNARY_RRR_0_OPCODE_X0) &&
+		 (get_Opcode_X0(bundle) ==
+		  RRR_0_OPCODE_X0)) ||
+		((get_UnaryOpcodeExtension_X0(bundle) ==
+		  FNOP_UNARY_OPCODE_X0) &&
+		 (get_RRROpcodeExtension_X0(bundle) ==
+		  UNARY_RRR_0_OPCODE_X0) &&
+		 (get_Opcode_X0(bundle) ==
+		  RRR_0_OPCODE_X0)));
+}
+
+/*
+ * Check if nop or fnop at bundle's pipeline X1.
+ */
+
+static bool is_bundle_x1_nop(tilegx_bundle_bits bundle)
+{
+	return (((get_UnaryOpcodeExtension_X1(bundle) ==
+		  NOP_UNARY_OPCODE_X1) &&
+		 (get_RRROpcodeExtension_X1(bundle) ==
+		  UNARY_RRR_0_OPCODE_X1) &&
+		 (get_Opcode_X1(bundle) ==
+		  RRR_0_OPCODE_X1)) ||
+		((get_UnaryOpcodeExtension_X1(bundle) ==
+		  FNOP_UNARY_OPCODE_X1) &&
+		 (get_RRROpcodeExtension_X1(bundle) ==
+		  UNARY_RRR_0_OPCODE_X1) &&
+		 (get_Opcode_X1(bundle) ==
+		  RRR_0_OPCODE_X1)));
+}
+
+/*
+ * Check if nop or fnop at bundle's Y0 pipeline.
+ */
+
+static bool is_bundle_y0_nop(tilegx_bundle_bits bundle)
+{
+	return (((get_UnaryOpcodeExtension_Y0(bundle) ==
+		  NOP_UNARY_OPCODE_Y0) &&
+		 (get_RRROpcodeExtension_Y0(bundle) ==
+		  UNARY_RRR_1_OPCODE_Y0) &&
+		 (get_Opcode_Y0(bundle) ==
+		  RRR_1_OPCODE_Y0)) ||
+		((get_UnaryOpcodeExtension_Y0(bundle) ==
+		  FNOP_UNARY_OPCODE_Y0) &&
+		 (get_RRROpcodeExtension_Y0(bundle) ==
+		  UNARY_RRR_1_OPCODE_Y0) &&
+		 (get_Opcode_Y0(bundle) ==
+		  RRR_1_OPCODE_Y0)));
+}
+
+/*
+ * Check if nop or fnop at bundle's pipeline Y1.
+ */
+
+static bool is_bundle_y1_nop(tilegx_bundle_bits bundle)
+{
+	return (((get_UnaryOpcodeExtension_Y1(bundle) ==
+		  NOP_UNARY_OPCODE_Y1) &&
+		 (get_RRROpcodeExtension_Y1(bundle) ==
+		  UNARY_RRR_1_OPCODE_Y1) &&
+		 (get_Opcode_Y1(bundle) ==
+		  RRR_1_OPCODE_Y1)) ||
+		((get_UnaryOpcodeExtension_Y1(bundle) ==
+		  FNOP_UNARY_OPCODE_Y1) &&
+		 (get_RRROpcodeExtension_Y1(bundle) ==
+		  UNARY_RRR_1_OPCODE_Y1) &&
+		 (get_Opcode_Y1(bundle) ==
+		  RRR_1_OPCODE_Y1)));
+}
+
+/*
+ * Test if a bundle's y0 and y1 pipelines are both nop or fnop.
+ */
+
+static bool is_y0_y1_nop(tilegx_bundle_bits bundle)
+{
+	return is_bundle_y0_nop(bundle) && is_bundle_y1_nop(bundle);
+}
+
+/*
+ * Test if a bundle's x0 and x1 pipelines are both nop or fnop.
+ */
+
+static bool is_x0_x1_nop(tilegx_bundle_bits bundle)
+{
+	return is_bundle_x0_nop(bundle) && is_bundle_x1_nop(bundle);
+}
+
+/*
+ * Find the destination, source registers of fault unalign access instruction
+ * at X1 or Y2. Also, allocate up to 3 scratch registers clob1, clob2 and
+ * clob3, which are guaranteed different from any register used in the fault
+ * bundle. r_alias is used to return if the other instructions other than the
+ * unalign load/store shares same register with ra, rb and rd.
+ */
+
+static void find_regs(tilegx_bundle_bits bundle, uint64_t *rd, uint64_t *ra,
+		      uint64_t *rb, uint64_t *clob1, uint64_t *clob2,
+		      uint64_t *clob3, bool *r_alias)
+{
+	int i;
+	uint64_t reg;
+	uint64_t reg_map = 0, alias_reg_map = 0, map;
+	bool alias = false;
+
+	/*
+	 * Parse fault bundle, find potential used registers and mark
+	 * corresponding bits in reg_map and alias_map. These 2 bit maps
+	 * are used to find the scratch registers and determine if there
+	 * is register alais.
+	 */
+	if (bundle & TILEGX_BUNDLE_MODE_MASK) {  /* Y Mode Bundle. */
+
+		reg = get_SrcA_Y2(bundle);
+		reg_map |= 1ULL << reg;
+		*ra = reg;
+		reg = get_SrcBDest_Y2(bundle);
+		reg_map |= 1ULL << reg;
+
+		if (rd) {
+			/* Load. */
+			*rd = reg;
+			alias_reg_map = (1ULL << *rd) | (1ULL << *ra);
+		} else {
+			/* Store. */
+			*rb = reg;
+			alias_reg_map = (1ULL << *ra) | (1ULL << *rb);
+		}
+
+		if (!is_bundle_y1_nop(bundle)) {
+			reg = get_SrcA_Y1(bundle);
+			reg_map |= (1ULL << reg);
+			map = (1ULL << reg);
+
+			reg = get_SrcB_Y1(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			reg = get_Dest_Y1(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			if (map & alias_reg_map)
+				alias = true;
+		}
+
+		if (!is_bundle_y0_nop(bundle)) {
+			reg = get_SrcA_Y0(bundle);
+			reg_map |= (1ULL << reg);
+			map = (1ULL << reg);
+
+			reg = get_SrcB_Y0(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			reg = get_Dest_Y0(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			if (map & alias_reg_map)
+				alias = true;
+		}
+	} else	{ /* X Mode Bundle. */
+
+		reg = get_SrcA_X1(bundle);
+		reg_map |= (1ULL << reg);
+		*ra = reg;
+		if (rd)	{
+			/* Load. */
+			reg = get_Dest_X1(bundle);
+			reg_map |= (1ULL << reg);
+			*rd = reg;
+			alias_reg_map = (1ULL << *rd) | (1ULL << *ra);
+		} else {
+			/* Store. */
+			reg = get_SrcB_X1(bundle);
+			reg_map |= (1ULL << reg);
+			*rb = reg;
+			alias_reg_map = (1ULL << *ra) | (1ULL << *rb);
+		}
+
+		if (!is_bundle_x0_nop(bundle)) {
+			reg = get_SrcA_X0(bundle);
+			reg_map |= (1ULL << reg);
+			map = (1ULL << reg);
+
+			reg = get_SrcB_X0(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			reg = get_Dest_X0(bundle);
+			reg_map |= (1ULL << reg);
+			map |= (1ULL << reg);
+
+			if (map & alias_reg_map)
+				alias = true;
+		}
+	}
+
+	/*
+	 * "alias" indicates if the unalign access registers have collision
+	 * with others in the same bundle. We jsut simply test all register
+	 * operands case (RRR), ignored the case with immidate. If a bundle
+	 * has no register alias, we may do fixup in a simple or fast manner.
+	 * So if an immidata field happens to hit with a register, we may end
+	 * up fall back to the generic handling.
+	 */
+
+	*r_alias = alias;
+
+	/* Flip bits on reg_map. */
+	reg_map ^= -1ULL;
+
+	/* Scan reg_map lower 54(TREG_SP) bits to find 3 set bits. */
+	for (i = 0; i < TREG_SP; i++) {
+		if (reg_map & (0x1ULL << i)) {
+			if (*clob1 == -1) {
+				*clob1 = i;
+			} else if (*clob2 == -1) {
+				*clob2 = i;
+			} else if (*clob3 == -1) {
+				*clob3 = i;
+				return;
+			}
+		}
+	}
+}
+
+/*
+ * Sanity check for register ra, rb, rd, clob1/2/3. Return true if any of them
+ * is unexpected.
+ */
+
+static bool check_regs(uint64_t rd, uint64_t ra, uint64_t rb,
+		       uint64_t clob1, uint64_t clob2,  uint64_t clob3)
+{
+	bool unexpected = false;
+	if ((ra >= 56) && (ra != TREG_ZERO))
+		unexpected = true;
+
+	if ((clob1 >= 56) || (clob2 >= 56) || (clob3 >= 56))
+		unexpected = true;
+
+	if (rd != -1) {
+		if ((rd >= 56) && (rd != TREG_ZERO))
+			unexpected = true;
+	} else {
+		if ((rb >= 56) && (rb != TREG_ZERO))
+			unexpected = true;
+	}
+	return unexpected;
+}
+
+
+#define  GX_INSN_X0_MASK   ((1ULL << 31) - 1)
+#define  GX_INSN_X1_MASK   (((1ULL << 31) - 1) << 31)
+#define  GX_INSN_Y0_MASK   ((0xFULL << 27) | (0xFFFFFULL))
+#define  GX_INSN_Y1_MASK   (GX_INSN_Y0_MASK << 31)
+#define  GX_INSN_Y2_MASK   ((0x7FULL << 51) | (0x7FULL << 20))
+
+#ifdef __LITTLE_ENDIAN
+#define  GX_INSN_BSWAP(_bundle_)    (_bundle_)
+#else
+#define  GX_INSN_BSWAP(_bundle_)    swab64(_bundle_)
+#endif /* __LITTLE_ENDIAN */
+
+/*
+ * __JIT_CODE(.) creates template bundles in .rodata.unalign_data section.
+ * The corresponding static function jix_x#_###(.) generates partial or
+ * whole bundle based on the template and given arguments.
+ */
+
+#define __JIT_CODE(_X_)						\
+	asm (".pushsection .rodata.unalign_data, \"a\"\n"	\
+	     _X_"\n"						\
+	     ".popsection\n")
+
+__JIT_CODE("__unalign_jit_x1_mtspr:   {mtspr 0,  r0}");
+static tilegx_bundle_bits jit_x1_mtspr(int spr, int reg)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_mtspr;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_mtspr) & GX_INSN_X1_MASK) |
+		create_MT_Imm14_X1(spr) | create_SrcA_X1(reg);
+}
+
+__JIT_CODE("__unalign_jit_x1_mfspr:   {mfspr r0, 0}");
+static tilegx_bundle_bits  jit_x1_mfspr(int reg, int spr)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_mfspr;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_mfspr) & GX_INSN_X1_MASK) |
+		create_MF_Imm14_X1(spr) | create_Dest_X1(reg);
+}
+
+__JIT_CODE("__unalign_jit_x0_addi:   {addi  r0, r0, 0; iret}");
+static tilegx_bundle_bits  jit_x0_addi(int rd, int ra, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_addi;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_addi) & GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_Imm8_X0(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x1_ldna:   {ldna  r0, r0}");
+static tilegx_bundle_bits  jit_x1_ldna(int rd, int ra)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_ldna;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_ldna) &  GX_INSN_X1_MASK) |
+		create_Dest_X1(rd) | create_SrcA_X1(ra);
+}
+
+__JIT_CODE("__unalign_jit_x0_dblalign:   {dblalign r0, r0 ,r0}");
+static tilegx_bundle_bits  jit_x0_dblalign(int rd, int ra, int rb)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_dblalign;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_dblalign) & GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_SrcB_X0(rb);
+}
+
+__JIT_CODE("__unalign_jit_x1_iret:   {iret}");
+static tilegx_bundle_bits  jit_x1_iret(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_iret;
+	return GX_INSN_BSWAP(__unalign_jit_x1_iret) & GX_INSN_X1_MASK;
+}
+
+__JIT_CODE("__unalign_jit_x01_fnop:   {fnop;fnop}");
+static tilegx_bundle_bits  jit_x0_fnop(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x01_fnop;
+	return GX_INSN_BSWAP(__unalign_jit_x01_fnop) & GX_INSN_X0_MASK;
+}
+
+static tilegx_bundle_bits  jit_x1_fnop(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x01_fnop;
+	return GX_INSN_BSWAP(__unalign_jit_x01_fnop) & GX_INSN_X1_MASK;
+}
+
+__JIT_CODE("__unalign_jit_y2_dummy:   {fnop; fnop; ld zero, sp}");
+static tilegx_bundle_bits  jit_y2_dummy(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_y2_dummy;
+	return GX_INSN_BSWAP(__unalign_jit_y2_dummy) & GX_INSN_Y2_MASK;
+}
+
+static tilegx_bundle_bits  jit_y1_fnop(void)
+{
+	extern  tilegx_bundle_bits __unalign_jit_y2_dummy;
+	return GX_INSN_BSWAP(__unalign_jit_y2_dummy) & GX_INSN_Y1_MASK;
+}
+
+__JIT_CODE("__unalign_jit_x1_st1_add:  {st1_add r1, r0, 0}");
+static tilegx_bundle_bits  jit_x1_st1_add(int ra, int rb, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_st1_add;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_st1_add) &
+		(~create_SrcA_X1(-1)) &
+		GX_INSN_X1_MASK) | create_SrcA_X1(ra) |
+		create_SrcB_X1(rb) | create_Dest_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x1_st:  {crc32_8 r1, r0, r0; st  r0, r0}");
+static tilegx_bundle_bits  jit_x1_st(int ra, int rb)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_st;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_st) & GX_INSN_X1_MASK) |
+		create_SrcA_X1(ra) | create_SrcB_X1(rb);
+}
+
+__JIT_CODE("__unalign_jit_x1_st_add:  {st_add  r1, r0, 0}");
+static tilegx_bundle_bits  jit_x1_st_add(int ra, int rb, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_st_add;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_st_add) &
+		(~create_SrcA_X1(-1)) &
+		GX_INSN_X1_MASK) | create_SrcA_X1(ra) |
+		create_SrcB_X1(rb) | create_Dest_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x1_ld:  {crc32_8 r1, r0, r0; ld  r0, r0}");
+static tilegx_bundle_bits  jit_x1_ld(int rd, int ra)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_ld;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_ld) & GX_INSN_X1_MASK) |
+		create_Dest_X1(rd) | create_SrcA_X1(ra);
+}
+
+__JIT_CODE("__unalign_jit_x1_ld_add:  {ld_add  r1, r0, 0}");
+static tilegx_bundle_bits  jit_x1_ld_add(int rd, int ra, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_ld_add;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_ld_add) &
+		(~create_Dest_X1(-1)) &
+		GX_INSN_X1_MASK) | create_Dest_X1(rd) |
+		create_SrcA_X1(ra) | create_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x0_bfexts:  {bfexts r0, r0, 0, 0}");
+static tilegx_bundle_bits  jit_x0_bfexts(int rd, int ra, int bfs, int bfe)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_bfexts;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_bfexts) &
+		GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_BFStart_X0(bfs) | create_BFEnd_X0(bfe);
+}
+
+__JIT_CODE("__unalign_jit_x0_bfextu:  {bfextu r0, r0, 0, 0}");
+static tilegx_bundle_bits  jit_x0_bfextu(int rd, int ra, int bfs, int bfe)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_bfextu;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_bfextu) &
+		GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_BFStart_X0(bfs) | create_BFEnd_X0(bfe);
+}
+
+__JIT_CODE("__unalign_jit_x1_addi:  {bfextu r1, r1, 0, 0; addi r0, r0, 0}");
+static tilegx_bundle_bits  jit_x1_addi(int rd, int ra, int imm8)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_addi;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_addi) & GX_INSN_X1_MASK) |
+		create_Dest_X1(rd) | create_SrcA_X1(ra) |
+		create_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x0_shrui:  {shrui r0, r0, 0; iret}");
+static tilegx_bundle_bits  jit_x0_shrui(int rd, int ra, int imm6)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_shrui;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_shrui) &
+		GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_ShAmt_X0(imm6);
+}
+
+__JIT_CODE("__unalign_jit_x0_rotli:  {rotli r0, r0, 0; iret}");
+static tilegx_bundle_bits  jit_x0_rotli(int rd, int ra, int imm6)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x0_rotli;
+	return (GX_INSN_BSWAP(__unalign_jit_x0_rotli) &
+		GX_INSN_X0_MASK) |
+		create_Dest_X0(rd) | create_SrcA_X0(ra) |
+		create_ShAmt_X0(imm6);
+}
+
+__JIT_CODE("__unalign_jit_x1_bnezt:  {bnezt r0, __unalign_jit_x1_bnezt}");
+static tilegx_bundle_bits  jit_x1_bnezt(int ra, int broff)
+{
+	extern  tilegx_bundle_bits __unalign_jit_x1_bnezt;
+	return (GX_INSN_BSWAP(__unalign_jit_x1_bnezt) &
+		GX_INSN_X1_MASK) |
+		create_SrcA_X1(ra) | create_BrOff_X1(broff);
+}
+
+#undef __JIT_CODE
+
+/*
+ * This function generates unalign fixup JIT.
+ *
+ * We first find unalign load/store instruction's destination, source
+ * registers: ra, rb and rd. and 3 scratch registers by calling
+ * find_regs(...). 3 scratch clobbers should not alias with any register
+ * used in the fault bundle. Then analyze the fault bundle to determine
+ * if it's a load or store, operand width, branch or address increment etc.
+ * At last generated JIT is copied into JIT code area in user space.
+ */
+
+static
+void jit_bundle_gen(struct pt_regs *regs, tilegx_bundle_bits bundle,
+		    int align_ctl)
+{
+	struct thread_info *info = current_thread_info();
+	struct unaligned_jit_fragment frag;
+	struct unaligned_jit_fragment *jit_code_area;
+	tilegx_bundle_bits bundle_2 = 0;
+	/* If bundle_2_enable = false, bundle_2 is fnop/nop operation. */
+	bool     bundle_2_enable = true;
+	uint64_t ra = -1, rb = -1, rd = -1, clob1 = -1, clob2 = -1, clob3 = -1;
+	/*
+	 * Indicate if the unalign access
+	 * instruction's registers hit with
+	 * others in the same bundle.
+	 */
+	bool     alias = false;
+	bool     load_n_store = true;
+	bool     load_store_signed = false;
+	unsigned int  load_store_size = 8;
+	bool     y1_br = false;  /* True, for a branch in same bundle at Y1.*/
+	int      y1_br_reg = 0;
+	/* True for link operation. i.e. jalr or lnk at Y1 */
+	bool     y1_lr = false;
+	int      y1_lr_reg = 0;
+	bool     x1_add = false;/* True, for load/store ADD instruction at X1*/
+	int      x1_add_imm8 = 0;
+	bool     unexpected = false;
+	int      n = 0, k;
+
+	jit_code_area =
+		(struct unaligned_jit_fragment *)(info->unalign_jit_base);
+
+	memset((void *)&frag, 0, sizeof(frag));
+
+	/* 0: X mode, Otherwise: Y mode. */
+	if (bundle & TILEGX_BUNDLE_MODE_MASK) {
+		unsigned int mod, opcode;
+
+		if (get_Opcode_Y1(bundle) == RRR_1_OPCODE_Y1 &&
+		    get_RRROpcodeExtension_Y1(bundle) ==
+		    UNARY_RRR_1_OPCODE_Y1) {
+
+			opcode = get_UnaryOpcodeExtension_Y1(bundle);
+
+			/*
+			 * Test "jalr", "jalrp", "jr", "jrp" instruction at Y1
+			 * pipeline.
+			 */
+			switch (opcode) {
+			case JALR_UNARY_OPCODE_Y1:
+			case JALRP_UNARY_OPCODE_Y1:
+				y1_lr = true;
+				y1_lr_reg = 55; /* Link register. */
+				/* FALLTHROUGH */
+			case JR_UNARY_OPCODE_Y1:
+			case JRP_UNARY_OPCODE_Y1:
+				y1_br = true;
+				y1_br_reg = get_SrcA_Y1(bundle);
+				break;
+			case LNK_UNARY_OPCODE_Y1:
+				/* "lnk" at Y1 pipeline. */
+				y1_lr = true;
+				y1_lr_reg = get_Dest_Y1(bundle);
+				break;
+			}
+		}
+
+		opcode = get_Opcode_Y2(bundle);
+		mod = get_Mode(bundle);
+
+		/*
+		 *  bundle_2 is bundle after making Y2 as a dummy operation
+		 *  - ld zero, sp
+		 */
+		bundle_2 = (bundle & (~GX_INSN_Y2_MASK)) | jit_y2_dummy();
+
+		/* Make Y1 as fnop if Y1 is a branch or lnk operation. */
+		if (y1_br || y1_lr) {
+			bundle_2 &= ~(GX_INSN_Y1_MASK);
+			bundle_2 |= jit_y1_fnop();
+		}
+
+		if (is_y0_y1_nop(bundle_2))
+			bundle_2_enable = false;
+
+		if (mod == MODE_OPCODE_YC2) {
+			/* Store. */
+			load_n_store = false;
+			load_store_size = 1 << opcode;
+			load_store_signed = false;
+			find_regs(bundle, 0, &ra, &rb, &clob1, &clob2,
+				  &clob3, &alias);
+			if (load_store_size > 8)
+				unexpected = true;
+		} else {
+			/* Load. */
+			load_n_store = true;
+			if (mod == MODE_OPCODE_YB2) {
+				switch (opcode) {
+				case LD_OPCODE_Y2:
+					load_store_signed = false;
+					load_store_size = 8;
+					break;
+				case LD4S_OPCODE_Y2:
+					load_store_signed = true;
+					load_store_size = 4;
+					break;
+				case LD4U_OPCODE_Y2:
+					load_store_signed = false;
+					load_store_size = 4;
+					break;
+				default:
+					unexpected = true;
+				}
+			} else if (mod == MODE_OPCODE_YA2) {
+				if (opcode == LD2S_OPCODE_Y2) {
+					load_store_signed = true;
+					load_store_size = 2;
+				} else if (opcode == LD2U_OPCODE_Y2) {
+					load_store_signed = false;
+					load_store_size = 2;
+				} else
+					unexpected = true;
+			} else
+				unexpected = true;
+			find_regs(bundle, &rd, &ra, &rb, &clob1, &clob2,
+				  &clob3, &alias);
+		}
+	} else {
+		unsigned int opcode;
+
+		/* bundle_2 is bundle after making X1 as "fnop". */
+		bundle_2 = (bundle & (~GX_INSN_X1_MASK)) | jit_x1_fnop();
+
+		if (is_x0_x1_nop(bundle_2))
+			bundle_2_enable = false;
+
+		if (get_Opcode_X1(bundle) == RRR_0_OPCODE_X1) {
+			opcode = get_UnaryOpcodeExtension_X1(bundle);
+
+			if (get_RRROpcodeExtension_X1(bundle) ==
+			    UNARY_RRR_0_OPCODE_X1) {
+				load_n_store = true;
+				find_regs(bundle, &rd, &ra, &rb, &clob1,
+					  &clob2, &clob3, &alias);
+
+				switch (opcode) {
+				case LD_UNARY_OPCODE_X1:
+					load_store_signed = false;
+					load_store_size = 8;
+					break;
+				case LD4S_UNARY_OPCODE_X1:
+					load_store_signed = true;
+					/* FALLTHROUGH */
+				case LD4U_UNARY_OPCODE_X1:
+					load_store_size = 4;
+					break;
+
+				case LD2S_UNARY_OPCODE_X1:
+					load_store_signed = true;
+					/* FALLTHROUGH */
+				case LD2U_UNARY_OPCODE_X1:
+					load_store_size = 2;
+					break;
+				default:
+					unexpected = true;
+				}
+			} else {
+				load_n_store = false;
+				load_store_signed = false;
+				find_regs(bundle, 0, &ra, &rb,
+					  &clob1, &clob2, &clob3,
+					  &alias);
+
+				opcode = get_RRROpcodeExtension_X1(bundle);
+				switch (opcode)	{
+				case ST_RRR_0_OPCODE_X1:
+					load_store_size = 8;
+					break;
+				case ST4_RRR_0_OPCODE_X1:
+					load_store_size = 4;
+					break;
+				case ST2_RRR_0_OPCODE_X1:
+					load_store_size = 2;
+					break;
+				default:
+					unexpected = true;
+				}
+			}
+		} else if (get_Opcode_X1(bundle) == IMM8_OPCODE_X1) {
+			load_n_store = true;
+			opcode = get_Imm8OpcodeExtension_X1(bundle);
+			switch (opcode)	{
+			case LD_ADD_IMM8_OPCODE_X1:
+				load_store_size = 8;
+				break;
+
+			case LD4S_ADD_IMM8_OPCODE_X1:
+				load_store_signed = true;
+				/* FALLTHROUGH */
+			case LD4U_ADD_IMM8_OPCODE_X1:
+				load_store_size = 4;
+				break;
+
+			case LD2S_ADD_IMM8_OPCODE_X1:
+				load_store_signed = true;
+				/* FALLTHROUGH */
+			case LD2U_ADD_IMM8_OPCODE_X1:
+				load_store_size = 2;
+				break;
+
+			case ST_ADD_IMM8_OPCODE_X1:
+				load_n_store = false;
+				load_store_size = 8;
+				break;
+			case ST4_ADD_IMM8_OPCODE_X1:
+				load_n_store = false;
+				load_store_size = 4;
+				break;
+			case ST2_ADD_IMM8_OPCODE_X1:
+				load_n_store = false;
+				load_store_size = 2;
+				break;
+			default:
+				unexpected = true;
+			}
+
+			if (!unexpected) {
+				x1_add = true;
+				if (load_n_store)
+					x1_add_imm8 = get_Imm8_X1(bundle);
+				else
+					x1_add_imm8 = get_Dest_Imm8_X1(bundle);
+			}
+
+			find_regs(bundle, load_n_store ? (&rd) : NULL,
+				  &ra, &rb, &clob1, &clob2, &clob3, &alias);
+		} else
+			unexpected = true;
+	}
+
+	/*
+	 * Some sanity check for register numbers extracted from fault bundle.
+	 */
+	if (check_regs(rd, ra, rb, clob1, clob2, clob3) == true)
+		unexpected = true;
+
+	/* Give warning if register ra has an aligned address. */
+	if (!unexpected)
+		WARN_ON(!((load_store_size - 1) & (regs->regs[ra])));
+
+
+	/*
+	 * Fault came from kernel space, here we only need take care of
+	 * unaligned "get_user/put_user" macros defined in "uaccess.h".
+	 * Basically, we will handle bundle like this:
+	 * {ld/2u/4s rd, ra; movei rx, 0} or {st/2/4 ra, rb; movei rx, 0}
+	 * (Refer to file "arch/tile/include/asm/uaccess.h" for details).
+	 * For either load or store, byte-wise operation is performed by calling
+	 * get_user() or put_user(). If the macro returns non-zero value,
+	 * set the value to rx, otherwise set zero to rx. Finally make pc point
+	 * to next bundle and return.
+	 */
+
+	if (EX1_PL(regs->ex1) != USER_PL) {
+
+		unsigned long rx = 0;
+		unsigned long x = 0, ret = 0;
+
+		if (y1_br || y1_lr || x1_add ||
+		    (load_store_signed !=
+		     (load_n_store && load_store_size == 4))) {
+			/* No branch, link, wrong sign-ext or load/store add. */
+			unexpected = true;
+		} else if (!unexpected) {
+			if (bundle & TILEGX_BUNDLE_MODE_MASK) {
+				/*
+				 * Fault bundle is Y mode.
+				 * Check if the Y1 and Y0 is the form of
+				 * { movei rx, 0; nop/fnop }, if yes,
+				 * find the rx.
+				 */
+
+				if ((get_Opcode_Y1(bundle) == ADDI_OPCODE_Y1)
+				    && (get_SrcA_Y1(bundle) == TREG_ZERO) &&
+				    (get_Imm8_Y1(bundle) == 0) &&
+				    is_bundle_y0_nop(bundle)) {
+					rx = get_Dest_Y1(bundle);
+				} else if ((get_Opcode_Y0(bundle) ==
+					    ADDI_OPCODE_Y0) &&
+					   (get_SrcA_Y0(bundle) == TREG_ZERO) &&
+					   (get_Imm8_Y0(bundle) == 0) &&
+					   is_bundle_y1_nop(bundle)) {
+					rx = get_Dest_Y0(bundle);
+				} else {
+					unexpected = true;
+				}
+			} else {
+				/*
+				 * Fault bundle is X mode.
+				 * Check if the X0 is 'movei rx, 0',
+				 * if yes, find the rx.
+				 */
+
+				if ((get_Opcode_X0(bundle) == IMM8_OPCODE_X0)
+				    && (get_Imm8OpcodeExtension_X0(bundle) ==
+					ADDI_IMM8_OPCODE_X0) &&
+				    (get_SrcA_X0(bundle) == TREG_ZERO) &&
+				    (get_Imm8_X0(bundle) == 0)) {
+					rx = get_Dest_X0(bundle);
+				} else {
+					unexpected = true;
+				}
+			}
+
+			/* rx should be less than 56. */
+			if (!unexpected && (rx >= 56))
+				unexpected = true;
+		}
+
+		if (!search_exception_tables(regs->pc)) {
+			/* No fixup in the exception tables for the pc. */
+			unexpected = true;
+		}
+
+		if (unexpected) {
+			/* Unexpected unalign kernel fault. */
+			struct task_struct *tsk = validate_current();
+
+			bust_spinlocks(1);
+
+			show_regs(regs);
+
+			if (unlikely(tsk->pid < 2)) {
+				panic("Kernel unalign fault running %s!",
+				      tsk->pid ? "init" : "the idle task");
+			}
+#ifdef SUPPORT_DIE
+			die("Oops", regs);
+#endif
+			bust_spinlocks(1);
+
+			do_group_exit(SIGKILL);
+
+		} else {
+			unsigned long i, b = 0;
+			unsigned char *ptr =
+				(unsigned char *)regs->regs[ra];
+			if (load_n_store) {
+				/* handle get_user(x, ptr) */
+				for (i = 0; i < load_store_size; i++) {
+					ret = get_user(b, ptr++);
+					if (!ret) {
+						/* Success! update x. */
+#ifdef __LITTLE_ENDIAN
+						x |= (b << (8 * i));
+#else
+						x <<= 8;
+						x |= b;
+#endif /* __LITTLE_ENDIAN */
+					} else {
+						x = 0;
+						break;
+					}
+				}
+
+				/* Sign-extend 4-byte loads. */
+				if (load_store_size == 4)
+					x = (long)(int)x;
+
+				/* Set register rd. */
+				regs->regs[rd] = x;
+
+				/* Set register rx. */
+				regs->regs[rx] = ret;
+
+				/* Bump pc. */
+				regs->pc += 8;
+
+			} else {
+				/* Handle put_user(x, ptr) */
+				x = regs->regs[rb];
+#ifdef __LITTLE_ENDIAN
+				b = x;
+#else
+				/*
+				 * Swap x in order to store x from low
+				 * to high memory same as the
+				 * little-endian case.
+				 */
+				switch (load_store_size) {
+				case 8:
+					b = swab64(x);
+					break;
+				case 4:
+					b = swab32(x);
+					break;
+				case 2:
+					b = swab16(x);
+					break;
+				}
+#endif /* __LITTLE_ENDIAN */
+				for (i = 0; i < load_store_size; i++) {
+					ret = put_user(b, ptr++);
+					if (ret)
+						break;
+					/* Success! shift 1 byte. */
+					b >>= 8;
+				}
+				/* Set register rx. */
+				regs->regs[rx] = ret;
+
+				/* Bump pc. */
+				regs->pc += 8;
+			}
+		}
+
+		unaligned_fixup_count++;
+
+		if (unaligned_printk) {
+			pr_info("%s/%d. Unalign fixup for kernel access "
+				"to userspace %lx.",
+				current->comm, current->pid, regs->regs[ra]);
+		}
+
+		/* Done! Return to the exception handler. */
+		return;
+	}
+
+	if ((align_ctl == 0) || unexpected) {
+		siginfo_t info = {
+			.si_signo = SIGBUS,
+			.si_code = BUS_ADRALN,
+			.si_addr = (unsigned char __user *)0
+		};
+		if (unaligned_printk)
+			pr_info("Unalign bundle: unexp @%llx, %llx",
+				(unsigned long long)regs->pc,
+				(unsigned long long)bundle);
+
+		if (ra < 56) {
+			unsigned long uaa = (unsigned long)regs->regs[ra];
+			/* Set bus Address. */
+			info.si_addr = (unsigned char __user *)uaa;
+		}
+
+		unaligned_fixup_count++;
+
+		trace_unhandled_signal("unaligned fixup trap", regs,
+				       (unsigned long)info.si_addr, SIGBUS);
+		force_sig_info(info.si_signo, &info, current);
+		return;
+	}
+
+#ifdef __LITTLE_ENDIAN
+#define UA_FIXUP_ADDR_DELTA          1
+#define UA_FIXUP_BFEXT_START(_B_)    0
+#define UA_FIXUP_BFEXT_END(_B_)     (8 * (_B_) - 1)
+#else /* __BIG_ENDIAN */
+#define UA_FIXUP_ADDR_DELTA          -1
+#define UA_FIXUP_BFEXT_START(_B_)   (64 - 8 * (_B_))
+#define UA_FIXUP_BFEXT_END(_B_)      63
+#endif /* __LITTLE_ENDIAN */
+
+
+
+	if ((ra != rb) && (rd != TREG_SP) && !alias &&
+	    !y1_br && !y1_lr && !x1_add) {
+		/*
+		 * Simple case: ra != rb and no register alias found,
+		 * and no branch or link. This will be the majority.
+		 * We can do a little better for simplae case than the
+		 * generic scheme below.
+		 */
+		if (!load_n_store) {
+			/*
+			 * Simple store: ra != rb, no need for scratch register.
+			 * Just store and rotate to right bytewise.
+			 */
+#ifdef __BIG_ENDIAN
+			frag.insn[n++] =
+				jit_x0_addi(ra, ra, load_store_size - 1) |
+				jit_x1_fnop();
+#endif /* __BIG_ENDIAN */
+			for (k = 0; k < load_store_size; k++) {
+				/* Store a byte. */
+				frag.insn[n++] =
+					jit_x0_rotli(rb, rb, 56) |
+					jit_x1_st1_add(ra, rb,
+						       UA_FIXUP_ADDR_DELTA);
+			}
+#ifdef __BIG_ENDIAN
+			frag.insn[n] = jit_x1_addi(ra, ra, 1);
+#else
+			frag.insn[n] = jit_x1_addi(ra, ra,
+						   -1 * load_store_size);
+#endif /* __LITTLE_ENDIAN */
+
+			if (load_store_size == 8) {
+				frag.insn[n] |= jit_x0_fnop();
+			} else if (load_store_size == 4) {
+				frag.insn[n] |= jit_x0_rotli(rb, rb, 32);
+			} else { /* = 2 */
+				frag.insn[n] |= jit_x0_rotli(rb, rb, 16);
+			}
+			n++;
+			if (bundle_2_enable)
+				frag.insn[n++] = bundle_2;
+			frag.insn[n++] = jit_x0_fnop() | jit_x1_iret();
+		} else {
+			if (rd == ra) {
+				/* Use two clobber registers: clob1/2. */
+				frag.insn[n++] =
+					jit_x0_addi(TREG_SP, TREG_SP, -16) |
+					jit_x1_fnop();
+				frag.insn[n++] =
+					jit_x0_addi(clob1, ra, 7) |
+					jit_x1_st_add(TREG_SP, clob1, -8);
+				frag.insn[n++] =
+					jit_x0_addi(clob2, ra, 0) |
+					jit_x1_st(TREG_SP, clob2);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ldna(rd, ra);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ldna(clob1, clob1);
+				/*
+				 * Note: we must make sure that rd must not
+				 * be sp. Recover clob1/2 from stack.
+				 */
+				frag.insn[n++] =
+					jit_x0_dblalign(rd, clob1, clob2) |
+					jit_x1_ld_add(clob2, TREG_SP, 8);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ld_add(clob1, TREG_SP, 16);
+			} else {
+				/* Use one clobber register: clob1 only. */
+				frag.insn[n++] =
+					jit_x0_addi(TREG_SP, TREG_SP, -16) |
+					jit_x1_fnop();
+				frag.insn[n++] =
+					jit_x0_addi(clob1, ra, 7) |
+					jit_x1_st(TREG_SP, clob1);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ldna(rd, ra);
+				frag.insn[n++] =
+					jit_x0_fnop() |
+					jit_x1_ldna(clob1, clob1);
+				/*
+				 * Note: we must make sure that rd must not
+				 * be sp. Recover clob1 from stack.
+				 */
+				frag.insn[n++] =
+					jit_x0_dblalign(rd, clob1, ra) |
+					jit_x1_ld_add(clob1, TREG_SP, 16);
+			}
+
+			if (bundle_2_enable)
+				frag.insn[n++] = bundle_2;
+			/*
+			 * For non 8-byte load, extract corresponding bytes and
+			 * signed extension.
+			 */
+			if (load_store_size == 4) {
+				if (load_store_signed)
+					frag.insn[n++] =
+						jit_x0_bfexts(
+							rd, rd,
+							UA_FIXUP_BFEXT_START(4),
+							UA_FIXUP_BFEXT_END(4)) |
+						jit_x1_fnop();
+				else
+					frag.insn[n++] =
+						jit_x0_bfextu(
+							rd, rd,
+							UA_FIXUP_BFEXT_START(4),
+							UA_FIXUP_BFEXT_END(4)) |
+						jit_x1_fnop();
+			} else if (load_store_size == 2) {
+				if (load_store_signed)
+					frag.insn[n++] =
+						jit_x0_bfexts(
+							rd, rd,
+							UA_FIXUP_BFEXT_START(2),
+							UA_FIXUP_BFEXT_END(2)) |
+						jit_x1_fnop();
+				else
+					frag.insn[n++] =
+						jit_x0_bfextu(
+							rd, rd,
+							UA_FIXUP_BFEXT_START(2),
+							UA_FIXUP_BFEXT_END(2)) |
+						jit_x1_fnop();
+			}
+
+			frag.insn[n++] =
+				jit_x0_fnop()  |
+				jit_x1_iret();
+		}
+	} else if (!load_n_store) {
+
+		/*
+		 * Generic memory store cases: use 3 clobber registers.
+		 *
+		 * Alloc space for saveing clob2,1,3 on user's stack.
+		 * register clob3 points to where clob2 saved, followed by
+		 * clob1 and 3 from high to low memory.
+		 */
+		frag.insn[n++] =
+			jit_x0_addi(TREG_SP, TREG_SP, -32)    |
+			jit_x1_fnop();
+		frag.insn[n++] =
+			jit_x0_addi(clob3, TREG_SP, 16)  |
+			jit_x1_st_add(TREG_SP, clob3, 8);
+#ifdef __LITTLE_ENDIAN
+		frag.insn[n++] =
+			jit_x0_addi(clob1, ra, 0)   |
+			jit_x1_st_add(TREG_SP, clob1, 8);
+#else
+		frag.insn[n++] =
+			jit_x0_addi(clob1, ra, load_store_size - 1)   |
+			jit_x1_st_add(TREG_SP, clob1, 8);
+#endif
+		if (load_store_size == 8) {
+			/*
+			 * We save one byte a time, not for fast, but compact
+			 * code. After each store, data source register shift
+			 * right one byte. unchanged after 8 stores.
+			 */
+			frag.insn[n++] =
+				jit_x0_addi(clob2, TREG_ZERO, 7)     |
+				jit_x1_st_add(TREG_SP, clob2, 16);
+			frag.insn[n++] =
+				jit_x0_rotli(rb, rb, 56)      |
+				jit_x1_st1_add(clob1, rb, UA_FIXUP_ADDR_DELTA);
+			frag.insn[n++] =
+				jit_x0_addi(clob2, clob2, -1) |
+				jit_x1_bnezt(clob2, -1);
+			frag.insn[n++] =
+				jit_x0_fnop()                 |
+				jit_x1_addi(clob2, y1_br_reg, 0);
+		} else if (load_store_size == 4) {
+			frag.insn[n++] =
+				jit_x0_addi(clob2, TREG_ZERO, 3)     |
+				jit_x1_st_add(TREG_SP, clob2, 16);
+			frag.insn[n++] =
+				jit_x0_rotli(rb, rb, 56)      |
+				jit_x1_st1_add(clob1, rb, UA_FIXUP_ADDR_DELTA);
+			frag.insn[n++] =
+				jit_x0_addi(clob2, clob2, -1) |
+				jit_x1_bnezt(clob2, -1);
+			/*
+			 * same as 8-byte case, but need shift another 4
+			 * byte to recover rb for 4-byte store.
+			 */
+			frag.insn[n++] = jit_x0_rotli(rb, rb, 32)      |
+				jit_x1_addi(clob2, y1_br_reg, 0);
+		} else { /* =2 */
+			frag.insn[n++] =
+				jit_x0_addi(clob2, rb, 0)     |
+				jit_x1_st_add(TREG_SP, clob2, 16);
+			for (k = 0; k < 2; k++) {
+				frag.insn[n++] =
+					jit_x0_shrui(rb, rb, 8)  |
+					jit_x1_st1_add(clob1, rb,
+						       UA_FIXUP_ADDR_DELTA);
+			}
+			frag.insn[n++] =
+				jit_x0_addi(rb, clob2, 0)       |
+				jit_x1_addi(clob2, y1_br_reg, 0);
+		}
+
+		if (bundle_2_enable)
+			frag.insn[n++] = bundle_2;
+
+		if (y1_lr) {
+			frag.insn[n++] =
+				jit_x0_fnop()                    |
+				jit_x1_mfspr(y1_lr_reg,
+					     SPR_EX_CONTEXT_0_0);
+		}
+		if (y1_br) {
+			frag.insn[n++] =
+				jit_x0_fnop()                    |
+				jit_x1_mtspr(SPR_EX_CONTEXT_0_0,
+					     clob2);
+		}
+		if (x1_add) {
+			frag.insn[n++] =
+				jit_x0_addi(ra, ra, x1_add_imm8) |
+				jit_x1_ld_add(clob2, clob3, -8);
+		} else {
+			frag.insn[n++] =
+				jit_x0_fnop()                    |
+				jit_x1_ld_add(clob2, clob3, -8);
+		}
+		frag.insn[n++] =
+			jit_x0_fnop()   |
+			jit_x1_ld_add(clob1, clob3, -8);
+		frag.insn[n++] = jit_x0_fnop()   | jit_x1_ld(clob3, clob3);
+		frag.insn[n++] = jit_x0_fnop()   | jit_x1_iret();
+
+	} else {
+		/*
+		 * Generic memory load cases.
+		 *
+		 * Alloc space for saveing clob1,2,3 on user's stack.
+		 * register clob3 points to where clob1 saved, followed
+		 * by clob2 and 3 from high to low memory.
+		 */
+
+		frag.insn[n++] =
+			jit_x0_addi(TREG_SP, TREG_SP, -32) |
+			jit_x1_fnop();
+		frag.insn[n++] =
+			jit_x0_addi(clob3, TREG_SP, 16) |
+			jit_x1_st_add(TREG_SP, clob3, 8);
+		frag.insn[n++] =
+			jit_x0_addi(clob2, ra, 0) |
+			jit_x1_st_add(TREG_SP, clob2, 8);
+
+		if (y1_br) {
+			frag.insn[n++] =
+				jit_x0_addi(clob1, y1_br_reg, 0) |
+				jit_x1_st_add(TREG_SP, clob1, 16);
+		} else {
+			frag.insn[n++] =
+				jit_x0_fnop() |
+				jit_x1_st_add(TREG_SP, clob1, 16);
+		}
+
+		if (bundle_2_enable)
+			frag.insn[n++] = bundle_2;
+
+		if (y1_lr) {
+			frag.insn[n++] =
+				jit_x0_fnop()  |
+				jit_x1_mfspr(y1_lr_reg,
+					     SPR_EX_CONTEXT_0_0);
+		}
+
+		if (y1_br) {
+			frag.insn[n++] =
+				jit_x0_fnop() |
+				jit_x1_mtspr(SPR_EX_CONTEXT_0_0,
+					     clob1);
+		}
+
+		frag.insn[n++] =
+			jit_x0_addi(clob1, clob2, 7)      |
+			jit_x1_ldna(rd, clob2);
+		frag.insn[n++] =
+			jit_x0_fnop()                     |
+			jit_x1_ldna(clob1, clob1);
+		frag.insn[n++] =
+			jit_x0_dblalign(rd, clob1, clob2) |
+			jit_x1_ld_add(clob1, clob3, -8);
+		if (x1_add) {
+			frag.insn[n++] =
+				jit_x0_addi(ra, ra, x1_add_imm8) |
+				jit_x1_ld_add(clob2, clob3, -8);
+		} else {
+			frag.insn[n++] =
+				jit_x0_fnop()  |
+				jit_x1_ld_add(clob2, clob3, -8);
+		}
+
+		frag.insn[n++] =
+			jit_x0_fnop() |
+			jit_x1_ld(clob3, clob3);
+
+		if (load_store_size == 4) {
+			if (load_store_signed)
+				frag.insn[n++] =
+					jit_x0_bfexts(
+						rd, rd,
+						UA_FIXUP_BFEXT_START(4),
+						UA_FIXUP_BFEXT_END(4)) |
+					jit_x1_fnop();
+			else
+				frag.insn[n++] =
+					jit_x0_bfextu(
+						rd, rd,
+						UA_FIXUP_BFEXT_START(4),
+						UA_FIXUP_BFEXT_END(4)) |
+					jit_x1_fnop();
+		} else if (load_store_size == 2) {
+			if (load_store_signed)
+				frag.insn[n++] =
+					jit_x0_bfexts(
+						rd, rd,
+						UA_FIXUP_BFEXT_START(2),
+						UA_FIXUP_BFEXT_END(2)) |
+					jit_x1_fnop();
+			else
+				frag.insn[n++] =
+					jit_x0_bfextu(
+						rd, rd,
+						UA_FIXUP_BFEXT_START(2),
+						UA_FIXUP_BFEXT_END(2)) |
+					jit_x1_fnop();
+		}
+
+		frag.insn[n++] = jit_x0_fnop() | jit_x1_iret();
+	}
+
+	/* Max JIT bundle count is 14. */
+	WARN_ON(n > 14);
+
+	if (!unexpected) {
+		int status = 0;
+		int idx = (regs->pc >> 3) &
+			((1ULL << (PAGE_SHIFT - UNALIGN_JIT_SHIFT)) - 1);
+
+		frag.pc = regs->pc;
+		frag.bundle = bundle;
+
+		if (unaligned_printk) {
+			pr_info("%s/%d, Unalign fixup: pc=%lx "
+				"bundle=%lx %d %d %d %d %d %d %d %d.",
+				current->comm, current->pid,
+				(unsigned long)frag.pc,
+				(unsigned long)frag.bundle,
+				(int)alias, (int)rd, (int)ra,
+				(int)rb, (int)bundle_2_enable,
+				(int)y1_lr, (int)y1_br, (int)x1_add);
+
+			for (k = 0; k < n; k += 2)
+				pr_info("[%d] %016llx %016llx", k,
+					(unsigned long long)frag.insn[k],
+					(unsigned long long)frag.insn[k+1]);
+		}
+
+		/* Swap bundle byte order for big endian sys. */
+#ifdef __BIG_ENDIAN
+		frag.bundle = GX_INSN_BSWAP(frag.bundle);
+		for (k = 0; k < n; k++)
+			frag.insn[k] = GX_INSN_BSWAP(frag.insn[k]);
+#endif /* __BIG_ENDIAN */
+
+		status = copy_to_user((void __user *)&jit_code_area[idx],
+				      &frag, sizeof(frag));
+		if (status) {
+			/* Fail to copy JIT into user land. send SIGSEGV. */
+			siginfo_t info = {
+				.si_signo = SIGSEGV,
+				.si_code = SEGV_MAPERR,
+				.si_addr = (void __user *)&jit_code_area[idx]
+			};
+
+			pr_warn("Unalign fixup: pid=%d %s jit_code_area=%llx",
+				current->pid, current->comm,
+				(unsigned long long)&jit_code_area[idx]);
+
+			trace_unhandled_signal("segfault in unalign fixup",
+					       regs,
+					       (unsigned long)info.si_addr,
+					       SIGSEGV);
+			force_sig_info(info.si_signo, &info, current);
+			return;
+		}
+
+
+		/* Do a cheaper increment, not accurate. */
+		unaligned_fixup_count++;
+		__flush_icache_range((unsigned long)&jit_code_area[idx],
+				     (unsigned long)&jit_code_area[idx] +
+				     sizeof(frag));
+
+		/* Setup SPR_EX_CONTEXT_0_0/1 for returning to user program.*/
+		__insn_mtspr(SPR_EX_CONTEXT_0_0, regs->pc + 8);
+		__insn_mtspr(SPR_EX_CONTEXT_0_1, PL_ICS_EX1(USER_PL, 0));
+
+		/* Modify pc at the start of new JIT. */
+		regs->pc = (unsigned long)&jit_code_area[idx].insn[0];
+		/* Set ICS in SPR_EX_CONTEXT_K_1. */
+		regs->ex1 = PL_ICS_EX1(USER_PL, 1);
+	}
+}
+
+
+/*
+ * C function to generate unalign data JIT. Called from unalign data
+ * interrupt handler.
+ *
+ * First check if unalign fix is disabled or exception did not not come from
+ * user space or sp register points to unalign address, if true, generate a
+ * SIGBUS. Then map a page into user space as JIT area if it is not mapped
+ * yet. Genenerate JIT code by calling jit_bundle_gen(). After that return
+ * back to exception handler.
+ *
+ * The exception handler will "iret" to new generated JIT code after
+ * restoring caller saved registers. In theory, the JIT code will perform
+ * another "iret" to resume user's program.
+ */
+
+void do_unaligned(struct pt_regs *regs, int vecnum)
+{
+	tilegx_bundle_bits __user  *pc;
+	tilegx_bundle_bits bundle;
+	struct thread_info *info = current_thread_info();
+	int align_ctl;
+
+	/* Checks the per-process unaligned JIT flags */
+	align_ctl = unaligned_fixup;
+	switch (task_thread_info(current)->align_ctl) {
+	case PR_UNALIGN_NOPRINT:
+		align_ctl = 1;
+		break;
+	case PR_UNALIGN_SIGBUS:
+		align_ctl = 0;
+		break;
+	}
+
+	/* Enable iterrupt in order to access user land. */
+	local_irq_enable();
+
+	/*
+	 * The fault came from kernel space. Two choices:
+	 * (a) unaligned_fixup < 1, we will first call get/put_user fixup
+	 *     to return -EFAULT. If no fixup, simply panic the kernel.
+	 * (b) unaligned_fixup >=1, we will try to fix the unaligned access
+	 *     if it was triggered by get_user/put_user() macros. Panic the
+	 *     kernel if it is not fixable.
+	 */
+
+	if (EX1_PL(regs->ex1) != USER_PL) {
+
+		if (align_ctl < 1) {
+			unaligned_fixup_count++;
+			/* If exception came from kernel, try fix it up. */
+			if (fixup_exception(regs)) {
+				if (unaligned_printk)
+					pr_info("Unalign fixup: %d %llx @%llx",
+						(int)unaligned_fixup,
+						(unsigned long long)regs->ex1,
+						(unsigned long long)regs->pc);
+				return;
+			}
+			/* Not fixable. Go panic. */
+			panic("Unalign exception in Kernel. pc=%lx",
+			      regs->pc);
+			return;
+		} else {
+			/*
+			 * Try to fix the exception. If we can't, panic the
+			 * kernel.
+			 */
+			bundle = GX_INSN_BSWAP(
+				*((tilegx_bundle_bits *)(regs->pc)));
+			jit_bundle_gen(regs, bundle, align_ctl);
+			return;
+		}
+	}
+
+	/*
+	 * Fault came from user with ICS or stack is not aligned.
+	 * If so, we will trigger SIGBUS.
+	 */
+	if ((regs->sp & 0x7) || (regs->ex1) || (align_ctl < 0)) {
+		siginfo_t info = {
+			.si_signo = SIGBUS,
+			.si_code = BUS_ADRALN,
+			.si_addr = (unsigned char __user *)0
+		};
+
+		if (unaligned_printk)
+			pr_info("Unalign fixup: %d %llx @%llx",
+				(int)unaligned_fixup,
+				(unsigned long long)regs->ex1,
+				(unsigned long long)regs->pc);
+
+		unaligned_fixup_count++;
+
+		trace_unhandled_signal("unaligned fixup trap", regs, 0, SIGBUS);
+		force_sig_info(info.si_signo, &info, current);
+		return;
+	}
+
+
+	/* Read the bundle casued the exception! */
+	pc = (tilegx_bundle_bits __user *)(regs->pc);
+	if (get_user(bundle, pc) != 0) {
+		/* Probably never be here since pc is valid user address.*/
+		siginfo_t info = {
+			.si_signo = SIGSEGV,
+			.si_code = SEGV_MAPERR,
+			.si_addr = (void __user *)pc
+		};
+		pr_err("Couldn't read instruction at %p trying to step\n", pc);
+		trace_unhandled_signal("segfault in unalign fixup", regs,
+				       (unsigned long)info.si_addr, SIGSEGV);
+		force_sig_info(info.si_signo, &info, current);
+		return;
+	}
+
+	if (!info->unalign_jit_base) {
+		void __user *user_page;
+
+		/*
+		 * Allocate a page in userland.
+		 * For 64-bit processes we try to place the mapping far
+		 * from anything else that might be going on (specifically
+		 * 64 GB below the top of the user address space).  If it
+		 * happens not to be possible to put it there, it's OK;
+		 * the kernel will choose another location and we'll
+		 * remember it for later.
+		 */
+		if (is_compat_task())
+			user_page = NULL;
+		else
+			user_page = (void __user *)(TASK_SIZE - (1UL << 36)) +
+				(current->pid << PAGE_SHIFT);
+
+		user_page = (void __user *) vm_mmap(NULL,
+						    (unsigned long)user_page,
+						    PAGE_SIZE,
+						    PROT_EXEC | PROT_READ |
+						    PROT_WRITE,
+#ifdef CONFIG_HOMECACHE
+						    MAP_CACHE_HOME_TASK |
+#endif
+						    MAP_PRIVATE |
+						    MAP_ANONYMOUS,
+						    0);
+
+		if (IS_ERR((void __force *)user_page)) {
+			pr_err("Out of kernel pages trying do_mmap.\n");
+			return;
+		}
+
+		/* Save the address in the thread_info struct */
+		info->unalign_jit_base = user_page;
+		if (unaligned_printk)
+			pr_info("Unalign bundle: %d:%d, allocate page @%llx",
+				raw_smp_processor_id(), current->pid,
+				(unsigned long long)user_page);
+	}
+
+	/* Generate unalign JIT */
+	jit_bundle_gen(regs, GX_INSN_BSWAP(bundle), align_ctl);
+}
+
+#endif /* __tilegx__ */
diff --git a/arch/tile/kernel/usb.c b/arch/tile/kernel/usb.c
new file mode 100644
index 00000000000..5af8debc6a7
--- /dev/null
+++ b/arch/tile/kernel/usb.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * Register the Tile-Gx USB interfaces as platform devices.
+ *
+ * The actual USB driver is just some glue (in
+ * drivers/usb/host/[eo]hci-tilegx.c) which makes the registers available
+ * to the standard kernel EHCI and OHCI drivers.
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/platform_device.h>
+#include <linux/usb/tilegx.h>
+#include <linux/types.h>
+
+static u64 ehci_dmamask = DMA_BIT_MASK(32);
+
+#define USB_HOST_DEF(unit, type, dmamask) \
+	static struct \
+	    tilegx_usb_platform_data tilegx_usb_platform_data_ ## type ## \
+		hci ## unit = { \
+		.dev_index = unit, \
+	}; \
+	\
+	static struct platform_device tilegx_usb_ ## type ## hci ## unit = { \
+		.name		= "tilegx-" #type "hci", \
+		.id		= unit, \
+		.dev = { \
+			.dma_mask		= dmamask, \
+			.coherent_dma_mask	= DMA_BIT_MASK(32), \
+			.platform_data = \
+				&tilegx_usb_platform_data_ ## type ## hci ## \
+				unit, \
+		}, \
+	};
+
+USB_HOST_DEF(0, e, &ehci_dmamask)
+USB_HOST_DEF(0, o, NULL)
+USB_HOST_DEF(1, e, &ehci_dmamask)
+USB_HOST_DEF(1, o, NULL)
+
+#undef USB_HOST_DEF
+
+static struct platform_device *tilegx_usb_devices[] __initdata = {
+	&tilegx_usb_ehci0,
+	&tilegx_usb_ehci1,
+	&tilegx_usb_ohci0,
+	&tilegx_usb_ohci1,
+};
+
+/** Add our set of possible USB devices. */
+static int __init tilegx_usb_init(void)
+{
+	platform_add_devices(tilegx_usb_devices,
+			     ARRAY_SIZE(tilegx_usb_devices));
+
+	return 0;
+}
+arch_initcall(tilegx_usb_init);
diff --git a/arch/tile/kernel/vdso.c b/arch/tile/kernel/vdso.c
new file mode 100644
index 00000000000..1533af24106
--- /dev/null
+++ b/arch/tile/kernel/vdso.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/binfmts.h>
+#include <linux/compat.h>
+#include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+
+#include <asm/vdso.h>
+#include <asm/mman.h>
+#include <asm/sections.h>
+
+#include <arch/sim.h>
+
+/* The alignment of the vDSO. */
+#define VDSO_ALIGNMENT  PAGE_SIZE
+
+
+static unsigned int vdso_pages;
+static struct page **vdso_pagelist;
+
+#ifdef CONFIG_COMPAT
+static unsigned int vdso32_pages;
+static struct page **vdso32_pagelist;
+#endif
+static int vdso_ready;
+
+/*
+ * The vdso data page.
+ */
+static union {
+	struct vdso_data	data;
+	u8			page[PAGE_SIZE];
+} vdso_data_store __page_aligned_data;
+
+struct vdso_data *vdso_data = &vdso_data_store.data;
+
+static unsigned int __read_mostly vdso_enabled = 1;
+
+static struct page **vdso_setup(void *vdso_kbase, unsigned int pages)
+{
+	int i;
+	struct page **pagelist;
+
+	pagelist = kzalloc(sizeof(struct page *) * (pages + 1), GFP_KERNEL);
+	BUG_ON(pagelist == NULL);
+	for (i = 0; i < pages - 1; i++) {
+		struct page *pg = virt_to_page(vdso_kbase + i*PAGE_SIZE);
+		ClearPageReserved(pg);
+		pagelist[i] = pg;
+	}
+	pagelist[pages - 1] = virt_to_page(vdso_data);
+	pagelist[pages] = NULL;
+
+	return pagelist;
+}
+
+static int __init vdso_init(void)
+{
+	int data_pages = sizeof(vdso_data_store) >> PAGE_SHIFT;
+
+	/*
+	 * We can disable vDSO support generally, but we need to retain
+	 * one page to support the two-bundle (16-byte) rt_sigreturn path.
+	 */
+	if (!vdso_enabled) {
+		size_t offset = (unsigned long)&__vdso_rt_sigreturn;
+		static struct page *sigret_page;
+		sigret_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		BUG_ON(sigret_page == NULL);
+		vdso_pagelist = &sigret_page;
+		vdso_pages = 1;
+		BUG_ON(offset >= PAGE_SIZE);
+		memcpy(page_address(sigret_page) + offset,
+		       vdso_start + offset, 16);
+#ifdef CONFIG_COMPAT
+		vdso32_pages = vdso_pages;
+		vdso32_pagelist = vdso_pagelist;
+#endif
+		vdso_ready = 1;
+		return 0;
+	}
+
+	vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT;
+	vdso_pages += data_pages;
+	vdso_pagelist = vdso_setup(vdso_start, vdso_pages);
+
+#ifdef CONFIG_COMPAT
+	vdso32_pages = (vdso32_end - vdso32_start) >> PAGE_SHIFT;
+	vdso32_pages += data_pages;
+	vdso32_pagelist = vdso_setup(vdso32_start, vdso32_pages);
+#endif
+
+	smp_wmb();
+	vdso_ready = 1;
+
+	return 0;
+}
+arch_initcall(vdso_init);
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_mm && vma->vm_start == VDSO_BASE)
+		return "[vdso]";
+#ifndef __tilegx__
+	if (vma->vm_start == MEM_USER_INTRPT)
+		return "[intrpt]";
+#endif
+	return NULL;
+}
+
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+	return NULL;
+}
+
+int in_gate_area(struct mm_struct *mm, unsigned long address)
+{
+	return 0;
+}
+
+int in_gate_area_no_mm(unsigned long address)
+{
+	return 0;
+}
+
+int setup_vdso_pages(void)
+{
+	struct page **pagelist;
+	unsigned long pages;
+	struct mm_struct *mm = current->mm;
+	unsigned long vdso_base = 0;
+	int retval = 0;
+
+	if (!vdso_ready)
+		return 0;
+
+	mm->context.vdso_base = 0;
+
+	pagelist = vdso_pagelist;
+	pages = vdso_pages;
+#ifdef CONFIG_COMPAT
+	if (is_compat_task()) {
+		pagelist = vdso32_pagelist;
+		pages = vdso32_pages;
+	}
+#endif
+
+	/*
+	 * vDSO has a problem and was disabled, just don't "enable" it for the
+	 * process.
+	 */
+	if (pages == 0)
+		return 0;
+
+	vdso_base = get_unmapped_area(NULL, vdso_base,
+				      (pages << PAGE_SHIFT) +
+				      ((VDSO_ALIGNMENT - 1) & PAGE_MASK),
+				      0, 0);
+	if (IS_ERR_VALUE(vdso_base)) {
+		retval = vdso_base;
+		return retval;
+	}
+
+	/* Add required alignment. */
+	vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT);
+
+	/*
+	 * Put vDSO base into mm struct. We need to do this before calling
+	 * install_special_mapping or the perf counter mmap tracking code
+	 * will fail to recognise it as a vDSO (since arch_vma_name fails).
+	 */
+	mm->context.vdso_base = vdso_base;
+
+	/*
+	 * our vma flags don't have VM_WRITE so by default, the process isn't
+	 * allowed to write those pages.
+	 * gdb can break that with ptrace interface, and thus trigger COW on
+	 * those pages but it's then your responsibility to never do that on
+	 * the "data" page of the vDSO or you'll stop getting kernel updates
+	 * and your nice userland gettimeofday will be totally dead.
+	 * It's fine to use that for setting breakpoints in the vDSO code
+	 * pages though
+	 */
+	retval = install_special_mapping(mm, vdso_base,
+					 pages << PAGE_SHIFT,
+					 VM_READ|VM_EXEC |
+					 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
+					 pagelist);
+	if (retval)
+		mm->context.vdso_base = 0;
+
+	return retval;
+}
+
+static __init int vdso_func(char *s)
+{
+	return kstrtouint(s, 0, &vdso_enabled);
+}
+__setup("vdso=", vdso_func);
diff --git a/arch/tile/kernel/vdso/Makefile b/arch/tile/kernel/vdso/Makefile
new file mode 100644
index 00000000000..a025f63d54c
--- /dev/null
+++ b/arch/tile/kernel/vdso/Makefile
@@ -0,0 +1,118 @@
+# Symbols present in the vdso
+vdso-syms = rt_sigreturn gettimeofday
+
+# Files to link into the vdso
+obj-vdso = $(patsubst %, v%.o, $(vdso-syms))
+
+# Build rules
+targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds
+obj-vdso := $(addprefix $(obj)/, $(obj-vdso))
+
+# vdso32 is only for tilegx -m32 compat task.
+VDSO32-$(CONFIG_COMPAT) := y
+
+obj-y += vdso.o
+obj-$(VDSO32-y) += vdso32.o
+extra-y += vdso.lds
+CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
+
+# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
+CFLAGS_REMOVE_vdso.o = -pg
+CFLAGS_REMOVE_vdso32.o = -pg
+CFLAGS_REMOVE_vrt_sigreturn.o = -pg
+CFLAGS_REMOVE_vrt_sigreturn32.o = -pg
+CFLAGS_REMOVE_vgettimeofday.o = -pg
+CFLAGS_REMOVE_vgettimeofday32.o = -pg
+
+ifdef CONFIG_FEEDBACK_COLLECT
+# vDSO code runs in userspace, not collecting feedback data.
+CFLAGS_REMOVE_vdso.o = -ffeedback-generate
+CFLAGS_REMOVE_vdso32.o = -ffeedback-generate
+CFLAGS_REMOVE_vrt_sigreturn.o = -ffeedback-generate
+CFLAGS_REMOVE_vrt_sigreturn32.o = -ffeedback-generate
+CFLAGS_REMOVE_vgettimeofday.o = -ffeedback-generate
+CFLAGS_REMOVE_vgettimeofday32.o = -ffeedback-generate
+endif
+
+# Disable gcov profiling for VDSO code
+GCOV_PROFILE := n
+
+# Force dependency
+$(obj)/vdso.o: $(obj)/vdso.so
+
+# link rule for the .so file, .lds has to be first
+SYSCFLAGS_vdso.so.dbg = $(c_flags)
+$(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso)
+	$(call if_changed,vdsold)
+
+
+# We also create a special relocatable object that should mirror the symbol
+# table and layout of the linked DSO.  With ld -R we can then refer to
+# these symbols in the kernel code rather than hand-coded addresses.
+extra-y += vdso-syms.o
+$(obj)/built-in.o: $(obj)/vdso-syms.o
+$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o
+
+SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \
+                            $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+SYSCFLAGS_vdso_syms.o = -r
+$(obj)/vdso-syms.o: $(src)/vdso.lds $(obj)/vrt_sigreturn.o FORCE
+	$(call if_changed,vdsold)
+
+
+# strip rule for the .so file
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+	$(call if_changed,objcopy)
+
+# actual build commands
+# The DSO images are built using a special linker script
+# Add -lgcc so tilepro gets static muldi3 and lshrdi3 definitions.
+# Make sure only to export the intended __vdso_xxx symbol offsets.
+quiet_cmd_vdsold = VDSOLD  $@
+      cmd_vdsold = $(CC) $(KCFLAGS) -nostdlib $(SYSCFLAGS_$(@F)) \
+                           -Wl,-T,$(filter-out FORCE,$^) -o $@.tmp -lgcc && \
+                   $(CROSS_COMPILE)objcopy \
+                           $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@
+
+# install commands for the unstripped file
+quiet_cmd_vdso_install = INSTALL $@
+      cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
+
+vdso.so: $(obj)/vdso.so.dbg
+	@mkdir -p $(MODLIB)/vdso
+	$(call cmd,vdso_install)
+
+vdso32.so: $(obj)/vdso32.so.dbg
+	$(call cmd,vdso_install)
+
+vdso_install: vdso.so
+vdso32_install: vdso32.so
+
+
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
+KBUILD_AFLAGS_32 += -m32 -s
+KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS_32 += -m32 -fPIC -shared
+
+obj-vdso32 = $(patsubst %, v%32.o, $(vdso-syms))
+obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
+
+targets += $(obj-vdso32) vdso32.so vdso32.so.dbg
+
+$(obj-vdso32:%=%): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+$(obj-vdso32:%=%): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+
+$(obj)/vgettimeofday32.o: $(obj)/vgettimeofday.c
+	$(call if_changed_rule,cc_o_c)
+
+$(obj)/vrt_sigreturn32.o: $(obj)/vrt_sigreturn.S
+	$(call if_changed,as_o_S)
+
+# Force dependency
+$(obj)/vdso32.o: $(obj)/vdso32.so
+
+SYSCFLAGS_vdso32.so.dbg = -m32 -shared -s -Wl,-soname=linux-vdso32.so.1 \
+			    $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+$(obj)/vdso32.so.dbg: $(src)/vdso.lds $(obj-vdso32)
+	$(call if_changed,vdsold)
diff --git a/arch/tile/kernel/vdso/vdso.S b/arch/tile/kernel/vdso/vdso.S
new file mode 100644
index 00000000000..3467adb4163
--- /dev/null
+++ b/arch/tile/kernel/vdso/vdso.S
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+	__PAGE_ALIGNED_DATA
+
+	.global vdso_start, vdso_end
+	.align PAGE_SIZE
+vdso_start:
+	.incbin "arch/tile/kernel/vdso/vdso.so"
+	.align PAGE_SIZE
+vdso_end:
+
+	.previous
diff --git a/arch/tile/kernel/vdso/vdso.lds.S b/arch/tile/kernel/vdso/vdso.lds.S
new file mode 100644
index 00000000000..041cd6c39c8
--- /dev/null
+++ b/arch/tile/kernel/vdso/vdso.lds.S
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#define VDSO_VERSION_STRING	LINUX_2.6
+
+
+OUTPUT_ARCH(tile)
+
+/* The ELF entry point can be used to set the AT_SYSINFO value. */
+ENTRY(__vdso_rt_sigreturn);
+
+
+SECTIONS
+{
+	. = SIZEOF_HEADERS;
+
+	.hash		: { *(.hash) }			:text
+	.gnu.hash	: { *(.gnu.hash) }
+	.dynsym		: { *(.dynsym) }
+	.dynstr		: { *(.dynstr) }
+	.gnu.version	: { *(.gnu.version) }
+	.gnu.version_d	: { *(.gnu.version_d) }
+	.gnu.version_r	: { *(.gnu.version_r) }
+
+	.note		: { *(.note.*) }		:text	:note
+	.dynamic	: { *(.dynamic) }		:text	:dynamic
+
+	.eh_frame_hdr	: { *(.eh_frame_hdr) }		:text	:eh_frame_hdr
+	.eh_frame	: { KEEP (*(.eh_frame)) }	:text
+
+	.rodata	 : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+
+	/*
+	 * This linker script is used both with -r and with -shared.
+	 * For the layouts to match, we need to skip more than enough
+	 * space for the dynamic symbol table et al. If this amount
+	 * is insufficient, ld -shared will barf. Just increase it here.
+	 */
+	. = 0x1000;
+	.text		: { *(.text .text.*) }		:text
+
+	.data		: {
+		*(.got.plt) *(.got)
+		*(.data .data.* .gnu.linkonce.d.*)
+		*(.dynbss)
+		*(.bss .bss.* .gnu.linkonce.b.*)
+	}
+}
+
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+	text		PT_LOAD		FLAGS(5) FILEHDR PHDRS;	/* PF_R|PF_X */
+	dynamic		PT_DYNAMIC	FLAGS(4);		/* PF_R */
+	note		PT_NOTE		FLAGS(4);		/* PF_R */
+	eh_frame_hdr	PT_GNU_EH_FRAME;
+}
+
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION
+{
+	VDSO_VERSION_STRING {
+	global:
+		__vdso_rt_sigreturn;
+		__vdso_gettimeofday;
+		gettimeofday;
+	local:*;
+	};
+}
diff --git a/arch/tile/kernel/vdso/vdso32.S b/arch/tile/kernel/vdso/vdso32.S
new file mode 100644
index 00000000000..1d1ac3257e1
--- /dev/null
+++ b/arch/tile/kernel/vdso/vdso32.S
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+	__PAGE_ALIGNED_DATA
+
+	.global vdso32_start, vdso32_end
+	.align PAGE_SIZE
+vdso32_start:
+	.incbin "arch/tile/kernel/vdso/vdso32.so"
+	.align PAGE_SIZE
+vdso32_end:
+
+	.previous
diff --git a/arch/tile/kernel/vdso/vgettimeofday.c b/arch/tile/kernel/vdso/vgettimeofday.c
new file mode 100644
index 00000000000..51ec8e46f5f
--- /dev/null
+++ b/arch/tile/kernel/vdso/vgettimeofday.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#define VDSO_BUILD  /* avoid some shift warnings for -m32 in <asm/page.h> */
+#include <linux/time.h>
+#include <asm/timex.h>
+#include <asm/vdso.h>
+
+#if CHIP_HAS_SPLIT_CYCLE()
+static inline cycles_t get_cycles_inline(void)
+{
+	unsigned int high = __insn_mfspr(SPR_CYCLE_HIGH);
+	unsigned int low = __insn_mfspr(SPR_CYCLE_LOW);
+	unsigned int high2 = __insn_mfspr(SPR_CYCLE_HIGH);
+
+	while (unlikely(high != high2)) {
+		low = __insn_mfspr(SPR_CYCLE_LOW);
+		high = high2;
+		high2 = __insn_mfspr(SPR_CYCLE_HIGH);
+	}
+
+	return (((cycles_t)high) << 32) | low;
+}
+#define get_cycles get_cycles_inline
+#endif
+
+/*
+ * Find out the vDSO data page address in the process address space.
+ */
+inline unsigned long get_datapage(void)
+{
+	unsigned long ret;
+
+	/* vdso data page located in the 2nd vDSO page. */
+	asm volatile ("lnk %0" : "=r"(ret));
+	ret &= ~(PAGE_SIZE - 1);
+	ret += PAGE_SIZE;
+
+	return ret;
+}
+
+int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+	cycles_t cycles;
+	unsigned long count, sec, ns;
+	volatile struct vdso_data *vdso_data;
+
+	vdso_data = (struct vdso_data *)get_datapage();
+	/* The use of the timezone is obsolete, normally tz is NULL. */
+	if (unlikely(tz != NULL)) {
+		while (1) {
+			/* Spin until the update finish. */
+			count = vdso_data->tz_update_count;
+			if (count & 1)
+				continue;
+
+			tz->tz_minuteswest = vdso_data->tz_minuteswest;
+			tz->tz_dsttime = vdso_data->tz_dsttime;
+
+			/* Check whether updated, read again if so. */
+			if (count == vdso_data->tz_update_count)
+				break;
+		}
+	}
+
+	if (unlikely(tv == NULL))
+		return 0;
+
+	while (1) {
+		/* Spin until the update finish. */
+		count = vdso_data->tb_update_count;
+		if (count & 1)
+			continue;
+
+		cycles = (get_cycles() - vdso_data->xtime_tod_stamp);
+		ns = (cycles * vdso_data->mult) >> vdso_data->shift;
+		sec = vdso_data->xtime_clock_sec;
+		ns += vdso_data->xtime_clock_nsec;
+		if (ns >= NSEC_PER_SEC) {
+			ns -= NSEC_PER_SEC;
+			sec += 1;
+		}
+
+		/* Check whether updated, read again if so. */
+		if (count == vdso_data->tb_update_count)
+			break;
+	}
+
+	tv->tv_sec = sec;
+	tv->tv_usec = ns / 1000;
+
+	return 0;
+}
+
+int gettimeofday(struct timeval *tv, struct timezone *tz)
+	__attribute__((weak, alias("__vdso_gettimeofday")));
diff --git a/arch/tile/kernel/vdso/vrt_sigreturn.S b/arch/tile/kernel/vdso/vrt_sigreturn.S
new file mode 100644
index 00000000000..6326caf4a03
--- /dev/null
+++ b/arch/tile/kernel/vdso/vrt_sigreturn.S
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/linkage.h>
+#include <arch/abi.h>
+#include <asm/unistd.h>
+
+/*
+ * Note that libc has a copy of this function that it uses to compare
+ * against the PC when a stack backtrace ends, so if this code is
+ * changed, the libc implementation(s) should also be updated.
+ */
+ENTRY(__vdso_rt_sigreturn)
+	moveli TREG_SYSCALL_NR_NAME, __NR_rt_sigreturn
+	swint1
+	/* We don't use ENDPROC to avoid tagging this symbol as FUNC,
+	 * which confuses the perf tool.
+	 */
+	END(__vdso_rt_sigreturn)
diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S
index 631f10de12f..f1819423ffc 100644
--- a/arch/tile/kernel/vmlinux.lds.S
+++ b/arch/tile/kernel/vmlinux.lds.S
@@ -5,7 +5,7 @@
 #include <hv/hypervisor.h>
 
 /* Text loads starting from the supervisor interrupt vector address. */
-#define TEXT_OFFSET MEM_SV_INTRPT
+#define TEXT_OFFSET MEM_SV_START
 
 OUTPUT_ARCH(tile)
 ENTRY(_start)
@@ -13,7 +13,7 @@ jiffies = jiffies_64;
 
 PHDRS
 {
-  intrpt1 PT_LOAD ;
+  intrpt PT_LOAD ;
   text PT_LOAD ;
   data PT_LOAD ;
 }
@@ -24,23 +24,30 @@ SECTIONS
   #define LOAD_OFFSET TEXT_OFFSET
 
   /* Interrupt vectors */
-  .intrpt1 (LOAD_OFFSET) : AT ( 0 )   /* put at the start of physical memory */
+  .intrpt (LOAD_OFFSET) : AT ( 0 )   /* put at the start of physical memory */
   {
     _text = .;
-    _stext = .;
-    *(.intrpt1)
-  } :intrpt1 =0
+    *(.intrpt)
+  } :intrpt =0
 
   /* Hypervisor call vectors */
-  #include "hvglue.lds"
+  . = ALIGN(0x10000);
+  .hvglue : AT (ADDR(.hvglue) - LOAD_OFFSET) {
+    *(.hvglue)
+  } :NONE
 
   /* Now the real code */
   . = ALIGN(0x20000);
+  _stext = .;
   .text : AT (ADDR(.text) - LOAD_OFFSET) {
     HEAD_TEXT
     SCHED_TEXT
     LOCK_TEXT
+    KPROBES_TEXT
+    IRQENTRY_TEXT
     __fix_text_end = .;   /* tile-cpack won't rearrange before this */
+    ALIGN_FUNCTION();
+    *(.hottext*)
     TEXT_TEXT
     *(.text.*)
     *(.coldtext*)
@@ -58,27 +65,17 @@ SECTIONS
   #define LOAD_OFFSET PAGE_OFFSET
 
   . = ALIGN(PAGE_SIZE);
+  __init_begin = .;
   VMLINUX_SYMBOL(_sinitdata) = .;
   INIT_DATA_SECTION(16) :data =0
   PERCPU_SECTION(L2_CACHE_BYTES)
   . = ALIGN(PAGE_SIZE);
   VMLINUX_SYMBOL(_einitdata) = .;
+  __init_end = .;
 
   _sdata = .;                   /* Start of data section */
-
   RO_DATA_SECTION(PAGE_SIZE)
-
-  /* initially writeable, then read-only */
-  . = ALIGN(PAGE_SIZE);
-  __w1data_begin = .;
-  .w1data : AT(ADDR(.w1data) - LOAD_OFFSET) {
-    VMLINUX_SYMBOL(__w1data_begin) = .;
-    *(.w1data)
-    VMLINUX_SYMBOL(__w1data_end) = .;
-  }
-
   RW_DATA_SECTION(L2_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
-
   _edata = .;
 
   EXCEPTION_TABLE(L2_CACHE_BYTES)
diff --git a/arch/tile/kvm/Kconfig b/arch/tile/kvm/Kconfig
index 669fcdba31e..2298cb1daff 100644
--- a/arch/tile/kvm/Kconfig
+++ b/arch/tile/kvm/Kconfig
@@ -18,7 +18,7 @@ if VIRTUALIZATION
 
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
-	depends on HAVE_KVM && MODULES && EXPERIMENTAL
+	depends on HAVE_KVM && MODULES
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	---help---
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile
index 0c26086ecbe..c4211cbb202 100644
--- a/arch/tile/lib/Makefile
+++ b/arch/tile/lib/Makefile
@@ -4,14 +4,15 @@
 
 lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \
 	memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
-	strchr_$(BITS).o strlen_$(BITS).o
-
-ifeq ($(CONFIG_TILEGX),y)
-lib-y += memcpy_user_64.o
-else
-lib-y += atomic_32.o atomic_asm_32.o memcpy_tile64.o
-endif
+	strchr_$(BITS).o strlen_$(BITS).o strnlen_$(BITS).o
 
+lib-$(CONFIG_TILEGX) += memcpy_user_64.o
+lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o
 lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o
 
 obj-$(CONFIG_MODULES) += exports.o
+
+# The finv_buffer_remote() and copy_{to,from}_user() routines can't
+# have -pg added, since they both rely on being leaf functions.
+CFLAGS_REMOVE_cacheflush.o = -pg
+CFLAGS_REMOVE_memcpy_user_64.o = -pg
diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c
index 771b251b409..c89b211fd9e 100644
--- a/arch/tile/lib/atomic_32.c
+++ b/arch/tile/lib/atomic_32.c
@@ -18,53 +18,14 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/atomic.h>
-#include <asm/futex.h>
 #include <arch/chip.h>
 
-/* See <asm/atomic_32.h> */
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-/*
- * A block of memory containing locks for atomic ops. Each instance of this
- * struct will be homed on a different CPU.
- */
-struct atomic_locks_on_cpu {
-	int lock[ATOMIC_HASH_L2_SIZE];
-} __attribute__((aligned(ATOMIC_HASH_L2_SIZE * 4)));
-
-static DEFINE_PER_CPU(struct atomic_locks_on_cpu, atomic_lock_pool);
-
-/* The locks we'll use until __init_atomic_per_cpu is called. */
-static struct atomic_locks_on_cpu __initdata initial_atomic_locks;
-
-/* Hash into this vector to get a pointer to lock for the given atomic. */
-struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE]
-	__write_once = {
-	[0 ... ATOMIC_HASH_L1_SIZE-1] (&initial_atomic_locks)
-};
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 /* This page is remapped on startup to be hash-for-home. */
 int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss;
 
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
-static inline int *__atomic_hashed_lock(volatile void *v)
+int *__atomic_hashed_lock(volatile void *v)
 {
 	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	unsigned long i =
-		(unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long));
-	unsigned long n = __insn_crc32_32(0, i);
-
-	/* Grab high bits for L1 index. */
-	unsigned long l1_index = n >> ((sizeof(n) * 8) - ATOMIC_HASH_L1_SHIFT);
-	/* Grab low bits for L2 index. */
-	unsigned long l2_index = n & (ATOMIC_HASH_L2_SIZE - 1);
-
-	return &atomic_lock_ptr[l1_index]->lock[l2_index];
-#else
 	/*
 	 * Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index.
 	 * Using mm works here because atomic_locks is page aligned.
@@ -73,26 +34,13 @@ static inline int *__atomic_hashed_lock(volatile void *v)
 				      (unsigned long)atomic_locks,
 				      2, (ATOMIC_HASH_SHIFT + 2) - 1);
 	return (int *)ptr;
-#endif
 }
 
 #ifdef CONFIG_SMP
 /* Return whether the passed pointer is a valid atomic lock pointer. */
 static int is_atomic_lock(int *p)
 {
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	int i;
-	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
-
-		if (p >= &atomic_lock_ptr[i]->lock[0] &&
-		    p < &atomic_lock_ptr[i]->lock[ATOMIC_HASH_L2_SIZE]) {
-			return 1;
-		}
-	}
-	return 0;
-#else
 	return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE];
-#endif
 }
 
 void __atomic_fault_unlock(int *irqlock_word)
@@ -111,33 +59,32 @@ static inline int *__atomic_setup(volatile void *v)
 	return __atomic_hashed_lock(v);
 }
 
-int _atomic_xchg(atomic_t *v, int n)
+int _atomic_xchg(int *v, int n)
 {
-	return __atomic_xchg(&v->counter, __atomic_setup(v), n).val;
+	return __atomic_xchg(v, __atomic_setup(v), n).val;
 }
 EXPORT_SYMBOL(_atomic_xchg);
 
-int _atomic_xchg_add(atomic_t *v, int i)
+int _atomic_xchg_add(int *v, int i)
 {
-	return __atomic_xchg_add(&v->counter, __atomic_setup(v), i).val;
+	return __atomic_xchg_add(v, __atomic_setup(v), i).val;
 }
 EXPORT_SYMBOL(_atomic_xchg_add);
 
-int _atomic_xchg_add_unless(atomic_t *v, int a, int u)
+int _atomic_xchg_add_unless(int *v, int a, int u)
 {
 	/*
 	 * Note: argument order is switched here since it is easier
 	 * to use the first argument consistently as the "old value"
 	 * in the assembly, as is done for _atomic_cmpxchg().
 	 */
-	return __atomic_xchg_add_unless(&v->counter, __atomic_setup(v), u, a)
-		.val;
+	return __atomic_xchg_add_unless(v, __atomic_setup(v), u, a).val;
 }
 EXPORT_SYMBOL(_atomic_xchg_add_unless);
 
-int _atomic_cmpxchg(atomic_t *v, int o, int n)
+int _atomic_cmpxchg(int *v, int o, int n)
 {
-	return __atomic_cmpxchg(&v->counter, __atomic_setup(v), o, n).val;
+	return __atomic_cmpxchg(v, __atomic_setup(v), o, n).val;
 }
 EXPORT_SYMBOL(_atomic_cmpxchg);
 
@@ -160,78 +107,36 @@ unsigned long _atomic_xor(volatile unsigned long *p, unsigned long mask)
 EXPORT_SYMBOL(_atomic_xor);
 
 
-u64 _atomic64_xchg(atomic64_t *v, u64 n)
+long long _atomic64_xchg(long long *v, long long n)
 {
-	return __atomic64_xchg(&v->counter, __atomic_setup(v), n);
+	return __atomic64_xchg(v, __atomic_setup(v), n);
 }
 EXPORT_SYMBOL(_atomic64_xchg);
 
-u64 _atomic64_xchg_add(atomic64_t *v, u64 i)
+long long _atomic64_xchg_add(long long *v, long long i)
 {
-	return __atomic64_xchg_add(&v->counter, __atomic_setup(v), i);
+	return __atomic64_xchg_add(v, __atomic_setup(v), i);
 }
 EXPORT_SYMBOL(_atomic64_xchg_add);
 
-u64 _atomic64_xchg_add_unless(atomic64_t *v, u64 a, u64 u)
+long long _atomic64_xchg_add_unless(long long *v, long long a, long long u)
 {
 	/*
 	 * Note: argument order is switched here since it is easier
 	 * to use the first argument consistently as the "old value"
 	 * in the assembly, as is done for _atomic_cmpxchg().
 	 */
-	return __atomic64_xchg_add_unless(&v->counter, __atomic_setup(v),
-					  u, a);
+	return __atomic64_xchg_add_unless(v, __atomic_setup(v), u, a);
 }
 EXPORT_SYMBOL(_atomic64_xchg_add_unless);
 
-u64 _atomic64_cmpxchg(atomic64_t *v, u64 o, u64 n)
+long long _atomic64_cmpxchg(long long *v, long long o, long long n)
 {
-	return __atomic64_cmpxchg(&v->counter, __atomic_setup(v), o, n);
+	return __atomic64_cmpxchg(v, __atomic_setup(v), o, n);
 }
 EXPORT_SYMBOL(_atomic64_cmpxchg);
 
 
-static inline int *__futex_setup(int __user *v)
-{
-	/*
-	 * Issue a prefetch to the counter to bring it into cache.
-	 * As for __atomic_setup, but we can't do a read into the L1
-	 * since it might fault; instead we do a prefetch into the L2.
-	 */
-	__insn_prefetch(v);
-	return __atomic_hashed_lock((int __force *)v);
-}
-
-struct __get_user futex_set(u32 __user *v, int i)
-{
-	return __atomic_xchg((int __force *)v, __futex_setup(v), i);
-}
-
-struct __get_user futex_add(u32 __user *v, int n)
-{
-	return __atomic_xchg_add((int __force *)v, __futex_setup(v), n);
-}
-
-struct __get_user futex_or(u32 __user *v, int n)
-{
-	return __atomic_or((int __force *)v, __futex_setup(v), n);
-}
-
-struct __get_user futex_andn(u32 __user *v, int n)
-{
-	return __atomic_andn((int __force *)v, __futex_setup(v), n);
-}
-
-struct __get_user futex_xor(u32 __user *v, int n)
-{
-	return __atomic_xor((int __force *)v, __futex_setup(v), n);
-}
-
-struct __get_user futex_cmpxchg(u32 __user *v, int o, int n)
-{
-	return __atomic_cmpxchg((int __force *)v, __futex_setup(v), o, n);
-}
-
 /*
  * If any of the atomic or futex routines hit a bad address (not in
  * the page tables at kernel PL) this routine is called.  The futex
@@ -250,54 +155,8 @@ struct __get_user __atomic_bad_address(int __user *addr)
 }
 
 
-#if CHIP_HAS_CBOX_HOME_MAP()
-static int __init noatomichash(char *str)
-{
-	pr_warning("noatomichash is deprecated.\n");
-	return 1;
-}
-__setup("noatomichash", noatomichash);
-#endif
-
 void __init __init_atomic_per_cpu(void)
 {
-#if ATOMIC_LOCKS_FOUND_VIA_TABLE()
-
-	unsigned int i;
-	int actual_cpu;
-
-	/*
-	 * Before this is called from setup, we just have one lock for
-	 * all atomic objects/operations.  Here we replace the
-	 * elements of atomic_lock_ptr so that they point at per_cpu
-	 * integers.  This seemingly over-complex approach stems from
-	 * the fact that DEFINE_PER_CPU defines an entry for each cpu
-	 * in the grid, not each cpu from 0..ATOMIC_HASH_SIZE-1.  But
-	 * for efficient hashing of atomics to their locks we want a
-	 * compile time constant power of 2 for the size of this
-	 * table, so we use ATOMIC_HASH_SIZE.
-	 *
-	 * Here we populate atomic_lock_ptr from the per cpu
-	 * atomic_lock_pool, interspersing by actual cpu so that
-	 * subsequent elements are homed on consecutive cpus.
-	 */
-
-	actual_cpu = cpumask_first(cpu_possible_mask);
-
-	for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) {
-		/*
-		 * Preincrement to slightly bias against using cpu 0,
-		 * which has plenty of stuff homed on it already.
-		 */
-		actual_cpu = cpumask_next(actual_cpu, cpu_possible_mask);
-		if (actual_cpu >= nr_cpu_ids)
-			actual_cpu = cpumask_first(cpu_possible_mask);
-
-		atomic_lock_ptr[i] = &per_cpu(atomic_lock_pool, actual_cpu);
-	}
-
-#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
 	/* Validate power-of-two and "bigger than cpus" assumption */
 	BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1));
 	BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids);
@@ -321,9 +180,4 @@ void __init __init_atomic_per_cpu(void)
 	 * That should not produce more indices than ATOMIC_HASH_SIZE.
 	 */
 	BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE);
-
-#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
-
-	/* The futex code makes this assumption, so we validate it here. */
-	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));
 }
diff --git a/arch/tile/lib/atomic_asm_32.S b/arch/tile/lib/atomic_asm_32.S
index 30638042691..6bda3132cd6 100644
--- a/arch/tile/lib/atomic_asm_32.S
+++ b/arch/tile/lib/atomic_asm_32.S
@@ -164,6 +164,7 @@ STD_ENTRY_SECTION(__atomic\name, .text.atomic)
 	STD_ENDPROC(__atomic\name)
 	.ifc \bitwidth,32
 	.pushsection __ex_table,"a"
+	.align  4
 	.word   1b, __atomic\name
 	.word   2b, __atomic\name
 	.word   __atomic\name, __atomic_bad_address
diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c
index 8928aace7a6..9c0ec22009a 100644
--- a/arch/tile/lib/cacheflush.c
+++ b/arch/tile/lib/cacheflush.c
@@ -12,6 +12,7 @@
  *   more details.
  */
 
+#include <linux/export.h>
 #include <asm/page.h>
 #include <asm/cacheflush.h>
 #include <arch/icache.h>
@@ -35,11 +36,26 @@ static inline void force_load(char *p)
  * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting
  * until the memory controller holds the flushed values.
  */
-void finv_buffer_remote(void *buffer, size_t size, int hfh)
+void __attribute__((optimize("omit-frame-pointer")))
+finv_buffer_remote(void *buffer, size_t size, int hfh)
 {
 	char *p, *base;
 	size_t step_size, load_count;
+
+	/*
+	 * On TILEPro the striping granularity is a fixed 8KB; on
+	 * TILE-Gx it is configurable, and we rely on the fact that
+	 * the hypervisor always configures maximum striping, so that
+	 * bits 9 and 10 of the PA are part of the stripe function, so
+	 * every 512 bytes we hit a striping boundary.
+	 *
+	 */
+#ifdef __tilegx__
+	const unsigned long STRIPE_WIDTH = 512;
+#else
 	const unsigned long STRIPE_WIDTH = 8192;
+#endif
+
 #ifdef __tilegx__
 	/*
 	 * On TILE-Gx, we must disable the dstream prefetcher before doing
@@ -74,7 +90,7 @@ void finv_buffer_remote(void *buffer, size_t size, int hfh)
 	 * memory, that one load would be sufficient, but since we may
 	 * be, we also need to back up to the last load issued to
 	 * another memory controller, which would be the point where
-	 * we crossed an 8KB boundary (the granularity of striping
+	 * we crossed a "striping" boundary (the granularity of striping
 	 * across memory controllers).  Keep backing up and doing this
 	 * until we are before the beginning of the buffer, or have
 	 * hit all the controllers.
@@ -88,12 +104,22 @@ void finv_buffer_remote(void *buffer, size_t size, int hfh)
 	 * every cache line on a full memory stripe on each
 	 * controller" that we simply do that, to simplify the logic.
 	 *
-	 * FIXME: See bug 9535 for some issues with this code.
+	 * On TILE-Gx the hash-for-home function is much more complex,
+	 * with the upshot being we can't readily guarantee we have
+	 * hit both entries in the 128-entry AMT that were hit by any
+	 * load in the entire range, so we just re-load them all.
+	 * With larger buffers, we may want to consider using a hypervisor
+	 * trap to issue loads directly to each hash-for-home tile for
+	 * each controller (doing it from Linux would trash the TLB).
 	 */
 	if (hfh) {
 		step_size = L2_CACHE_BYTES;
+#ifdef __tilegx__
+		load_count = (size + L2_CACHE_BYTES - 1) / L2_CACHE_BYTES;
+#else
 		load_count = (STRIPE_WIDTH / L2_CACHE_BYTES) *
 			      (1 << CHIP_LOG_NUM_MSHIMS());
+#endif
 	} else {
 		step_size = STRIPE_WIDTH;
 		load_count = (1 << CHIP_LOG_NUM_MSHIMS());
@@ -109,7 +135,7 @@ void finv_buffer_remote(void *buffer, size_t size, int hfh)
 
 	/* Figure out how far back we need to go. */
 	base = p - (step_size * (load_count - 2));
-	if ((long)base < (long)buffer)
+	if ((unsigned long)base < (unsigned long)buffer)
 		base = buffer;
 
 	/*
@@ -122,18 +148,21 @@ void finv_buffer_remote(void *buffer, size_t size, int hfh)
 		force_load(p);
 
 	/*
-	 * Repeat, but with inv's instead of loads, to get rid of the
+	 * Repeat, but with finv's instead of loads, to get rid of the
 	 * data we just loaded into our own cache and the old home L3.
-	 * No need to unroll since inv's don't target a register.
+	 * No need to unroll since finv's don't target a register.
+	 * The finv's are guaranteed not to actually flush the data in
+	 * the buffer back to their home, since we just read it, so the
+	 * lines are clean in cache; we will only invalidate those lines.
 	 */
 	p = (char *)buffer + size - 1;
-	__insn_inv(p);
+	__insn_finv(p);
 	p -= step_size;
 	p = (char *)((unsigned long)p | (step_size - 1));
 	for (; p >= base; p -= step_size)
-		__insn_inv(p);
+		__insn_finv(p);
 
-	/* Wait for the load+inv's (and thus finvs) to have completed. */
+	/* Wait for these finv's (and thus the first finvs) to be done. */
 	__insn_mf();
 
 #ifdef __tilegx__
@@ -141,3 +170,4 @@ void finv_buffer_remote(void *buffer, size_t size, int hfh)
 	__insn_mtspr(SPR_DSTREAM_PF, old_dstream_pf);
 #endif
 }
+EXPORT_SYMBOL_GPL(finv_buffer_remote);
diff --git a/arch/tile/lib/checksum.c b/arch/tile/lib/checksum.c
index e4bab5bd3f3..c3ca3e64d9d 100644
--- a/arch/tile/lib/checksum.c
+++ b/arch/tile/lib/checksum.c
@@ -16,19 +16,6 @@
 #include <net/checksum.h>
 #include <linux/module.h>
 
-static inline unsigned int longto16(unsigned long x)
-{
-	unsigned long ret;
-#ifdef __tilegx__
-	ret = __insn_v2sadu(x, 0);
-	ret = __insn_v2sadu(ret, 0);
-#else
-	ret = __insn_sadh_u(x, 0);
-	ret = __insn_sadh_u(ret, 0);
-#endif
-	return ret;
-}
-
 __wsum do_csum(const unsigned char *buff, int len)
 {
 	int odd, count;
@@ -94,7 +81,7 @@ __wsum do_csum(const unsigned char *buff, int len)
 	}
 	if (len & 1)
 		result += *buff;
-	result = longto16(result);
+	result = csum_long(result);
 	if (odd)
 		result = swab16(result);
 out:
diff --git a/arch/tile/lib/cpumask.c b/arch/tile/lib/cpumask.c
index fdc403614d1..75947edccb2 100644
--- a/arch/tile/lib/cpumask.c
+++ b/arch/tile/lib/cpumask.c
@@ -16,6 +16,7 @@
 #include <linux/ctype.h>
 #include <linux/errno.h>
 #include <linux/smp.h>
+#include <linux/export.h>
 
 /*
  * Allow cropping out bits beyond the end of the array.
@@ -50,3 +51,4 @@ int bitmap_parselist_crop(const char *bp, unsigned long *maskp, int nmaskbits)
 	} while (*bp != '\0' && *bp != '\n');
 	return 0;
 }
+EXPORT_SYMBOL(bitmap_parselist_crop);
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index 2a81d32de0d..82733c87d67 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -18,19 +18,10 @@
 
 /* arch/tile/lib/usercopy.S */
 #include <linux/uaccess.h>
-EXPORT_SYMBOL(__get_user_1);
-EXPORT_SYMBOL(__get_user_2);
-EXPORT_SYMBOL(__get_user_4);
-EXPORT_SYMBOL(__get_user_8);
-EXPORT_SYMBOL(__put_user_1);
-EXPORT_SYMBOL(__put_user_2);
-EXPORT_SYMBOL(__put_user_4);
-EXPORT_SYMBOL(__put_user_8);
 EXPORT_SYMBOL(strnlen_user_asm);
 EXPORT_SYMBOL(strncpy_from_user_asm);
 EXPORT_SYMBOL(clear_user_asm);
 EXPORT_SYMBOL(flush_user_asm);
-EXPORT_SYMBOL(inv_user_asm);
 EXPORT_SYMBOL(finv_user_asm);
 
 /* arch/tile/kernel/entry.S */
@@ -42,6 +33,12 @@ EXPORT_SYMBOL(dump_stack);
 /* arch/tile/kernel/head.S */
 EXPORT_SYMBOL(empty_zero_page);
 
+#ifdef CONFIG_FUNCTION_TRACER
+/* arch/tile/kernel/mcount_64.S */
+#include <asm/ftrace.h>
+EXPORT_SYMBOL(__mcount);
+#endif /* CONFIG_FUNCTION_TRACER */
+
 /* arch/tile/lib/, various memcpy files */
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__copy_to_user_inatomic);
@@ -63,6 +60,8 @@ EXPORT_SYMBOL(hv_dev_poll_cancel);
 EXPORT_SYMBOL(hv_dev_close);
 EXPORT_SYMBOL(hv_sysconf);
 EXPORT_SYMBOL(hv_confstr);
+EXPORT_SYMBOL(hv_get_rtc);
+EXPORT_SYMBOL(hv_set_rtc);
 
 /* libgcc.a */
 uint32_t __udivsi3(uint32_t dividend, uint32_t divisor);
@@ -90,4 +89,6 @@ uint64_t __ashrdi3(uint64_t, unsigned int);
 EXPORT_SYMBOL(__ashrdi3);
 uint64_t __ashldi3(uint64_t, unsigned int);
 EXPORT_SYMBOL(__ashldi3);
+int __ffsdi2(uint64_t);
+EXPORT_SYMBOL(__ffsdi2);
 #endif
diff --git a/arch/tile/lib/memchr_64.c b/arch/tile/lib/memchr_64.c
index 84fdc8d8e73..f8196b3a950 100644
--- a/arch/tile/lib/memchr_64.c
+++ b/arch/tile/lib/memchr_64.c
@@ -15,6 +15,7 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
+#include "string-endian.h"
 
 void *memchr(const void *s, int c, size_t n)
 {
@@ -35,15 +36,12 @@ void *memchr(const void *s, int c, size_t n)
 	p = (const uint64_t *)(s_int & -8);
 
 	/* Create eight copies of the byte for which we are looking. */
-	goal = 0x0101010101010101ULL * (uint8_t) c;
+	goal = copy_byte(c);
 
 	/* Read the first word, but munge it so that bytes before the array
 	 * will not match goal.
-	 *
-	 * Note that this shift count expression works because we know
-	 * shift counts are taken mod 64.
 	 */
-	before_mask = (1ULL << (s_int << 3)) - 1;
+	before_mask = MASK(s_int);
 	v = (*p | before_mask) ^ (goal & before_mask);
 
 	/* Compute the address of the last byte. */
@@ -65,7 +63,7 @@ void *memchr(const void *s, int c, size_t n)
 	/* We found a match, but it might be in a byte past the end
 	 * of the array.
 	 */
-	ret = ((char *)p) + (__insn_ctz(bits) >> 3);
+	ret = ((char *)p) + (CFZ(bits) >> 3);
 	return (ret <= last_byte_ptr) ? ret : NULL;
 }
 EXPORT_SYMBOL(memchr);
diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S
index 2a419a6122d..a2771ae5da5 100644
--- a/arch/tile/lib/memcpy_32.S
+++ b/arch/tile/lib/memcpy_32.S
@@ -22,14 +22,6 @@
 
 #include <linux/linkage.h>
 
-/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-#define memcpy __memcpy_asm
-#define __copy_to_user_inatomic __copy_to_user_inatomic_asm
-#define __copy_from_user_inatomic __copy_from_user_inatomic_asm
-#define __copy_from_user_zeroing __copy_from_user_zeroing_asm
-#endif
-
 #define IS_MEMCPY	  0
 #define IS_COPY_FROM_USER  1
 #define IS_COPY_FROM_USER_ZEROING  2
@@ -44,6 +36,7 @@
  */
 #define EX \
 	.pushsection __ex_table, "a"; \
+	.align 4; \
 	.word 9f, memcpy_common_fixup; \
 	.popsection; \
 	9
@@ -158,12 +151,9 @@ EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
 
 	{ addi r3, r1, 60; andi r9, r9, -64 }
 
-#if CHIP_HAS_WH64()
 	/* No need to prefetch dst, we'll just do the wh64
 	 * right before we copy a line.
 	 */
-#endif
-
 EX:	{ lw r5, r3; addi r3, r3, 64; movei r4, 1 }
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bnzt zero, .; move r27, lr }
@@ -171,21 +161,6 @@ EX:	{ lw r6, r3; addi r3, r3, 64 }
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bnzt zero, . }
 EX:	{ lw r7, r3; addi r3, r3, 64 }
-#if !CHIP_HAS_WH64()
-	/* Prefetch the dest */
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	/* Use a real load to cause a TLB miss if necessary.  We aren't using
-	 * r28, so this should be fine.
-	 */
-EX:	{ lw r28, r9; addi r9, r9, 64 }
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	{ prefetch r9; addi r9, r9, 64 }
-	/* Intentionally stall for a few cycles to leave L2 cache alone. */
-	{ bnzt zero, . }
-	{ prefetch r9; addi r9, r9, 64 }
-#endif
 	/* Intentionally stall for a few cycles to leave L2 cache alone. */
 	{ bz zero, .Lbig_loop2 }
 
@@ -286,13 +261,8 @@ EX:	{ lw r7, r3; addi r3, r3, 64 }
 	/* Fill second L1D line. */
 EX:	{ lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
 
-#if CHIP_HAS_WH64()
 	/* Prepare destination line for writing. */
 EX:	{ wh64 r9; addi r9, r9, 64 }
-#else
-	/* Prefetch dest line */
-	{ prefetch r9; addi r9, r9, 64 }
-#endif
 	/* Load seven words that are L1D hits to cover wh64 L2 usage. */
 
 	/* Load the three remaining words from the last L1D line, which
@@ -330,16 +300,7 @@ EX:	{ lw r18, r1; addi r1, r1, 4 }                  /* r18 = WORD_8 */
 EX:	{ sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */
 EX:	{ sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */
 EX:	{ sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */
-#if CHIP_HAS_WH64()
 EX:	{ sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */
-#else
-	/* Back up the r9 to a cache line we are already storing to
-	 * if it gets past the end of the dest vector.  Strictly speaking,
-	 * we don't need to back up to the start of a cache line, but it's free
-	 * and tidy, so why not?
-	 */
-EX:	{ sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */
-#endif
 	/* Store second L1D line. */
 EX:	{ sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */
 EX:	{ sw r0, r19; addi r0, r0, 4 }                  /* store(WORD_5) */
@@ -403,7 +364,6 @@ EX:	{ sb r0, r3;   addi r0, r0, 1; addi r2, r2, -1 }
 
 .Ldest_is_word_aligned:
 
-#if CHIP_HAS_DWORD_ALIGN()
 EX:	{ andi r8, r0, 63; lwadd_na r6, r1, 4}
 	{ slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }
 
@@ -511,26 +471,6 @@ EX:	{ swadd r0, r13, 4; addi r2, r2, -32 }
 	/* Move r1 back to the point where it corresponds to r0. */
 	{ addi r1, r1, -4 }
 
-#else /* !CHIP_HAS_DWORD_ALIGN() */
-
-	/* Compute right/left shift counts and load initial source words. */
-	{ andi r5, r1, -4; andi r3, r1, 3 }
-EX:	{ lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 }
-EX:	{ lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 }
-
-	/* Load and store one word at a time, using shifts and ORs
-	 * to correct for the misaligned src.
-	 */
-.Lcopy_unaligned_src_loop:
-	{ shr r6, r6, r3; shl r8, r7, r4 }
-EX:	{ lw r7, r5; or r8, r8, r6; move r6, r7 }
-EX:	{ sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 }
-	{ addi r5, r5, 4; slti_u r8, r2, 8 }
-	{ bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 }
-
-	{ bz r2, .Lcopy_unaligned_done }
-#endif /* !CHIP_HAS_DWORD_ALIGN() */
-
 	/* Fall through */
 
 /*
@@ -614,5 +554,6 @@ memcpy_fixup_loop:
 	.size memcpy_common_fixup, . - memcpy_common_fixup
 
 	.section __ex_table,"a"
+	.align 4
 	.word .Lcfu, .Lcopy_from_user_fixup_zero_remainder
 	.word .Lctu, .Lcopy_to_user_fixup_done
diff --git a/arch/tile/lib/memcpy_64.c b/arch/tile/lib/memcpy_64.c
index 3fab9a6a2bb..4815354b8cd 100644
--- a/arch/tile/lib/memcpy_64.c
+++ b/arch/tile/lib/memcpy_64.c
@@ -15,18 +15,20 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
-#define __memcpy memcpy
 /* EXPORT_SYMBOL() is in arch/tile/lib/exports.c since this should be asm. */
 
 /* Must be 8 bytes in size. */
-#define word_t uint64_t
+#define op_t uint64_t
 
-#if CHIP_L2_LINE_SIZE() != 64 && CHIP_L2_LINE_SIZE() != 128
-#error "Assumes 64 or 128 byte line size"
+/* Threshold value for when to enter the unrolled loops. */
+#define	OP_T_THRES	16
+
+#if CHIP_L2_LINE_SIZE() != 64
+#error "Assumes 64 byte line size"
 #endif
 
 /* How many cache lines ahead should we prefetch? */
-#define PREFETCH_LINES_AHEAD 3
+#define PREFETCH_LINES_AHEAD 4
 
 /*
  * Provide "base versions" of load and store for the normal code path.
@@ -52,15 +54,16 @@ void *memcpy(void *__restrict dstv, const void *__restrict srcv, size_t n)
  * macros to return a count of uncopied bytes due to mm fault.
  */
 #define RETVAL 0
-int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
+int __attribute__((optimize("omit-frame-pointer")))
+USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
 #endif
 {
 	char *__restrict dst1 = (char *)dstv;
 	const char *__restrict src1 = (const char *)srcv;
 	const char *__restrict src1_end;
 	const char *__restrict prefetch;
-	word_t *__restrict dst8;    /* 8-byte pointer to destination memory. */
-	word_t final; /* Final bytes to write to trailing word, if any */
+	op_t *__restrict dst8;    /* 8-byte pointer to destination memory. */
+	op_t final; /* Final bytes to write to trailing word, if any */
 	long i;
 
 	if (n < 16) {
@@ -80,104 +83,228 @@ int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
 	for (i = 0; i < PREFETCH_LINES_AHEAD; i++) {
 		__insn_prefetch(prefetch);
 		prefetch += CHIP_L2_LINE_SIZE();
-		prefetch = (prefetch > src1_end) ? prefetch : src1;
+		prefetch = (prefetch < src1_end) ? prefetch : src1;
 	}
 
 	/* Copy bytes until dst is word-aligned. */
-	for (; (uintptr_t)dst1 & (sizeof(word_t) - 1); n--)
+	for (; (uintptr_t)dst1 & (sizeof(op_t) - 1); n--)
 		ST1(dst1++, LD1(src1++));
 
 	/* 8-byte pointer to destination memory. */
-	dst8 = (word_t *)dst1;
-
-	if (__builtin_expect((uintptr_t)src1 & (sizeof(word_t) - 1), 0)) {
-		/*
-		 * Misaligned copy.  Copy 8 bytes at a time, but don't
-		 * bother with other fanciness.
-		 *
-		 * TODO: Consider prefetching and using wh64 as well.
-		 */
-
-		/* Create an aligned src8. */
-		const word_t *__restrict src8 =
-			(const word_t *)((uintptr_t)src1 & -sizeof(word_t));
-		word_t b;
-
-		word_t a = LD8(src8++);
-		for (; n >= sizeof(word_t); n -= sizeof(word_t)) {
-			b = LD8(src8++);
-			a = __insn_dblalign(a, b, src1);
-			ST8(dst8++, a);
-			a = b;
+	dst8 = (op_t *)dst1;
+
+	if (__builtin_expect((uintptr_t)src1 & (sizeof(op_t) - 1), 0)) {
+		/* Unaligned copy. */
+
+		op_t  tmp0 = 0, tmp1 = 0, tmp2, tmp3;
+		const op_t *src8 = (const op_t *) ((uintptr_t)src1 &
+						   -sizeof(op_t));
+		const void *srci = (void *)src1;
+		int m;
+
+		m = (CHIP_L2_LINE_SIZE() << 2) -
+			(((uintptr_t)dst8) & ((CHIP_L2_LINE_SIZE() << 2) - 1));
+		m = (n < m) ? n : m;
+		m /= sizeof(op_t);
+
+		/* Copy until 'dst' is cache-line-aligned. */
+		n -= (sizeof(op_t) * m);
+
+		switch (m % 4) {
+		case 0:
+			if (__builtin_expect(!m, 0))
+				goto _M0;
+			tmp1 = LD8(src8++);
+			tmp2 = LD8(src8++);
+			goto _8B3;
+		case 2:
+			m += 2;
+			tmp3 = LD8(src8++);
+			tmp0 = LD8(src8++);
+			goto _8B1;
+		case 3:
+			m += 1;
+			tmp2 = LD8(src8++);
+			tmp3 = LD8(src8++);
+			goto _8B2;
+		case 1:
+			m--;
+			tmp0 = LD8(src8++);
+			tmp1 = LD8(src8++);
+			if (__builtin_expect(!m, 0))
+				goto _8B0;
+		}
+
+		do {
+			tmp2 = LD8(src8++);
+			tmp0 =  __insn_dblalign(tmp0, tmp1, srci);
+			ST8(dst8++, tmp0);
+_8B3:
+			tmp3 = LD8(src8++);
+			tmp1 = __insn_dblalign(tmp1, tmp2, srci);
+			ST8(dst8++, tmp1);
+_8B2:
+			tmp0 = LD8(src8++);
+			tmp2 = __insn_dblalign(tmp2, tmp3, srci);
+			ST8(dst8++, tmp2);
+_8B1:
+			tmp1 = LD8(src8++);
+			tmp3 = __insn_dblalign(tmp3, tmp0, srci);
+			ST8(dst8++, tmp3);
+			m -= 4;
+		} while (m);
+
+_8B0:
+		tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+		ST8(dst8++, tmp0);
+		src8--;
+
+_M0:
+		if (__builtin_expect(n >= CHIP_L2_LINE_SIZE(), 0)) {
+			op_t tmp4, tmp5, tmp6, tmp7, tmp8;
+
+			prefetch = ((const char *)src8) +
+				CHIP_L2_LINE_SIZE() * PREFETCH_LINES_AHEAD;
+
+			for (tmp0 = LD8(src8++); n >= CHIP_L2_LINE_SIZE();
+			     n -= CHIP_L2_LINE_SIZE()) {
+				/* Prefetch and advance to next line to
+				   prefetch, but don't go past the end.  */
+				__insn_prefetch(prefetch);
+
+				/* Make sure prefetch got scheduled
+				   earlier.  */
+				__asm__ ("" : : : "memory");
+
+				prefetch += CHIP_L2_LINE_SIZE();
+				prefetch = (prefetch < src1_end) ? prefetch :
+					(const char *) src8;
+
+				tmp1 = LD8(src8++);
+				tmp2 = LD8(src8++);
+				tmp3 = LD8(src8++);
+				tmp4 = LD8(src8++);
+				tmp5 = LD8(src8++);
+				tmp6 = LD8(src8++);
+				tmp7 = LD8(src8++);
+				tmp8 = LD8(src8++);
+
+				tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+				tmp1 = __insn_dblalign(tmp1, tmp2, srci);
+				tmp2 = __insn_dblalign(tmp2, tmp3, srci);
+				tmp3 = __insn_dblalign(tmp3, tmp4, srci);
+				tmp4 = __insn_dblalign(tmp4, tmp5, srci);
+				tmp5 = __insn_dblalign(tmp5, tmp6, srci);
+				tmp6 = __insn_dblalign(tmp6, tmp7, srci);
+				tmp7 = __insn_dblalign(tmp7, tmp8, srci);
+
+				__insn_wh64(dst8);
+
+				ST8(dst8++, tmp0);
+				ST8(dst8++, tmp1);
+				ST8(dst8++, tmp2);
+				ST8(dst8++, tmp3);
+				ST8(dst8++, tmp4);
+				ST8(dst8++, tmp5);
+				ST8(dst8++, tmp6);
+				ST8(dst8++, tmp7);
+
+				tmp0 = tmp8;
+			}
+			src8--;
+		}
+
+		/* Copy the rest 8-byte chunks. */
+		if (n >= sizeof(op_t)) {
+			tmp0 = LD8(src8++);
+			for (; n >= sizeof(op_t); n -= sizeof(op_t)) {
+				tmp1 = LD8(src8++);
+				tmp0 = __insn_dblalign(tmp0, tmp1, srci);
+				ST8(dst8++, tmp0);
+				tmp0 = tmp1;
+			}
+			src8--;
 		}
 
 		if (n == 0)
 			return RETVAL;
 
-		b = ((const char *)src8 <= src1_end) ? *src8 : 0;
+		tmp0 = LD8(src8++);
+		tmp1 = ((const char *)src8 <= src1_end)
+			? LD8((op_t *)src8) : 0;
+		final = __insn_dblalign(tmp0, tmp1, srci);
 
-		/*
-		 * Final source bytes to write to trailing partial
-		 * word, if any.
-		 */
-		final = __insn_dblalign(a, b, src1);
 	} else {
 		/* Aligned copy. */
 
-		const word_t* __restrict src8 = (const word_t *)src1;
+		const op_t *__restrict src8 = (const op_t *)src1;
 
 		/* src8 and dst8 are both word-aligned. */
 		if (n >= CHIP_L2_LINE_SIZE()) {
 			/* Copy until 'dst' is cache-line-aligned. */
 			for (; (uintptr_t)dst8 & (CHIP_L2_LINE_SIZE() - 1);
-			     n -= sizeof(word_t))
+			     n -= sizeof(op_t))
 				ST8(dst8++, LD8(src8++));
 
 			for (; n >= CHIP_L2_LINE_SIZE(); ) {
-				__insn_wh64(dst8);
+				op_t tmp0, tmp1, tmp2, tmp3;
+				op_t tmp4, tmp5, tmp6, tmp7;
 
 				/*
 				 * Prefetch and advance to next line
-				 * to prefetch, but don't go past the end
+				 * to prefetch, but don't go past the
+				 * end.
 				 */
 				__insn_prefetch(prefetch);
+
+				/* Make sure prefetch got scheduled
+				   earlier.  */
+				__asm__ ("" : : : "memory");
+
 				prefetch += CHIP_L2_LINE_SIZE();
-				prefetch = (prefetch > src1_end) ? prefetch :
+				prefetch = (prefetch < src1_end) ? prefetch :
 					(const char *)src8;
 
 				/*
-				 * Copy an entire cache line.  Manually
-				 * unrolled to avoid idiosyncracies of
-				 * compiler unrolling.
+				 * Do all the loads before wh64.  This
+				 * is necessary if [src8, src8+7] and
+				 * [dst8, dst8+7] share the same cache
+				 * line and dst8 <= src8, as can be
+				 * the case when called from memmove,
+				 * or with code tested on x86 whose
+				 * memcpy always works with forward
+				 * copies.
 				 */
-#define COPY_WORD(offset) ({ ST8(dst8+offset, LD8(src8+offset)); n -= 8; })
-				COPY_WORD(0);
-				COPY_WORD(1);
-				COPY_WORD(2);
-				COPY_WORD(3);
-				COPY_WORD(4);
-				COPY_WORD(5);
-				COPY_WORD(6);
-				COPY_WORD(7);
-#if CHIP_L2_LINE_SIZE() == 128
-				COPY_WORD(8);
-				COPY_WORD(9);
-				COPY_WORD(10);
-				COPY_WORD(11);
-				COPY_WORD(12);
-				COPY_WORD(13);
-				COPY_WORD(14);
-				COPY_WORD(15);
-#elif CHIP_L2_LINE_SIZE() != 64
-# error Fix code that assumes particular L2 cache line sizes
-#endif
+				tmp0 = LD8(src8++);
+				tmp1 = LD8(src8++);
+				tmp2 = LD8(src8++);
+				tmp3 = LD8(src8++);
+				tmp4 = LD8(src8++);
+				tmp5 = LD8(src8++);
+				tmp6 = LD8(src8++);
+				tmp7 = LD8(src8++);
+
+				/* wh64 and wait for tmp7 load completion. */
+				__asm__ ("move %0, %0; wh64 %1\n"
+					 : : "r"(tmp7), "r"(dst8));
+
+				ST8(dst8++, tmp0);
+				ST8(dst8++, tmp1);
+				ST8(dst8++, tmp2);
+				ST8(dst8++, tmp3);
+				ST8(dst8++, tmp4);
+				ST8(dst8++, tmp5);
+				ST8(dst8++, tmp6);
+				ST8(dst8++, tmp7);
 
-				dst8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
-				src8 += CHIP_L2_LINE_SIZE() / sizeof(word_t);
+				n -= CHIP_L2_LINE_SIZE();
 			}
+#if CHIP_L2_LINE_SIZE() != 64
+# error "Fix code that assumes particular L2 cache line size."
+#endif
 		}
 
-		for (; n >= sizeof(word_t); n -= sizeof(word_t))
+		for (; n >= sizeof(op_t); n -= sizeof(op_t))
 			ST8(dst8++, LD8(src8++));
 
 		if (__builtin_expect(n == 0, 1))
@@ -188,6 +315,7 @@ int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
 
 	/* n != 0 if we get here.  Write out any trailing bytes. */
 	dst1 = (char *)dst8;
+#ifndef __BIG_ENDIAN__
 	if (n & 4) {
 		ST4((uint32_t *)dst1, final);
 		dst1 += 4;
@@ -202,11 +330,30 @@ int USERCOPY_FUNC(void *__restrict dstv, const void *__restrict srcv, size_t n)
 	}
 	if (n)
 		ST1((uint8_t *)dst1, final);
+#else
+	if (n & 4) {
+		ST4((uint32_t *)dst1, final >> 32);
+		dst1 += 4;
+        }
+        else
+        {
+		final >>= 32;
+        }
+	if (n & 2) {
+		ST2((uint16_t *)dst1, final >> 16);
+		dst1 += 2;
+        }
+        else
+        {
+		final >>= 16;
+        }
+	if (n & 1)
+		ST1((uint8_t *)dst1, final >> 8);
+#endif
 
 	return RETVAL;
 }
 
-
 #ifdef USERCOPY_FUNC
 #undef ST1
 #undef ST2
diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c
deleted file mode 100644
index b2fe15e0107..00000000000
--- a/arch/tile/lib/memcpy_tile64.c
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- */
-
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <asm/fixmap.h>
-#include <asm/kmap_types.h>
-#include <asm/tlbflush.h>
-#include <hv/hypervisor.h>
-#include <arch/chip.h>
-
-
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-
-/* Defined in memcpy.S */
-extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n);
-extern unsigned long __copy_to_user_inatomic_asm(
-	void __user *to, const void *from, unsigned long n);
-extern unsigned long __copy_from_user_inatomic_asm(
-	void *to, const void __user *from, unsigned long n);
-extern unsigned long __copy_from_user_zeroing_asm(
-	void *to, const void __user *from, unsigned long n);
-
-typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long);
-
-/* Size above which to consider TLB games for performance */
-#define LARGE_COPY_CUTOFF 2048
-
-/* Communicate to the simulator what we are trying to do. */
-#define sim_allow_multiple_caching(b) \
-  __insn_mtspr(SPR_SIM_CONTROL, \
-   SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS))
-
-/*
- * Copy memory by briefly enabling incoherent cacheline-at-a-time mode.
- *
- * We set up our own source and destination PTEs that we fully control.
- * This is the only way to guarantee that we don't race with another
- * thread that is modifying the PTE; we can't afford to try the
- * copy_{to,from}_user() technique of catching the interrupt, since
- * we must run with interrupts disabled to avoid the risk of some
- * other code seeing the incoherent data in our cache.  (Recall that
- * our cache is indexed by PA, so even if the other code doesn't use
- * our kmap_atomic virtual addresses, they'll still hit in cache using
- * the normal VAs that aren't supposed to hit in cache.)
- */
-static void memcpy_multicache(void *dest, const void *source,
-			      pte_t dst_pte, pte_t src_pte, int len)
-{
-	int idx;
-	unsigned long flags, newsrc, newdst;
-	pmd_t *pmdp;
-	pte_t *ptep;
-	int type0, type1;
-	int cpu = get_cpu();
-
-	/*
-	 * Disable interrupts so that we don't recurse into memcpy()
-	 * in an interrupt handler, nor accidentally reference
-	 * the PA of the source from an interrupt routine.  Also
-	 * notify the simulator that we're playing games so we don't
-	 * generate spurious coherency warnings.
-	 */
-	local_irq_save(flags);
-	sim_allow_multiple_caching(1);
-
-	/* Set up the new dest mapping */
-	type0 = kmap_atomic_idx_push();
-	idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0;
-	newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1));
-	pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst);
-	ptep = pte_offset_kernel(pmdp, newdst);
-	if (pte_val(*ptep) != pte_val(dst_pte)) {
-		set_pte(ptep, dst_pte);
-		local_flush_tlb_page(NULL, newdst, PAGE_SIZE);
-	}
-
-	/* Set up the new source mapping */
-	type1 = kmap_atomic_idx_push();
-	idx += (type0 - type1);
-	src_pte = hv_pte_set_nc(src_pte);
-	src_pte = hv_pte_clear_writable(src_pte);  /* be paranoid */
-	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
-	pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
-	ptep = pte_offset_kernel(pmdp, newsrc);
-	__set_pte(ptep, src_pte);   /* set_pte() would be confused by this */
-	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
-
-	/* Actually move the data. */
-	__memcpy_asm((void *)newdst, (const void *)newsrc, len);
-
-	/*
-	 * Remap the source as locally-cached and not OLOC'ed so that
-	 * we can inval without also invaling the remote cpu's cache.
-	 * This also avoids known errata with inv'ing cacheable oloc data.
-	 */
-	src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
-	src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
-	__set_pte(ptep, src_pte);   /* set_pte() would be confused by this */
-	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
-
-	/*
-	 * Do the actual invalidation, covering the full L2 cache line
-	 * at the end since __memcpy_asm() is somewhat aggressive.
-	 */
-	__inv_buffer((void *)newsrc, len);
-
-	/*
-	 * We're done: notify the simulator that all is back to normal,
-	 * and re-enable interrupts and pre-emption.
-	 */
-	kmap_atomic_idx_pop();
-	kmap_atomic_idx_pop();
-	sim_allow_multiple_caching(0);
-	local_irq_restore(flags);
-	put_cpu();
-}
-
-/*
- * Identify large copies from remotely-cached memory, and copy them
- * via memcpy_multicache() if they look good, otherwise fall back
- * to the particular kind of copying passed as the memcpy_t function.
- */
-static unsigned long fast_copy(void *dest, const void *source, int len,
-			       memcpy_t func)
-{
-	/*
-	 * Check if it's big enough to bother with.  We may end up doing a
-	 * small copy via TLB manipulation if we're near a page boundary,
-	 * but presumably we'll make it up when we hit the second page.
-	 */
-	while (len >= LARGE_COPY_CUTOFF) {
-		int copy_size, bytes_left_on_page;
-		pte_t *src_ptep, *dst_ptep;
-		pte_t src_pte, dst_pte;
-		struct page *src_page, *dst_page;
-
-		/* Is the source page oloc'ed to a remote cpu? */
-retry_source:
-		src_ptep = virt_to_pte(current->mm, (unsigned long)source);
-		if (src_ptep == NULL)
-			break;
-		src_pte = *src_ptep;
-		if (!hv_pte_get_present(src_pte) ||
-		    !hv_pte_get_readable(src_pte) ||
-		    hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3)
-			break;
-		if (get_remote_cache_cpu(src_pte) == smp_processor_id())
-			break;
-		src_page = pfn_to_page(hv_pte_get_pfn(src_pte));
-		get_page(src_page);
-		if (pte_val(src_pte) != pte_val(*src_ptep)) {
-			put_page(src_page);
-			goto retry_source;
-		}
-		if (pte_huge(src_pte)) {
-			/* Adjust the PTE to correspond to a small page */
-			int pfn = hv_pte_get_pfn(src_pte);
-			pfn += (((unsigned long)source & (HPAGE_SIZE-1))
-				>> PAGE_SHIFT);
-			src_pte = pfn_pte(pfn, src_pte);
-			src_pte = pte_mksmall(src_pte);
-		}
-
-		/* Is the destination page writable? */
-retry_dest:
-		dst_ptep = virt_to_pte(current->mm, (unsigned long)dest);
-		if (dst_ptep == NULL) {
-			put_page(src_page);
-			break;
-		}
-		dst_pte = *dst_ptep;
-		if (!hv_pte_get_present(dst_pte) ||
-		    !hv_pte_get_writable(dst_pte)) {
-			put_page(src_page);
-			break;
-		}
-		dst_page = pfn_to_page(hv_pte_get_pfn(dst_pte));
-		if (dst_page == src_page) {
-			/*
-			 * Source and dest are on the same page; this
-			 * potentially exposes us to incoherence if any
-			 * part of src and dest overlap on a cache line.
-			 * Just give up rather than trying to be precise.
-			 */
-			put_page(src_page);
-			break;
-		}
-		get_page(dst_page);
-		if (pte_val(dst_pte) != pte_val(*dst_ptep)) {
-			put_page(dst_page);
-			goto retry_dest;
-		}
-		if (pte_huge(dst_pte)) {
-			/* Adjust the PTE to correspond to a small page */
-			int pfn = hv_pte_get_pfn(dst_pte);
-			pfn += (((unsigned long)dest & (HPAGE_SIZE-1))
-				>> PAGE_SHIFT);
-			dst_pte = pfn_pte(pfn, dst_pte);
-			dst_pte = pte_mksmall(dst_pte);
-		}
-
-		/* All looks good: create a cachable PTE and copy from it */
-		copy_size = len;
-		bytes_left_on_page =
-			PAGE_SIZE - (((int)source) & (PAGE_SIZE-1));
-		if (copy_size > bytes_left_on_page)
-			copy_size = bytes_left_on_page;
-		bytes_left_on_page =
-			PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1));
-		if (copy_size > bytes_left_on_page)
-			copy_size = bytes_left_on_page;
-		memcpy_multicache(dest, source, dst_pte, src_pte, copy_size);
-
-		/* Release the pages */
-		put_page(dst_page);
-		put_page(src_page);
-
-		/* Continue on the next page */
-		dest += copy_size;
-		source += copy_size;
-		len -= copy_size;
-	}
-
-	return func(dest, source, len);
-}
-
-void *memcpy(void *to, const void *from, __kernel_size_t n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return (void *)__memcpy_asm(to, from, n);
-	else
-		return (void *)fast_copy(to, from, n, __memcpy_asm);
-}
-
-unsigned long __copy_to_user_inatomic(void __user *to, const void *from,
-				      unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_to_user_inatomic_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_to_user_inatomic_asm);
-}
-
-unsigned long __copy_from_user_inatomic(void *to, const void __user *from,
-					unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_from_user_inatomic_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_from_user_inatomic_asm);
-}
-
-unsigned long __copy_from_user_zeroing(void *to, const void __user *from,
-				       unsigned long n)
-{
-	if (n < LARGE_COPY_CUTOFF)
-		return __copy_from_user_zeroing_asm(to, from, n);
-	else
-		return fast_copy(to, from, n, __copy_from_user_zeroing_asm);
-}
-
-#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */
diff --git a/arch/tile/lib/memcpy_user_64.c b/arch/tile/lib/memcpy_user_64.c
index 4763b3aff1c..88c7016492c 100644
--- a/arch/tile/lib/memcpy_user_64.c
+++ b/arch/tile/lib/memcpy_user_64.c
@@ -14,7 +14,13 @@
  * Do memcpy(), but trap and return "n" when a load or store faults.
  *
  * Note: this idiom only works when memcpy() compiles to a leaf function.
- * If "sp" is updated during memcpy, the "jrp lr" will be incorrect.
+ * Here leaf function not only means it does not have calls, but also
+ * requires no stack operations (sp, stack frame pointer) and no
+ * use of callee-saved registers, else "jrp lr" will be incorrect since
+ * unwinding stack frame is bypassed. Since memcpy() is not complex so
+ * these conditions are satisfied here, but we need to be careful when
+ * modifying this file. This is not a clean solution but is the best
+ * one so far.
  *
  * Also note that we are capturing "n" from the containing scope here.
  */
@@ -25,6 +31,7 @@
 		    ".pushsection .coldtext.memcpy,\"ax\";"	\
 		    "2: { move r0, %2; jrp lr };"		\
 		    ".section __ex_table,\"a\";"		\
+		    ".align 8;"					\
 		    ".quad 1b, 2b;"				\
 		    ".popsection"				\
 		    : "=m" (*(p)) : "r" (v), "r" (n));		\
@@ -37,6 +44,7 @@
 		    ".pushsection .coldtext.memcpy,\"ax\";"	\
 		    "2: { move r0, %2; jrp lr };"		\
 		    ".section __ex_table,\"a\";"		\
+		    ".align 8;"					\
 		    ".quad 1b, 2b;"				\
 		    ".popsection"				\
 		    : "=r" (__v) : "m" (*(p)), "r" (n));	\
diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c
index 57dbb3a5bff..2042bfe6595 100644
--- a/arch/tile/lib/memset_32.c
+++ b/arch/tile/lib/memset_32.c
@@ -12,13 +12,10 @@
  *   more details.
  */
 
-#include <arch/chip.h>
-
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
-
-#undef memset
+#include <arch/chip.h>
 
 void *memset(void *s, int c, size_t n)
 {
@@ -26,11 +23,7 @@ void *memset(void *s, int c, size_t n)
 	int n32;
 	uint32_t v16, v32;
 	uint8_t *out8 = s;
-#if !CHIP_HAS_WH64()
-	int ahead32;
-#else
 	int to_align32;
-#endif
 
 	/* Experimentation shows that a trivial tight loop is a win up until
 	 * around a size of 20, where writing a word at a time starts to win.
@@ -61,21 +54,6 @@ void *memset(void *s, int c, size_t n)
 		return s;
 	}
 
-#if !CHIP_HAS_WH64()
-	/* Use a spare issue slot to start prefetching the first cache
-	 * line early. This instruction is free as the store can be buried
-	 * in otherwise idle issue slots doing ALU ops.
-	 */
-	__insn_prefetch(out8);
-
-	/* We prefetch the end so that a short memset that spans two cache
-	 * lines gets some prefetching benefit. Again we believe this is free
-	 * to issue.
-	 */
-	__insn_prefetch(&out8[n - 1]);
-#endif /* !CHIP_HAS_WH64() */
-
-
 	/* Align 'out8'. We know n >= 3 so this won't write past the end. */
 	while (((uintptr_t) out8 & 3) != 0) {
 		*out8++ = c;
@@ -96,90 +74,6 @@ void *memset(void *s, int c, size_t n)
 	/* This must be at least 8 or the following loop doesn't work. */
 #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4)
 
-#if !CHIP_HAS_WH64()
-
-	ahead32 = CACHE_LINE_SIZE_IN_WORDS;
-
-	/* We already prefetched the first and last cache lines, so
-	 * we only need to do more prefetching if we are storing
-	 * to more than two cache lines.
-	 */
-	if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) {
-		int i;
-
-		/* Prefetch the next several cache lines.
-		 * This is the setup code for the software-pipelined
-		 * loop below.
-		 */
-#define MAX_PREFETCH 5
-		ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS;
-		if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS)
-			ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS;
-
-		for (i = CACHE_LINE_SIZE_IN_WORDS;
-		     i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS)
-			__insn_prefetch(&out32[i]);
-	}
-
-	if (n32 > ahead32) {
-		while (1) {
-			int j;
-
-			/* Prefetch by reading one word several cache lines
-			 * ahead.  Since loads are non-blocking this will
-			 * cause the full cache line to be read while we are
-			 * finishing earlier cache lines.  Using a store
-			 * here causes microarchitectural performance
-			 * problems where a victimizing store miss goes to
-			 * the head of the retry FIFO and locks the pipe for
-			 * a few cycles.  So a few subsequent stores in this
-			 * loop go into the retry FIFO, and then later
-			 * stores see other stores to the same cache line
-			 * are already in the retry FIFO and themselves go
-			 * into the retry FIFO, filling it up and grinding
-			 * to a halt waiting for the original miss to be
-			 * satisfied.
-			 */
-			__insn_prefetch(&out32[ahead32]);
-
-#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0
-#error "Unhandled CACHE_LINE_SIZE_IN_WORDS"
-#endif
-
-			n32 -= CACHE_LINE_SIZE_IN_WORDS;
-
-			/* Save icache space by only partially unrolling
-			 * this loop.
-			 */
-			for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) {
-				*out32++ = v32;
-				*out32++ = v32;
-				*out32++ = v32;
-				*out32++ = v32;
-			}
-
-			/* To save compiled code size, reuse this loop even
-			 * when we run out of prefetching to do by dropping
-			 * ahead32 down.
-			 */
-			if (n32 <= ahead32) {
-				/* Not even a full cache line left,
-				 * so stop now.
-				 */
-				if (n32 < CACHE_LINE_SIZE_IN_WORDS)
-					break;
-
-				/* Choose a small enough value that we don't
-				 * prefetch past the end.  There's no sense
-				 * in touching cache lines we don't have to.
-				 */
-				ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1;
-			}
-		}
-	}
-
-#else /* CHIP_HAS_WH64() */
-
 	/* Determine how many words we need to emit before the 'out32'
 	 * pointer becomes aligned modulo the cache line size.
 	 */
@@ -236,8 +130,6 @@ void *memset(void *s, int c, size_t n)
 		n32 &= CACHE_LINE_SIZE_IN_WORDS - 1;
 	}
 
-#endif /* CHIP_HAS_WH64() */
-
 	/* Now handle any leftover values. */
 	if (n32 != 0) {
 		do {
diff --git a/arch/tile/lib/memset_64.c b/arch/tile/lib/memset_64.c
index 3873085711d..03ef69cd73d 100644
--- a/arch/tile/lib/memset_64.c
+++ b/arch/tile/lib/memset_64.c
@@ -12,13 +12,11 @@
  *   more details.
  */
 
-#include <arch/chip.h>
-
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
-
-#undef memset
+#include <arch/chip.h>
+#include "string-endian.h"
 
 void *memset(void *s, int c, size_t n)
 {
@@ -70,8 +68,7 @@ void *memset(void *s, int c, size_t n)
 	n64 = n >> 3;
 
 	/* Tile input byte out to 64 bits. */
-	/* KLUDGE */
-	v64 = 0x0101010101010101ULL * (uint8_t)c;
+	v64 = copy_byte(c);
 
 	/* This must be at least 8 or the following loop doesn't work. */
 #define CACHE_LINE_SIZE_IN_DOUBLEWORDS (CHIP_L2_LINE_SIZE() / 8)
diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c
index b16ac49a968..b34f79aada4 100644
--- a/arch/tile/lib/spinlock_32.c
+++ b/arch/tile/lib/spinlock_32.c
@@ -101,7 +101,7 @@ EXPORT_SYMBOL(arch_spin_unlock_wait);
  * preserve the semantic that the same read lock can be acquired in an
  * interrupt context.
  */
-inline int arch_read_trylock(arch_rwlock_t *rwlock)
+int arch_read_trylock(arch_rwlock_t *rwlock)
 {
 	u32 val;
 	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
diff --git a/arch/tile/lib/spinlock_common.h b/arch/tile/lib/spinlock_common.h
index c1010980913..6ac37509fac 100644
--- a/arch/tile/lib/spinlock_common.h
+++ b/arch/tile/lib/spinlock_common.h
@@ -60,5 +60,5 @@ static void delay_backoff(int iterations)
 	loops += __insn_crc32_32(stack_pointer, get_cycles_low()) &
 		(loops - 1);
 
-	relax(1 << exponent);
+	relax(loops);
 }
diff --git a/arch/tile/lib/strchr_32.c b/arch/tile/lib/strchr_32.c
index c94e6f7ae7b..841fe696301 100644
--- a/arch/tile/lib/strchr_32.c
+++ b/arch/tile/lib/strchr_32.c
@@ -16,8 +16,6 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
-#undef strchr
-
 char *strchr(const char *s, int c)
 {
 	int z, g;
diff --git a/arch/tile/lib/strchr_64.c b/arch/tile/lib/strchr_64.c
index 617a9273aaa..fe6e31c06f8 100644
--- a/arch/tile/lib/strchr_64.c
+++ b/arch/tile/lib/strchr_64.c
@@ -15,8 +15,7 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
-
-#undef strchr
+#include "string-endian.h"
 
 char *strchr(const char *s, int c)
 {
@@ -27,19 +26,15 @@ char *strchr(const char *s, int c)
 	const uint64_t *p = (const uint64_t *)(s_int & -8);
 
 	/* Create eight copies of the byte for which we are looking. */
-	const uint64_t goal = 0x0101010101010101ULL * (uint8_t) c;
+	const uint64_t goal = copy_byte(c);
 
 	/* Read the first aligned word, but force bytes before the string to
 	 * match neither zero nor goal (we make sure the high bit of each
 	 * byte is 1, and the low 7 bits are all the opposite of the goal
 	 * byte).
-	 *
-	 * Note that this shift count expression works because we know shift
-	 * counts are taken mod 64.
 	 */
-	const uint64_t before_mask = (1ULL << (s_int << 3)) - 1;
-	uint64_t v = (*p | before_mask) ^
-		(goal & __insn_v1shrsi(before_mask, 1));
+	const uint64_t before_mask = MASK(s_int);
+	uint64_t v = (*p | before_mask) ^ (goal & __insn_v1shrui(before_mask, 1));
 
 	uint64_t zero_matches, goal_matches;
 	while (1) {
@@ -55,8 +50,8 @@ char *strchr(const char *s, int c)
 		v = *++p;
 	}
 
-	z = __insn_ctz(zero_matches);
-	g = __insn_ctz(goal_matches);
+	z = CFZ(zero_matches);
+	g = CFZ(goal_matches);
 
 	/* If we found c before '\0' we got a match. Note that if c == '\0'
 	 * then g == z, and we correctly return the address of the '\0'
diff --git a/arch/tile/lib/string-endian.h b/arch/tile/lib/string-endian.h
new file mode 100644
index 00000000000..2e49cbfe937
--- /dev/null
+++ b/arch/tile/lib/string-endian.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ *
+ * Provide a mask based on the pointer alignment that
+ * sets up non-zero bytes before the beginning of the string.
+ * The MASK expression works because shift counts are taken mod 64.
+ * Also, specify how to count "first" and "last" bits
+ * when the bits have been read as a word.
+ */
+
+#include <asm/byteorder.h>
+
+#ifdef __LITTLE_ENDIAN
+#define MASK(x) (__insn_shl(1ULL, (x << 3)) - 1)
+#define NULMASK(x) ((2ULL << x) - 1)
+#define CFZ(x) __insn_ctz(x)
+#define REVCZ(x) __insn_clz(x)
+#else
+#define MASK(x) (__insn_shl(-2LL, ((-x << 3) - 1)))
+#define NULMASK(x) (-2LL << (63 - x))
+#define CFZ(x) __insn_clz(x)
+#define REVCZ(x) __insn_ctz(x)
+#endif
+
+/*
+ * Create eight copies of the byte in a uint64_t.  Byte Shuffle uses
+ * the bytes of srcB as the index into the dest vector to select a
+ * byte.  With all indices of zero, the first byte is copied into all
+ * the other bytes.
+ */
+static inline uint64_t copy_byte(uint8_t byte)
+{
+	return __insn_shufflebytes(byte, 0, 0);
+}
diff --git a/arch/tile/lib/strlen_32.c b/arch/tile/lib/strlen_32.c
index 4974292a553..f26f88e11e4 100644
--- a/arch/tile/lib/strlen_32.c
+++ b/arch/tile/lib/strlen_32.c
@@ -16,8 +16,6 @@
 #include <linux/string.h>
 #include <linux/module.h>
 
-#undef strlen
-
 size_t strlen(const char *s)
 {
 	/* Get an aligned pointer. */
diff --git a/arch/tile/lib/strlen_64.c b/arch/tile/lib/strlen_64.c
index 1c92d46202a..9583fc3361f 100644
--- a/arch/tile/lib/strlen_64.c
+++ b/arch/tile/lib/strlen_64.c
@@ -15,8 +15,7 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/module.h>
-
-#undef strlen
+#include "string-endian.h"
 
 size_t strlen(const char *s)
 {
@@ -24,15 +23,13 @@ size_t strlen(const char *s)
 	const uintptr_t s_int = (uintptr_t) s;
 	const uint64_t *p = (const uint64_t *)(s_int & -8);
 
-	/* Read the first word, but force bytes before the string to be nonzero.
-	 * This expression works because we know shift counts are taken mod 64.
-	 */
-	uint64_t v = *p | ((1ULL << (s_int << 3)) - 1);
+	/* Read and MASK the first word. */
+	uint64_t v = *p | MASK(s_int);
 
 	uint64_t bits;
 	while ((bits = __insn_v1cmpeqi(v, 0)) == 0)
 		v = *++p;
 
-	return ((const char *)p) + (__insn_ctz(bits) >> 3) - s;
+	return ((const char *)p) + (CFZ(bits) >> 3) - s;
 }
 EXPORT_SYMBOL(strlen);
diff --git a/arch/tile/lib/strnlen_32.c b/arch/tile/lib/strnlen_32.c
new file mode 100644
index 00000000000..1434141d9e0
--- /dev/null
+++ b/arch/tile/lib/strnlen_32.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+size_t strnlen(const char *s, size_t count)
+{
+	/* Get an aligned pointer. */
+	const uintptr_t s_int = (uintptr_t) s;
+	const uint32_t *p = (const uint32_t *)(s_int & -4);
+	size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1));
+	size_t len;
+	uint32_t v, bits;
+
+	/* Avoid page fault risk by not reading any bytes when count is 0. */
+	if (count == 0)
+		return 0;
+
+	/* Read first word, but force bytes before the string to be nonzero. */
+	v = *p | ((1 << ((s_int << 3) & 31)) - 1);
+
+	while ((bits = __insn_seqb(v, 0)) == 0) {
+		if (bytes_read >= count) {
+			/* Read COUNT bytes and didn't find the terminator. */
+			return count;
+		}
+		v = *++p;
+		bytes_read += sizeof(v);
+	}
+
+	len = ((const char *) p) + (__insn_ctz(bits) >> 3) - s;
+	return (len < count ? len : count);
+}
+EXPORT_SYMBOL(strnlen);
diff --git a/arch/tile/lib/strnlen_64.c b/arch/tile/lib/strnlen_64.c
new file mode 100644
index 00000000000..2e8de6a5136
--- /dev/null
+++ b/arch/tile/lib/strnlen_64.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include "string-endian.h"
+
+size_t strnlen(const char *s, size_t count)
+{
+	/* Get an aligned pointer. */
+	const uintptr_t s_int = (uintptr_t) s;
+	const uint64_t *p = (const uint64_t *)(s_int & -8);
+	size_t bytes_read = sizeof(*p) - (s_int & (sizeof(*p) - 1));
+	size_t len;
+	uint64_t v, bits;
+
+	/* Avoid page fault risk by not reading any bytes when count is 0. */
+	if (count == 0)
+		return 0;
+
+	/* Read and MASK the first word. */
+	v = *p | MASK(s_int);
+
+	while ((bits = __insn_v1cmpeqi(v, 0)) == 0) {
+		if (bytes_read >= count) {
+			/* Read COUNT bytes and didn't find the terminator. */
+			return count;
+		}
+		v = *++p;
+		bytes_read += sizeof(v);
+	}
+
+	len = ((const char *) p) + (CFZ(bits) >> 3) - s;
+	return (len < count ? len : count);
+}
+EXPORT_SYMBOL(strnlen);
diff --git a/arch/tile/lib/uaccess.c b/arch/tile/lib/uaccess.c
index f8d398c9ee7..030abe3ee4f 100644
--- a/arch/tile/lib/uaccess.c
+++ b/arch/tile/lib/uaccess.c
@@ -22,11 +22,3 @@ int __range_ok(unsigned long addr, unsigned long size)
 		 is_arch_mappable_range(addr, size));
 }
 EXPORT_SYMBOL(__range_ok);
-
-#ifdef CONFIG_DEBUG_COPY_FROM_USER
-void copy_from_user_overflow(void)
-{
-       WARN(1, "Buffer overflow detected!\n");
-}
-EXPORT_SYMBOL(copy_from_user_overflow);
-#endif
diff --git a/arch/tile/lib/usercopy_32.S b/arch/tile/lib/usercopy_32.S
index 979f76d8374..1bc16222463 100644
--- a/arch/tile/lib/usercopy_32.S
+++ b/arch/tile/lib/usercopy_32.S
@@ -19,82 +19,6 @@
 
 /* Access user memory, but use MMU to avoid propagating kernel exceptions. */
 
-	.pushsection .fixup,"ax"
-
-get_user_fault:
-	{ move r0, zero; move r1, zero }
-	{ movei r2, -EFAULT; jrp lr }
-	ENDPROC(get_user_fault)
-
-put_user_fault:
-	{ movei r0, -EFAULT; jrp lr }
-	ENDPROC(put_user_fault)
-
-	.popsection
-
-/*
- * __get_user_N functions take a pointer in r0, and return 0 in r2
- * on success, with the value in r0; or else -EFAULT in r2.
- */
-#define __get_user_N(bytes, LOAD) \
-	STD_ENTRY(__get_user_##bytes); \
-1:	{ LOAD r0, r0; move r1, zero; move r2, zero }; \
-	jrp lr; \
-	STD_ENDPROC(__get_user_##bytes); \
-	.pushsection __ex_table,"a"; \
-	.word 1b, get_user_fault; \
-	.popsection
-
-__get_user_N(1, lb_u)
-__get_user_N(2, lh_u)
-__get_user_N(4, lw)
-
-/*
- * __get_user_8 takes a pointer in r0, and returns 0 in r2
- * on success, with the value in r0/r1; or else -EFAULT in r2.
- */
-	STD_ENTRY(__get_user_8);
-1:	{ lw r0, r0; addi r1, r0, 4 };
-2:	{ lw r1, r1; move r2, zero };
-	jrp lr;
-	STD_ENDPROC(__get_user_8);
-	.pushsection __ex_table,"a";
-	.word 1b, get_user_fault;
-	.word 2b, get_user_fault;
-	.popsection
-
-/*
- * __put_user_N functions take a value in r0 and a pointer in r1,
- * and return 0 in r0 on success or -EFAULT on failure.
- */
-#define __put_user_N(bytes, STORE) \
-	STD_ENTRY(__put_user_##bytes); \
-1:	{ STORE r1, r0; move r0, zero }; \
-	jrp lr; \
-	STD_ENDPROC(__put_user_##bytes); \
-	.pushsection __ex_table,"a"; \
-	.word 1b, put_user_fault; \
-	.popsection
-
-__put_user_N(1, sb)
-__put_user_N(2, sh)
-__put_user_N(4, sw)
-
-/*
- * __put_user_8 takes a value in r0/r1 and a pointer in r2,
- * and returns 0 in r0 on success or -EFAULT on failure.
- */
-STD_ENTRY(__put_user_8)
-1:      { sw r2, r0; addi r2, r2, 4 }
-2:      { sw r2, r1; move r0, zero }
-	jrp lr
-	STD_ENDPROC(__put_user_8)
-	.pushsection __ex_table,"a"
-	.word 1b, put_user_fault
-	.word 2b, put_user_fault
-	.popsection
-
-
 /*
  * strnlen_user_asm takes the pointer in r0, and the length bound in r1.
  * It returns the length, including the terminating NUL, or zero on exception.
@@ -112,6 +36,7 @@ strnlen_user_fault:
 	{ move r0, zero; jrp lr }
 	ENDPROC(strnlen_user_fault)
 	.section __ex_table,"a"
+	.align 4
 	.word 1b, strnlen_user_fault
 	.popsection
 
@@ -123,18 +48,20 @@ strnlen_user_fault:
  */
 STD_ENTRY(strncpy_from_user_asm)
 	{ bz r2, 2f; move r3, r0 }
-1:      { lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
+1:	{ lb_u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
 	{ sb r0, r4; addi r0, r0, 1 }
-	bz r2, 2f
-	bnzt r4, 1b
-	addi r0, r0, -1   /* don't count the trailing NUL */
-2:      { sub r0, r0, r3; jrp lr }
+	bz r4, 2f
+	bnzt r2, 1b
+	{ sub r0, r0, r3; jrp lr }
+2:	addi r0, r0, -1   /* don't count the trailing NUL */
+	{ sub r0, r0, r3; jrp lr }
 	STD_ENDPROC(strncpy_from_user_asm)
 	.pushsection .fixup,"ax"
 strncpy_from_user_fault:
 	{ movei r0, -EFAULT; jrp lr }
 	ENDPROC(strncpy_from_user_fault)
 	.section __ex_table,"a"
+	.align 4
 	.word 1b, strncpy_from_user_fault
 	.popsection
 
@@ -153,6 +80,7 @@ STD_ENTRY(clear_user_asm)
 	bnzt r1, 1b
 2:      { move r0, r1; jrp lr }
 	.pushsection __ex_table,"a"
+	.align 4
 	.word 1b, 2b
 	.popsection
 
@@ -162,6 +90,7 @@ STD_ENTRY(clear_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(clear_user_asm)
 	.pushsection __ex_table,"a"
+	.align 4
 	.word 1b, 2b
 	.popsection
 
@@ -181,25 +110,7 @@ STD_ENTRY(flush_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(flush_user_asm)
 	.pushsection __ex_table,"a"
-	.word 1b, 2b
-	.popsection
-
-/*
- * inv_user_asm takes the user target address in r0 and the
- * number of bytes to invalidate in r1.
- * It returns the number of not inv'able bytes (hopefully zero) in r0.
- */
-STD_ENTRY(inv_user_asm)
-	bz r1, 2f
-	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
-	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
-	{ and r0, r0, r2; and r1, r1, r2 }
-	{ sub r1, r1, r0 }
-1:      { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
-	{ addi r0, r0, CHIP_INV_STRIDE(); bnzt r1, 1b }
-2:      { move r0, r1; jrp lr }
-	STD_ENDPROC(inv_user_asm)
-	.pushsection __ex_table,"a"
+	.align 4
 	.word 1b, 2b
 	.popsection
 
@@ -219,5 +130,6 @@ STD_ENTRY(finv_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(finv_user_asm)
 	.pushsection __ex_table,"a"
+	.align 4
 	.word 1b, 2b
 	.popsection
diff --git a/arch/tile/lib/usercopy_64.S b/arch/tile/lib/usercopy_64.S
index 2ff44f87b78..b3b31a3306f 100644
--- a/arch/tile/lib/usercopy_64.S
+++ b/arch/tile/lib/usercopy_64.S
@@ -19,55 +19,6 @@
 
 /* Access user memory, but use MMU to avoid propagating kernel exceptions. */
 
-	.pushsection .fixup,"ax"
-
-get_user_fault:
-	{ movei r1, -EFAULT; move r0, zero }
-	jrp lr
-	ENDPROC(get_user_fault)
-
-put_user_fault:
-	{ movei r0, -EFAULT; jrp lr }
-	ENDPROC(put_user_fault)
-
-	.popsection
-
-/*
- * __get_user_N functions take a pointer in r0, and return 0 in r1
- * on success, with the value in r0; or else -EFAULT in r1.
- */
-#define __get_user_N(bytes, LOAD) \
-	STD_ENTRY(__get_user_##bytes); \
-1:	{ LOAD r0, r0; move r1, zero }; \
-	jrp lr; \
-	STD_ENDPROC(__get_user_##bytes); \
-	.pushsection __ex_table,"a"; \
-	.quad 1b, get_user_fault; \
-	.popsection
-
-__get_user_N(1, ld1u)
-__get_user_N(2, ld2u)
-__get_user_N(4, ld4u)
-__get_user_N(8, ld)
-
-/*
- * __put_user_N functions take a value in r0 and a pointer in r1,
- * and return 0 in r0 on success or -EFAULT on failure.
- */
-#define __put_user_N(bytes, STORE) \
-	STD_ENTRY(__put_user_##bytes); \
-1:	{ STORE r1, r0; move r0, zero }; \
-	jrp lr; \
-	STD_ENDPROC(__put_user_##bytes); \
-	.pushsection __ex_table,"a"; \
-	.quad 1b, put_user_fault; \
-	.popsection
-
-__put_user_N(1, st1)
-__put_user_N(2, st2)
-__put_user_N(4, st4)
-__put_user_N(8, st)
-
 /*
  * strnlen_user_asm takes the pointer in r0, and the length bound in r1.
  * It returns the length, including the terminating NUL, or zero on exception.
@@ -85,6 +36,7 @@ strnlen_user_fault:
 	{ move r0, zero; jrp lr }
 	ENDPROC(strnlen_user_fault)
 	.section __ex_table,"a"
+	.align 8
 	.quad 1b, strnlen_user_fault
 	.popsection
 
@@ -96,18 +48,20 @@ strnlen_user_fault:
  */
 STD_ENTRY(strncpy_from_user_asm)
 	{ beqz r2, 2f; move r3, r0 }
-1:      { ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
+1:	{ ld1u r4, r1; addi r1, r1, 1; addi r2, r2, -1 }
 	{ st1 r0, r4; addi r0, r0, 1 }
-	beqz r2, 2f
-	bnezt r4, 1b
-	addi r0, r0, -1   /* don't count the trailing NUL */
-2:      { sub r0, r0, r3; jrp lr }
+	beqz r4, 2f
+	bnezt r2, 1b
+	{ sub r0, r0, r3; jrp lr }
+2:	addi r0, r0, -1   /* don't count the trailing NUL */
+	{ sub r0, r0, r3; jrp lr }
 	STD_ENDPROC(strncpy_from_user_asm)
 	.pushsection .fixup,"ax"
 strncpy_from_user_fault:
 	{ movei r0, -EFAULT; jrp lr }
 	ENDPROC(strncpy_from_user_fault)
 	.section __ex_table,"a"
+	.align 8
 	.quad 1b, strncpy_from_user_fault
 	.popsection
 
@@ -126,6 +80,7 @@ STD_ENTRY(clear_user_asm)
 	bnezt r1, 1b
 2:      { move r0, r1; jrp lr }
 	.pushsection __ex_table,"a"
+	.align 8
 	.quad 1b, 2b
 	.popsection
 
@@ -135,6 +90,7 @@ STD_ENTRY(clear_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(clear_user_asm)
 	.pushsection __ex_table,"a"
+	.align 8
 	.quad 1b, 2b
 	.popsection
 
@@ -154,25 +110,7 @@ STD_ENTRY(flush_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(flush_user_asm)
 	.pushsection __ex_table,"a"
-	.quad 1b, 2b
-	.popsection
-
-/*
- * inv_user_asm takes the user target address in r0 and the
- * number of bytes to invalidate in r1.
- * It returns the number of not inv'able bytes (hopefully zero) in r0.
- */
-STD_ENTRY(inv_user_asm)
-	beqz r1, 2f
-	{ movei r2, L2_CACHE_BYTES; add r1, r0, r1 }
-	{ sub r2, zero, r2; addi r1, r1, L2_CACHE_BYTES-1 }
-	{ and r0, r0, r2; and r1, r1, r2 }
-	{ sub r1, r1, r0 }
-1:      { inv r0; addi r1, r1, -CHIP_INV_STRIDE() }
-	{ addi r0, r0, CHIP_INV_STRIDE(); bnezt r1, 1b }
-2:      { move r0, r1; jrp lr }
-	STD_ENDPROC(inv_user_asm)
-	.pushsection __ex_table,"a"
+	.align 8
 	.quad 1b, 2b
 	.popsection
 
@@ -192,5 +130,6 @@ STD_ENTRY(finv_user_asm)
 2:      { move r0, r1; jrp lr }
 	STD_ENDPROC(finv_user_asm)
 	.pushsection __ex_table,"a"
+	.align 8
 	.quad 1b, 2b
 	.popsection
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 55e58e93bfc..23f044e8a7a 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -21,6 +21,8 @@
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/sections.h>
+#include <asm/vdso.h>
+#include <arch/sim.h>
 
 /* Notify a running simulator, if any, that an exec just occurred. */
 static void sim_notify_exec(const char *binary_name)
@@ -35,28 +37,57 @@ static void sim_notify_exec(const char *binary_name)
 	} while (c);
 }
 
-static int notify_exec(void)
+static int notify_exec(struct mm_struct *mm)
 {
-	int retval = 0;  /* failure */
-	struct vm_area_struct *vma = current->mm->mmap;
-	while (vma) {
-		if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file)
+	char *buf, *path;
+	struct vm_area_struct *vma;
+
+	if (!sim_is_simulator())
+		return 1;
+
+	if (mm->exe_file == NULL)
+		return 0;
+
+	for (vma = current->mm->mmap; ; vma = vma->vm_next) {
+		if (vma == NULL)
+			return 0;
+		if (vma->vm_file == mm->exe_file)
 			break;
-		vma = vma->vm_next;
 	}
-	if (vma) {
-		char *buf = (char *) __get_free_page(GFP_KERNEL);
-		if (buf) {
-			char *path = d_path(&vma->vm_file->f_path,
-					    buf, PAGE_SIZE);
-			if (!IS_ERR(path)) {
-				sim_notify_exec(path);
-				retval = 1;
-			}
-			free_page((unsigned long)buf);
+
+	buf = (char *) __get_free_page(GFP_KERNEL);
+	if (buf == NULL)
+		return 0;
+
+	path = d_path(&mm->exe_file->f_path, buf, PAGE_SIZE);
+	if (IS_ERR(path)) {
+		free_page((unsigned long)buf);
+		return 0;
+	}
+
+	/*
+	 * Notify simulator of an ET_DYN object so we know the load address.
+	 * The somewhat cryptic overuse of SIM_CONTROL_DLOPEN allows us
+	 * to be backward-compatible with older simulator releases.
+	 */
+	if (vma->vm_start == (ELF_ET_DYN_BASE & PAGE_MASK)) {
+		char buf[64];
+		int i;
+
+		snprintf(buf, sizeof(buf), "0x%lx:@", vma->vm_start);
+		for (i = 0; ; ++i) {
+			char c = buf[i];
+			__insn_mtspr(SPR_SIM_CONTROL,
+				     (SIM_CONTROL_DLOPEN
+				      | (c << _SIM_CONTROL_OPERATOR_BITS)));
+			if (c == '\0')
+				break;
 		}
 	}
-	return retval;
+
+	sim_notify_exec(path);
+	free_page((unsigned long)buf);
+	return 1;
 }
 
 /* Notify a running simulator, if any, that we loaded an interpreter. */
@@ -72,63 +103,23 @@ static void sim_notify_interp(unsigned long load_addr)
 }
 
 
-/* Kernel address of page used to map read-only kernel data into userspace. */
-static void *vdso_page;
-
-/* One-entry array used for install_special_mapping. */
-static struct page *vdso_pages[1];
-
-static int __init vdso_setup(void)
-{
-	vdso_page = (void *)get_zeroed_page(GFP_ATOMIC);
-	memcpy(vdso_page, __rt_sigreturn, __rt_sigreturn_end - __rt_sigreturn);
-	vdso_pages[0] = virt_to_page(vdso_page);
-	return 0;
-}
-device_initcall(vdso_setup);
-
-const char *arch_vma_name(struct vm_area_struct *vma)
-{
-	if (vma->vm_private_data == vdso_pages)
-		return "[vdso]";
-#ifndef __tilegx__
-	if (vma->vm_start == MEM_USER_INTRPT)
-		return "[intrpt]";
-#endif
-	return NULL;
-}
-
 int arch_setup_additional_pages(struct linux_binprm *bprm,
 				int executable_stack)
 {
 	struct mm_struct *mm = current->mm;
-	unsigned long vdso_base;
 	int retval = 0;
 
+	down_write(&mm->mmap_sem);
+
 	/*
 	 * Notify the simulator that an exec just occurred.
 	 * If we can't find the filename of the mapping, just use
 	 * whatever was passed as the linux_binprm filename.
 	 */
-	if (!notify_exec())
+	if (!notify_exec(mm))
 		sim_notify_exec(bprm->filename);
 
-	down_write(&mm->mmap_sem);
-
-	/*
-	 * MAYWRITE to allow gdb to COW and set breakpoints
-	 *
-	 * Make sure the vDSO gets into every core dump.  Dumping its
-	 * contents makes post-mortem fully interpretable later
-	 * without matching up the same kernel and hardware config to
-	 * see what PC values meant.
-	 */
-	vdso_base = VDSO_BASE;
-	retval = install_special_mapping(mm, vdso_base, PAGE_SIZE,
-					 VM_READ|VM_EXEC|
-					 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
-					 VM_ALWAYSDUMP,
-					 vdso_pages);
+	retval = setup_vdso_pages();
 
 #ifndef __tilegx__
 	/*
@@ -140,7 +131,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
 	if (!retval) {
 		unsigned long addr = MEM_USER_INTRPT;
 		addr = mmap_region(NULL, addr, INTRPT_SIZE,
-				   MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
 				   VM_READ|VM_EXEC|
 				   VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
 		if (addr > (unsigned long) -PAGE_SIZE)
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index c1eaaa1fcc2..6c0571216a9 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -34,8 +34,8 @@
 #include <linux/hugetlb.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
+#include <linux/kdebug.h>
 
-#include <asm/system.h>
 #include <asm/pgalloc.h>
 #include <asm/sections.h>
 #include <asm/traps.h>
@@ -71,9 +71,10 @@ static noinline void force_sig_info_fault(const char *type, int si_signo,
  * Synthesize the fault a PL0 process would get by doing a word-load of
  * an unaligned address or a high kernel address.
  */
-SYSCALL_DEFINE2(cmpxchg_badaddr, unsigned long, address,
-		struct pt_regs *, regs)
+SYSCALL_DEFINE1(cmpxchg_badaddr, unsigned long, address)
 {
+	struct pt_regs *regs = current_pt_regs();
+
 	if (address >= PAGE_OFFSET)
 		force_sig_info_fault("atomic segfault", SIGSEGV, SEGV_MAPERR,
 				     address, INT_DTLB_MISS, current, regs);
@@ -122,16 +123,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 	pmd_k = pmd_offset(pud_k, address);
 	if (!pmd_present(*pmd_k))
 		return NULL;
-	if (!pmd_present(*pmd)) {
+	if (!pmd_present(*pmd))
 		set_pmd(pmd, *pmd_k);
-		arch_flush_lazy_mmu_mode();
-	} else
+	else
 		BUG_ON(pmd_ptfn(*pmd) != pmd_ptfn(*pmd_k));
 	return pmd_k;
 }
 
 /*
- * Handle a fault on the vmalloc or module mapping area
+ * Handle a fault on the vmalloc area.
  */
 static inline int vmalloc_fault(pgd_t *pgd, unsigned long address)
 {
@@ -149,8 +149,6 @@ static inline int vmalloc_fault(pgd_t *pgd, unsigned long address)
 	pmd_k = vmalloc_sync_one(pgd, address);
 	if (!pmd_k)
 		return -1;
-	if (pmd_huge(*pmd_k))
-		return 0;   /* support TILE huge_vmap() API */
 	pte_k = pte_offset_kernel(pmd_k, address);
 	if (!pte_present(*pte_k))
 		return -1;
@@ -188,7 +186,7 @@ static pgd_t *get_current_pgd(void)
 	HV_Context ctx = hv_inquire_context();
 	unsigned long pgd_pfn = ctx.page_table >> PAGE_SHIFT;
 	struct page *pgd_page = pfn_to_page(pgd_pfn);
-	BUG_ON(PageHighMem(pgd_page));   /* oops, HIGHPTE? */
+	BUG_ON(PageHighMem(pgd_page));
 	return (pgd_t *) __va(ctx.page_table);
 }
 
@@ -204,9 +202,14 @@ static pgd_t *get_current_pgd(void)
  * interrupt or a critical region, and must do as little as possible.
  * Similarly, we can't use atomic ops here, since we may be handling a
  * fault caused by an atomic op access.
+ *
+ * If we find a migrating PTE while we're in an NMI context, and we're
+ * at a PC that has a registered exception handler, we don't wait,
+ * since this thread may (e.g.) have been interrupted while migrating
+ * its own stack, which would then cause us to self-deadlock.
  */
 static int handle_migrating_pte(pgd_t *pgd, int fault_num,
-				unsigned long address,
+				unsigned long address, unsigned long pc,
 				int is_kernel_mode, int write)
 {
 	pud_t *pud;
@@ -228,6 +231,8 @@ static int handle_migrating_pte(pgd_t *pgd, int fault_num,
 		pte_offset_kernel(pmd, address);
 	pteval = *pte;
 	if (pte_migrating(pteval)) {
+		if (in_nmi() && search_exception_tables(pc))
+			return 0;
 		wait_for_migration(pte);
 		return 1;
 	}
@@ -267,12 +272,15 @@ static int handle_page_fault(struct pt_regs *regs,
 	int si_code;
 	int is_kernel_mode;
 	pgd_t *pgd;
+	unsigned int flags;
 
 	/* on TILE, protection faults are always writes */
 	if (!is_page_fault)
 		write = 1;
 
-	is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
+	flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+	is_kernel_mode = !user_mode(regs);
 
 	tsk = validate_current();
 
@@ -301,7 +309,7 @@ static int handle_page_fault(struct pt_regs *regs,
 	 * rather than trying to patch up the existing PTE.
 	 */
 	pgd = get_current_pgd();
-	if (handle_migrating_pte(pgd, fault_num, address,
+	if (handle_migrating_pte(pgd, fault_num, address, regs->pc,
 				 is_kernel_mode, write))
 		return 1;
 
@@ -336,9 +344,12 @@ static int handle_page_fault(struct pt_regs *regs,
 	/*
 	 * If we're trying to touch user-space addresses, we must
 	 * be either at PL0, or else with interrupts enabled in the
-	 * kernel, so either way we can re-enable interrupts here.
+	 * kernel, so either way we can re-enable interrupts here
+	 * unless we are doing atomic access to user space with
+	 * interrupts disabled.
 	 */
-	local_irq_enable();
+	if (!(regs->flags & PT_FLAGS_DISABLE_IRQ))
+		local_irq_enable();
 
 	mm = tsk->mm;
 
@@ -351,6 +362,9 @@ static int handle_page_fault(struct pt_regs *regs,
 		goto bad_area_nosemaphore;
 	}
 
+	if (!is_kernel_mode)
+		flags |= FAULT_FLAG_USER;
+
 	/*
 	 * When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
@@ -373,6 +387,8 @@ static int handle_page_fault(struct pt_regs *regs,
 			vma = NULL;  /* happy compiler */
 			goto bad_area_nosemaphore;
 		}
+
+retry:
 		down_read(&mm->mmap_sem);
 	}
 
@@ -409,18 +425,22 @@ good_area:
 #endif
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!is_page_fault || !(vma->vm_flags & VM_READ))
 			goto bad_area;
 	}
 
- survive:
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	fault = handle_mm_fault(mm, vma, address, write);
+	fault = handle_mm_fault(mm, vma, address, flags);
+
+	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
+		return 0;
+
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
@@ -428,33 +448,33 @@ good_area:
 			goto do_sigbus;
 		BUG();
 	}
-	if (fault & VM_FAULT_MAJOR)
-		tsk->maj_flt++;
-	else
-		tsk->min_flt++;
+	if (flags & FAULT_FLAG_ALLOW_RETRY) {
+		if (fault & VM_FAULT_MAJOR)
+			tsk->maj_flt++;
+		else
+			tsk->min_flt++;
+		if (fault & VM_FAULT_RETRY) {
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+			flags |= FAULT_FLAG_TRIED;
+
+			 /*
+			  * No need to up_read(&mm->mmap_sem) as we would
+			  * have already released it in __lock_page_or_retry
+			  * in mm/filemap.c.
+			  */
+			goto retry;
+		}
+	}
 
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
-	/*
-	 * If this was an asynchronous fault,
-	 * restart the appropriate engine.
-	 */
-	switch (fault_num) {
 #if CHIP_HAS_TILE_DMA()
+	/* If this was a DMA TLB fault, restart the DMA engine. */
+	switch (fault_num) {
 	case INT_DMATLB_MISS:
 	case INT_DMATLB_MISS_DWNCL:
 	case INT_DMATLB_ACCESS:
 	case INT_DMATLB_ACCESS_DWNCL:
 		__insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__REQUEST_MASK);
 		break;
-#endif
-#if CHIP_HAS_SN_PROC()
-	case INT_SNITLB_MISS:
-	case INT_SNITLB_MISS_DWNCL:
-		__insn_mtspr(SPR_SNCTL,
-			     __insn_mfspr(SPR_SNCTL) &
-			     ~SPR_SNCTL__FRZPROC_MASK);
-		break;
-#endif
 	}
 #endif
 
@@ -535,15 +555,10 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (is_global_init(tsk)) {
-		yield();
-		down_read(&mm->mmap_sem);
-		goto survive;
-	}
-	pr_alert("VM: killing process %s\n", tsk->comm);
-	if (!is_kernel_mode)
-		do_group_exit(SIGKILL);
-	goto no_context;
+	if (is_kernel_mode)
+		goto no_context;
+	pagefault_out_of_memory();
+	return 0;
 
 do_sigbus:
 	up_read(&mm->mmap_sem);
@@ -666,7 +681,7 @@ struct intvec_state do_page_fault_ics(struct pt_regs *regs, int fault_num,
 	 */
 	if (fault_num == INT_DTLB_ACCESS)
 		write = 1;
-	if (handle_migrating_pte(pgd, fault_num, address, 1, write))
+	if (handle_migrating_pte(pgd, fault_num, address, pc, 1, write))
 		return state;
 
 	/* Return zero so that we continue on with normal fault handling. */
@@ -689,8 +704,60 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 {
 	int is_page_fault;
 
+#ifdef CONFIG_KPROBES
+	/*
+	 * This is to notify the fault handler of the kprobes.  The
+	 * exception code is redundant as it is also carried in REGS,
+	 * but we pass it anyhow.
+	 */
+	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, -1,
+		       regs->faultnum, SIGSEGV) == NOTIFY_STOP)
+		return;
+#endif
+
+#ifdef __tilegx__
+	/*
+	 * We don't need early do_page_fault_ics() support, since unlike
+	 * Pro we don't need to worry about unlocking the atomic locks.
+	 * There is only one current case in GX where we touch any memory
+	 * under ICS other than our own kernel stack, and we handle that
+	 * here.  (If we crash due to trying to touch our own stack,
+	 * we're in too much trouble for C code to help out anyway.)
+	 */
+	if (write & ~1) {
+		unsigned long pc = write & ~1;
+		if (pc >= (unsigned long) __start_unalign_asm_code &&
+		    pc < (unsigned long) __end_unalign_asm_code) {
+			struct thread_info *ti = current_thread_info();
+			/*
+			 * Our EX_CONTEXT is still what it was from the
+			 * initial unalign exception, but now we've faulted
+			 * on the JIT page.  We would like to complete the
+			 * page fault however is appropriate, and then retry
+			 * the instruction that caused the unalign exception.
+			 * Our state has been "corrupted" by setting the low
+			 * bit in "sp", and stashing r0..r3 in the
+			 * thread_info area, so we revert all of that, then
+			 * continue as if this were a normal page fault.
+			 */
+			regs->sp &= ~1UL;
+			regs->regs[0] = ti->unalign_jit_tmp[0];
+			regs->regs[1] = ti->unalign_jit_tmp[1];
+			regs->regs[2] = ti->unalign_jit_tmp[2];
+			regs->regs[3] = ti->unalign_jit_tmp[3];
+			write &= 1;
+		} else {
+			pr_alert("%s/%d: ICS set at page fault at %#lx: %#lx\n",
+				 current->comm, current->pid, pc, address);
+			show_regs(regs);
+			do_group_exit(SIGKILL);
+			return;
+		}
+	}
+#else
 	/* This case should have been handled by do_page_fault_ics(). */
 	BUG_ON(write & ~1);
+#endif
 
 #if CHIP_HAS_TILE_DMA()
 	/*
@@ -719,10 +786,6 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 	case INT_DMATLB_MISS:
 	case INT_DMATLB_MISS_DWNCL:
 #endif
-#if CHIP_HAS_SN_PROC()
-	case INT_SNITLB_MISS:
-	case INT_SNITLB_MISS_DWNCL:
-#endif
 		is_page_fault = 1;
 		break;
 
@@ -738,8 +801,8 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 		panic("Bad fault number %d in do_page_fault", fault_num);
 	}
 
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
-	if (EX1_PL(regs->ex1) != USER_PL) {
+#if CHIP_HAS_TILE_DMA()
+	if (!user_mode(regs)) {
 		struct async_tlb *async;
 		switch (fault_num) {
 #if CHIP_HAS_TILE_DMA()
@@ -750,12 +813,6 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 			async = &current->thread.dma_async_tlb;
 			break;
 #endif
-#if CHIP_HAS_SN_PROC()
-		case INT_SNITLB_MISS:
-		case INT_SNITLB_MISS_DWNCL:
-			async = &current->thread.sn_async_tlb;
-			break;
-#endif
 		default:
 			async = NULL;
 		}
@@ -788,14 +845,22 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 }
 
 
-#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
+#if CHIP_HAS_TILE_DMA()
 /*
- * Check an async_tlb structure to see if a deferred fault is waiting,
- * and if so pass it to the page-fault code.
+ * This routine effectively re-issues asynchronous page faults
+ * when we are returning to user space.
  */
-static void handle_async_page_fault(struct pt_regs *regs,
-				    struct async_tlb *async)
+void do_async_page_fault(struct pt_regs *regs)
 {
+	struct async_tlb *async = &current->thread.dma_async_tlb;
+
+	/*
+	 * Clear thread flag early.  If we re-interrupt while processing
+	 * code here, we will reset it and recall this routine before
+	 * returning to user space.
+	 */
+	clear_thread_flag(TIF_ASYNC_TLB);
+
 	if (async->fault_num) {
 		/*
 		 * Clear async->fault_num before calling the page-fault
@@ -809,35 +874,15 @@ static void handle_async_page_fault(struct pt_regs *regs,
 				  async->address, async->is_write);
 	}
 }
-
-/*
- * This routine effectively re-issues asynchronous page faults
- * when we are returning to user space.
- */
-void do_async_page_fault(struct pt_regs *regs)
-{
-	/*
-	 * Clear thread flag early.  If we re-interrupt while processing
-	 * code here, we will reset it and recall this routine before
-	 * returning to user space.
-	 */
-	clear_thread_flag(TIF_ASYNC_TLB);
-
-#if CHIP_HAS_TILE_DMA()
-	handle_async_page_fault(regs, &current->thread.dma_async_tlb);
-#endif
-#if CHIP_HAS_SN_PROC()
-	handle_async_page_fault(regs, &current->thread.sn_async_tlb);
-#endif
-}
-#endif /* CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() */
+#endif /* CHIP_HAS_TILE_DMA() */
 
 
 void vmalloc_sync_all(void)
 {
 #ifdef __tilegx__
 	/* Currently all L1 kernel pmd's are static and shared. */
-	BUG_ON(pgd_index(VMALLOC_END) != pgd_index(VMALLOC_START));
+	BUILD_BUG_ON(pgd_index(VMALLOC_END - PAGE_SIZE) !=
+		     pgd_index(VMALLOC_START));
 #else
 	/*
 	 * Note that races in the updates of insync and start aren't
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
index 31dbbd9afe4..0dc21829477 100644
--- a/arch/tile/mm/highmem.c
+++ b/arch/tile/mm/highmem.c
@@ -93,7 +93,7 @@ static DEFINE_PER_CPU(struct kmap_amps, amps);
  * If we examine it earlier we are exposed to a race where it looks
  * writable earlier, but becomes immutable before we write the PTE.
  */
-static void kmap_atomic_register(struct page *page, enum km_type type,
+static void kmap_atomic_register(struct page *page, int type,
 				 unsigned long va, pte_t *ptep, pte_t pteval)
 {
 	unsigned long flags;
@@ -114,7 +114,6 @@ static void kmap_atomic_register(struct page *page, enum km_type type,
 
 	list_add(&amp->list, &amp_list);
 	set_pte(ptep, pteval);
-	arch_flush_lazy_mmu_mode();
 
 	spin_unlock(&amp_lock);
 	homecache_kpte_unlock(flags);
@@ -224,12 +223,12 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 }
 EXPORT_SYMBOL(kmap_atomic_prot);
 
-void *__kmap_atomic(struct page *page)
+void *kmap_atomic(struct page *page)
 {
 	/* PAGE_NONE is a magic value that tells us to check immutability. */
 	return kmap_atomic_prot(page, PAGE_NONE);
 }
-EXPORT_SYMBOL(__kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic);
 
 void __kunmap_atomic(void *kvaddr)
 {
@@ -259,7 +258,6 @@ void __kunmap_atomic(void *kvaddr)
 		BUG_ON(vaddr >= (unsigned long)high_memory);
 	}
 
-	arch_flush_lazy_mmu_mode();
 	pagefault_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index 1cc6ae477c9..33294fdc402 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -30,6 +30,7 @@
 #include <linux/cache.h>
 #include <linux/smp.h>
 #include <linux/module.h>
+#include <linux/hugetlb.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -42,12 +43,9 @@
 #include "migrate.h"
 
 
-#if CHIP_HAS_COHERENT_LOCAL_CACHE()
-
 /*
  * The noallocl2 option suppresses all use of the L2 cache to cache
- * locally from a remote home.  There's no point in using it if we
- * don't have coherent local caching, though.
+ * locally from a remote home.
  */
 static int __write_once noallocl2;
 static int __init set_noallocl2(char *str)
@@ -57,16 +55,6 @@ static int __init set_noallocl2(char *str)
 }
 early_param("noallocl2", set_noallocl2);
 
-#else
-
-#define noallocl2 0
-
-#endif
-
-/* Provide no-op versions of these routines to keep flush_remote() cleaner. */
-#define mark_caches_evicted_start() 0
-#define mark_caches_evicted_finish(mask, timestamp) do {} while (0)
-
 
 /*
  * Update the irq_stat for cpus that we are going to interrupt
@@ -106,7 +94,6 @@ static void hv_flush_update(const struct cpumask *cache_cpumask,
  *    there's never any good reason for hv_flush_remote() to fail.
  *  - Accepts a 32-bit PFN rather than a 64-bit PA, which generally
  *    is the type that Linux wants to pass around anyway.
- *  - Centralizes the mark_caches_evicted() handling.
  *  - Canonicalizes that lengths of zero make cpumasks NULL.
  *  - Handles deferring TLB flushes for dataplane tiles.
  *  - Tracks remote interrupts in the per-cpu irq_cpustat_t.
@@ -125,7 +112,6 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,
 		  HV_Remote_ASID *asids, int asidcount)
 {
 	int rc;
-	int timestamp = 0;  /* happy compiler */
 	struct cpumask cache_cpumask_copy, tlb_cpumask_copy;
 	struct cpumask *cache_cpumask, *tlb_cpumask;
 	HV_PhysAddr cache_pa;
@@ -156,15 +142,11 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,
 	hv_flush_update(cache_cpumask, tlb_cpumask, tlb_va, tlb_length,
 			asids, asidcount);
 	cache_pa = (HV_PhysAddr)cache_pfn << PAGE_SHIFT;
-	if (cache_control & HV_FLUSH_EVICT_L2)
-		timestamp = mark_caches_evicted_start();
 	rc = hv_flush_remote(cache_pa, cache_control,
 			     cpumask_bits(cache_cpumask),
 			     tlb_va, tlb_length, tlb_pgsize,
 			     cpumask_bits(tlb_cpumask),
 			     asids, asidcount);
-	if (cache_control & HV_FLUSH_EVICT_L2)
-		mark_caches_evicted_finish(cache_cpumask, timestamp);
 	if (rc == 0)
 		return;
 	cpumask_scnprintf(cache_buf, sizeof(cache_buf), &cache_cpumask_copy);
@@ -179,85 +161,88 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,
 	panic("Unsafe to continue.");
 }
 
-void flush_remote_page(struct page *page, int order)
+static void homecache_finv_page_va(void* va, int home)
 {
-	int i, pages = (1 << order);
-	for (i = 0; i < pages; ++i, ++page) {
-		void *p = kmap_atomic(page);
-		int hfh = 0;
-		int home = page_home(page);
-#if CHIP_HAS_CBOX_HOME_MAP()
-		if (home == PAGE_HOME_HASH)
-			hfh = 1;
-		else
-#endif
-			BUG_ON(home < 0 || home >= NR_CPUS);
-		finv_buffer_remote(p, PAGE_SIZE, hfh);
-		kunmap_atomic(p);
+	int cpu = get_cpu();
+	if (home == cpu) {
+		finv_buffer_local(va, PAGE_SIZE);
+	} else if (home == PAGE_HOME_HASH) {
+		finv_buffer_remote(va, PAGE_SIZE, 1);
+	} else {
+		BUG_ON(home < 0 || home >= NR_CPUS);
+		finv_buffer_remote(va, PAGE_SIZE, 0);
 	}
+	put_cpu();
 }
 
-void homecache_evict(const struct cpumask *mask)
+void homecache_finv_map_page(struct page *page, int home)
 {
-	flush_remote(0, HV_FLUSH_EVICT_L2, mask, 0, 0, 0, NULL, NULL, 0);
+	unsigned long flags;
+	unsigned long va;
+	pte_t *ptep;
+	pte_t pte;
+
+	if (home == PAGE_HOME_UNCACHED)
+		return;
+	local_irq_save(flags);
+#ifdef CONFIG_HIGHMEM
+	va = __fix_to_virt(FIX_KMAP_BEGIN + kmap_atomic_idx_push() +
+			   (KM_TYPE_NR * smp_processor_id()));
+#else
+	va = __fix_to_virt(FIX_HOMECACHE_BEGIN + smp_processor_id());
+#endif
+	ptep = virt_to_kpte(va);
+	pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
+	__set_pte(ptep, pte_set_home(pte, home));
+	homecache_finv_page_va((void *)va, home);
+	__pte_clear(ptep);
+	hv_flush_page(va, PAGE_SIZE);
+#ifdef CONFIG_HIGHMEM
+	kmap_atomic_idx_pop();
+#endif
+	local_irq_restore(flags);
 }
 
-/*
- * Return a mask of the cpus whose caches currently own these pages.
- * The return value is whether the pages are all coherently cached
- * (i.e. none are immutable, incoherent, or uncached).
- */
-static int homecache_mask(struct page *page, int pages,
-			  struct cpumask *home_mask)
+static void homecache_finv_page_home(struct page *page, int home)
 {
-	int i;
-	int cached_coherently = 1;
-	cpumask_clear(home_mask);
-	for (i = 0; i < pages; ++i) {
-		int home = page_home(&page[i]);
-		if (home == PAGE_HOME_IMMUTABLE ||
-		    home == PAGE_HOME_INCOHERENT) {
-			cpumask_copy(home_mask, cpu_possible_mask);
-			return 0;
-		}
-#if CHIP_HAS_CBOX_HOME_MAP()
-		if (home == PAGE_HOME_HASH) {
-			cpumask_or(home_mask, home_mask, &hash_for_home_map);
-			continue;
-		}
-#endif
-		if (home == PAGE_HOME_UNCACHED) {
-			cached_coherently = 0;
-			continue;
-		}
-		BUG_ON(home < 0 || home >= NR_CPUS);
-		cpumask_set_cpu(home, home_mask);
-	}
-	return cached_coherently;
+	if (!PageHighMem(page) && home == page_home(page))
+		homecache_finv_page_va(page_address(page), home);
+	else
+		homecache_finv_map_page(page, home);
 }
 
-/*
- * Return the passed length, or zero if it's long enough that we
- * believe we should evict the whole L2 cache.
- */
-static unsigned long cache_flush_length(unsigned long length)
+static inline bool incoherent_home(int home)
 {
-	return (length >= CHIP_L2_CACHE_SIZE()) ? HV_FLUSH_EVICT_L2 : length;
+	return home == PAGE_HOME_IMMUTABLE || home == PAGE_HOME_INCOHERENT;
 }
 
-/* Flush a page out of whatever cache(s) it is in. */
-void homecache_flush_cache(struct page *page, int order)
+static void homecache_finv_page_internal(struct page *page, int force_map)
 {
-	int pages = 1 << order;
-	int length = cache_flush_length(pages * PAGE_SIZE);
-	unsigned long pfn = page_to_pfn(page);
-	struct cpumask home_mask;
-
-	homecache_mask(page, pages, &home_mask);
-	flush_remote(pfn, length, &home_mask, 0, 0, 0, NULL, NULL, 0);
-	sim_validate_lines_evicted(PFN_PHYS(pfn), pages * PAGE_SIZE);
+	int home = page_home(page);
+	if (home == PAGE_HOME_UNCACHED)
+		return;
+	if (incoherent_home(home)) {
+		int cpu;
+		for_each_cpu(cpu, &cpu_cacheable_map)
+			homecache_finv_map_page(page, cpu);
+	} else if (force_map) {
+		/* Force if, e.g., the normal mapping is migrating. */
+		homecache_finv_map_page(page, home);
+	} else {
+		homecache_finv_page_home(page, home);
+	}
+	sim_validate_lines_evicted(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE);
 }
 
+void homecache_finv_page(struct page *page)
+{
+	homecache_finv_page_internal(page, 0);
+}
+
+void homecache_evict(const struct cpumask *mask)
+{
+	flush_remote(0, HV_FLUSH_EVICT_L2, mask, 0, 0, 0, NULL, NULL, 0);
+}
 
 /* Report the home corresponding to a given PTE. */
 static int pte_to_home(pte_t pte)
@@ -271,10 +256,8 @@ static int pte_to_home(pte_t pte)
 		return PAGE_HOME_INCOHERENT;
 	case HV_PTE_MODE_UNCACHED:
 		return PAGE_HOME_UNCACHED;
-#if CHIP_HAS_CBOX_HOME_MAP()
 	case HV_PTE_MODE_CACHE_HASH_L3:
 		return PAGE_HOME_HASH;
-#endif
 	}
 	panic("Bad PTE %#llx\n", pte.val);
 }
@@ -331,20 +314,16 @@ pte_t pte_set_home(pte_t pte, int home)
 						      HV_PTE_MODE_CACHE_NO_L3);
 			}
 		} else
-#if CHIP_HAS_CBOX_HOME_MAP()
 		if (hash_default)
 			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);
 		else
-#endif
 			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);
 		pte = hv_pte_set_nc(pte);
 		break;
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	case PAGE_HOME_HASH:
 		pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);
 		break;
-#endif
 
 	default:
 		BUG_ON(home < 0 || home >= NR_CPUS ||
@@ -354,7 +333,6 @@ pte_t pte_set_home(pte_t pte, int home)
 		break;
 	}
 
-#if CHIP_HAS_NC_AND_NOALLOC_BITS()
 	if (noallocl2)
 		pte = hv_pte_set_no_alloc_l2(pte);
 
@@ -363,7 +341,6 @@ pte_t pte_set_home(pte_t pte, int home)
 	    hv_pte_get_mode(pte) == HV_PTE_MODE_CACHE_NO_L3) {
 		pte = hv_pte_set_mode(pte, HV_PTE_MODE_UNCACHED);
 	}
-#endif
 
 	/* Checking this case here gives a better panic than from the hv. */
 	BUG_ON(hv_pte_get_mode(pte) == 0);
@@ -379,21 +356,16 @@ EXPORT_SYMBOL(pte_set_home);
  * so they're not suitable for anything but infrequent use.
  */
 
-#if CHIP_HAS_CBOX_HOME_MAP()
-static inline int initial_page_home(void) { return PAGE_HOME_HASH; }
-#else
-static inline int initial_page_home(void) { return 0; }
-#endif
-
 int page_home(struct page *page)
 {
 	if (PageHighMem(page)) {
-		return initial_page_home();
+		return PAGE_HOME_HASH;
 	} else {
 		unsigned long kva = (unsigned long)page_address(page);
-		return pte_to_home(*virt_to_pte(NULL, kva));
+		return pte_to_home(*virt_to_kpte(kva));
 	}
 }
+EXPORT_SYMBOL(page_home);
 
 void homecache_change_page_home(struct page *page, int order, int home)
 {
@@ -409,12 +381,13 @@ void homecache_change_page_home(struct page *page, int order, int home)
 		     NULL, 0);
 
 	for (i = 0; i < pages; ++i, kva += PAGE_SIZE) {
-		pte_t *ptep = virt_to_pte(NULL, kva);
+		pte_t *ptep = virt_to_kpte(kva);
 		pte_t pteval = *ptep;
 		BUG_ON(!pte_present(pteval) || pte_huge(pteval));
 		__set_pte(ptep, pte_set_home(pteval, home));
 	}
 }
+EXPORT_SYMBOL(homecache_change_page_home);
 
 struct page *homecache_alloc_pages(gfp_t gfp_mask,
 				   unsigned int order, int home)
@@ -439,22 +412,25 @@ struct page *homecache_alloc_pages_node(int nid, gfp_t gfp_mask,
 	return page;
 }
 
-void homecache_free_pages(unsigned long addr, unsigned int order)
+void __homecache_free_pages(struct page *page, unsigned int order)
 {
-	struct page *page;
-
-	if (addr == 0)
-		return;
-
-	VM_BUG_ON(!virt_addr_valid((void *)addr));
-	page = virt_to_page((void *)addr);
 	if (put_page_testzero(page)) {
-		homecache_change_page_home(page, order, initial_page_home());
+		homecache_change_page_home(page, order, PAGE_HOME_HASH);
 		if (order == 0) {
-			free_hot_cold_page(page, 0);
+			free_hot_cold_page(page, false);
 		} else {
 			init_page_count(page);
 			__free_pages(page, order);
 		}
 	}
 }
+EXPORT_SYMBOL(__homecache_free_pages);
+
+void homecache_free_pages(unsigned long addr, unsigned int order)
+{
+	if (addr != 0) {
+		VM_BUG_ON(!virt_addr_valid((void *)addr));
+		__homecache_free_pages(virt_to_page((void *)addr), order);
+	}
+}
+EXPORT_SYMBOL(homecache_free_pages);
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 42cfcba4e1e..e514899e110 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -27,85 +27,129 @@
 #include <linux/mman.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+
+/*
+ * Provide an additional huge page size (in addition to the regular default
+ * huge page size) if no "hugepagesz" arguments are specified.
+ * Note that it must be smaller than the default huge page size so
+ * that it's possible to allocate them on demand from the buddy allocator.
+ * You can change this to 64K (on a 16K build), 256K, 1M, or 4M,
+ * or not define it at all.
+ */
+#define ADDITIONAL_HUGE_SIZE (1024 * 1024UL)
+
+/* "Extra" page-size multipliers, one per level of the page table. */
+int huge_shift[HUGE_SHIFT_ENTRIES] = {
+#ifdef ADDITIONAL_HUGE_SIZE
+#define ADDITIONAL_HUGE_SHIFT __builtin_ctzl(ADDITIONAL_HUGE_SIZE / PAGE_SIZE)
+	[HUGE_SHIFT_PAGE] = ADDITIONAL_HUGE_SHIFT
+#endif
+};
+
+#endif
 
 pte_t *huge_pte_alloc(struct mm_struct *mm,
 		      unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
 	pud_t *pud;
-	pte_t *pte = NULL;
 
-	/* We do not yet support multiple huge page sizes. */
-	BUG_ON(sz != PMD_SIZE);
+	addr &= -sz;   /* Mask off any low bits in the address. */
 
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
-	if (pud)
-		pte = (pte_t *) pmd_alloc(mm, pud, addr);
-	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
 
-	return pte;
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+	if (sz >= PGDIR_SIZE) {
+		BUG_ON(sz != PGDIR_SIZE &&
+		       sz != PGDIR_SIZE << huge_shift[HUGE_SHIFT_PGDIR]);
+		return (pte_t *)pud;
+	} else {
+		pmd_t *pmd = pmd_alloc(mm, pud, addr);
+		if (sz >= PMD_SIZE) {
+			BUG_ON(sz != PMD_SIZE &&
+			       sz != (PMD_SIZE << huge_shift[HUGE_SHIFT_PMD]));
+			return (pte_t *)pmd;
+		}
+		else {
+			if (sz != PAGE_SIZE << huge_shift[HUGE_SHIFT_PAGE])
+				panic("Unexpected page size %#lx\n", sz);
+			return pte_alloc_map(mm, NULL, pmd, addr);
+		}
+	}
+#else
+	BUG_ON(sz != PMD_SIZE);
+	return (pte_t *) pmd_alloc(mm, pud, addr);
+#endif
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+static pte_t *get_pte(pte_t *base, int index, int level)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd = NULL;
-
-	pgd = pgd_offset(mm, addr);
-	if (pgd_present(*pgd)) {
-		pud = pud_offset(pgd, addr);
-		if (pud_present(*pud))
-			pmd = pmd_offset(pud, addr);
+	pte_t *ptep = base + index;
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+	if (!pte_present(*ptep) && huge_shift[level] != 0) {
+		unsigned long mask = -1UL << huge_shift[level];
+		pte_t *super_ptep = base + (index & mask);
+		pte_t pte = *super_ptep;
+		if (pte_present(pte) && pte_super(pte))
+			ptep = super_ptep;
 	}
-	return (pte_t *) pmd;
+#endif
+	return ptep;
 }
 
-#ifdef HUGETLB_TEST
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
-			      int write)
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
-	unsigned long start = address;
-	int length = 1;
-	int nr;
-	struct page *page;
-	struct vm_area_struct *vma;
-
-	vma = find_vma(mm, addr);
-	if (!vma || !is_vm_hugetlb_page(vma))
-		return ERR_PTR(-EINVAL);
-
-	pte = huge_pte_offset(mm, address);
-
-	/* hugetlb should be locked, and hence, prefaulted */
-	WARN_ON(!pte || pte_none(*pte));
-
-	page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
-	WARN_ON(!PageHead(page));
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+	pte_t *pte;
+#endif
 
-	return page;
-}
+	/* Get the top-level page table entry. */
+	pgd = (pgd_t *)get_pte((pte_t *)mm->pgd, pgd_index(addr), 0);
 
-int pmd_huge(pmd_t pmd)
-{
-	return 0;
-}
+	/* We don't have four levels. */
+	pud = pud_offset(pgd, addr);
+#ifndef __PAGETABLE_PUD_FOLDED
+# error support fourth page table level
+#endif
+	if (!pud_present(*pud))
+		return NULL;
+
+	/* Check for an L0 huge PTE, if we have three levels. */
+#ifndef __PAGETABLE_PMD_FOLDED
+	if (pud_huge(*pud))
+		return (pte_t *)pud;
+
+	pmd = (pmd_t *)get_pte((pte_t *)pud_page_vaddr(*pud),
+			       pmd_index(addr), 1);
+	if (!pmd_present(*pmd))
+		return NULL;
+#else
+	pmd = pmd_offset(pud, addr);
+#endif
 
-int pud_huge(pud_t pud)
-{
-	return 0;
-}
+	/* Check for an L1 huge PTE. */
+	if (pmd_huge(*pmd))
+		return (pte_t *)pmd;
+
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+	/* Check for an L2 huge PTE. */
+	pte = get_pte((pte_t *)pmd_page_vaddr(*pmd), pte_index(addr), 2);
+	if (!pte_present(*pte))
+		return NULL;
+	if (pte_super(*pte))
+		return pte;
+#endif
 
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-			     pmd_t *pmd, int write)
-{
 	return NULL;
 }
 
-#else
-
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 			      int write)
 {
@@ -149,50 +193,21 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 	return 0;
 }
 
-#endif
-
 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
 		unsigned long addr, unsigned long len,
 		unsigned long pgoff, unsigned long flags)
 {
 	struct hstate *h = hstate_file(file);
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-	unsigned long start_addr;
-
-	if (len > mm->cached_hole_size) {
-		start_addr = mm->free_area_cache;
-	} else {
-		start_addr = TASK_UNMAPPED_BASE;
-		mm->cached_hole_size = 0;
-	}
-
-full_search:
-	addr = ALIGN(start_addr, huge_page_size(h));
-
-	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-		/* At this point:  (!vma || addr < vma->vm_end). */
-		if (TASK_SIZE - len < addr) {
-			/*
-			 * Start a new search - just in case we missed
-			 * some holes.
-			 */
-			if (start_addr != TASK_UNMAPPED_BASE) {
-				start_addr = TASK_UNMAPPED_BASE;
-				mm->cached_hole_size = 0;
-				goto full_search;
-			}
-			return -ENOMEM;
-		}
-		if (!vma || addr + len <= vma->vm_start) {
-			mm->free_area_cache = addr + len;
-			return addr;
-		}
-		if (addr + mm->cached_hole_size < vma->vm_start)
-			mm->cached_hole_size = vma->vm_start - addr;
-		addr = ALIGN(vma->vm_end, huge_page_size(h));
-	}
+	struct vm_unmapped_area_info info;
+
+	info.flags = 0;
+	info.length = len;
+	info.low_limit = TASK_UNMAPPED_BASE;
+	info.high_limit = TASK_SIZE;
+	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+	info.align_offset = 0;
+	return vm_unmapped_area(&info);
 }
 
 static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
@@ -200,92 +215,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
 		unsigned long pgoff, unsigned long flags)
 {
 	struct hstate *h = hstate_file(file);
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma, *prev_vma;
-	unsigned long base = mm->mmap_base, addr = addr0;
-	unsigned long largest_hole = mm->cached_hole_size;
-	int first_time = 1;
-
-	/* don't allow allocations above current base */
-	if (mm->free_area_cache > base)
-		mm->free_area_cache = base;
-
-	if (len <= largest_hole) {
-		largest_hole = 0;
-		mm->free_area_cache  = base;
-	}
-try_again:
-	/* make sure it can fit in the remaining address space */
-	if (mm->free_area_cache < len)
-		goto fail;
-
-	/* either no address requested or can't fit in requested address hole */
-	addr = (mm->free_area_cache - len) & huge_page_mask(h);
-	do {
-		/*
-		 * Lookup failure means no vma is above this address,
-		 * i.e. return with success:
-		 */
-		vma = find_vma_prev(mm, addr, &prev_vma);
-		if (!vma) {
-			return addr;
-			break;
-		}
+	struct vm_unmapped_area_info info;
+	unsigned long addr;
 
-		/*
-		 * new region fits between prev_vma->vm_end and
-		 * vma->vm_start, use it:
-		 */
-		if (addr + len <= vma->vm_start &&
-			    (!prev_vma || (addr >= prev_vma->vm_end))) {
-			/* remember the address as a hint for next time */
-			mm->cached_hole_size = largest_hole;
-			mm->free_area_cache = addr;
-			return addr;
-		} else {
-			/* pull free_area_cache down to the first hole */
-			if (mm->free_area_cache == vma->vm_end) {
-				mm->free_area_cache = vma->vm_start;
-				mm->cached_hole_size = largest_hole;
-			}
-		}
+	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+	info.length = len;
+	info.low_limit = PAGE_SIZE;
+	info.high_limit = current->mm->mmap_base;
+	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+	info.align_offset = 0;
+	addr = vm_unmapped_area(&info);
 
-		/* remember the largest hole we saw so far */
-		if (addr + largest_hole < vma->vm_start)
-			largest_hole = vma->vm_start - addr;
-
-		/* try just below the current vma->vm_start */
-		addr = (vma->vm_start - len) & huge_page_mask(h);
-
-	} while (len <= vma->vm_start);
-
-fail:
-	/*
-	 * if hint left us with no space for the requested
-	 * mapping then try again:
-	 */
-	if (first_time) {
-		mm->free_area_cache = base;
-		largest_hole = 0;
-		first_time = 0;
-		goto try_again;
-	}
 	/*
 	 * A failed mmap() very likely causes application failure,
 	 * so fall back to the bottom-up function here. This scenario
 	 * can happen with large stack limits and large mmap()
 	 * allocations.
 	 */
-	mm->free_area_cache = TASK_UNMAPPED_BASE;
-	mm->cached_hole_size = ~0UL;
-	addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
-			len, pgoff, flags);
-
-	/*
-	 * Restore the topdown base:
-	 */
-	mm->free_area_cache = base;
-	mm->cached_hole_size = ~0UL;
+	if (addr & ~PAGE_MASK) {
+		VM_BUG_ON(addr != -ENOMEM);
+		info.flags = 0;
+		info.low_limit = TASK_UNMAPPED_BASE;
+		info.high_limit = TASK_SIZE;
+		addr = vm_unmapped_area(&info);
+	}
 
 	return addr;
 }
@@ -322,21 +275,102 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		return hugetlb_get_unmapped_area_topdown(file, addr, len,
 				pgoff, flags);
 }
+#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */
 
-static __init int setup_hugepagesz(char *opt)
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+static __init int __setup_hugepagesz(unsigned long ps)
 {
-	unsigned long ps = memparse(opt, &opt);
-	if (ps == PMD_SIZE) {
-		hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
-	} else if (ps == PUD_SIZE) {
-		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+	int log_ps = __builtin_ctzl(ps);
+	int level, base_shift;
+
+	if ((1UL << log_ps) != ps || (log_ps & 1) != 0) {
+		pr_warn("Not enabling %ld byte huge pages;"
+			" must be a power of four.\n", ps);
+		return -EINVAL;
+	}
+
+	if (ps > 64*1024*1024*1024UL) {
+		pr_warn("Not enabling %ld MB huge pages;"
+			" largest legal value is 64 GB .\n", ps >> 20);
+		return -EINVAL;
+	} else if (ps >= PUD_SIZE) {
+		static long hv_jpage_size;
+		if (hv_jpage_size == 0)
+			hv_jpage_size = hv_sysconf(HV_SYSCONF_PAGE_SIZE_JUMBO);
+		if (hv_jpage_size != PUD_SIZE) {
+			pr_warn("Not enabling >= %ld MB huge pages:"
+				" hypervisor reports size %ld\n",
+				PUD_SIZE >> 20, hv_jpage_size);
+			return -EINVAL;
+		}
+		level = 0;
+		base_shift = PUD_SHIFT;
+	} else if (ps >= PMD_SIZE) {
+		level = 1;
+		base_shift = PMD_SHIFT;
+	} else if (ps > PAGE_SIZE) {
+		level = 2;
+		base_shift = PAGE_SHIFT;
 	} else {
-		pr_err("hugepagesz: Unsupported page size %lu M\n",
-			ps >> 20);
-		return 0;
+		pr_err("hugepagesz: huge page size %ld too small\n", ps);
+		return -EINVAL;
+	}
+
+	if (log_ps != base_shift) {
+		int shift_val = log_ps - base_shift;
+		if (huge_shift[level] != 0) {
+			int old_shift = base_shift + huge_shift[level];
+			pr_warn("Not enabling %ld MB huge pages;"
+				" already have size %ld MB.\n",
+				ps >> 20, (1UL << old_shift) >> 20);
+			return -EINVAL;
+		}
+		if (hv_set_pte_super_shift(level, shift_val) != 0) {
+			pr_warn("Not enabling %ld MB huge pages;"
+				" no hypervisor support.\n", ps >> 20);
+			return -EINVAL;
+		}
+		printk(KERN_DEBUG "Enabled %ld MB huge pages\n", ps >> 20);
+		huge_shift[level] = shift_val;
+	}
+
+	hugetlb_add_hstate(log_ps - PAGE_SHIFT);
+
+	return 0;
+}
+
+static bool saw_hugepagesz;
+
+static __init int setup_hugepagesz(char *opt)
+{
+	if (!saw_hugepagesz) {
+		saw_hugepagesz = true;
+		memset(huge_shift, 0, sizeof(huge_shift));
 	}
-	return 1;
+	return __setup_hugepagesz(memparse(opt, NULL));
 }
 __setup("hugepagesz=", setup_hugepagesz);
 
-#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
+#ifdef ADDITIONAL_HUGE_SIZE
+/*
+ * Provide an additional huge page size if no "hugepagesz" args are given.
+ * In that case, all the cores have properly set up their hv super_shift
+ * already, but we need to notify the hugetlb code to enable the
+ * new huge page size from the Linux point of view.
+ */
+static __init int add_default_hugepagesz(void)
+{
+	if (!saw_hugepagesz) {
+		BUILD_BUG_ON(ADDITIONAL_HUGE_SIZE >= PMD_SIZE ||
+			     ADDITIONAL_HUGE_SIZE <= PAGE_SIZE);
+		BUILD_BUG_ON((PAGE_SIZE << ADDITIONAL_HUGE_SHIFT) !=
+			     ADDITIONAL_HUGE_SIZE);
+		BUILD_BUG_ON(ADDITIONAL_HUGE_SHIFT & 1);
+		hugetlb_add_hstate(ADDITIONAL_HUGE_SHIFT);
+	}
+	return 0;
+}
+arch_initcall(add_default_hugepagesz);
+#endif
+
+#endif /* CONFIG_HUGETLB_SUPER_PAGES */
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 7309988c979..bfb3127b4df 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -38,7 +38,6 @@
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/processor.h>
-#include <asm/system.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/dma.h>
@@ -83,7 +82,7 @@ static int num_l2_ptes[MAX_NUMNODES];
 
 static void init_prealloc_ptes(int node, int pages)
 {
-	BUG_ON(pages & (HV_L2_ENTRIES-1));
+	BUG_ON(pages & (PTRS_PER_PTE - 1));
 	if (pages) {
 		num_l2_ptes[node] = pages;
 		l2_ptes[node] = __alloc_bootmem(pages * sizeof(pte_t),
@@ -107,10 +106,8 @@ pte_t *get_prealloc_pte(unsigned long pfn)
  */
 static int initial_heap_home(void)
 {
-#if CHIP_HAS_CBOX_HOME_MAP()
 	if (hash_default)
 		return PAGE_HOME_HASH;
-#endif
 	return smp_processor_id();
 }
 
@@ -132,14 +129,9 @@ static void __init assign_pte(pmd_t *pmd, pte_t *page_table)
 
 #ifdef __tilegx__
 
-#if HV_L1_SIZE != HV_L2_SIZE
-# error Rework assumption that L1 and L2 page tables are same size.
-#endif
-
-/* Since pmd_t arrays and pte_t arrays are the same size, just use casts. */
 static inline pmd_t *alloc_pmd(void)
 {
-	return (pmd_t *)alloc_pte();
+	return __alloc_bootmem(L1_KERNEL_PGTABLE_SIZE, HV_PAGE_TABLE_ALIGN, 0);
 }
 
 static inline void assign_pmd(pud_t *pud, pmd_t *pmd)
@@ -156,7 +148,21 @@ void __init shatter_pmd(pmd_t *pmd)
 	assign_pte(pmd, pte);
 }
 
-#ifdef CONFIG_HIGHMEM
+#ifdef __tilegx__
+static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
+{
+	pud_t *pud = pud_offset(&pgtables[pgd_index(va)], va);
+	if (pud_none(*pud))
+		assign_pmd(pud, alloc_pmd());
+	return pmd_offset(pud, va);
+}
+#else
+static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
+{
+	return pmd_offset(pud_offset(&pgtables[pgd_index(va)], va), va);
+}
+#endif
+
 /*
  * This function initializes a certain range of kernel virtual memory
  * with new bootmem page tables, everywhere page tables are missing in
@@ -169,34 +175,24 @@ void __init shatter_pmd(pmd_t *pmd)
  * checking the pgd every time.
  */
 static void __init page_table_range_init(unsigned long start,
-					 unsigned long end, pgd_t *pgd_base)
+					 unsigned long end, pgd_t *pgd)
 {
-	pgd_t *pgd;
-	int pgd_idx;
 	unsigned long vaddr;
-
-	vaddr = start;
-	pgd_idx = pgd_index(vaddr);
-	pgd = pgd_base + pgd_idx;
-
-	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
-		pmd_t *pmd = pmd_offset(pud_offset(pgd, vaddr), vaddr);
+	start = round_down(start, PMD_SIZE);
+	end = round_up(end, PMD_SIZE);
+	for (vaddr = start; vaddr < end; vaddr += PMD_SIZE) {
+		pmd_t *pmd = get_pmd(pgd, vaddr);
 		if (pmd_none(*pmd))
 			assign_pte(pmd, alloc_pte());
-		vaddr += PMD_SIZE;
 	}
 }
-#endif /* CONFIG_HIGHMEM */
-
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 
 static int __initdata ktext_hash = 1;  /* .text pages */
 static int __initdata kdata_hash = 1;  /* .data and .bss pages */
 int __write_once hash_default = 1;     /* kernel allocator pages */
 EXPORT_SYMBOL(hash_default);
 int __write_once kstack_hash = 1;      /* if no homecaching, use h4h */
-#endif /* CHIP_HAS_CBOX_HOME_MAP */
 
 /*
  * CPUs to use to for striping the pages of kernel data.  If hash-for-home
@@ -214,14 +210,12 @@ int __write_once kdata_huge;       /* if no homecaching, small pages */
 static pgprot_t __init construct_pgprot(pgprot_t prot, int home)
 {
 	prot = pte_set_home(prot, home);
-#if CHIP_HAS_CBOX_HOME_MAP()
 	if (home == PAGE_HOME_IMMUTABLE) {
 		if (ktext_hash)
 			prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_HASH_L3);
 		else
 			prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_NO_L3);
 	}
-#endif
 	return prot;
 }
 
@@ -233,40 +227,28 @@ static pgprot_t __init init_pgprot(ulong address)
 {
 	int cpu;
 	unsigned long page;
-	enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+	enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	/* For kdata=huge, everything is just hash-for-home. */
 	if (kdata_huge)
 		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
-#endif
 
 	/* We map the aliased pages of permanent text inaccessible. */
 	if (address < (ulong) _sinittext - CODE_DELTA)
 		return PAGE_NONE;
 
-	/*
-	 * We map read-only data non-coherent for performance.  We could
-	 * use neighborhood caching on TILE64, but it's not clear it's a win.
-	 */
+	/* We map read-only data non-coherent for performance. */
 	if ((address >= (ulong) __start_rodata &&
 	     address < (ulong) __end_rodata) ||
 	    address == (ulong) empty_zero_page) {
 		return construct_pgprot(PAGE_KERNEL_RO, PAGE_HOME_IMMUTABLE);
 	}
 
-	/* As a performance optimization, keep the boot init stack here. */
-	if (address >= (ulong)&init_thread_union &&
-	    address < (ulong)&init_thread_union + THREAD_SIZE)
-		return construct_pgprot(PAGE_KERNEL, smp_processor_id());
-
 #ifndef __tilegx__
-#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
 	/* Force the atomic_locks[] array page to be hash-for-home. */
 	if (address == (ulong) atomic_locks)
 		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
 #endif
-#endif
 
 	/*
 	 * Everything else that isn't data or bss is heap, so mark it
@@ -284,28 +266,18 @@ static pgprot_t __init init_pgprot(ulong address)
 	if (address >= (ulong) _end || address < (ulong) _einitdata)
 		return construct_pgprot(PAGE_KERNEL, initial_heap_home());
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	/* Use hash-for-home if requested for data/bss. */
 	if (kdata_hash)
 		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);
-#endif
-
-	/*
-	 * Make the w1data homed like heap to start with, to avoid
-	 * making it part of the page-striped data area when we're just
-	 * going to convert it to read-only soon anyway.
-	 */
-	if (address >= (ulong)__w1data_begin && address < (ulong)__w1data_end)
-		return construct_pgprot(PAGE_KERNEL, initial_heap_home());
 
 	/*
 	 * Otherwise we just hand out consecutive cpus.  To avoid
 	 * requiring this function to hold state, we just walk forward from
-	 * _sdata by PAGE_SIZE, skipping the readonly and init data, to reach
-	 * the requested address, while walking cpu home around kdata_mask.
-	 * This is typically no more than a dozen or so iterations.
+	 * __end_rodata by PAGE_SIZE, skipping the readonly and init data, to
+	 * reach the requested address, while walking cpu home around
+	 * kdata_mask. This is typically no more than a dozen or so iterations.
 	 */
-	page = (((ulong)__w1data_end) + PAGE_SIZE - 1) & PAGE_MASK;
+	page = (((ulong)__end_rodata) + PAGE_SIZE - 1) & PAGE_MASK;
 	BUG_ON(address < page || address >= (ulong)_end);
 	cpu = cpumask_first(&kdata_mask);
 	for (; page < address; page += PAGE_SIZE) {
@@ -315,11 +287,9 @@ static pgprot_t __init init_pgprot(ulong address)
 		if (page == (ulong)empty_zero_page)
 			continue;
 #ifndef __tilegx__
-#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
 		if (page == (ulong)atomic_locks)
 			continue;
 #endif
-#endif
 		cpu = cpumask_next(cpu, &kdata_mask);
 		if (cpu == NR_CPUS)
 			cpu = cpumask_first(&kdata_mask);
@@ -362,7 +332,7 @@ static int __init setup_ktext(char *str)
 
 	ktext_arg_seen = 1;
 
-	/* Default setting on Tile64: use a huge page */
+	/* Default setting: use a huge page */
 	if (strcmp(str, "huge") == 0)
 		pr_info("ktext: using one huge locally cached page\n");
 
@@ -408,28 +378,11 @@ static inline pgprot_t ktext_set_nocache(pgprot_t prot)
 {
 	if (!ktext_nocache)
 		prot = hv_pte_set_nc(prot);
-#if CHIP_HAS_NC_AND_NOALLOC_BITS()
 	else
 		prot = hv_pte_set_no_alloc_l2(prot);
-#endif
 	return prot;
 }
 
-#ifndef __tilegx__
-static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
-{
-	return pmd_offset(pud_offset(&pgtables[pgd_index(va)], va), va);
-}
-#else
-static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
-{
-	pud_t *pud = pud_offset(&pgtables[pgd_index(va)], va);
-	if (pud_none(*pud))
-		assign_pmd(pud, alloc_pmd());
-	return pmd_offset(pud, va);
-}
-#endif
-
 /* Temporary page table we use for staging. */
 static pgd_t pgtables[PTRS_PER_PGD]
  __attribute__((aligned(HV_PAGE_TABLE_ALIGN)));
@@ -450,6 +403,7 @@ static pgd_t pgtables[PTRS_PER_PGD]
  */
 static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 {
+	unsigned long long irqmask;
 	unsigned long address, pfn;
 	pmd_t *pmd;
 	pte_t *pte;
@@ -458,7 +412,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 	struct cpumask kstripe_mask;
 	int rc, i;
 
-#if CHIP_HAS_CBOX_HOME_MAP()
 	if (ktext_arg_seen && ktext_hash) {
 		pr_warning("warning: \"ktext\" boot argument ignored"
 			   " if \"kcache_hash\" sets up text hash-for-home\n");
@@ -475,7 +428,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 			  " kcache_hash=all or =allbutstack\n");
 		kdata_huge = 0;
 	}
-#endif
 
 	/*
 	 * Set up a mask for cpus to use for kernel striping.
@@ -556,8 +508,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 		}
 	}
 
-	address = MEM_SV_INTRPT;
+	address = MEM_SV_START;
 	pmd = get_pmd(pgtables, address);
+	pfn = 0;  /* code starts at PA 0 */
 	if (ktext_small) {
 		/* Allocate an L2 PTE for the kernel text */
 		int cpu = 0;
@@ -579,11 +532,16 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 			prot = ktext_set_nocache(prot);
 		}
 
-		BUG_ON(address != (unsigned long)_stext);
-		pfn = 0;  /* code starts at PA 0 */
-		pte = alloc_pte();
-		for (pte_ofs = 0; address < (unsigned long)_einittext;
-		     pfn++, pte_ofs++, address += PAGE_SIZE) {
+		BUG_ON(address != (unsigned long)_text);
+		pte = NULL;
+		for (; address < (unsigned long)_einittext;
+		     pfn++, address += PAGE_SIZE) {
+			pte_ofs = pte_index(address);
+			if (pte_ofs == 0) {
+				if (pte)
+					assign_pte(pmd++, pte);
+				pte = alloc_pte();
+			}
 			if (!ktext_local) {
 				prot = set_remote_cache_cpu(prot, cpu);
 				cpu = cpumask_next(cpu, &ktext_mask);
@@ -592,17 +550,16 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 			}
 			pte[pte_ofs] = pfn_pte(pfn, prot);
 		}
-		assign_pte(pmd, pte);
+		if (pte)
+			assign_pte(pmd, pte);
 	} else {
 		pte_t pteval = pfn_pte(0, PAGE_KERNEL_EXEC);
 		pteval = pte_mkhuge(pteval);
-#if CHIP_HAS_CBOX_HOME_MAP()
 		if (ktext_hash) {
 			pteval = hv_pte_set_mode(pteval,
 						 HV_PTE_MODE_CACHE_HASH_L3);
 			pteval = ktext_set_nocache(pteval);
 		} else
-#endif /* CHIP_HAS_CBOX_HOME_MAP() */
 		if (cpumask_weight(&ktext_mask) == 1) {
 			pteval = set_remote_cache_cpu(pteval,
 					      cpumask_first(&ktext_mask));
@@ -615,7 +572,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 		else
 			pteval = hv_pte_set_mode(pteval,
 						 HV_PTE_MODE_CACHE_NO_L3);
-		*(pte_t *)pmd = pteval;
+		for (; address < (unsigned long)_einittext;
+		     pfn += PFN_DOWN(HPAGE_SIZE), address += HPAGE_SIZE)
+			*(pte_t *)(pmd++) = pfn_pte(pfn, pteval);
 	}
 
 	/* Set swapper_pgprot here so it is flushed to memory right away. */
@@ -630,10 +589,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 	 *  - install pgtables[] as the real page table
 	 *  - flush the TLB so the new page table takes effect
 	 */
+	irqmask = interrupt_mask_save_mask();
+	interrupt_mask_set_mask(-1ULL);
 	rc = flush_and_install_context(__pa(pgtables),
 				       init_pgprot((unsigned long)pgtables),
 				       __get_cpu_var(current_asid),
 				       cpumask_bits(my_cpu_mask));
+	interrupt_mask_restore_mask(irqmask);
 	BUG_ON(rc != 0);
 
 	/* Copy the page table back to the normal swapper_pg_dir. */
@@ -696,6 +658,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 #endif /* CONFIG_HIGHMEM */
 
 
+#ifndef CONFIG_64BIT
 static void __init init_free_pfn_range(unsigned long start, unsigned long end)
 {
 	unsigned long pfn;
@@ -725,7 +688,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end)
 		}
 		init_page_count(page);
 		__free_pages(page, order);
-		totalram_pages += count;
+		adjust_managed_page_count(page, count);
 
 		page += count;
 		pfn += count;
@@ -738,16 +701,15 @@ static void __init set_non_bootmem_pages_init(void)
 	for_each_zone(z) {
 		unsigned long start, end;
 		int nid = z->zone_pgdat->node_id;
+#ifdef CONFIG_HIGHMEM
 		int idx = zone_idx(z);
+#endif
 
 		start = z->zone_start_pfn;
-		if (start == 0)
-			continue;  /* bootmem */
 		end = start + z->spanned_pages;
-		if (idx == ZONE_NORMAL) {
-			BUG_ON(start != node_start_pfn[nid]);
-			start = node_free_pfn[nid];
-		}
+		start = max(start, node_free_pfn[nid]);
+		start = max(start, max_low_pfn);
+
 #ifdef CONFIG_HIGHMEM
 		if (idx == ZONE_HIGHMEM)
 			totalhigh_pages += z->spanned_pages;
@@ -768,6 +730,7 @@ static void __init set_non_bootmem_pages_init(void)
 		init_free_pfn_range(start, end);
 	}
 }
+#endif
 
 /*
  * paging_init() sets up the page tables - note that all of lowmem is
@@ -775,9 +738,6 @@ static void __init set_non_bootmem_pages_init(void)
  */
 void __init paging_init(void)
 {
-#ifdef CONFIG_HIGHMEM
-	unsigned long vaddr, end;
-#endif
 #ifdef __tilegx__
 	pud_t *pud;
 #endif
@@ -785,14 +745,11 @@ void __init paging_init(void)
 
 	kernel_physical_mapping_init(pgd_base);
 
+	/* Fixed mappings, only the page table structure has to be created. */
+	page_table_range_init(fix_to_virt(__end_of_fixed_addresses - 1),
+			      FIXADDR_TOP, pgd_base);
+
 #ifdef CONFIG_HIGHMEM
-	/*
-	 * Fixed mappings, only the page table structure has to be
-	 * created - mappings will be set by set_fixmap():
-	 */
-	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
-	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
-	page_table_range_init(vaddr, end, pgd_base);
 	permanent_kmaps_init(pgd_base);
 #endif
 
@@ -804,7 +761,7 @@ void __init paging_init(void)
 	 * changing init_mm once we get up and running, and there's no
 	 * need for e.g. vmalloc_sync_all().
 	 */
-	BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END));
+	BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END - 1));
 	pud = pud_offset(pgd_base + pgd_index(VMALLOC_START), VMALLOC_START);
 	assign_pmd(pud, alloc_pmd());
 #endif
@@ -829,7 +786,6 @@ static void __init set_max_mapnr_init(void)
 
 void __init mem_init(void)
 {
-	int codesize, datasize, initsize;
 	int i;
 #ifndef __tilegx__
 	void *last;
@@ -854,24 +810,14 @@ void __init mem_init(void)
 	set_max_mapnr_init();
 
 	/* this will put all bootmem onto the freelists */
-	totalram_pages += free_all_bootmem();
+	free_all_bootmem();
 
+#ifndef CONFIG_64BIT
 	/* count all remaining LOWMEM and give all HIGHMEM to page allocator */
 	set_non_bootmem_pages_init();
+#endif
 
-	codesize =  (unsigned long)&_etext - (unsigned long)&_text;
-	datasize =  (unsigned long)&_end - (unsigned long)&_sdata;
-	initsize =  (unsigned long)&_einittext - (unsigned long)&_sinittext;
-	initsize += (unsigned long)&_einitdata - (unsigned long)&_sinitdata;
-
-	pr_info("Memory: %luk/%luk available (%dk kernel code, %dk data, %dk init, %ldk highmem)\n",
-		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
-		num_physpages << (PAGE_SHIFT-10),
-		codesize >> 10,
-		datasize >> 10,
-		initsize >> 10,
-		(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
-	       );
+	mem_init_print_info(NULL);
 
 	/*
 	 * In debug mode, dump some interesting memory mappings.
@@ -882,10 +828,6 @@ void __init mem_init(void)
 	printk(KERN_DEBUG "  PKMAP   %#lx - %#lx\n",
 	       PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP) - 1);
 #endif
-#ifdef CONFIG_HUGEVMAP
-	printk(KERN_DEBUG "  HUGEMAP %#lx - %#lx\n",
-	       HUGE_VMAP_BASE, HUGE_VMAP_END - 1);
-#endif
 	printk(KERN_DEBUG "  VMALLOC %#lx - %#lx\n",
 	       _VMALLOC_START, _VMALLOC_END - 1);
 #ifdef __tilegx__
@@ -941,6 +883,14 @@ int remove_memory(u64 start, u64 size)
 {
 	return -EINVAL;
 }
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+	/* TODO */
+	return -EBUSY;
+}
+#endif
 #endif
 
 struct kmem_cache *pgd_cache;
@@ -952,26 +902,6 @@ void __init pgtable_cache_init(void)
 		panic("pgtable_cache_init(): Cannot create pgd cache");
 }
 
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-/*
- * The __w1data area holds data that is only written during initialization,
- * and is read-only and thus freely cacheable thereafter.  Fix the page
- * table entries that cover that region accordingly.
- */
-static void mark_w1data_ro(void)
-{
-	/* Loop over page table entries */
-	unsigned long addr = (unsigned long)__w1data_begin;
-	BUG_ON((addr & (PAGE_SIZE-1)) != 0);
-	for (; addr <= (unsigned long)__w1data_end - 1; addr += PAGE_SIZE) {
-		unsigned long pfn = kaddr_to_pfn((void *)addr);
-		pte_t *ptep = virt_to_pte(NULL, addr);
-		BUG_ON(pte_huge(*ptep));   /* not relevant for kdata_huge */
-		set_pte_at(&init_mm, addr, ptep, pfn_pte(pfn, PAGE_KERNEL_RO));
-	}
-}
-#endif
-
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static long __write_once initfree;
 #else
@@ -982,7 +912,7 @@ static long __write_once initfree = 1;
 static int __init set_initfree(char *str)
 {
 	long val;
-	if (strict_strtol(str, 0, &val) == 0) {
+	if (kstrtol(str, 0, &val) == 0) {
 		initfree = val;
 		pr_info("initfree: %s free init pages\n",
 			initfree ? "will" : "won't");
@@ -1011,7 +941,7 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
 		 */
 		int pfn = kaddr_to_pfn((void *)addr);
 		struct page *page = pfn_to_page(pfn);
-		pte_t *ptep = virt_to_pte(NULL, addr);
+		pte_t *ptep = virt_to_kpte(addr);
 		if (!initfree) {
 			/*
 			 * If debugging page accesses then do not free
@@ -1022,31 +952,24 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
 			pte_clear(&init_mm, addr, ptep);
 			continue;
 		}
-		__ClearPageReserved(page);
-		init_page_count(page);
 		if (pte_huge(*ptep))
 			BUG_ON(!kdata_huge);
 		else
 			set_pte_at(&init_mm, addr, ptep,
 				   pfn_pte(pfn, PAGE_KERNEL));
 		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
-		free_page(addr);
-		totalram_pages++;
+		free_reserved_page(page);
 	}
 	pr_info("Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
 }
 
 void free_initmem(void)
 {
-	const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;
+	const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;
 
 	/*
-	 * Evict the dirty initdata on the boot cpu, evict the w1data
-	 * wherever it's homed, and evict all the init code everywhere.
-	 * We are guaranteed that no one will touch the init pages any
-	 * more, and although other cpus may be touching the w1data,
-	 * we only actually change the caching on tile64, which won't
-	 * be keeping local copies in the other tiles' caches anyway.
+	 * Evict the cache on all cores to avoid incoherence.
+	 * We are guaranteed that no one will touch the init pages any more.
 	 */
 	homecache_evict(&cpu_cacheable_map);
 
@@ -1057,26 +980,11 @@ void free_initmem(void)
 
 	/*
 	 * Free the pages mapped from 0xc0000000 that correspond to code
-	 * pages from MEM_SV_INTRPT that we won't use again after init.
+	 * pages from MEM_SV_START that we won't use again after init.
 	 */
 	free_init_pages("unused kernel text",
 			(unsigned long)_sinittext - text_delta,
 			(unsigned long)_einittext - text_delta);
-
-#if !CHIP_HAS_COHERENT_LOCAL_CACHE()
-	/*
-	 * Upgrade the .w1data section to globally cached.
-	 * We don't do this on tilepro, since the cache architecture
-	 * pretty much makes it irrelevant, and in any case we end
-	 * up having racing issues with other tiles that may touch
-	 * the data after we flush the cache but before we update
-	 * the PTEs and flush the TLBs, causing sharer shootdowns
-	 * later.  Even though this is to clean data, it seems like
-	 * an unnecessary complication.
-	 */
-	mark_w1data_ro();
-#endif
-
 	/* Do a global TLB flush so everyone sees the changes. */
 	flush_tlb_all();
 }
diff --git a/arch/tile/mm/migrate.h b/arch/tile/mm/migrate.h
index cd45a0837fa..91683d97917 100644
--- a/arch/tile/mm/migrate.h
+++ b/arch/tile/mm/migrate.h
@@ -24,6 +24,9 @@
 /*
  * This function is used as a helper when setting up the initial
  * page table (swapper_pg_dir).
+ *
+ * You must mask ALL interrupts prior to invoking this code, since
+ * you can't legally touch the stack during the cache flush.
  */
 extern int flush_and_install_context(HV_PhysAddr page_table, HV_PTE access,
 				     HV_ASID asid,
@@ -39,6 +42,9 @@ extern int flush_and_install_context(HV_PhysAddr page_table, HV_PTE access,
  *
  * Note that any non-NULL pointers must not point to the page that
  * is handled by the stack_pte itself.
+ *
+ * You must mask ALL interrupts prior to invoking this code, since
+ * you can't legally touch the stack during the cache flush.
  */
 extern int homecache_migrate_stack_and_flush(pte_t stack_pte, unsigned long va,
 				     size_t length, pte_t *stack_ptep,
diff --git a/arch/tile/mm/migrate_32.S b/arch/tile/mm/migrate_32.S
index ac01a7cdf77..772085491bf 100644
--- a/arch/tile/mm/migrate_32.S
+++ b/arch/tile/mm/migrate_32.S
@@ -40,8 +40,7 @@
 #define FRAME_R32	16
 #define FRAME_R33	20
 #define FRAME_R34	24
-#define FRAME_R35	28
-#define FRAME_SIZE	32
+#define FRAME_SIZE	28
 
 
 
@@ -66,12 +65,11 @@
 #define r_my_cpumask	r5
 
 /* Locals (callee-save); must not be more than FRAME_xxx above. */
-#define r_save_ics	r30
-#define r_context_lo	r31
-#define r_context_hi	r32
-#define r_access_lo	r33
-#define r_access_hi	r34
-#define r_asid		r35
+#define r_context_lo	r30
+#define r_context_hi	r31
+#define r_access_lo	r32
+#define r_access_hi	r33
+#define r_asid		r34
 
 STD_ENTRY(flush_and_install_context)
 	/*
@@ -104,11 +102,7 @@ STD_ENTRY(flush_and_install_context)
 	 sw r_tmp, r33
 	 addi r_tmp, sp, FRAME_R34
 	}
-	{
-	 sw r_tmp, r34
-	 addi r_tmp, sp, FRAME_R35
-	}
-	sw r_tmp, r35
+	sw r_tmp, r34
 
 	/* Move some arguments to callee-save registers. */
 	{
@@ -121,13 +115,6 @@ STD_ENTRY(flush_and_install_context)
 	}
 	move r_asid, r_asid_in
 
-	/* Disable interrupts, since we can't use our stack. */
-	{
-	 mfspr r_save_ics, INTERRUPT_CRITICAL_SECTION
-	 movei r_tmp, 1
-	}
-	mtspr INTERRUPT_CRITICAL_SECTION, r_tmp
-
 	/* First, flush our L2 cache. */
 	{
 	 move r0, zero  /* cache_pa */
@@ -149,7 +136,7 @@ STD_ENTRY(flush_and_install_context)
 	 move r8, zero  /* asids */
 	 move r9, zero  /* asidcount */
 	}
-	jal hv_flush_remote
+	jal _hv_flush_remote
 	bnz r0, .Ldone
 
 	/* Now install the new page table. */
@@ -163,9 +150,9 @@ STD_ENTRY(flush_and_install_context)
 	}
 	{
 	 move r4, r_asid
-	 movei r5, HV_CTX_DIRECTIO
+	 moveli r5, HV_CTX_DIRECTIO | CTX_PAGE_FLAG
 	}
-	jal hv_install_context
+	jal _hv_install_context
 	bnz r0, .Ldone
 
 	/* Finally, flush the TLB. */
@@ -175,9 +162,6 @@ STD_ENTRY(flush_and_install_context)
 	}
 
 .Ldone:
-	/* Reset interrupts back how they were before. */
-	mtspr INTERRUPT_CRITICAL_SECTION, r_save_ics
-
 	/* Restore the callee-saved registers and return. */
 	addli lr, sp, FRAME_SIZE
 	{
@@ -202,10 +186,6 @@ STD_ENTRY(flush_and_install_context)
 	}
 	{
 	 lw r34, r_tmp
-	 addli r_tmp, sp, FRAME_R35
-	}
-	{
-	 lw r35, r_tmp
 	 addi sp, sp, FRAME_SIZE
 	}
 	jrp lr
diff --git a/arch/tile/mm/migrate_64.S b/arch/tile/mm/migrate_64.S
index e76fea688be..a49eee38f87 100644
--- a/arch/tile/mm/migrate_64.S
+++ b/arch/tile/mm/migrate_64.S
@@ -38,8 +38,7 @@
 #define FRAME_R30	16
 #define FRAME_R31	24
 #define FRAME_R32	32
-#define FRAME_R33	40
-#define FRAME_SIZE	48
+#define FRAME_SIZE	40
 
 
 
@@ -60,10 +59,9 @@
 #define r_my_cpumask	r3
 
 /* Locals (callee-save); must not be more than FRAME_xxx above. */
-#define r_save_ics	r30
-#define r_context	r31
-#define r_access	r32
-#define r_asid		r33
+#define r_context	r30
+#define r_access	r31
+#define r_asid		r32
 
 /*
  * Caller-save locals and frame constants are the same as
@@ -93,11 +91,7 @@ STD_ENTRY(flush_and_install_context)
 	 st r_tmp, r31
 	 addi r_tmp, sp, FRAME_R32
 	}
-	{
-	 st r_tmp, r32
-	 addi r_tmp, sp, FRAME_R33
-	}
-	st r_tmp, r33
+	st r_tmp, r32
 
 	/* Move some arguments to callee-save registers. */
 	{
@@ -106,13 +100,6 @@ STD_ENTRY(flush_and_install_context)
 	}
 	move r_asid, r_asid_in
 
-	/* Disable interrupts, since we can't use our stack. */
-	{
-	 mfspr r_save_ics, INTERRUPT_CRITICAL_SECTION
-	 movei r_tmp, 1
-	}
-	mtspr INTERRUPT_CRITICAL_SECTION, r_tmp
-
 	/* First, flush our L2 cache. */
 	{
 	 move r0, zero  /* cache_pa */
@@ -136,7 +123,7 @@ STD_ENTRY(flush_and_install_context)
 	}
 	{
 	 move r8, zero  /* asidcount */
-	 jal hv_flush_remote
+	 jal _hv_flush_remote
 	}
 	bnez r0, 1f
 
@@ -147,9 +134,9 @@ STD_ENTRY(flush_and_install_context)
 	}
 	{
 	 move r2, r_asid
-	 movei r3, HV_CTX_DIRECTIO
+	 moveli r3, HV_CTX_DIRECTIO | CTX_PAGE_FLAG
 	}
-	jal hv_install_context
+	jal _hv_install_context
 	bnez r0, 1f
 
 	/* Finally, flush the TLB. */
@@ -158,10 +145,7 @@ STD_ENTRY(flush_and_install_context)
 	 jal hv_flush_all
 	}
 
-1:      /* Reset interrupts back how they were before. */
-	mtspr INTERRUPT_CRITICAL_SECTION, r_save_ics
-
-	/* Restore the callee-saved registers and return. */
+1:	/* Restore the callee-saved registers and return. */
 	addli lr, sp, FRAME_SIZE
 	{
 	 ld lr, lr
@@ -177,10 +161,6 @@ STD_ENTRY(flush_and_install_context)
 	}
 	{
 	 ld r32, r_tmp
-	 addli r_tmp, sp, FRAME_R33
-	}
-	{
-	 ld r33, r_tmp
 	 addi sp, sp, FRAME_SIZE
 	}
 	jrp lr
diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c
index f96f4cec602..851a94e6ae5 100644
--- a/arch/tile/mm/mmap.c
+++ b/arch/tile/mm/mmap.c
@@ -58,18 +58,36 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 #else
 	int is_32bit = 0;
 #endif
+	unsigned long random_factor = 0UL;
+
+	/*
+	 *  8 bits of randomness in 32bit mmaps, 24 address space bits
+	 * 12 bits of randomness in 64bit mmaps, 28 address space bits
+	 */
+	if (current->flags & PF_RANDOMIZE) {
+		if (is_32bit)
+			random_factor = get_random_int() % (1<<8);
+		else
+			random_factor = get_random_int() % (1<<12);
+
+		random_factor <<= PAGE_SHIFT;
+	}
 
 	/*
 	 * Use standard layout if the expected stack growth is unlimited
 	 * or we are running native 64 bits.
 	 */
-	if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) {
-		mm->mmap_base = TASK_UNMAPPED_BASE;
+	if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) {
+		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
 		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base(mm);
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+	unsigned long range_end = mm->brk + 0x02000000;
+	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index de7d8e21e01..5e86eac4bfa 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -27,7 +27,6 @@
 #include <linux/vmalloc.h>
 #include <linux/smp.h>
 
-#include <asm/system.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/fixmap.h>
@@ -62,7 +61,7 @@ void show_mem(unsigned int filter)
 	       global_page_state(NR_PAGETABLE),
 	       global_page_state(NR_BOUNCE),
 	       global_page_state(NR_FILE_PAGES),
-	       nr_swap_pages);
+	       get_nr_swap_pages());
 
 	for_each_zone(zone) {
 		unsigned long flags, order, total = 0, largest_order = -1;
@@ -84,64 +83,6 @@ void show_mem(unsigned int filter)
 	}
 }
 
-/*
- * Associate a virtual page frame with a given physical page frame
- * and protection flags for that frame.
- */
-static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pgd = swapper_pg_dir + pgd_index(vaddr);
-	if (pgd_none(*pgd)) {
-		BUG();
-		return;
-	}
-	pud = pud_offset(pgd, vaddr);
-	if (pud_none(*pud)) {
-		BUG();
-		return;
-	}
-	pmd = pmd_offset(pud, vaddr);
-	if (pmd_none(*pmd)) {
-		BUG();
-		return;
-	}
-	pte = pte_offset_kernel(pmd, vaddr);
-	/* <pfn,flags> stored as-is, to permit clearing entries */
-	set_pte(pte, pfn_pte(pfn, flags));
-
-	/*
-	 * It's enough to flush this one mapping.
-	 * This appears conservative since it is only called
-	 * from __set_fixmap.
-	 */
-	local_flush_tlb_page(NULL, vaddr, PAGE_SIZE);
-}
-
-void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
-{
-	unsigned long address = __fix_to_virt(idx);
-
-	if (idx >= __end_of_fixed_addresses) {
-		BUG();
-		return;
-	}
-	set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
-}
-
-#if defined(CONFIG_HIGHPTE)
-pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
-{
-	pte_t *pte = kmap_atomic(pmd_page(*dir)) +
-		(pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK;
-	return &pte[pte_index(address)];
-}
-#endif
-
 /**
  * shatter_huge_page() - ensure a given address is mapped by a small page.
  *
@@ -178,23 +119,19 @@ void shatter_huge_page(unsigned long addr)
 	if (!pmd_huge_page(*pmd))
 		return;
 
-	/*
-	 * Grab the pgd_lock, since we may need it to walk the pgd_list,
-	 * and since we need some kind of lock here to avoid races.
-	 */
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock_irqsave(&init_mm.page_table_lock, flags);
 	if (!pmd_huge_page(*pmd)) {
 		/* Lost the race to convert the huge page. */
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		spin_unlock_irqrestore(&init_mm.page_table_lock, flags);
 		return;
 	}
 
 	/* Shatter the huge page into the preallocated L2 page table. */
-	pmd_populate_kernel(&init_mm, pmd,
-			    get_prealloc_pte(pte_pfn(*(pte_t *)pmd)));
+	pmd_populate_kernel(&init_mm, pmd, get_prealloc_pte(pmd_pfn(*pmd)));
 
 #ifdef __PAGETABLE_PMD_FOLDED
 	/* Walk every pgd on the system and update the pmd there. */
+	spin_lock(&pgd_lock);
 	list_for_each(pos, &pgd_list) {
 		pmd_t *copy_pmd;
 		pgd = list_to_pgd(pos) + pgd_index(addr);
@@ -202,6 +139,7 @@ void shatter_huge_page(unsigned long addr)
 		copy_pmd = pmd_offset(pud, addr);
 		__set_pmd(copy_pmd, *pmd);
 	}
+	spin_unlock(&pgd_lock);
 #endif
 
 	/* Tell every cpu to notice the change. */
@@ -209,7 +147,7 @@ void shatter_huge_page(unsigned long addr)
 		     cpu_possible_mask, NULL, 0);
 
 	/* Hold the lock until the TLB flush is finished to avoid races. */
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock_irqrestore(&init_mm.page_table_lock, flags);
 }
 
 /*
@@ -218,9 +156,13 @@ void shatter_huge_page(unsigned long addr)
  * against pageattr.c; it is the unique case in which a valid change
  * of kernel pagetables can't be lazily synchronized by vmalloc faults.
  * vmalloc faults work because attached pagetables are never freed.
- * The locking scheme was chosen on the basis of manfred's
- * recommendations and having no core impact whatsoever.
- * -- wli
+ *
+ * The lock is always taken with interrupts disabled, unlike on x86
+ * and other platforms, because we need to take the lock in
+ * shatter_huge_page(), which may be called from an interrupt context.
+ * We are not at risk from the tlbflush IPI deadlock that was seen on
+ * x86, since we use the flush_remote() API to have the hypervisor do
+ * the TLB flushes regardless of irq disabling.
  */
 DEFINE_SPINLOCK(pgd_lock);
 LIST_HEAD(pgd_list);
@@ -288,35 +230,32 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 #define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)
 
-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+struct page *pgtable_alloc_one(struct mm_struct *mm, unsigned long address,
+			       int order)
 {
 	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO;
 	struct page *p;
-#if L2_USER_PGTABLE_ORDER > 0
 	int i;
-#endif
-
-#ifdef CONFIG_HIGHPTE
-	flags |= __GFP_HIGHMEM;
-#endif
 
 	p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);
 	if (p == NULL)
 		return NULL;
 
-#if L2_USER_PGTABLE_ORDER > 0
+	if (!pgtable_page_ctor(p)) {
+		__free_pages(p, L2_USER_PGTABLE_ORDER);
+		return NULL;
+	}
+
 	/*
 	 * Make every page have a page_count() of one, not just the first.
 	 * We don't use __GFP_COMP since it doesn't look like it works
 	 * correctly with tlb_remove_page().
 	 */
-	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+	for (i = 1; i < order; ++i) {
 		init_page_count(p+i);
 		inc_zone_page_state(p+i, NR_PAGETABLE);
 	}
-#endif
 
-	pgtable_page_ctor(p);
 	return p;
 }
 
@@ -325,28 +264,28 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
  * process).  We have to correct whatever pte_alloc_one() did before
  * returning the pages to the allocator.
  */
-void pte_free(struct mm_struct *mm, struct page *p)
+void pgtable_free(struct mm_struct *mm, struct page *p, int order)
 {
 	int i;
 
 	pgtable_page_dtor(p);
 	__free_page(p);
 
-	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+	for (i = 1; i < order; ++i) {
 		__free_page(p+i);
 		dec_zone_page_state(p+i, NR_PAGETABLE);
 	}
 }
 
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
-		    unsigned long address)
+void __pgtable_free_tlb(struct mmu_gather *tlb, struct page *pte,
+			unsigned long address, int order)
 {
 	int i;
 
 	pgtable_page_dtor(pte);
 	tlb_remove_page(tlb, pte);
 
-	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+	for (i = 1; i < order; ++i) {
 		tlb_remove_page(tlb, pte + i);
 		dec_zone_page_state(pte + i, NR_PAGETABLE);
 	}
@@ -389,6 +328,17 @@ void ptep_set_wrprotect(struct mm_struct *mm,
 
 #endif
 
+/*
+ * Return a pointer to the PTE that corresponds to the given
+ * address in the given page table.  A NULL page table just uses
+ * the standard kernel page table; the preferred API in this case
+ * is virt_to_kpte().
+ *
+ * The returned pointer can point to a huge page in other levels
+ * of the page table than the bottom, if the huge page is present
+ * in the page table.  For bottom-level PTEs, the returned pointer
+ * can point to a PTE that is either present or not.
+ */
 pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -402,13 +352,23 @@ pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)
 	pud = pud_offset(pgd, addr);
 	if (!pud_present(*pud))
 		return NULL;
+	if (pud_huge_page(*pud))
+		return (pte_t *)pud;
 	pmd = pmd_offset(pud, addr);
-	if (pmd_huge_page(*pmd))
-		return (pte_t *)pmd;
 	if (!pmd_present(*pmd))
 		return NULL;
+	if (pmd_huge_page(*pmd))
+		return (pte_t *)pmd;
 	return pte_offset_kernel(pmd, addr);
 }
+EXPORT_SYMBOL(virt_to_pte);
+
+pte_t *virt_to_kpte(unsigned long kaddr)
+{
+	BUG_ON(kaddr < PAGE_OFFSET);
+	return virt_to_pte(NULL, kaddr);
+}
+EXPORT_SYMBOL(virt_to_kpte);
 
 pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
 {
@@ -470,10 +430,18 @@ void __set_pte(pte_t *ptep, pte_t pte)
 
 void set_pte(pte_t *ptep, pte_t pte)
 {
-	struct page *page = pfn_to_page(pte_pfn(pte));
-
-	/* Update the home of a PTE if necessary */
-	pte = pte_set_home(pte, page_home(page));
+	if (pte_present(pte) &&
+	    (!CHIP_HAS_MMIO() || hv_pte_get_mode(pte) != HV_PTE_MODE_MMIO)) {
+		/* The PTE actually references physical memory. */
+		unsigned long pfn = pte_pfn(pte);
+		if (pfn_valid(pfn)) {
+			/* Update the home of the PTE from the struct page. */
+			pte = pte_set_home(pte, page_home(pfn_to_page(pfn)));
+		} else if (hv_pte_get_mode(pte) == 0) {
+			/* remap_pfn_range(), etc, must supply PTE mode. */
+			panic("set_pte(): out-of-range PFN and mode 0\n");
+		}
+	}
 
 	__set_pte(ptep, pte);
 }
@@ -481,7 +449,7 @@ void set_pte(pte_t *ptep, pte_t pte)
 /* Can this mm load a PTE with cached_priority set? */
 static inline int mm_is_priority_cached(struct mm_struct *mm)
 {
-	return mm->context.priority_cached;
+	return mm->context.priority_cached != 0;
 }
 
 /*
@@ -491,8 +459,8 @@ static inline int mm_is_priority_cached(struct mm_struct *mm)
 void start_mm_caching(struct mm_struct *mm)
 {
 	if (!mm_is_priority_cached(mm)) {
-		mm->context.priority_cached = -1U;
-		hv_set_caching(-1U);
+		mm->context.priority_cached = -1UL;
+		hv_set_caching(-1UL);
 	}
 }
 
@@ -507,7 +475,7 @@ void start_mm_caching(struct mm_struct *mm)
  * Presumably we'll come back later and have more luck and clear
  * the value then; for now we'll just keep the cache marked for priority.
  */
-static unsigned int update_priority_cached(struct mm_struct *mm)
+static unsigned long update_priority_cached(struct mm_struct *mm)
 {
 	if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {
 		struct vm_area_struct *vm;
@@ -575,20 +543,13 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
 	addr = area->addr;
 	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
 			       phys_addr, pgprot)) {
-		remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
+		free_vm_area(area);
 		return NULL;
 	}
 	return (__force void __iomem *) (offset + (char *)addr);
 }
 EXPORT_SYMBOL(ioremap_prot);
 
-/* Map a PCI MMIO bus address into VA space. */
-void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
-{
-	panic("ioremap for PCI MMIO is not supported");
-}
-EXPORT_SYMBOL(ioremap);
-
 /* Unmap an MMIO VA mapping. */
 void iounmap(volatile void __iomem *addr_in)
 {
@@ -606,12 +567,7 @@ void iounmap(volatile void __iomem *addr_in)
 	   in parallel. Reuse of the virtual address is prevented by
 	   leaving it in the global lists until we're done with it.
 	   cpa takes care of the direct mappings. */
-	read_lock(&vmlist_lock);
-	for (p = vmlist; p; p = p->next) {
-		if (p->addr == addr)
-			break;
-	}
-	read_unlock(&vmlist_lock);
+	p = find_vm_area((void *)addr);
 
 	if (!p) {
 		pr_err("iounmap: bad address %p\n", addr);