26 files changed, 669 insertions, 294 deletions
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index 0bb42642343..143473e3a0b 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -2,6 +2,7 @@ include include/asm-generic/Kbuild.asm
 
 header-y += ../arch/
 
+header-y += cachectl.h
 header-y += ucontext.h
 header-y += hardwall.h
 
@@ -21,7 +22,6 @@ generic-y += ipcbuf.h
 generic-y += irq_regs.h
 generic-y += kdebug.h
 generic-y += local.h
-generic-y += module.h
 generic-y += msgbuf.h
 generic-y += mutex.h
 generic-y += param.h
diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h
index 54d1da826f9..e7fb5cfb959 100644
--- a/arch/tile/include/asm/atomic_32.h
+++ b/arch/tile/include/asm/atomic_32.h
@@ -303,7 +303,14 @@ void __init_atomic_per_cpu(void);
 void __atomic_fault_unlock(int *lock_ptr);
 #endif
 
+/* Return a pointer to the lock for the given address. */
+int *__atomic_hashed_lock(volatile void *v);
+
 /* Private helper routines in lib/atomic_asm_32.S */
+struct __get_user {
+	unsigned long val;
+	int err;
+};
 extern struct __get_user __atomic_cmpxchg(volatile int *p,
 					  int *lock, int o, int n);
 extern struct __get_user __atomic_xchg(volatile int *p, int *lock, int n);
@@ -319,6 +326,9 @@ extern u64 __atomic64_xchg_add(volatile u64 *p, int *lock, u64 n);
 extern u64 __atomic64_xchg_add_unless(volatile u64 *p,
 				      int *lock, u64 o, u64 n);
 
+/* Return failure from the atomic wrappers. */
+struct __get_user __atomic_bad_address(int __user *addr);
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_TILE_ATOMIC_32_H */
diff --git a/arch/tile/include/asm/bitops.h b/arch/tile/include/asm/bitops.h
index 16f1fa51fea..bd186c4eaa5 100644
--- a/arch/tile/include/asm/bitops.h
+++ b/arch/tile/include/asm/bitops.h
@@ -77,6 +77,11 @@ static inline int ffs(int x)
 	return __builtin_ffs(x);
 }
 
+static inline int fls64(__u64 w)
+{
+	return (sizeof(__u64) * 8) - __builtin_clzll(w);
+}
+
 /**
  * fls - find last set bit in word
  * @x: the word to search
@@ -90,12 +95,7 @@ static inline int ffs(int x)
  */
 static inline int fls(int x)
 {
-	return (sizeof(int) * 8) - __builtin_clz(x);
-}
-
-static inline int fls64(__u64 w)
-{
-	return (sizeof(__u64) * 8) - __builtin_clzll(w);
+	return fls64((unsigned int) x);
 }
 
 static inline unsigned int __arch_hweight32(unsigned int w)
diff --git a/arch/tile/include/asm/byteorder.h b/arch/tile/include/asm/byteorder.h
index 9558416d578..fb72ecf4921 100644
--- a/arch/tile/include/asm/byteorder.h
+++ b/arch/tile/include/asm/byteorder.h
@@ -1 +1,21 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#if defined (__BIG_ENDIAN__)
+#include <linux/byteorder/big_endian.h>
+#elif defined (__LITTLE_ENDIAN__)
 #include <linux/byteorder/little_endian.h>
+#else
+#error "__BIG_ENDIAN__ or __LITTLE_ENDIAN__ must be defined."
+#endif
diff --git a/arch/tile/include/asm/cachectl.h b/arch/tile/include/asm/cachectl.h
new file mode 100644
index 00000000000..af4c9f9154d
--- /dev/null
+++ b/arch/tile/include/asm/cachectl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_CACHECTL_H
+#define _ASM_TILE_CACHECTL_H
+
+/*
+ * Options for cacheflush system call.
+ *
+ * The ICACHE flush is performed on all cores currently running the
+ * current process's address space.  The intent is for user
+ * applications to be able to modify code, invoke the system call,
+ * then allow arbitrary other threads in the same address space to see
+ * the newly-modified code.  Passing a length of CHIP_L1I_CACHE_SIZE()
+ * or more invalidates the entire icache on all cores in the address
+ * spaces.  (Note: currently this option invalidates the entire icache
+ * regardless of the requested address and length, but we may choose
+ * to honor the arguments at some point.)
+ *
+ * Flush and invalidation of memory can normally be performed with the
+ * __insn_flush(), __insn_inv(), and __insn_finv() instructions from
+ * userspace.  The DCACHE option to the system call allows userspace
+ * to flush the entire L1+L2 data cache from the core.  In this case,
+ * the address and length arguments are not used.  The DCACHE flush is
+ * restricted to the current core, not all cores in the address space.
+ */
+#define	ICACHE	(1<<0)		/* invalidate L1 instruction cache */
+#define	DCACHE	(1<<1)		/* flush and invalidate data cache */
+#define	BCACHE	(ICACHE|DCACHE)	/* flush both caches               */
+
+#endif	/* _ASM_TILE_CACHECTL_H */
diff --git a/arch/tile/include/asm/compat.h b/arch/tile/include/asm/compat.h
index 4b4b28969a6..69adc08d36a 100644
--- a/arch/tile/include/asm/compat.h
+++ b/arch/tile/include/asm/compat.h
@@ -242,9 +242,6 @@ long compat_sys_fallocate(int fd, int mode,
 long compat_sys_sched_rr_get_interval(compat_pid_t pid,
 				      struct compat_timespec __user *interval);
 
-/* Tilera Linux syscalls that don't have "compat" versions. */
-#define compat_sys_flush_cache sys_flush_cache
-
 /* These are the intvec_64.S trampolines. */
 long _compat_sys_execve(const char __user *path,
 			const compat_uptr_t __user *argv,
diff --git a/arch/tile/include/asm/elf.h b/arch/tile/include/asm/elf.h
index 623a6bb741c..d16d006d660 100644
--- a/arch/tile/include/asm/elf.h
+++ b/arch/tile/include/asm/elf.h
@@ -44,7 +44,11 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
 #else
 #define ELF_CLASS	ELFCLASS32
 #endif
+#ifdef __BIG_ENDIAN__
+#define ELF_DATA	ELFDATA2MSB
+#else
 #define ELF_DATA	ELFDATA2LSB
+#endif
 
 /*
  * There seems to be a bug in how compat_binfmt_elf.c works: it
@@ -59,6 +63,7 @@ enum { ELF_ARCH = CHIP_ELF_TYPE() };
  */
 #define elf_check_arch(x)  \
 	((x)->e_ident[EI_CLASS] == ELF_CLASS && \
+	 (x)->e_ident[EI_DATA] == ELF_DATA && \
 	 (x)->e_machine == CHIP_ELF_TYPE())
 
 /* The module loader only handles a few relocation types. */
diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h
index d03ec124a59..5909ac3d721 100644
--- a/arch/tile/include/asm/futex.h
+++ b/arch/tile/include/asm/futex.h
@@ -28,29 +28,81 @@
 #include <linux/futex.h>
 #include <linux/uaccess.h>
 #include <linux/errno.h>
+#include <asm/atomic.h>
 
-extern struct __get_user futex_set(u32 __user *v, int i);
-extern struct __get_user futex_add(u32 __user *v, int n);
-extern struct __get_user futex_or(u32 __user *v, int n);
-extern struct __get_user futex_andn(u32 __user *v, int n);
-extern struct __get_user futex_cmpxchg(u32 __user *v, int o, int n);
+/*
+ * Support macros for futex operations.  Do not use these macros directly.
+ * They assume "ret", "val", "oparg", and "uaddr" in the lexical context.
+ * __futex_cmpxchg() additionally assumes "oldval".
+ */
+
+#ifdef __tilegx__
+
+#define __futex_asm(OP) \
+	asm("1: {" #OP " %1, %3, %4; movei %0, 0 }\n"		\
+	    ".pushsection .fixup,\"ax\"\n"			\
+	    "0: { movei %0, %5; j 9f }\n"			\
+	    ".section __ex_table,\"a\"\n"			\
+	    ".quad 1b, 0b\n"					\
+	    ".popsection\n"					\
+	    "9:"						\
+	    : "=r" (ret), "=r" (val), "+m" (*(uaddr))		\
+	    : "r" (uaddr), "r" (oparg), "i" (-EFAULT))
+
+#define __futex_set() __futex_asm(exch4)
+#define __futex_add() __futex_asm(fetchadd4)
+#define __futex_or() __futex_asm(fetchor4)
+#define __futex_andn() ({ oparg = ~oparg; __futex_asm(fetchand4); })
+#define __futex_cmpxchg() \
+	({ __insn_mtspr(SPR_CMPEXCH_VALUE, oldval); __futex_asm(cmpexch4); })
+
+#define __futex_xor()						\
+	({							\
+		u32 oldval, n = oparg;				\
+		if ((ret = __get_user(oldval, uaddr)) == 0) {	\
+			do {					\
+				oparg = oldval ^ n;		\
+				__futex_cmpxchg();		\
+			} while (ret == 0 && oldval != val);	\
+		}						\
+	})
+
+/* No need to prefetch, since the atomic ops go to the home cache anyway. */
+#define __futex_prolog()
 
-#ifndef __tilegx__
-extern struct __get_user futex_xor(u32 __user *v, int n);
 #else
-static inline struct __get_user futex_xor(u32 __user *uaddr, int n)
-{
-	struct __get_user asm_ret = __get_user_4(uaddr);
-	if (!asm_ret.err) {
-		int oldval, newval;
-		do {
-			oldval = asm_ret.val;
-			newval = oldval ^ n;
-			asm_ret = futex_cmpxchg(uaddr, oldval, newval);
-		} while (asm_ret.err == 0 && oldval != asm_ret.val);
+
+#define __futex_call(FN)						\
+	{								\
+		struct __get_user gu = FN((u32 __force *)uaddr, lock, oparg); \
+		val = gu.val;						\
+		ret = gu.err;						\
 	}
-	return asm_ret;
-}
+
+#define __futex_set() __futex_call(__atomic_xchg)
+#define __futex_add() __futex_call(__atomic_xchg_add)
+#define __futex_or() __futex_call(__atomic_or)
+#define __futex_andn() __futex_call(__atomic_andn)
+#define __futex_xor() __futex_call(__atomic_xor)
+
+#define __futex_cmpxchg()						\
+	{								\
+		struct __get_user gu = __atomic_cmpxchg((u32 __force *)uaddr, \
+							lock, oldval, oparg); \
+		val = gu.val;						\
+		ret = gu.err;						\
+	}
+
+/*
+ * Find the lock pointer for the atomic calls to use, and issue a
+ * prefetch to the user address to bring it into cache.  Similar to
+ * __atomic_setup(), but we can't do a read into the L1 since it might
+ * fault; instead we do a prefetch into the L2.
+ */
+#define __futex_prolog()					\
+	int *lock;						\
+	__insn_prefetch(uaddr);					\
+	lock = __atomic_hashed_lock((int __force *)uaddr)
 #endif
 
 static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
@@ -59,8 +111,12 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	int cmp = (encoded_op >> 24) & 15;
 	int oparg = (encoded_op << 8) >> 20;
 	int cmparg = (encoded_op << 20) >> 20;
-	int ret;
-	struct __get_user asm_ret;
+	int uninitialized_var(val), ret;
+
+	__futex_prolog();
+
+	/* The 32-bit futex code makes this assumption, so validate it here. */
+	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));
 
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
@@ -71,46 +127,45 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	pagefault_disable();
 	switch (op) {
 	case FUTEX_OP_SET:
-		asm_ret = futex_set(uaddr, oparg);
+		__futex_set();
 		break;
 	case FUTEX_OP_ADD:
-		asm_ret = futex_add(uaddr, oparg);
+		__futex_add();
 		break;
 	case FUTEX_OP_OR:
-		asm_ret = futex_or(uaddr, oparg);
+		__futex_or();
 		break;
 	case FUTEX_OP_ANDN:
-		asm_ret = futex_andn(uaddr, oparg);
+		__futex_andn();
 		break;
 	case FUTEX_OP_XOR:
-		asm_ret = futex_xor(uaddr, oparg);
+		__futex_xor();
 		break;
 	default:
-		asm_ret.err = -ENOSYS;
+		ret = -ENOSYS;
+		break;
 	}
 	pagefault_enable();
 
-	ret = asm_ret.err;
-
 	if (!ret) {
 		switch (cmp) {
 		case FUTEX_OP_CMP_EQ:
-			ret = (asm_ret.val == cmparg);
+			ret = (val == cmparg);
 			break;
 		case FUTEX_OP_CMP_NE:
-			ret = (asm_ret.val != cmparg);
+			ret = (val != cmparg);
 			break;
 		case FUTEX_OP_CMP_LT:
-			ret = (asm_ret.val < cmparg);
+			ret = (val < cmparg);
 			break;
 		case FUTEX_OP_CMP_GE:
-			ret = (asm_ret.val >= cmparg);
+			ret = (val >= cmparg);
 			break;
 		case FUTEX_OP_CMP_LE:
-			ret = (asm_ret.val <= cmparg);
+			ret = (val <= cmparg);
 			break;
 		case FUTEX_OP_CMP_GT:
-			ret = (asm_ret.val > cmparg);
+			ret = (val > cmparg);
 			break;
 		default:
 			ret = -ENOSYS;
@@ -120,22 +175,20 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 }
 
 static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
-						u32 oldval, u32 newval)
+						u32 oldval, u32 oparg)
 {
-	struct __get_user asm_ret;
+	int ret, val;
+
+	__futex_prolog();
 
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
-	asm_ret = futex_cmpxchg(uaddr, oldval, newval);
-	*uval = asm_ret.val;
-	return asm_ret.err;
-}
+	__futex_cmpxchg();
 
-#ifndef __tilegx__
-/* Return failure from the atomic wrappers. */
-struct __get_user __atomic_bad_address(int __user *addr);
-#endif
+	*uval = val;
+	return ret;
+}
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/tile/include/asm/hardwall.h b/arch/tile/include/asm/hardwall.h
index 2ac422848c7..47514a58d68 100644
--- a/arch/tile/include/asm/hardwall.h
+++ b/arch/tile/include/asm/hardwall.h
@@ -11,12 +11,14 @@
  *   NON INFRINGEMENT.  See the GNU General Public License for
  *   more details.
  *
- * Provide methods for the HARDWALL_FILE for accessing the UDN.
+ * Provide methods for access control of per-cpu resources like
+ * UDN, IDN, or IPI.
  */
 
 #ifndef _ASM_TILE_HARDWALL_H
 #define _ASM_TILE_HARDWALL_H
 
+#include <arch/chip.h>
 #include <linux/ioctl.h>
 
 #define HARDWALL_IOCTL_BASE 0xa2
@@ -24,8 +26,9 @@
 /*
  * The HARDWALL_CREATE() ioctl is a macro with a "size" argument.
  * The resulting ioctl value is passed to the kernel in conjunction
- * with a pointer to a little-endian bitmask of cpus, which must be
- * physically in a rectangular configuration on the chip.
+ * with a pointer to a standard kernel bitmask of cpus.
+ * For network resources (UDN or IDN) the bitmask must physically
+ * represent a rectangular configuration on the chip.
  * The "size" is the number of bytes of cpu mask data.
  */
 #define _HARDWALL_CREATE 1
@@ -44,13 +47,7 @@
 #define HARDWALL_GET_ID \
  _IO(HARDWALL_IOCTL_BASE, _HARDWALL_GET_ID)
 
-#ifndef __KERNEL__
-
-/* This is the canonical name expected by userspace. */
-#define HARDWALL_FILE "/dev/hardwall"
-
-#else
-
+#ifdef __KERNEL__
 /* /proc hooks for hardwall. */
 struct proc_dir_entry;
 #ifdef CONFIG_HARDWALL
@@ -59,7 +56,6 @@ int proc_pid_hardwall(struct task_struct *task, char *buffer);
 #else
 static inline void proc_tile_hardwall_init(struct proc_dir_entry *root) {}
 #endif
-
 #endif
 
 #endif /* _ASM_TILE_HARDWALL_H */
diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h
index d396d180516..b2042380a5a 100644
--- a/arch/tile/include/asm/hugetlb.h
+++ b/arch/tile/include/asm/hugetlb.h
@@ -106,4 +106,25 @@ static inline void arch_release_hugepage(struct page *page)
 {
 }
 
+#ifdef CONFIG_HUGETLB_SUPER_PAGES
+static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+				       struct page *page, int writable)
+{
+	size_t pagesize = huge_page_size(hstate_vma(vma));
+	if (pagesize != PUD_SIZE && pagesize != PMD_SIZE)
+		entry = pte_mksuper(entry);
+	return entry;
+}
+#define arch_make_huge_pte arch_make_huge_pte
+
+/* Sizes to scale up page size for PTEs with HV_PTE_SUPER bit. */
+enum {
+	HUGE_SHIFT_PGDIR = 0,
+	HUGE_SHIFT_PMD = 1,
+	HUGE_SHIFT_PAGE = 2,
+	HUGE_SHIFT_ENTRIES
+};
+extern int huge_shift[HUGE_SHIFT_ENTRIES];
+#endif
+
 #endif /* _ASM_TILE_HUGETLB_H */
diff --git a/arch/tile/include/asm/irqflags.h b/arch/tile/include/asm/irqflags.h
index 5db0ce54284..b4e96fef2cf 100644
--- a/arch/tile/include/asm/irqflags.h
+++ b/arch/tile/include/asm/irqflags.h
@@ -28,10 +28,10 @@
  */
 #if CHIP_HAS_AUX_PERF_COUNTERS()
 #define LINUX_MASKABLE_INTERRUPTS_HI \
-       (~(INT_MASK_HI(INT_PERF_COUNT) | INT_MASK_HI(INT_AUX_PERF_COUNT)))
+	(~(INT_MASK_HI(INT_PERF_COUNT) | INT_MASK_HI(INT_AUX_PERF_COUNT)))
 #else
 #define LINUX_MASKABLE_INTERRUPTS_HI \
-       (~(INT_MASK_HI(INT_PERF_COUNT)))
+	(~(INT_MASK_HI(INT_PERF_COUNT)))
 #endif
 
 #else
@@ -90,6 +90,14 @@
 	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K_0, (unsigned long)(__m)); \
 	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K_1, (unsigned long)(__m>>32)); \
 } while (0)
+#define interrupt_mask_save_mask() \
+	(__insn_mfspr(SPR_INTERRUPT_MASK_SET_K_0) | \
+	 (((unsigned long long)__insn_mfspr(SPR_INTERRUPT_MASK_SET_K_1))<<32))
+#define interrupt_mask_restore_mask(mask) do { \
+	unsigned long long __m = (mask); \
+	__insn_mtspr(SPR_INTERRUPT_MASK_K_0, (unsigned long)(__m)); \
+	__insn_mtspr(SPR_INTERRUPT_MASK_K_1, (unsigned long)(__m>>32)); \
+} while (0)
 #else
 #define interrupt_mask_set(n) \
 	__insn_mtspr(SPR_INTERRUPT_MASK_SET_K, (1UL << (n)))
@@ -101,6 +109,10 @@
 	__insn_mtspr(SPR_INTERRUPT_MASK_SET_K, (mask))
 #define interrupt_mask_reset_mask(mask) \
 	__insn_mtspr(SPR_INTERRUPT_MASK_RESET_K, (mask))
+#define interrupt_mask_save_mask() \
+	__insn_mfspr(SPR_INTERRUPT_MASK_K)
+#define interrupt_mask_restore_mask(mask) \
+	__insn_mtspr(SPR_INTERRUPT_MASK_K, (mask))
 #endif
 
 /*
@@ -122,7 +134,7 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 
 /* Disable all interrupts, including NMIs. */
 #define arch_local_irq_disable_all() \
-	interrupt_mask_set_mask(-1UL)
+	interrupt_mask_set_mask(-1ULL)
 
 /* Re-enable all maskable interrupts. */
 #define arch_local_irq_enable() \
@@ -179,7 +191,7 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 #ifdef __tilegx__
 
 #if INT_MEM_ERROR != 0
-# error Fix IRQ_DISABLED() macro
+# error Fix IRQS_DISABLED() macro
 #endif
 
 /* Return 0 or 1 to indicate whether interrupts are currently disabled. */
@@ -207,9 +219,10 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 	mtspr   SPR_INTERRUPT_MASK_SET_K, tmp
 
 /* Enable interrupts. */
-#define IRQ_ENABLE(tmp0, tmp1)					\
+#define IRQ_ENABLE_LOAD(tmp0, tmp1)				\
 	GET_INTERRUPTS_ENABLED_MASK_PTR(tmp0);			\
-	ld      tmp0, tmp0;					\
+	ld      tmp0, tmp0
+#define IRQ_ENABLE_APPLY(tmp0, tmp1)				\
 	mtspr   SPR_INTERRUPT_MASK_RESET_K, tmp0
 
 #else /* !__tilegx__ */
@@ -253,17 +266,22 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 	mtspr   SPR_INTERRUPT_MASK_SET_K_1, tmp
 
 /* Enable interrupts. */
-#define IRQ_ENABLE(tmp0, tmp1)					\
+#define IRQ_ENABLE_LOAD(tmp0, tmp1)				\
 	GET_INTERRUPTS_ENABLED_MASK_PTR(tmp0);			\
 	{							\
 	 lw     tmp0, tmp0;					\
 	 addi   tmp1, tmp0, 4					\
 	};							\
-	lw      tmp1, tmp1;					\
+	lw      tmp1, tmp1
+#define IRQ_ENABLE_APPLY(tmp0, tmp1)				\
 	mtspr   SPR_INTERRUPT_MASK_RESET_K_0, tmp0;		\
 	mtspr   SPR_INTERRUPT_MASK_RESET_K_1, tmp1
 #endif
 
+#define IRQ_ENABLE(tmp0, tmp1)					\
+	IRQ_ENABLE_LOAD(tmp0, tmp1);				\
+	IRQ_ENABLE_APPLY(tmp0, tmp1)
+
 /*
  * Do the CPU's IRQ-state tracing from assembly code. We call a
  * C function, but almost everywhere we do, we don't mind clobbering
diff --git a/arch/tile/include/asm/kexec.h b/arch/tile/include/asm/kexec.h
index c11a6cc73bb..fc98ccfc98a 100644
--- a/arch/tile/include/asm/kexec.h
+++ b/arch/tile/include/asm/kexec.h
@@ -19,12 +19,24 @@
 
 #include <asm/page.h>
 
+#ifndef __tilegx__
 /* Maximum physical address we can use pages from. */
 #define KEXEC_SOURCE_MEMORY_LIMIT TASK_SIZE
 /* Maximum address we can reach in physical address mode. */
 #define KEXEC_DESTINATION_MEMORY_LIMIT TASK_SIZE
 /* Maximum address we can use for the control code buffer. */
 #define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
+#else
+/* We need to limit the memory below PGDIR_SIZE since
+ * we only setup page table for [0, PGDIR_SIZE) before final kexec.
+ */
+/* Maximum physical address we can use pages from. */
+#define KEXEC_SOURCE_MEMORY_LIMIT PGDIR_SIZE
+/* Maximum address we can reach in physical address mode. */
+#define KEXEC_DESTINATION_MEMORY_LIMIT PGDIR_SIZE
+/* Maximum address we can use for the control code buffer. */
+#define KEXEC_CONTROL_MEMORY_LIMIT PGDIR_SIZE
+#endif
 
 #define KEXEC_CONTROL_PAGE_SIZE	PAGE_SIZE
 
diff --git a/arch/tile/include/asm/mmu.h b/arch/tile/include/asm/mmu.h
index 92f94c77b6e..e2c78909679 100644
--- a/arch/tile/include/asm/mmu.h
+++ b/arch/tile/include/asm/mmu.h
@@ -21,7 +21,7 @@ struct mm_context {
 	 * Written under the mmap_sem semaphore; read without the
 	 * semaphore but atomically, but it is conservatively set.
 	 */
-	unsigned int priority_cached;
+	unsigned long priority_cached;
 };
 
 typedef struct mm_context mm_context_t;
diff --git a/arch/tile/include/asm/mmu_context.h b/arch/tile/include/asm/mmu_context.h
index 15fb2464112..37f0b741dee 100644
--- a/arch/tile/include/asm/mmu_context.h
+++ b/arch/tile/include/asm/mmu_context.h
@@ -30,11 +30,15 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	return 0;
 }
 
-/* Note that arch/tile/kernel/head.S also calls hv_install_context() */
+/*
+ * Note that arch/tile/kernel/head_NN.S and arch/tile/mm/migrate_NN.S
+ * also call hv_install_context().
+ */
 static inline void __install_page_table(pgd_t *pgdir, int asid, pgprot_t prot)
 {
 	/* FIXME: DIRECTIO should not always be set. FIXME. */
-	int rc = hv_install_context(__pa(pgdir), prot, asid, HV_CTX_DIRECTIO);
+	int rc = hv_install_context(__pa(pgdir), prot, asid,
+				    HV_CTX_DIRECTIO | CTX_PAGE_FLAG);
 	if (rc < 0)
 		panic("hv_install_context failed: %d", rc);
 }
diff --git a/arch/tile/include/asm/module.h b/arch/tile/include/asm/module.h
new file mode 100644
index 00000000000..44ed07ccd3d
--- /dev/null
+++ b/arch/tile/include/asm/module.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_MODULE_H
+#define _ASM_TILE_MODULE_H
+
+#include <arch/chip.h>
+
+#include <asm-generic/module.h>
+
+/* We can't use modules built with different page sizes. */
+#if defined(CONFIG_PAGE_SIZE_16KB)
+# define MODULE_PGSZ " 16KB"
+#elif defined(CONFIG_PAGE_SIZE_64KB)
+# define MODULE_PGSZ " 64KB"
+#else
+# define MODULE_PGSZ ""
+#endif
+
+/* We don't really support no-SMP so tag if someone tries. */
+#ifdef CONFIG_SMP
+#define MODULE_NOSMP ""
+#else
+#define MODULE_NOSMP " nosmp"
+#endif
+
+#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_NOSMP
+
+#endif /* _ASM_TILE_MODULE_H */
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index db93518fac0..9d9131e5c55 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -20,8 +20,17 @@
 #include <arch/chip.h>
 
 /* PAGE_SHIFT and HPAGE_SHIFT determine the page sizes. */
-#define PAGE_SHIFT	HV_LOG2_PAGE_SIZE_SMALL
-#define HPAGE_SHIFT	HV_LOG2_PAGE_SIZE_LARGE
+#if defined(CONFIG_PAGE_SIZE_16KB)
+#define PAGE_SHIFT	14
+#define CTX_PAGE_FLAG	HV_CTX_PG_SM_16K
+#elif defined(CONFIG_PAGE_SIZE_64KB)
+#define PAGE_SHIFT	16
+#define CTX_PAGE_FLAG	HV_CTX_PG_SM_64K
+#else
+#define PAGE_SHIFT	HV_LOG2_DEFAULT_PAGE_SIZE_SMALL
+#define CTX_PAGE_FLAG	0
+#endif
+#define HPAGE_SHIFT	HV_LOG2_DEFAULT_PAGE_SIZE_LARGE
 
 #define PAGE_SIZE	(_AC(1, UL) << PAGE_SHIFT)
 #define HPAGE_SIZE	(_AC(1, UL) << HPAGE_SHIFT)
@@ -78,8 +87,7 @@ typedef HV_PTE pgprot_t;
 /*
  * User L2 page tables are managed as one L2 page table per page,
  * because we use the page allocator for them.  This keeps the allocation
- * simple and makes it potentially useful to implement HIGHPTE at some point.
- * However, it's also inefficient, since L2 page tables are much smaller
+ * simple, but it's also inefficient, since L2 page tables are much smaller
  * than pages (currently 2KB vs 64KB).  So we should revisit this.
  */
 typedef struct page *pgtable_t;
@@ -128,7 +136,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
 
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
 
-#define HUGE_MAX_HSTATE		2
+#define HUGE_MAX_HSTATE		6
 
 #ifdef CONFIG_HUGETLB_PAGE
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
diff --git a/arch/tile/include/asm/pgalloc.h b/arch/tile/include/asm/pgalloc.h
index e919c0bdc22..1b902508b66 100644
--- a/arch/tile/include/asm/pgalloc.h
+++ b/arch/tile/include/asm/pgalloc.h
@@ -19,24 +19,24 @@
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <asm/fixmap.h>
+#include <asm/page.h>
 #include <hv/hypervisor.h>
 
 /* Bits for the size of the second-level page table. */
-#define L2_KERNEL_PGTABLE_SHIFT \
-  (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL + HV_LOG2_PTE_SIZE)
+#define L2_KERNEL_PGTABLE_SHIFT _HV_LOG2_L2_SIZE(HPAGE_SHIFT, PAGE_SHIFT)
+
+/* How big is a kernel L2 page table? */
+#define L2_KERNEL_PGTABLE_SIZE (1UL << L2_KERNEL_PGTABLE_SHIFT)
 
 /* We currently allocate user L2 page tables by page (unlike kernel L2s). */
-#if L2_KERNEL_PGTABLE_SHIFT < HV_LOG2_PAGE_SIZE_SMALL
-#define L2_USER_PGTABLE_SHIFT HV_LOG2_PAGE_SIZE_SMALL
+#if L2_KERNEL_PGTABLE_SHIFT < PAGE_SHIFT
+#define L2_USER_PGTABLE_SHIFT PAGE_SHIFT
 #else
 #define L2_USER_PGTABLE_SHIFT L2_KERNEL_PGTABLE_SHIFT
 #endif
 
 /* How many pages do we need, as an "order", for a user L2 page table? */
-#define L2_USER_PGTABLE_ORDER (L2_USER_PGTABLE_SHIFT - HV_LOG2_PAGE_SIZE_SMALL)
-
-/* How big is a kernel L2 page table? */
-#define L2_KERNEL_PGTABLE_SIZE (1 << L2_KERNEL_PGTABLE_SHIFT)
+#define L2_USER_PGTABLE_ORDER (L2_USER_PGTABLE_SHIFT - PAGE_SHIFT)
 
 static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
@@ -50,14 +50,14 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 static inline void pmd_populate_kernel(struct mm_struct *mm,
 				       pmd_t *pmd, pte_t *ptep)
 {
-	set_pmd(pmd, ptfn_pmd(__pa(ptep) >> HV_LOG2_PAGE_TABLE_ALIGN,
+	set_pmd(pmd, ptfn_pmd(HV_CPA_TO_PTFN(__pa(ptep)),
 			      __pgprot(_PAGE_PRESENT)));
 }
 
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 				pgtable_t page)
 {
-	set_pmd(pmd, ptfn_pmd(HV_PFN_TO_PTFN(page_to_pfn(page)),
+	set_pmd(pmd, ptfn_pmd(HV_CPA_TO_PTFN(PFN_PHYS(page_to_pfn(page))),
 			      __pgprot(_PAGE_PRESENT)));
 }
 
@@ -68,8 +68,20 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 
-extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address);
-extern void pte_free(struct mm_struct *mm, struct page *pte);
+extern pgtable_t pgtable_alloc_one(struct mm_struct *mm, unsigned long address,
+				   int order);
+extern void pgtable_free(struct mm_struct *mm, struct page *pte, int order);
+
+static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
+				      unsigned long address)
+{
+	return pgtable_alloc_one(mm, address, L2_USER_PGTABLE_ORDER);
+}
+
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
+{
+	pgtable_free(mm, pte, L2_USER_PGTABLE_ORDER);
+}
 
 #define pmd_pgtable(pmd) pmd_page(pmd)
 
@@ -85,8 +97,13 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 	pte_free(mm, virt_to_page(pte));
 }
 
-extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
-			   unsigned long address);
+extern void __pgtable_free_tlb(struct mmu_gather *tlb, struct page *pte,
+			       unsigned long address, int order);
+static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
+				  unsigned long address)
+{
+	__pgtable_free_tlb(tlb, pte, address, L2_USER_PGTABLE_ORDER);
+}
 
 #define check_pgt_cache()	do { } while (0)
 
@@ -104,19 +121,44 @@ void shatter_pmd(pmd_t *pmd);