Diffstat (limited to 'arch/tile/mm')
| Mode | File | Lines changed |
| --- | --- | --- |
| -rw-r--r-- | arch/tile/mm/elf.c | 114 |
| -rw-r--r-- | arch/tile/mm/fault.c | 256 |
| -rw-r--r-- | arch/tile/mm/highmem.c | 8 |
| -rw-r--r-- | arch/tile/mm/homecache.c | 185 |
| -rw-r--r-- | arch/tile/mm/hugetlbpage.c | 397 |
| -rw-r--r-- | arch/tile/mm/init.c | 293 |
| -rw-r--r-- | arch/tile/mm/migrate.h | 6 |
| -rw-r--r-- | arch/tile/mm/migrate_32.S | 41 |
| -rw-r--r-- | arch/tile/mm/migrate_64.S | 167 |
| -rw-r--r-- | arch/tile/mm/mmap.c | 26 |
| -rw-r--r-- | arch/tile/mm/pgtable.c | 295 |
11 files changed, 994 insertions, 794 deletions
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c index 55e58e93bfc..23f044e8a7a 100644 --- a/arch/tile/mm/elf.c +++ b/arch/tile/mm/elf.c @@ -21,6 +21,8 @@  #include <asm/pgtable.h>  #include <asm/pgalloc.h>  #include <asm/sections.h> +#include <asm/vdso.h> +#include <arch/sim.h>  /* Notify a running simulator, if any, that an exec just occurred. */  static void sim_notify_exec(const char *binary_name) @@ -35,28 +37,57 @@ static void sim_notify_exec(const char *binary_name)  	} while (c);  } -static int notify_exec(void) +static int notify_exec(struct mm_struct *mm)  { -	int retval = 0;  /* failure */ -	struct vm_area_struct *vma = current->mm->mmap; -	while (vma) { -		if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) +	char *buf, *path; +	struct vm_area_struct *vma; + +	if (!sim_is_simulator()) +		return 1; + +	if (mm->exe_file == NULL) +		return 0; + +	for (vma = current->mm->mmap; ; vma = vma->vm_next) { +		if (vma == NULL) +			return 0; +		if (vma->vm_file == mm->exe_file)  			break; -		vma = vma->vm_next;  	} -	if (vma) { -		char *buf = (char *) __get_free_page(GFP_KERNEL); -		if (buf) { -			char *path = d_path(&vma->vm_file->f_path, -					    buf, PAGE_SIZE); -			if (!IS_ERR(path)) { -				sim_notify_exec(path); -				retval = 1; -			} -			free_page((unsigned long)buf); + +	buf = (char *) __get_free_page(GFP_KERNEL); +	if (buf == NULL) +		return 0; + +	path = d_path(&mm->exe_file->f_path, buf, PAGE_SIZE); +	if (IS_ERR(path)) { +		free_page((unsigned long)buf); +		return 0; +	} + +	/* +	 * Notify simulator of an ET_DYN object so we know the load address. +	 * The somewhat cryptic overuse of SIM_CONTROL_DLOPEN allows us +	 * to be backward-compatible with older simulator releases. +	 */ +	if (vma->vm_start == (ELF_ET_DYN_BASE & PAGE_MASK)) { +		char buf[64]; +		int i; + +		snprintf(buf, sizeof(buf), "0x%lx:@", vma->vm_start); +		for (i = 0; ; ++i) { +			char c = buf[i]; +			__insn_mtspr(SPR_SIM_CONTROL, +				     (SIM_CONTROL_DLOPEN +				      | (c << _SIM_CONTROL_OPERATOR_BITS))); +			if (c == '\0') +				break;  		}  	} -	return retval; + +	sim_notify_exec(path); +	free_page((unsigned long)buf); +	return 1;  }  /* Notify a running simulator, if any, that we loaded an interpreter. */ @@ -72,63 +103,23 @@ static void sim_notify_interp(unsigned long load_addr)  } -/* Kernel address of page used to map read-only kernel data into userspace. */ -static void *vdso_page; - -/* One-entry array used for install_special_mapping. */ -static struct page *vdso_pages[1]; - -static int __init vdso_setup(void) -{ -	vdso_page = (void *)get_zeroed_page(GFP_ATOMIC); -	memcpy(vdso_page, __rt_sigreturn, __rt_sigreturn_end - __rt_sigreturn); -	vdso_pages[0] = virt_to_page(vdso_page); -	return 0; -} -device_initcall(vdso_setup); - -const char *arch_vma_name(struct vm_area_struct *vma) -{ -	if (vma->vm_private_data == vdso_pages) -		return "[vdso]"; -#ifndef __tilegx__ -	if (vma->vm_start == MEM_USER_INTRPT) -		return "[intrpt]"; -#endif -	return NULL; -} -  int arch_setup_additional_pages(struct linux_binprm *bprm,  				int executable_stack)  {  	struct mm_struct *mm = current->mm; -	unsigned long vdso_base;  	int retval = 0; +	down_write(&mm->mmap_sem); +  	/*  	 * Notify the simulator that an exec just occurred.  	 * If we can't find the filename of the mapping, just use  	 * whatever was passed as the linux_binprm filename.  	 
*/ -	if (!notify_exec()) +	if (!notify_exec(mm))  		sim_notify_exec(bprm->filename); -	down_write(&mm->mmap_sem); - -	/* -	 * MAYWRITE to allow gdb to COW and set breakpoints -	 * -	 * Make sure the vDSO gets into every core dump.  Dumping its -	 * contents makes post-mortem fully interpretable later -	 * without matching up the same kernel and hardware config to -	 * see what PC values meant. -	 */ -	vdso_base = VDSO_BASE; -	retval = install_special_mapping(mm, vdso_base, PAGE_SIZE, -					 VM_READ|VM_EXEC| -					 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| -					 VM_ALWAYSDUMP, -					 vdso_pages); +	retval = setup_vdso_pages();  #ifndef __tilegx__  	/* @@ -140,7 +131,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,  	if (!retval) {  		unsigned long addr = MEM_USER_INTRPT;  		addr = mmap_region(NULL, addr, INTRPT_SIZE, -				   MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,  				   VM_READ|VM_EXEC|  				   VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);  		if (addr > (unsigned long) -PAGE_SIZE) diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index f295b4ac941..6c0571216a9 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -24,7 +24,6 @@  #include <linux/mman.h>  #include <linux/mm.h>  #include <linux/smp.h> -#include <linux/smp_lock.h>  #include <linux/interrupt.h>  #include <linux/init.h>  #include <linux/tty.h> @@ -35,8 +34,8 @@  #include <linux/hugetlb.h>  #include <linux/syscalls.h>  #include <linux/uaccess.h> +#include <linux/kdebug.h> -#include <asm/system.h>  #include <asm/pgalloc.h>  #include <asm/sections.h>  #include <asm/traps.h> @@ -44,15 +43,18 @@  #include <arch/interrupts.h> -static noinline void force_sig_info_fault(int si_signo, int si_code, -	unsigned long address, int fault_num, struct task_struct *tsk) +static noinline void force_sig_info_fault(const char *type, int si_signo, +					  int si_code, unsigned long address, +					  int fault_num, +					  struct task_struct *tsk, +					  struct pt_regs *regs)  {  	siginfo_t info;  	if (unlikely(tsk->pid < 2)) {  		panic("Signal %d (code %d) at %#lx sent to %s!",  		      si_signo, si_code & 0xffff, address, -		      tsk->pid ? "init" : "the idle task"); +		      is_idle_task(tsk) ? "the idle task" : "init");  	}  	info.si_signo = si_signo; @@ -60,6 +62,7 @@ static noinline void force_sig_info_fault(int si_signo, int si_code,  	info.si_code = si_code;  	info.si_addr = (void __user *)address;  	info.si_trapno = fault_num; +	trace_unhandled_signal(type, regs, address, si_signo);  	force_sig_info(si_signo, &info, tsk);  } @@ -68,15 +71,17 @@ static noinline void force_sig_info_fault(int si_signo, int si_code,   * Synthesize the fault a PL0 process would get by doing a word-load of   * an unaligned address or a high kernel address.   
*/ -SYSCALL_DEFINE2(cmpxchg_badaddr, unsigned long, address, -		struct pt_regs *, regs) +SYSCALL_DEFINE1(cmpxchg_badaddr, unsigned long, address)  { +	struct pt_regs *regs = current_pt_regs(); +  	if (address >= PAGE_OFFSET) -		force_sig_info_fault(SIGSEGV, SEGV_MAPERR, address, -				     INT_DTLB_MISS, current); +		force_sig_info_fault("atomic segfault", SIGSEGV, SEGV_MAPERR, +				     address, INT_DTLB_MISS, current, regs);  	else -		force_sig_info_fault(SIGBUS, BUS_ADRALN, address, -				     INT_UNALIGN_DATA, current); +		force_sig_info_fault("atomic alignment fault", SIGBUS, +				     BUS_ADRALN, address, +				     INT_UNALIGN_DATA, current, regs);  	/*  	 * Adjust pc to point at the actual instruction, which is unusual @@ -118,16 +123,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)  	pmd_k = pmd_offset(pud_k, address);  	if (!pmd_present(*pmd_k))  		return NULL; -	if (!pmd_present(*pmd)) { +	if (!pmd_present(*pmd))  		set_pmd(pmd, *pmd_k); -		arch_flush_lazy_mmu_mode(); -	} else +	else  		BUG_ON(pmd_ptfn(*pmd) != pmd_ptfn(*pmd_k));  	return pmd_k;  }  /* - * Handle a fault on the vmalloc or module mapping area + * Handle a fault on the vmalloc area.   */  static inline int vmalloc_fault(pgd_t *pgd, unsigned long address)  { @@ -145,8 +149,6 @@ static inline int vmalloc_fault(pgd_t *pgd, unsigned long address)  	pmd_k = vmalloc_sync_one(pgd, address);  	if (!pmd_k)  		return -1; -	if (pmd_huge(*pmd_k)) -		return 0;   /* support TILE huge_vmap() API */  	pte_k = pte_offset_kernel(pmd_k, address);  	if (!pte_present(*pte_k))  		return -1; @@ -184,7 +186,7 @@ static pgd_t *get_current_pgd(void)  	HV_Context ctx = hv_inquire_context();  	unsigned long pgd_pfn = ctx.page_table >> PAGE_SHIFT;  	struct page *pgd_page = pfn_to_page(pgd_pfn); -	BUG_ON(PageHighMem(pgd_page));   /* oops, HIGHPTE? */ +	BUG_ON(PageHighMem(pgd_page));  	return (pgd_t *) __va(ctx.page_table);  } @@ -200,9 +202,14 @@ static pgd_t *get_current_pgd(void)   * interrupt or a critical region, and must do as little as possible.   * Similarly, we can't use atomic ops here, since we may be handling a   * fault caused by an atomic op access. + * + * If we find a migrating PTE while we're in an NMI context, and we're + * at a PC that has a registered exception handler, we don't wait, + * since this thread may (e.g.) have been interrupted while migrating + * its own stack, which would then cause us to self-deadlock.   */  static int handle_migrating_pte(pgd_t *pgd, int fault_num, -				unsigned long address, +				unsigned long address, unsigned long pc,  				int is_kernel_mode, int write)  {  	pud_t *pud; @@ -224,6 +231,8 @@ static int handle_migrating_pte(pgd_t *pgd, int fault_num,  		pte_offset_kernel(pmd, address);  	pteval = *pte;  	if (pte_migrating(pteval)) { +		if (in_nmi() && search_exception_tables(pc)) +			return 0;  		wait_for_migration(pte);  		return 1;  	} @@ -263,12 +272,15 @@ static int handle_page_fault(struct pt_regs *regs,  	int si_code;  	int is_kernel_mode;  	pgd_t *pgd; +	unsigned int flags;  	/* on TILE, protection faults are always writes */  	if (!is_page_fault)  		write = 1; -	is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL); +	flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + +	is_kernel_mode = !user_mode(regs);  	tsk = validate_current(); @@ -291,13 +303,13 @@ static int handle_page_fault(struct pt_regs *regs,  	/*  	 * Early on, we need to check for migrating PTE entries;  	 * see homecache.c.  
If we find a migrating PTE, we wait until -	 * the backing page claims to be done migrating, then we procede. +	 * the backing page claims to be done migrating, then we proceed.  	 * For kernel PTEs, we rewrite the PTE and return and retry.  	 * Otherwise, we treat the fault like a normal "no PTE" fault,  	 * rather than trying to patch up the existing PTE.  	 */  	pgd = get_current_pgd(); -	if (handle_migrating_pte(pgd, fault_num, address, +	if (handle_migrating_pte(pgd, fault_num, address, regs->pc,  				 is_kernel_mode, write))  		return 1; @@ -332,9 +344,12 @@ static int handle_page_fault(struct pt_regs *regs,  	/*  	 * If we're trying to touch user-space addresses, we must  	 * be either at PL0, or else with interrupts enabled in the -	 * kernel, so either way we can re-enable interrupts here. +	 * kernel, so either way we can re-enable interrupts here +	 * unless we are doing atomic access to user space with +	 * interrupts disabled.  	 */ -	local_irq_enable(); +	if (!(regs->flags & PT_FLAGS_DISABLE_IRQ)) +		local_irq_enable();  	mm = tsk->mm; @@ -347,6 +362,9 @@ static int handle_page_fault(struct pt_regs *regs,  		goto bad_area_nosemaphore;  	} +	if (!is_kernel_mode) +		flags |= FAULT_FLAG_USER; +  	/*  	 * When running in the kernel we expect faults to occur only to  	 * addresses in user space.  All other faults represent errors in the @@ -369,6 +387,8 @@ static int handle_page_fault(struct pt_regs *regs,  			vma = NULL;  /* happy compiler */  			goto bad_area_nosemaphore;  		} + +retry:  		down_read(&mm->mmap_sem);  	} @@ -405,18 +425,22 @@ good_area:  #endif  		if (!(vma->vm_flags & VM_WRITE))  			goto bad_area; +		flags |= FAULT_FLAG_WRITE;  	} else {  		if (!is_page_fault || !(vma->vm_flags & VM_READ))  			goto bad_area;  	} - survive:  	/*  	 * If for any reason at all we couldn't handle the fault,  	 * make sure we exit gracefully rather than endlessly redo  	 * the fault.  	 */ -	fault = handle_mm_fault(mm, vma, address, write); +	fault = handle_mm_fault(mm, vma, address, flags); + +	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) +		return 0; +  	if (unlikely(fault & VM_FAULT_ERROR)) {  		if (fault & VM_FAULT_OOM)  			goto out_of_memory; @@ -424,33 +448,33 @@ good_area:  			goto do_sigbus;  		BUG();  	} -	if (fault & VM_FAULT_MAJOR) -		tsk->maj_flt++; -	else -		tsk->min_flt++; +	if (flags & FAULT_FLAG_ALLOW_RETRY) { +		if (fault & VM_FAULT_MAJOR) +			tsk->maj_flt++; +		else +			tsk->min_flt++; +		if (fault & VM_FAULT_RETRY) { +			flags &= ~FAULT_FLAG_ALLOW_RETRY; +			flags |= FAULT_FLAG_TRIED; + +			 /* +			  * No need to up_read(&mm->mmap_sem) as we would +			  * have already released it in __lock_page_or_retry +			  * in mm/filemap.c. +			  */ +			goto retry; +		} +	} -#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() -	/* -	 * If this was an asynchronous fault, -	 * restart the appropriate engine. -	 */ -	switch (fault_num) {  #if CHIP_HAS_TILE_DMA() +	/* If this was a DMA TLB fault, restart the DMA engine. 
*/ +	switch (fault_num) {  	case INT_DMATLB_MISS:  	case INT_DMATLB_MISS_DWNCL:  	case INT_DMATLB_ACCESS:  	case INT_DMATLB_ACCESS_DWNCL:  		__insn_mtspr(SPR_DMA_CTR, SPR_DMA_CTR__REQUEST_MASK);  		break; -#endif -#if CHIP_HAS_SN_PROC() -	case INT_SNITLB_MISS: -	case INT_SNITLB_MISS_DWNCL: -		__insn_mtspr(SPR_SNCTL, -			     __insn_mfspr(SPR_SNCTL) & -			     ~SPR_SNCTL__FRZPROC_MASK); -		break; -#endif  	}  #endif @@ -472,8 +496,8 @@ bad_area_nosemaphore:  		 */  		local_irq_enable(); -		force_sig_info_fault(SIGSEGV, si_code, address, -				     fault_num, tsk); +		force_sig_info_fault("segfault", SIGSEGV, si_code, address, +				     fault_num, tsk, regs);  		return 0;  	} @@ -511,7 +535,7 @@ no_context:  	if (unlikely(tsk->pid < 2)) {  		panic("Kernel page fault running %s!", -		      tsk->pid ? "init" : "the idle task"); +		      is_idle_task(tsk) ? "the idle task" : "init");  	}  	/* @@ -531,15 +555,10 @@ no_context:   */  out_of_memory:  	up_read(&mm->mmap_sem); -	if (is_global_init(tsk)) { -		yield(); -		down_read(&mm->mmap_sem); -		goto survive; -	} -	pr_alert("VM: killing process %s\n", tsk->comm); -	if (!is_kernel_mode) -		do_group_exit(SIGKILL); -	goto no_context; +	if (is_kernel_mode) +		goto no_context; +	pagefault_out_of_memory(); +	return 0;  do_sigbus:  	up_read(&mm->mmap_sem); @@ -548,7 +567,8 @@ do_sigbus:  	if (is_kernel_mode)  		goto no_context; -	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, fault_num, tsk); +	force_sig_info_fault("bus error", SIGBUS, BUS_ADRERR, address, +			     fault_num, tsk, regs);  	return 0;  } @@ -656,20 +676,12 @@ struct intvec_state do_page_fault_ics(struct pt_regs *regs, int fault_num,  	}  	/* -	 * NOTE: the one other type of access that might bring us here -	 * are the memory ops in __tns_atomic_acquire/__tns_atomic_release, -	 * but we don't have to check specially for them since we can -	 * always safely return to the address of the fault and retry, -	 * since no separate atomic locks are involved. -	 */ - -	/*  	 * Now that we have released the atomic lock (if necessary),  	 * it's safe to spin if the PTE that caused the fault was migrating.  	 */  	if (fault_num == INT_DTLB_ACCESS)  		write = 1; -	if (handle_migrating_pte(pgd, fault_num, address, 1, write)) +	if (handle_migrating_pte(pgd, fault_num, address, pc, 1, write))  		return state;  	/* Return zero so that we continue on with normal fault handling. */ @@ -692,8 +704,60 @@ void do_page_fault(struct pt_regs *regs, int fault_num,  {  	int is_page_fault; +#ifdef CONFIG_KPROBES +	/* +	 * This is to notify the fault handler of the kprobes.  The +	 * exception code is redundant as it is also carried in REGS, +	 * but we pass it anyhow. +	 */ +	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, -1, +		       regs->faultnum, SIGSEGV) == NOTIFY_STOP) +		return; +#endif + +#ifdef __tilegx__ +	/* +	 * We don't need early do_page_fault_ics() support, since unlike +	 * Pro we don't need to worry about unlocking the atomic locks. +	 * There is only one current case in GX where we touch any memory +	 * under ICS other than our own kernel stack, and we handle that +	 * here.  (If we crash due to trying to touch our own stack, +	 * we're in too much trouble for C code to help out anyway.) 
+	 */ +	if (write & ~1) { +		unsigned long pc = write & ~1; +		if (pc >= (unsigned long) __start_unalign_asm_code && +		    pc < (unsigned long) __end_unalign_asm_code) { +			struct thread_info *ti = current_thread_info(); +			/* +			 * Our EX_CONTEXT is still what it was from the +			 * initial unalign exception, but now we've faulted +			 * on the JIT page.  We would like to complete the +			 * page fault however is appropriate, and then retry +			 * the instruction that caused the unalign exception. +			 * Our state has been "corrupted" by setting the low +			 * bit in "sp", and stashing r0..r3 in the +			 * thread_info area, so we revert all of that, then +			 * continue as if this were a normal page fault. +			 */ +			regs->sp &= ~1UL; +			regs->regs[0] = ti->unalign_jit_tmp[0]; +			regs->regs[1] = ti->unalign_jit_tmp[1]; +			regs->regs[2] = ti->unalign_jit_tmp[2]; +			regs->regs[3] = ti->unalign_jit_tmp[3]; +			write &= 1; +		} else { +			pr_alert("%s/%d: ICS set at page fault at %#lx: %#lx\n", +				 current->comm, current->pid, pc, address); +			show_regs(regs); +			do_group_exit(SIGKILL); +			return; +		} +	} +#else  	/* This case should have been handled by do_page_fault_ics(). */  	BUG_ON(write & ~1); +#endif  #if CHIP_HAS_TILE_DMA()  	/* @@ -722,10 +786,6 @@ void do_page_fault(struct pt_regs *regs, int fault_num,  	case INT_DMATLB_MISS:  	case INT_DMATLB_MISS_DWNCL:  #endif -#if CHIP_HAS_SN_PROC() -	case INT_SNITLB_MISS: -	case INT_SNITLB_MISS_DWNCL: -#endif  		is_page_fault = 1;  		break; @@ -741,7 +801,8 @@ void do_page_fault(struct pt_regs *regs, int fault_num,  		panic("Bad fault number %d in do_page_fault", fault_num);  	} -	if (EX1_PL(regs->ex1) != USER_PL) { +#if CHIP_HAS_TILE_DMA() +	if (!user_mode(regs)) {  		struct async_tlb *async;  		switch (fault_num) {  #if CHIP_HAS_TILE_DMA() @@ -752,12 +813,6 @@ void do_page_fault(struct pt_regs *regs, int fault_num,  			async = ¤t->thread.dma_async_tlb;  			break;  #endif -#if CHIP_HAS_SN_PROC() -		case INT_SNITLB_MISS: -		case INT_SNITLB_MISS_DWNCL: -			async = ¤t->thread.sn_async_tlb; -			break; -#endif  		default:  			async = NULL;  		} @@ -784,19 +839,28 @@ void do_page_fault(struct pt_regs *regs, int fault_num,  			return;  		}  	} +#endif  	handle_page_fault(regs, fault_num, is_page_fault, address, write);  } -#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() +#if CHIP_HAS_TILE_DMA()  /* - * Check an async_tlb structure to see if a deferred fault is waiting, - * and if so pass it to the page-fault code. + * This routine effectively re-issues asynchronous page faults + * when we are returning to user space.   */ -static void handle_async_page_fault(struct pt_regs *regs, -				    struct async_tlb *async) +void do_async_page_fault(struct pt_regs *regs)  { +	struct async_tlb *async = ¤t->thread.dma_async_tlb; + +	/* +	 * Clear thread flag early.  If we re-interrupt while processing +	 * code here, we will reset it and recall this routine before +	 * returning to user space. +	 */ +	clear_thread_flag(TIF_ASYNC_TLB); +  	if (async->fault_num) {  		/*  		 * Clear async->fault_num before calling the page-fault @@ -810,35 +874,15 @@ static void handle_async_page_fault(struct pt_regs *regs,  				  async->address, async->is_write);  	}  } -#endif /* CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC() */ - - -/* - * This routine effectively re-issues asynchronous page faults - * when we are returning to user space. - */ -void do_async_page_fault(struct pt_regs *regs) -{ -	/* -	 * Clear thread flag early.  
If we re-interrupt while processing -	 * code here, we will reset it and recall this routine before -	 * returning to user space. -	 */ -	clear_thread_flag(TIF_ASYNC_TLB); +#endif /* CHIP_HAS_TILE_DMA() */ -#if CHIP_HAS_TILE_DMA() -	handle_async_page_fault(regs, ¤t->thread.dma_async_tlb); -#endif -#if CHIP_HAS_SN_PROC() -	handle_async_page_fault(regs, ¤t->thread.sn_async_tlb); -#endif -}  void vmalloc_sync_all(void)  {  #ifdef __tilegx__  	/* Currently all L1 kernel pmd's are static and shared. */ -	BUG_ON(pgd_index(VMALLOC_END) != pgd_index(VMALLOC_START)); +	BUILD_BUG_ON(pgd_index(VMALLOC_END - PAGE_SIZE) != +		     pgd_index(VMALLOC_START));  #else  	/*  	 * Note that races in the updates of insync and start aren't diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c index 31dbbd9afe4..0dc21829477 100644 --- a/arch/tile/mm/highmem.c +++ b/arch/tile/mm/highmem.c @@ -93,7 +93,7 @@ static DEFINE_PER_CPU(struct kmap_amps, amps);   * If we examine it earlier we are exposed to a race where it looks   * writable earlier, but becomes immutable before we write the PTE.   */ -static void kmap_atomic_register(struct page *page, enum km_type type, +static void kmap_atomic_register(struct page *page, int type,  				 unsigned long va, pte_t *ptep, pte_t pteval)  {  	unsigned long flags; @@ -114,7 +114,6 @@ static void kmap_atomic_register(struct page *page, enum km_type type,  	list_add(&->list, &_list);  	set_pte(ptep, pteval); -	arch_flush_lazy_mmu_mode();  	spin_unlock(&_lock);  	homecache_kpte_unlock(flags); @@ -224,12 +223,12 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)  }  EXPORT_SYMBOL(kmap_atomic_prot); -void *__kmap_atomic(struct page *page) +void *kmap_atomic(struct page *page)  {  	/* PAGE_NONE is a magic value that tells us to check immutability. */  	return kmap_atomic_prot(page, PAGE_NONE);  } -EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(kmap_atomic);  void __kunmap_atomic(void *kvaddr)  { @@ -259,7 +258,6 @@ void __kunmap_atomic(void *kvaddr)  		BUG_ON(vaddr >= (unsigned long)high_memory);  	} -	arch_flush_lazy_mmu_mode();  	pagefault_enable();  }  EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c index d78df3a6ee1..33294fdc402 100644 --- a/arch/tile/mm/homecache.c +++ b/arch/tile/mm/homecache.c @@ -30,6 +30,7 @@  #include <linux/cache.h>  #include <linux/smp.h>  #include <linux/module.h> +#include <linux/hugetlb.h>  #include <asm/page.h>  #include <asm/sections.h> @@ -42,12 +43,9 @@  #include "migrate.h" -#if CHIP_HAS_COHERENT_LOCAL_CACHE() -  /*   * The noallocl2 option suppresses all use of the L2 cache to cache - * locally from a remote home.  There's no point in using it if we - * don't have coherent local caching, though. + * locally from a remote home.   */  static int __write_once noallocl2;  static int __init set_noallocl2(char *str) @@ -57,16 +55,6 @@ static int __init set_noallocl2(char *str)  }  early_param("noallocl2", set_noallocl2); -#else - -#define noallocl2 0 - -#endif - -/* Provide no-op versions of these routines to keep flush_remote() cleaner. */ -#define mark_caches_evicted_start() 0 -#define mark_caches_evicted_finish(mask, timestamp) do {} while (0) -  /*   * Update the irq_stat for cpus that we are going to interrupt @@ -106,7 +94,6 @@ static void hv_flush_update(const struct cpumask *cache_cpumask,   *    there's never any good reason for hv_flush_remote() to fail.   *  - Accepts a 32-bit PFN rather than a 64-bit PA, which generally   *    is the type that Linux wants to pass around anyway. 
- *  - Centralizes the mark_caches_evicted() handling.   *  - Canonicalizes that lengths of zero make cpumasks NULL.   *  - Handles deferring TLB flushes for dataplane tiles.   *  - Tracks remote interrupts in the per-cpu irq_cpustat_t. @@ -125,7 +112,6 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,  		  HV_Remote_ASID *asids, int asidcount)  {  	int rc; -	int timestamp = 0;  /* happy compiler */  	struct cpumask cache_cpumask_copy, tlb_cpumask_copy;  	struct cpumask *cache_cpumask, *tlb_cpumask;  	HV_PhysAddr cache_pa; @@ -156,15 +142,11 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,  	hv_flush_update(cache_cpumask, tlb_cpumask, tlb_va, tlb_length,  			asids, asidcount);  	cache_pa = (HV_PhysAddr)cache_pfn << PAGE_SHIFT; -	if (cache_control & HV_FLUSH_EVICT_L2) -		timestamp = mark_caches_evicted_start();  	rc = hv_flush_remote(cache_pa, cache_control,  			     cpumask_bits(cache_cpumask),  			     tlb_va, tlb_length, tlb_pgsize,  			     cpumask_bits(tlb_cpumask),  			     asids, asidcount); -	if (cache_control & HV_FLUSH_EVICT_L2) -		mark_caches_evicted_finish(cache_cpumask, timestamp);  	if (rc == 0)  		return;  	cpumask_scnprintf(cache_buf, sizeof(cache_buf), &cache_cpumask_copy); @@ -179,59 +161,88 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,  	panic("Unsafe to continue.");  } -void homecache_evict(const struct cpumask *mask) +static void homecache_finv_page_va(void* va, int home)  { -	flush_remote(0, HV_FLUSH_EVICT_L2, mask, 0, 0, 0, NULL, NULL, 0); +	int cpu = get_cpu(); +	if (home == cpu) { +		finv_buffer_local(va, PAGE_SIZE); +	} else if (home == PAGE_HOME_HASH) { +		finv_buffer_remote(va, PAGE_SIZE, 1); +	} else { +		BUG_ON(home < 0 || home >= NR_CPUS); +		finv_buffer_remote(va, PAGE_SIZE, 0); +	} +	put_cpu();  } -/* Return a mask of the cpus whose caches currently own these pages. */ -static void homecache_mask(struct page *page, int pages, -			   struct cpumask *home_mask) +void homecache_finv_map_page(struct page *page, int home)  { -	int i; -	cpumask_clear(home_mask); -	for (i = 0; i < pages; ++i) { -		int home = page_home(&page[i]); -		if (home == PAGE_HOME_IMMUTABLE || -		    home == PAGE_HOME_INCOHERENT) { -			cpumask_copy(home_mask, cpu_possible_mask); -			return; -		} -#if CHIP_HAS_CBOX_HOME_MAP() -		if (home == PAGE_HOME_HASH) { -			cpumask_or(home_mask, home_mask, &hash_for_home_map); -			continue; -		} +	unsigned long flags; +	unsigned long va; +	pte_t *ptep; +	pte_t pte; + +	if (home == PAGE_HOME_UNCACHED) +		return; +	local_irq_save(flags); +#ifdef CONFIG_HIGHMEM +	va = __fix_to_virt(FIX_KMAP_BEGIN + kmap_atomic_idx_push() + +			   (KM_TYPE_NR * smp_processor_id())); +#else +	va = __fix_to_virt(FIX_HOMECACHE_BEGIN + smp_processor_id());  #endif -		if (home == PAGE_HOME_UNCACHED) -			continue; -		BUG_ON(home < 0 || home >= NR_CPUS); -		cpumask_set_cpu(home, home_mask); -	} +	ptep = virt_to_kpte(va); +	pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL); +	__set_pte(ptep, pte_set_home(pte, home)); +	homecache_finv_page_va((void *)va, home); +	__pte_clear(ptep); +	hv_flush_page(va, PAGE_SIZE); +#ifdef CONFIG_HIGHMEM +	kmap_atomic_idx_pop(); +#endif +	local_irq_restore(flags);  } -/* - * Return the passed length, or zero if it's long enough that we - * believe we should evict the whole L2 cache. 
- */ -static unsigned long cache_flush_length(unsigned long length) +static void homecache_finv_page_home(struct page *page, int home) +{ +	if (!PageHighMem(page) && home == page_home(page)) +		homecache_finv_page_va(page_address(page), home); +	else +		homecache_finv_map_page(page, home); +} + +static inline bool incoherent_home(int home) +{ +	return home == PAGE_HOME_IMMUTABLE || home == PAGE_HOME_INCOHERENT; +} + +static void homecache_finv_page_internal(struct page *page, int force_map)  { -	return (length >= CHIP_L2_CACHE_SIZE()) ? HV_FLUSH_EVICT_L2 : length; +	int home = page_home(page); +	if (home == PAGE_HOME_UNCACHED) +		return; +	if (incoherent_home(home)) { +		int cpu; +		for_each_cpu(cpu, &cpu_cacheable_map) +			homecache_finv_map_page(page, cpu); +	} else if (force_map) { +		/* Force if, e.g., the normal mapping is migrating. */ +		homecache_finv_map_page(page, home); +	} else { +		homecache_finv_page_home(page, home); +	} +	sim_validate_lines_evicted(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE);  } -/* Flush a page out of whatever cache(s) it is in. */ -void homecache_flush_cache(struct page *page, int order) +void homecache_finv_page(struct page *page)  { -	int pages = 1 << order; -	int length = cache_flush_length(pages * PAGE_SIZE); -	unsigned long pfn = page_to_pfn(page); -	struct cpumask home_mask; - -	homecache_mask(page, pages, &home_mask); -	flush_remote(pfn, length, &home_mask, 0, 0, 0, NULL, NULL, 0); -	sim_validate_lines_evicted(PFN_PHYS(pfn), pages * PAGE_SIZE); +	homecache_finv_page_internal(page, 0);  } +void homecache_evict(const struct cpumask *mask) +{ +	flush_remote(0, HV_FLUSH_EVICT_L2, mask, 0, 0, 0, NULL, NULL, 0); +}  /* Report the home corresponding to a given PTE. */  static int pte_to_home(pte_t pte) @@ -245,10 +256,8 @@ static int pte_to_home(pte_t pte)  		return PAGE_HOME_INCOHERENT;  	case HV_PTE_MODE_UNCACHED:  		return PAGE_HOME_UNCACHED; -#if CHIP_HAS_CBOX_HOME_MAP()  	case HV_PTE_MODE_CACHE_HASH_L3:  		return PAGE_HOME_HASH; -#endif  	}  	panic("Bad PTE %#llx\n", pte.val);  } @@ -305,20 +314,16 @@ pte_t pte_set_home(pte_t pte, int home)  						      HV_PTE_MODE_CACHE_NO_L3);  			}  		} else -#if CHIP_HAS_CBOX_HOME_MAP()  		if (hash_default)  			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);  		else -#endif  			pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);  		pte = hv_pte_set_nc(pte);  		break; -#if CHIP_HAS_CBOX_HOME_MAP()  	case PAGE_HOME_HASH:  		pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_HASH_L3);  		break; -#endif  	default:  		BUG_ON(home < 0 || home >= NR_CPUS || @@ -328,7 +333,6 @@ pte_t pte_set_home(pte_t pte, int home)  		break;  	} -#if CHIP_HAS_NC_AND_NOALLOC_BITS()  	if (noallocl2)  		pte = hv_pte_set_no_alloc_l2(pte); @@ -337,7 +341,6 @@ pte_t pte_set_home(pte_t pte, int home)  	    hv_pte_get_mode(pte) == HV_PTE_MODE_CACHE_NO_L3) {  		pte = hv_pte_set_mode(pte, HV_PTE_MODE_UNCACHED);  	} -#endif  	/* Checking this case here gives a better panic than from the hv. */  	BUG_ON(hv_pte_get_mode(pte) == 0); @@ -353,21 +356,16 @@ EXPORT_SYMBOL(pte_set_home);   * so they're not suitable for anything but infrequent use.   
*/ -#if CHIP_HAS_CBOX_HOME_MAP() -static inline int initial_page_home(void) { return PAGE_HOME_HASH; } -#else -static inline int initial_page_home(void) { return 0; } -#endif -  int page_home(struct page *page)  {  	if (PageHighMem(page)) { -		return initial_page_home(); +		return PAGE_HOME_HASH;  	} else {  		unsigned long kva = (unsigned long)page_address(page); -		return pte_to_home(*virt_to_pte(NULL, kva)); +		return pte_to_home(*virt_to_kpte(kva));  	}  } +EXPORT_SYMBOL(page_home);  void homecache_change_page_home(struct page *page, int order, int home)  { @@ -383,12 +381,13 @@ void homecache_change_page_home(struct page *page, int order, int home)  		     NULL, 0);  	for (i = 0; i < pages; ++i, kva += PAGE_SIZE) { -		pte_t *ptep = virt_to_pte(NULL, kva); +		pte_t *ptep = virt_to_kpte(kva);  		pte_t pteval = *ptep;  		BUG_ON(!pte_present(pteval) || pte_huge(pteval)); -		*ptep = pte_set_home(pteval, home); +		__set_pte(ptep, pte_set_home(pteval, home));  	}  } +EXPORT_SYMBOL(homecache_change_page_home);  struct page *homecache_alloc_pages(gfp_t gfp_mask,  				   unsigned int order, int home) @@ -413,19 +412,25 @@ struct page *homecache_alloc_pages_node(int nid, gfp_t gfp_mask,  	return page;  } -void homecache_free_pages(unsigned long addr, unsigned int order) +void __homecache_free_pages(struct page *page, unsigned int order)  { -	struct page *page; - -	if (addr == 0) -		return; - -	VM_BUG_ON(!virt_addr_valid((void *)addr)); -	page = virt_to_page((void *)addr);  	if (put_page_testzero(page)) { -		int pages = (1 << order); -		homecache_change_page_home(page, order, initial_page_home()); -		while (pages--) -			__free_page(page++); +		homecache_change_page_home(page, order, PAGE_HOME_HASH); +		if (order == 0) { +			free_hot_cold_page(page, false); +		} else { +			init_page_count(page); +			__free_pages(page, order); +		} +	} +} +EXPORT_SYMBOL(__homecache_free_pages); + +void homecache_free_pages(unsigned long addr, unsigned int order) +{ +	if (addr != 0) { +		VM_BUG_ON(!virt_addr_valid((void *)addr)); +		__homecache_free_pages(virt_to_page((void *)addr), order);  	}  } +EXPORT_SYMBOL(homecache_free_pages); diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 24688b697a8..e514899e110 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -21,92 +21,135 @@  #include <linux/mm.h>  #include <linux/hugetlb.h>  #include <linux/pagemap.h> -#include <linux/smp_lock.h>  #include <linux/slab.h>  #include <linux/err.h>  #include <linux/sysctl.h>  #include <linux/mman.h>  #include <asm/tlb.h>  #include <asm/tlbflush.h> +#include <asm/setup.h> + +#ifdef CONFIG_HUGETLB_SUPER_PAGES + +/* + * Provide an additional huge page size (in addition to the regular default + * huge page size) if no "hugepagesz" arguments are specified. + * Note that it must be smaller than the default huge page size so + * that it's possible to allocate them on demand from the buddy allocator. + * You can change this to 64K (on a 16K build), 256K, 1M, or 4M, + * or not define it at all. + */ +#define ADDITIONAL_HUGE_SIZE (1024 * 1024UL) + +/* "Extra" page-size multipliers, one per level of the page table. 
*/ +int huge_shift[HUGE_SHIFT_ENTRIES] = { +#ifdef ADDITIONAL_HUGE_SIZE +#define ADDITIONAL_HUGE_SHIFT __builtin_ctzl(ADDITIONAL_HUGE_SIZE / PAGE_SIZE) +	[HUGE_SHIFT_PAGE] = ADDITIONAL_HUGE_SHIFT +#endif +}; + +#endif  pte_t *huge_pte_alloc(struct mm_struct *mm,  		      unsigned long addr, unsigned long sz)  {  	pgd_t *pgd;  	pud_t *pud; -	pte_t *pte = NULL; -	/* We do not yet support multiple huge page sizes. */ -	BUG_ON(sz != PMD_SIZE); +	addr &= -sz;   /* Mask off any low bits in the address. */  	pgd = pgd_offset(mm, addr);  	pud = pud_alloc(mm, pgd, addr); -	if (pud) -		pte = (pte_t *) pmd_alloc(mm, pud, addr); -	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); -	return pte; +#ifdef CONFIG_HUGETLB_SUPER_PAGES +	if (sz >= PGDIR_SIZE) { +		BUG_ON(sz != PGDIR_SIZE && +		       sz != PGDIR_SIZE << huge_shift[HUGE_SHIFT_PGDIR]); +		return (pte_t *)pud; +	} else { +		pmd_t *pmd = pmd_alloc(mm, pud, addr); +		if (sz >= PMD_SIZE) { +			BUG_ON(sz != PMD_SIZE && +			       sz != (PMD_SIZE << huge_shift[HUGE_SHIFT_PMD])); +			return (pte_t *)pmd; +		} +		else { +			if (sz != PAGE_SIZE << huge_shift[HUGE_SHIFT_PAGE]) +				panic("Unexpected page size %#lx\n", sz); +			return pte_alloc_map(mm, NULL, pmd, addr); +		} +	} +#else +	BUG_ON(sz != PMD_SIZE); +	return (pte_t *) pmd_alloc(mm, pud, addr); +#endif  } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +static pte_t *get_pte(pte_t *base, int index, int level)  { -	pgd_t *pgd; -	pud_t *pud; -	pmd_t *pmd = NULL; - -	pgd = pgd_offset(mm, addr); -	if (pgd_present(*pgd)) { -		pud = pud_offset(pgd, addr); -		if (pud_present(*pud)) -			pmd = pmd_offset(pud, addr); +	pte_t *ptep = base + index; +#ifdef CONFIG_HUGETLB_SUPER_PAGES +	if (!pte_present(*ptep) && huge_shift[level] != 0) { +		unsigned long mask = -1UL << huge_shift[level]; +		pte_t *super_ptep = base + (index & mask); +		pte_t pte = *super_ptep; +		if (pte_present(pte) && pte_super(pte)) +			ptep = super_ptep;  	} -	return (pte_t *) pmd; +#endif +	return ptep;  } -#ifdef HUGETLB_TEST -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, -			      int write) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)  { -	unsigned long start = address; -	int length = 1; -	int nr; -	struct page *page; -	struct vm_area_struct *vma; - -	vma = find_vma(mm, addr); -	if (!vma || !is_vm_hugetlb_page(vma)) -		return ERR_PTR(-EINVAL); - -	pte = huge_pte_offset(mm, address); - -	/* hugetlb should be locked, and hence, prefaulted */ -	WARN_ON(!pte || pte_none(*pte)); - -	page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - -	WARN_ON(!PageHead(page)); +	pgd_t *pgd; +	pud_t *pud; +	pmd_t *pmd; +#ifdef CONFIG_HUGETLB_SUPER_PAGES +	pte_t *pte; +#endif -	return page; -} +	/* Get the top-level page table entry. */ +	pgd = (pgd_t *)get_pte((pte_t *)mm->pgd, pgd_index(addr), 0); -int pmd_huge(pmd_t pmd) -{ -	return 0; -} +	/* We don't have four levels. */ +	pud = pud_offset(pgd, addr); +#ifndef __PAGETABLE_PUD_FOLDED +# error support fourth page table level +#endif +	if (!pud_present(*pud)) +		return NULL; + +	/* Check for an L0 huge PTE, if we have three levels. */ +#ifndef __PAGETABLE_PMD_FOLDED +	if (pud_huge(*pud)) +		return (pte_t *)pud; + +	pmd = (pmd_t *)get_pte((pte_t *)pud_page_vaddr(*pud), +			       pmd_index(addr), 1); +	if (!pmd_present(*pmd)) +		return NULL; +#else +	pmd = pmd_offset(pud, addr); +#endif -int pud_huge(pud_t pud) -{ -	return 0; -} +	/* Check for an L1 huge PTE. 
*/ +	if (pmd_huge(*pmd)) +		return (pte_t *)pmd; + +#ifdef CONFIG_HUGETLB_SUPER_PAGES +	/* Check for an L2 huge PTE. */ +	pte = get_pte((pte_t *)pmd_page_vaddr(*pmd), pte_index(addr), 2); +	if (!pte_present(*pte)) +		return NULL; +	if (pte_super(*pte)) +		return pte; +#endif -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, -			     pmd_t *pmd, int write) -{  	return NULL;  } -#else -  struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,  			      int write)  { @@ -150,50 +193,21 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)  	return 0;  } -#endif -  #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA  static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,  		unsigned long addr, unsigned long len,  		unsigned long pgoff, unsigned long flags)  {  	struct hstate *h = hstate_file(file); -	struct mm_struct *mm = current->mm; -	struct vm_area_struct *vma; -	unsigned long start_addr; - -	if (len > mm->cached_hole_size) { -		start_addr = mm->free_area_cache; -	} else { -		start_addr = TASK_UNMAPPED_BASE; -		mm->cached_hole_size = 0; -	} - -full_search: -	addr = ALIGN(start_addr, huge_page_size(h)); - -	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { -		/* At this point:  (!vma || addr < vma->vm_end). */ -		if (TASK_SIZE - len < addr) { -			/* -			 * Start a new search - just in case we missed -			 * some holes. -			 */ -			if (start_addr != TASK_UNMAPPED_BASE) { -				start_addr = TASK_UNMAPPED_BASE; -				mm->cached_hole_size = 0; -				goto full_search; -			} -			return -ENOMEM; -		} -		if (!vma || addr + len <= vma->vm_start) { -			mm->free_area_cache = addr + len; -			return addr; -		} -		if (addr + mm->cached_hole_size < vma->vm_start) -			mm->cached_hole_size = vma->vm_start - addr; -		addr = ALIGN(vma->vm_end, huge_page_size(h)); -	} +	struct vm_unmapped_area_info info; + +	info.flags = 0; +	info.length = len; +	info.low_limit = TASK_UNMAPPED_BASE; +	info.high_limit = TASK_SIZE; +	info.align_mask = PAGE_MASK & ~huge_page_mask(h); +	info.align_offset = 0; +	return vm_unmapped_area(&info);  }  static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, @@ -201,92 +215,30 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,  		unsigned long pgoff, unsigned long flags)  {  	struct hstate *h = hstate_file(file); -	struct mm_struct *mm = current->mm; -	struct vm_area_struct *vma, *prev_vma; -	unsigned long base = mm->mmap_base, addr = addr0; -	unsigned long largest_hole = mm->cached_hole_size; -	int first_time = 1; - -	/* don't allow allocations above current base */ -	if (mm->free_area_cache > base) -		mm->free_area_cache = base; - -	if (len <= largest_hole) { -		largest_hole = 0; -		mm->free_area_cache  = base; -	} -try_again: -	/* make sure it can fit in the remaining address space */ -	if (mm->free_area_cache < len) -		goto fail; - -	/* either no address requested or cant fit in requested address hole */ -	addr = (mm->free_area_cache - len) & huge_page_mask(h); -	do { -		/* -		 * Lookup failure means no vma is above this address, -		 * i.e. 
return with success: -		 */ -		vma = find_vma_prev(mm, addr, &prev_vma); -		if (!vma) { -			return addr; -			break; -		} +	struct vm_unmapped_area_info info; +	unsigned long addr; -		/* -		 * new region fits between prev_vma->vm_end and -		 * vma->vm_start, use it: -		 */ -		if (addr + len <= vma->vm_start && -			    (!prev_vma || (addr >= prev_vma->vm_end))) { -			/* remember the address as a hint for next time */ -			mm->cached_hole_size = largest_hole; -			mm->free_area_cache = addr; -			return addr; -		} else { -			/* pull free_area_cache down to the first hole */ -			if (mm->free_area_cache == vma->vm_end) { -				mm->free_area_cache = vma->vm_start; -				mm->cached_hole_size = largest_hole; -			} -		} +	info.flags = VM_UNMAPPED_AREA_TOPDOWN; +	info.length = len; +	info.low_limit = PAGE_SIZE; +	info.high_limit = current->mm->mmap_base; +	info.align_mask = PAGE_MASK & ~huge_page_mask(h); +	info.align_offset = 0; +	addr = vm_unmapped_area(&info); -		/* remember the largest hole we saw so far */ -		if (addr + largest_hole < vma->vm_start) -			largest_hole = vma->vm_start - addr; - -		/* try just below the current vma->vm_start */ -		addr = (vma->vm_start - len) & huge_page_mask(h); - -	} while (len <= vma->vm_start); - -fail: -	/* -	 * if hint left us with no space for the requested -	 * mapping then try again: -	 */ -	if (first_time) { -		mm->free_area_cache = base; -		largest_hole = 0; -		first_time = 0; -		goto try_again; -	}  	/*  	 * A failed mmap() very likely causes application failure,  	 * so fall back to the bottom-up function here. This scenario  	 * can happen with large stack limits and large mmap()  	 * allocations.  	 */ -	mm->free_area_cache = TASK_UNMAPPED_BASE; -	mm->cached_hole_size = ~0UL; -	addr = hugetlb_get_unmapped_area_bottomup(file, addr0, -			len, pgoff, flags); - -	/* -	 * Restore the topdown base: -	 */ -	mm->free_area_cache = base; -	mm->cached_hole_size = ~0UL; +	if (addr & ~PAGE_MASK) { +		VM_BUG_ON(addr != -ENOMEM); +		info.flags = 0; +		info.low_limit = TASK_UNMAPPED_BASE; +		info.high_limit = TASK_SIZE; +		addr = vm_unmapped_area(&info); +	}  	return addr;  } @@ -323,21 +275,102 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,  		return hugetlb_get_unmapped_area_topdown(file, addr, len,  				pgoff, flags);  } +#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ -static __init int setup_hugepagesz(char *opt) +#ifdef CONFIG_HUGETLB_SUPER_PAGES +static __init int __setup_hugepagesz(unsigned long ps)  { -	unsigned long ps = memparse(opt, &opt); -	if (ps == PMD_SIZE) { -		hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); -	} else if (ps == PUD_SIZE) { -		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); +	int log_ps = __builtin_ctzl(ps); +	int level, base_shift; + +	if ((1UL << log_ps) != ps || (log_ps & 1) != 0) { +		pr_warn("Not enabling %ld byte huge pages;" +			" must be a power of four.\n", ps); +		return -EINVAL; +	} + +	if (ps > 64*1024*1024*1024UL) { +		pr_warn("Not enabling %ld MB huge pages;" +			" largest legal value is 64 GB .\n", ps >> 20); +		return -EINVAL; +	} else if (ps >= PUD_SIZE) { +		static long hv_jpage_size; +		if (hv_jpage_size == 0) +			hv_jpage_size = hv_sysconf(HV_SYSCONF_PAGE_SIZE_JUMBO); +		if (hv_jpage_size != PUD_SIZE) { +			pr_warn("Not enabling >= %ld MB huge pages:" +				" hypervisor reports size %ld\n", +				PUD_SIZE >> 20, hv_jpage_size); +			return -EINVAL; +		} +		level = 0; +		base_shift = PUD_SHIFT; +	} else if (ps >= PMD_SIZE) { +		level = 1; +		base_shift = PMD_SHIFT; +	} else if (ps > PAGE_SIZE) { +		
level = 2; +		base_shift = PAGE_SHIFT;  	} else { -		pr_err("hugepagesz: Unsupported page size %lu M\n", -			ps >> 20); -		return 0; +		pr_err("hugepagesz: huge page size %ld too small\n", ps); +		return -EINVAL; +	} + +	if (log_ps != base_shift) { +		int shift_val = log_ps - base_shift; +		if (huge_shift[level] != 0) { +			int old_shift = base_shift + huge_shift[level]; +			pr_warn("Not enabling %ld MB huge pages;" +				" already have size %ld MB.\n", +				ps >> 20, (1UL << old_shift) >> 20); +			return -EINVAL; +		} +		if (hv_set_pte_super_shift(level, shift_val) != 0) { +			pr_warn("Not enabling %ld MB huge pages;" +				" no hypervisor support.\n", ps >> 20); +			return -EINVAL; +		} +		printk(KERN_DEBUG "Enabled %ld MB huge pages\n", ps >> 20); +		huge_shift[level] = shift_val; +	} + +	hugetlb_add_hstate(log_ps - PAGE_SHIFT); + +	return 0; +} + +static bool saw_hugepagesz; + +static __init int setup_hugepagesz(char *opt) +{ +	if (!saw_hugepagesz) { +		saw_hugepagesz = true; +		memset(huge_shift, 0, sizeof(huge_shift));  	} -	return 1; +	return __setup_hugepagesz(memparse(opt, NULL));  }  __setup("hugepagesz=", setup_hugepagesz); -#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ +#ifdef ADDITIONAL_HUGE_SIZE +/* + * Provide an additional huge page size if no "hugepagesz" args are given. + * In that case, all the cores have properly set up their hv super_shift + * already, but we need to notify the hugetlb code to enable the + * new huge page size from the Linux point of view. + */ +static __init int add_default_hugepagesz(void) +{ +	if (!saw_hugepagesz) { +		BUILD_BUG_ON(ADDITIONAL_HUGE_SIZE >= PMD_SIZE || +			     ADDITIONAL_HUGE_SIZE <= PAGE_SIZE); +		BUILD_BUG_ON((PAGE_SIZE << ADDITIONAL_HUGE_SHIFT) != +			     ADDITIONAL_HUGE_SIZE); +		BUILD_BUG_ON(ADDITIONAL_HUGE_SHIFT & 1); +		hugetlb_add_hstate(ADDITIONAL_HUGE_SHIFT); +	} +	return 0; +} +arch_initcall(add_default_hugepagesz); +#endif + +#endif /* CONFIG_HUGETLB_SUPER_PAGES */ diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index 0b9ce69b0ee..bfb3127b4df 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -38,7 +38,6 @@  #include <linux/uaccess.h>  #include <asm/mmu_context.h>  #include <asm/processor.h> -#include <asm/system.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h>  #include <asm/dma.h> @@ -53,26 +52,13 @@  #include "migrate.h" -/* - * We could set FORCE_MAX_ZONEORDER to "(HPAGE_SHIFT - PAGE_SHIFT + 1)" - * in the Tile Kconfig, but this generates configure warnings. - * Do it here and force people to get it right to compile this file. - * The problem is that with 4KB small pages and 16MB huge pages, - * the default value doesn't allow us to group enough small pages - * together to make up a huge page. 
- */ -#if CONFIG_FORCE_MAX_ZONEORDER < HPAGE_SHIFT - PAGE_SHIFT + 1 -# error "Change FORCE_MAX_ZONEORDER in arch/tile/Kconfig to match page size" -#endif -  #define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0))  #ifndef __tilegx__  unsigned long VMALLOC_RESERVE = CONFIG_VMALLOC_RESERVE; +EXPORT_SYMBOL(VMALLOC_RESERVE);  #endif -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -  /* Create an L2 page table */  static pte_t * __init alloc_pte(void)  { @@ -96,7 +82,7 @@ static int num_l2_ptes[MAX_NUMNODES];  static void init_prealloc_ptes(int node, int pages)  { -	BUG_ON(pages & (HV_L2_ENTRIES-1)); +	BUG_ON(pages & (PTRS_PER_PTE - 1));  	if (pages) {  		num_l2_ptes[node] = pages;  		l2_ptes[node] = __alloc_bootmem(pages * sizeof(pte_t), @@ -120,10 +106,8 @@ pte_t *get_prealloc_pte(unsigned long pfn)   */  static int initial_heap_home(void)  { -#if CHIP_HAS_CBOX_HOME_MAP()  	if (hash_default)  		return PAGE_HOME_HASH; -#endif  	return smp_processor_id();  } @@ -145,14 +129,9 @@ static void __init assign_pte(pmd_t *pmd, pte_t *page_table)  #ifdef __tilegx__ -#if HV_L1_SIZE != HV_L2_SIZE -# error Rework assumption that L1 and L2 page tables are same size. -#endif - -/* Since pmd_t arrays and pte_t arrays are the same size, just use casts. */  static inline pmd_t *alloc_pmd(void)  { -	return (pmd_t *)alloc_pte(); +	return __alloc_bootmem(L1_KERNEL_PGTABLE_SIZE, HV_PAGE_TABLE_ALIGN, 0);  }  static inline void assign_pmd(pud_t *pud, pmd_t *pmd) @@ -169,7 +148,21 @@ void __init shatter_pmd(pmd_t *pmd)  	assign_pte(pmd, pte);  } -#ifdef CONFIG_HIGHMEM +#ifdef __tilegx__ +static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va) +{ +	pud_t *pud = pud_offset(&pgtables[pgd_index(va)], va); +	if (pud_none(*pud)) +		assign_pmd(pud, alloc_pmd()); +	return pmd_offset(pud, va); +} +#else +static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va) +{ +	return pmd_offset(pud_offset(&pgtables[pgd_index(va)], va), va); +} +#endif +  /*   * This function initializes a certain range of kernel virtual memory   * with new bootmem page tables, everywhere page tables are missing in @@ -182,34 +175,24 @@ void __init shatter_pmd(pmd_t *pmd)   * checking the pgd every time.   */  static void __init page_table_range_init(unsigned long start, -					 unsigned long end, pgd_t *pgd_base) +					 unsigned long end, pgd_t *pgd)  { -	pgd_t *pgd; -	int pgd_idx;  	unsigned long vaddr; - -	vaddr = start; -	pgd_idx = pgd_index(vaddr); -	pgd = pgd_base + pgd_idx; - -	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { -		pmd_t *pmd = pmd_offset(pud_offset(pgd, vaddr), vaddr); +	start = round_down(start, PMD_SIZE); +	end = round_up(end, PMD_SIZE); +	for (vaddr = start; vaddr < end; vaddr += PMD_SIZE) { +		pmd_t *pmd = get_pmd(pgd, vaddr);  		if (pmd_none(*pmd))  			assign_pte(pmd, alloc_pte()); -		vaddr += PMD_SIZE;  	}  } -#endif /* CONFIG_HIGHMEM */ - -#if CHIP_HAS_CBOX_HOME_MAP()  static int __initdata ktext_hash = 1;  /* .text pages */  static int __initdata kdata_hash = 1;  /* .data and .bss pages */  int __write_once hash_default = 1;     /* kernel allocator pages */  EXPORT_SYMBOL(hash_default);  int __write_once kstack_hash = 1;      /* if no homecaching, use h4h */ -#endif /* CHIP_HAS_CBOX_HOME_MAP */  /*   * CPUs to use to for striping the pages of kernel data.  
If hash-for-home @@ -227,14 +210,12 @@ int __write_once kdata_huge;       /* if no homecaching, small pages */  static pgprot_t __init construct_pgprot(pgprot_t prot, int home)  {  	prot = pte_set_home(prot, home); -#if CHIP_HAS_CBOX_HOME_MAP()  	if (home == PAGE_HOME_IMMUTABLE) {  		if (ktext_hash)  			prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_HASH_L3);  		else  			prot = hv_pte_set_mode(prot, HV_PTE_MODE_CACHE_NO_L3);  	} -#endif  	return prot;  } @@ -246,40 +227,28 @@ static pgprot_t __init init_pgprot(ulong address)  {  	int cpu;  	unsigned long page; -	enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET }; +	enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET }; -#if CHIP_HAS_CBOX_HOME_MAP()  	/* For kdata=huge, everything is just hash-for-home. */  	if (kdata_huge)  		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH); -#endif  	/* We map the aliased pages of permanent text inaccessible. */  	if (address < (ulong) _sinittext - CODE_DELTA)  		return PAGE_NONE; -	/* -	 * We map read-only data non-coherent for performance.  We could -	 * use neighborhood caching on TILE64, but it's not clear it's a win. -	 */ +	/* We map read-only data non-coherent for performance. */  	if ((address >= (ulong) __start_rodata &&  	     address < (ulong) __end_rodata) ||  	    address == (ulong) empty_zero_page) {  		return construct_pgprot(PAGE_KERNEL_RO, PAGE_HOME_IMMUTABLE);  	} -	/* As a performance optimization, keep the boot init stack here. */ -	if (address >= (ulong)&init_thread_union && -	    address < (ulong)&init_thread_union + THREAD_SIZE) -		return construct_pgprot(PAGE_KERNEL, smp_processor_id()); -  #ifndef __tilegx__ -#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()  	/* Force the atomic_locks[] array page to be hash-for-home. */  	if (address == (ulong) atomic_locks)  		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH);  #endif -#endif  	/*  	 * Everything else that isn't data or bss is heap, so mark it @@ -297,28 +266,18 @@ static pgprot_t __init init_pgprot(ulong address)  	if (address >= (ulong) _end || address < (ulong) _einitdata)  		return construct_pgprot(PAGE_KERNEL, initial_heap_home()); -#if CHIP_HAS_CBOX_HOME_MAP()  	/* Use hash-for-home if requested for data/bss. */  	if (kdata_hash)  		return construct_pgprot(PAGE_KERNEL, PAGE_HOME_HASH); -#endif - -	/* -	 * Make the w1data homed like heap to start with, to avoid -	 * making it part of the page-striped data area when we're just -	 * going to convert it to read-only soon anyway. -	 */ -	if (address >= (ulong)__w1data_begin && address < (ulong)__w1data_end) -		return construct_pgprot(PAGE_KERNEL, initial_heap_home());  	/*  	 * Otherwise we just hand out consecutive cpus.  To avoid  	 * requiring this function to hold state, we just walk forward from -	 * _sdata by PAGE_SIZE, skipping the readonly and init data, to reach -	 * the requested address, while walking cpu home around kdata_mask. -	 * This is typically no more than a dozen or so iterations. +	 * __end_rodata by PAGE_SIZE, skipping the readonly and init data, to +	 * reach the requested address, while walking cpu home around +	 * kdata_mask. This is typically no more than a dozen or so iterations.  	 
*/ -	page = (((ulong)__w1data_end) + PAGE_SIZE - 1) & PAGE_MASK; +	page = (((ulong)__end_rodata) + PAGE_SIZE - 1) & PAGE_MASK;  	BUG_ON(address < page || address >= (ulong)_end);  	cpu = cpumask_first(&kdata_mask);  	for (; page < address; page += PAGE_SIZE) { @@ -328,11 +287,9 @@ static pgprot_t __init init_pgprot(ulong address)  		if (page == (ulong)empty_zero_page)  			continue;  #ifndef __tilegx__ -#if !ATOMIC_LOCKS_FOUND_VIA_TABLE()  		if (page == (ulong)atomic_locks)  			continue;  #endif -#endif  		cpu = cpumask_next(cpu, &kdata_mask);  		if (cpu == NR_CPUS)  			cpu = cpumask_first(&kdata_mask); @@ -375,7 +332,7 @@ static int __init setup_ktext(char *str)  	ktext_arg_seen = 1; -	/* Default setting on Tile64: use a huge page */ +	/* Default setting: use a huge page */  	if (strcmp(str, "huge") == 0)  		pr_info("ktext: using one huge locally cached page\n"); @@ -421,31 +378,14 @@ static inline pgprot_t ktext_set_nocache(pgprot_t prot)  {  	if (!ktext_nocache)  		prot = hv_pte_set_nc(prot); -#if CHIP_HAS_NC_AND_NOALLOC_BITS()  	else  		prot = hv_pte_set_no_alloc_l2(prot); -#endif  	return prot;  } -#ifndef __tilegx__ -static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va) -{ -	return pmd_offset(pud_offset(&pgtables[pgd_index(va)], va), va); -} -#else -static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va) -{ -	pud_t *pud = pud_offset(&pgtables[pgd_index(va)], va); -	if (pud_none(*pud)) -		assign_pmd(pud, alloc_pmd()); -	return pmd_offset(pud, va); -} -#endif -  /* Temporary page table we use for staging. */  static pgd_t pgtables[PTRS_PER_PGD] - __attribute__((section(".init.page"))); + __attribute__((aligned(HV_PAGE_TABLE_ALIGN)));  /*   * This maps the physical memory to kernel virtual address space, a total @@ -463,6 +403,7 @@ static pgd_t pgtables[PTRS_PER_PGD]   */  static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  { +	unsigned long long irqmask;  	unsigned long address, pfn;  	pmd_t *pmd;  	pte_t *pte; @@ -471,7 +412,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  	struct cpumask kstripe_mask;  	int rc, i; -#if CHIP_HAS_CBOX_HOME_MAP()  	if (ktext_arg_seen && ktext_hash) {  		pr_warning("warning: \"ktext\" boot argument ignored"  			   " if \"kcache_hash\" sets up text hash-for-home\n"); @@ -488,7 +428,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  			  " kcache_hash=all or =allbutstack\n");  		kdata_huge = 0;  	} -#endif  	/*  	 * Set up a mask for cpus to use for kernel striping. 
@@ -569,8 +508,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  		}  	} -	address = MEM_SV_INTRPT; +	address = MEM_SV_START;  	pmd = get_pmd(pgtables, address); +	pfn = 0;  /* code starts at PA 0 */  	if (ktext_small) {  		/* Allocate an L2 PTE for the kernel text */  		int cpu = 0; @@ -592,11 +532,16 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  			prot = ktext_set_nocache(prot);  		} -		BUG_ON(address != (unsigned long)_stext); -		pfn = 0;  /* code starts at PA 0 */ -		pte = alloc_pte(); -		for (pte_ofs = 0; address < (unsigned long)_einittext; -		     pfn++, pte_ofs++, address += PAGE_SIZE) { +		BUG_ON(address != (unsigned long)_text); +		pte = NULL; +		for (; address < (unsigned long)_einittext; +		     pfn++, address += PAGE_SIZE) { +			pte_ofs = pte_index(address); +			if (pte_ofs == 0) { +				if (pte) +					assign_pte(pmd++, pte); +				pte = alloc_pte(); +			}  			if (!ktext_local) {  				prot = set_remote_cache_cpu(prot, cpu);  				cpu = cpumask_next(cpu, &ktext_mask); @@ -605,17 +550,16 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  			}  			pte[pte_ofs] = pfn_pte(pfn, prot);  		} -		assign_pte(pmd, pte); +		if (pte) +			assign_pte(pmd, pte);  	} else {  		pte_t pteval = pfn_pte(0, PAGE_KERNEL_EXEC);  		pteval = pte_mkhuge(pteval); -#if CHIP_HAS_CBOX_HOME_MAP()  		if (ktext_hash) {  			pteval = hv_pte_set_mode(pteval,  						 HV_PTE_MODE_CACHE_HASH_L3);  			pteval = ktext_set_nocache(pteval);  		} else -#endif /* CHIP_HAS_CBOX_HOME_MAP() */  		if (cpumask_weight(&ktext_mask) == 1) {  			pteval = set_remote_cache_cpu(pteval,  					      cpumask_first(&ktext_mask)); @@ -628,7 +572,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  		else  			pteval = hv_pte_set_mode(pteval,  						 HV_PTE_MODE_CACHE_NO_L3); -		*(pte_t *)pmd = pteval; +		for (; address < (unsigned long)_einittext; +		     pfn += PFN_DOWN(HPAGE_SIZE), address += HPAGE_SIZE) +			*(pte_t *)(pmd++) = pfn_pte(pfn, pteval);  	}  	/* Set swapper_pgprot here so it is flushed to memory right away. */ @@ -643,16 +589,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)  	 *  - install pgtables[] as the real page table  	 *  - flush the TLB so the new page table takes effect  	 */ +	irqmask = interrupt_mask_save_mask(); +	interrupt_mask_set_mask(-1ULL);  	rc = flush_and_install_context(__pa(pgtables),  				       init_pgprot((unsigned long)pgtables),  				       __get_cpu_var(current_asid),  				       cpumask_bits(my_cpu_mask)); +	interrupt_mask_restore_mask(irqmask);  	BUG_ON(rc != 0);  	/* Copy the page table back to the normal swapper_pg_dir. */  	memcpy(pgd_base, pgtables, sizeof(pgtables));  	__install_page_table(pgd_base, __get_cpu_var(current_asid),  			     swapper_pgprot); + +	/* +	 * We just read swapper_pgprot and thus brought it into the cache, +	 * with its new home & caching mode.  When we start the other CPUs, +	 * they're going to reference swapper_pgprot via their initial fake +	 * VA-is-PA mappings, which cache everything locally.  At that +	 * time, if it's in our cache with a conflicting home, the +	 * simulator's coherence checker will complain.  So, flush it out +	 * of our cache; we're not going to ever use it again anyway. 
+	 */ +	__insn_finv(&swapper_pgprot);  }  /* @@ -698,6 +658,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)  #endif /* CONFIG_HIGHMEM */ +#ifndef CONFIG_64BIT  static void __init init_free_pfn_range(unsigned long start, unsigned long end)  {  	unsigned long pfn; @@ -727,7 +688,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end)  		}  		init_page_count(page);  		__free_pages(page, order); -		totalram_pages += count; +		adjust_managed_page_count(page, count);  		page += count;  		pfn += count; @@ -740,16 +701,15 @@ static void __init set_non_bootmem_pages_init(void)  	for_each_zone(z) {  		unsigned long start, end;  		int nid = z->zone_pgdat->node_id; +#ifdef CONFIG_HIGHMEM  		int idx = zone_idx(z); +#endif  		start = z->zone_start_pfn; -		if (start == 0) -			continue;  /* bootmem */  		end = start + z->spanned_pages; -		if (idx == ZONE_NORMAL) { -			BUG_ON(start != node_start_pfn[nid]); -			start = node_free_pfn[nid]; -		} +		start = max(start, node_free_pfn[nid]); +		start = max(start, max_low_pfn); +  #ifdef CONFIG_HIGHMEM  		if (idx == ZONE_HIGHMEM)  			totalhigh_pages += z->spanned_pages; @@ -770,6 +730,7 @@ static void __init set_non_bootmem_pages_init(void)  		init_free_pfn_range(start, end);  	}  } +#endif  /*   * paging_init() sets up the page tables - note that all of lowmem is @@ -777,9 +738,6 @@ static void __init set_non_bootmem_pages_init(void)   */  void __init paging_init(void)  { -#ifdef CONFIG_HIGHMEM -	unsigned long vaddr, end; -#endif  #ifdef __tilegx__  	pud_t *pud;  #endif @@ -787,14 +745,11 @@ void __init paging_init(void)  	kernel_physical_mapping_init(pgd_base); +	/* Fixed mappings, only the page table structure has to be created. */ +	page_table_range_init(fix_to_virt(__end_of_fixed_addresses - 1), +			      FIXADDR_TOP, pgd_base); +  #ifdef CONFIG_HIGHMEM -	/* -	 * Fixed mappings, only the page table structure has to be -	 * created - mappings will be set by set_fixmap(): -	 */ -	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; -	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; -	page_table_range_init(vaddr, end, pgd_base);  	permanent_kmaps_init(pgd_base);  #endif @@ -806,7 +761,7 @@ void __init paging_init(void)  	 * changing init_mm once we get up and running, and there's no  	 * need for e.g. vmalloc_sync_all().  	 
*/ -	BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END)); +	BUILD_BUG_ON(pgd_index(VMALLOC_START) != pgd_index(VMALLOC_END - 1));  	pud = pud_offset(pgd_base + pgd_index(VMALLOC_START), VMALLOC_START);  	assign_pmd(pud, alloc_pmd());  #endif @@ -831,15 +786,13 @@ static void __init set_max_mapnr_init(void)  void __init mem_init(void)  { -	int codesize, datasize, initsize;  	int i;  #ifndef __tilegx__  	void *last;  #endif  #ifdef CONFIG_FLATMEM -	if (!mem_map) -		BUG(); +	BUG_ON(!mem_map);  #endif  #ifdef CONFIG_HIGHMEM @@ -857,24 +810,14 @@ void __init mem_init(void)  	set_max_mapnr_init();  	/* this will put all bootmem onto the freelists */ -	totalram_pages += free_all_bootmem(); +	free_all_bootmem(); +#ifndef CONFIG_64BIT  	/* count all remaining LOWMEM and give all HIGHMEM to page allocator */  	set_non_bootmem_pages_init(); +#endif -	codesize =  (unsigned long)&_etext - (unsigned long)&_text; -	datasize =  (unsigned long)&_end - (unsigned long)&_sdata; -	initsize =  (unsigned long)&_einittext - (unsigned long)&_sinittext; -	initsize += (unsigned long)&_einitdata - (unsigned long)&_sinitdata; - -	pr_info("Memory: %luk/%luk available (%dk kernel code, %dk data, %dk init, %ldk highmem)\n", -		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10), -		num_physpages << (PAGE_SHIFT-10), -		codesize >> 10, -		datasize >> 10, -		initsize >> 10, -		(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) -	       ); +	mem_init_print_info(NULL);  	/*  	 * In debug mode, dump some interesting memory mappings. @@ -885,10 +828,6 @@ void __init mem_init(void)  	printk(KERN_DEBUG "  PKMAP   %#lx - %#lx\n",  	       PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP) - 1);  #endif -#ifdef CONFIG_HUGEVMAP -	printk(KERN_DEBUG "  HUGEMAP %#lx - %#lx\n", -	       HUGE_VMAP_BASE, HUGE_VMAP_END - 1); -#endif  	printk(KERN_DEBUG "  VMALLOC %#lx - %#lx\n",  	       _VMALLOC_START, _VMALLOC_END - 1);  #ifdef __tilegx__ @@ -944,41 +883,25 @@ int remove_memory(u64 start, u64 size)  {  	return -EINVAL;  } + +#ifdef CONFIG_MEMORY_HOTREMOVE +int arch_remove_memory(u64 start, u64 size) +{ +	/* TODO */ +	return -EBUSY; +} +#endif  #endif  struct kmem_cache *pgd_cache;  void __init pgtable_cache_init(void)  { -	pgd_cache = kmem_cache_create("pgd", -				PTRS_PER_PGD*sizeof(pgd_t), -				PTRS_PER_PGD*sizeof(pgd_t), -				0, -				NULL); +	pgd_cache = kmem_cache_create("pgd", SIZEOF_PGD, SIZEOF_PGD, 0, NULL);  	if (!pgd_cache)  		panic("pgtable_cache_init(): Cannot create pgd cache");  } -#if !CHIP_HAS_COHERENT_LOCAL_CACHE() -/* - * The __w1data area holds data that is only written during initialization, - * and is read-only and thus freely cacheable thereafter.  Fix the page - * table entries that cover that region accordingly. 
- */ -static void mark_w1data_ro(void) -{ -	/* Loop over page table entries */ -	unsigned long addr = (unsigned long)__w1data_begin; -	BUG_ON((addr & (PAGE_SIZE-1)) != 0); -	for (; addr <= (unsigned long)__w1data_end - 1; addr += PAGE_SIZE) { -		unsigned long pfn = kaddr_to_pfn((void *)addr); -		pte_t *ptep = virt_to_pte(NULL, addr); -		BUG_ON(pte_huge(*ptep));   /* not relevant for kdata_huge */ -		set_pte_at(&init_mm, addr, ptep, pfn_pte(pfn, PAGE_KERNEL_RO)); -	} -} -#endif -  #ifdef CONFIG_DEBUG_PAGEALLOC  static long __write_once initfree;  #else @@ -989,7 +912,7 @@ static long __write_once initfree = 1;  static int __init set_initfree(char *str)  {  	long val; -	if (strict_strtol(str, 0, &val)) { +	if (kstrtol(str, 0, &val) == 0) {  		initfree = val;  		pr_info("initfree: %s free init pages\n",  			initfree ? "will" : "won't"); @@ -1018,7 +941,7 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)  		 */  		int pfn = kaddr_to_pfn((void *)addr);  		struct page *page = pfn_to_page(pfn); -		pte_t *ptep = virt_to_pte(NULL, addr); +		pte_t *ptep = virt_to_kpte(addr);  		if (!initfree) {  			/*  			 * If debugging page accesses then do not free @@ -1029,31 +952,24 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)  			pte_clear(&init_mm, addr, ptep);  			continue;  		} -		__ClearPageReserved(page); -		init_page_count(page);  		if (pte_huge(*ptep))  			BUG_ON(!kdata_huge);  		else  			set_pte_at(&init_mm, addr, ptep,  				   pfn_pte(pfn, PAGE_KERNEL));  		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); -		free_page(addr); -		totalram_pages++; +		free_reserved_page(page);  	}  	pr_info("Freeing %s: %ldk freed\n", what, (end - begin) >> 10);  }  void free_initmem(void)  { -	const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET; +	const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;  	/* -	 * Evict the dirty initdata on the boot cpu, evict the w1data -	 * wherever it's homed, and evict all the init code everywhere. -	 * We are guaranteed that no one will touch the init pages any -	 * more, and although other cpus may be touching the w1data, -	 * we only actually change the caching on tile64, which won't -	 * be keeping local copies in the other tiles' caches anyway. +	 * Evict the cache on all cores to avoid incoherence. +	 * We are guaranteed that no one will touch the init pages any more.  	 */  	homecache_evict(&cpu_cacheable_map); @@ -1064,26 +980,11 @@ void free_initmem(void)  	/*  	 * Free the pages mapped from 0xc0000000 that correspond to code -	 * pages from MEM_SV_INTRPT that we won't use again after init. +	 * pages from MEM_SV_START that we won't use again after init.  	 */  	free_init_pages("unused kernel text",  			(unsigned long)_sinittext - text_delta,  			(unsigned long)_einittext - text_delta); - -#if !CHIP_HAS_COHERENT_LOCAL_CACHE() -	/* -	 * Upgrade the .w1data section to globally cached. -	 * We don't do this on tilepro, since the cache architecture -	 * pretty much makes it irrelevant, and in any case we end -	 * up having racing issues with other tiles that may touch -	 * the data after we flush the cache but before we update -	 * the PTEs and flush the TLBs, causing sharer shootdowns -	 * later.  Even though this is to clean data, it seems like -	 * an unnecessary complication. -	 */ -	mark_w1data_ro(); -#endif -  	/* Do a global TLB flush so everyone sees the changes. 
*/  	flush_tlb_all();  } diff --git a/arch/tile/mm/migrate.h b/arch/tile/mm/migrate.h index cd45a0837fa..91683d97917 100644 --- a/arch/tile/mm/migrate.h +++ b/arch/tile/mm/migrate.h @@ -24,6 +24,9 @@  /*   * This function is used as a helper when setting up the initial   * page table (swapper_pg_dir). + * + * You must mask ALL interrupts prior to invoking this code, since + * you can't legally touch the stack during the cache flush.   */  extern int flush_and_install_context(HV_PhysAddr page_table, HV_PTE access,  				     HV_ASID asid, @@ -39,6 +42,9 @@ extern int flush_and_install_context(HV_PhysAddr page_table, HV_PTE access,   *   * Note that any non-NULL pointers must not point to the page that   * is handled by the stack_pte itself. + * + * You must mask ALL interrupts prior to invoking this code, since + * you can't legally touch the stack during the cache flush.   */  extern int homecache_migrate_stack_and_flush(pte_t stack_pte, unsigned long va,  				     size_t length, pte_t *stack_ptep, diff --git a/arch/tile/mm/migrate_32.S b/arch/tile/mm/migrate_32.S index f738765cd1e..772085491bf 100644 --- a/arch/tile/mm/migrate_32.S +++ b/arch/tile/mm/migrate_32.S @@ -18,6 +18,7 @@  #include <linux/linkage.h>  #include <linux/threads.h>  #include <asm/page.h> +#include <asm/thread_info.h>  #include <asm/types.h>  #include <asm/asm-offsets.h>  #include <hv/hypervisor.h> @@ -39,8 +40,7 @@  #define FRAME_R32	16  #define FRAME_R33	20  #define FRAME_R34	24 -#define FRAME_R35	28 -#define FRAME_SIZE	32 +#define FRAME_SIZE	28 @@ -65,12 +65,11 @@  #define r_my_cpumask	r5  /* Locals (callee-save); must not be more than FRAME_xxx above. */ -#define r_save_ics	r30 -#define r_context_lo	r31 -#define r_context_hi	r32 -#define r_access_lo	r33 -#define r_access_hi	r34 -#define r_asid		r35 +#define r_context_lo	r30 +#define r_context_hi	r31 +#define r_access_lo	r32 +#define r_access_hi	r33 +#define r_asid		r34  STD_ENTRY(flush_and_install_context)  	/* @@ -103,11 +102,7 @@ STD_ENTRY(flush_and_install_context)  	 sw r_tmp, r33  	 addi r_tmp, sp, FRAME_R34  	} -	{ -	 sw r_tmp, r34 -	 addi r_tmp, sp, FRAME_R35 -	} -	sw r_tmp, r35 +	sw r_tmp, r34  	/* Move some arguments to callee-save registers. */  	{ @@ -120,13 +115,6 @@ STD_ENTRY(flush_and_install_context)  	}  	move r_asid, r_asid_in -	/* Disable interrupts, since we can't use our stack. */ -	{ -	 mfspr r_save_ics, INTERRUPT_CRITICAL_SECTION -	 movei r_tmp, 1 -	} -	mtspr INTERRUPT_CRITICAL_SECTION, r_tmp -  	/* First, flush our L2 cache. */  	{  	 move r0, zero  /* cache_pa */ @@ -148,7 +136,7 @@ STD_ENTRY(flush_and_install_context)  	 move r8, zero  /* asids */  	 move r9, zero  /* asidcount */  	} -	jal hv_flush_remote +	jal _hv_flush_remote  	bnz r0, .Ldone  	/* Now install the new page table. */ @@ -162,9 +150,9 @@ STD_ENTRY(flush_and_install_context)  	}  	{  	 move r4, r_asid -	 movei r5, HV_CTX_DIRECTIO +	 moveli r5, HV_CTX_DIRECTIO | CTX_PAGE_FLAG  	} -	jal hv_install_context +	jal _hv_install_context  	bnz r0, .Ldone  	/* Finally, flush the TLB. */ @@ -174,9 +162,6 @@ STD_ENTRY(flush_and_install_context)  	}  .Ldone: -	/* Reset interrupts back how they were before. */ -	mtspr INTERRUPT_CRITICAL_SECTION, r_save_ics -  	/* Restore the callee-saved registers and return. 
*/  	addli lr, sp, FRAME_SIZE  	{ @@ -201,10 +186,6 @@ STD_ENTRY(flush_and_install_context)  	}  	{  	 lw r34, r_tmp -	 addli r_tmp, sp, FRAME_R35 -	} -	{ -	 lw r35, r_tmp  	 addi sp, sp, FRAME_SIZE  	}  	jrp lr diff --git a/arch/tile/mm/migrate_64.S b/arch/tile/mm/migrate_64.S new file mode 100644 index 00000000000..a49eee38f87 --- /dev/null +++ b/arch/tile/mm/migrate_64.S @@ -0,0 +1,167 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + *   This program is free software; you can redistribute it and/or + *   modify it under the terms of the GNU General Public License + *   as published by the Free Software Foundation, version 2. + * + *   This program is distributed in the hope that it will be useful, but + *   WITHOUT ANY WARRANTY; without even the implied warranty of + *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + *   NON INFRINGEMENT.  See the GNU General Public License for + *   more details. + * + * This routine is a helper for migrating the home of a set of pages to + * a new cpu.  See the documentation in homecache.c for more information. + */ + +#include <linux/linkage.h> +#include <linux/threads.h> +#include <asm/page.h> +#include <asm/thread_info.h> +#include <asm/types.h> +#include <asm/asm-offsets.h> +#include <hv/hypervisor.h> + +	.text + +/* + * First, some definitions that apply to all the code in the file. + */ + +/* Locals (caller-save) */ +#define r_tmp		r10 +#define r_save_sp	r11 + +/* What we save where in the stack frame; must include all callee-saves. */ +#define FRAME_SP	8 +#define FRAME_R30	16 +#define FRAME_R31	24 +#define FRAME_R32	32 +#define FRAME_SIZE	40 + + + + +/* + * On entry: + * + *   r0 the new context PA to install (moved to r_context) + *   r1 PTE to use for context access (moved to r_access) + *   r2 ASID to use for new context (moved to r_asid) + *   r3 pointer to cpumask with just this cpu set in it (r_my_cpumask) + */ + +/* Arguments (caller-save) */ +#define r_context_in	r0 +#define r_access_in	r1 +#define r_asid_in	r2 +#define r_my_cpumask	r3 + +/* Locals (callee-save); must not be more than FRAME_xxx above. */ +#define r_context	r30 +#define r_access	r31 +#define r_asid		r32 + +/* + * Caller-save locals and frame constants are the same as + * for homecache_migrate_stack_and_flush. + */ + +STD_ENTRY(flush_and_install_context) +	/* +	 * Create a stack frame; we can't touch it once we flush the +	 * cache until we install the new page table and flush the TLB. +	 */ +	{ +	 move r_save_sp, sp +	 st sp, lr +	 addi sp, sp, -FRAME_SIZE +	} +	addi r_tmp, sp, FRAME_SP +	{ +	 st r_tmp, r_save_sp +	 addi r_tmp, sp, FRAME_R30 +	} +	{ +	 st r_tmp, r30 +	 addi r_tmp, sp, FRAME_R31 +	} +	{ +	 st r_tmp, r31 +	 addi r_tmp, sp, FRAME_R32 +	} +	st r_tmp, r32 + +	/* Move some arguments to callee-save registers. */ +	{ +	 move r_context, r_context_in +	 move r_access, r_access_in +	} +	move r_asid, r_asid_in + +	/* First, flush our L2 cache. */ +	{ +	 move r0, zero  /* cache_pa */ +	 moveli r1, hw2_last(HV_FLUSH_EVICT_L2)  /* cache_control */ +	} +	{ +	 shl16insli r1, r1, hw1(HV_FLUSH_EVICT_L2) +	 move r2, r_my_cpumask  /* cache_cpumask */ +	} +	{ +	 shl16insli r1, r1, hw0(HV_FLUSH_EVICT_L2) +	 move r3, zero  /* tlb_va */ +	} +	{ +	 move r4, zero  /* tlb_length */ +	 move r5, zero  /* tlb_pgsize */ +	} +	{ +	 move r6, zero  /* tlb_cpumask */ +	 move r7, zero  /* asids */ +	} +	{ +	 move r8, zero  /* asidcount */ +	 jal _hv_flush_remote +	} +	bnez r0, 1f + +	/* Now install the new page table. 
*/ +	{ +	 move r0, r_context +	 move r1, r_access +	} +	{ +	 move r2, r_asid +	 moveli r3, HV_CTX_DIRECTIO | CTX_PAGE_FLAG +	} +	jal _hv_install_context +	bnez r0, 1f + +	/* Finally, flush the TLB. */ +	{ +	 movei r0, 0   /* preserve_global */ +	 jal hv_flush_all +	} + +1:	/* Restore the callee-saved registers and return. */ +	addli lr, sp, FRAME_SIZE +	{ +	 ld lr, lr +	 addli r_tmp, sp, FRAME_R30 +	} +	{ +	 ld r30, r_tmp +	 addli r_tmp, sp, FRAME_R31 +	} +	{ +	 ld r31, r_tmp +	 addli r_tmp, sp, FRAME_R32 +	} +	{ +	 ld r32, r_tmp +	 addi sp, sp, FRAME_SIZE +	} +	jrp lr +	STD_ENDPROC(flush_and_install_context) diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c index f96f4cec602..851a94e6ae5 100644 --- a/arch/tile/mm/mmap.c +++ b/arch/tile/mm/mmap.c @@ -58,18 +58,36 @@ void arch_pick_mmap_layout(struct mm_struct *mm)  #else  	int is_32bit = 0;  #endif +	unsigned long random_factor = 0UL; + +	/* +	 *  8 bits of randomness in 32bit mmaps, 24 address space bits +	 * 12 bits of randomness in 64bit mmaps, 28 address space bits +	 */ +	if (current->flags & PF_RANDOMIZE) { +		if (is_32bit) +			random_factor = get_random_int() % (1<<8); +		else +			random_factor = get_random_int() % (1<<12); + +		random_factor <<= PAGE_SHIFT; +	}  	/*  	 * Use standard layout if the expected stack growth is unlimited  	 * or we are running native 64 bits.  	 */ -	if (!is_32bit || rlimit(RLIMIT_STACK) == RLIM_INFINITY) { -		mm->mmap_base = TASK_UNMAPPED_BASE; +	if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) { +		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;  		mm->get_unmapped_area = arch_get_unmapped_area; -		mm->unmap_area = arch_unmap_area;  	} else {  		mm->mmap_base = mmap_base(mm);  		mm->get_unmapped_area = arch_get_unmapped_area_topdown; -		mm->unmap_area = arch_unmap_area_topdown;  	}  } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ +	unsigned long range_end = mm->brk + 0x02000000; +	return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index 1f5430c53d0..5e86eac4bfa 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c @@ -27,7 +27,6 @@  #include <linux/vmalloc.h>  #include <linux/smp.h> -#include <asm/system.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h>  #include <asm/fixmap.h> @@ -41,7 +40,7 @@   * The normal show_free_areas() is too verbose on Tile, with dozens   * of processors and often four NUMA zones each with high and lowmem.   */ -void show_mem(void) +void show_mem(unsigned int filter)  {  	struct zone *zone; @@ -62,7 +61,7 @@ void show_mem(void)  	       global_page_state(NR_PAGETABLE),  	       global_page_state(NR_BOUNCE),  	       global_page_state(NR_FILE_PAGES), -	       nr_swap_pages); +	       get_nr_swap_pages());  	for_each_zone(zone) {  		unsigned long flags, order, total = 0, largest_order = -1; @@ -84,63 +83,72 @@ void show_mem(void)  	}  } -/* - * Associate a virtual page frame with a given physical page frame - * and protection flags for that frame. +/** + * shatter_huge_page() - ensure a given address is mapped by a small page. + * + * This function converts a huge PTE mapping kernel LOWMEM into a bunch + * of small PTEs with the same caching.  No cache flush required, but we + * must do a global TLB flush. + * + * Any caller that wishes to modify a kernel mapping that might + * have been made with a huge page should call this function, + * since doing so properly avoids race conditions with installing the + * newly-shattered page and then flushing all the TLB entries. 
+ * + * @addr: Address at which to shatter any existing huge page.   */ -static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +void shatter_huge_page(unsigned long addr)  {  	pgd_t *pgd;  	pud_t *pud;  	pmd_t *pmd; -	pte_t *pte; +	unsigned long flags = 0;  /* happy compiler */ +#ifdef __PAGETABLE_PMD_FOLDED +	struct list_head *pos; +#endif -	pgd = swapper_pg_dir + pgd_index(vaddr); -	if (pgd_none(*pgd)) { -		BUG(); -		return; -	} -	pud = pud_offset(pgd, vaddr); -	if (pud_none(*pud)) { -		BUG(); +	/* Get a pointer to the pmd entry that we need to change. */ +	addr &= HPAGE_MASK; +	BUG_ON(pgd_addr_invalid(addr)); +	BUG_ON(addr < PAGE_OFFSET);  /* only for kernel LOWMEM */ +	pgd = swapper_pg_dir + pgd_index(addr); +	pud = pud_offset(pgd, addr); +	BUG_ON(!pud_present(*pud)); +	pmd = pmd_offset(pud, addr); +	BUG_ON(!pmd_present(*pmd)); +	if (!pmd_huge_page(*pmd))  		return; -	} -	pmd = pmd_offset(pud, vaddr); -	if (pmd_none(*pmd)) { -		BUG(); + +	spin_lock_irqsave(&init_mm.page_table_lock, flags); +	if (!pmd_huge_page(*pmd)) { +		/* Lost the race to convert the huge page. */ +		spin_unlock_irqrestore(&init_mm.page_table_lock, flags);  		return;  	} -	pte = pte_offset_kernel(pmd, vaddr); -	/* <pfn,flags> stored as-is, to permit clearing entries */ -	set_pte(pte, pfn_pte(pfn, flags)); - -	/* -	 * It's enough to flush this one mapping. -	 * This appears conservative since it is only called -	 * from __set_fixmap. -	 */ -	local_flush_tlb_page(NULL, vaddr, PAGE_SIZE); -} -void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags) -{ -	unsigned long address = __fix_to_virt(idx); - -	if (idx >= __end_of_fixed_addresses) { -		BUG(); -		return; +	/* Shatter the huge page into the preallocated L2 page table. */ +	pmd_populate_kernel(&init_mm, pmd, get_prealloc_pte(pmd_pfn(*pmd))); + +#ifdef __PAGETABLE_PMD_FOLDED +	/* Walk every pgd on the system and update the pmd there. */ +	spin_lock(&pgd_lock); +	list_for_each(pos, &pgd_list) { +		pmd_t *copy_pmd; +		pgd = list_to_pgd(pos) + pgd_index(addr); +		pud = pud_offset(pgd, addr); +		copy_pmd = pmd_offset(pud, addr); +		__set_pmd(copy_pmd, *pmd);  	} -	set_pte_pfn(address, phys >> PAGE_SHIFT, flags); -} +	spin_unlock(&pgd_lock); +#endif -#if defined(CONFIG_HIGHPTE) -pte_t *_pte_offset_map(pmd_t *dir, unsigned long address) -{ -	pte_t *pte = kmap_atomic(pmd_page(*dir)) + -		(pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK; -	return &pte[pte_index(address)]; +	/* Tell every cpu to notice the change. */ +	flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE, +		     cpu_possible_mask, NULL, 0); + +	/* Hold the lock until the TLB flush is finished to avoid races. */ +	spin_unlock_irqrestore(&init_mm.page_table_lock, flags);  } -#endif  /*   * List of all pgd's needed so it can invalidate entries in both cached @@ -148,9 +156,13 @@ pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)   * against pageattr.c; it is the unique case in which a valid change   * of kernel pagetables can't be lazily synchronized by vmalloc faults.   * vmalloc faults work because attached pagetables are never freed. - * The locking scheme was chosen on the basis of manfred's - * recommendations and having no core impact whatsoever. - * -- wli + * + * The lock is always taken with interrupts disabled, unlike on x86 + * and other platforms, because we need to take the lock in + * shatter_huge_page(), which may be called from an interrupt context. 
+ * We are not at risk from the tlbflush IPI deadlock that was seen on + * x86, since we use the flush_remote() API to have the hypervisor do + * the TLB flushes regardless of irq disabling.   */  DEFINE_SPINLOCK(pgd_lock);  LIST_HEAD(pgd_list); @@ -184,9 +196,9 @@ static void pgd_ctor(pgd_t *pgd)  	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);  #endif -	clone_pgd_range(pgd + KERNEL_PGD_INDEX_START, -			swapper_pg_dir + KERNEL_PGD_INDEX_START, -			KERNEL_PGD_PTRS); +	memcpy(pgd + KERNEL_PGD_INDEX_START, +	       swapper_pg_dir + KERNEL_PGD_INDEX_START, +	       KERNEL_PGD_PTRS * sizeof(pgd_t));  	pgd_list_add(pgd);  	spin_unlock_irqrestore(&pgd_lock, flags); @@ -218,20 +230,32 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)  #define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER) -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +struct page *pgtable_alloc_one(struct mm_struct *mm, unsigned long address, +			       int order)  { -	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP; +	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO;  	struct page *p; - -#ifdef CONFIG_HIGHPTE -	flags |= __GFP_HIGHMEM; -#endif +	int i;  	p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);  	if (p == NULL)  		return NULL; -	pgtable_page_ctor(p); +	if (!pgtable_page_ctor(p)) { +		__free_pages(p, L2_USER_PGTABLE_ORDER); +		return NULL; +	} + +	/* +	 * Make every page have a page_count() of one, not just the first. +	 * We don't use __GFP_COMP since it doesn't look like it works +	 * correctly with tlb_remove_page(). +	 */ +	for (i = 1; i < order; ++i) { +		init_page_count(p+i); +		inc_zone_page_state(p+i, NR_PAGETABLE); +	} +  	return p;  } @@ -240,30 +264,30 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)   * process).  We have to correct whatever pte_alloc_one() did before   * returning the pages to the allocator.   */ -void pte_free(struct mm_struct *mm, struct page *p) +void pgtable_free(struct mm_struct *mm, struct page *p, int order)  { +	int i; +  	pgtable_page_dtor(p); -	__free_pages(p, L2_USER_PGTABLE_ORDER); +	__free_page(p); + +	for (i = 1; i < order; ++i) { +		__free_page(p+i); +		dec_zone_page_state(p+i, NR_PAGETABLE); +	}  } -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, -		    unsigned long address) +void __pgtable_free_tlb(struct mmu_gather *tlb, struct page *pte, +			unsigned long address, int order)  {  	int i;  	pgtable_page_dtor(pte); -	tlb->need_flush = 1; -	if (tlb_fast_mode(tlb)) { -		struct page *pte_pages[L2_USER_PGTABLE_PAGES]; -		for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) -			pte_pages[i] = pte + i; -		free_pages_and_swap_cache(pte_pages, L2_USER_PGTABLE_PAGES); -		return; -	} -	for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) { -		tlb->pages[tlb->nr++] = pte + i; -		if (tlb->nr >= FREE_PTE_NR) -			tlb_flush_mmu(tlb, 0, 0); +	tlb_remove_page(tlb, pte); + +	for (i = 1; i < order; ++i) { +		tlb_remove_page(tlb, pte + i); +		dec_zone_page_state(pte + i, NR_PAGETABLE);  	}  } @@ -304,6 +328,17 @@ void ptep_set_wrprotect(struct mm_struct *mm,  #endif +/* + * Return a pointer to the PTE that corresponds to the given + * address in the given page table.  A NULL page table just uses + * the standard kernel page table; the preferred API in this case + * is virt_to_kpte(). + * + * The returned pointer can point to a huge page in other levels + * of the page table than the bottom, if the huge page is present + * in the page table.  
For bottom-level PTEs, the returned pointer + * can point to a PTE that is either present or not. + */  pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)  {  	pgd_t *pgd; @@ -317,13 +352,23 @@ pte_t *virt_to_pte(struct mm_struct* mm, unsigned long addr)  	pud = pud_offset(pgd, addr);  	if (!pud_present(*pud))  		return NULL; +	if (pud_huge_page(*pud)) +		return (pte_t *)pud;  	pmd = pmd_offset(pud, addr); -	if (pmd_huge_page(*pmd)) -		return (pte_t *)pmd;  	if (!pmd_present(*pmd))  		return NULL; +	if (pmd_huge_page(*pmd)) +		return (pte_t *)pmd;  	return pte_offset_kernel(pmd, addr);  } +EXPORT_SYMBOL(virt_to_pte); + +pte_t *virt_to_kpte(unsigned long kaddr) +{ +	BUG_ON(kaddr < PAGE_OFFSET); +	return virt_to_pte(NULL, kaddr); +} +EXPORT_SYMBOL(virt_to_kpte);  pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)  { @@ -346,41 +391,65 @@ int get_remote_cache_cpu(pgprot_t prot)  	return x + y * smp_width;  } -void set_pte_order(pte_t *ptep, pte_t pte, int order) +/* + * Convert a kernel VA to a PA and homing information. + */ +int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte)  { -	unsigned long pfn = pte_pfn(pte); -	struct page *page = pfn_to_page(pfn); +	struct page *page = virt_to_page(va); +	pte_t null_pte = { 0 }; + +	*cpa = __pa(va); -	/* Update the home of a PTE if necessary */ -	pte = pte_set_home(pte, page_home(page)); +	/* Note that this is not writing a page table, just returning a pte. */ +	*pte = pte_set_home(null_pte, page_home(page)); + +	return 0; /* return non-zero if not hfh? */ +} +EXPORT_SYMBOL(va_to_cpa_and_pte); +void __set_pte(pte_t *ptep, pte_t pte) +{  #ifdef __tilegx__  	*ptep = pte;  #else -	/* -	 * When setting a PTE, write the high bits first, then write -	 * the low bits.  This sets the "present" bit only after the -	 * other bits are in place.  If a particular PTE update -	 * involves transitioning from one valid PTE to another, it -	 * may be necessary to call set_pte_order() more than once, -	 * transitioning via a suitable intermediate state. -	 * Note that this sequence also means that if we are transitioning -	 * from any migrating PTE to a non-migrating one, we will not -	 * see a half-updated PTE with the migrating bit off. -	 */ -#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32 -# error Must write the present and migrating bits last -#endif -	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32); -	barrier(); -	((u32 *)ptep)[0] = (u32)(pte_val(pte)); -#endif +# if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32 +#  error Must write the present and migrating bits last +# endif +	if (pte_present(pte)) { +		((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32); +		barrier(); +		((u32 *)ptep)[0] = (u32)(pte_val(pte)); +	} else { +		((u32 *)ptep)[0] = (u32)(pte_val(pte)); +		barrier(); +		((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32); +	} +#endif /* __tilegx__ */ +} + +void set_pte(pte_t *ptep, pte_t pte) +{ +	if (pte_present(pte) && +	    (!CHIP_HAS_MMIO() || hv_pte_get_mode(pte) != HV_PTE_MODE_MMIO)) { +		/* The PTE actually references physical memory. */ +		unsigned long pfn = pte_pfn(pte); +		if (pfn_valid(pfn)) { +			/* Update the home of the PTE from the struct page. */ +			pte = pte_set_home(pte, page_home(pfn_to_page(pfn))); +		} else if (hv_pte_get_mode(pte) == 0) { +			/* remap_pfn_range(), etc, must supply PTE mode. */ +			panic("set_pte(): out-of-range PFN and mode 0\n"); +		} +	} + +	__set_pte(ptep, pte);  }  /* Can this mm load a PTE with cached_priority set? 
*/  static inline int mm_is_priority_cached(struct mm_struct *mm)  { -	return mm->context.priority_cached; +	return mm->context.priority_cached != 0;  }  /* @@ -390,8 +459,8 @@ static inline int mm_is_priority_cached(struct mm_struct *mm)  void start_mm_caching(struct mm_struct *mm)  {  	if (!mm_is_priority_cached(mm)) { -		mm->context.priority_cached = -1U; -		hv_set_caching(-1U); +		mm->context.priority_cached = -1UL; +		hv_set_caching(-1UL);  	}  } @@ -406,7 +475,7 @@ void start_mm_caching(struct mm_struct *mm)   * Presumably we'll come back later and have more luck and clear   * the value then; for now we'll just keep the cache marked for priority.   */ -static unsigned int update_priority_cached(struct mm_struct *mm) +static unsigned long update_priority_cached(struct mm_struct *mm)  {  	if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {  		struct vm_area_struct *vm; @@ -474,20 +543,13 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,  	addr = area->addr;  	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,  			       phys_addr, pgprot)) { -		remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); +		free_vm_area(area);  		return NULL;  	}  	return (__force void __iomem *) (offset + (char *)addr);  }  EXPORT_SYMBOL(ioremap_prot); -/* Map a PCI MMIO bus address into VA space. */ -void __iomem *ioremap(resource_size_t phys_addr, unsigned long size) -{ -	panic("ioremap for PCI MMIO is not supported"); -} -EXPORT_SYMBOL(ioremap); -  /* Unmap an MMIO VA mapping. */  void iounmap(volatile void __iomem *addr_in)  { @@ -505,12 +567,7 @@ void iounmap(volatile void __iomem *addr_in)  	   in parallel. Reuse of the virtual address is prevented by  	   leaving it in the global lists until we're done with it.  	   cpa takes care of the direct mappings. */ -	read_lock(&vmlist_lock); -	for (p = vmlist; p; p = p->next) { -		if (p->addr == addr) -			break; -	} -	read_unlock(&vmlist_lock); +	p = find_vm_area((void *)addr);  	if (!p) {  		pr_err("iounmap: bad address %p\n", addr);  | 