1 files changed, 438 insertions, 224 deletions
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 60d74f793a1..541dc610150 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -2,7 +2,7 @@
  * Page fault handler for SH with an MMU.
  *
  *  Copyright (C) 1999  Niibe Yutaka
- *  Copyright (C) 2003 - 2007  Paul Mundt
+ *  Copyright (C) 2003 - 2012  Paul Mundt
  *
  *  Based on linux/arch/i386/mm/fault.c:
  *   Copyright (C) 1995  Linus Torvalds
@@ -15,289 +15,503 @@
 #include <linux/mm.h>
 #include <linux/hardirq.h>
 #include <linux/kprobes.h>
-#include <asm/system.h>
+#include <linux/perf_event.h>
+#include <linux/kdebug.h>
+#include <asm/io_trapped.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
-#include <asm/kgdb.h>
+#include <asm/traps.h>
+
+static inline int notify_page_fault(struct pt_regs *regs, int trap)
+{
+	int ret = 0;
+
+	if (kprobes_built_in() && !user_mode(regs)) {
+		preempt_disable();
+		if (kprobe_running() && kprobe_fault_handler(regs, trap))
+			ret = 1;
+		preempt_enable();
+	}
+
+	return ret;
+}
+
+static void
+force_sig_info_fault(int si_signo, int si_code, unsigned long address,
+		     struct task_struct *tsk)
+{
+	siginfo_t info;
+
+	info.si_signo	= si_signo;
+	info.si_errno	= 0;
+	info.si_code	= si_code;
+	info.si_addr	= (void __user *)address;
+
+	force_sig_info(si_signo, &info, tsk);
+}
 
 /*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
+ * This is useful to dump out the page tables associated with
+ * 'addr' in mm 'mm'.
  */
-asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
-					unsigned long writeaccess,
-					unsigned long address)
+static void show_pte(struct mm_struct *mm, unsigned long addr)
 {
-	struct task_struct *tsk;
-	struct mm_struct *mm;
-	struct vm_area_struct * vma;
-	int si_code;
-	int fault;
-	siginfo_t info;
+	pgd_t *pgd;
 
-	trace_hardirqs_on();
-	local_irq_enable();
+	if (mm) {
+		pgd = mm->pgd;
+	} else {
+		pgd = get_TTB();
 
-#ifdef CONFIG_SH_KGDB
-	if (kgdb_nofault && kgdb_bus_err_hook)
-		kgdb_bus_err_hook();
-#endif
+		if (unlikely(!pgd))
+			pgd = swapper_pg_dir;
+	}
 
-	tsk = current;
-	mm = tsk->mm;
-	si_code = SEGV_MAPERR;
+	printk(KERN_ALERT "pgd = %p\n", pgd);
+	pgd += pgd_index(addr);
+	printk(KERN_ALERT "[%08lx] *pgd=%0*Lx", addr,
+	       (u32)(sizeof(*pgd) * 2), (u64)pgd_val(*pgd));
 
-	if (unlikely(address >= TASK_SIZE)) {
-		/*
-		 * Synchronize this task's top level page-table
-		 * with the 'reference' page table.
-		 *
-		 * Do _not_ use "tsk" here. We might be inside
-		 * an interrupt in the middle of a task switch..
-		 */
-		int offset = pgd_index(address);
-		pgd_t *pgd, *pgd_k;
-		pud_t *pud, *pud_k;
-		pmd_t *pmd, *pmd_k;
-
-		pgd = get_TTB() + offset;
-		pgd_k = swapper_pg_dir + offset;
-
-		/* This will never happen with the folded page table. */
-		if (!pgd_present(*pgd)) {
-			if (!pgd_present(*pgd_k))
-				goto bad_area_nosemaphore;
-			set_pgd(pgd, *pgd_k);
-			return;
+	do {
+		pud_t *pud;
+		pmd_t *pmd;
+		pte_t *pte;
+
+		if (pgd_none(*pgd))
+			break;
+
+		if (pgd_bad(*pgd)) {
+			printk("(bad)");
+			break;
 		}
 
-		pud = pud_offset(pgd, address);
-		pud_k = pud_offset(pgd_k, address);
-		if (pud_present(*pud) || !pud_present(*pud_k))
-			goto bad_area_nosemaphore;
-		set_pud(pud, *pud_k);
+		pud = pud_offset(pgd, addr);
+		if (PTRS_PER_PUD != 1)
+			printk(", *pud=%0*Lx", (u32)(sizeof(*pud) * 2),
+			       (u64)pud_val(*pud));
 
-		pmd = pmd_offset(pud, address);
-		pmd_k = pmd_offset(pud_k, address);
-		if (pmd_present(*pmd) || !pmd_present(*pmd_k))
-			goto bad_area_nosemaphore;
-		set_pmd(pmd, *pmd_k);
+		if (pud_none(*pud))
+			break;
 
-		return;
+		if (pud_bad(*pud)) {
+			printk("(bad)");
+			break;
+		}
+
+		pmd = pmd_offset(pud, addr);
+		if (PTRS_PER_PMD != 1)
+			printk(", *pmd=%0*Lx", (u32)(sizeof(*pmd) * 2),
+			       (u64)pmd_val(*pmd));
+
+		if (pmd_none(*pmd))
+			break;
+
+		if (pmd_bad(*pmd)) {
+			printk("(bad)");
+			break;
+		}
+
+		/* We must not map this if we have highmem enabled */
+		if (PageHighMem(pfn_to_page(pmd_val(*pmd) >> PAGE_SHIFT)))
+			break;
+
+		pte = pte_offset_kernel(pmd, addr);
+		printk(", *pte=%0*Lx", (u32)(sizeof(*pte) * 2),
+		       (u64)pte_val(*pte));
+	} while (0);
+
+	printk("\n");
+}
+
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+	unsigned index = pgd_index(address);
+	pgd_t *pgd_k;
+	pud_t *pud, *pud_k;
+	pmd_t *pmd, *pmd_k;
+
+	pgd += index;
+	pgd_k = init_mm.pgd + index;
+
+	if (!pgd_present(*pgd_k))
+		return NULL;
+
+	pud = pud_offset(pgd, address);
+	pud_k = pud_offset(pgd_k, address);
+	if (!pud_present(*pud_k))
+		return NULL;
+
+	if (!pud_present(*pud))
+	    set_pud(pud, *pud_k);
+
+	pmd = pmd_offset(pud, address);
+	pmd_k = pmd_offset(pud_k, address);
+	if (!pmd_present(*pmd_k))
+		return NULL;
+
+	if (!pmd_present(*pmd))
+		set_pmd(pmd, *pmd_k);
+	else {
+		/*
+		 * The page tables are fully synchronised so there must
+		 * be another reason for the fault. Return NULL here to
+		 * signal that we have not taken care of the fault.
+		 */
+		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+		return NULL;
 	}
 
-	/*
-	 * If we're in an interrupt or have no user
-	 * context, we must not take the fault..
-	 */
-	if (in_atomic() || !mm)
-		goto no_context;
+	return pmd_k;
+}
 
-	down_read(&mm->mmap_sem);
+#ifdef CONFIG_SH_STORE_QUEUES
+#define __FAULT_ADDR_LIMIT	P3_ADDR_MAX
+#else
+#define __FAULT_ADDR_LIMIT	VMALLOC_END
+#endif
 
-	vma = find_vma(mm, address);
-	if (!vma)
-		goto bad_area;
-	if (vma->vm_start <= address)
-		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
-	if (expand_stack(vma, address))
-		goto bad_area;
 /*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
+ * Handle a fault on the vmalloc or module mapping area
  */
-good_area:
-	si_code = SEGV_ACCERR;
-	if (writeaccess) {
-		if (!(vma->vm_flags & VM_WRITE))
-			goto bad_area;
-	} else {
-		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-			goto bad_area;
-	}
+static noinline int vmalloc_fault(unsigned long address)
+{
+	pgd_t *pgd_k;
+	pmd_t *pmd_k;
+	pte_t *pte_k;
+
+	/* Make sure we are in vmalloc/module/P3 area: */
+	if (!(address >= VMALLOC_START && address < __FAULT_ADDR_LIMIT))
+		return -1;
 
 	/*
-	 * If for any reason at all we couldn't handle the fault,
-	 * make sure we exit gracefully rather than endlessly redo
-	 * the fault.
+	 * Synchronize this task's top level page-table
+	 * with the 'reference' page table.
+	 *
+	 * Do _not_ use "current" here. We might be inside
+	 * an interrupt in the middle of a task switch..
 	 */
-survive:
-	fault = handle_mm_fault(mm, vma, address, writeaccess);
-	if (unlikely(fault & VM_FAULT_ERROR)) {
-		if (fault & VM_FAULT_OOM)
-			goto out_of_memory;
-		else if (fault & VM_FAULT_SIGBUS)
-			goto do_sigbus;
-		BUG();
-	}
-	if (fault & VM_FAULT_MAJOR)
-		tsk->maj_flt++;
-	else
-		tsk->min_flt++;
+	pgd_k = get_TTB();
+	pmd_k = vmalloc_sync_one(pgd_k, address);
+	if (!pmd_k)
+		return -1;
 
-	up_read(&mm->mmap_sem);
-	return;
+	pte_k = pte_offset_kernel(pmd_k, address);
+	if (!pte_present(*pte_k))
+		return -1;
 
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-	up_read(&mm->mmap_sem);
+	return 0;
+}
 
-bad_area_nosemaphore:
-	if (user_mode(regs)) {
-		info.si_signo = SIGSEGV;
-		info.si_errno = 0;
-		info.si_code = si_code;
-		info.si_addr = (void *) address;
-		force_sig_info(SIGSEGV, &info, tsk);
+static void
+show_fault_oops(struct pt_regs *regs, unsigned long address)
+{
+	if (!oops_may_print())
 		return;
-	}
 
-no_context:
+	printk(KERN_ALERT "BUG: unable to handle kernel ");
+	if (address < PAGE_SIZE)
+		printk(KERN_CONT "NULL pointer dereference");
+	else
+		printk(KERN_CONT "paging request");
+
+	printk(KERN_CONT " at %08lx\n", address);
+	printk(KERN_ALERT "PC:");
+	printk_address(regs->pc, 1);
+
+	show_pte(NULL, address);
+}
+
+static noinline void
+no_context(struct pt_regs *regs, unsigned long error_code,
+	   unsigned long address)
+{
 	/* Are we prepared to handle this kernel fault?  */
 	if (fixup_exception(regs))
 		return;
 
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- *
- */
+	if (handle_trapped_io(regs, address))
+		return;
 
+	/*
+	 * Oops. The kernel tried to access some bad page. We'll have to
+	 * terminate things with extreme prejudice.
+	 */
 	bust_spinlocks(1);
 
-	if (oops_may_print()) {
-		__typeof__(pte_val(__pte(0))) page;
+	show_fault_oops(regs, address);
 
-		if (address < PAGE_SIZE)
-			printk(KERN_ALERT "Unable to handle kernel NULL "
-					  "pointer dereference");
-		else
-			printk(KERN_ALERT "Unable to handle kernel paging "
-					  "request");
-		printk(" at virtual address %08lx\n", address);
-		printk(KERN_ALERT "pc = %08lx\n", regs->pc);
-		page = (unsigned long)get_TTB();
-		if (page) {
-			page = ((__typeof__(page) *)page)[address >> PGDIR_SHIFT];
-			printk(KERN_ALERT "*pde = %08lx\n", page);
-			if (page & _PAGE_PRESENT) {
-				page &= PAGE_MASK;
-				address &= 0x003ff000;
-				page = ((__typeof__(page) *)
-						__va(page))[address >>
-							    PAGE_SHIFT];
-				printk(KERN_ALERT "*pte = %08lx\n", page);
-			}
-		}
-	}
-
-	die("Oops", regs, writeaccess);
+	die("Oops", regs, error_code);
 	bust_spinlocks(0);
 	do_exit(SIGKILL);
+}
 
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
-	up_read(&mm->mmap_sem);
-	if (is_global_init(current)) {
-		yield();
-		down_read(&mm->mmap_sem);
-		goto survive;
+static void
+__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+		       unsigned long address, int si_code)
+{
+	struct task_struct *tsk = current;
+
+	/* User mode accesses just cause a SIGSEGV */
+	if (user_mode(regs)) {
+		/*
+		 * It's possible to have interrupts off here:
+		 */
+		local_irq_enable();
+
+		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+
+		return;
 	}
-	printk("VM: killing process %s\n", tsk->comm);
-	if (user_mode(regs))
-		do_group_exit(SIGKILL);
-	goto no_context;
 
-do_sigbus:
-	up_read(&mm->mmap_sem);
+	no_context(regs, error_code, address);
+}
+
+static noinline void
+bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+		     unsigned long address)
+{
+	__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
+}
+
+static void
+__bad_area(struct pt_regs *regs, unsigned long error_code,
+	   unsigned long address, int si_code)
+{
+	struct mm_struct *mm = current->mm;
 
 	/*
-	 * Send a sigbus, regardless of whether we were in kernel
-	 * or user mode.
+	 * Something tried to access memory that isn't in our memory map..
+	 * Fix it, but check if it's kernel or user first..
 	 */
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRERR;
-	info.si_addr = (void *)address;
-	force_sig_info(SIGBUS, &info, tsk);
+	up_read(&mm->mmap_sem);
 
-	/* Kernel mode? Handle exceptions or die */
-	if (!user_mode(regs))
-		goto no_context;
+	__bad_area_nosemaphore(regs, error_code, address, si_code);
 }
 
-#ifdef CONFIG_SH_STORE_QUEUES
-/*
- * This is a special case for the SH-4 store queues, as pages for this
- * space still need to be faulted in before it's possible to flush the
- * store queue cache for writeout to the remapped region.
- */
-#define P3_ADDR_MAX		(P4SEG_STORE_QUE + 0x04000000)
-#else
-#define P3_ADDR_MAX		P4SEG
-#endif
+static noinline void
+bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+{
+	__bad_area(regs, error_code, address, SEGV_MAPERR);
+}
 
-/*
- * Called with interrupts disabled.
- */
-asmlinkage int __kprobes __do_page_fault(struct pt_regs *regs,
-					 unsigned long writeaccess,
-					 unsigned long address)
+static noinline void
+bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
+		      unsigned long address)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	pte_t entry;
-
-#ifdef CONFIG_SH_KGDB
-	if (kgdb_nofault && kgdb_bus_err_hook)
-		kgdb_bus_err_hook();
-#endif
+	__bad_area(regs, error_code, address, SEGV_ACCERR);
+}
+
+static void
+do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+{
+	struct task_struct *tsk = current;
+	struct mm_struct *mm = tsk->mm;
+
+	up_read(&mm->mmap_sem);
+
+	/* Kernel mode? Handle exceptions or die: */
+	if (!user_mode(regs))
+		no_context(regs, error_code, address);
 
+	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+
+static noinline int
+mm_fault_error(struct pt_regs *regs, unsigned long error_code,
+	       unsigned long address, unsigned int fault)
+{
 	/*
-	 * We don't take page faults for P1, P2, and parts of P4, these
-	 * are always mapped, whether it be due to legacy behaviour in
-	 * 29-bit mode, or due to PMB configuration in 32-bit mode.
+	 * Pagefault was interrupted by SIGKILL. We have no reason to
+	 * continue pagefault.
 	 */
-	if (address >= P3SEG && address < P3_ADDR_MAX) {
-		pgd = pgd_offset_k(address);
-	} else {
-		if (unlikely(address >= TASK_SIZE || !current->mm))
+	if (fatal_signal_pending(current)) {
+		if (!(fault & VM_FAULT_RETRY))
+			up_read(&current->mm->mmap_sem);
+		if (!user_mode(regs))
+			no_context(regs, error_code, address);
+		return 1;
+	}
+
+	if (!(fault & VM_FAULT_ERROR))
+		return 0;
+
+	if (fault & VM_FAULT_OOM) {
+		/* Kernel mode? Handle exceptions or die: */
+		if (!user_mode(regs)) {
+			up_read(&current->mm->mmap_sem);
+			no_context(regs, error_code, address);
 			return 1;
+		}
+		up_read(&current->mm->mmap_sem);
 
-		pgd = pgd_offset(current->mm, address);
+		/*
+		 * We ran out of memory, call the OOM killer, and return the
+		 * userspace (which will retry the fault, or kill us if we got
+		 * oom-killed):
+		 */
+		pagefault_out_of_memory();
+	} else {
+		if (fault & VM_FAULT_SIGBUS)
+			do_sigbus(regs, error_code, address);
+		else
+			BUG();
 	}
 
-	pud = pud_offset(pgd, address);
-	if (pud_none_or_clear_bad(pud))
-		return 1;
-	pmd = pmd_offset(pud, address);
-	if (pmd_none_or_clear_bad(pmd))
-		return 1;
+	return 1;
+}
+
+static inline int access_error(int error_code, struct vm_area_struct *vma)
+{
+	if (error_code & FAULT_CODE_WRITE) {
+		/* write, present and write, not present: */
+		if (unlikely(!(vma->vm_flags & VM_WRITE)))
+			return 1;
+		return 0;
+	}
 
-	pte = pte_offset_kernel(pmd, address);
-	entry = *pte;
-	if (unlikely(pte_none(entry) || pte_not_present(entry)))
+	/* ITLB miss on NX page */
+	if (unlikely((error_code & FAULT_CODE_ITLB) &&
+		     !(vma->vm_flags & VM_EXEC)))
 		return 1;
-	if (unlikely(writeaccess && !pte_write(entry)))
+
+	/* read, not present: */
+	if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
 		return 1;
 
-	if (writeaccess)
-		entry = pte_mkdirty(entry);
-	entry = pte_mkyoung(entry);
+	return 0;
+}
 
-	set_pte(pte, entry);
-	update_mmu_cache(NULL, address, entry);
+static int fault_in_kernel_space(unsigned long address)
+{
+	return address >= TASK_SIZE;
+}
 
-	return 0;
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
+					unsigned long error_code,
+					unsigned long address)
+{
+	unsigned long vec;
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	struct vm_area_struct * vma;
+	int fault;
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+	tsk = current;
+	mm = tsk->mm;
+	vec = lookup_exception_vector();
+
+	/*
+	 * We fault-in kernel-space virtual memory on-demand. The
+	 * 'reference' page table is init_mm.pgd.
+	 *
+	 * NOTE! We MUST NOT take any locks for this case. We may
+	 * be in an interrupt or a critical region, and should
+	 * only copy the information from the master page table,
+	 * nothing more.
+	 */
+	if (unlikely(fault_in_kernel_space(address))) {
+		if (vmalloc_fault(address) >= 0)
+			return;
+		if (notify_page_fault(regs, vec))
+			return;
+
+		bad_area_nosemaphore(regs, error_code, address);
+		return;
+	}
+
+	if (unlikely(notify_page_fault(regs, vec)))
+		return;
+
+	/* Only enable interrupts if they were on before the fault */
+	if ((regs->sr & SR_IMASK) != SR_IMASK)
+		local_irq_enable();
+
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+	/*
+	 * If we're in an interrupt, have no user context or are running
+	 * in an atomic region then we must not take the fault:
+	 */
+	if (unlikely(in_atomic() || !mm)) {
+		bad_area_nosemaphore(regs, error_code, address);
+		return;
+	}
+
+retry:
+	down_read(&mm->mmap_sem);
+
+	vma = find_vma(mm, address);
+	if (unlikely(!vma)) {
+		bad_area(regs, error_code, address);
+		return;
+	}
+	if (likely(vma->vm_start <= address))
+		goto good_area;
+	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+		bad_area(regs, error_code, address);
+		return;
+	}
+	if (unlikely(expand_stack(vma, address))) {
+		bad_area(regs, error_code, address);
+		return;
+	}
+
+	/*
+	 * Ok, we have a good vm_area for this memory access, so
+	 * we can handle it..
+	 */
+good_area:
+	if (unlikely(access_error(error_code, vma))) {
+		bad_area_access_error(regs, error_code, address);
+		return;
+	}
+
+	set_thread_fault_code(error_code);
+
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+	if (error_code & FAULT_CODE_WRITE)
+		flags |= FAULT_FLAG_WRITE;
+
+	/*
+	 * If for any reason at all we couldn't handle the fault,
+	 * make sure we exit gracefully rather than endlessly redo
+	 * the fault.
+	 */
+	fault = handle_mm_fault(mm, vma, address, flags);
+
+	if (unlikely(fault & (VM_FAULT_RETRY | VM_FAULT_ERROR)))
+		if (mm_fault_error(regs, error_code, address, fault))
+			return;
+
+	if (flags & FAULT_FLAG_ALLOW_RETRY) {
+		if (fault & VM_FAULT_MAJOR) {
+			tsk->maj_flt++;
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
+				      regs, address);
+		} else {
+			tsk->min_flt++;
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
+				      regs, address);
+		}
+		if (fault & VM_FAULT_RETRY) {
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+			flags |= FAULT_FLAG_TRIED;
+
+			/*
+			 * No need to up_read(&mm->mmap_sem) as we would
+			 * have already released it in __lock_page_or_retry
+			 * in mm/filemap.c.
+			 */
+			goto retry;
+		}
+	}
+
+	up_read(&mm->mmap_sem);
 }