diff options
Diffstat (limited to 'arch/powerpc/mm/fault.c')
| -rw-r--r-- | arch/powerpc/mm/fault.c | 323 |
1 files changed, 216 insertions, 107 deletions
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 7b251079926..51ab9e7e6c3 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -29,16 +29,23 @@ #include <linux/module.h> #include <linux/kprobes.h> #include <linux/kdebug.h> +#include <linux/perf_event.h> +#include <linux/magic.h> +#include <linux/ratelimit.h> +#include <linux/context_tracking.h> +#include <asm/firmware.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/mmu_context.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> #include <asm/siginfo.h> +#include <asm/debug.h> +#include <mm/mmu_decl.h> +#include "icswx.h" #ifdef CONFIG_KPROBES static inline int notify_page_fault(struct pt_regs *regs) @@ -99,31 +106,80 @@ static int store_updates_sp(struct pt_regs *regs) } return 0; } +/* + * do_page_fault error handling helpers + */ -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) -static void do_dabr(struct pt_regs *regs, unsigned long address, - unsigned long error_code) +#define MM_FAULT_RETURN 0 +#define MM_FAULT_CONTINUE -1 +#define MM_FAULT_ERR(sig) (sig) + +static int do_sigbus(struct pt_regs *regs, unsigned long address) { siginfo_t info; - if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code, - 11, SIGSEGV) == NOTIFY_STOP) - return; + up_read(¤t->mm->mmap_sem); - if (debugger_dabr_match(regs)) - return; + if (user_mode(regs)) { + current->thread.trap_nr = BUS_ADRERR; + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void __user *)address; + force_sig_info(SIGBUS, &info, current); + return MM_FAULT_RETURN; + } + return MM_FAULT_ERR(SIGBUS); +} + +static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) +{ + /* + * Pagefault was interrupted by SIGKILL. We have no reason to + * continue the pagefault. + */ + if (fatal_signal_pending(current)) { + /* + * If we have retry set, the mmap semaphore will have + * alrady been released in __lock_page_or_retry(). Else + * we release it now. + */ + if (!(fault & VM_FAULT_RETRY)) + up_read(¤t->mm->mmap_sem); + /* Coming from kernel, we need to deal with uaccess fixups */ + if (user_mode(regs)) + return MM_FAULT_RETURN; + return MM_FAULT_ERR(SIGKILL); + } + + /* No fault: be happy */ + if (!(fault & VM_FAULT_ERROR)) + return MM_FAULT_CONTINUE; + + /* Out of memory */ + if (fault & VM_FAULT_OOM) { + up_read(¤t->mm->mmap_sem); - /* Clear the DABR */ - set_dabr(0); + /* + * We ran out of memory, or some other thing happened to us that + * made us unable to handle the page fault gracefully. + */ + if (!user_mode(regs)) + return MM_FAULT_ERR(SIGKILL); + pagefault_out_of_memory(); + return MM_FAULT_RETURN; + } + + /* Bus error. x86 handles HWPOISON here, we'll add this if/when + * we support the feature in HW + */ + if (fault & VM_FAULT_SIGBUS) + return do_sigbus(regs, addr); - /* Deliver the signal to userspace */ - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = TRAP_HWBKPT; - info.si_addr = (void __user *)address; - force_sig_info(SIGTRAP, &info, current); + /* We don't understand the fault code, this is fatal */ + BUG(); + return MM_FAULT_CONTINUE; } -#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/ /* * For 600- and 800-family processors, the error_code parameter is DSISR @@ -141,13 +197,16 @@ static void do_dabr(struct pt_regs *regs, unsigned long address, int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code) { + enum ctx_state prev_state = exception_enter(); struct vm_area_struct * vma; struct mm_struct *mm = current->mm; - siginfo_t info; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; int code = SEGV_MAPERR; - int is_write = 0, ret; + int is_write = 0; int trap = TRAP(regs); int is_exec = trap == 0x400; + int fault; + int rc = 0, store_update_sp = 0; #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) /* @@ -164,27 +223,49 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, is_write = error_code & ESR_DST; #endif /* CONFIG_4xx || CONFIG_BOOKE */ +#ifdef CONFIG_PPC_ICSWX + /* + * we need to do this early because this "data storage + * interrupt" does not update the DAR/DEAR so we don't want to + * look at it + */ + if (error_code & ICSWX_DSI_UCT) { + rc = acop_handle_fault(regs, address, error_code); + if (rc) + goto bail; + } +#endif /* CONFIG_PPC_ICSWX */ + if (notify_page_fault(regs)) - return 0; + goto bail; if (unlikely(debugger_fault_handler(regs))) - return 0; + goto bail; /* On a kernel SLB miss we can only check for a valid exception entry */ - if (!user_mode(regs) && (address >= TASK_SIZE)) - return SIGSEGV; + if (!user_mode(regs) && (address >= TASK_SIZE)) { + rc = SIGSEGV; + goto bail; + } -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) +#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \ + defined(CONFIG_PPC_BOOK3S_64)) if (error_code & DSISR_DABRMATCH) { - /* DABR match */ - do_dabr(regs, address, error_code); - return 0; + /* breakpoint match */ + do_break(regs, address, error_code); + goto bail; } -#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/ +#endif + + /* We restore the interrupt state now */ + if (!arch_irq_disabled_regs(regs)) + local_irq_enable(); if (in_atomic() || mm == NULL) { - if (!user_mode(regs)) - return SIGSEGV; + if (!user_mode(regs)) { + rc = SIGSEGV; + goto bail; + } /* in_atomic() in user mode is really bad, as is current->mm == NULL. */ printk(KERN_EMERG "Page fault in user mode with " @@ -194,6 +275,19 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + /* + * We want to do this outside mmap_sem, because reading code around nip + * can result in fault, which will cause a deadlock when called with + * mmap_sem held + */ + if (user_mode(regs)) + store_update_sp = store_updates_sp(regs); + + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an @@ -213,7 +307,15 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, if (!user_mode(regs) && !search_exception_tables(regs->nip)) goto bad_area_nosemaphore; +retry: down_read(&mm->mmap_sem); + } else { + /* + * The above down_read_trylock() might have succeeded in + * which case we'll have missed the might_sleep() from + * down_read(): + */ + might_sleep(); } vma = find_vma(mm, address); @@ -251,8 +353,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, * between the last mapped region and the stack will * expand the stack rather than segfaulting. */ - if (address + 2048 < uregs->gpr[1] - && (!user_mode(regs) || !store_updates_sp(regs))) + if (address + 2048 < uregs->gpr[1] && !store_update_sp) goto bad_area; } if (expand_stack(vma, address)) @@ -267,6 +368,12 @@ good_area: goto bad_area; #endif /* CONFIG_6xx */ #if defined(CONFIG_8xx) + /* 8xx sometimes need to load a invalid/non-present TLBs. + * These must be invalidated separately as linux mm don't. + */ + if (error_code & 0x40000000) /* no translation? */ + _tlbil_va(address, 0, 0, 0); + /* The MPC8xx seems to always set 0x80000000, which is * "undefined". Of those that can be set, this is the only * one which seems bad. @@ -277,48 +384,38 @@ good_area: #endif /* CONFIG_8xx */ if (is_exec) { -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) - /* protection fault */ +#ifdef CONFIG_PPC_STD_MMU + /* Protection fault on exec go straight to failure on + * Hash based MMUs as they either don't support per-page + * execute permission, or if they do, it's handled already + * at the hash level. This test would probably have to + * be removed if we change the way this works to make hash + * processors use the same I/D cache coherency mechanism + * as embedded. + */ if (error_code & DSISR_PROTFAULT) goto bad_area; +#endif /* CONFIG_PPC_STD_MMU */ + /* * Allow execution from readable areas if the MMU does not * provide separate controls over reading and executing. + * + * Note: That code used to not be enabled for 4xx/BookE. + * It is now as I/D cache coherency for these is done at + * set_pte_at() time and I see no reason why the test + * below wouldn't be valid on those processors. This -may- + * break programs compiled with a really old ABI though. */ if (!(vma->vm_flags & VM_EXEC) && (cpu_has_feature(CPU_FTR_NOEXECUTE) || !(vma->vm_flags & (VM_READ | VM_WRITE)))) goto bad_area; -#else - pte_t *ptep; - pmd_t *pmdp; - - /* Since 4xx/Book-E supports per-page execute permission, - * we lazily flush dcache to icache. */ - ptep = NULL; - if (get_pteptr(mm, address, &ptep, &pmdp)) { - spinlock_t *ptl = pte_lockptr(mm, pmdp); - spin_lock(ptl); - if (pte_present(*ptep)) { - struct page *page = pte_page(*ptep); - - if (!test_bit(PG_arch_1, &page->flags)) { - flush_dcache_icache_page(page); - set_bit(PG_arch_1, &page->flags); - } - pte_update(ptep, 0, _PAGE_HWEXEC); - _tlbie(address, mm->context.id); - pte_unmap_unlock(ptep, ptl); - up_read(&mm->mmap_sem); - return 0; - } - pte_unmap_unlock(ptep, ptl); - } -#endif /* a write */ } else if (is_write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; /* a read */ } else { /* protection fault */ @@ -333,21 +430,52 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - survive: - ret = handle_mm_fault(mm, vma, address, is_write); - if (unlikely(ret & VM_FAULT_ERROR)) { - if (ret & VM_FAULT_OOM) - goto out_of_memory; - else if (ret & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); + fault = handle_mm_fault(mm, vma, address, flags); + if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { + rc = mm_fault_error(regs, address, fault); + if (rc >= MM_FAULT_RETURN) + goto bail; + else + rc = 0; } - if (ret & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; + + /* + * Major/minor page fault accounting is only done on the + * initial attempt. If we go through a retry, it is extremely + * likely that the page will be found in page cache at that point. + */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault & VM_FAULT_MAJOR) { + current->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, + regs, address); +#ifdef CONFIG_PPC_SMLPAR + if (firmware_has_feature(FW_FEATURE_CMO)) { + u32 page_ins; + + preempt_disable(); + page_ins = be32_to_cpu(get_lppaca()->page_ins); + page_ins += 1 << PAGE_FACTOR; + get_lppaca()->page_ins = cpu_to_be32(page_ins); + preempt_enable(); + } +#endif /* CONFIG_PPC_SMLPAR */ + } else { + current->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, + regs, address); + } + if (fault & VM_FAULT_RETRY) { + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. */ + flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; + goto retry; + } + } + up_read(&mm->mmap_sem); - return 0; + goto bail; bad_area: up_read(&mm->mmap_sem); @@ -356,44 +484,20 @@ bad_area_nosemaphore: /* User mode accesses cause a SIGSEGV */ if (user_mode(regs)) { _exception(SIGSEGV, regs, code, address); - return 0; + goto bail; } - if (is_exec && (error_code & DSISR_PROTFAULT) - && printk_ratelimit()) - printk(KERN_CRIT "kernel tried to execute NX-protected" - " page (%lx) - exploit attempt? (uid: %d)\n", - address, current->uid); + if (is_exec && (error_code & DSISR_PROTFAULT)) + printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected" + " page (%lx) - exploit attempt? (uid: %d)\n", + address, from_kuid(&init_user_ns, current_uid())); - return SIGSEGV; + rc = SIGSEGV; -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", current->comm); - if (user_mode(regs)) - do_group_exit(SIGKILL); - return SIGKILL; +bail: + exception_exit(prev_state); + return rc; -do_sigbus: - up_read(&mm->mmap_sem); - if (user_mode(regs)) { - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, current); - return 0; - } - return SIGBUS; } /* @@ -404,6 +508,7 @@ do_sigbus: void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) { const struct exception_table_entry *entry; + unsigned long *stackend; /* Are we prepared to handle this fault? */ if ((entry = search_exception_tables(regs->nip)) != NULL) { @@ -432,5 +537,9 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", regs->nip); + stackend = end_of_stack(current); + if (current != &init_task && *stackend != STACK_END_MAGIC) + printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + die("Kernel access of bad area", regs, sig); } |
