diff options
Diffstat (limited to 'arch/powerpc/kernel')
168 files changed, 59190 insertions, 13707 deletions
diff --git a/arch/powerpc/kernel/.gitignore b/arch/powerpc/kernel/.gitignore new file mode 100644 index 00000000000..c5f676c3c22 --- /dev/null +++ b/arch/powerpc/kernel/.gitignore @@ -0,0 +1 @@ +vmlinux.lds diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 572d4f5eaac..670c312d914 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -2,55 +2,167 @@ # Makefile for the linux kernel. # +CFLAGS_prom.o = -I$(src)/../../../scripts/dtc/libfdt +CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' + +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror + ifeq ($(CONFIG_PPC64),y) -EXTRA_CFLAGS += -mno-minimal-toc +CFLAGS_prom_init.o += $(NO_MINIMAL_TOC) endif ifeq ($(CONFIG_PPC32),y) CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif -obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ - signal_32.o pmc.o -obj-$(CONFIG_PPC64) += setup_64.o binfmt_elf32.o sys_ppc32.o \ - ptrace32.o systbl.o -obj-$(CONFIG_ALTIVEC) += vecemu.o vector.o -obj-$(CONFIG_POWER4) += idle_power4.o -obj-$(CONFIG_PPC_OF) += of_device.o -obj-$(CONFIG_PPC_RTAS) += rtas.o -obj-$(CONFIG_IBMVIO) += vio.o +ifdef CONFIG_FUNCTION_TRACER +# Do not trace early boot code +CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog +CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog +CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog +CFLAGS_REMOVE_prom.o = -pg -mno-sched-epilog +# do not trace tracer code +CFLAGS_REMOVE_ftrace.o = -pg -mno-sched-epilog +# timers used by tracing +CFLAGS_REMOVE_time.o = -pg -mno-sched-epilog +endif -ifeq ($(CONFIG_PPC_MERGE),y) +obj-y := cputable.o ptrace.o syscalls.o \ + irq.o align.o signal_32.o pmc.o vdso.o \ + process.o systbl.o idle.o \ + signal.o sysfs.o cacheinfo.o time.o \ + prom.o traps.o setup-common.o \ + udbg.o misc.o io.o dma.o \ + misc_$(CONFIG_WORD_SIZE).o vdso32/ +obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ + signal_64.o ptrace32.o \ + paca.o nvram_64.o firmware.o +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 
+obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o +obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o +obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o +obj64-$(CONFIG_RELOCATABLE) += reloc_64.o +obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o +obj-$(CONFIG_PPC64) += vdso64/ +obj-$(CONFIG_ALTIVEC) += vecemu.o +obj-$(CONFIG_PPC_970_NAP) += idle_power4.o +obj-$(CONFIG_PPC_P7_NAP) += idle_power7.o +obj-$(CONFIG_PPC_OF) += of_platform.o prom_parse.o +procfs-y := proc_powerpc.o +obj-$(CONFIG_PROC_FS) += $(procfs-y) +rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI) := rtas_pci.o +obj-$(CONFIG_PPC_RTAS) += rtas.o rtas-rtc.o $(rtaspci-y-y) +obj-$(CONFIG_PPC_RTAS_DAEMON) += rtasd.o +obj-$(CONFIG_RTAS_FLASH) += rtas_flash.o +obj-$(CONFIG_RTAS_PROC) += rtas-proc.o +obj-$(CONFIG_IBMVIO) += vio.o +obj-$(CONFIG_IBMEBUS) += ibmebus.o +obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \ + eeh_driver.o eeh_event.o eeh_sysfs.o +obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_FA_DUMP) += fadump.o +ifeq ($(CONFIG_PPC32),y) +obj-$(CONFIG_E500) += idle_e500.o +endif +obj-$(CONFIG_6xx) += idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o +obj-$(CONFIG_TAU) += tau_6xx.o +obj-$(CONFIG_HIBERNATION) += swsusp.o suspend.o +ifeq ($(CONFIG_FSL_BOOKE),y) +obj-$(CONFIG_HIBERNATION) += swsusp_booke.o +else +obj-$(CONFIG_HIBERNATION) += swsusp_$(CONFIG_WORD_SIZE).o +endif +obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o +obj-$(CONFIG_MODULES) += module.o module_$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_44x) += cpu_setup_44x.o +obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o +obj-$(CONFIG_PPC_DOORBELL) += dbell.o +obj-$(CONFIG_JUMP_LABEL) += jump_label.o -extra-$(CONFIG_PPC_STD_MMU) := head_32.o -extra-$(CONFIG_PPC64) := head_64.o -extra-$(CONFIG_40x) := head_4xx.o +extra-y := head_$(CONFIG_WORD_SIZE).o +extra-$(CONFIG_40x) := head_40x.o extra-$(CONFIG_44x) := head_44x.o extra-$(CONFIG_FSL_BOOKE) := head_fsl_booke.o 
extra-$(CONFIG_8xx) := head_8xx.o extra-y += vmlinux.lds -obj-y += process.o init_task.o time.o \ - prom.o traps.o setup-common.o -obj-$(CONFIG_PPC32) += entry_32.o setup_32.o misc_32.o systbl.o -obj-$(CONFIG_PPC64) += misc_64.o -obj-$(CONFIG_PPC_OF) += prom_init.o +obj-$(CONFIG_RELOCATABLE_PPC32) += reloc_32.o + +obj-$(CONFIG_PPC32) += entry_32.o setup_32.o +obj-$(CONFIG_PPC64) += dma-iommu.o iommu.o +obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_MODULES) += ppc_ksyms.o obj-$(CONFIG_BOOTX_TEXT) += btext.o -obj-$(CONFIG_6xx) += idle_6xx.o +obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_UPROBES) += uprobes.o +obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o + +pci64-$(CONFIG_PPC64) += pci_dn.o pci-hotplug.o isa-bridge.o +obj-$(CONFIG_PCI) += pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \ + pci-common.o pci_of_scan.o +obj-$(CONFIG_PCI_MSI) += msi.o +obj-$(CONFIG_KEXEC) += machine_kexec.o crash.o \ + machine_kexec_$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_AUDIT) += audit.o +obj64-$(CONFIG_AUDIT) += compat_audit.o + +obj-$(CONFIG_PPC_IO_WORKAROUNDS) += io-workarounds.o -ifeq ($(CONFIG_PPC_ISERIES),y) -$(obj)/head_64.o: $(obj)/lparmap.s -AFLAGS_head_64.o += -I$(obj) +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o +obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o + +ifneq ($(CONFIG_PPC_INDIRECT_PIO),y) +obj-y += iomap.o endif -else -# stuff used from here for ARCH=ppc or ARCH=ppc64 -obj-$(CONFIG_PPC64) += traps.o process.o init_task.o time.o \ - setup-common.o +obj64-$(CONFIG_PPC_TRANSACTIONAL_MEM) += tm.o +obj-$(CONFIG_PPC64) += $(obj64-y) +obj-$(CONFIG_PPC32) += $(obj32-y) +ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),) +obj-y += ppc_save_regs.o endif +obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o +obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o + +# Disable GCOV in odd or sensitive code +GCOV_PROFILE_prom_init.o := n 
+GCOV_PROFILE_ftrace.o := n +GCOV_PROFILE_machine_kexec_64.o := n +GCOV_PROFILE_machine_kexec_32.o := n +GCOV_PROFILE_kprobes.o := n + extra-$(CONFIG_PPC_FPU) += fpu.o +extra-$(CONFIG_ALTIVEC) += vector.o extra-$(CONFIG_PPC64) += entry_64.o +extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init.o + +extra-y += systbl_chk.i +$(obj)/systbl.o: systbl_chk + +quiet_cmd_systbl_chk = CALL $< + cmd_systbl_chk = $(CONFIG_SHELL) $< $(obj)/systbl_chk.i + +PHONY += systbl_chk +systbl_chk: $(src)/systbl_chk.sh $(obj)/systbl_chk.i + $(call cmd,systbl_chk) + +ifeq ($(CONFIG_PPC_OF_BOOT_TRAMPOLINE),y) +$(obj)/built-in.o: prom_init_check + +quiet_cmd_prom_init_check = CALL $< + cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" "$(obj)/prom_init.o" + +PHONY += prom_init_check +prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o + $(call cmd,prom_init_check) +endif + +clean-files := vmlinux.lds diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c new file mode 100644 index 00000000000..34f55524d45 --- /dev/null +++ b/arch/powerpc/kernel/align.c @@ -0,0 +1,1042 @@ +/* align.c - handle alignment exceptions for the Power PC. + * + * Copyright (c) 1996 Paul Mackerras <paulus@cs.anu.edu.au> + * Copyright (c) 1998-1999 TiVo, Inc. + * PowerPC 403GCX modifications. + * Copyright (c) 1999 Grant Erickson <grant@lcse.umn.edu> + * PowerPC 403GCX/405GP modifications. + * Copyright (c) 2001-2002 PPC64 team, IBM Corp + * 64-bit and Power4 support + * Copyright (c) 2005 Benjamin Herrenschmidt, IBM Corp + * <benh@kernel.crashing.org> + * Merge ppc32 and ppc64 implementations + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/cache.h>
#include <asm/cputable.h>
#include <asm/emulated_ops.h>
#include <asm/switch_to.h>
#include <asm/disassemble.h>

/*
 * One entry of an instruction decode table: the size of the memory
 * operand and a set of behaviour flags for the faulting instruction.
 */
struct aligninfo {
	unsigned char len;	/* operand length in bytes; 0 = not supported */
	unsigned char flags;	/* OR of the flag bits defined below */
};


/* Table filler for encodings that are not emulated (len == 0). */
#define INVALID	{ 0, 0 }

/*
 * Bits in the flags field
 *
 * Note that several names share a bit value (S/SX/E4 are all 0x40,
 * HARD/E8/SPLT are all 0x80); the meaning depends on which table or
 * decoder produced the flags.
 */
#define LD	0	/* load */
#define ST	1	/* store */
#define SE	2	/* sign-extend value, or FP ld/st as word */
#define F	4	/* to/from fp regs */
#define U	8	/* update index register */
#define M	0x10	/* multiple load/store */
#define SW	0x20	/* byte swap */
#define S	0x40	/* single-precision fp or... */
#define SX	0x40	/* ... byte count in XER */
#define HARD	0x80	/* string, stwcx. */
#define E4	0x40	/* SPE endianness is word */
#define E8	0x80	/* SPE endianness is double word */
#define SPLT	0x80	/* VSX SPLAT load */

/* DSISR bits reported for a DCBZ instruction: */
#define DCBZ	0x5f	/* 8xx/82xx dcbz faults when cache not enabled */

/*
 * The PowerPC stores certain bits of the instruction that caused the
 * alignment exception in the DSISR register.  This array maps those
 * bits to information about the operand length and what the
 * instruction would do.
 *
 * The index is the 7-bit value fix_alignment() assembles from the
 * DSISR; the bit pattern of each index is spelled out in the comment
 * next to the entry.
 */
static struct aligninfo aligninfo[128] = {
	{ 4, LD },		/* 00 0 0000: lwz / lwarx */
	INVALID,		/* 00 0 0001 */
	{ 4, ST },		/* 00 0 0010: stw */
	INVALID,		/* 00 0 0011 */
	{ 2, LD },		/* 00 0 0100: lhz */
	{ 2, LD+SE },		/* 00 0 0101: lha */
	{ 2, ST },		/* 00 0 0110: sth */
	{ 4, LD+M },		/* 00 0 0111: lmw */
	{ 4, LD+F+S },		/* 00 0 1000: lfs */
	{ 8, LD+F },		/* 00 0 1001: lfd */
	{ 4, ST+F+S },		/* 00 0 1010: stfs */
	{ 8, ST+F },		/* 00 0 1011: stfd */
	{ 16, LD },		/* 00 0 1100: lq */
	{ 8, LD },		/* 00 0 1101: ld/ldu/lwa */
	INVALID,		/* 00 0 1110 */
	{ 8, ST },		/* 00 0 1111: std/stdu */
	{ 4, LD+U },		/* 00 1 0000: lwzu */
	INVALID,		/* 00 1 0001 */
	{ 4, ST+U },		/* 00 1 0010: stwu */
	INVALID,		/* 00 1 0011 */
	{ 2, LD+U },		/* 00 1 0100: lhzu */
	{ 2, LD+SE+U },		/* 00 1 0101: lhau */
	{ 2, ST+U },		/* 00 1 0110: sthu */
	{ 4, ST+M },		/* 00 1 0111: stmw */
	{ 4, LD+F+S+U },	/* 00 1 1000: lfsu */
	{ 8, LD+F+U },		/* 00 1 1001: lfdu */
	{ 4, ST+F+S+U },	/* 00 1 1010: stfsu */
	{ 8, ST+F+U },		/* 00 1 1011: stfdu */
	{ 16, LD+F },		/* 00 1 1100: lfdp */
	INVALID,		/* 00 1 1101 */
	{ 16, ST+F },		/* 00 1 1110: stfdp */
	INVALID,		/* 00 1 1111 */
	{ 8, LD },		/* 01 0 0000: ldx */
	INVALID,		/* 01 0 0001 */
	{ 8, ST },		/* 01 0 0010: stdx */
	INVALID,		/* 01 0 0011 */
	INVALID,		/* 01 0 0100 */
	{ 4, LD+SE },		/* 01 0 0101: lwax */
	INVALID,		/* 01 0 0110 */
	INVALID,		/* 01 0 0111 */
	{ 4, LD+M+HARD+SX },	/* 01 0 1000: lswx */
	{ 4, LD+M+HARD },	/* 01 0 1001: lswi */
	{ 4, ST+M+HARD+SX },	/* 01 0 1010: stswx */
	{ 4, ST+M+HARD },	/* 01 0 1011: stswi */
	INVALID,		/* 01 0 1100 */
	{ 8, LD+U },		/* 01 0 1101: ldu */
	INVALID,		/* 01 0 1110 */
	{ 8, ST+U },		/* 01 0 1111: stdu */
	{ 8, LD+U },		/* 01 1 0000: ldux */
	INVALID,		/* 01 1 0001 */
	{ 8, ST+U },		/* 01 1 0010: stdux */
	INVALID,		/* 01 1 0011 */
	INVALID,		/* 01 1 0100 */
	{ 4, LD+SE+U },		/* 01 1 0101: lwaux */
	INVALID,		/* 01 1 0110 */
	INVALID,		/* 01 1 0111 */
	INVALID,		/* 01 1 1000 */
	INVALID,		/* 01 1 1001 */
	INVALID,		/* 01 1 1010 */
	INVALID,		/* 01 1 1011 */
	INVALID,		/* 01 1 1100 */
	INVALID,		/* 01 1 1101 */
	INVALID,		/* 01 1 1110 */
	INVALID,		/* 01 1 1111 */
	INVALID,		/* 10 0 0000 */
	INVALID,		/* 10 0 0001 */
	INVALID,		/* 10 0 0010: stwcx. */
	INVALID,		/* 10 0 0011 */
	INVALID,		/* 10 0 0100 */
	INVALID,		/* 10 0 0101 */
	INVALID,		/* 10 0 0110 */
	INVALID,		/* 10 0 0111 */
	{ 4, LD+SW },		/* 10 0 1000: lwbrx */
	INVALID,		/* 10 0 1001 */
	{ 4, ST+SW },		/* 10 0 1010: stwbrx */
	INVALID,		/* 10 0 1011 */
	{ 2, LD+SW },		/* 10 0 1100: lhbrx */
	{ 4, LD+SE },		/* 10 0 1101  lwa */
	{ 2, ST+SW },		/* 10 0 1110: sthbrx */
	{ 16, ST },		/* 10 0 1111: stq */
	INVALID,		/* 10 1 0000 */
	INVALID,		/* 10 1 0001 */
	INVALID,		/* 10 1 0010 */
	INVALID,		/* 10 1 0011 */
	INVALID,		/* 10 1 0100 */
	INVALID,		/* 10 1 0101 */
	INVALID,		/* 10 1 0110 */
	INVALID,		/* 10 1 0111 */
	INVALID,		/* 10 1 1000 */
	INVALID,		/* 10 1 1001 */
	INVALID,		/* 10 1 1010 */
	INVALID,		/* 10 1 1011 */
	INVALID,		/* 10 1 1100 */
	INVALID,		/* 10 1 1101 */
	INVALID,		/* 10 1 1110 */
	{ 0, ST+HARD },		/* 10 1 1111: dcbz */
	{ 4, LD },		/* 11 0 0000: lwzx */
	INVALID,		/* 11 0 0001 */
	{ 4, ST },		/* 11 0 0010: stwx */
	INVALID,		/* 11 0 0011 */
	{ 2, LD },		/* 11 0 0100: lhzx */
	{ 2, LD+SE },		/* 11 0 0101: lhax */
	{ 2, ST },		/* 11 0 0110: sthx */
	INVALID,		/* 11 0 0111 */
	{ 4, LD+F+S },		/* 11 0 1000: lfsx */
	{ 8, LD+F },		/* 11 0 1001: lfdx */
	{ 4, ST+F+S },		/* 11 0 1010: stfsx */
	{ 8, ST+F },		/* 11 0 1011: stfdx */
	{ 16, LD+F },		/* 11 0 1100: lfdpx */
	{ 4, LD+F+SE },		/* 11 0 1101: lfiwax */
	{ 16, ST+F },		/* 11 0 1110: stfdpx */
	{ 4, ST+F },		/* 11 0 1111: stfiwx */
	{ 4, LD+U },		/* 11 1 0000: lwzux */
	INVALID,		/* 11 1 0001 */
	{ 4, ST+U },		/* 11 1 0010: stwux */
	INVALID,		/* 11 1 0011 */
	{ 2, LD+U },		/* 11 1 0100: lhzux */
	{ 2, LD+SE+U },		/* 11 1 0101: lhaux */
	{ 2, ST+U },		/* 11 1 0110: sthux */
	INVALID,		/* 11 1 0111 */
	{ 4, LD+F+S+U },	/* 11 1 1000: lfsux */
	{ 8, LD+F+U },		/* 11 1 1001: lfdux */
	{ 4, ST+F+S+U },	/* 11 1 1010: stfsux */
	{ 8, ST+F+U },		/* 11 1 1011: stfdux */
	INVALID,		/* 11 1 1100 */
	{ 4, LD+F },		/* 11 1 1101: lfiwzx */
	INVALID,		/* 11 1 1110 */
	INVALID,		/* 11 1 1111 */
};

/*
 * The dcbz (data cache block zero) instruction
 * gives an alignment fault if used on non-cacheable
 * memory.  We handle the fault mainly for the
 * case when we are running with the cache disabled
 * for debugging.
 *
 * The block to clear is derived from regs->dar rounded down to the
 * cache line size; the addr argument is not used.
 *
 * Returns 1 when the block was zeroed, -EFAULT when the user range is
 * not writable or a store faults.
 */
static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr)
{
	long __user *p;
	int i, size;

#ifdef __powerpc64__
	size = ppc64_caches.dline_size;
#else
	size = L1_CACHE_BYTES;
#endif
	/* "& -size" rounds the faulting address down to a line boundary */
	p = (long __user *) (regs->dar & -size);
	if (user_mode(regs) && !access_ok(VERIFY_WRITE, p, size))
		return -EFAULT;
	/* clear the line one long at a time */
	for (i = 0; i < size / sizeof(long); ++i)
		if (__put_user_inatomic(0, p+i))
			return -EFAULT;
	return 1;
}

/*
 * Emulate load & store multiple instructions
 * On 64-bit machines, these instructions only affect/use the
 * bottom 4 bytes of each register, and the loads clear the
 * top 4 bytes of the affected register.
+ */ +#ifdef __BIG_ENDIAN__ +#ifdef CONFIG_PPC64 +#define REG_BYTE(rp, i) *((u8 *)((rp) + ((i) >> 2)) + ((i) & 3) + 4) +#else +#define REG_BYTE(rp, i) *((u8 *)(rp) + (i)) +#endif +#endif + +#ifdef __LITTLE_ENDIAN__ +#define REG_BYTE(rp, i) (*(((u8 *)((rp) + ((i)>>2)) + ((i)&3)))) +#endif + +#define SWIZ_PTR(p) ((unsigned char __user *)((p) ^ swiz)) + +static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, + unsigned int reg, unsigned int nb, + unsigned int flags, unsigned int instr, + unsigned long swiz) +{ + unsigned long *rptr; + unsigned int nb0, i, bswiz; + unsigned long p; + + /* + * We do not try to emulate 8 bytes multiple as they aren't really + * available in our operating environments and we don't try to + * emulate multiples operations in kernel land as they should never + * be used/generated there at least not on unaligned boundaries + */ + if (unlikely((nb > 4) || !user_mode(regs))) + return 0; + + /* lmw, stmw, lswi/x, stswi/x */ + nb0 = 0; + if (flags & HARD) { + if (flags & SX) { + nb = regs->xer & 127; + if (nb == 0) + return 1; + } else { + unsigned long pc = regs->nip ^ (swiz & 4); + + if (__get_user_inatomic(instr, + (unsigned int __user *)pc)) + return -EFAULT; + if (swiz == 0 && (flags & SW)) + instr = cpu_to_le32(instr); + nb = (instr >> 11) & 0x1f; + if (nb == 0) + nb = 32; + } + if (nb + reg * 4 > 128) { + nb0 = nb + reg * 4 - 128; + nb = 128 - reg * 4; + } +#ifdef __LITTLE_ENDIAN__ + /* + * String instructions are endian neutral but the code + * below is not. Force byte swapping on so that the + * effects of swizzling are undone in the load/store + * loops below. + */ + flags ^= SW; +#endif + } else { + /* lwm, stmw */ + nb = (32 - reg) * 4; + } + + if (!access_ok((flags & ST ? VERIFY_WRITE: VERIFY_READ), addr, nb+nb0)) + return -EFAULT; /* bad address */ + + rptr = ®s->gpr[reg]; + p = (unsigned long) addr; + bswiz = (flags & SW)? 
3: 0; + + if (!(flags & ST)) { + /* + * This zeroes the top 4 bytes of the affected registers + * in 64-bit mode, and also zeroes out any remaining + * bytes of the last register for lsw*. + */ + memset(rptr, 0, ((nb + 3) / 4) * sizeof(unsigned long)); + if (nb0 > 0) + memset(®s->gpr[0], 0, + ((nb0 + 3) / 4) * sizeof(unsigned long)); + + for (i = 0; i < nb; ++i, ++p) + if (__get_user_inatomic(REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) + return -EFAULT; + if (nb0 > 0) { + rptr = ®s->gpr[0]; + addr += nb; + for (i = 0; i < nb0; ++i, ++p) + if (__get_user_inatomic(REG_BYTE(rptr, + i ^ bswiz), + SWIZ_PTR(p))) + return -EFAULT; + } + + } else { + for (i = 0; i < nb; ++i, ++p) + if (__put_user_inatomic(REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) + return -EFAULT; + if (nb0 > 0) { + rptr = ®s->gpr[0]; + addr += nb; + for (i = 0; i < nb0; ++i, ++p) + if (__put_user_inatomic(REG_BYTE(rptr, + i ^ bswiz), + SWIZ_PTR(p))) + return -EFAULT; + } + } + return 1; +} + +/* + * Emulate floating-point pair loads and stores. + * Only POWER6 has these instructions, and it does true little-endian, + * so we don't need the address swizzling. 
+ */ +static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg, + unsigned int flags) +{ + char *ptr0 = (char *) ¤t->thread.TS_FPR(reg); + char *ptr1 = (char *) ¤t->thread.TS_FPR(reg+1); + int i, ret, sw = 0; + + if (reg & 1) + return 0; /* invalid form: FRS/FRT must be even */ + if (flags & SW) + sw = 7; + ret = 0; + for (i = 0; i < 8; ++i) { + if (!(flags & ST)) { + ret |= __get_user(ptr0[i^sw], addr + i); + ret |= __get_user(ptr1[i^sw], addr + i + 8); + } else { + ret |= __put_user(ptr0[i^sw], addr + i); + ret |= __put_user(ptr1[i^sw], addr + i + 8); + } + } + if (ret) + return -EFAULT; + return 1; /* exception handled and fixed up */ +} + +#ifdef CONFIG_PPC64 +static int emulate_lq_stq(struct pt_regs *regs, unsigned char __user *addr, + unsigned int reg, unsigned int flags) +{ + char *ptr0 = (char *)®s->gpr[reg]; + char *ptr1 = (char *)®s->gpr[reg+1]; + int i, ret, sw = 0; + + if (reg & 1) + return 0; /* invalid form: GPR must be even */ + if (flags & SW) + sw = 7; + ret = 0; + for (i = 0; i < 8; ++i) { + if (!(flags & ST)) { + ret |= __get_user(ptr0[i^sw], addr + i); + ret |= __get_user(ptr1[i^sw], addr + i + 8); + } else { + ret |= __put_user(ptr0[i^sw], addr + i); + ret |= __put_user(ptr1[i^sw], addr + i + 8); + } + } + if (ret) + return -EFAULT; + return 1; /* exception handled and fixed up */ +} +#endif /* CONFIG_PPC64 */ + +#ifdef CONFIG_SPE + +static struct aligninfo spe_aligninfo[32] = { + { 8, LD+E8 }, /* 0 00 00: evldd[x] */ + { 8, LD+E4 }, /* 0 00 01: evldw[x] */ + { 8, LD }, /* 0 00 10: evldh[x] */ + INVALID, /* 0 00 11 */ + { 2, LD }, /* 0 01 00: evlhhesplat[x] */ + INVALID, /* 0 01 01 */ + { 2, LD }, /* 0 01 10: evlhhousplat[x] */ + { 2, LD+SE }, /* 0 01 11: evlhhossplat[x] */ + { 4, LD }, /* 0 10 00: evlwhe[x] */ + INVALID, /* 0 10 01 */ + { 4, LD }, /* 0 10 10: evlwhou[x] */ + { 4, LD+SE }, /* 0 10 11: evlwhos[x] */ + { 4, LD+E4 }, /* 0 11 00: evlwwsplat[x] */ + INVALID, /* 0 11 01 */ + { 4, LD }, /* 0 11 10: evlwhsplat[x] */ + 
INVALID, /* 0 11 11 */ + + { 8, ST+E8 }, /* 1 00 00: evstdd[x] */ + { 8, ST+E4 }, /* 1 00 01: evstdw[x] */ + { 8, ST }, /* 1 00 10: evstdh[x] */ + INVALID, /* 1 00 11 */ + INVALID, /* 1 01 00 */ + INVALID, /* 1 01 01 */ + INVALID, /* 1 01 10 */ + INVALID, /* 1 01 11 */ + { 4, ST }, /* 1 10 00: evstwhe[x] */ + INVALID, /* 1 10 01 */ + { 4, ST }, /* 1 10 10: evstwho[x] */ + INVALID, /* 1 10 11 */ + { 4, ST+E4 }, /* 1 11 00: evstwwe[x] */ + INVALID, /* 1 11 01 */ + { 4, ST+E4 }, /* 1 11 10: evstwwo[x] */ + INVALID, /* 1 11 11 */ +}; + +#define EVLDD 0x00 +#define EVLDW 0x01 +#define EVLDH 0x02 +#define EVLHHESPLAT 0x04 +#define EVLHHOUSPLAT 0x06 +#define EVLHHOSSPLAT 0x07 +#define EVLWHE 0x08 +#define EVLWHOU 0x0A +#define EVLWHOS 0x0B +#define EVLWWSPLAT 0x0C +#define EVLWHSPLAT 0x0E +#define EVSTDD 0x10 +#define EVSTDW 0x11 +#define EVSTDH 0x12 +#define EVSTWHE 0x18 +#define EVSTWHO 0x1A +#define EVSTWWE 0x1C +#define EVSTWWO 0x1E + +/* + * Emulate SPE loads and stores. + * Only Book-E has these instructions, and it does true little-endian, + * so we don't need the address swizzling. + */ +static int emulate_spe(struct pt_regs *regs, unsigned int reg, + unsigned int instr) +{ + int ret; + union { + u64 ll; + u32 w[2]; + u16 h[4]; + u8 v[8]; + } data, temp; + unsigned char __user *p, *addr; + unsigned long *evr = ¤t->thread.evr[reg]; + unsigned int nb, flags; + + instr = (instr >> 1) & 0x1f; + + /* DAR has the operand effective address */ + addr = (unsigned char __user *)regs->dar; + + nb = spe_aligninfo[instr].len; + flags = spe_aligninfo[instr].flags; + + /* Verify the address of the operand */ + if (unlikely(user_mode(regs) && + !access_ok((flags & ST ? 
VERIFY_WRITE : VERIFY_READ), + addr, nb))) + return -EFAULT; + + /* userland only */ + if (unlikely(!user_mode(regs))) + return 0; + + flush_spe_to_thread(current); + + /* If we are loading, get the data from user space, else + * get it from register values + */ + if (flags & ST) { + data.ll = 0; + switch (instr) { + case EVSTDD: + case EVSTDW: + case EVSTDH: + data.w[0] = *evr; + data.w[1] = regs->gpr[reg]; + break; + case EVSTWHE: + data.h[2] = *evr >> 16; + data.h[3] = regs->gpr[reg] >> 16; + break; + case EVSTWHO: + data.h[2] = *evr & 0xffff; + data.h[3] = regs->gpr[reg] & 0xffff; + break; + case EVSTWWE: + data.w[1] = *evr; + break; + case EVSTWWO: + data.w[1] = regs->gpr[reg]; + break; + default: + return -EINVAL; + } + } else { + temp.ll = data.ll = 0; + ret = 0; + p = addr; + + switch (nb) { + case 8: + ret |= __get_user_inatomic(temp.v[0], p++); + ret |= __get_user_inatomic(temp.v[1], p++); + ret |= __get_user_inatomic(temp.v[2], p++); + ret |= __get_user_inatomic(temp.v[3], p++); + case 4: + ret |= __get_user_inatomic(temp.v[4], p++); + ret |= __get_user_inatomic(temp.v[5], p++); + case 2: + ret |= __get_user_inatomic(temp.v[6], p++); + ret |= __get_user_inatomic(temp.v[7], p++); + if (unlikely(ret)) + return -EFAULT; + } + + switch (instr) { + case EVLDD: + case EVLDW: + case EVLDH: + data.ll = temp.ll; + break; + case EVLHHESPLAT: + data.h[0] = temp.h[3]; + data.h[2] = temp.h[3]; + break; + case EVLHHOUSPLAT: + case EVLHHOSSPLAT: + data.h[1] = temp.h[3]; + data.h[3] = temp.h[3]; + break; + case EVLWHE: + data.h[0] = temp.h[2]; + data.h[2] = temp.h[3]; + break; + case EVLWHOU: + case EVLWHOS: + data.h[1] = temp.h[2]; + data.h[3] = temp.h[3]; + break; + case EVLWWSPLAT: + data.w[0] = temp.w[1]; + data.w[1] = temp.w[1]; + break; + case EVLWHSPLAT: + data.h[0] = temp.h[2]; + data.h[1] = temp.h[2]; + data.h[2] = temp.h[3]; + data.h[3] = temp.h[3]; + break; + default: + return -EINVAL; + } + } + + if (flags & SW) { + switch (flags & 0xf0) { + case E8: + 
data.ll = swab64(data.ll); + break; + case E4: + data.w[0] = swab32(data.w[0]); + data.w[1] = swab32(data.w[1]); + break; + /* Its half word endian */ + default: + data.h[0] = swab16(data.h[0]); + data.h[1] = swab16(data.h[1]); + data.h[2] = swab16(data.h[2]); + data.h[3] = swab16(data.h[3]); + break; + } + } + + if (flags & SE) { + data.w[0] = (s16)data.h[1]; + data.w[1] = (s16)data.h[3]; + } + + /* Store result to memory or update registers */ + if (flags & ST) { + ret = 0; + p = addr; + switch (nb) { + case 8: + ret |= __put_user_inatomic(data.v[0], p++); + ret |= __put_user_inatomic(data.v[1], p++); + ret |= __put_user_inatomic(data.v[2], p++); + ret |= __put_user_inatomic(data.v[3], p++); + case 4: + ret |= __put_user_inatomic(data.v[4], p++); + ret |= __put_user_inatomic(data.v[5], p++); + case 2: + ret |= __put_user_inatomic(data.v[6], p++); + ret |= __put_user_inatomic(data.v[7], p++); + } + if (unlikely(ret)) + return -EFAULT; + } else { + *evr = data.w[0]; + regs->gpr[reg] = data.w[1]; + } + + return 1; +} +#endif /* CONFIG_SPE */ + +#ifdef CONFIG_VSX +/* + * Emulate VSX instructions... + */ +static int emulate_vsx(unsigned char __user *addr, unsigned int reg, + unsigned int areg, struct pt_regs *regs, + unsigned int flags, unsigned int length, + unsigned int elsize) +{ + char *ptr; + unsigned long *lptr; + int ret = 0; + int sw = 0; + int i, j; + + /* userland only */ + if (unlikely(!user_mode(regs))) + return 0; + + flush_vsx_to_thread(current); + + if (reg < 32) + ptr = (char *) ¤t->thread.fp_state.fpr[reg][0]; + else + ptr = (char *) ¤t->thread.vr_state.vr[reg - 32]; + + lptr = (unsigned long *) ptr; + +#ifdef __LITTLE_ENDIAN__ + if (flags & SW) { + elsize = length; + sw = length-1; + } else { + /* + * The elements are BE ordered, even in LE mode, so process + * them in reverse order. 
+ */ + addr += length - elsize; + + /* 8 byte memory accesses go in the top 8 bytes of the VR */ + if (length == 8) + ptr += 8; + } +#else + if (flags & SW) + sw = elsize-1; +#endif + + for (j = 0; j < length; j += elsize) { + for (i = 0; i < elsize; ++i) { + if (flags & ST) + ret |= __put_user(ptr[i^sw], addr + i); + else + ret |= __get_user(ptr[i^sw], addr + i); + } + ptr += elsize; +#ifdef __LITTLE_ENDIAN__ + addr -= elsize; +#else + addr += elsize; +#endif + } + +#ifdef __BIG_ENDIAN__ +#define VSX_HI 0 +#define VSX_LO 1 +#else +#define VSX_HI 1 +#define VSX_LO 0 +#endif + + if (!ret) { + if (flags & U) + regs->gpr[areg] = regs->dar; + + /* Splat load copies the same data to top and bottom 8 bytes */ + if (flags & SPLT) + lptr[VSX_LO] = lptr[VSX_HI]; + /* For 8 byte loads, zero the low 8 bytes */ + else if (!(flags & ST) && (8 == length)) + lptr[VSX_LO] = 0; + } else + return -EFAULT; + + return 1; +} +#endif + +/* + * Called on alignment exception. Attempts to fixup + * + * Return 1 on success + * Return 0 if unable to handle the interrupt + * Return -EFAULT if data address is bad + */ + +int fix_alignment(struct pt_regs *regs) +{ + unsigned int instr, nb, flags, instruction = 0; + unsigned int reg, areg; + unsigned int dsisr; + unsigned char __user *addr; + unsigned long p, swiz; + int ret, i; + union data { + u64 ll; + double dd; + unsigned char v[8]; + struct { +#ifdef __LITTLE_ENDIAN__ + int low32; + unsigned hi32; +#else + unsigned hi32; + int low32; +#endif + } x32; + struct { +#ifdef __LITTLE_ENDIAN__ + short low16; + unsigned char hi48[6]; +#else + unsigned char hi48[6]; + short low16; +#endif + } x16; + } data; + + /* + * We require a complete register set, if not, then our assembly + * is broken + */ + CHECK_FULL_REGS(regs); + + dsisr = regs->dsisr; + + /* Some processors don't provide us with a DSISR we can use here, + * let's make one up from the instruction + */ + if (cpu_has_feature(CPU_FTR_NODSISRALIGN)) { + unsigned long pc = regs->nip; + + if 
(cpu_has_feature(CPU_FTR_PPC_LE) && (regs->msr & MSR_LE)) + pc ^= 4; + if (unlikely(__get_user_inatomic(instr, + (unsigned int __user *)pc))) + return -EFAULT; + if (cpu_has_feature(CPU_FTR_REAL_LE) && (regs->msr & MSR_LE)) + instr = cpu_to_le32(instr); + dsisr = make_dsisr(instr); + instruction = instr; + } + + /* extract the operation and registers from the dsisr */ + reg = (dsisr >> 5) & 0x1f; /* source/dest register */ + areg = dsisr & 0x1f; /* register to update */ + +#ifdef CONFIG_SPE + if ((instr >> 26) == 0x4) { + PPC_WARN_ALIGNMENT(spe, regs); + return emulate_spe(regs, reg, instr); + } +#endif + + instr = (dsisr >> 10) & 0x7f; + instr |= (dsisr >> 13) & 0x60; + + /* Lookup the operation in our table */ + nb = aligninfo[instr].len; + flags = aligninfo[instr].flags; + + /* ldbrx/stdbrx overlap lfs/stfs in the DSISR unfortunately */ + if (IS_XFORM(instruction) && ((instruction >> 1) & 0x3ff) == 532) { + nb = 8; + flags = LD+SW; + } else if (IS_XFORM(instruction) && + ((instruction >> 1) & 0x3ff) == 660) { + nb = 8; + flags = ST+SW; + } + + /* Byteswap little endian loads and stores */ + swiz = 0; + if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) { + flags ^= SW; +#ifdef __BIG_ENDIAN__ + /* + * So-called "PowerPC little endian" mode works by + * swizzling addresses rather than by actually doing + * any byte-swapping. To emulate this, we XOR each + * byte address with 7. We also byte-swap, because + * the processor's address swizzling depends on the + * operand size (it xors the address with 7 for bytes, + * 6 for halfwords, 4 for words, 0 for doublewords) but + * we will xor with 7 and load/store each byte separately. 
+ */ + if (cpu_has_feature(CPU_FTR_PPC_LE)) + swiz = 7; +#endif + } + + /* DAR has the operand effective address */ + addr = (unsigned char __user *)regs->dar; + +#ifdef CONFIG_VSX + if ((instruction & 0xfc00003e) == 0x7c000018) { + unsigned int elsize; + + /* Additional register addressing bit (64 VSX vs 32 FPR/GPR) */ + reg |= (instruction & 0x1) << 5; + /* Simple inline decoder instead of a table */ + /* VSX has only 8 and 16 byte memory accesses */ + nb = 8; + if (instruction & 0x200) + nb = 16; + + /* Vector stores in little-endian mode swap individual + elements, so process them separately */ + elsize = 4; + if (instruction & 0x80) + elsize = 8; + + flags = 0; + if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) + flags |= SW; + if (instruction & 0x100) + flags |= ST; + if (instruction & 0x040) + flags |= U; + /* splat load needs a special decoder */ + if ((instruction & 0x400) == 0){ + flags |= SPLT; + nb = 8; + } + PPC_WARN_ALIGNMENT(vsx, regs); + return emulate_vsx(addr, reg, areg, regs, flags, nb, elsize); + } +#endif + /* A size of 0 indicates an instruction we don't support, with + * the exception of DCBZ which is handled as a special case here + */ + if (instr == DCBZ) { + PPC_WARN_ALIGNMENT(dcbz, regs); + return emulate_dcbz(regs, addr); + } + if (unlikely(nb == 0)) + return 0; + + /* Load/Store Multiple instructions are handled in their own + * function + */ + if (flags & M) { + PPC_WARN_ALIGNMENT(multiple, regs); + return emulate_multiple(regs, addr, reg, nb, + flags, instr, swiz); + } + + /* Verify the address of the operand */ + if (unlikely(user_mode(regs) && + !access_ok((flags & ST ? 
VERIFY_WRITE : VERIFY_READ), + addr, nb))) + return -EFAULT; + + /* Force the fprs into the save area so we can reference them */ + if (flags & F) { + /* userland only */ + if (unlikely(!user_mode(regs))) + return 0; + flush_fp_to_thread(current); + } + + if ((nb == 16)) { + if (flags & F) { + /* Special case for 16-byte FP loads and stores */ + PPC_WARN_ALIGNMENT(fp_pair, regs); + return emulate_fp_pair(addr, reg, flags); + } else { +#ifdef CONFIG_PPC64 + /* Special case for 16-byte loads and stores */ + PPC_WARN_ALIGNMENT(lq_stq, regs); + return emulate_lq_stq(regs, addr, reg, flags); +#else + return 0; +#endif + } + } + + PPC_WARN_ALIGNMENT(unaligned, regs); + + /* If we are loading, get the data from user space, else + * get it from register values + */ + if (!(flags & ST)) { + unsigned int start = 0; + + switch (nb) { + case 4: + start = offsetof(union data, x32.low32); + break; + case 2: + start = offsetof(union data, x16.low16); + break; + } + + data.ll = 0; + ret = 0; + p = (unsigned long)addr; + + for (i = 0; i < nb; i++) + ret |= __get_user_inatomic(data.v[start + i], + SWIZ_PTR(p++)); + + if (unlikely(ret)) + return -EFAULT; + + } else if (flags & F) { + data.ll = current->thread.TS_FPR(reg); + if (flags & S) { + /* Single-precision FP store requires conversion... 
*/ +#ifdef CONFIG_PPC_FPU + preempt_disable(); + enable_kernel_fp(); + cvt_df(&data.dd, (float *)&data.x32.low32); + preempt_enable(); +#else + return 0; +#endif + } + } else + data.ll = regs->gpr[reg]; + + if (flags & SW) { + switch (nb) { + case 8: + data.ll = swab64(data.ll); + break; + case 4: + data.x32.low32 = swab32(data.x32.low32); + break; + case 2: + data.x16.low16 = swab16(data.x16.low16); + break; + } + } + + /* Perform other misc operations like sign extension + * or floating point single precision conversion + */ + switch (flags & ~(U|SW)) { + case LD+SE: /* sign extending integer loads */ + case LD+F+SE: /* sign extend for lfiwax */ + if ( nb == 2 ) + data.ll = data.x16.low16; + else /* nb must be 4 */ + data.ll = data.x32.low32; + break; + + /* Single-precision FP load requires conversion... */ + case LD+F+S: +#ifdef CONFIG_PPC_FPU + preempt_disable(); + enable_kernel_fp(); + cvt_fd((float *)&data.x32.low32, &data.dd); + preempt_enable(); +#else + return 0; +#endif + break; + } + + /* Store result to memory or update registers */ + if (flags & ST) { + unsigned int start = 0; + + switch (nb) { + case 4: + start = offsetof(union data, x32.low32); + break; + case 2: + start = offsetof(union data, x16.low16); + break; + } + + ret = 0; + p = (unsigned long)addr; + + for (i = 0; i < nb; i++) + ret |= __put_user_inatomic(data.v[start + i], + SWIZ_PTR(p++)); + + if (unlikely(ret)) + return -EFAULT; + } else if (flags & F) + current->thread.TS_FPR(reg) = data.ll; + else + regs->gpr[reg] = data.ll; + + /* Update RA as needed */ + if (flags & U) + regs->gpr[areg] = regs->dar; + + return 1; +} diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 330cd783206..f5995a91221 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -13,7 +13,6 @@ * 2 of the License, or (at your option) any later version. 
*/ -#include <linux/config.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -22,13 +21,13 @@ #include <linux/types.h> #include <linux/mman.h> #include <linux/mm.h> +#include <linux/suspend.h> +#include <linux/hrtimer.h> #ifdef CONFIG_PPC64 #include <linux/time.h> #include <linux/hardirq.h> -#else -#include <linux/ptrace.h> -#include <linux/suspend.h> #endif +#include <linux/kbuild.h> #include <asm/io.h> #include <asm/page.h> @@ -37,51 +36,78 @@ #include <asm/cputable.h> #include <asm/thread_info.h> #include <asm/rtas.h> +#include <asm/vdso_datapage.h> #ifdef CONFIG_PPC64 #include <asm/paca.h> #include <asm/lppaca.h> -#include <asm/iSeries/HvLpEvent.h> #include <asm/cache.h> -#include <asm/systemcfg.h> #include <asm/compat.h> +#include <asm/mmu.h> +#include <asm/hvcall.h> +#include <asm/xics.h> +#endif +#ifdef CONFIG_PPC_POWERNV +#include <asm/opal.h> +#endif +#if defined(CONFIG_KVM) || defined(CONFIG_KVM_GUEST) +#include <linux/kvm_host.h> +#endif +#if defined(CONFIG_KVM) && defined(CONFIG_PPC_BOOK3S) +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> #endif -#define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) +#ifdef CONFIG_PPC32 +#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +#include "head_booke.h" +#endif +#endif -#define BLANK() asm volatile("\n->" : : ) +#if defined(CONFIG_PPC_FSL_BOOK3E) +#include "../mm/mmu_decl.h" +#endif int main(void) { DEFINE(THREAD, offsetof(struct task_struct, thread)); DEFINE(MM, offsetof(struct task_struct, mm)); + DEFINE(MMCONTEXTID, offsetof(struct mm_struct, context.id)); #ifdef CONFIG_PPC64 DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context)); + DEFINE(SIGSEGV, SIGSEGV); + DEFINE(NMI_MASK, NMI_MASK); + DEFINE(THREAD_DSCR, offsetof(struct thread_struct, dscr)); + DEFINE(THREAD_DSCR_INHERIT, offsetof(struct thread_struct, dscr_inherit)); + DEFINE(TASKTHREADPPR, offsetof(struct task_struct, thread.ppr)); #else - DEFINE(THREAD_INFO, offsetof(struct 
task_struct, thread_info)); - DEFINE(PTRACE, offsetof(struct task_struct, ptrace)); + DEFINE(THREAD_INFO, offsetof(struct task_struct, stack)); + DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16)); + DEFINE(KSP_LIMIT, offsetof(struct thread_struct, ksp_limit)); #endif /* CONFIG_PPC64 */ DEFINE(KSP, offsetof(struct thread_struct, ksp)); DEFINE(PT_REGS, offsetof(struct thread_struct, regs)); +#ifdef CONFIG_BOOKE + DEFINE(THREAD_NORMSAVES, offsetof(struct thread_struct, normsave[0])); +#endif DEFINE(THREAD_FPEXC_MODE, offsetof(struct thread_struct, fpexc_mode)); - DEFINE(THREAD_FPR0, offsetof(struct thread_struct, fpr[0])); - DEFINE(THREAD_FPSCR, offsetof(struct thread_struct, fpscr)); + DEFINE(THREAD_FPSTATE, offsetof(struct thread_struct, fp_state)); + DEFINE(THREAD_FPSAVEAREA, offsetof(struct thread_struct, fp_save_area)); + DEFINE(FPSTATE_FPSCR, offsetof(struct thread_fp_state, fpscr)); #ifdef CONFIG_ALTIVEC - DEFINE(THREAD_VR0, offsetof(struct thread_struct, vr[0])); + DEFINE(THREAD_VRSTATE, offsetof(struct thread_struct, vr_state)); + DEFINE(THREAD_VRSAVEAREA, offsetof(struct thread_struct, vr_save_area)); DEFINE(THREAD_VRSAVE, offsetof(struct thread_struct, vrsave)); - DEFINE(THREAD_VSCR, offsetof(struct thread_struct, vscr)); DEFINE(THREAD_USED_VR, offsetof(struct thread_struct, used_vr)); + DEFINE(VRSTATE_VSCR, offsetof(struct thread_vr_state, vscr)); #endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_VSX + DEFINE(THREAD_USED_VSR, offsetof(struct thread_struct, used_vsr)); +#endif /* CONFIG_VSX */ #ifdef CONFIG_PPC64 DEFINE(KSP_VSID, offsetof(struct thread_struct, ksp_vsid)); #else /* CONFIG_PPC64 */ DEFINE(PGDIR, offsetof(struct thread_struct, pgdir)); - DEFINE(LAST_SYSCALL, offsetof(struct thread_struct, last_syscall)); -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0)); - DEFINE(PT_PTRACED, PT_PTRACED); -#endif #ifdef CONFIG_SPE DEFINE(THREAD_EVR0, offsetof(struct thread_struct, evr[0])); 
DEFINE(THREAD_ACC, offsetof(struct thread_struct, acc)); @@ -89,15 +115,52 @@ int main(void) DEFINE(THREAD_USED_SPE, offsetof(struct thread_struct, used_spe)); #endif /* CONFIG_SPE */ #endif /* CONFIG_PPC64 */ +#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) + DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, debug.dbcr0)); +#endif +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu)); +#endif +#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) + DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu)); +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 + DEFINE(THREAD_TAR, offsetof(struct thread_struct, tar)); + DEFINE(THREAD_BESCR, offsetof(struct thread_struct, bescr)); + DEFINE(THREAD_EBBHR, offsetof(struct thread_struct, ebbhr)); + DEFINE(THREAD_EBBRR, offsetof(struct thread_struct, ebbrr)); + DEFINE(THREAD_SIAR, offsetof(struct thread_struct, siar)); + DEFINE(THREAD_SDAR, offsetof(struct thread_struct, sdar)); + DEFINE(THREAD_SIER, offsetof(struct thread_struct, sier)); + DEFINE(THREAD_MMCR0, offsetof(struct thread_struct, mmcr0)); + DEFINE(THREAD_MMCR2, offsetof(struct thread_struct, mmcr2)); +#endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + DEFINE(PACATMSCRATCH, offsetof(struct paca_struct, tm_scratch)); + DEFINE(THREAD_TM_TFHAR, offsetof(struct thread_struct, tm_tfhar)); + DEFINE(THREAD_TM_TEXASR, offsetof(struct thread_struct, tm_texasr)); + DEFINE(THREAD_TM_TFIAR, offsetof(struct thread_struct, tm_tfiar)); + DEFINE(THREAD_TM_TAR, offsetof(struct thread_struct, tm_tar)); + DEFINE(THREAD_TM_PPR, offsetof(struct thread_struct, tm_ppr)); + DEFINE(THREAD_TM_DSCR, offsetof(struct thread_struct, tm_dscr)); + DEFINE(PT_CKPT_REGS, offsetof(struct thread_struct, ckpt_regs)); + DEFINE(THREAD_TRANSACT_VRSTATE, offsetof(struct thread_struct, + transact_vr)); + DEFINE(THREAD_TRANSACT_VRSAVE, offsetof(struct thread_struct, + transact_vrsave)); + DEFINE(THREAD_TRANSACT_FPSTATE, offsetof(struct thread_struct, + 
transact_fp)); + /* Local pt_regs on stack for Transactional Memory funcs. */ + DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD + + sizeof(struct pt_regs) + 16); +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); + DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); - DEFINE(TI_SC_NOERR, offsetof(struct thread_info, syscall_noerror)); -#ifdef CONFIG_PPC32 DEFINE(TI_TASK, offsetof(struct thread_info, task)); - DEFINE(TI_EXECDOMAIN, offsetof(struct thread_info, exec_domain)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); -#endif /* CONFIG_PPC32 */ #ifdef CONFIG_PPC64 DEFINE(DCACHEL1LINESIZE, offsetof(struct ppc64_caches, dline_size)); @@ -106,42 +169,93 @@ int main(void) DEFINE(ICACHEL1LINESIZE, offsetof(struct ppc64_caches, iline_size)); DEFINE(ICACHEL1LOGLINESIZE, offsetof(struct ppc64_caches, log_iline_size)); DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page)); - DEFINE(PLATFORM, offsetof(struct systemcfg, platform)); - DEFINE(PLATFORM_LPAR, PLATFORM_LPAR); - /* paca */ DEFINE(PACA_SIZE, sizeof(struct paca_struct)); + DEFINE(PACA_LOCK_TOKEN, offsetof(struct paca_struct, lock_token)); DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index)); DEFINE(PACAPROCSTART, offsetof(struct paca_struct, cpu_start)); DEFINE(PACAKSAVE, offsetof(struct paca_struct, kstack)); DEFINE(PACACURRENT, offsetof(struct paca_struct, __current)); DEFINE(PACASAVEDMSR, offsetof(struct paca_struct, saved_msr)); - DEFINE(PACASTABREAL, offsetof(struct paca_struct, stab_real)); - DEFINE(PACASTABVIRT, offsetof(struct paca_struct, stab_addr)); DEFINE(PACASTABRR, offsetof(struct paca_struct, stab_rr)); DEFINE(PACAR1, offsetof(struct paca_struct, saved_r1)); DEFINE(PACATOC, offsetof(struct paca_struct, kernel_toc)); - DEFINE(PACAPROCENABLED, offsetof(struct paca_struct, proc_enabled)); + DEFINE(PACAKBASE, offsetof(struct paca_struct, 
kernelbase)); + DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); + DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); + DEFINE(PACAIRQHAPPENED, offsetof(struct paca_struct, irq_happened)); + DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); +#ifdef CONFIG_PPC_MM_SLICES + DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct, + context.low_slices_psize)); + DEFINE(PACAHIGHSLICEPSIZE, offsetof(struct paca_struct, + context.high_slices_psize)); + DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); +#endif /* CONFIG_PPC_MM_SLICES */ + +#ifdef CONFIG_PPC_BOOK3E + DEFINE(PACAPGD, offsetof(struct paca_struct, pgd)); + DEFINE(PACA_KERNELPGD, offsetof(struct paca_struct, kernel_pgd)); + DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen)); + DEFINE(PACA_EXTLB, offsetof(struct paca_struct, extlb)); + DEFINE(PACA_EXMC, offsetof(struct paca_struct, exmc)); + DEFINE(PACA_EXCRIT, offsetof(struct paca_struct, excrit)); + DEFINE(PACA_EXDBG, offsetof(struct paca_struct, exdbg)); + DEFINE(PACA_MC_STACK, offsetof(struct paca_struct, mc_kstack)); + DEFINE(PACA_CRIT_STACK, offsetof(struct paca_struct, crit_kstack)); + DEFINE(PACA_DBG_STACK, offsetof(struct paca_struct, dbg_kstack)); + DEFINE(PACA_TCD_PTR, offsetof(struct paca_struct, tcd_ptr)); + + DEFINE(TCD_ESEL_NEXT, + offsetof(struct tlb_core_data, esel_next)); + DEFINE(TCD_ESEL_MAX, + offsetof(struct tlb_core_data, esel_max)); + DEFINE(TCD_ESEL_FIRST, + offsetof(struct tlb_core_data, esel_first)); + DEFINE(TCD_LOCK, offsetof(struct tlb_core_data, lock)); +#endif /* CONFIG_PPC_BOOK3E */ + +#ifdef CONFIG_PPC_STD_MMU_64 + DEFINE(PACASTABREAL, offsetof(struct paca_struct, stab_real)); + DEFINE(PACASTABVIRT, offsetof(struct paca_struct, stab_addr)); DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); - DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); -#ifdef CONFIG_HUGETLB_PAGE - 
DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas)); - DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas)); -#endif /* CONFIG_HUGETLB_PAGE */ - DEFINE(PACADEFAULTDECR, offsetof(struct paca_struct, default_decr)); + DEFINE(PACAVMALLOCSLLP, offsetof(struct paca_struct, vmalloc_sllp)); +#ifdef CONFIG_PPC_MM_SLICES + DEFINE(MMUPSIZESLLP, offsetof(struct mmu_psize_def, sllp)); +#else + DEFINE(PACACONTEXTSLLP, offsetof(struct paca_struct, context.sllp)); +#endif /* CONFIG_PPC_MM_SLICES */ DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen)); DEFINE(PACA_EXMC, offsetof(struct paca_struct, exmc)); DEFINE(PACA_EXSLB, offsetof(struct paca_struct, exslb)); - DEFINE(PACA_EXDSI, offsetof(struct paca_struct, exdsi)); + DEFINE(PACALPPACAPTR, offsetof(struct paca_struct, lppaca_ptr)); + DEFINE(PACA_SLBSHADOWPTR, offsetof(struct paca_struct, slb_shadow_ptr)); + DEFINE(SLBSHADOW_STACKVSID, + offsetof(struct slb_shadow, save_area[SLB_NUM_BOLTED - 1].vsid)); + DEFINE(SLBSHADOW_STACKESID, + offsetof(struct slb_shadow, save_area[SLB_NUM_BOLTED - 1].esid)); + DEFINE(SLBSHADOW_SAVEAREA, offsetof(struct slb_shadow, save_area)); + DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use)); + DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx)); + DEFINE(LPPACA_YIELDCOUNT, offsetof(struct lppaca, yield_count)); + DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx)); +#endif /* CONFIG_PPC_STD_MMU_64 */ DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp)); - DEFINE(PACALPPACA, offsetof(struct paca_struct, lppaca)); +#ifdef CONFIG_PPC_BOOK3S_64 + DEFINE(PACAMCEMERGSP, offsetof(struct paca_struct, mc_emergency_sp)); + DEFINE(PACA_IN_MCE, offsetof(struct paca_struct, in_mce)); +#endif DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id)); - - DEFINE(LPPACASRR0, offsetof(struct lppaca, saved_srr0)); - DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1)); - DEFINE(LPPACAANYINT, offsetof(struct lppaca, 
int_dword.any_int)); - DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int)); + DEFINE(PACAKEXECSTATE, offsetof(struct paca_struct, kexec_state)); + DEFINE(PACA_DSCR, offsetof(struct paca_struct, dscr_default)); + DEFINE(PACA_STARTTIME, offsetof(struct paca_struct, starttime)); + DEFINE(PACA_STARTTIME_USER, offsetof(struct paca_struct, starttime_user)); + DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time)); + DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time)); + DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save)); + DEFINE(PACA_NAPSTATELOST, offsetof(struct paca_struct, nap_state_lost)); + DEFINE(PACA_SPRG_VDSO, offsetof(struct paca_struct, sprg_vdso)); #endif /* CONFIG_PPC64 */ /* RTAS */ @@ -149,17 +263,18 @@ int main(void) DEFINE(RTASENTRY, offsetof(struct rtas_t, entry)); /* Interrupt register frame */ - DEFINE(STACK_FRAME_OVERHEAD, STACK_FRAME_OVERHEAD); -#ifndef CONFIG_PPC64 - DEFINE(INT_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs)); -#else /* CONFIG_PPC64 */ + DEFINE(INT_FRAME_SIZE, STACK_INT_FRAME_SIZE); DEFINE(SWITCH_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs)); - /* 288 = # of volatile regs, int & fp, for leaf routines */ - /* which do not stack a frame. See the PPC64 ABI. */ - DEFINE(INT_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 288); +#ifdef CONFIG_PPC64 /* Create extra stack space for SRR0 and SRR1 when calling prom/rtas. 
*/ DEFINE(PROM_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16); DEFINE(RTAS_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16); + + /* hcall statistics */ + DEFINE(HCALL_STAT_SIZE, sizeof(struct hcall_stats)); + DEFINE(HCALL_STAT_CALLS, offsetof(struct hcall_stats, num_calls)); + DEFINE(HCALL_STAT_TB, offsetof(struct hcall_stats, tb_total)); + DEFINE(HCALL_STAT_PURR, offsetof(struct hcall_stats, purr_total)); #endif /* CONFIG_PPC64 */ DEFINE(GPR0, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[0])); DEFINE(GPR1, STACK_FRAME_OVERHEAD+offsetof(struct pt_regs, gpr[1])); @@ -228,6 +343,26 @@ int main(void) DEFINE(_SRR1, STACK_FRAME_OVERHEAD+sizeof(struct pt_regs)+8); #endif /* CONFIG_PPC64 */ +#if defined(CONFIG_PPC32) +#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) + DEFINE(EXC_LVL_SIZE, STACK_EXC_LVL_FRAME_SIZE); + DEFINE(MAS0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0)); + /* we overload MMUCR for 44x on MAS0 since they are mutually exclusive */ + DEFINE(MMUCR, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0)); + DEFINE(MAS1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas1)); + DEFINE(MAS2, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas2)); + DEFINE(MAS3, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas3)); + DEFINE(MAS6, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas6)); + DEFINE(MAS7, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas7)); + DEFINE(_SRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, srr0)); + DEFINE(_SRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, srr1)); + DEFINE(_CSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr0)); + DEFINE(_CSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr1)); + DEFINE(_DSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr0)); + DEFINE(_DSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr1)); + DEFINE(SAVED_KSP_LIMIT, STACK_INT_FRAME_SIZE+offsetof(struct 
exception_regs, saved_ksp_limit)); +#endif +#endif DEFINE(CLONE_VM, CLONE_VM); DEFINE(CLONE_UNTRACED, CLONE_UNTRACED); @@ -236,38 +371,367 @@ int main(void) #endif /* ! CONFIG_PPC64 */ /* About the CPU features table */ - DEFINE(CPU_SPEC_ENTRY_SIZE, sizeof(struct cpu_spec)); - DEFINE(CPU_SPEC_PVR_MASK, offsetof(struct cpu_spec, pvr_mask)); - DEFINE(CPU_SPEC_PVR_VALUE, offsetof(struct cpu_spec, pvr_value)); DEFINE(CPU_SPEC_FEATURES, offsetof(struct cpu_spec, cpu_features)); DEFINE(CPU_SPEC_SETUP, offsetof(struct cpu_spec, cpu_setup)); + DEFINE(CPU_SPEC_RESTORE, offsetof(struct cpu_spec, cpu_restore)); -#ifndef CONFIG_PPC64 DEFINE(pbe_address, offsetof(struct pbe, address)); DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); DEFINE(pbe_next, offsetof(struct pbe, next)); +#ifndef CONFIG_PPC64 DEFINE(TASK_SIZE, TASK_SIZE); DEFINE(NUM_USER_SEGMENTS, TASK_SIZE>>28); -#else /* CONFIG_PPC64 */ - /* systemcfg offsets for use by vdso */ - DEFINE(CFG_TB_ORIG_STAMP, offsetof(struct systemcfg, tb_orig_stamp)); - DEFINE(CFG_TB_TICKS_PER_SEC, offsetof(struct systemcfg, tb_ticks_per_sec)); - DEFINE(CFG_TB_TO_XS, offsetof(struct systemcfg, tb_to_xs)); - DEFINE(CFG_STAMP_XSEC, offsetof(struct systemcfg, stamp_xsec)); - DEFINE(CFG_TB_UPDATE_COUNT, offsetof(struct systemcfg, tb_update_count)); - DEFINE(CFG_TZ_MINUTEWEST, offsetof(struct systemcfg, tz_minuteswest)); - DEFINE(CFG_TZ_DSTTIME, offsetof(struct systemcfg, tz_dsttime)); - DEFINE(CFG_SYSCALL_MAP32, offsetof(struct systemcfg, syscall_map_32)); - DEFINE(CFG_SYSCALL_MAP64, offsetof(struct systemcfg, syscall_map_64)); +#endif /* ! 
CONFIG_PPC64 */ - /* timeval/timezone offsets for use by vdso */ + /* datapage offsets for use by vdso */ + DEFINE(CFG_TB_ORIG_STAMP, offsetof(struct vdso_data, tb_orig_stamp)); + DEFINE(CFG_TB_TICKS_PER_SEC, offsetof(struct vdso_data, tb_ticks_per_sec)); + DEFINE(CFG_TB_TO_XS, offsetof(struct vdso_data, tb_to_xs)); + DEFINE(CFG_STAMP_XSEC, offsetof(struct vdso_data, stamp_xsec)); + DEFINE(CFG_TB_UPDATE_COUNT, offsetof(struct vdso_data, tb_update_count)); + DEFINE(CFG_TZ_MINUTEWEST, offsetof(struct vdso_data, tz_minuteswest)); + DEFINE(CFG_TZ_DSTTIME, offsetof(struct vdso_data, tz_dsttime)); + DEFINE(CFG_SYSCALL_MAP32, offsetof(struct vdso_data, syscall_map_32)); + DEFINE(WTOM_CLOCK_SEC, offsetof(struct vdso_data, wtom_clock_sec)); + DEFINE(WTOM_CLOCK_NSEC, offsetof(struct vdso_data, wtom_clock_nsec)); + DEFINE(STAMP_XTIME, offsetof(struct vdso_data, stamp_xtime)); + DEFINE(STAMP_SEC_FRAC, offsetof(struct vdso_data, stamp_sec_fraction)); + DEFINE(CFG_ICACHE_BLOCKSZ, offsetof(struct vdso_data, icache_block_size)); + DEFINE(CFG_DCACHE_BLOCKSZ, offsetof(struct vdso_data, dcache_block_size)); + DEFINE(CFG_ICACHE_LOGBLOCKSZ, offsetof(struct vdso_data, icache_log_block_size)); + DEFINE(CFG_DCACHE_LOGBLOCKSZ, offsetof(struct vdso_data, dcache_log_block_size)); +#ifdef CONFIG_PPC64 + DEFINE(CFG_SYSCALL_MAP64, offsetof(struct vdso_data, syscall_map_64)); DEFINE(TVAL64_TV_SEC, offsetof(struct timeval, tv_sec)); DEFINE(TVAL64_TV_USEC, offsetof(struct timeval, tv_usec)); DEFINE(TVAL32_TV_SEC, offsetof(struct compat_timeval, tv_sec)); DEFINE(TVAL32_TV_USEC, offsetof(struct compat_timeval, tv_usec)); + DEFINE(TSPC64_TV_SEC, offsetof(struct timespec, tv_sec)); + DEFINE(TSPC64_TV_NSEC, offsetof(struct timespec, tv_nsec)); + DEFINE(TSPC32_TV_SEC, offsetof(struct compat_timespec, tv_sec)); + DEFINE(TSPC32_TV_NSEC, offsetof(struct compat_timespec, tv_nsec)); +#else + DEFINE(TVAL32_TV_SEC, offsetof(struct timeval, tv_sec)); + DEFINE(TVAL32_TV_USEC, offsetof(struct timeval, tv_usec)); 
+ DEFINE(TSPC32_TV_SEC, offsetof(struct timespec, tv_sec)); + DEFINE(TSPC32_TV_NSEC, offsetof(struct timespec, tv_nsec)); +#endif + /* timeval/timezone offsets for use by vdso */ DEFINE(TZONE_TZ_MINWEST, offsetof(struct timezone, tz_minuteswest)); DEFINE(TZONE_TZ_DSTTIME, offsetof(struct timezone, tz_dsttime)); -#endif /* CONFIG_PPC64 */ + + /* Other bits used by the vdso */ + DEFINE(CLOCK_REALTIME, CLOCK_REALTIME); + DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); + DEFINE(NSEC_PER_SEC, NSEC_PER_SEC); + DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC); + +#ifdef CONFIG_BUG + DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry)); +#endif + + DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE); + DEFINE(PTE_SIZE, sizeof(pte_t)); + +#ifdef CONFIG_KVM + DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack)); + DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); + DEFINE(VCPU_GUEST_PID, offsetof(struct kvm_vcpu, arch.pid)); + DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); + DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave)); + DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fp.fpr)); +#ifdef CONFIG_ALTIVEC + DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr.vr)); +#endif + DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer)); + DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr)); + DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); +#ifdef CONFIG_PPC_BOOK3S + DEFINE(VCPU_TAR, offsetof(struct kvm_vcpu, arch.tar)); +#endif + DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); + DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc)); +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.shregs.msr)); + DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.shregs.srr0)); + DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.shregs.srr1)); + DEFINE(VCPU_SPRG0, offsetof(struct kvm_vcpu, arch.shregs.sprg0)); + DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.shregs.sprg1)); + DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, 
arch.shregs.sprg2)); + DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3)); +#endif + DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3)); + DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4)); + DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5)); + DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6)); + DEFINE(VCPU_SHARED_SPRG7, offsetof(struct kvm_vcpu_arch_shared, sprg7)); + DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid)); + DEFINE(VCPU_SHADOW_PID1, offsetof(struct kvm_vcpu, arch.shadow_pid1)); + DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared)); + DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr)); + DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr)); +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) + DEFINE(VCPU_SHAREDBE, offsetof(struct kvm_vcpu, arch.shared_big_endian)); +#endif + + DEFINE(VCPU_SHARED_MAS0, offsetof(struct kvm_vcpu_arch_shared, mas0)); + DEFINE(VCPU_SHARED_MAS1, offsetof(struct kvm_vcpu_arch_shared, mas1)); + DEFINE(VCPU_SHARED_MAS2, offsetof(struct kvm_vcpu_arch_shared, mas2)); + DEFINE(VCPU_SHARED_MAS7_3, offsetof(struct kvm_vcpu_arch_shared, mas7_3)); + DEFINE(VCPU_SHARED_MAS4, offsetof(struct kvm_vcpu_arch_shared, mas4)); + DEFINE(VCPU_SHARED_MAS6, offsetof(struct kvm_vcpu_arch_shared, mas6)); + + DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); + DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid)); + + /* book3s */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1)); + DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid)); + DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); + DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1)); + DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock)); + DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits)); + DEFINE(KVM_LPCR, 
offsetof(struct kvm, arch.lpcr)); + DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); + DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); + DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); + DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); + DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr)); + DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty)); +#endif +#ifdef CONFIG_PPC_BOOK3S + DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id)); + DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr)); + DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr)); + DEFINE(VCPU_IC, offsetof(struct kvm_vcpu, arch.ic)); + DEFINE(VCPU_VTB, offsetof(struct kvm_vcpu, arch.vtb)); + DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr)); + DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr)); + DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor)); + DEFINE(VCPU_IAMR, offsetof(struct kvm_vcpu, arch.iamr)); + DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl)); + DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr)); + DEFINE(VCPU_DABRX, offsetof(struct kvm_vcpu, arch.dabrx)); + DEFINE(VCPU_DAWR, offsetof(struct kvm_vcpu, arch.dawr)); + DEFINE(VCPU_DAWRX, offsetof(struct kvm_vcpu, arch.dawrx)); + DEFINE(VCPU_CIABR, offsetof(struct kvm_vcpu, arch.ciabr)); + DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags)); + DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec)); + DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires)); + DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions)); + DEFINE(VCPU_CEDED, offsetof(struct kvm_vcpu, arch.ceded)); + DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded)); + DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr)); + DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc)); + DEFINE(VCPU_SPMC, offsetof(struct kvm_vcpu, arch.spmc)); + DEFINE(VCPU_SIAR, offsetof(struct kvm_vcpu, arch.siar)); + DEFINE(VCPU_SDAR, 
offsetof(struct kvm_vcpu, arch.sdar)); + DEFINE(VCPU_SIER, offsetof(struct kvm_vcpu, arch.sier)); + DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb)); + DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max)); + DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); + DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); + DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); + DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr)); + DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); + DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap)); + DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar)); + DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr)); + DEFINE(VCPU_FSCR, offsetof(struct kvm_vcpu, arch.fscr)); + DEFINE(VCPU_SHADOW_FSCR, offsetof(struct kvm_vcpu, arch.shadow_fscr)); + DEFINE(VCPU_PSPB, offsetof(struct kvm_vcpu, arch.pspb)); + DEFINE(VCPU_EBBHR, offsetof(struct kvm_vcpu, arch.ebbhr)); + DEFINE(VCPU_EBBRR, offsetof(struct kvm_vcpu, arch.ebbrr)); + DEFINE(VCPU_BESCR, offsetof(struct kvm_vcpu, arch.bescr)); + DEFINE(VCPU_CSIGR, offsetof(struct kvm_vcpu, arch.csigr)); + DEFINE(VCPU_TACR, offsetof(struct kvm_vcpu, arch.tacr)); + DEFINE(VCPU_TCSCR, offsetof(struct kvm_vcpu, arch.tcscr)); + DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop)); + DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort)); + DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1)); + DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); + DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count)); + DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); + DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads)); + DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm)); + DEFINE(VCORE_TB_OFFSET, offsetof(struct kvmppc_vcore, tb_offset)); + DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr)); + DEFINE(VCORE_PCR, offsetof(struct 
kvmppc_vcore, pcr)); + DEFINE(VCORE_DPDES, offsetof(struct kvmppc_vcore, dpdes)); + DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige)); + DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv)); + DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb)); +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + DEFINE(VCPU_TFHAR, offsetof(struct kvm_vcpu, arch.tfhar)); + DEFINE(VCPU_TFIAR, offsetof(struct kvm_vcpu, arch.tfiar)); + DEFINE(VCPU_TEXASR, offsetof(struct kvm_vcpu, arch.texasr)); + DEFINE(VCPU_GPR_TM, offsetof(struct kvm_vcpu, arch.gpr_tm)); + DEFINE(VCPU_FPRS_TM, offsetof(struct kvm_vcpu, arch.fp_tm.fpr)); + DEFINE(VCPU_VRS_TM, offsetof(struct kvm_vcpu, arch.vr_tm.vr)); + DEFINE(VCPU_VRSAVE_TM, offsetof(struct kvm_vcpu, arch.vrsave_tm)); + DEFINE(VCPU_CR_TM, offsetof(struct kvm_vcpu, arch.cr_tm)); + DEFINE(VCPU_LR_TM, offsetof(struct kvm_vcpu, arch.lr_tm)); + DEFINE(VCPU_CTR_TM, offsetof(struct kvm_vcpu, arch.ctr_tm)); + DEFINE(VCPU_AMR_TM, offsetof(struct kvm_vcpu, arch.amr_tm)); + DEFINE(VCPU_PPR_TM, offsetof(struct kvm_vcpu, arch.ppr_tm)); + DEFINE(VCPU_DSCR_TM, offsetof(struct kvm_vcpu, arch.dscr_tm)); + DEFINE(VCPU_TAR_TM, offsetof(struct kvm_vcpu, arch.tar_tm)); +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + DEFINE(PACA_SVCPU, offsetof(struct paca_struct, shadow_vcpu)); +# define SVCPU_FIELD(x, f) DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f)) +#else +# define SVCPU_FIELD(x, f) +#endif +# define HSTATE_FIELD(x, f) DEFINE(x, offsetof(struct paca_struct, kvm_hstate.f)) +#else /* 32-bit */ +# define SVCPU_FIELD(x, f) DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, f)) +# define HSTATE_FIELD(x, f) DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, hstate.f)) +#endif + + SVCPU_FIELD(SVCPU_CR, cr); + SVCPU_FIELD(SVCPU_XER, xer); + SVCPU_FIELD(SVCPU_CTR, ctr); + SVCPU_FIELD(SVCPU_LR, lr); + SVCPU_FIELD(SVCPU_PC, pc); + SVCPU_FIELD(SVCPU_R0, gpr[0]); + SVCPU_FIELD(SVCPU_R1, gpr[1]); + SVCPU_FIELD(SVCPU_R2, gpr[2]); + 
SVCPU_FIELD(SVCPU_R3, gpr[3]); + SVCPU_FIELD(SVCPU_R4, gpr[4]); + SVCPU_FIELD(SVCPU_R5, gpr[5]); + SVCPU_FIELD(SVCPU_R6, gpr[6]); + SVCPU_FIELD(SVCPU_R7, gpr[7]); + SVCPU_FIELD(SVCPU_R8, gpr[8]); + SVCPU_FIELD(SVCPU_R9, gpr[9]); + SVCPU_FIELD(SVCPU_R10, gpr[10]); + SVCPU_FIELD(SVCPU_R11, gpr[11]); + SVCPU_FIELD(SVCPU_R12, gpr[12]); + SVCPU_FIELD(SVCPU_R13, gpr[13]); + SVCPU_FIELD(SVCPU_FAULT_DSISR, fault_dsisr); + SVCPU_FIELD(SVCPU_FAULT_DAR, fault_dar); + SVCPU_FIELD(SVCPU_LAST_INST, last_inst); + SVCPU_FIELD(SVCPU_SHADOW_SRR1, shadow_srr1); +#ifdef CONFIG_PPC_BOOK3S_32 + SVCPU_FIELD(SVCPU_SR, sr); +#endif +#ifdef CONFIG_PPC64 + SVCPU_FIELD(SVCPU_SLB, slb); + SVCPU_FIELD(SVCPU_SLB_MAX, slb_max); + SVCPU_FIELD(SVCPU_SHADOW_FSCR, shadow_fscr); +#endif + + HSTATE_FIELD(HSTATE_HOST_R1, host_r1); + HSTATE_FIELD(HSTATE_HOST_R2, host_r2); + HSTATE_FIELD(HSTATE_HOST_MSR, host_msr); + HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler); + HSTATE_FIELD(HSTATE_SCRATCH0, scratch0); + HSTATE_FIELD(HSTATE_SCRATCH1, scratch1); + HSTATE_FIELD(HSTATE_SCRATCH2, scratch2); + HSTATE_FIELD(HSTATE_IN_GUEST, in_guest); + HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5); + HSTATE_FIELD(HSTATE_NAPPING, napping); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + HSTATE_FIELD(HSTATE_HWTHREAD_REQ, hwthread_req); + HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state); + HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); + HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore); + HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys); + HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr); + HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi); + HSTATE_FIELD(HSTATE_PTID, ptid); + HSTATE_FIELD(HSTATE_MMCR, host_mmcr); + HSTATE_FIELD(HSTATE_PMC, host_pmc); + HSTATE_FIELD(HSTATE_PURR, host_purr); + HSTATE_FIELD(HSTATE_SPURR, host_spurr); + HSTATE_FIELD(HSTATE_DSCR, host_dscr); + HSTATE_FIELD(HSTATE_DABR, dabr); + HSTATE_FIELD(HSTATE_DECEXP, dec_expires); + DEFINE(IPI_PRIORITY, IPI_PRIORITY); +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + +#ifdef CONFIG_PPC_BOOK3S_64 + 
HSTATE_FIELD(HSTATE_CFAR, cfar); + HSTATE_FIELD(HSTATE_PPR, ppr); + HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr); +#endif /* CONFIG_PPC_BOOK3S_64 */ + +#else /* CONFIG_PPC_BOOK3S */ + DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); + DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer)); + DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); + DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr)); + DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc)); + DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); + DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); + DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); + DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, arch.crit_save)); +#endif /* CONFIG_PPC_BOOK3S */ +#endif /* CONFIG_KVM */ + +#ifdef CONFIG_KVM_GUEST + DEFINE(KVM_MAGIC_SCRATCH1, offsetof(struct kvm_vcpu_arch_shared, + scratch1)); + DEFINE(KVM_MAGIC_SCRATCH2, offsetof(struct kvm_vcpu_arch_shared, + scratch2)); + DEFINE(KVM_MAGIC_SCRATCH3, offsetof(struct kvm_vcpu_arch_shared, + scratch3)); + DEFINE(KVM_MAGIC_INT, offsetof(struct kvm_vcpu_arch_shared, + int_pending)); + DEFINE(KVM_MAGIC_MSR, offsetof(struct kvm_vcpu_arch_shared, msr)); + DEFINE(KVM_MAGIC_CRITICAL, offsetof(struct kvm_vcpu_arch_shared, + critical)); + DEFINE(KVM_MAGIC_SR, offsetof(struct kvm_vcpu_arch_shared, sr)); +#endif + +#ifdef CONFIG_44x + DEFINE(PGD_T_LOG2, PGD_T_LOG2); + DEFINE(PTE_T_LOG2, PTE_T_LOG2); +#endif +#ifdef CONFIG_PPC_FSL_BOOK3E + DEFINE(TLBCAM_SIZE, sizeof(struct tlbcam)); + DEFINE(TLBCAM_MAS0, offsetof(struct tlbcam, MAS0)); + DEFINE(TLBCAM_MAS1, offsetof(struct tlbcam, MAS1)); + DEFINE(TLBCAM_MAS2, offsetof(struct tlbcam, MAS2)); + DEFINE(TLBCAM_MAS3, offsetof(struct tlbcam, MAS3)); + DEFINE(TLBCAM_MAS7, offsetof(struct tlbcam, MAS7)); +#endif + +#if defined(CONFIG_KVM) && defined(CONFIG_SPE) + DEFINE(VCPU_EVR, offsetof(struct kvm_vcpu, arch.evr[0])); + DEFINE(VCPU_ACC, offsetof(struct kvm_vcpu, arch.acc)); + 
DEFINE(VCPU_SPEFSCR, offsetof(struct kvm_vcpu, arch.spefscr)); + DEFINE(VCPU_HOST_SPEFSCR, offsetof(struct kvm_vcpu, arch.host_spefscr)); +#endif + +#ifdef CONFIG_KVM_BOOKE_HV + DEFINE(VCPU_HOST_MAS4, offsetof(struct kvm_vcpu, arch.host_mas4)); + DEFINE(VCPU_HOST_MAS6, offsetof(struct kvm_vcpu, arch.host_mas6)); + DEFINE(VCPU_EPLC, offsetof(struct kvm_vcpu, arch.eplc)); +#endif + +#ifdef CONFIG_KVM_EXIT_TIMING + DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu, + arch.timing_exit.tv32.tbu)); + DEFINE(VCPU_TIMING_EXIT_TBL, offsetof(struct kvm_vcpu, + arch.timing_exit.tv32.tbl)); + DEFINE(VCPU_TIMING_LAST_ENTER_TBU, offsetof(struct kvm_vcpu, + arch.timing_last_enter.tv32.tbu)); + DEFINE(VCPU_TIMING_LAST_ENTER_TBL, offsetof(struct kvm_vcpu, + arch.timing_last_enter.tv32.tbl)); +#endif + +#ifdef CONFIG_PPC_POWERNV + DEFINE(OPAL_MC_GPR3, offsetof(struct opal_machine_check_event, gpr3)); + DEFINE(OPAL_MC_SRR0, offsetof(struct opal_machine_check_event, srr0)); + DEFINE(OPAL_MC_SRR1, offsetof(struct opal_machine_check_event, srr1)); + DEFINE(PACA_OPAL_MC_EVT, offsetof(struct paca_struct, opal_mc_evt)); +#endif + return 0; } diff --git a/arch/powerpc/kernel/audit.c b/arch/powerpc/kernel/audit.c new file mode 100644 index 00000000000..a4dab7cab34 --- /dev/null +++ b/arch/powerpc/kernel/audit.c @@ -0,0 +1,83 @@ +#include <linux/init.h> +#include <linux/types.h> +#include <linux/audit.h> +#include <asm/unistd.h> + +static unsigned dir_class[] = { +#include <asm-generic/audit_dir_write.h> +~0U +}; + +static unsigned read_class[] = { +#include <asm-generic/audit_read.h> +~0U +}; + +static unsigned write_class[] = { +#include <asm-generic/audit_write.h> +~0U +}; + +static unsigned chattr_class[] = { +#include <asm-generic/audit_change_attr.h> +~0U +}; + +static unsigned signal_class[] = { +#include <asm-generic/audit_signal.h> +~0U +}; + +int audit_classify_arch(int arch) +{ +#ifdef CONFIG_PPC64 + if (arch == AUDIT_ARCH_PPC) + return 1; +#endif + return 0; +} + +int 
audit_classify_syscall(int abi, unsigned syscall) +{ +#ifdef CONFIG_PPC64 + extern int ppc32_classify_syscall(unsigned); + if (abi == AUDIT_ARCH_PPC) + return ppc32_classify_syscall(syscall); +#endif + switch(syscall) { + case __NR_open: + return 2; + case __NR_openat: + return 3; + case __NR_socketcall: + return 4; + case __NR_execve: + return 5; + default: + return 0; + } +} + +static int __init audit_classes_init(void) +{ +#ifdef CONFIG_PPC64 + extern __u32 ppc32_dir_class[]; + extern __u32 ppc32_write_class[]; + extern __u32 ppc32_read_class[]; + extern __u32 ppc32_chattr_class[]; + extern __u32 ppc32_signal_class[]; + audit_register_class(AUDIT_CLASS_WRITE_32, ppc32_write_class); + audit_register_class(AUDIT_CLASS_READ_32, ppc32_read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ppc32_dir_class); + audit_register_class(AUDIT_CLASS_CHATTR_32, ppc32_chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL_32, ppc32_signal_class); +#endif + audit_register_class(AUDIT_CLASS_WRITE, write_class); + audit_register_class(AUDIT_CLASS_READ, read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); + audit_register_class(AUDIT_CLASS_CHATTR, chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL, signal_class); + return 0; +} + +__initcall(audit_classes_init); diff --git a/arch/powerpc/kernel/binfmt_elf32.c b/arch/powerpc/kernel/binfmt_elf32.c deleted file mode 100644 index 8ad6b0f3365..00000000000 --- a/arch/powerpc/kernel/binfmt_elf32.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * binfmt_elf32.c: Support 32-bit PPC ELF binaries on Power3 and followons. - * based on the SPARC64 version. - * Copyright (C) 1995, 1996, 1997, 1998 David S. 
Miller (davem@redhat.com) - * Copyright (C) 1995, 1996, 1997, 1998 Jakub Jelinek (jj@ultra.linux.cz) - * - * Copyright (C) 2000,2001 Ken Aaker (kdaaker@rchland.vnet.ibm.com), IBM Corp - * Copyright (C) 2001 Anton Blanchard (anton@au.ibm.com), IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define ELF_ARCH EM_PPC -#define ELF_CLASS ELFCLASS32 -#define ELF_DATA ELFDATA2MSB; - -#include <asm/processor.h> -#include <linux/module.h> -#include <linux/config.h> -#include <linux/elfcore.h> -#include <linux/compat.h> - -#define elf_prstatus elf_prstatus32 -struct elf_prstatus32 -{ - struct elf_siginfo pr_info; /* Info associated with signal */ - short pr_cursig; /* Current signal */ - unsigned int pr_sigpend; /* Set of pending signals */ - unsigned int pr_sighold; /* Set of held signals */ - pid_t pr_pid; - pid_t pr_ppid; - pid_t pr_pgrp; - pid_t pr_sid; - struct compat_timeval pr_utime; /* User time */ - struct compat_timeval pr_stime; /* System time */ - struct compat_timeval pr_cutime; /* Cumulative user time */ - struct compat_timeval pr_cstime; /* Cumulative system time */ - elf_gregset_t pr_reg; /* General purpose registers. */ - int pr_fpvalid; /* True if math co-processor being used. 
*/ -}; - -#define elf_prpsinfo elf_prpsinfo32 -struct elf_prpsinfo32 -{ - char pr_state; /* numeric process state */ - char pr_sname; /* char for pr_state */ - char pr_zomb; /* zombie */ - char pr_nice; /* nice val */ - unsigned int pr_flag; /* flags */ - u32 pr_uid; - u32 pr_gid; - pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; - /* Lots missing */ - char pr_fname[16]; /* filename of executable */ - char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ -}; - -#include <linux/time.h> - -#undef cputime_to_timeval -#define cputime_to_timeval cputime_to_compat_timeval -static __inline__ void -cputime_to_compat_timeval(const cputime_t cputime, struct compat_timeval *value) -{ - unsigned long jiffies = cputime_to_jiffies(cputime); - value->tv_usec = (jiffies % HZ) * (1000000L / HZ); - value->tv_sec = jiffies / HZ; -} - -#define init_elf_binfmt init_elf32_binfmt - -#include "../../../fs/binfmt_elf.c" diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index bdfba92b2b3..41c011cb607 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -3,22 +3,21 @@ * * Benjamin Herrenschmidt <benh@kernel.crashing.org> */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/init.h> -#include <linux/module.h> +#include <linux/export.h> +#include <linux/memblock.h> #include <asm/sections.h> #include <asm/prom.h> #include <asm/btext.h> -#include <asm/prom.h> #include <asm/page.h> #include <asm/mmu.h> #include <asm/pgtable.h> #include <asm/io.h> -#include <asm/lmb.h> #include <asm/processor.h> +#include <asm/udbg.h> #define NO_SCROLL @@ -26,20 +25,18 @@ static void scrollscreen(void); #endif -static void draw_byte(unsigned char c, long locX, long locY); -static void draw_byte_32(unsigned char *bits, unsigned int *base, int rb); -static void draw_byte_16(unsigned char *bits, unsigned int *base, int rb); -static void draw_byte_8(unsigned char *bits, unsigned int *base, int rb); +#define __force_data 
__attribute__((__section__(".data"))) -static int g_loc_X; -static int g_loc_Y; -static int g_max_loc_X; -static int g_max_loc_Y; +static int g_loc_X __force_data; +static int g_loc_Y __force_data; +static int g_max_loc_X __force_data; +static int g_max_loc_Y __force_data; -static int dispDeviceRowBytes; -static int dispDeviceDepth; -static int dispDeviceRect[4]; -static unsigned char *dispDeviceBase, *logicalDisplayBase; +static int dispDeviceRowBytes __force_data; +static int dispDeviceDepth __force_data; +static int dispDeviceRect[4] __force_data; +static unsigned char *dispDeviceBase __force_data; +static unsigned char *logicalDisplayBase __force_data; unsigned long disp_BAT[2] __initdata = {0, 0}; @@ -47,9 +44,29 @@ unsigned long disp_BAT[2] __initdata = {0, 0}; static unsigned char vga_font[cmapsz]; -int boot_text_mapped; +int boot_text_mapped __force_data = 0; int force_printk_to_btext = 0; +extern void rmci_on(void); +extern void rmci_off(void); + +static inline void rmci_maybe_on(void) +{ +#if defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) && defined(CONFIG_PPC64) + if (!(mfmsr() & MSR_DR)) + rmci_on(); +#endif +} + +static inline void rmci_maybe_off(void) +{ +#if defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) && defined(CONFIG_PPC64) + if (!(mfmsr() & MSR_DR)) + rmci_off(); +#endif +} + + #ifdef CONFIG_PPC32 /* Calc BAT values for mapping the display and store them * in disp_BAT. Those values are then used from head.S to map @@ -57,7 +74,7 @@ int force_printk_to_btext = 0; * * The display is mapped to virtual address 0xD0000000, rather * than 1:1, because some some CHRP machines put the frame buffer - * in the region starting at 0xC0000000 (KERNELBASE). + * in the region starting at 0xC0000000 (PAGE_OFFSET). * This mapping is temporary and will disappear as soon as the * setup done by MMU_Init() is applied. * @@ -66,10 +83,9 @@ int force_printk_to_btext = 0; * is really badly aligned, but I didn't encounter this case * yet. 
*/ -void __init -btext_prepare_BAT(void) +void __init btext_prepare_BAT(void) { - unsigned long vaddr = KERNELBASE + 0x10000000; + unsigned long vaddr = PAGE_OFFSET + 0x10000000; unsigned long addr; unsigned long lowbits; @@ -95,12 +111,13 @@ btext_prepare_BAT(void) } #endif -/* This function will enable the early boot text when doing OF booting. This - * way, xmon output should work too + +/* This function can be used to enable the early boot text when doing + * OF booting or within bootx init. It must be followed by a btext_unmap() + * call before the logical address becomes unusable */ -void __init -btext_setup_display(int width, int height, int depth, int pitch, - unsigned long address) +void __init btext_setup_display(int width, int height, int depth, int pitch, + unsigned long address) { g_loc_X = 0; g_loc_Y = 0; @@ -109,13 +126,18 @@ btext_setup_display(int width, int height, int depth, int pitch, logicalDisplayBase = (unsigned char *)address; dispDeviceBase = (unsigned char *)address; dispDeviceRowBytes = pitch; - dispDeviceDepth = depth; + dispDeviceDepth = depth == 15 ? 16 : depth; dispDeviceRect[0] = dispDeviceRect[1] = 0; dispDeviceRect[2] = width; dispDeviceRect[3] = height; boot_text_mapped = 1; } +void __init btext_unmap(void) +{ + boot_text_mapped = 0; +} + /* Here's a small text engine to use during early boot * or for debugging purposes * @@ -127,7 +149,7 @@ btext_setup_display(int width, int height, int depth, int pitch, * changes. 
*/ -void map_boot_text(void) +void btext_map(void) { unsigned long base, offset, size; unsigned char *vbase; @@ -151,32 +173,43 @@ int btext_initialize(struct device_node *np) { unsigned int width, height, depth, pitch; unsigned long address = 0; - u32 *prop; + const u32 *prop; - prop = (u32 *)get_property(np, "width", NULL); + prop = of_get_property(np, "linux,bootx-width", NULL); + if (prop == NULL) + prop = of_get_property(np, "width", NULL); if (prop == NULL) return -EINVAL; width = *prop; - prop = (u32 *)get_property(np, "height", NULL); + prop = of_get_property(np, "linux,bootx-height", NULL); + if (prop == NULL) + prop = of_get_property(np, "height", NULL); if (prop == NULL) return -EINVAL; height = *prop; - prop = (u32 *)get_property(np, "depth", NULL); + prop = of_get_property(np, "linux,bootx-depth", NULL); + if (prop == NULL) + prop = of_get_property(np, "depth", NULL); if (prop == NULL) return -EINVAL; depth = *prop; pitch = width * ((depth + 7) / 8); - prop = (u32 *)get_property(np, "linebytes", NULL); - if (prop) + prop = of_get_property(np, "linux,bootx-linebytes", NULL); + if (prop == NULL) + prop = of_get_property(np, "linebytes", NULL); + if (prop && *prop != 0xffffffffu) pitch = *prop; if (pitch == 1) pitch = 0x1000; - prop = (u32 *)get_property(np, "address", NULL); + prop = of_get_property(np, "linux,bootx-addr", NULL); + if (prop == NULL) + prop = of_get_property(np, "address", NULL); if (prop) address = *prop; - /* FIXME: Add support for PCI reg properties */ - + /* FIXME: Add support for PCI reg properties. Right now, only + * reliable on macs + */ if (address == 0) return -EINVAL; @@ -184,28 +217,25 @@ int btext_initialize(struct device_node *np) g_loc_Y = 0; g_max_loc_X = width / 8; g_max_loc_Y = height / 16; - logicalDisplayBase = (unsigned char *)address; dispDeviceBase = (unsigned char *)address; dispDeviceRowBytes = pitch; - dispDeviceDepth = depth; + dispDeviceDepth = depth == 15 ? 
16 : depth; dispDeviceRect[0] = dispDeviceRect[1] = 0; dispDeviceRect[2] = width; dispDeviceRect[3] = height; - map_boot_text(); + btext_map(); return 0; } -void __init init_boot_display(void) +int __init btext_find_display(int allow_nonstdout) { - char *name; + const char *name; struct device_node *np = NULL; int rc = -ENODEV; - printk("trying to initialize btext ...\n"); - - name = (char *)get_property(of_chosen, "linux,stdout-path", NULL); + name = of_get_property(of_chosen, "linux,stdout-path", NULL); if (name != NULL) { np = of_find_node_by_path(name); if (np != NULL) { @@ -218,18 +248,19 @@ void __init init_boot_display(void) } if (np) rc = btext_initialize(np); - if (rc == 0) - return; + if (rc == 0 || !allow_nonstdout) + return rc; - for (np = NULL; (np = of_find_node_by_type(np, "display"));) { - if (get_property(np, "linux,opened", NULL)) { + for_each_node_by_type(np, "display") { + if (of_get_property(np, "linux,opened", NULL)) { printk("trying %s ...\n", np->full_name); rc = btext_initialize(np); printk("result: %d\n", rc); } if (rc == 0) - return; + break; } + return rc; } /* Calc the base address of a given point (x,y) */ @@ -267,7 +298,7 @@ void btext_update_display(unsigned long phys, int width, int height, iounmap(logicalDisplayBase); boot_text_mapped = 0; } - map_boot_text(); + btext_map(); g_loc_X = 0; g_loc_Y = 0; g_max_loc_X = width / 8; @@ -277,153 +308,92 @@ EXPORT_SYMBOL(btext_update_display); void btext_clearscreen(void) { - unsigned long *base = (unsigned long *)calc_base(0, 0); + unsigned int *base = (unsigned int *)calc_base(0, 0); unsigned long width = ((dispDeviceRect[2] - dispDeviceRect[0]) * - (dispDeviceDepth >> 3)) >> 3; + (dispDeviceDepth >> 3)) >> 2; int i,j; + rmci_maybe_on(); for (i=0; i<(dispDeviceRect[3] - dispDeviceRect[1]); i++) { - unsigned long *ptr = base; + unsigned int *ptr = base; for(j=width; j; --j) *(ptr++) = 0; - base += (dispDeviceRowBytes >> 3); + base += (dispDeviceRowBytes >> 2); } + rmci_maybe_off(); } 
-#ifndef NO_SCROLL -static void scrollscreen(void) +void btext_flushscreen(void) { - unsigned long *src = (unsigned long *)calc_base(0,16); - unsigned long *dst = (unsigned long *)calc_base(0,0); - unsigned long width = ((dispDeviceRect[2] - dispDeviceRect[0]) * - (dispDeviceDepth >> 3)) >> 3; + unsigned int *base = (unsigned int *)calc_base(0, 0); + unsigned long width = ((dispDeviceRect[2] - dispDeviceRect[0]) * + (dispDeviceDepth >> 3)) >> 2; int i,j; - for (i=0; i<(dispDeviceRect[3] - dispDeviceRect[1] - 16); i++) - { - unsigned long *src_ptr = src; - unsigned long *dst_ptr = dst; - for(j=width; j; --j) - *(dst_ptr++) = *(src_ptr++); - src += (dispDeviceRowBytes >> 3); - dst += (dispDeviceRowBytes >> 3); - } - for (i=0; i<16; i++) + for (i=0; i < (dispDeviceRect[3] - dispDeviceRect[1]); i++) { - unsigned long *dst_ptr = dst; - for(j=width; j; --j) - *(dst_ptr++) = 0; - dst += (dispDeviceRowBytes >> 3); + unsigned int *ptr = base; + for(j = width; j > 0; j -= 8) { + __asm__ __volatile__ ("dcbst 0,%0" :: "r" (ptr)); + ptr += 8; + } + base += (dispDeviceRowBytes >> 2); } + __asm__ __volatile__ ("sync" ::: "memory"); } -#endif /* ndef NO_SCROLL */ -void btext_drawchar(char c) +void btext_flushline(void) { - int cline = 0; -#ifdef NO_SCROLL - int x; -#endif - if (!boot_text_mapped) - return; + unsigned int *base = (unsigned int *)calc_base(0, g_loc_Y << 4); + unsigned long width = ((dispDeviceRect[2] - dispDeviceRect[0]) * + (dispDeviceDepth >> 3)) >> 2; + int i,j; - switch (c) { - case '\b': - if (g_loc_X > 0) - --g_loc_X; - break; - case '\t': - g_loc_X = (g_loc_X & -8) + 8; - break; - case '\r': - g_loc_X = 0; - break; - case '\n': - g_loc_X = 0; - g_loc_Y++; - cline = 1; - break; - default: - draw_byte(c, g_loc_X++, g_loc_Y); - } - if (g_loc_X >= g_max_loc_X) { - g_loc_X = 0; - g_loc_Y++; - cline = 1; - } -#ifndef NO_SCROLL - while (g_loc_Y >= g_max_loc_Y) { - scrollscreen(); - g_loc_Y--; - } -#else - /* wrap around from bottom to top of screen so we don't - 
waste time scrolling each line. -- paulus. */ - if (g_loc_Y >= g_max_loc_Y) - g_loc_Y = 0; - if (cline) { - for (x = 0; x < g_max_loc_X; ++x) - draw_byte(' ', x, g_loc_Y); + for (i=0; i < 16; i++) + { + unsigned int *ptr = base; + for(j = width; j > 0; j -= 8) { + __asm__ __volatile__ ("dcbst 0,%0" :: "r" (ptr)); + ptr += 8; + } + base += (dispDeviceRowBytes >> 2); } -#endif + __asm__ __volatile__ ("sync" ::: "memory"); } -void btext_drawstring(const char *c) -{ - if (!boot_text_mapped) - return; - while (*c) - btext_drawchar(*c++); -} -void btext_drawhex(unsigned long v) +#ifndef NO_SCROLL +static void scrollscreen(void) { - char *hex_table = "0123456789abcdef"; - - if (!boot_text_mapped) - return; -#ifdef CONFIG_PPC64 - btext_drawchar(hex_table[(v >> 60) & 0x0000000FUL]); - btext_drawchar(hex_table[(v >> 56) & 0x0000000FUL]); - btext_drawchar(hex_table[(v >> 52) & 0x0000000FUL]); - btext_drawchar(hex_table[(v >> 48) & 0x0000000FUL]); - btext_drawchar(hex_table[(v >> 44) & 0x0000000FUL]); - btext_drawchar(hex_table[(v >> 40) & 0x0000000FUL]); - btext_drawchar(hex_table[(v >> 36) & 0x0000000FUL]); - btext_drawchar(hex_table[(v >> 32) & 0x0000000FUL]); -#endif - btext_drawchar(hex_table[(v >> 28) & 0x0000000FUL]); - btext_drawchar(hex_table[(v >> 24) & 0x0000000FUL]); - btext_drawchar(hex_table[(v >> 20) & 0x0000000FUL]); - btext_drawchar(hex_table[(v >> 16) & 0x0000000FUL]); - btext_drawchar(hex_table[(v >> 12) & 0x0000000FUL]); - btext_drawchar(hex_table[(v >> 8) & 0x0000000FUL]); - btext_drawchar(hex_table[(v >> 4) & 0x0000000FUL]); - btext_drawchar(hex_table[(v >> 0) & 0x0000000FUL]); - btext_drawchar(' '); -} + unsigned int *src = (unsigned int *)calc_base(0,16); + unsigned int *dst = (unsigned int *)calc_base(0,0); + unsigned long width = ((dispDeviceRect[2] - dispDeviceRect[0]) * + (dispDeviceDepth >> 3)) >> 2; + int i,j; -static void draw_byte(unsigned char c, long locX, long locY) -{ - unsigned char *base = calc_base(locX << 3, locY << 4); - unsigned char 
*font = &vga_font[((unsigned int)c) * 16]; - int rb = dispDeviceRowBytes; + rmci_maybe_on(); - switch(dispDeviceDepth) { - case 24: - case 32: - draw_byte_32(font, (unsigned int *)base, rb); - break; - case 15: - case 16: - draw_byte_16(font, (unsigned int *)base, rb); - break; - case 8: - draw_byte_8(font, (unsigned int *)base, rb); - break; + for (i=0; i<(dispDeviceRect[3] - dispDeviceRect[1] - 16); i++) + { + unsigned int *src_ptr = src; + unsigned int *dst_ptr = dst; + for(j=width; j; --j) + *(dst_ptr++) = *(src_ptr++); + src += (dispDeviceRowBytes >> 2); + dst += (dispDeviceRowBytes >> 2); + } + for (i=0; i<16; i++) + { + unsigned int *dst_ptr = dst; + for(j=width; j; --j) + *(dst_ptr++) = 0; + dst += (dispDeviceRowBytes >> 2); } + + rmci_maybe_off(); } +#endif /* ndef NO_SCROLL */ static unsigned int expand_bits_8[16] = { 0x00000000, @@ -473,7 +443,7 @@ static void draw_byte_32(unsigned char *font, unsigned int *base, int rb) } } -static void draw_byte_16(unsigned char *font, unsigned int *base, int rb) +static inline void draw_byte_16(unsigned char *font, unsigned int *base, int rb) { int l, bits; int fg = 0xFFFFFFFFUL; @@ -491,7 +461,7 @@ static void draw_byte_16(unsigned char *font, unsigned int *base, int rb) } } -static void draw_byte_8(unsigned char *font, unsigned int *base, int rb) +static inline void draw_byte_8(unsigned char *font, unsigned int *base, int rb) { int l, bits; int fg = 0x0F0F0F0FUL; @@ -507,6 +477,128 @@ static void draw_byte_8(unsigned char *font, unsigned int *base, int rb) } } +static noinline void draw_byte(unsigned char c, long locX, long locY) +{ + unsigned char *base = calc_base(locX << 3, locY << 4); + unsigned char *font = &vga_font[((unsigned int)c) * 16]; + int rb = dispDeviceRowBytes; + + rmci_maybe_on(); + switch(dispDeviceDepth) { + case 24: + case 32: + draw_byte_32(font, (unsigned int *)base, rb); + break; + case 15: + case 16: + draw_byte_16(font, (unsigned int *)base, rb); + break; + case 8: + draw_byte_8(font, 
(unsigned int *)base, rb); + break; + } + rmci_maybe_off(); +} + +void btext_drawchar(char c) +{ + int cline = 0; +#ifdef NO_SCROLL + int x; +#endif + if (!boot_text_mapped) + return; + + switch (c) { + case '\b': + if (g_loc_X > 0) + --g_loc_X; + break; + case '\t': + g_loc_X = (g_loc_X & -8) + 8; + break; + case '\r': + g_loc_X = 0; + break; + case '\n': + g_loc_X = 0; + g_loc_Y++; + cline = 1; + break; + default: + draw_byte(c, g_loc_X++, g_loc_Y); + } + if (g_loc_X >= g_max_loc_X) { + g_loc_X = 0; + g_loc_Y++; + cline = 1; + } +#ifndef NO_SCROLL + while (g_loc_Y >= g_max_loc_Y) { + scrollscreen(); + g_loc_Y--; + } +#else + /* wrap around from bottom to top of screen so we don't + waste time scrolling each line. -- paulus. */ + if (g_loc_Y >= g_max_loc_Y) + g_loc_Y = 0; + if (cline) { + for (x = 0; x < g_max_loc_X; ++x) + draw_byte(' ', x, g_loc_Y); + } +#endif +} + +void btext_drawstring(const char *c) +{ + if (!boot_text_mapped) + return; + while (*c) + btext_drawchar(*c++); +} + +void btext_drawtext(const char *c, unsigned int len) +{ + if (!boot_text_mapped) + return; + while (len--) + btext_drawchar(*c++); +} + +void btext_drawhex(unsigned long v) +{ + if (!boot_text_mapped) + return; +#ifdef CONFIG_PPC64 + btext_drawchar(hex_asc_hi(v >> 56)); + btext_drawchar(hex_asc_lo(v >> 56)); + btext_drawchar(hex_asc_hi(v >> 48)); + btext_drawchar(hex_asc_lo(v >> 48)); + btext_drawchar(hex_asc_hi(v >> 40)); + btext_drawchar(hex_asc_lo(v >> 40)); + btext_drawchar(hex_asc_hi(v >> 32)); + btext_drawchar(hex_asc_lo(v >> 32)); +#endif + btext_drawchar(hex_asc_hi(v >> 24)); + btext_drawchar(hex_asc_lo(v >> 24)); + btext_drawchar(hex_asc_hi(v >> 16)); + btext_drawchar(hex_asc_lo(v >> 16)); + btext_drawchar(hex_asc_hi(v >> 8)); + btext_drawchar(hex_asc_lo(v >> 8)); + btext_drawchar(hex_asc_hi(v)); + btext_drawchar(hex_asc_lo(v)); + btext_drawchar(' '); +} + +void __init udbg_init_btext(void) +{ + /* If btext is enabled, we might have a BAT setup for early display, + * thus we 
do enable some very basic udbg output + */ + udbg_putc = btext_drawchar; +} + static unsigned char vga_font[cmapsz] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x81, 0xa5, 0x81, 0x81, 0xbd, @@ -851,3 +943,4 @@ static unsigned char vga_font[cmapsz] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; + diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c new file mode 100644 index 00000000000..40198d50b4c --- /dev/null +++ b/arch/powerpc/kernel/cacheinfo.c @@ -0,0 +1,849 @@ +/* + * Processor cache information made available to userspace via sysfs; + * intended to be compatible with x86 intel_cacheinfo implementation. + * + * Copyright 2008 IBM Corporation + * Author: Nathan Lynch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + */ + +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/kernel.h> +#include <linux/kobject.h> +#include <linux/list.h> +#include <linux/notifier.h> +#include <linux/of.h> +#include <linux/percpu.h> +#include <linux/slab.h> +#include <asm/prom.h> + +#include "cacheinfo.h" + +/* per-cpu object for tracking: + * - a "cache" kobject for the top-level directory + * - a list of "index" objects representing the cpu's local cache hierarchy + */ +struct cache_dir { + struct kobject *kobj; /* bare (not embedded) kobject for cache + * directory */ + struct cache_index_dir *index; /* list of index objects */ +}; + +/* "index" object: each cpu's cache directory has an index + * subdirectory corresponding to a cache object associated with the + * cpu. This object's lifetime is managed via the embedded kobject. 
+ */ +struct cache_index_dir { + struct kobject kobj; + struct cache_index_dir *next; /* next index in parent directory */ + struct cache *cache; +}; + +/* Template for determining which OF properties to query for a given + * cache type */ +struct cache_type_info { + const char *name; + const char *size_prop; + + /* Allow for both [di]-cache-line-size and + * [di]-cache-block-size properties. According to the PowerPC + * Processor binding, -line-size should be provided if it + * differs from the cache block size (that which is operated + * on by cache instructions), so we look for -line-size first. + * See cache_get_line_size(). */ + + const char *line_size_props[2]; + const char *nr_sets_prop; +}; + +/* These are used to index the cache_type_info array. */ +#define CACHE_TYPE_UNIFIED 0 +#define CACHE_TYPE_INSTRUCTION 1 +#define CACHE_TYPE_DATA 2 + +static const struct cache_type_info cache_type_info[] = { + { + /* PowerPC Processor binding says the [di]-cache-* + * must be equal on unified caches, so just use + * d-cache properties. */ + .name = "Unified", + .size_prop = "d-cache-size", + .line_size_props = { "d-cache-line-size", + "d-cache-block-size", }, + .nr_sets_prop = "d-cache-sets", + }, + { + .name = "Instruction", + .size_prop = "i-cache-size", + .line_size_props = { "i-cache-line-size", + "i-cache-block-size", }, + .nr_sets_prop = "i-cache-sets", + }, + { + .name = "Data", + .size_prop = "d-cache-size", + .line_size_props = { "d-cache-line-size", + "d-cache-block-size", }, + .nr_sets_prop = "d-cache-sets", + }, +}; + +/* Cache object: each instance of this corresponds to a distinct cache + * in the system. There are separate objects for Harvard caches: one + * each for instruction and data, and each refers to the same OF node. + * The refcount of the OF node is elevated for the lifetime of the + * cache object. A cache object is released when its shared_cpu_map + * is cleared (see cache_cpu_clear). 
+ * + * A cache object is on two lists: an unsorted global list + * (cache_list) of cache objects; and a singly-linked list + * representing the local cache hierarchy, which is ordered by level + * (e.g. L1d -> L1i -> L2 -> L3). + */ +struct cache { + struct device_node *ofnode; /* OF node for this cache, may be cpu */ + struct cpumask shared_cpu_map; /* online CPUs using this cache */ + int type; /* split cache disambiguation */ + int level; /* level not explicit in device tree */ + struct list_head list; /* global list of cache objects */ + struct cache *next_local; /* next cache of >= level */ +}; + +static DEFINE_PER_CPU(struct cache_dir *, cache_dir_pcpu); + +/* traversal/modification of this list occurs only at cpu hotplug time; + * access is serialized by cpu hotplug locking + */ +static LIST_HEAD(cache_list); + +static struct cache_index_dir *kobj_to_cache_index_dir(struct kobject *k) +{ + return container_of(k, struct cache_index_dir, kobj); +} + +static const char *cache_type_string(const struct cache *cache) +{ + return cache_type_info[cache->type].name; +} + +static void cache_init(struct cache *cache, int type, int level, + struct device_node *ofnode) +{ + cache->type = type; + cache->level = level; + cache->ofnode = of_node_get(ofnode); + INIT_LIST_HEAD(&cache->list); + list_add(&cache->list, &cache_list); +} + +static struct cache *new_cache(int type, int level, struct device_node *ofnode) +{ + struct cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_KERNEL); + if (cache) + cache_init(cache, type, level, ofnode); + + return cache; +} + +static void release_cache_debugcheck(struct cache *cache) +{ + struct cache *iter; + + list_for_each_entry(iter, &cache_list, list) + WARN_ONCE(iter->next_local == cache, + "cache for %s(%s) refers to cache for %s(%s)\n", + iter->ofnode->full_name, + cache_type_string(iter), + cache->ofnode->full_name, + cache_type_string(cache)); +} + +static void release_cache(struct cache *cache) +{ + if (!cache) + return; + + 
pr_debug("freeing L%d %s cache for %s\n", cache->level, + cache_type_string(cache), cache->ofnode->full_name); + + release_cache_debugcheck(cache); + list_del(&cache->list); + of_node_put(cache->ofnode); + kfree(cache); +} + +static void cache_cpu_set(struct cache *cache, int cpu) +{ + struct cache *next = cache; + + while (next) { + WARN_ONCE(cpumask_test_cpu(cpu, &next->shared_cpu_map), + "CPU %i already accounted in %s(%s)\n", + cpu, next->ofnode->full_name, + cache_type_string(next)); + cpumask_set_cpu(cpu, &next->shared_cpu_map); + next = next->next_local; + } +} + +static int cache_size(const struct cache *cache, unsigned int *ret) +{ + const char *propname; + const __be32 *cache_size; + + propname = cache_type_info[cache->type].size_prop; + + cache_size = of_get_property(cache->ofnode, propname, NULL); + if (!cache_size) + return -ENODEV; + + *ret = of_read_number(cache_size, 1); + return 0; +} + +static int cache_size_kb(const struct cache *cache, unsigned int *ret) +{ + unsigned int size; + + if (cache_size(cache, &size)) + return -ENODEV; + + *ret = size / 1024; + return 0; +} + +/* not cache_line_size() because that's a macro in include/linux/cache.h */ +static int cache_get_line_size(const struct cache *cache, unsigned int *ret) +{ + const __be32 *line_size; + int i, lim; + + lim = ARRAY_SIZE(cache_type_info[cache->type].line_size_props); + + for (i = 0; i < lim; i++) { + const char *propname; + + propname = cache_type_info[cache->type].line_size_props[i]; + line_size = of_get_property(cache->ofnode, propname, NULL); + if (line_size) + break; + } + + if (!line_size) + return -ENODEV; + + *ret = of_read_number(line_size, 1); + return 0; +} + +static int cache_nr_sets(const struct cache *cache, unsigned int *ret) +{ + const char *propname; + const __be32 *nr_sets; + + propname = cache_type_info[cache->type].nr_sets_prop; + + nr_sets = of_get_property(cache->ofnode, propname, NULL); + if (!nr_sets) + return -ENODEV; + + *ret = of_read_number(nr_sets, 1); + 
return 0; +} + +static int cache_associativity(const struct cache *cache, unsigned int *ret) +{ + unsigned int line_size; + unsigned int nr_sets; + unsigned int size; + + if (cache_nr_sets(cache, &nr_sets)) + goto err; + + /* If the cache is fully associative, there is no need to + * check the other properties. + */ + if (nr_sets == 1) { + *ret = 0; + return 0; + } + + if (cache_get_line_size(cache, &line_size)) + goto err; + if (cache_size(cache, &size)) + goto err; + + if (!(nr_sets > 0 && size > 0 && line_size > 0)) + goto err; + + *ret = (size / nr_sets) / line_size; + return 0; +err: + return -ENODEV; +} + +/* helper for dealing with split caches */ +static struct cache *cache_find_first_sibling(struct cache *cache) +{ + struct cache *iter; + + if (cache->type == CACHE_TYPE_UNIFIED) + return cache; + + list_for_each_entry(iter, &cache_list, list) + if (iter->ofnode == cache->ofnode && iter->next_local == cache) + return iter; + + return cache; +} + +/* return the first cache on a local list matching node */ +static struct cache *cache_lookup_by_node(const struct device_node *node) +{ + struct cache *cache = NULL; + struct cache *iter; + + list_for_each_entry(iter, &cache_list, list) { + if (iter->ofnode != node) + continue; + cache = cache_find_first_sibling(iter); + break; + } + + return cache; +} + +static bool cache_node_is_unified(const struct device_node *np) +{ + return of_get_property(np, "cache-unified", NULL); +} + +static struct cache *cache_do_one_devnode_unified(struct device_node *node, + int level) +{ + struct cache *cache; + + pr_debug("creating L%d ucache for %s\n", level, node->full_name); + + cache = new_cache(CACHE_TYPE_UNIFIED, level, node); + + return cache; +} + +static struct cache *cache_do_one_devnode_split(struct device_node *node, + int level) +{ + struct cache *dcache, *icache; + + pr_debug("creating L%d dcache and icache for %s\n", level, + node->full_name); + + dcache = new_cache(CACHE_TYPE_DATA, level, node); + icache = 
new_cache(CACHE_TYPE_INSTRUCTION, level, node); + + if (!dcache || !icache) + goto err; + + dcache->next_local = icache; + + return dcache; +err: + release_cache(dcache); + release_cache(icache); + return NULL; +} + +static struct cache *cache_do_one_devnode(struct device_node *node, int level) +{ + struct cache *cache; + + if (cache_node_is_unified(node)) + cache = cache_do_one_devnode_unified(node, level); + else + cache = cache_do_one_devnode_split(node, level); + + return cache; +} + +static struct cache *cache_lookup_or_instantiate(struct device_node *node, + int level) +{ + struct cache *cache; + + cache = cache_lookup_by_node(node); + + WARN_ONCE(cache && cache->level != level, + "cache level mismatch on lookup (got %d, expected %d)\n", + cache->level, level); + + if (!cache) + cache = cache_do_one_devnode(node, level); + + return cache; +} + +static void link_cache_lists(struct cache *smaller, struct cache *bigger) +{ + while (smaller->next_local) { + if (smaller->next_local == bigger) + return; /* already linked */ + smaller = smaller->next_local; + } + + smaller->next_local = bigger; +} + +static void do_subsidiary_caches_debugcheck(struct cache *cache) +{ + WARN_ON_ONCE(cache->level != 1); + WARN_ON_ONCE(strcmp(cache->ofnode->type, "cpu")); +} + +static void do_subsidiary_caches(struct cache *cache) +{ + struct device_node *subcache_node; + int level = cache->level; + + do_subsidiary_caches_debugcheck(cache); + + while ((subcache_node = of_find_next_cache_node(cache->ofnode))) { + struct cache *subcache; + + level++; + subcache = cache_lookup_or_instantiate(subcache_node, level); + of_node_put(subcache_node); + if (!subcache) + break; + + link_cache_lists(cache, subcache); + cache = subcache; + } +} + +static struct cache *cache_chain_instantiate(unsigned int cpu_id) +{ + struct device_node *cpu_node; + struct cache *cpu_cache = NULL; + + pr_debug("creating cache object(s) for CPU %i\n", cpu_id); + + cpu_node = of_get_cpu_node(cpu_id, NULL); + 
WARN_ONCE(!cpu_node, "no OF node found for CPU %i\n", cpu_id); + if (!cpu_node) + goto out; + + cpu_cache = cache_lookup_or_instantiate(cpu_node, 1); + if (!cpu_cache) + goto out; + + do_subsidiary_caches(cpu_cache); + + cache_cpu_set(cpu_cache, cpu_id); +out: + of_node_put(cpu_node); + + return cpu_cache; +} + +static struct cache_dir *cacheinfo_create_cache_dir(unsigned int cpu_id) +{ + struct cache_dir *cache_dir; + struct device *dev; + struct kobject *kobj = NULL; + + dev = get_cpu_device(cpu_id); + WARN_ONCE(!dev, "no dev for CPU %i\n", cpu_id); + if (!dev) + goto err; + + kobj = kobject_create_and_add("cache", &dev->kobj); + if (!kobj) + goto err; + + cache_dir = kzalloc(sizeof(*cache_dir), GFP_KERNEL); + if (!cache_dir) + goto err; + + cache_dir->kobj = kobj; + + WARN_ON_ONCE(per_cpu(cache_dir_pcpu, cpu_id) != NULL); + + per_cpu(cache_dir_pcpu, cpu_id) = cache_dir; + + return cache_dir; +err: + kobject_put(kobj); + return NULL; +} + +static void cache_index_release(struct kobject *kobj) +{ + struct cache_index_dir *index; + + index = kobj_to_cache_index_dir(kobj); + + pr_debug("freeing index directory for L%d %s cache\n", + index->cache->level, cache_type_string(index->cache)); + + kfree(index); +} + +static ssize_t cache_index_show(struct kobject *k, struct attribute *attr, char *buf) +{ + struct kobj_attribute *kobj_attr; + + kobj_attr = container_of(attr, struct kobj_attribute, attr); + + return kobj_attr->show(k, kobj_attr, buf); +} + +static struct cache *index_kobj_to_cache(struct kobject *k) +{ + struct cache_index_dir *index; + + index = kobj_to_cache_index_dir(k); + + return index->cache; +} + +static ssize_t size_show(struct kobject *k, struct kobj_attribute *attr, char *buf) +{ + unsigned int size_kb; + struct cache *cache; + + cache = index_kobj_to_cache(k); + + if (cache_size_kb(cache, &size_kb)) + return -ENODEV; + + return sprintf(buf, "%uK\n", size_kb); +} + +static struct kobj_attribute cache_size_attr = + __ATTR(size, 0444, size_show, 
NULL); + + +static ssize_t line_size_show(struct kobject *k, struct kobj_attribute *attr, char *buf) +{ + unsigned int line_size; + struct cache *cache; + + cache = index_kobj_to_cache(k); + + if (cache_get_line_size(cache, &line_size)) + return -ENODEV; + + return sprintf(buf, "%u\n", line_size); +} + +static struct kobj_attribute cache_line_size_attr = + __ATTR(coherency_line_size, 0444, line_size_show, NULL); + +static ssize_t nr_sets_show(struct kobject *k, struct kobj_attribute *attr, char *buf) +{ + unsigned int nr_sets; + struct cache *cache; + + cache = index_kobj_to_cache(k); + + if (cache_nr_sets(cache, &nr_sets)) + return -ENODEV; + + return sprintf(buf, "%u\n", nr_sets); +} + +static struct kobj_attribute cache_nr_sets_attr = + __ATTR(number_of_sets, 0444, nr_sets_show, NULL); + +static ssize_t associativity_show(struct kobject *k, struct kobj_attribute *attr, char *buf) +{ + unsigned int associativity; + struct cache *cache; + + cache = index_kobj_to_cache(k); + + if (cache_associativity(cache, &associativity)) + return -ENODEV; + + return sprintf(buf, "%u\n", associativity); +} + +static struct kobj_attribute cache_assoc_attr = + __ATTR(ways_of_associativity, 0444, associativity_show, NULL); + +static ssize_t type_show(struct kobject *k, struct kobj_attribute *attr, char *buf) +{ + struct cache *cache; + + cache = index_kobj_to_cache(k); + + return sprintf(buf, "%s\n", cache_type_string(cache)); +} + +static struct kobj_attribute cache_type_attr = + __ATTR(type, 0444, type_show, NULL); + +static ssize_t level_show(struct kobject *k, struct kobj_attribute *attr, char *buf) +{ + struct cache_index_dir *index; + struct cache *cache; + + index = kobj_to_cache_index_dir(k); + cache = index->cache; + + return sprintf(buf, "%d\n", cache->level); +} + +static struct kobj_attribute cache_level_attr = + __ATTR(level, 0444, level_show, NULL); + +static ssize_t shared_cpu_map_show(struct kobject *k, struct kobj_attribute *attr, char *buf) +{ + struct 
cache_index_dir *index; + struct cache *cache; + int len; + int n = 0; + + index = kobj_to_cache_index_dir(k); + cache = index->cache; + len = PAGE_SIZE - 2; + + if (len > 1) { + n = cpumask_scnprintf(buf, len, &cache->shared_cpu_map); + buf[n++] = '\n'; + buf[n] = '\0'; + } + return n; +} + +static struct kobj_attribute cache_shared_cpu_map_attr = + __ATTR(shared_cpu_map, 0444, shared_cpu_map_show, NULL); + +/* Attributes which should always be created -- the kobject/sysfs core + * does this automatically via kobj_type->default_attrs. This is the + * minimum data required to uniquely identify a cache. + */ +static struct attribute *cache_index_default_attrs[] = { + &cache_type_attr.attr, + &cache_level_attr.attr, + &cache_shared_cpu_map_attr.attr, + NULL, +}; + +/* Attributes which should be created if the cache device node has the + * right properties -- see cacheinfo_create_index_opt_attrs + */ +static struct kobj_attribute *cache_index_opt_attrs[] = { + &cache_size_attr, + &cache_line_size_attr, + &cache_nr_sets_attr, + &cache_assoc_attr, +}; + +static const struct sysfs_ops cache_index_ops = { + .show = cache_index_show, +}; + +static struct kobj_type cache_index_type = { + .release = cache_index_release, + .sysfs_ops = &cache_index_ops, + .default_attrs = cache_index_default_attrs, +}; + +static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) +{ + const char *cache_name; + const char *cache_type; + struct cache *cache; + char *buf; + int i; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return; + + cache = dir->cache; + cache_name = cache->ofnode->full_name; + cache_type = cache_type_string(cache); + + /* We don't want to create an attribute that can't provide a + * meaningful value. Check the return value of each optional + * attribute's ->show method before registering the + * attribute. 
+ */ + for (i = 0; i < ARRAY_SIZE(cache_index_opt_attrs); i++) { + struct kobj_attribute *attr; + ssize_t rc; + + attr = cache_index_opt_attrs[i]; + + rc = attr->show(&dir->kobj, attr, buf); + if (rc <= 0) { + pr_debug("not creating %s attribute for " + "%s(%s) (rc = %zd)\n", + attr->attr.name, cache_name, + cache_type, rc); + continue; + } + if (sysfs_create_file(&dir->kobj, &attr->attr)) + pr_debug("could not create %s attribute for %s(%s)\n", + attr->attr.name, cache_name, cache_type); + } + + kfree(buf); +} + +/* + * Create the sysfs "index<N>" directory for one cache object and push it + * onto the head of cache_dir's list of index directories. Failures are + * silent: the directory is simply not created. + */ +static void cacheinfo_create_index_dir(struct cache *cache, int index, + struct cache_dir *cache_dir) +{ + struct cache_index_dir *index_dir; + int rc; + + index_dir = kzalloc(sizeof(*index_dir), GFP_KERNEL); + if (!index_dir) + return; + + /* set before registering: cache_index_release() reads index->cache */ + index_dir->cache = cache; + + rc = kobject_init_and_add(&index_dir->kobj, &cache_index_type, + cache_dir->kobj, "index%d", index); + if (rc) { + /* + * The kobject has been initialized even though the add + * failed, so drop the reference instead of calling kfree(): + * cache_index_release() frees index_dir, and the kobject + * core gets to release whatever it allocated itself. + */ + kobject_put(&index_dir->kobj); + return; + } + + index_dir->next = cache_dir->index; + cache_dir->index = index_dir; + + cacheinfo_create_index_opt_attrs(index_dir); +} + +static void cacheinfo_sysfs_populate(unsigned int cpu_id, + struct cache *cache_list) +{ + struct cache_dir *cache_dir; + struct cache *cache; + int index = 0; + + cache_dir = cacheinfo_create_cache_dir(cpu_id); + if (!cache_dir) + return; + + cache = cache_list; + while (cache) { + cacheinfo_create_index_dir(cache, index, cache_dir); + index++; + cache = cache->next_local; + } +} + +void cacheinfo_cpu_online(unsigned int cpu_id) +{ + struct cache *cache; + + cache = cache_chain_instantiate(cpu_id); + if (!cache) + return; + + cacheinfo_sysfs_populate(cpu_id, cache); +} + +/* functions needed to remove cache entry for cpu offline or suspend/resume */ + +#if (defined(CONFIG_PPC_PSERIES) && defined(CONFIG_SUSPEND)) || \ + defined(CONFIG_HOTPLUG_CPU) + +static struct cache *cache_lookup_by_cpu(unsigned int cpu_id) +{ + struct device_node *cpu_node; + struct cache *cache; + + cpu_node = 
of_get_cpu_node(cpu_id, NULL); + WARN_ONCE(!cpu_node, "no OF node found for CPU %i\n", cpu_id); + if (!cpu_node) + return NULL; + + cache = cache_lookup_by_node(cpu_node); + of_node_put(cpu_node); + + return cache; +} + +static void remove_index_dirs(struct cache_dir *cache_dir) +{ + struct cache_index_dir *index; + + index = cache_dir->index; + + while (index) { + struct cache_index_dir *next; + + next = index->next; + kobject_put(&index->kobj); + index = next; + } +} + +static void remove_cache_dir(struct cache_dir *cache_dir) +{ + remove_index_dirs(cache_dir); + + /* Remove cache dir from sysfs */ + kobject_del(cache_dir->kobj); + + kobject_put(cache_dir->kobj); + + kfree(cache_dir); +} + +static void cache_cpu_clear(struct cache *cache, int cpu) +{ + while (cache) { + struct cache *next = cache->next_local; + + WARN_ONCE(!cpumask_test_cpu(cpu, &cache->shared_cpu_map), + "CPU %i not accounted in %s(%s)\n", + cpu, cache->ofnode->full_name, + cache_type_string(cache)); + + cpumask_clear_cpu(cpu, &cache->shared_cpu_map); + + /* Release the cache object if all the cpus using it + * are offline */ + if (cpumask_empty(&cache->shared_cpu_map)) + release_cache(cache); + + cache = next; + } +} + +void cacheinfo_cpu_offline(unsigned int cpu_id) +{ + struct cache_dir *cache_dir; + struct cache *cache; + + /* Prevent userspace from seeing inconsistent state - remove + * the sysfs hierarchy first */ + cache_dir = per_cpu(cache_dir_pcpu, cpu_id); + + /* careful, sysfs population may have failed */ + if (cache_dir) + remove_cache_dir(cache_dir); + + per_cpu(cache_dir_pcpu, cpu_id) = NULL; + + /* clear the CPU's bit in its cache chain, possibly freeing + * cache objects */ + cache = cache_lookup_by_cpu(cpu_id); + if (cache) + cache_cpu_clear(cache, cpu_id); +} +#endif /* (CONFIG_PPC_PSERIES && CONFIG_SUSPEND) || CONFIG_HOTPLUG_CPU */ diff --git a/arch/powerpc/kernel/cacheinfo.h b/arch/powerpc/kernel/cacheinfo.h new file mode 100644 index 00000000000..a7b74d36acd --- /dev/null 
+++ b/arch/powerpc/kernel/cacheinfo.h @@ -0,0 +1,8 @@ +#ifndef _PPC_CACHEINFO_H +#define _PPC_CACHEINFO_H + +/* These are just hooks for sysfs.c to use. */ +extern void cacheinfo_cpu_online(unsigned int cpu_id); +extern void cacheinfo_cpu_offline(unsigned int cpu_id); + +#endif /* _PPC_CACHEINFO_H */ diff --git a/arch/powerpc/kernel/compat_audit.c b/arch/powerpc/kernel/compat_audit.c new file mode 100644 index 00000000000..108ff14e212 --- /dev/null +++ b/arch/powerpc/kernel/compat_audit.c @@ -0,0 +1,43 @@ +#undef __powerpc64__ +#include <asm/unistd.h> + +unsigned ppc32_dir_class[] = { +#include <asm-generic/audit_dir_write.h> +~0U +}; + +unsigned ppc32_chattr_class[] = { +#include <asm-generic/audit_change_attr.h> +~0U +}; + +unsigned ppc32_write_class[] = { +#include <asm-generic/audit_write.h> +~0U +}; + +unsigned ppc32_read_class[] = { +#include <asm-generic/audit_read.h> +~0U +}; + +unsigned ppc32_signal_class[] = { +#include <asm-generic/audit_signal.h> +~0U +}; + +int ppc32_classify_syscall(unsigned syscall) +{ + switch(syscall) { + case __NR_open: + return 2; + case __NR_openat: + return 3; + case __NR_socketcall: + return 4; + case __NR_execve: + return 5; + default: + return 1; + } +} diff --git a/arch/powerpc/kernel/cpu_setup_44x.S b/arch/powerpc/kernel/cpu_setup_44x.S new file mode 100644 index 00000000000..e32b4a9a2c2 --- /dev/null +++ b/arch/powerpc/kernel/cpu_setup_44x.S @@ -0,0 +1,74 @@ +/* + * This file contains low level CPU setup functions. + * Valentine Barshak <vbarshak@ru.mvista.com> + * MontaVista Software, Inc (c) 2007 + * + * Based on cpu_setup_6xx code by + * Benjamin Herrenschmidt <benh@kernel.crashing.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <asm/processor.h> +#include <asm/cputable.h> +#include <asm/ppc_asm.h> + +_GLOBAL(__setup_cpu_440ep) + b __init_fpu_44x +_GLOBAL(__setup_cpu_440epx) + mflr r4 + bl __init_fpu_44x + bl __plb_disable_wrp + bl __fixup_440A_mcheck + mtlr r4 + blr +_GLOBAL(__setup_cpu_440grx) + mflr r4 + bl __plb_disable_wrp + bl __fixup_440A_mcheck + mtlr r4 + blr +_GLOBAL(__setup_cpu_460ex) +_GLOBAL(__setup_cpu_460gt) +_GLOBAL(__setup_cpu_460sx) +_GLOBAL(__setup_cpu_apm821xx) + mflr r4 + bl __init_fpu_44x + bl __fixup_440A_mcheck + mtlr r4 + blr + +_GLOBAL(__setup_cpu_440x5) +_GLOBAL(__setup_cpu_440gx) +_GLOBAL(__setup_cpu_440spe) + b __fixup_440A_mcheck + +/* enable APU between CPU and FPU */ +_GLOBAL(__init_fpu_44x) + mfspr r3,SPRN_CCR0 + /* Clear DAPUIB flag in CCR0 */ + rlwinm r3,r3,0,12,10 + mtspr SPRN_CCR0,r3 + isync + blr + +/* + * Workaround for the incorrect write to DDR SDRAM errata. + * The write address can be corrupted during writes to + * DDR SDRAM when write pipelining is enabled on PLB0. + * Disable write pipelining here. + */ +#define DCRN_PLB4A0_ACR 0x81 + +_GLOBAL(__plb_disable_wrp) + mfdcr r3,DCRN_PLB4A0_ACR + /* clear WRP bit in PLB4A0_ACR */ + rlwinm r3,r3,0,8,6 + mtdcr DCRN_PLB4A0_ACR,r3 + isync + blr + diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S new file mode 100644 index 00000000000..f8cd9fba4d3 --- /dev/null +++ b/arch/powerpc/kernel/cpu_setup_6xx.S @@ -0,0 +1,489 @@ +/* + * This file contains low level CPU setup functions. + * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/cache.h> +#include <asm/mmu.h> + +_GLOBAL(__setup_cpu_603) + mflr r5 +BEGIN_MMU_FTR_SECTION + li r10,0 + mtspr SPRN_SPRG_603_LRU,r10 /* init SW LRU tracking */ +END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) +BEGIN_FTR_SECTION + bl __init_fpu_registers +END_FTR_SECTION_IFCLR(CPU_FTR_FPU_UNAVAILABLE) + bl setup_common_caches + mtlr r5 + blr +_GLOBAL(__setup_cpu_604) + mflr r5 + bl setup_common_caches + bl setup_604_hid0 + mtlr r5 + blr +_GLOBAL(__setup_cpu_750) + mflr r5 + bl __init_fpu_registers + bl setup_common_caches + bl setup_750_7400_hid0 + mtlr r5 + blr +_GLOBAL(__setup_cpu_750cx) + mflr r5 + bl __init_fpu_registers + bl setup_common_caches + bl setup_750_7400_hid0 + bl setup_750cx + mtlr r5 + blr +_GLOBAL(__setup_cpu_750fx) + mflr r5 + bl __init_fpu_registers + bl setup_common_caches + bl setup_750_7400_hid0 + bl setup_750fx + mtlr r5 + blr +_GLOBAL(__setup_cpu_7400) + mflr r5 + bl __init_fpu_registers + bl setup_7400_workarounds + bl setup_common_caches + bl setup_750_7400_hid0 + mtlr r5 + blr +_GLOBAL(__setup_cpu_7410) + mflr r5 + bl __init_fpu_registers + bl setup_7410_workarounds + bl setup_common_caches + bl setup_750_7400_hid0 + li r3,0 + mtspr SPRN_L2CR2,r3 + mtlr r5 + blr +_GLOBAL(__setup_cpu_745x) + mflr r5 + bl setup_common_caches + bl setup_745x_specifics + mtlr r5 + blr + +/* Enable caches for 603's, 604, 750 & 7400 */ +setup_common_caches: + mfspr r11,SPRN_HID0 + andi. r0,r11,HID0_DCE + ori r11,r11,HID0_ICE|HID0_DCE + ori r8,r11,HID0_ICFI + bne 1f /* don't invalidate the D-cache */ + ori r8,r8,HID0_DCI /* unless it wasn't enabled */ +1: sync + mtspr SPRN_HID0,r8 /* enable and invalidate caches */ + sync + mtspr SPRN_HID0,r11 /* enable caches */ + sync + isync + blr + +/* 604, 604e, 604ev, ... 
+ * Enable superscalar execution & branch history table + */ +setup_604_hid0: + mfspr r11,SPRN_HID0 + ori r11,r11,HID0_SIED|HID0_BHTE + ori r8,r11,HID0_BTCD + sync + mtspr SPRN_HID0,r8 /* flush branch target address cache */ + sync /* on 604e/604r */ + mtspr SPRN_HID0,r11 + sync + isync + blr + +/* 7400 <= rev 2.7 and 7410 rev = 1.0 suffer from some + * errata we work around here. + * Moto MPC710CE.pdf describes them, those are errata + * #3, #4 and #5 + * Note that we assume the firmware didn't choose to + * apply other workarounds (there are other ones documented + * in the .pdf). It appears that Apple firmware only works + * around #3 and with the same fix we use. We may want to + * check if the CPU is using 60x bus mode in which case + * the workaround for errata #4 is useless. Also, we may + * want to explicitly clear HID0_NOPDST as this is not + * needed once we have applied workaround #5 (though it's + * not set by Apple's firmware at least). + */ +setup_7400_workarounds: + mfpvr r3 + rlwinm r3,r3,0,20,31 + cmpwi 0,r3,0x0207 + ble 1f + blr +setup_7410_workarounds: + mfpvr r3 + rlwinm r3,r3,0,20,31 + cmpwi 0,r3,0x0100 + bnelr +1: + mfspr r11,SPRN_MSSSR0 + /* Errata #3: Set L1OPQ_SIZE to 0x10 */ + rlwinm r11,r11,0,9,6 + oris r11,r11,0x0100 + /* Errata #4: Set L2MQ_SIZE to 1 (check for MPX mode first ?) 
*/ + oris r11,r11,0x0002 + /* Errata #5: Set DRLT_SIZE to 0x01 */ + rlwinm r11,r11,0,5,2 + oris r11,r11,0x0800 + sync + mtspr SPRN_MSSSR0,r11 + sync + isync + blr + +/* 740/750/7400/7410 + * Enable Store Gathering (SGE), Address Broadcast (ABE), + * Branch History Table (BHTE), Branch Target ICache (BTIC) + * Dynamic Power Management (DPM), Speculative (SPD) + * Clear Instruction cache throttling (ICTC) + */ +setup_750_7400_hid0: + mfspr r11,SPRN_HID0 + ori r11,r11,HID0_SGE | HID0_ABE | HID0_BHTE | HID0_BTIC + oris r11,r11,HID0_DPM@h +BEGIN_FTR_SECTION + xori r11,r11,HID0_BTIC +END_FTR_SECTION_IFSET(CPU_FTR_NO_BTIC) +BEGIN_FTR_SECTION + xoris r11,r11,HID0_DPM@h /* disable dynamic power mgmt */ +END_FTR_SECTION_IFSET(CPU_FTR_NO_DPM) + li r3,HID0_SPD + andc r11,r11,r3 /* clear SPD: enable speculative */ + li r3,0 + mtspr SPRN_ICTC,r3 /* Instruction Cache Throttling off */ + isync + mtspr SPRN_HID0,r11 + sync + isync + blr + +/* 750cx specific + * Looks like we have to disable NAP feature for some PLL settings... + * (waiting for confirmation) + */ +setup_750cx: + mfspr r10, SPRN_HID1 + rlwinm r10,r10,4,28,31 + cmpwi cr0,r10,7 + cmpwi cr1,r10,9 + cmpwi cr2,r10,11 + cror 4*cr0+eq,4*cr0+eq,4*cr1+eq + cror 4*cr0+eq,4*cr0+eq,4*cr2+eq + bnelr + lwz r6,CPU_SPEC_FEATURES(r4) + li r7,CPU_FTR_CAN_NAP + andc r6,r6,r7 + stw r6,CPU_SPEC_FEATURES(r4) + blr + +/* 750fx specific + */ +setup_750fx: + blr + +/* MPC 745x + * Enable Store Gathering (SGE), Branch Folding (FOLD) + * Branch History Table (BHTE), Branch Target ICache (BTIC) + * Dynamic Power Management (DPM), Speculative (SPD) + * Ensure our data cache instructions really operate. + * Timebase has to be running or we wouldn't have made it here, + * just ensure we don't disable it. + * Clear Instruction cache throttling (ICTC) + * Enable L2 HW prefetch + */ +setup_745x_specifics: + /* We check for the presence of an L3 cache setup by + * the firmware. 
If any, we disable NAP capability as + * it's known to be bogus on rev 2.1 and earlier + */ +BEGIN_FTR_SECTION + mfspr r11,SPRN_L3CR + andis. r11,r11,L3CR_L3E@h + beq 1f +END_FTR_SECTION_IFSET(CPU_FTR_L3CR) + lwz r6,CPU_SPEC_FEATURES(r4) + andi. r0,r6,CPU_FTR_L3_DISABLE_NAP + beq 1f + li r7,CPU_FTR_CAN_NAP + andc r6,r6,r7 + stw r6,CPU_SPEC_FEATURES(r4) +1: + mfspr r11,SPRN_HID0 + + /* All of the bits we have to set..... + */ + ori r11,r11,HID0_SGE | HID0_FOLD | HID0_BHTE + ori r11,r11,HID0_LRSTK | HID0_BTIC + oris r11,r11,HID0_DPM@h +BEGIN_MMU_FTR_SECTION + oris r11,r11,HID0_HIGH_BAT@h +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) +BEGIN_FTR_SECTION + xori r11,r11,HID0_BTIC +END_FTR_SECTION_IFSET(CPU_FTR_NO_BTIC) +BEGIN_FTR_SECTION + xoris r11,r11,HID0_DPM@h /* disable dynamic power mgmt */ +END_FTR_SECTION_IFSET(CPU_FTR_NO_DPM) + + /* All of the bits we have to clear.... + */ + li r3,HID0_SPD | HID0_NOPDST | HID0_NOPTI + andc r11,r11,r3 /* clear SPD: enable speculative */ + li r3,0 + + mtspr SPRN_ICTC,r3 /* Instruction Cache Throttling off */ + isync + mtspr SPRN_HID0,r11 + sync + isync + + /* Enable L2 HW prefetch, if L2 is enabled + */ + mfspr r3,SPRN_L2CR + andis. r3,r3,L2CR_L2E@h + beqlr + mfspr r3,SPRN_MSSCR0 + ori r3,r3,3 + sync + mtspr SPRN_MSSCR0,r3 + sync + isync + blr + +/* + * Initialize the FPU registers. 
This is needed to work around an errata + * in some 750 cpus where using a not yet initialized FPU register after + * power on reset may hang the CPU + */ +_GLOBAL(__init_fpu_registers) + mfmsr r10 + ori r11,r10,MSR_FP + mtmsr r11 + isync + addis r9,r3,empty_zero_page@ha + addi r9,r9,empty_zero_page@l + REST_32FPRS(0,r9) + sync + mtmsr r10 + isync + blr + + +/* Definitions for the table use to save CPU states */ +#define CS_HID0 0 +#define CS_HID1 4 +#define CS_HID2 8 +#define CS_MSSCR0 12 +#define CS_MSSSR0 16 +#define CS_ICTRL 20 +#define CS_LDSTCR 24 +#define CS_LDSTDB 28 +#define CS_SIZE 32 + + .data + .balign L1_CACHE_BYTES +cpu_state_storage: + .space CS_SIZE + .balign L1_CACHE_BYTES,0 + .text + +/* Called in normal context to backup CPU 0 state. This + * does not include cache settings. This function is also + * called for machine sleep. This does not include the MMU + * setup, BATs, etc... but rather the "special" registers + * like HID0, HID1, MSSCR0, etc... + */ +_GLOBAL(__save_cpu_setup) + /* Some CR fields are volatile, we back it up all */ + mfcr r7 + + /* Get storage ptr */ + lis r5,cpu_state_storage@h + ori r5,r5,cpu_state_storage@l + + /* Save HID0 (common to all CONFIG_6xx cpus) */ + mfspr r3,SPRN_HID0 + stw r3,CS_HID0(r5) + + /* Now deal with CPU type dependent registers */ + mfspr r3,SPRN_PVR + srwi r3,r3,16 + cmplwi cr0,r3,0x8000 /* 7450 */ + cmplwi cr1,r3,0x000c /* 7400 */ + cmplwi cr2,r3,0x800c /* 7410 */ + cmplwi cr3,r3,0x8001 /* 7455 */ + cmplwi cr4,r3,0x8002 /* 7457 */ + cmplwi cr5,r3,0x8003 /* 7447A */ + cmplwi cr6,r3,0x7000 /* 750FX */ + cmplwi cr7,r3,0x8004 /* 7448 */ + /* cr1 is 7400 || 7410 */ + cror 4*cr1+eq,4*cr1+eq,4*cr2+eq + /* cr0 is 74xx */ + cror 4*cr0+eq,4*cr0+eq,4*cr3+eq + cror 4*cr0+eq,4*cr0+eq,4*cr4+eq + cror 4*cr0+eq,4*cr0+eq,4*cr1+eq + cror 4*cr0+eq,4*cr0+eq,4*cr5+eq + cror 4*cr0+eq,4*cr0+eq,4*cr7+eq + bne 1f + /* Backup 74xx specific regs */ + mfspr r4,SPRN_MSSCR0 + stw r4,CS_MSSCR0(r5) + mfspr r4,SPRN_MSSSR0 + stw 
r4,CS_MSSSR0(r5) + beq cr1,1f + /* Backup 745x specific registers */ + mfspr r4,SPRN_HID1 + stw r4,CS_HID1(r5) + mfspr r4,SPRN_ICTRL + stw r4,CS_ICTRL(r5) + mfspr r4,SPRN_LDSTCR + stw r4,CS_LDSTCR(r5) + mfspr r4,SPRN_LDSTDB + stw r4,CS_LDSTDB(r5) +1: + bne cr6,1f + /* Backup 750FX specific registers */ + mfspr r4,SPRN_HID1 + stw r4,CS_HID1(r5) + /* If rev 2.x, backup HID2 */ + mfspr r3,SPRN_PVR + andi. r3,r3,0xff00 + cmpwi cr0,r3,0x0200 + bne 1f + mfspr r4,SPRN_HID2 + stw r4,CS_HID2(r5) +1: + mtcr r7 + blr + +/* Called with no MMU context (typically MSR:IR/DR off) to + * restore CPU state as backed up by the previous + * function. This does not include cache setting + */ +_GLOBAL(__restore_cpu_setup) + /* Some CR fields are volatile, we back it up all */ + mfcr r7 + + /* Get storage ptr */ + lis r5,(cpu_state_storage-KERNELBASE)@h + ori r5,r5,cpu_state_storage@l + + /* Restore HID0 */ + lwz r3,CS_HID0(r5) + sync + isync + mtspr SPRN_HID0,r3 + sync + isync + + /* Now deal with CPU type dependent registers */ + mfspr r3,SPRN_PVR + srwi r3,r3,16 + cmplwi cr0,r3,0x8000 /* 7450 */ + cmplwi cr1,r3,0x000c /* 7400 */ + cmplwi cr2,r3,0x800c /* 7410 */ + cmplwi cr3,r3,0x8001 /* 7455 */ + cmplwi cr4,r3,0x8002 /* 7457 */ + cmplwi cr5,r3,0x8003 /* 7447A */ + cmplwi cr6,r3,0x7000 /* 750FX */ + cmplwi cr7,r3,0x8004 /* 7448 */ + /* cr1 is 7400 || 7410 */ + cror 4*cr1+eq,4*cr1+eq,4*cr2+eq + /* cr0 is 74xx */ + cror 4*cr0+eq,4*cr0+eq,4*cr3+eq + cror 4*cr0+eq,4*cr0+eq,4*cr4+eq + cror 4*cr0+eq,4*cr0+eq,4*cr1+eq + cror 4*cr0+eq,4*cr0+eq,4*cr5+eq + cror 4*cr0+eq,4*cr0+eq,4*cr7+eq + bne 2f + /* Restore 74xx specific regs */ + lwz r4,CS_MSSCR0(r5) + sync + mtspr SPRN_MSSCR0,r4 + sync + isync + lwz r4,CS_MSSSR0(r5) + sync + mtspr SPRN_MSSSR0,r4 + sync + isync + bne cr2,1f + /* Clear 7410 L2CR2 */ + li r4,0 + mtspr SPRN_L2CR2,r4 +1: beq cr1,2f + /* Restore 745x specific registers */ + lwz r4,CS_HID1(r5) + sync + mtspr SPRN_HID1,r4 + isync + sync + lwz r4,CS_ICTRL(r5) + sync + mtspr 
SPRN_ICTRL,r4 + isync + sync + lwz r4,CS_LDSTCR(r5) + sync + mtspr SPRN_LDSTCR,r4 + isync + sync + lwz r4,CS_LDSTDB(r5) + sync + mtspr SPRN_LDSTDB,r4 + isync + sync +2: bne cr6,1f + /* Restore 750FX specific registers + * that is restore HID2 on rev 2.x and PLL config & switch + * to PLL 0 on all + */ + /* If rev 2.x, restore HID2 with low voltage bit cleared */ + mfspr r3,SPRN_PVR + andi. r3,r3,0xff00 + cmpwi cr0,r3,0x0200 + bne 4f + lwz r4,CS_HID2(r5) + rlwinm r4,r4,0,19,17 + mtspr SPRN_HID2,r4 + sync +4: + lwz r4,CS_HID1(r5) + rlwinm r5,r4,0,16,14 + mtspr SPRN_HID1,r5 + /* Wait for PLL to stabilize */ + mftbl r5 +3: mftbl r6 + sub r6,r6,r5 + cmplwi cr0,r6,10000 + ble 3b + /* Setup final PLL */ + mtspr SPRN_HID1,r4 +1: + mtcr r7 + blr + diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S new file mode 100644 index 00000000000..4f1393d2007 --- /dev/null +++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S @@ -0,0 +1,225 @@ +/* + * This file contains low level CPU setup functions. + * Kumar Gala <galak@kernel.crashing.org> + * Copyright 2009 Freescale Semiconductor, Inc. + * + * Based on cpu_setup_6xx code by + * Benjamin Herrenschmidt <benh@kernel.crashing.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <asm/processor.h> +#include <asm/cputable.h> +#include <asm/ppc_asm.h> +#include <asm/mmu-book3e.h> +#include <asm/asm-offsets.h> + +_GLOBAL(__e500_icache_setup) + mfspr r0, SPRN_L1CSR1 + andi. r3, r0, L1CSR1_ICE + bnelr /* Already enabled */ + oris r0, r0, L1CSR1_CPE@h + ori r0, r0, (L1CSR1_ICFI | L1CSR1_ICLFR | L1CSR1_ICE) + mtspr SPRN_L1CSR1, r0 /* Enable I-Cache */ + isync + blr + +_GLOBAL(__e500_dcache_setup) + mfspr r0, SPRN_L1CSR0 + andi. 
r3, r0, L1CSR0_DCE + bnelr /* Already enabled */ + msync + isync + li r0, 0 + mtspr SPRN_L1CSR0, r0 /* Disable */ + msync + isync + li r0, (L1CSR0_DCFI | L1CSR0_CLFC) + mtspr SPRN_L1CSR0, r0 /* Invalidate */ + isync +1: mfspr r0, SPRN_L1CSR0 + andi. r3, r0, L1CSR0_CLFC + bne+ 1b /* Wait for lock bits reset */ + oris r0, r0, L1CSR0_CPE@h + ori r0, r0, L1CSR0_DCE + msync + isync + mtspr SPRN_L1CSR0, r0 /* Enable */ + isync + blr + +/* + * FIXME - we haven't yet done testing to determine a reasonable default + * value for PW20_WAIT_IDLE_BIT. + */ +#define PW20_WAIT_IDLE_BIT 50 /* 1ms, TB frequency is 41.66MHZ */ +_GLOBAL(setup_pw20_idle) + mfspr r3, SPRN_PWRMGTCR0 + + /* Set PW20_WAIT bit, enable pw20 state*/ + ori r3, r3, PWRMGTCR0_PW20_WAIT + li r11, PW20_WAIT_IDLE_BIT + + /* Set Automatic PW20 Core Idle Count */ + rlwimi r3, r11, PWRMGTCR0_PW20_ENT_SHIFT, PWRMGTCR0_PW20_ENT + + mtspr SPRN_PWRMGTCR0, r3 + + blr + +/* + * FIXME - we haven't yet done testing to determine a reasonable default + * value for AV_WAIT_IDLE_BIT. + */ +#define AV_WAIT_IDLE_BIT 50 /* 1ms, TB frequency is 41.66MHZ */ +_GLOBAL(setup_altivec_idle) + mfspr r3, SPRN_PWRMGTCR0 + + /* Enable Altivec Idle */ + oris r3, r3, PWRMGTCR0_AV_IDLE_PD_EN@h + li r11, AV_WAIT_IDLE_BIT + + /* Set Automatic AltiVec Idle Count */ + rlwimi r3, r11, PWRMGTCR0_AV_IDLE_CNT_SHIFT, PWRMGTCR0_AV_IDLE_CNT + + mtspr SPRN_PWRMGTCR0, r3 + + blr + +_GLOBAL(__setup_cpu_e6500) + mflr r6 +#ifdef CONFIG_PPC64 + bl setup_altivec_ivors + /* Touch IVOR42 only if the CPU supports E.HV category */ + mfspr r10,SPRN_MMUCFG + rlwinm. 
r10,r10,0,MMUCFG_LPIDSIZE + beq 1f + bl setup_lrat_ivor +1: +#endif + bl setup_pw20_idle + bl setup_altivec_idle + bl __setup_cpu_e5500 + mtlr r6 + blr + +#ifdef CONFIG_PPC32 +_GLOBAL(__setup_cpu_e200) + /* enable dedicated debug exception handling resources (Debug APU) */ + mfspr r3,SPRN_HID0 + ori r3,r3,HID0_DAPUEN@l + mtspr SPRN_HID0,r3 + b __setup_e200_ivors +_GLOBAL(__setup_cpu_e500v1) +_GLOBAL(__setup_cpu_e500v2) + mflr r4 + bl __e500_icache_setup + bl __e500_dcache_setup + bl __setup_e500_ivors +#if defined(CONFIG_FSL_RIO) || defined(CONFIG_FSL_PCI) + /* Ensure that RFXE is set */ + mfspr r3,SPRN_HID1 + oris r3,r3,HID1_RFXE@h + mtspr SPRN_HID1,r3 +#endif + mtlr r4 + blr +_GLOBAL(__setup_cpu_e500mc) +_GLOBAL(__setup_cpu_e5500) + mflr r5 + bl __e500_icache_setup + bl __e500_dcache_setup + bl __setup_e500mc_ivors + /* + * We only want to touch IVOR38-41 if we're running on hardware + * that supports category E.HV. The architectural way to determine + * this is MMUCFG[LPIDSIZE]. + */ + mfspr r3, SPRN_MMUCFG + rlwinm. r3, r3, 0, MMUCFG_LPIDSIZE + beq 1f + bl __setup_ehv_ivors + b 2f +1: + lwz r3, CPU_SPEC_FEATURES(r4) + /* We need this check as cpu_setup is also called for + * the secondary cores. So, if we have already cleared + * the feature on the primary core, avoid doing it on the + * secondary core. + */ + andis. r6, r3, CPU_FTR_EMB_HV@h + beq 2f + rlwinm r3, r3, 0, ~CPU_FTR_EMB_HV + stw r3, CPU_SPEC_FEATURES(r4) +2: + mtlr r5 + blr +#endif + +#ifdef CONFIG_PPC_BOOK3E_64 +_GLOBAL(__restore_cpu_e6500) + mflr r5 + bl setup_altivec_ivors + /* Touch IVOR42 only if the CPU supports E.HV category */ + mfspr r10,SPRN_MMUCFG + rlwinm. 
r10,r10,0,MMUCFG_LPIDSIZE + beq 1f + bl setup_lrat_ivor +1: + bl setup_pw20_idle + bl setup_altivec_idle + bl __restore_cpu_e5500 + mtlr r5 + blr + +_GLOBAL(__restore_cpu_e5500) + mflr r4 + bl __e500_icache_setup + bl __e500_dcache_setup + bl __setup_base_ivors + bl setup_perfmon_ivor + bl setup_doorbell_ivors + /* + * We only want to touch IVOR38-41 if we're running on hardware + * that supports category E.HV. The architectural way to determine + * this is MMUCFG[LPIDSIZE]. + */ + mfspr r10,SPRN_MMUCFG + rlwinm. r10,r10,0,MMUCFG_LPIDSIZE + beq 1f + bl setup_ehv_ivors +1: + mtlr r4 + blr + +_GLOBAL(__setup_cpu_e5500) + mflr r5 + bl __e500_icache_setup + bl __e500_dcache_setup + bl __setup_base_ivors + bl setup_perfmon_ivor + bl setup_doorbell_ivors + /* + * We only want to touch IVOR38-41 if we're running on hardware + * that supports category E.HV. The architectural way to determine + * this is MMUCFG[LPIDSIZE]. + */ + mfspr r10,SPRN_MMUCFG + rlwinm. r10,r10,0,MMUCFG_LPIDSIZE + beq 1f + bl setup_ehv_ivors + b 2f +1: + ld r10,CPU_SPEC_FEATURES(r4) + LOAD_REG_IMMEDIATE(r9,CPU_FTR_EMB_HV) + andc r10,r10,r9 + std r10,CPU_SPEC_FEATURES(r4) +2: + mtlr r5 + blr +#endif diff --git a/arch/powerpc/kernel/cpu_setup_pa6t.S b/arch/powerpc/kernel/cpu_setup_pa6t.S new file mode 100644 index 00000000000..d62cb9cae4e --- /dev/null +++ b/arch/powerpc/kernel/cpu_setup_pa6t.S @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2006-2007 PA Semi, Inc + * + * Maintained by: Olof Johansson <olof@lixom.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/cache.h> + +/* Right now, restore and setup are the same thing */ +_GLOBAL(__restore_cpu_pa6t) +_GLOBAL(__setup_cpu_pa6t) + /* Do nothing if not running in HV mode */ + mfmsr r0 + rldicl. r0,r0,4,63 + beqlr + + mfspr r0,SPRN_HID5 + ori r0,r0,0x38 + mtspr SPRN_HID5,r0 + + mfspr r0,SPRN_LPCR + ori r0,r0,0x7000 + mtspr SPRN_LPCR,r0 + + blr diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S new file mode 100644 index 00000000000..46733535cc0 --- /dev/null +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -0,0 +1,182 @@ +/* + * This file contains low level CPU setup functions. + * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/cache.h> + +/* Entry: r3 = crap, r4 = ptr to cputable entry + * + * Note that we can be called twice for pseudo-PVRs + */ +_GLOBAL(__setup_cpu_power7) + mflr r11 + bl __init_hvmode_206 + mtlr r11 + beqlr + li r0,0 + mtspr SPRN_LPID,r0 + mfspr r3,SPRN_LPCR + bl __init_LPCR + bl __init_tlb_power7 + mtlr r11 + blr + +_GLOBAL(__restore_cpu_power7) + mflr r11 + mfmsr r3 + rldicl. 
r0,r3,4,63 + beqlr + li r0,0 + mtspr SPRN_LPID,r0 + mfspr r3,SPRN_LPCR + bl __init_LPCR + bl __init_tlb_power7 + mtlr r11 + blr + +_GLOBAL(__setup_cpu_power8) + mflr r11 + bl __init_FSCR + bl __init_PMU + bl __init_hvmode_206 + mtlr r11 + beqlr + li r0,0 + mtspr SPRN_LPID,r0 + mfspr r3,SPRN_LPCR + ori r3, r3, LPCR_PECEDH + bl __init_LPCR + bl __init_HFSCR + bl __init_tlb_power8 + bl __init_PMU_HV + mtlr r11 + blr + +_GLOBAL(__restore_cpu_power8) + mflr r11 + bl __init_FSCR + bl __init_PMU + mfmsr r3 + rldicl. r0,r3,4,63 + mtlr r11 + beqlr + li r0,0 + mtspr SPRN_LPID,r0 + mfspr r3,SPRN_LPCR + ori r3, r3, LPCR_PECEDH + bl __init_LPCR + bl __init_HFSCR + bl __init_tlb_power8 + bl __init_PMU_HV + mtlr r11 + blr + +__init_hvmode_206: + /* Disable CPU_FTR_HVMODE and exit if MSR:HV is not set */ + mfmsr r3 + rldicl. r0,r3,4,63 + bnelr + ld r5,CPU_SPEC_FEATURES(r4) + LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE) + xor r5,r5,r6 + std r5,CPU_SPEC_FEATURES(r4) + blr + +__init_LPCR: + /* Setup a sane LPCR: + * Called with initial LPCR in R3 + * + * LPES = 0b01 (HSRR0/1 used for 0x500) + * PECE = 0b111 + * DPFD = 4 + * HDICE = 0 + * VC = 0b100 (VPM0=1, VPM1=0, ISL=0) + * VRMASD = 0b10000 (L=1, LP=00) + * + * Other bits untouched for now + */ + li r5,1 + rldimi r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2 + ori r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2) + li r5,4 + rldimi r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3 + clrrdi r3,r3,1 /* clear HDICE */ + li r5,4 + rldimi r3,r5, LPCR_VC_SH, 0 + li r5,0x10 + rldimi r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5 + mtspr SPRN_LPCR,r3 + isync + blr + +__init_FSCR: + mfspr r3,SPRN_FSCR + ori r3,r3,FSCR_TAR|FSCR_DSCR|FSCR_EBB + mtspr SPRN_FSCR,r3 + blr + +__init_HFSCR: + mfspr r3,SPRN_HFSCR + ori r3,r3,HFSCR_TAR|HFSCR_TM|HFSCR_BHRB|HFSCR_PM|\ + HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB + mtspr SPRN_HFSCR,r3 + blr + +/* + * Clear the TLB using the specified IS form of tlbiel instruction + * (invalidate by congruence class). P7 has 128 CCs., P8 has 512. 
+ * + * r3 = IS field + */ +__init_tlb_power7: + li r3,0xc00 /* IS field = 0b11 */ +_GLOBAL(__flush_tlb_power7) + li r6,128 + mtctr r6 + mr r7,r3 /* IS field */ + ptesync +2: tlbiel r7 + addi r7,r7,0x1000 + bdnz 2b + ptesync +1: blr + +__init_tlb_power8: + li r3,0xc00 /* IS field = 0b11 */ +_GLOBAL(__flush_tlb_power8) + li r6,512 + mtctr r6 + mr r7,r3 /* IS field */ + ptesync +2: tlbiel r7 + addi r7,r7,0x1000 + bdnz 2b + ptesync +1: blr + +__init_PMU_HV: + li r5,0 + mtspr SPRN_MMCRC,r5 + mtspr SPRN_MMCRH,r5 + blr + +__init_PMU: + li r5,0 + mtspr SPRN_MMCRS,r5 + mtspr SPRN_MMCRA,r5 + mtspr SPRN_MMCR0,r5 + mtspr SPRN_MMCR1,r5 + mtspr SPRN_MMCR2,r5 + blr diff --git a/arch/powerpc/kernel/cpu_setup_ppc970.S b/arch/powerpc/kernel/cpu_setup_ppc970.S new file mode 100644 index 00000000000..12fac8df01c --- /dev/null +++ b/arch/powerpc/kernel/cpu_setup_ppc970.S @@ -0,0 +1,210 @@ +/* + * This file contains low level CPU setup functions. + * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/cache.h> + +_GLOBAL(__cpu_preinit_ppc970) + /* Do nothing if not running in HV mode */ + mfmsr r0 + rldicl. 
r0,r0,4,63 + beqlr + + /* Make sure HID4:rm_ci is off before MMU is turned off, that large + * pages are enabled with HID4:61 and clear HID5:DCBZ_size and + * HID5:DCBZ32_ill + */ + li r0,0 + mfspr r3,SPRN_HID4 + rldimi r3,r0,40,23 /* clear bit 23 (rm_ci) */ + rldimi r3,r0,2,61 /* clear bit 61 (lg_pg_en) */ + sync + mtspr SPRN_HID4,r3 + isync + sync + mfspr r3,SPRN_HID5 + rldimi r3,r0,6,56 /* clear bits 56 & 57 (DCBZ*) */ + sync + mtspr SPRN_HID5,r3 + isync + sync + + /* Setup some basic HID1 features */ + mfspr r0,SPRN_HID1 + li r3,0x1200 /* enable i-fetch cacheability */ + sldi r3,r3,44 /* and prefetch */ + or r0,r0,r3 + mtspr SPRN_HID1,r0 + mtspr SPRN_HID1,r0 + isync + + /* Clear HIOR */ + li r0,0 + sync + mtspr SPRN_HIOR,0 /* Clear interrupt prefix */ + isync + blr + +/* Definitions for the table use to save CPU states */ +#define CS_HID0 0 +#define CS_HID1 8 +#define CS_HID4 16 +#define CS_HID5 24 +#define CS_SIZE 32 + + .data + .balign L1_CACHE_BYTES,0 +cpu_state_storage: + .space CS_SIZE + .balign L1_CACHE_BYTES,0 + .text + + +_GLOBAL(__setup_cpu_ppc970) + /* Do nothing if not running in HV mode */ + mfmsr r0 + rldicl. r0,r0,4,63 + beq no_hv_mode + + mfspr r0,SPRN_HID0 + li r11,5 /* clear DOZE and SLEEP */ + rldimi r0,r11,52,8 /* set NAP and DPM */ + li r11,0 + rldimi r0,r11,32,31 /* clear EN_ATTN */ + b load_hids /* Jump to shared code */ + + +_GLOBAL(__setup_cpu_ppc970MP) + /* Do nothing if not running in HV mode */ + mfmsr r0 + rldicl. 
r0,r0,4,63 + beq no_hv_mode + + mfspr r0,SPRN_HID0 + li r11,0x15 /* clear DOZE and SLEEP */ + rldimi r0,r11,52,6 /* set DEEPNAP, NAP and DPM */ + li r11,0 + rldimi r0,r11,32,31 /* clear EN_ATTN */ + +load_hids: + mtspr SPRN_HID0,r0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + sync + isync + + /* Try to set LPES = 01 in HID4 */ + mfspr r0,SPRN_HID4 + clrldi r0,r0,1 /* clear LPES0 */ + ori r0,r0,HID4_LPES1 /* set LPES1 */ + sync + mtspr SPRN_HID4,r0 + isync + + /* Save away cpu state */ + LOAD_REG_ADDR(r5,cpu_state_storage) + + /* Save HID0,1,4 and 5 */ + mfspr r3,SPRN_HID0 + std r3,CS_HID0(r5) + mfspr r3,SPRN_HID1 + std r3,CS_HID1(r5) + mfspr r4,SPRN_HID4 + std r4,CS_HID4(r5) + mfspr r3,SPRN_HID5 + std r3,CS_HID5(r5) + + /* See if we successfully set LPES1 to 1; if not we are in Apple mode */ + andi. r4,r4,HID4_LPES1 + bnelr + +no_hv_mode: + /* Disable CPU_FTR_HVMODE and exit, since we don't have HV mode */ + ld r5,CPU_SPEC_FEATURES(r4) + LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE) + andc r5,r5,r6 + std r5,CPU_SPEC_FEATURES(r4) + blr + +/* Called with no MMU context (typically MSR:IR/DR off) to + * restore CPU state as backed up by the previous + * function. This does not include cache setting + */ +_GLOBAL(__restore_cpu_ppc970) + /* Do nothing if not running in HV mode */ + mfmsr r0 + rldicl. 
r0,r0,4,63 + beqlr + + LOAD_REG_ADDR(r5,cpu_state_storage) + /* Before accessing memory, we make sure rm_ci is clear */ + li r0,0 + mfspr r3,SPRN_HID4 + rldimi r3,r0,40,23 /* clear bit 23 (rm_ci) */ + sync + mtspr SPRN_HID4,r3 + isync + sync + + /* Clear interrupt prefix */ + li r0,0 + sync + mtspr SPRN_HIOR,0 + isync + + /* Restore HID0 */ + ld r3,CS_HID0(r5) + sync + isync + mtspr SPRN_HID0,r3 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + sync + isync + + /* Restore HID1 */ + ld r3,CS_HID1(r5) + sync + isync + mtspr SPRN_HID1,r3 + mtspr SPRN_HID1,r3 + sync + isync + + /* Restore HID4 */ + ld r3,CS_HID4(r5) + sync + isync + mtspr SPRN_HID4,r3 + sync + isync + + /* Restore HID5 */ + ld r3,CS_HID5(r5) + sync + isync + mtspr SPRN_HID5,r3 + sync + isync + blr + diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index b91345fa080..0c157642c2a 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -10,19 +10,24 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/config.h> #include <linux/string.h> #include <linux/sched.h> #include <linux/threads.h> #include <linux/init.h> -#include <linux/module.h> +#include <linux/export.h> #include <asm/oprofile_impl.h> #include <asm/cputable.h> +#include <asm/prom.h> /* for PTRRELOC on ARCH=ppc */ +#include <asm/mmu.h> +#include <asm/setup.h> struct cpu_spec* cur_cpu_spec = NULL; EXPORT_SYMBOL(cur_cpu_spec); +/* The platform string corresponding to the real PVR */ +const char *powerpc_base_platform; + /* NOTE: * Unlike ppc32, ppc64 will only call this once for the boot CPU, it's * the responsibility of the appropriate CPU save/restore functions to @@ -30,11 +35,21 @@ EXPORT_SYMBOL(cur_cpu_spec); * part of the cputable though. 
That has to be fixed for both ppc32 * and ppc64 */ -#ifdef CONFIG_PPC64 -extern void __setup_cpu_power3(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_power4(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_be(unsigned long offset, struct cpu_spec* spec); -#else +#ifdef CONFIG_PPC32 +extern void __setup_cpu_e200(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_e500v1(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_e500v2(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_e500mc(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_440ep(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_440epx(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_440gx(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_440grx(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_440spe(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_440x5(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_460ex(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_460gt(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_460sx(unsigned long offset, struct cpu_spec *spec); +extern void __setup_cpu_apm821xx(unsigned long offset, struct cpu_spec *spec); extern void __setup_cpu_603(unsigned long offset, struct cpu_spec* spec); extern void __setup_cpu_604(unsigned long offset, struct cpu_spec* spec); extern void __setup_cpu_750(unsigned long offset, struct cpu_spec* spec); @@ -44,7 +59,29 @@ extern void __setup_cpu_7400(unsigned long offset, struct cpu_spec* spec); extern void __setup_cpu_7410(unsigned long offset, struct cpu_spec* spec); extern void __setup_cpu_745x(unsigned long offset, struct cpu_spec* spec); #endif /* CONFIG_PPC32 */ +#ifdef CONFIG_PPC64 extern void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* 
spec); +extern void __setup_cpu_ppc970MP(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_pa6t(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_a2(unsigned long offset, struct cpu_spec* spec); +extern void __restore_cpu_pa6t(void); +extern void __restore_cpu_ppc970(void); +extern void __setup_cpu_power7(unsigned long offset, struct cpu_spec* spec); +extern void __restore_cpu_power7(void); +extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec); +extern void __restore_cpu_power8(void); +extern void __restore_cpu_a2(void); +extern void __flush_tlb_power7(unsigned long inval_selector); +extern void __flush_tlb_power8(unsigned long inval_selector); +extern long __machine_check_early_realmode_p7(struct pt_regs *regs); +extern long __machine_check_early_realmode_p8(struct pt_regs *regs); +#endif /* CONFIG_PPC64 */ +#if defined(CONFIG_E500) +extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_e6500(unsigned long offset, struct cpu_spec* spec); +extern void __restore_cpu_e5500(void); +extern void __restore_cpu_e6500(void); +#endif /* CONFIG_E500 */ /* This table only contains "desktop" CPUs, it need to be filled with embedded * ones as well... 
@@ -52,48 +89,69 @@ extern void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* spec); #define COMMON_USER (PPC_FEATURE_32 | PPC_FEATURE_HAS_FPU | \ PPC_FEATURE_HAS_MMU) #define COMMON_USER_PPC64 (COMMON_USER | PPC_FEATURE_64) - - -/* We only set the spe features if the kernel was compiled with - * spe support - */ -#ifdef CONFIG_SPE -#define PPC_FEATURE_SPE_COMP PPC_FEATURE_HAS_SPE +#define COMMON_USER_POWER4 (COMMON_USER_PPC64 | PPC_FEATURE_POWER4) +#define COMMON_USER_POWER5 (COMMON_USER_PPC64 | PPC_FEATURE_POWER5 |\ + PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP) +#define COMMON_USER_POWER5_PLUS (COMMON_USER_PPC64 | PPC_FEATURE_POWER5_PLUS|\ + PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP) +#define COMMON_USER_POWER6 (COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_05 |\ + PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \ + PPC_FEATURE_TRUE_LE | \ + PPC_FEATURE_PSERIES_PERFMON_COMPAT) +#define COMMON_USER_POWER7 (COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_06 |\ + PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \ + PPC_FEATURE_TRUE_LE | \ + PPC_FEATURE_PSERIES_PERFMON_COMPAT) +#define COMMON_USER2_POWER7 (PPC_FEATURE2_DSCR) +#define COMMON_USER_POWER8 (COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_06 |\ + PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \ + PPC_FEATURE_TRUE_LE | \ + PPC_FEATURE_PSERIES_PERFMON_COMPAT) +#define COMMON_USER2_POWER8 (PPC_FEATURE2_ARCH_2_07 | \ + PPC_FEATURE2_HTM_COMP | PPC_FEATURE2_DSCR | \ + PPC_FEATURE2_ISEL | PPC_FEATURE2_TAR | \ + PPC_FEATURE2_VEC_CRYPTO) +#define COMMON_USER_PA6T (COMMON_USER_PPC64 | PPC_FEATURE_PA6T |\ + PPC_FEATURE_TRUE_LE | \ + PPC_FEATURE_HAS_ALTIVEC_COMP) +#ifdef CONFIG_PPC_BOOK3E_64 +#define COMMON_USER_BOOKE (COMMON_USER_PPC64 | PPC_FEATURE_BOOKE) #else -#define PPC_FEATURE_SPE_COMP 0 +#define COMMON_USER_BOOKE (PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | \ + PPC_FEATURE_BOOKE) #endif -struct cpu_spec cpu_specs[] = { -#ifdef CONFIG_PPC64 +static struct cpu_spec __initdata cpu_specs[] = { +#ifdef CONFIG_PPC_BOOK3S_64 { /* Power3 */ 
.pvr_mask = 0xffff0000, .pvr_value = 0x00400000, .cpu_name = "POWER3 (630)", .cpu_features = CPU_FTRS_POWER3, - .cpu_user_features = COMMON_USER_PPC64, + .cpu_user_features = COMMON_USER_PPC64|PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 8, - .cpu_setup = __setup_cpu_power3, -#ifdef CONFIG_OPROFILE + .pmc_type = PPC_PMC_IBM, .oprofile_cpu_type = "ppc64/power3", - .oprofile_model = &op_model_rs64, -#endif + .oprofile_type = PPC_OPROFILE_RS64, + .platform = "power3", }, { /* Power3+ */ .pvr_mask = 0xffff0000, .pvr_value = 0x00410000, .cpu_name = "POWER3 (630+)", .cpu_features = CPU_FTRS_POWER3, - .cpu_user_features = COMMON_USER_PPC64, + .cpu_user_features = COMMON_USER_PPC64|PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 8, - .cpu_setup = __setup_cpu_power3, -#ifdef CONFIG_OPROFILE + .pmc_type = PPC_PMC_IBM, .oprofile_cpu_type = "ppc64/power3", - .oprofile_model = &op_model_rs64, -#endif + .oprofile_type = PPC_OPROFILE_RS64, + .platform = "power3", }, { /* Northstar */ .pvr_mask = 0xffff0000, @@ -101,14 +159,14 @@ struct cpu_spec cpu_specs[] = { .cpu_name = "RS64-II (northstar)", .cpu_features = CPU_FTRS_RS64, .cpu_user_features = COMMON_USER_PPC64, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 8, - .cpu_setup = __setup_cpu_power3, -#ifdef CONFIG_OPROFILE + .pmc_type = PPC_PMC_IBM, .oprofile_cpu_type = "ppc64/rs64", - .oprofile_model = &op_model_rs64, -#endif + .oprofile_type = PPC_OPROFILE_RS64, + .platform = "rs64", }, { /* Pulsar */ .pvr_mask = 0xffff0000, @@ -116,14 +174,14 @@ struct cpu_spec cpu_specs[] = { .cpu_name = "RS64-III (pulsar)", .cpu_features = CPU_FTRS_RS64, .cpu_user_features = COMMON_USER_PPC64, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 8, - .cpu_setup = __setup_cpu_power3, -#ifdef CONFIG_OPROFILE + .pmc_type = PPC_PMC_IBM, 
.oprofile_cpu_type = "ppc64/rs64", - .oprofile_model = &op_model_rs64, -#endif + .oprofile_type = PPC_OPROFILE_RS64, + .platform = "rs64", }, { /* I-star */ .pvr_mask = 0xffff0000, @@ -131,14 +189,14 @@ struct cpu_spec cpu_specs[] = { .cpu_name = "RS64-III (icestar)", .cpu_features = CPU_FTRS_RS64, .cpu_user_features = COMMON_USER_PPC64, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 8, - .cpu_setup = __setup_cpu_power3, -#ifdef CONFIG_OPROFILE + .pmc_type = PPC_PMC_IBM, .oprofile_cpu_type = "ppc64/rs64", - .oprofile_model = &op_model_rs64, -#endif + .oprofile_type = PPC_OPROFILE_RS64, + .platform = "rs64", }, { /* S-star */ .pvr_mask = 0xffff0000, @@ -146,140 +204,402 @@ struct cpu_spec cpu_specs[] = { .cpu_name = "RS64-IV (sstar)", .cpu_features = CPU_FTRS_RS64, .cpu_user_features = COMMON_USER_PPC64, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 8, - .cpu_setup = __setup_cpu_power3, -#ifdef CONFIG_OPROFILE + .pmc_type = PPC_PMC_IBM, .oprofile_cpu_type = "ppc64/rs64", - .oprofile_model = &op_model_rs64, -#endif + .oprofile_type = PPC_OPROFILE_RS64, + .platform = "rs64", }, { /* Power4 */ .pvr_mask = 0xffff0000, .pvr_value = 0x00350000, .cpu_name = "POWER4 (gp)", .cpu_features = CPU_FTRS_POWER4, - .cpu_user_features = COMMON_USER_PPC64, + .cpu_user_features = COMMON_USER_POWER4, + .mmu_features = MMU_FTRS_POWER4, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 8, - .cpu_setup = __setup_cpu_power4, -#ifdef CONFIG_OPROFILE + .pmc_type = PPC_PMC_IBM, .oprofile_cpu_type = "ppc64/power4", - .oprofile_model = &op_model_rs64, -#endif + .oprofile_type = PPC_OPROFILE_POWER4, + .platform = "power4", }, { /* Power4+ */ .pvr_mask = 0xffff0000, .pvr_value = 0x00380000, .cpu_name = "POWER4+ (gq)", .cpu_features = CPU_FTRS_POWER4, - .cpu_user_features = COMMON_USER_PPC64, + .cpu_user_features = COMMON_USER_POWER4, + .mmu_features = MMU_FTRS_POWER4, .icache_bsize = 128, .dcache_bsize 
= 128, .num_pmcs = 8, - .cpu_setup = __setup_cpu_power4, -#ifdef CONFIG_OPROFILE + .pmc_type = PPC_PMC_IBM, .oprofile_cpu_type = "ppc64/power4", - .oprofile_model = &op_model_power4, -#endif + .oprofile_type = PPC_OPROFILE_POWER4, + .platform = "power4", }, { /* PPC970 */ .pvr_mask = 0xffff0000, .pvr_value = 0x00390000, .cpu_name = "PPC970", .cpu_features = CPU_FTRS_PPC970, - .cpu_user_features = COMMON_USER_PPC64 | + .cpu_user_features = COMMON_USER_POWER4 | PPC_FEATURE_HAS_ALTIVEC_COMP, + .mmu_features = MMU_FTRS_PPC970, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 8, + .pmc_type = PPC_PMC_IBM, .cpu_setup = __setup_cpu_ppc970, -#ifdef CONFIG_OPROFILE + .cpu_restore = __restore_cpu_ppc970, .oprofile_cpu_type = "ppc64/970", - .oprofile_model = &op_model_power4, -#endif + .oprofile_type = PPC_OPROFILE_POWER4, + .platform = "ppc970", }, -#endif /* CONFIG_PPC64 */ -#if defined(CONFIG_PPC64) || defined(CONFIG_POWER4) { /* PPC970FX */ .pvr_mask = 0xffff0000, .pvr_value = 0x003c0000, .cpu_name = "PPC970FX", -#ifdef CONFIG_PPC32 - .cpu_features = CPU_FTRS_970_32, -#else .cpu_features = CPU_FTRS_PPC970, -#endif - .cpu_user_features = COMMON_USER_PPC64 | + .cpu_user_features = COMMON_USER_POWER4 | PPC_FEATURE_HAS_ALTIVEC_COMP, + .mmu_features = MMU_FTRS_PPC970, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 8, + .pmc_type = PPC_PMC_IBM, .cpu_setup = __setup_cpu_ppc970, -#ifdef CONFIG_OPROFILE + .cpu_restore = __restore_cpu_ppc970, .oprofile_cpu_type = "ppc64/970", - .oprofile_model = &op_model_power4, -#endif + .oprofile_type = PPC_OPROFILE_POWER4, + .platform = "ppc970", + }, + { /* PPC970MP DD1.0 - no DEEPNAP, use regular 970 init */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x00440100, + .cpu_name = "PPC970MP", + .cpu_features = CPU_FTRS_PPC970, + .cpu_user_features = COMMON_USER_POWER4 | + PPC_FEATURE_HAS_ALTIVEC_COMP, + .mmu_features = MMU_FTRS_PPC970, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 8, + .pmc_type = PPC_PMC_IBM, + .cpu_setup 
= __setup_cpu_ppc970, + .cpu_restore = __restore_cpu_ppc970, + .oprofile_cpu_type = "ppc64/970MP", + .oprofile_type = PPC_OPROFILE_POWER4, + .platform = "ppc970", }, -#endif /* defined(CONFIG_PPC64) || defined(CONFIG_POWER4) */ -#ifdef CONFIG_PPC64 { /* PPC970MP */ .pvr_mask = 0xffff0000, .pvr_value = 0x00440000, .cpu_name = "PPC970MP", .cpu_features = CPU_FTRS_PPC970, - .cpu_user_features = COMMON_USER_PPC64 | + .cpu_user_features = COMMON_USER_POWER4 | + PPC_FEATURE_HAS_ALTIVEC_COMP, + .mmu_features = MMU_FTRS_PPC970, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 8, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_ppc970MP, + .cpu_restore = __restore_cpu_ppc970, + .oprofile_cpu_type = "ppc64/970MP", + .oprofile_type = PPC_OPROFILE_POWER4, + .platform = "ppc970", + }, + { /* PPC970GX */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00450000, + .cpu_name = "PPC970GX", + .cpu_features = CPU_FTRS_PPC970, + .cpu_user_features = COMMON_USER_POWER4 | PPC_FEATURE_HAS_ALTIVEC_COMP, + .mmu_features = MMU_FTRS_PPC970, .icache_bsize = 128, .dcache_bsize = 128, + .num_pmcs = 8, + .pmc_type = PPC_PMC_IBM, .cpu_setup = __setup_cpu_ppc970, -#ifdef CONFIG_OPROFILE .oprofile_cpu_type = "ppc64/970", - .oprofile_model = &op_model_power4, -#endif + .oprofile_type = PPC_OPROFILE_POWER4, + .platform = "ppc970", }, - { /* Power5 */ + { /* Power5 GR */ .pvr_mask = 0xffff0000, .pvr_value = 0x003a0000, .cpu_name = "POWER5 (gr)", .cpu_features = CPU_FTRS_POWER5, - .cpu_user_features = COMMON_USER_PPC64, + .cpu_user_features = COMMON_USER_POWER5, + .mmu_features = MMU_FTRS_POWER5, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 6, - .cpu_setup = __setup_cpu_power4, -#ifdef CONFIG_OPROFILE + .pmc_type = PPC_PMC_IBM, .oprofile_cpu_type = "ppc64/power5", - .oprofile_model = &op_model_power4, -#endif + .oprofile_type = PPC_OPROFILE_POWER4, + /* SIHV / SIPR bits are implemented on POWER4+ (GQ) + * and above but only works on POWER5 and above + */ + .oprofile_mmcra_sihv = 
MMCRA_SIHV, + .oprofile_mmcra_sipr = MMCRA_SIPR, + .platform = "power5", + }, + { /* Power5++ */ + .pvr_mask = 0xffffff00, + .pvr_value = 0x003b0300, + .cpu_name = "POWER5+ (gs)", + .cpu_features = CPU_FTRS_POWER5, + .cpu_user_features = COMMON_USER_POWER5_PLUS, + .mmu_features = MMU_FTRS_POWER5, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .oprofile_cpu_type = "ppc64/power5++", + .oprofile_type = PPC_OPROFILE_POWER4, + .oprofile_mmcra_sihv = MMCRA_SIHV, + .oprofile_mmcra_sipr = MMCRA_SIPR, + .platform = "power5+", }, - { /* Power5 */ + { /* Power5 GS */ .pvr_mask = 0xffff0000, .pvr_value = 0x003b0000, - .cpu_name = "POWER5 (gs)", + .cpu_name = "POWER5+ (gs)", .cpu_features = CPU_FTRS_POWER5, - .cpu_user_features = COMMON_USER_PPC64, + .cpu_user_features = COMMON_USER_POWER5_PLUS, + .mmu_features = MMU_FTRS_POWER5, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 6, - .cpu_setup = __setup_cpu_power4, -#ifdef CONFIG_OPROFILE - .oprofile_cpu_type = "ppc64/power5", - .oprofile_model = &op_model_power4, -#endif + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power5+", + .oprofile_type = PPC_OPROFILE_POWER4, + .oprofile_mmcra_sihv = MMCRA_SIHV, + .oprofile_mmcra_sipr = MMCRA_SIPR, + .platform = "power5+", + }, + { /* POWER6 in P5+ mode; 2.04-compliant processor */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000001, + .cpu_name = "POWER5+", + .cpu_features = CPU_FTRS_POWER5, + .cpu_user_features = COMMON_USER_POWER5_PLUS, + .mmu_features = MMU_FTRS_POWER5, + .icache_bsize = 128, + .dcache_bsize = 128, + .oprofile_cpu_type = "ppc64/ibm-compat-v1", + .oprofile_type = PPC_OPROFILE_POWER4, + .platform = "power5+", + }, + { /* Power6 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003e0000, + .cpu_name = "POWER6 (raw)", + .cpu_features = CPU_FTRS_POWER6, + .cpu_user_features = COMMON_USER_POWER6 | + PPC_FEATURE_POWER6_EXT, + .mmu_features = MMU_FTRS_POWER6, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = 
PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power6", + .oprofile_type = PPC_OPROFILE_POWER4, + .oprofile_mmcra_sihv = POWER6_MMCRA_SIHV, + .oprofile_mmcra_sipr = POWER6_MMCRA_SIPR, + .oprofile_mmcra_clear = POWER6_MMCRA_THRM | + POWER6_MMCRA_OTHER, + .platform = "power6x", + }, + { /* 2.05-compliant processor, i.e. Power6 "architected" mode */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000002, + .cpu_name = "POWER6 (architected)", + .cpu_features = CPU_FTRS_POWER6, + .cpu_user_features = COMMON_USER_POWER6, + .mmu_features = MMU_FTRS_POWER6, + .icache_bsize = 128, + .dcache_bsize = 128, + .oprofile_cpu_type = "ppc64/ibm-compat-v1", + .oprofile_type = PPC_OPROFILE_POWER4, + .platform = "power6", }, - { /* BE DD1.x */ + { /* 2.06-compliant processor, i.e. Power7 "architected" mode */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000003, + .cpu_name = "POWER7 (architected)", + .cpu_features = CPU_FTRS_POWER7, + .cpu_user_features = COMMON_USER_POWER7, + .cpu_user_features2 = COMMON_USER2_POWER7, + .mmu_features = MMU_FTRS_POWER7, + .icache_bsize = 128, + .dcache_bsize = 128, + .oprofile_type = PPC_OPROFILE_POWER4, + .oprofile_cpu_type = "ppc64/ibm-compat-v1", + .cpu_setup = __setup_cpu_power7, + .cpu_restore = __restore_cpu_power7, + .flush_tlb = __flush_tlb_power7, + .machine_check_early = __machine_check_early_realmode_p7, + .platform = "power7", + }, + { /* 2.07-compliant processor, i.e. 
Power8 "architected" mode */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000004, + .cpu_name = "POWER8 (architected)", + .cpu_features = CPU_FTRS_POWER8, + .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, + .mmu_features = MMU_FTRS_POWER8, + .icache_bsize = 128, + .dcache_bsize = 128, + .oprofile_type = PPC_OPROFILE_INVALID, + .oprofile_cpu_type = "ppc64/ibm-compat-v1", + .cpu_setup = __setup_cpu_power8, + .cpu_restore = __restore_cpu_power8, + .flush_tlb = __flush_tlb_power8, + .machine_check_early = __machine_check_early_realmode_p8, + .platform = "power8", + }, + { /* Power7 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003f0000, + .cpu_name = "POWER7 (raw)", + .cpu_features = CPU_FTRS_POWER7, + .cpu_user_features = COMMON_USER_POWER7, + .cpu_user_features2 = COMMON_USER2_POWER7, + .mmu_features = MMU_FTRS_POWER7, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power7", + .oprofile_type = PPC_OPROFILE_POWER4, + .cpu_setup = __setup_cpu_power7, + .cpu_restore = __restore_cpu_power7, + .flush_tlb = __flush_tlb_power7, + .machine_check_early = __machine_check_early_realmode_p7, + .platform = "power7", + }, + { /* Power7+ */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004A0000, + .cpu_name = "POWER7+ (raw)", + .cpu_features = CPU_FTRS_POWER7, + .cpu_user_features = COMMON_USER_POWER7, + .cpu_user_features2 = COMMON_USER2_POWER7, + .mmu_features = MMU_FTRS_POWER7, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power7", + .oprofile_type = PPC_OPROFILE_POWER4, + .cpu_setup = __setup_cpu_power7, + .cpu_restore = __restore_cpu_power7, + .flush_tlb = __flush_tlb_power7, + .machine_check_early = __machine_check_early_realmode_p7, + .platform = "power7+", + }, + { /* Power8E */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004b0000, + .cpu_name = "POWER8E (raw)", + .cpu_features = CPU_FTRS_POWER8E, 
+ .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, + .mmu_features = MMU_FTRS_POWER8, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power8", + .oprofile_type = PPC_OPROFILE_INVALID, + .cpu_setup = __setup_cpu_power8, + .cpu_restore = __restore_cpu_power8, + .flush_tlb = __flush_tlb_power8, + .machine_check_early = __machine_check_early_realmode_p8, + .platform = "power8", + }, + { /* Power8 DD1: Does not support doorbell IPIs */ + .pvr_mask = 0xffffff00, + .pvr_value = 0x004d0100, + .cpu_name = "POWER8 (raw)", + .cpu_features = CPU_FTRS_POWER8_DD1, + .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, + .mmu_features = MMU_FTRS_POWER8, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power8", + .oprofile_type = PPC_OPROFILE_INVALID, + .cpu_setup = __setup_cpu_power8, + .cpu_restore = __restore_cpu_power8, + .flush_tlb = __flush_tlb_power8, + .machine_check_early = __machine_check_early_realmode_p8, + .platform = "power8", + }, + { /* Power8 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004d0000, + .cpu_name = "POWER8 (raw)", + .cpu_features = CPU_FTRS_POWER8, + .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, + .mmu_features = MMU_FTRS_POWER8, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power8", + .oprofile_type = PPC_OPROFILE_INVALID, + .cpu_setup = __setup_cpu_power8, + .cpu_restore = __restore_cpu_power8, + .flush_tlb = __flush_tlb_power8, + .machine_check_early = __machine_check_early_realmode_p8, + .platform = "power8", + }, + { /* Cell Broadband Engine */ .pvr_mask = 0xffff0000, .pvr_value = 0x00700000, .cpu_name = "Cell Broadband Engine", .cpu_features = CPU_FTRS_CELL, .cpu_user_features = COMMON_USER_PPC64 | - 
PPC_FEATURE_HAS_ALTIVEC_COMP, + PPC_FEATURE_CELL | PPC_FEATURE_HAS_ALTIVEC_COMP | + PPC_FEATURE_SMT, + .mmu_features = MMU_FTRS_CELL, .icache_bsize = 128, .dcache_bsize = 128, - .cpu_setup = __setup_cpu_be, + .num_pmcs = 4, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/cell-be", + .oprofile_type = PPC_OPROFILE_CELL, + .platform = "ppc-cell-be", + }, + { /* PA Semi PA6T */ + .pvr_mask = 0x7fff0000, + .pvr_value = 0x00900000, + .cpu_name = "PA6T", + .cpu_features = CPU_FTRS_PA6T, + .cpu_user_features = COMMON_USER_PA6T, + .mmu_features = MMU_FTRS_PA6T, + .icache_bsize = 64, + .dcache_bsize = 64, + .num_pmcs = 6, + .pmc_type = PPC_PMC_PA6T, + .cpu_setup = __setup_cpu_pa6t, + .cpu_restore = __restore_cpu_pa6t, + .oprofile_cpu_type = "ppc64/pa6t", + .oprofile_type = PPC_OPROFILE_PA6T, + .platform = "pa6t", }, { /* default match */ .pvr_mask = 0x00000000, @@ -287,12 +607,15 @@ struct cpu_spec cpu_specs[] = { .cpu_name = "POWER4 (compatible)", .cpu_features = CPU_FTRS_COMPATIBLE, .cpu_user_features = COMMON_USER_PPC64, + .mmu_features = MMU_FTRS_DEFAULT_HPTE_ARCH_V2, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 6, - .cpu_setup = __setup_cpu_power4, + .pmc_type = PPC_PMC_IBM, + .platform = "power4", } -#endif /* CONFIG_PPC64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ + #ifdef CONFIG_PPC32 #if CLASSIC_PPC { /* 601 */ @@ -302,357 +625,539 @@ struct cpu_spec cpu_specs[] = { .cpu_features = CPU_FTRS_PPC601, .cpu_user_features = COMMON_USER | PPC_FEATURE_601_INSTR | PPC_FEATURE_UNIFIED_CACHE | PPC_FEATURE_NO_TB, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_generic, + .platform = "ppc601", }, { /* 603 */ .pvr_mask = 0xffff0000, .pvr_value = 0x00030000, .cpu_name = "603", .cpu_features = CPU_FTRS_603, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = 0, .icache_bsize = 32, .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603 + .cpu_setup = 
__setup_cpu_603, + .machine_check = machine_check_generic, + .platform = "ppc603", }, { /* 603e */ .pvr_mask = 0xffff0000, .pvr_value = 0x00060000, .cpu_name = "603e", .cpu_features = CPU_FTRS_603, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = 0, .icache_bsize = 32, .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603 + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_generic, + .platform = "ppc603", }, { /* 603ev */ .pvr_mask = 0xffff0000, .pvr_value = 0x00070000, .cpu_name = "603ev", .cpu_features = CPU_FTRS_603, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = 0, .icache_bsize = 32, .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603 + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_generic, + .platform = "ppc603", }, { /* 604 */ .pvr_mask = 0xffff0000, .pvr_value = 0x00040000, .cpu_name = "604", .cpu_features = CPU_FTRS_604, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 2, - .cpu_setup = __setup_cpu_604 + .cpu_setup = __setup_cpu_604, + .machine_check = machine_check_generic, + .platform = "ppc604", }, { /* 604e */ .pvr_mask = 0xfffff000, .pvr_value = 0x00090000, .cpu_name = "604e", .cpu_features = CPU_FTRS_604, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_604 + .cpu_setup = __setup_cpu_604, + .machine_check = machine_check_generic, + .platform = "ppc604", }, { /* 604r */ .pvr_mask = 0xffff0000, .pvr_value = 0x00090000, .cpu_name = "604r", .cpu_features = CPU_FTRS_604, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize 
= 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_604 + .cpu_setup = __setup_cpu_604, + .machine_check = machine_check_generic, + .platform = "ppc604", }, { /* 604ev */ .pvr_mask = 0xffff0000, .pvr_value = 0x000a0000, .cpu_name = "604ev", .cpu_features = CPU_FTRS_604, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_604 + .cpu_setup = __setup_cpu_604, + .machine_check = machine_check_generic, + .platform = "ppc604", }, { /* 740/750 (0x4202, don't support TAU ?) */ .pvr_mask = 0xffffffff, .pvr_value = 0x00084202, .cpu_name = "740/750", .cpu_features = CPU_FTRS_740_NOTAU, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_750 + .cpu_setup = __setup_cpu_750, + .machine_check = machine_check_generic, + .platform = "ppc750", }, { /* 750CX (80100 and 8010x?) 
*/ .pvr_mask = 0xfffffff0, .pvr_value = 0x00080100, .cpu_name = "750CX", .cpu_features = CPU_FTRS_750, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_750cx + .cpu_setup = __setup_cpu_750cx, + .machine_check = machine_check_generic, + .platform = "ppc750", }, { /* 750CX (82201 and 82202) */ .pvr_mask = 0xfffffff0, .pvr_value = 0x00082200, .cpu_name = "750CX", .cpu_features = CPU_FTRS_750, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_750cx + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750cx, + .machine_check = machine_check_generic, + .platform = "ppc750", }, { /* 750CXe (82214) */ .pvr_mask = 0xfffffff0, .pvr_value = 0x00082210, .cpu_name = "750CXe", .cpu_features = CPU_FTRS_750, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_750cx + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750cx, + .machine_check = machine_check_generic, + .platform = "ppc750", }, { /* 750CXe "Gekko" (83214) */ .pvr_mask = 0xffffffff, .pvr_value = 0x00083214, .cpu_name = "750CXe", .cpu_features = CPU_FTRS_750, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_750cx + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750cx, + .machine_check = machine_check_generic, + .platform = "ppc750", + }, + { /* 750CL (and "Broadway") */ + .pvr_mask = 0xfffff0e0, + .pvr_value = 0x00087000, + .cpu_name = "750CL", + .cpu_features = 
CPU_FTRS_750CL, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, + .icache_bsize = 32, + .dcache_bsize = 32, + .num_pmcs = 4, + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750, + .machine_check = machine_check_generic, + .platform = "ppc750", + .oprofile_cpu_type = "ppc/750", + .oprofile_type = PPC_OPROFILE_G4, }, { /* 745/755 */ .pvr_mask = 0xfffff000, .pvr_value = 0x00083000, .cpu_name = "745/755", .cpu_features = CPU_FTRS_750, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_750 + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750, + .machine_check = machine_check_generic, + .platform = "ppc750", }, { /* 750FX rev 1.x */ .pvr_mask = 0xffffff00, .pvr_value = 0x70000100, .cpu_name = "750FX", .cpu_features = CPU_FTRS_750FX1, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_750 + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750, + .machine_check = machine_check_generic, + .platform = "ppc750", + .oprofile_cpu_type = "ppc/750", + .oprofile_type = PPC_OPROFILE_G4, }, { /* 750FX rev 2.0 must disable HID0[DPM] */ .pvr_mask = 0xffffffff, .pvr_value = 0x70000200, .cpu_name = "750FX", .cpu_features = CPU_FTRS_750FX2, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_750 + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750, + .machine_check = machine_check_generic, + .platform = "ppc750", + .oprofile_cpu_type = "ppc/750", + .oprofile_type = PPC_OPROFILE_G4, }, { /* 750FX (All revs except 2.0) */ 
.pvr_mask = 0xffff0000, .pvr_value = 0x70000000, .cpu_name = "750FX", .cpu_features = CPU_FTRS_750FX, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_750fx + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750fx, + .machine_check = machine_check_generic, + .platform = "ppc750", + .oprofile_cpu_type = "ppc/750", + .oprofile_type = PPC_OPROFILE_G4, }, { /* 750GX */ .pvr_mask = 0xffff0000, .pvr_value = 0x70020000, .cpu_name = "750GX", .cpu_features = CPU_FTRS_750GX, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_750fx + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750fx, + .machine_check = machine_check_generic, + .platform = "ppc750", + .oprofile_cpu_type = "ppc/750", + .oprofile_type = PPC_OPROFILE_G4, }, { /* 740/750 (L2CR bit need fixup for 740) */ .pvr_mask = 0xffff0000, .pvr_value = 0x00080000, .cpu_name = "740/750", .cpu_features = CPU_FTRS_740, - .cpu_user_features = COMMON_USER, + .cpu_user_features = COMMON_USER | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_750 + .pmc_type = PPC_PMC_IBM, + .cpu_setup = __setup_cpu_750, + .machine_check = machine_check_generic, + .platform = "ppc750", }, { /* 7400 rev 1.1 ? 
(no TAU) */ .pvr_mask = 0xffffffff, .pvr_value = 0x000c1101, .cpu_name = "7400 (1.1)", .cpu_features = CPU_FTRS_7400_NOTAU, - .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features = COMMON_USER | + PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_7400 + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_7400, + .machine_check = machine_check_generic, + .platform = "ppc7400", }, { /* 7400 */ .pvr_mask = 0xffff0000, .pvr_value = 0x000c0000, .cpu_name = "7400", .cpu_features = CPU_FTRS_7400, - .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features = COMMON_USER | + PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_7400 + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_7400, + .machine_check = machine_check_generic, + .platform = "ppc7400", }, { /* 7410 */ .pvr_mask = 0xffff0000, .pvr_value = 0x800c0000, .cpu_name = "7410", .cpu_features = CPU_FTRS_7400, - .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features = COMMON_USER | + PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, - .cpu_setup = __setup_cpu_7410 + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_7410, + .machine_check = machine_check_generic, + .platform = "ppc7400", }, { /* 7450 2.0 - no doze/nap */ .pvr_mask = 0xffffffff, .pvr_value = 0x80000200, .cpu_name = "7450", .cpu_features = CPU_FTRS_7450_20, - .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features = COMMON_USER | + PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 6, - .cpu_setup = __setup_cpu_745x 
+ .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .oprofile_cpu_type = "ppc/7450", + .oprofile_type = PPC_OPROFILE_G4, + .machine_check = machine_check_generic, + .platform = "ppc7450", }, { /* 7450 2.1 */ .pvr_mask = 0xffffffff, .pvr_value = 0x80000201, .cpu_name = "7450", .cpu_features = CPU_FTRS_7450_21, - .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features = COMMON_USER | + PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 6, - .cpu_setup = __setup_cpu_745x + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .oprofile_cpu_type = "ppc/7450", + .oprofile_type = PPC_OPROFILE_G4, + .machine_check = machine_check_generic, + .platform = "ppc7450", }, { /* 7450 2.3 and newer */ .pvr_mask = 0xffff0000, .pvr_value = 0x80000000, .cpu_name = "7450", .cpu_features = CPU_FTRS_7450_23, - .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features = COMMON_USER | + PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 6, - .cpu_setup = __setup_cpu_745x + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .oprofile_cpu_type = "ppc/7450", + .oprofile_type = PPC_OPROFILE_G4, + .machine_check = machine_check_generic, + .platform = "ppc7450", }, { /* 7455 rev 1.x */ .pvr_mask = 0xffffff00, .pvr_value = 0x80010100, .cpu_name = "7455", .cpu_features = CPU_FTRS_7455_1, - .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features = COMMON_USER | + PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 6, - .cpu_setup = __setup_cpu_745x + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .oprofile_cpu_type = "ppc/7450", + .oprofile_type = PPC_OPROFILE_G4, + .machine_check = 
machine_check_generic, + .platform = "ppc7450", }, { /* 7455 rev 2.0 */ .pvr_mask = 0xffffffff, .pvr_value = 0x80010200, .cpu_name = "7455", .cpu_features = CPU_FTRS_7455_20, - .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features = COMMON_USER | + PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 6, - .cpu_setup = __setup_cpu_745x + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .oprofile_cpu_type = "ppc/7450", + .oprofile_type = PPC_OPROFILE_G4, + .machine_check = machine_check_generic, + .platform = "ppc7450", }, { /* 7455 others */ .pvr_mask = 0xffff0000, .pvr_value = 0x80010000, .cpu_name = "7455", .cpu_features = CPU_FTRS_7455, - .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features = COMMON_USER | + PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 6, - .cpu_setup = __setup_cpu_745x + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .oprofile_cpu_type = "ppc/7450", + .oprofile_type = PPC_OPROFILE_G4, + .machine_check = machine_check_generic, + .platform = "ppc7450", }, { /* 7447/7457 Rev 1.0 */ .pvr_mask = 0xffffffff, .pvr_value = 0x80020100, .cpu_name = "7447/7457", .cpu_features = CPU_FTRS_7447_10, - .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features = COMMON_USER | + PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 6, - .cpu_setup = __setup_cpu_745x + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .oprofile_cpu_type = "ppc/7450", + .oprofile_type = PPC_OPROFILE_G4, + .machine_check = machine_check_generic, + .platform = "ppc7450", }, { /* 7447/7457 Rev 1.1 */ .pvr_mask = 0xffffffff, 
.pvr_value = 0x80020101, .cpu_name = "7447/7457", .cpu_features = CPU_FTRS_7447_10, - .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features = COMMON_USER | + PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 6, - .cpu_setup = __setup_cpu_745x + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .oprofile_cpu_type = "ppc/7450", + .oprofile_type = PPC_OPROFILE_G4, + .machine_check = machine_check_generic, + .platform = "ppc7450", }, { /* 7447/7457 Rev 1.2 and later */ .pvr_mask = 0xffff0000, .pvr_value = 0x80020000, .cpu_name = "7447/7457", .cpu_features = CPU_FTRS_7447, - .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 6, - .cpu_setup = __setup_cpu_745x + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .oprofile_cpu_type = "ppc/7450", + .oprofile_type = PPC_OPROFILE_G4, + .machine_check = machine_check_generic, + .platform = "ppc7450", }, { /* 7447A */ .pvr_mask = 0xffff0000, .pvr_value = 0x80030000, .cpu_name = "7447A", .cpu_features = CPU_FTRS_7447A, - .cpu_user_features = COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features = COMMON_USER | + PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 6, - .cpu_setup = __setup_cpu_745x + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .oprofile_cpu_type = "ppc/7450", + .oprofile_type = PPC_OPROFILE_G4, + .machine_check = machine_check_generic, + .platform = "ppc7450", }, { /* 7448 */ .pvr_mask = 0xffff0000, .pvr_value = 0x80040000, .cpu_name = "7448", - .cpu_features = CPU_FTRS_7447A, - .cpu_user_features = 
COMMON_USER | PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_features = CPU_FTRS_7448, + .cpu_user_features = COMMON_USER | + PPC_FEATURE_HAS_ALTIVEC_COMP | PPC_FEATURE_PPC_LE, + .mmu_features = MMU_FTR_HPTE_TABLE | MMU_FTR_USE_HIGH_BATS, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 6, - .cpu_setup = __setup_cpu_745x + .pmc_type = PPC_PMC_G4, + .cpu_setup = __setup_cpu_745x, + .oprofile_cpu_type = "ppc/7450", + .oprofile_type = PPC_OPROFILE_G4, + .machine_check = machine_check_generic, + .platform = "ppc7450", }, { /* 82xx (8240, 8245, 8260 are all 603e cores) */ .pvr_mask = 0x7fff0000, @@ -660,9 +1165,12 @@ struct cpu_spec cpu_specs[] = { .cpu_name = "82xx", .cpu_features = CPU_FTRS_82XX, .cpu_user_features = COMMON_USER, + .mmu_features = 0, .icache_bsize = 32, .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603 + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_generic, + .platform = "ppc603", }, { /* All G2_LE (603e core, plus some) have the same pvr */ .pvr_mask = 0x7fff0000, @@ -670,19 +1178,72 @@ struct cpu_spec cpu_specs[] = { .cpu_name = "G2_LE", .cpu_features = CPU_FTRS_G2_LE, .cpu_user_features = COMMON_USER, + .mmu_features = MMU_FTR_USE_HIGH_BATS, .icache_bsize = 32, .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603 + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_generic, + .platform = "ppc603", }, - { /* e300 (a 603e core, plus some) on 83xx */ + { /* e300c1 (a 603e core, plus some) on 83xx */ .pvr_mask = 0x7fff0000, .pvr_value = 0x00830000, - .cpu_name = "e300", + .cpu_name = "e300c1", .cpu_features = CPU_FTRS_E300, .cpu_user_features = COMMON_USER, + .mmu_features = MMU_FTR_USE_HIGH_BATS, .icache_bsize = 32, .dcache_bsize = 32, - .cpu_setup = __setup_cpu_603 + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_generic, + .platform = "ppc603", + }, + { /* e300c2 (an e300c1 core, plus some, minus FPU) on 83xx */ + .pvr_mask = 0x7fff0000, + .pvr_value = 0x00840000, + .cpu_name = "e300c2", + .cpu_features = 
CPU_FTRS_E300C2, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .mmu_features = MMU_FTR_USE_HIGH_BATS | + MMU_FTR_NEED_DTLB_SW_LRU, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_generic, + .platform = "ppc603", + }, + { /* e300c3 (e300c1, plus one IU, half cache size) on 83xx */ + .pvr_mask = 0x7fff0000, + .pvr_value = 0x00850000, + .cpu_name = "e300c3", + .cpu_features = CPU_FTRS_E300, + .cpu_user_features = COMMON_USER, + .mmu_features = MMU_FTR_USE_HIGH_BATS | + MMU_FTR_NEED_DTLB_SW_LRU, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_603, + .num_pmcs = 4, + .oprofile_cpu_type = "ppc/e300", + .oprofile_type = PPC_OPROFILE_FSL_EMB, + .platform = "ppc603", + }, + { /* e300c4 (e300c1, plus one IU) */ + .pvr_mask = 0x7fff0000, + .pvr_value = 0x00860000, + .cpu_name = "e300c4", + .cpu_features = CPU_FTRS_E300, + .cpu_user_features = COMMON_USER, + .mmu_features = MMU_FTR_USE_HIGH_BATS | + MMU_FTR_NEED_DTLB_SW_LRU, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_603, + .machine_check = machine_check_generic, + .num_pmcs = 4, + .oprofile_cpu_type = "ppc/e300", + .oprofile_type = PPC_OPROFILE_FSL_EMB, + .platform = "ppc603", }, { /* default match, we assume split I/D cache & TB (non-601)... */ .pvr_mask = 0x00000000, @@ -690,8 +1251,11 @@ struct cpu_spec cpu_specs[] = { .cpu_name = "(generic PPC)", .cpu_features = CPU_FTRS_CLASSIC32, .cpu_user_features = COMMON_USER, + .mmu_features = MMU_FTR_HPTE_TABLE, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_generic, + .platform = "ppc603", }, #endif /* CLASSIC_PPC */ #ifdef CONFIG_8xx @@ -703,8 +1267,10 @@ struct cpu_spec cpu_specs[] = { * if the 8xx code is there.... 
*/ .cpu_features = CPU_FTRS_8XX, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .mmu_features = MMU_FTR_TYPE_8xx, .icache_bsize = 16, .dcache_bsize = 16, + .platform = "ppc823", }, #endif /* CONFIG_8xx */ #ifdef CONFIG_40x @@ -714,8 +1280,11 @@ struct cpu_spec cpu_specs[] = { .cpu_name = "403GC", .cpu_features = CPU_FTRS_40X, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .mmu_features = MMU_FTR_TYPE_40x, .icache_bsize = 16, .dcache_bsize = 16, + .machine_check = machine_check_4xx, + .platform = "ppc403", }, { /* 403GCX */ .pvr_mask = 0xffffff00, @@ -724,8 +1293,11 @@ struct cpu_spec cpu_specs[] = { .cpu_features = CPU_FTRS_40X, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | PPC_FEATURE_NO_TB, + .mmu_features = MMU_FTR_TYPE_40x, .icache_bsize = 16, .dcache_bsize = 16, + .machine_check = machine_check_4xx, + .platform = "ppc403", }, { /* 403G ?? */ .pvr_mask = 0xffff0000, @@ -733,8 +1305,11 @@ struct cpu_spec cpu_specs[] = { .cpu_name = "403G ??", .cpu_features = CPU_FTRS_40X, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .mmu_features = MMU_FTR_TYPE_40x, .icache_bsize = 16, .dcache_bsize = 16, + .machine_check = machine_check_4xx, + .platform = "ppc403", }, { /* 405GP */ .pvr_mask = 0xffff0000, @@ -743,8 +1318,11 @@ struct cpu_spec cpu_specs[] = { .cpu_features = CPU_FTRS_40X, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", }, { /* STB 03xxx */ .pvr_mask = 0xffff0000, @@ -753,8 +1331,11 @@ struct cpu_spec cpu_specs[] = { .cpu_features = CPU_FTRS_40X, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", }, { /* STB 04xxx */ .pvr_mask = 0xffff0000, @@ -763,8 +1344,11 @@ 
struct cpu_spec cpu_specs[] = { .cpu_features = CPU_FTRS_40X, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", }, { /* NP405L */ .pvr_mask = 0xffff0000, @@ -773,8 +1357,11 @@ struct cpu_spec cpu_specs[] = { .cpu_features = CPU_FTRS_40X, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", }, { /* NP4GS3 */ .pvr_mask = 0xffff0000, @@ -783,8 +1370,11 @@ struct cpu_spec cpu_specs[] = { .cpu_features = CPU_FTRS_40X, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", }, { /* NP405H */ .pvr_mask = 0xffff0000, @@ -793,8 +1383,11 @@ struct cpu_spec cpu_specs[] = { .cpu_features = CPU_FTRS_40X, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", }, { /* 405GPr */ .pvr_mask = 0xffff0000, @@ -803,8 +1396,11 @@ struct cpu_spec cpu_specs[] = { .cpu_features = CPU_FTRS_40X, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", }, { /* STBx25xx */ .pvr_mask = 0xffff0000, @@ -813,8 +1409,11 @@ struct cpu_spec cpu_specs[] = { .cpu_features = CPU_FTRS_40X, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = 
machine_check_4xx, + .platform = "ppc405", }, { /* 405LP */ .pvr_mask = 0xffff0000, @@ -822,18 +1421,37 @@ struct cpu_spec cpu_specs[] = { .cpu_name = "405LP", .cpu_features = CPU_FTRS_40X, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .mmu_features = MMU_FTR_TYPE_40x, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", }, { /* Xilinx Virtex-II Pro */ - .pvr_mask = 0xffff0000, + .pvr_mask = 0xfffff000, .pvr_value = 0x20010000, .cpu_name = "Virtex-II Pro", .cpu_features = CPU_FTRS_40X, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* Xilinx Virtex-4 FX */ + .pvr_mask = 0xfffff000, + .pvr_value = 0x20011000, + .cpu_name = "Virtex-4 FX", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | + PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", }, { /* 405EP */ .pvr_mask = 0xffff0000, @@ -842,105 +1460,563 @@ struct cpu_spec cpu_specs[] = { .cpu_features = CPU_FTRS_40X, .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EX Rev. A/B with Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x12910007, + .cpu_name = "405EX Rev. A/B", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | + PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EX Rev. 
C without Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x1291000d, + .cpu_name = "405EX Rev. C", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | + PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EX Rev. C with Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x1291000f, + .cpu_name = "405EX Rev. C", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | + PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EX Rev. D without Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x12910003, + .cpu_name = "405EX Rev. D", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | + PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EX Rev. D with Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x12910005, + .cpu_name = "405EX Rev. D", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | + PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", }, + { /* 405EXr Rev. A/B without Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x12910001, + .cpu_name = "405EXr Rev. A/B", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | + PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EXr Rev. 
C without Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x12910009, + .cpu_name = "405EXr Rev. C", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | + PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EXr Rev. C with Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x1291000b, + .cpu_name = "405EXr Rev. C", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | + PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EXr Rev. D without Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x12910000, + .cpu_name = "405EXr Rev. D", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | + PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* 405EXr Rev. D with Security */ + .pvr_mask = 0xffff000f, + .pvr_value = 0x12910002, + .cpu_name = "405EXr Rev. 
D", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | + PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { + /* 405EZ */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x41510000, + .cpu_name = "405EZ", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | + PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* APM8018X */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x7ff11432, + .cpu_name = "APM8018X", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | + PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + }, + { /* default match */ + .pvr_mask = 0x00000000, + .pvr_value = 0x00000000, + .cpu_name = "(generic 40x PPC)", + .cpu_features = CPU_FTRS_40X, + .cpu_user_features = PPC_FEATURE_32 | + PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC, + .mmu_features = MMU_FTR_TYPE_40x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc405", + } #endif /* CONFIG_40x */ #ifdef CONFIG_44x { .pvr_mask = 0xf0000fff, .pvr_value = 0x40000850, + .cpu_name = "440GR Rev. A", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc440", + }, + { /* Use logical PVR for 440EP (logical pvr = pvr | 0x8) */ + .pvr_mask = 0xf0000fff, + .pvr_value = 0x40000858, .cpu_name = "440EP Rev. 
A", .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER, /* 440EP has an FPU */ + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, .icache_bsize = 32, .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440ep, + .machine_check = machine_check_4xx, + .platform = "ppc440", }, { .pvr_mask = 0xf0000fff, .pvr_value = 0x400008d3, + .cpu_name = "440GR Rev. B", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc440", + }, + { /* Matches both physical and logical PVR for 440EP (logical pvr = pvr | 0x8) */ + .pvr_mask = 0xf0000ff7, + .pvr_value = 0x400008d4, + .cpu_name = "440EP Rev. C", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440ep, + .machine_check = machine_check_4xx, + .platform = "ppc440", + }, + { /* Use logical PVR for 440EP (logical pvr = pvr | 0x8) */ + .pvr_mask = 0xf0000fff, + .pvr_value = 0x400008db, .cpu_name = "440EP Rev. 
B", .cpu_features = CPU_FTRS_44X, - .cpu_user_features = COMMON_USER, /* 440EP has an FPU */ + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440ep, + .machine_check = machine_check_4xx, + .platform = "ppc440", + }, + { /* 440GRX */ + .pvr_mask = 0xf0000ffb, + .pvr_value = 0x200008D0, + .cpu_name = "440GRX", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440grx, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* Use logical PVR for 440EPx (logical pvr = pvr | 0x8) */ + .pvr_mask = 0xf0000ffb, + .pvr_value = 0x200008D8, + .cpu_name = "440EPX", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, .icache_bsize = 32, .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440epx, + .machine_check = machine_check_440A, + .platform = "ppc440", }, { /* 440GP Rev. B */ .pvr_mask = 0xf0000fff, .pvr_value = 0x40000440, .cpu_name = "440GP Rev. B", .cpu_features = CPU_FTRS_44X, - .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc440gp", }, { /* 440GP Rev. C */ .pvr_mask = 0xf0000fff, .pvr_value = 0x40000481, .cpu_name = "440GP Rev. C", .cpu_features = CPU_FTRS_44X, - .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc440gp", }, { /* 440GX Rev. A */ .pvr_mask = 0xf0000fff, .pvr_value = 0x50000850, .cpu_name = "440GX Rev. 
A", .cpu_features = CPU_FTRS_44X, - .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, .icache_bsize = 32, .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440gx, + .machine_check = machine_check_440A, + .platform = "ppc440", }, { /* 440GX Rev. B */ .pvr_mask = 0xf0000fff, .pvr_value = 0x50000851, .cpu_name = "440GX Rev. B", .cpu_features = CPU_FTRS_44X, - .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, .icache_bsize = 32, .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440gx, + .machine_check = machine_check_440A, + .platform = "ppc440", }, { /* 440GX Rev. C */ .pvr_mask = 0xf0000fff, .pvr_value = 0x50000892, .cpu_name = "440GX Rev. C", .cpu_features = CPU_FTRS_44X, - .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, .icache_bsize = 32, .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440gx, + .machine_check = machine_check_440A, + .platform = "ppc440", }, { /* 440GX Rev. F */ .pvr_mask = 0xf0000fff, .pvr_value = 0x50000894, .cpu_name = "440GX Rev. F", .cpu_features = CPU_FTRS_44X, - .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, .icache_bsize = 32, .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440gx, + .machine_check = machine_check_440A, + .platform = "ppc440", }, { /* 440SP Rev. A */ - .pvr_mask = 0xff000fff, - .pvr_value = 0x53000891, + .pvr_mask = 0xfff00fff, + .pvr_value = 0x53200891, .cpu_name = "440SP Rev. A", .cpu_features = CPU_FTRS_44X, - .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc440", }, + { /* 440SPe Rev. 
A */ + .pvr_mask = 0xfff00fff, + .pvr_value = 0x53400890, + .cpu_name = "440SPe Rev. A", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440spe, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 440SPe Rev. B */ + .pvr_mask = 0xfff00fff, + .pvr_value = 0x53400891, + .cpu_name = "440SPe Rev. B", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440spe, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 440 in Xilinx Virtex-5 FXT */ + .pvr_mask = 0xfffffff0, + .pvr_value = 0x7ff21910, + .cpu_name = "440 in Virtex-5 FXT", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_440x5, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 460EX */ + .pvr_mask = 0xffff0006, + .pvr_value = 0x13020002, + .cpu_name = "460EX", + .cpu_features = CPU_FTRS_440x6, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_460ex, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 460EX Rev B */ + .pvr_mask = 0xffff0007, + .pvr_value = 0x13020004, + .cpu_name = "460EX Rev. 
B", + .cpu_features = CPU_FTRS_440x6, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_460ex, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 460GT */ + .pvr_mask = 0xffff0006, + .pvr_value = 0x13020000, + .cpu_name = "460GT", + .cpu_features = CPU_FTRS_440x6, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_460gt, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 460GT Rev B */ + .pvr_mask = 0xffff0007, + .pvr_value = 0x13020005, + .cpu_name = "460GT Rev. B", + .cpu_features = CPU_FTRS_440x6, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_460gt, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 460SX */ + .pvr_mask = 0xffffff00, + .pvr_value = 0x13541800, + .cpu_name = "460SX", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_460sx, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 464 in APM821xx */ + .pvr_mask = 0xfffffff0, + .pvr_value = 0x12C41C80, + .cpu_name = "APM821XX", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE | + PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_apm821xx, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, + { /* 476 DD2 core */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x11a52080, + .cpu_name = "476", + .cpu_features = CPU_FTRS_47X | CPU_FTR_476_DD2, + .cpu_user_features = COMMON_USER_BOOKE | + PPC_FEATURE_HAS_FPU, + 
.mmu_features = MMU_FTR_TYPE_47x | + MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_LOCK_BCAST_INVAL, + .icache_bsize = 32, + .dcache_bsize = 128, + .machine_check = machine_check_47x, + .platform = "ppc470", + }, + { /* 476fpe */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x7ff50000, + .cpu_name = "476fpe", + .cpu_features = CPU_FTRS_47X | CPU_FTR_476_DD2, + .cpu_user_features = COMMON_USER_BOOKE | + PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_47x | + MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_LOCK_BCAST_INVAL, + .icache_bsize = 32, + .dcache_bsize = 128, + .machine_check = machine_check_47x, + .platform = "ppc470", + }, + { /* 476 iss */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00050000, + .cpu_name = "476", + .cpu_features = CPU_FTRS_47X, + .cpu_user_features = COMMON_USER_BOOKE | + PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_47x | + MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_LOCK_BCAST_INVAL, + .icache_bsize = 32, + .dcache_bsize = 128, + .machine_check = machine_check_47x, + .platform = "ppc470", + }, + { /* 476 others */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x11a50000, + .cpu_name = "476", + .cpu_features = CPU_FTRS_47X, + .cpu_user_features = COMMON_USER_BOOKE | + PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_47x | + MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_LOCK_BCAST_INVAL, + .icache_bsize = 32, + .dcache_bsize = 128, + .machine_check = machine_check_47x, + .platform = "ppc470", + }, + { /* default match */ + .pvr_mask = 0x00000000, + .pvr_value = 0x00000000, + .cpu_name = "(generic 44x PPC)", + .cpu_features = CPU_FTRS_44X, + .cpu_user_features = COMMON_USER_BOOKE, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .machine_check = machine_check_4xx, + .platform = "ppc440", + } #endif /* CONFIG_44x */ -#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_E200 { /* e200z5 */ .pvr_mask = 0xfff00000, .pvr_value = 0x81000000, .cpu_name = "e200z5", /* xxx - galak: add CPU_FTR_MAYBE_CAN_DOZE */ .cpu_features = CPU_FTRS_E200, - .cpu_user_features = 
PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_EFP_SINGLE | + .cpu_user_features = COMMON_USER_BOOKE | + PPC_FEATURE_HAS_EFP_SINGLE | PPC_FEATURE_UNIFIED_CACHE, + .mmu_features = MMU_FTR_TYPE_FSL_E, .dcache_bsize = 32, + .machine_check = machine_check_e200, + .platform = "ppc5554", }, { /* e200z6 */ .pvr_mask = 0xfff00000, @@ -948,49 +2024,236 @@ struct cpu_spec cpu_specs[] = { .cpu_name = "e200z6", /* xxx - galak: add CPU_FTR_MAYBE_CAN_DOZE */ .cpu_features = CPU_FTRS_E200, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_SPE_COMP | - PPC_FEATURE_HAS_EFP_SINGLE | + .cpu_user_features = COMMON_USER_BOOKE | + PPC_FEATURE_HAS_SPE_COMP | + PPC_FEATURE_HAS_EFP_SINGLE_COMP | PPC_FEATURE_UNIFIED_CACHE, + .mmu_features = MMU_FTR_TYPE_FSL_E, .dcache_bsize = 32, + .machine_check = machine_check_e200, + .platform = "ppc5554", }, + { /* default match */ + .pvr_mask = 0x00000000, + .pvr_value = 0x00000000, + .cpu_name = "(generic E200 PPC)", + .cpu_features = CPU_FTRS_E200, + .cpu_user_features = COMMON_USER_BOOKE | + PPC_FEATURE_HAS_EFP_SINGLE | + PPC_FEATURE_UNIFIED_CACHE, + .mmu_features = MMU_FTR_TYPE_FSL_E, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_e200, + .machine_check = machine_check_e200, + .platform = "ppc5554", + } +#endif /* CONFIG_E200 */ +#endif /* CONFIG_PPC32 */ +#ifdef CONFIG_E500 +#ifdef CONFIG_PPC32 { /* e500 */ .pvr_mask = 0xffff0000, .pvr_value = 0x80200000, .cpu_name = "e500", - /* xxx - galak: add CPU_FTR_MAYBE_CAN_DOZE */ .cpu_features = CPU_FTRS_E500, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_SPE_COMP | - PPC_FEATURE_HAS_EFP_SINGLE, + .cpu_user_features = COMMON_USER_BOOKE | + PPC_FEATURE_HAS_SPE_COMP | + PPC_FEATURE_HAS_EFP_SINGLE_COMP, + .cpu_user_features2 = PPC_FEATURE2_ISEL, + .mmu_features = MMU_FTR_TYPE_FSL_E, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, + .oprofile_cpu_type = "ppc/e500", + .oprofile_type = PPC_OPROFILE_FSL_EMB, + .cpu_setup = 
__setup_cpu_e500v1, + .machine_check = machine_check_e500, + .platform = "ppc8540", }, { /* e500v2 */ .pvr_mask = 0xffff0000, .pvr_value = 0x80210000, .cpu_name = "e500v2", - /* xxx - galak: add CPU_FTR_MAYBE_CAN_DOZE */ .cpu_features = CPU_FTRS_E500_2, - .cpu_user_features = PPC_FEATURE_32 | - PPC_FEATURE_HAS_MMU | PPC_FEATURE_SPE_COMP | - PPC_FEATURE_HAS_EFP_SINGLE | PPC_FEATURE_HAS_EFP_DOUBLE, + .cpu_user_features = COMMON_USER_BOOKE | + PPC_FEATURE_HAS_SPE_COMP | + PPC_FEATURE_HAS_EFP_SINGLE_COMP | + PPC_FEATURE_HAS_EFP_DOUBLE_COMP, + .cpu_user_features2 = PPC_FEATURE2_ISEL, + .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS, .icache_bsize = 32, .dcache_bsize = 32, .num_pmcs = 4, + .oprofile_cpu_type = "ppc/e500", + .oprofile_type = PPC_OPROFILE_FSL_EMB, + .cpu_setup = __setup_cpu_e500v2, + .machine_check = machine_check_e500, + .platform = "ppc8548", }, + { /* e500mc */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x80230000, + .cpu_name = "e500mc", + .cpu_features = CPU_FTRS_E500MC, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .cpu_user_features2 = PPC_FEATURE2_ISEL, + .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS | + MMU_FTR_USE_TLBILX, + .icache_bsize = 64, + .dcache_bsize = 64, + .num_pmcs = 4, + .oprofile_cpu_type = "ppc/e500mc", + .oprofile_type = PPC_OPROFILE_FSL_EMB, + .cpu_setup = __setup_cpu_e500mc, + .machine_check = machine_check_e500mc, + .platform = "ppce500mc", + }, +#endif /* CONFIG_PPC32 */ + { /* e5500 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x80240000, + .cpu_name = "e5500", + .cpu_features = CPU_FTRS_E5500, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .cpu_user_features2 = PPC_FEATURE2_ISEL, + .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS | + MMU_FTR_USE_TLBILX, + .icache_bsize = 64, + .dcache_bsize = 64, + .num_pmcs = 4, + .oprofile_cpu_type = "ppc/e500mc", + .oprofile_type = PPC_OPROFILE_FSL_EMB, + .cpu_setup = __setup_cpu_e5500, +#ifndef CONFIG_PPC32 + .cpu_restore = 
__restore_cpu_e5500, +#endif + .machine_check = machine_check_e500mc, + .platform = "ppce5500", + }, + { /* e6500 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x80400000, + .cpu_name = "e6500", + .cpu_features = CPU_FTRS_E6500, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU | + PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features2 = PPC_FEATURE2_ISEL, + .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS | + MMU_FTR_USE_TLBILX, + .icache_bsize = 64, + .dcache_bsize = 64, + .num_pmcs = 6, + .oprofile_cpu_type = "ppc/e6500", + .oprofile_type = PPC_OPROFILE_FSL_EMB, + .cpu_setup = __setup_cpu_e6500, +#ifndef CONFIG_PPC32 + .cpu_restore = __restore_cpu_e6500, #endif -#if !CLASSIC_PPC + .machine_check = machine_check_e500mc, + .platform = "ppce6500", + }, +#ifdef CONFIG_PPC32 { /* default match */ .pvr_mask = 0x00000000, .pvr_value = 0x00000000, - .cpu_name = "(generic PPC)", - .cpu_features = CPU_FTRS_GENERIC_32, - .cpu_user_features = PPC_FEATURE_32, + .cpu_name = "(generic E500 PPC)", + .cpu_features = CPU_FTRS_E500, + .cpu_user_features = COMMON_USER_BOOKE | + PPC_FEATURE_HAS_SPE_COMP | + PPC_FEATURE_HAS_EFP_SINGLE_COMP, + .mmu_features = MMU_FTR_TYPE_FSL_E, .icache_bsize = 32, .dcache_bsize = 32, + .machine_check = machine_check_e500, + .platform = "powerpc", } -#endif /* !CLASSIC_PPC */ #endif /* CONFIG_PPC32 */ +#endif /* CONFIG_E500 */ }; + +static struct cpu_spec the_cpu_spec; + +static struct cpu_spec * __init setup_cpu_spec(unsigned long offset, + struct cpu_spec *s) +{ + struct cpu_spec *t = &the_cpu_spec; + struct cpu_spec old; + + t = PTRRELOC(t); + old = *t; + + /* Copy everything, then do fixups */ + *t = *s; + + /* + * If we are overriding a previous value derived from the real + * PVR with a new value obtained using a logical PVR value, + * don't modify the performance monitor fields. 
+ */ + if (old.num_pmcs && !s->num_pmcs) { + t->num_pmcs = old.num_pmcs; + t->pmc_type = old.pmc_type; + t->oprofile_type = old.oprofile_type; + t->oprofile_mmcra_sihv = old.oprofile_mmcra_sihv; + t->oprofile_mmcra_sipr = old.oprofile_mmcra_sipr; + t->oprofile_mmcra_clear = old.oprofile_mmcra_clear; + + /* + * If we have passed through this logic once before and + * have pulled the default case because the real PVR was + * not found inside cpu_specs[], then we are possibly + * running in compatibility mode. In that case, let the + * oprofiler know which set of compatibility counters to + * pull from by making sure the oprofile_cpu_type string + * is set to that of compatibility mode. If the + * oprofile_cpu_type already has a value, then we are + * possibly overriding a real PVR with a logical one, + * and, in that case, keep the current value for + * oprofile_cpu_type. + */ + if (old.oprofile_cpu_type != NULL) { + t->oprofile_cpu_type = old.oprofile_cpu_type; + t->oprofile_type = old.oprofile_type; + } + } + + *PTRRELOC(&cur_cpu_spec) = &the_cpu_spec; + + /* + * Set the base platform string once; assumes + * we're called with real pvr first. + */ + if (*PTRRELOC(&powerpc_base_platform) == NULL) + *PTRRELOC(&powerpc_base_platform) = t->platform; + +#if defined(CONFIG_PPC64) || defined(CONFIG_BOOKE) + /* ppc64 and booke expect identify_cpu to also call setup_cpu for + * that processor. I will consolidate that at a later time, for now, + * just use #ifdef. We also don't need to PTRRELOC the function + * pointer on ppc64 and booke as we are running at 0 in real mode + * on ppc64 and reloc_offset is always 0 on booke. 
+ */ + if (t->cpu_setup) { + t->cpu_setup(offset, t); + } +#endif /* CONFIG_PPC64 || CONFIG_BOOKE */ + + return t; +} + +struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr) +{ + struct cpu_spec *s = cpu_specs; + int i; + + s = PTRRELOC(s); + + for (i = 0; i < ARRAY_SIZE(cpu_specs); i++,s++) { + if ((pvr & s->pvr_mask) == s->pvr_value) + return setup_cpu_spec(offset, s); + } + + BUG(); + + return NULL; +} diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c new file mode 100644 index 00000000000..51dbace3269 --- /dev/null +++ b/arch/powerpc/kernel/crash.c @@ -0,0 +1,367 @@ +/* + * Architecture specific (PPC64) functions for kexec based crash dumps. + * + * Copyright (C) 2005, IBM Corp. + * + * Created by: Haren Myneni + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + * + */ + +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/reboot.h> +#include <linux/kexec.h> +#include <linux/export.h> +#include <linux/crash_dump.h> +#include <linux/delay.h> +#include <linux/irq.h> +#include <linux/types.h> + +#include <asm/processor.h> +#include <asm/machdep.h> +#include <asm/kexec.h> +#include <asm/kdump.h> +#include <asm/prom.h> +#include <asm/smp.h> +#include <asm/setjmp.h> +#include <asm/debug.h> + +/* + * The primary CPU waits a while for all secondary CPUs to enter. This is to + * avoid sending an IPI if the secondary CPUs are entering + * crash_kexec_secondary on their own (eg via a system reset). + * + * The secondary timeout has to be longer than the primary. Both timeouts are + * in milliseconds. + */ +#define PRIMARY_TIMEOUT 500 +#define SECONDARY_TIMEOUT 1000 + +#define IPI_TIMEOUT 10000 +#define REAL_MODE_TIMEOUT 10000 + +/* This keeps a track of which one is the crashing cpu. 
*/ +int crashing_cpu = -1; +static int time_to_dump; + +#define CRASH_HANDLER_MAX 3 +/* NULL terminated list of shutdown handles */ +static crash_shutdown_t crash_shutdown_handles[CRASH_HANDLER_MAX+1]; +static DEFINE_SPINLOCK(crash_handlers_lock); + +static unsigned long crash_shutdown_buf[JMP_BUF_LEN]; +static int crash_shutdown_cpu = -1; + +static int handle_fault(struct pt_regs *regs) +{ + if (crash_shutdown_cpu == smp_processor_id()) + longjmp(crash_shutdown_buf, 1); + return 0; +} + +#ifdef CONFIG_SMP + +static atomic_t cpus_in_crash; +void crash_ipi_callback(struct pt_regs *regs) +{ + static cpumask_t cpus_state_saved = CPU_MASK_NONE; + + int cpu = smp_processor_id(); + + if (!cpu_online(cpu)) + return; + + hard_irq_disable(); + if (!cpumask_test_cpu(cpu, &cpus_state_saved)) { + crash_save_cpu(regs, cpu); + cpumask_set_cpu(cpu, &cpus_state_saved); + } + + atomic_inc(&cpus_in_crash); + smp_mb__after_atomic(); + + /* + * Starting the kdump boot. + * This barrier is needed to make sure that all CPUs are stopped. + */ + while (!time_to_dump) + cpu_relax(); + + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(1, 1); + +#ifdef CONFIG_PPC64 + kexec_smp_wait(); +#else + for (;;); /* FIXME */ +#endif + + /* NOTREACHED */ +} + +static void crash_kexec_prepare_cpus(int cpu) +{ + unsigned int msecs; + unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */ + int tries = 0; + int (*old_handler)(struct pt_regs *regs); + + printk(KERN_EMERG "Sending IPI to other CPUs\n"); + + crash_send_ipi(crash_ipi_callback); + smp_wmb(); + +again: + /* + * FIXME: Until we will have the way to stop other CPUs reliably, + * the crash CPU will send an IPI and wait for other CPUs to + * respond. + */ + msecs = IPI_TIMEOUT; + while ((atomic_read(&cpus_in_crash) < ncpus) && (--msecs > 0)) + mdelay(1); + + /* Would it be better to replace the trap vector here? 
*/ + + if (atomic_read(&cpus_in_crash) >= ncpus) { + printk(KERN_EMERG "IPI complete\n"); + return; + } + + printk(KERN_EMERG "ERROR: %d cpu(s) not responding\n", + ncpus - atomic_read(&cpus_in_crash)); + + /* + * If we have a panic timeout set then we can't wait indefinitely + * for someone to activate system reset. We also give up on the + * second time through if system reset fail to work. + */ + if ((panic_timeout > 0) || (tries > 0)) + return; + + /* + * A system reset will cause all CPUs to take an 0x100 exception. + * The primary CPU returns here via setjmp, and the secondary + * CPUs reexecute the crash_kexec_secondary path. + */ + old_handler = __debugger; + __debugger = handle_fault; + crash_shutdown_cpu = smp_processor_id(); + + if (setjmp(crash_shutdown_buf) == 0) { + printk(KERN_EMERG "Activate system reset (dumprestart) " + "to stop other cpu(s)\n"); + + /* + * A system reset will force all CPUs to execute the + * crash code again. We need to reset cpus_in_crash so we + * wait for everyone to do this. + */ + atomic_set(&cpus_in_crash, 0); + smp_mb(); + + while (atomic_read(&cpus_in_crash) < ncpus) + cpu_relax(); + } + + crash_shutdown_cpu = -1; + __debugger = old_handler; + + tries++; + goto again; +} + +/* + * This function will be called by secondary cpus. + */ +void crash_kexec_secondary(struct pt_regs *regs) +{ + unsigned long flags; + int msecs = SECONDARY_TIMEOUT; + + local_irq_save(flags); + + /* Wait for the primary crash CPU to signal its progress */ + while (crashing_cpu < 0) { + if (--msecs < 0) { + /* No response, kdump image may not have been loaded */ + local_irq_restore(flags); + return; + } + + mdelay(1); + } + + crash_ipi_callback(regs); +} + +#else /* ! CONFIG_SMP */ + +static void crash_kexec_prepare_cpus(int cpu) +{ + /* + * move the secondaries to us so that we can copy + * the new kernel 0-0x100 safely + * + * do this if kexec in setup.c ? 
+ */ +#ifdef CONFIG_PPC64 + smp_release_cpus(); +#else + /* FIXME */ +#endif +} + +void crash_kexec_secondary(struct pt_regs *regs) +{ +} +#endif /* CONFIG_SMP */ + +/* wait for all the CPUs to hit real mode but timeout if they don't come in */ +#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_64) +static void crash_kexec_wait_realmode(int cpu) +{ + unsigned int msecs; + int i; + + msecs = REAL_MODE_TIMEOUT; + for (i=0; i < nr_cpu_ids && msecs > 0; i++) { + if (i == cpu) + continue; + + while (paca[i].kexec_state < KEXEC_STATE_REAL_MODE) { + barrier(); + if (!cpu_possible(i) || !cpu_online(i) || (msecs <= 0)) + break; + msecs--; + mdelay(1); + } + } + mb(); +} +#else +static inline void crash_kexec_wait_realmode(int cpu) {} +#endif /* CONFIG_SMP && CONFIG_PPC_STD_MMU_64 */ + +/* + * Register a function to be called on shutdown. Only use this if you + * can't reset your device in the second kernel. + */ +int crash_shutdown_register(crash_shutdown_t handler) +{ + unsigned int i, rc; + + spin_lock(&crash_handlers_lock); + for (i = 0 ; i < CRASH_HANDLER_MAX; i++) + if (!crash_shutdown_handles[i]) { + /* Insert handle at first empty entry */ + crash_shutdown_handles[i] = handler; + rc = 0; + break; + } + + if (i == CRASH_HANDLER_MAX) { + printk(KERN_ERR "Crash shutdown handles full, " + "not registered.\n"); + rc = 1; + } + + spin_unlock(&crash_handlers_lock); + return rc; +} +EXPORT_SYMBOL(crash_shutdown_register); + +int crash_shutdown_unregister(crash_shutdown_t handler) +{ + unsigned int i, rc; + + spin_lock(&crash_handlers_lock); + for (i = 0 ; i < CRASH_HANDLER_MAX; i++) + if (crash_shutdown_handles[i] == handler) + break; + + if (i == CRASH_HANDLER_MAX) { + printk(KERN_ERR "Crash shutdown handle not found\n"); + rc = 1; + } else { + /* Shift handles down */ + for (; crash_shutdown_handles[i]; i++) + crash_shutdown_handles[i] = + crash_shutdown_handles[i+1]; + rc = 0; + } + + spin_unlock(&crash_handlers_lock); + return rc; +} 
+EXPORT_SYMBOL(crash_shutdown_unregister); + +void default_machine_crash_shutdown(struct pt_regs *regs) +{ + unsigned int i; + int (*old_handler)(struct pt_regs *regs); + + /* + * This function is only called after the system + * has panicked or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means stopping other cpus in + * an SMP system. + * The kernel is broken so disable interrupts. + */ + hard_irq_disable(); + + /* + * Make a note of crashing cpu. Will be used in machine_kexec + * such that another IPI will not be sent. + */ + crashing_cpu = smp_processor_id(); + + /* + * If we came in via system reset, wait a while for the secondary + * CPUs to enter. + */ + if (TRAP(regs) == 0x100) + mdelay(PRIMARY_TIMEOUT); + + crash_kexec_prepare_cpus(crashing_cpu); + + crash_save_cpu(regs, crashing_cpu); + + time_to_dump = 1; + + crash_kexec_wait_realmode(crashing_cpu); + + machine_kexec_mask_interrupts(); + + /* + * Call registered shutdown routines safely. Swap out + * __debugger_fault_handler, and replace on exit. + */ + old_handler = __debugger_fault_handler; + __debugger_fault_handler = handle_fault; + crash_shutdown_cpu = smp_processor_id(); + for (i = 0; crash_shutdown_handles[i]; i++) { + if (setjmp(crash_shutdown_buf) == 0) { + /* + * Insert syncs and delay to ensure + * instructions in the dangerous region don't + * leak away from this protected region. 
+ */ + asm volatile("sync; isync"); + /* dangerous region */ + crash_shutdown_handles[i](); + asm volatile("sync; isync"); + } + } + crash_shutdown_cpu = -1; + __debugger_fault_handler = old_handler; + + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(1, 0); +} diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c new file mode 100644 index 00000000000..7a13f378ca2 --- /dev/null +++ b/arch/powerpc/kernel/crash_dump.c @@ -0,0 +1,148 @@ +/* + * Routines for doing kexec-based kdump. + * + * Copyright (C) 2005, IBM Corp. + * + * Created by: Michael Ellerman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#undef DEBUG + +#include <linux/crash_dump.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <asm/code-patching.h> +#include <asm/kdump.h> +#include <asm/prom.h> +#include <asm/firmware.h> +#include <asm/uaccess.h> +#include <asm/rtas.h> + +#ifdef DEBUG +#include <asm/udbg.h> +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +#ifndef CONFIG_NONSTATIC_KERNEL +void __init reserve_kdump_trampoline(void) +{ + memblock_reserve(0, KDUMP_RESERVE_LIMIT); +} + +static void __init create_trampoline(unsigned long addr) +{ + unsigned int *p = (unsigned int *)addr; + + /* The maximum range of a single instruction branch, is the current + * instruction's address + (32 MB - 4) bytes. For the trampoline we + * need to branch to current address + 32 MB. So we insert a nop at + * the trampoline address, then the next instruction (+ 4 bytes) + * does a branch to (32 MB - 4). The net effect is that when we + * branch to "addr" we jump to ("addr" + 32 MB). Although it requires + * two instructions it doesn't require any registers. 
+ */ + patch_instruction(p, PPC_INST_NOP); + patch_branch(++p, addr + PHYSICAL_START, 0); +} + +void __init setup_kdump_trampoline(void) +{ + unsigned long i; + + DBG(" -> setup_kdump_trampoline()\n"); + + for (i = KDUMP_TRAMPOLINE_START; i < KDUMP_TRAMPOLINE_END; i += 8) { + create_trampoline(i); + } + +#ifdef CONFIG_PPC_PSERIES + create_trampoline(__pa(system_reset_fwnmi) - PHYSICAL_START); + create_trampoline(__pa(machine_check_fwnmi) - PHYSICAL_START); +#endif /* CONFIG_PPC_PSERIES */ + + DBG(" <- setup_kdump_trampoline()\n"); +} +#endif /* CONFIG_NONSTATIC_KERNEL */ + +static size_t copy_oldmem_vaddr(void *vaddr, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + if (userbuf) { + if (copy_to_user((char __user *)buf, (vaddr + offset), csize)) + return -EFAULT; + } else + memcpy(buf, (vaddr + offset), csize); + + return csize; +} + +/** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from "oldmem". For this page, there is no pte mapped + * in the current kernel. We stitch up a pte, similar to kmap_atomic. 
+ */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + void *vaddr; + phys_addr_t paddr; + + if (!csize) + return 0; + + csize = min_t(size_t, csize, PAGE_SIZE); + paddr = pfn << PAGE_SHIFT; + + if (memblock_is_region_memory(paddr, csize)) { + vaddr = __va(paddr); + csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); + } else { + vaddr = __ioremap(paddr, PAGE_SIZE, 0); + csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); + iounmap(vaddr); + } + + return csize; +} + +#ifdef CONFIG_PPC_RTAS +/* + * The crashkernel region will almost always overlap the RTAS region, so + * we have to be careful when shrinking the crashkernel region. + */ +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) +{ + unsigned long addr; + const __be32 *basep, *sizep; + unsigned int rtas_start = 0, rtas_end = 0; + + basep = of_get_property(rtas.dev, "linux,rtas-base", NULL); + sizep = of_get_property(rtas.dev, "rtas-size", NULL); + + if (basep && sizep) { + rtas_start = be32_to_cpup(basep); + rtas_end = rtas_start + be32_to_cpup(sizep); + } + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + /* Does this page overlap with the RTAS region? */ + if (addr <= rtas_end && ((addr + PAGE_SIZE) > rtas_start)) + continue; + + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); + } +} +#endif diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c new file mode 100644 index 00000000000..d55c76c571f --- /dev/null +++ b/arch/powerpc/kernel/dbell.c @@ -0,0 +1,57 @@ +/* + * Author: Kumar Gala <galak@kernel.crashing.org> + * + * Copyright 2009 Freescale Semiconductor Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ */ + +#include <linux/stddef.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/threads.h> +#include <linux/hardirq.h> + +#include <asm/dbell.h> +#include <asm/irq_regs.h> + +#ifdef CONFIG_SMP +void doorbell_setup_this_cpu(void) +{ + unsigned long tag = mfspr(SPRN_DOORBELL_CPUTAG) & PPC_DBELL_TAG_MASK; + + smp_muxed_ipi_set_data(smp_processor_id(), tag); +} + +void doorbell_cause_ipi(int cpu, unsigned long data) +{ + /* Order previous accesses vs. msgsnd, which is treated as a store */ + mb(); + ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, data); +} + +void doorbell_exception(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + irq_enter(); + + may_hard_irq_enable(); + + __get_cpu_var(irq_stat).doorbell_irqs++; + + smp_ipi_demux(); + + irq_exit(); + set_irq_regs(old_regs); +} +#else /* CONFIG_SMP */ +void doorbell_exception(struct pt_regs *regs) +{ + printk(KERN_WARNING "Received doorbell on non-smp system\n"); +} +#endif /* CONFIG_SMP */ + diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c new file mode 100644 index 00000000000..54d0116256f --- /dev/null +++ b/arch/powerpc/kernel/dma-iommu.c @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corporation + * + * Provide default implementations of the DMA mapping callbacks for + * busses using the iommu infrastructure + */ + +#include <linux/export.h> +#include <asm/iommu.h> + +/* + * Generic iommu implementation + */ + +/* Allocates a contiguous real buffer and creates mappings over it. + * Returns the virtual address of the buffer and sets dma_handle + * to the dma address (mapping) of the first page. 
+ */ +static void *dma_iommu_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + struct dma_attrs *attrs) +{ + return iommu_alloc_coherent(dev, get_iommu_table_base(dev), size, + dma_handle, dev->coherent_dma_mask, flag, + dev_to_node(dev)); +} + +static void dma_iommu_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle, + struct dma_attrs *attrs) +{ + iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle); +} + +/* Creates TCEs for a user provided buffer. The user buffer must be + * contiguous real kernel storage (not vmalloc). The address passed here + * comprises a page address and offset into that page. The dma_addr_t + * returned will point to the same byte within the page as was passed in. + */ +static dma_addr_t dma_iommu_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + return iommu_map_page(dev, get_iommu_table_base(dev), page, offset, + size, device_to_mask(dev), direction, attrs); +} + + +static void dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, direction, + attrs); +} + + +static int dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + return iommu_map_sg(dev, get_iommu_table_base(dev), sglist, nelems, + device_to_mask(dev), direction, attrs); +} + +static void dma_iommu_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems, direction, + attrs); +} + +/* We support DMA to/from any memory page via the iommu */ +static int dma_iommu_dma_supported(struct device *dev, u64 mask) +{ + 
struct iommu_table *tbl = get_iommu_table_base(dev); + + if (!tbl) { + dev_info(dev, "Warning: IOMMU dma not supported: mask 0x%08llx" + ", table unavailable\n", mask); + return 0; + } + + if (tbl->it_offset > (mask >> tbl->it_page_shift)) { + dev_info(dev, "Warning: IOMMU offset too big for device mask\n"); + dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n", + mask, tbl->it_offset << tbl->it_page_shift); + return 0; + } else + return 1; +} + +/* + * Report the DMA mask a device needs to reach the whole IOMMU table: + * every bit set up to the highest set bit of (it_offset + it_size). + * Returns 0 when the device has no IOMMU table. + */ +static u64 dma_iommu_get_required_mask(struct device *dev) +{ + struct iommu_table *tbl = get_iommu_table_base(dev); + u64 mask; + if (!tbl) + return 0; + + /* + * This must be a shift: "1ULL < x" is a comparison and only ever + * yields 0 or 1. + * NOTE(review): it_offset is scaled by it_page_shift in + * dma_iommu_dma_supported(), so this sum may be in IOMMU pages + * rather than bytes - confirm whether it_page_shift belongs here. + */ + mask = 1ULL << (fls_long(tbl->it_offset + tbl->it_size) - 1); + mask += mask - 1; + + return mask; +} + +struct dma_map_ops dma_iommu_ops = { + .alloc = dma_iommu_alloc_coherent, + .free = dma_iommu_free_coherent, + .mmap = dma_direct_mmap_coherent, + .map_sg = dma_iommu_map_sg, + .unmap_sg = dma_iommu_unmap_sg, + .dma_supported = dma_iommu_dma_supported, + .map_page = dma_iommu_map_page, + .unmap_page = dma_iommu_unmap_page, + .get_required_mask = dma_iommu_get_required_mask, +}; +EXPORT_SYMBOL(dma_iommu_ops); diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c new file mode 100644 index 00000000000..bd1a2aba599 --- /dev/null +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -0,0 +1,127 @@ +/* + * Contains routines needed to support swiotlb for ppc. + * + * Copyright (C) 2009-2010 Freescale Semiconductor, Inc. + * Author: Becky Bruce + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ * + */ + +#include <linux/dma-mapping.h> +#include <linux/memblock.h> +#include <linux/pfn.h> +#include <linux/of_platform.h> +#include <linux/platform_device.h> +#include <linux/pci.h> + +#include <asm/machdep.h> +#include <asm/swiotlb.h> +#include <asm/dma.h> + +unsigned int ppc_swiotlb_enable; + +static u64 swiotlb_powerpc_get_required(struct device *dev) +{ + u64 end, mask, max_direct_dma_addr = dev->archdata.max_direct_dma_addr; + + end = memblock_end_of_DRAM(); + if (max_direct_dma_addr && end > max_direct_dma_addr) + end = max_direct_dma_addr; + end += get_dma_offset(dev); + + mask = 1ULL << (fls64(end) - 1); + mask += mask - 1; + + return mask; +} + +/* + * At the moment, all platforms that use this code only require + * swiotlb to be used if we're operating on HIGHMEM. Since + * we don't ever call anything other than map_sg, unmap_sg, + * map_page, and unmap_page on highmem, use normal dma_ops + * for everything else. + */ +struct dma_map_ops swiotlb_dma_ops = { + .alloc = dma_direct_alloc_coherent, + .free = dma_direct_free_coherent, + .mmap = dma_direct_mmap_coherent, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .dma_supported = swiotlb_dma_supported, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .mapping_error = swiotlb_dma_mapping_error, + .get_required_mask = swiotlb_powerpc_get_required, +}; + +void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) +{ + struct pci_controller *hose; + struct dev_archdata *sd; + + hose = pci_bus_to_host(pdev->bus); + sd = &pdev->dev.archdata; + sd->max_direct_dma_addr = + hose->dma_window_base_cur + hose->dma_window_size; +} + +static int ppc_swiotlb_bus_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + 
struct dev_archdata *sd; + + /* We are only interested in device addition */ + if (action != BUS_NOTIFY_ADD_DEVICE) + return NOTIFY_DONE; + + sd = &dev->archdata; + sd->max_direct_dma_addr = 0; + + /* + * May need to bounce if the device can't address all of DRAM. + * Compare against the last DRAM byte instead of computing mask + 1, + * which wraps to 0 for a full 64-bit mask and would wrongly select + * swiotlb for a device that can address everything. + */ + if (dma_get_mask(dev) < (memblock_end_of_DRAM() - 1)) + set_dma_ops(dev, &swiotlb_dma_ops); + + return NOTIFY_DONE; +} + +static struct notifier_block ppc_swiotlb_plat_bus_notifier = { + .notifier_call = ppc_swiotlb_bus_notify, + .priority = 0, +}; + +int __init swiotlb_setup_bus_notifier(void) +{ + bus_register_notifier(&platform_bus_type, + &ppc_swiotlb_plat_bus_notifier); + return 0; +} + +void swiotlb_detect_4g(void) +{ + if ((memblock_end_of_DRAM() - 1) > 0xffffffff) + ppc_swiotlb_enable = 1; +} + +static int __init swiotlb_late_init(void) +{ + if (ppc_swiotlb_enable) { + swiotlb_print_info(); + set_pci_dma_ops(&swiotlb_dma_ops); + ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; + } else { + swiotlb_free(); + } + + return 0; +} +subsys_initcall(swiotlb_late_init); diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c new file mode 100644 index 00000000000..ee78f6e49d6 --- /dev/null +++ b/arch/powerpc/kernel/dma.c @@ -0,0 +1,243 @@ +/* + * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corporation + * + * Provide default implementations of the DMA mapping callbacks for + * directly mapped busses. + */ + +#include <linux/device.h> +#include <linux/dma-mapping.h> +#include <linux/dma-debug.h> +#include <linux/gfp.h> +#include <linux/memblock.h> +#include <linux/export.h> +#include <linux/pci.h> +#include <asm/vio.h> +#include <asm/bug.h> +#include <asm/machdep.h> + +/* + * Generic direct DMA implementation + * + * This implementation supports a per-device offset that can be applied if + * the address at which memory is visible to devices is not 0. Platform code + * can set archdata.dma_data to an unsigned long holding the offset. By + * default the offset is PCI_DRAM_OFFSET. 
+ */ + + +void *dma_direct_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + struct dma_attrs *attrs) +{ + void *ret; +#ifdef CONFIG_NOT_COHERENT_CACHE + ret = __dma_alloc_coherent(dev, size, dma_handle, flag); + if (ret == NULL) + return NULL; + *dma_handle += get_dma_offset(dev); + return ret; +#else + struct page *page; + int node = dev_to_node(dev); + + /* ignore region specifiers */ + flag &= ~(__GFP_HIGHMEM); + + page = alloc_pages_node(node, flag, get_order(size)); + if (page == NULL) + return NULL; + ret = page_address(page); + memset(ret, 0, size); + *dma_handle = __pa(ret) + get_dma_offset(dev); + + return ret; +#endif +} + +void dma_direct_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle, + struct dma_attrs *attrs) +{ +#ifdef CONFIG_NOT_COHERENT_CACHE + __dma_free_coherent(size, vaddr); +#else + free_pages((unsigned long)vaddr, get_order(size)); +#endif +} + +int dma_direct_mmap_coherent(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t handle, size_t size, + struct dma_attrs *attrs) +{ + unsigned long pfn; + +#ifdef CONFIG_NOT_COHERENT_CACHE + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + pfn = __dma_get_coherent_pfn((unsigned long)cpu_addr); +#else + pfn = page_to_pfn(virt_to_page(cpu_addr)); +#endif + return remap_pfn_range(vma, vma->vm_start, + pfn + vma->vm_pgoff, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +} + +static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) { + sg->dma_address = sg_phys(sg) + get_dma_offset(dev); + sg->dma_length = sg->length; + __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); + } + + return nents; +} + +static void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction 
direction, + struct dma_attrs *attrs) +{ +} + +static int dma_direct_dma_supported(struct device *dev, u64 mask) +{ +#ifdef CONFIG_PPC64 + /* Could be improved so platforms can set the limit in case + * they have limited DMA windows + */ + return mask >= get_dma_offset(dev) + (memblock_end_of_DRAM() - 1); +#else + return 1; +#endif +} + +static u64 dma_direct_get_required_mask(struct device *dev) +{ + u64 end, mask; + + end = memblock_end_of_DRAM() + get_dma_offset(dev); + + mask = 1ULL << (fls64(end) - 1); + mask += mask - 1; + + return mask; +} + +static inline dma_addr_t dma_direct_map_page(struct device *dev, + struct page *page, + unsigned long offset, + size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + BUG_ON(dir == DMA_NONE); + __dma_sync_page(page, offset, size, dir); + return page_to_phys(page) + offset + get_dma_offset(dev); +} + +static inline void dma_direct_unmap_page(struct device *dev, + dma_addr_t dma_address, + size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ +} + +#ifdef CONFIG_NOT_COHERENT_CACHE +static inline void dma_direct_sync_sg(struct device *dev, + struct scatterlist *sgl, int nents, + enum dma_data_direction direction) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) + __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); +} + +static inline void dma_direct_sync_single(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + __dma_sync(bus_to_virt(dma_handle), size, direction); +} +#endif + +struct dma_map_ops dma_direct_ops = { + .alloc = dma_direct_alloc_coherent, + .free = dma_direct_free_coherent, + .mmap = dma_direct_mmap_coherent, + .map_sg = dma_direct_map_sg, + .unmap_sg = dma_direct_unmap_sg, + .dma_supported = dma_direct_dma_supported, + .map_page = dma_direct_map_page, + .unmap_page = dma_direct_unmap_page, + .get_required_mask = dma_direct_get_required_mask, +#ifdef CONFIG_NOT_COHERENT_CACHE + 
.sync_single_for_cpu = dma_direct_sync_single, + .sync_single_for_device = dma_direct_sync_single, + .sync_sg_for_cpu = dma_direct_sync_sg, + .sync_sg_for_device = dma_direct_sync_sg, +#endif +}; +EXPORT_SYMBOL(dma_direct_ops); + +#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) + +int __dma_set_mask(struct device *dev, u64 dma_mask) +{ + struct dma_map_ops *dma_ops = get_dma_ops(dev); + + if ((dma_ops != NULL) && (dma_ops->set_dma_mask != NULL)) + return dma_ops->set_dma_mask(dev, dma_mask); + if (!dev->dma_mask || !dma_supported(dev, dma_mask)) + return -EIO; + *dev->dma_mask = dma_mask; + return 0; +} +int dma_set_mask(struct device *dev, u64 dma_mask) +{ + if (ppc_md.dma_set_mask) + return ppc_md.dma_set_mask(dev, dma_mask); + return __dma_set_mask(dev, dma_mask); +} +EXPORT_SYMBOL(dma_set_mask); + +u64 dma_get_required_mask(struct device *dev) +{ + struct dma_map_ops *dma_ops = get_dma_ops(dev); + + if (ppc_md.dma_get_required_mask) + return ppc_md.dma_get_required_mask(dev); + + if (unlikely(dma_ops == NULL)) + return 0; + + if (dma_ops->get_required_mask) + return dma_ops->get_required_mask(dev); + + return DMA_BIT_MASK(8 * sizeof(dma_addr_t)); +} +EXPORT_SYMBOL_GPL(dma_get_required_mask); + +static int __init dma_init(void) +{ + dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); +#ifdef CONFIG_PCI + dma_debug_add_bus(&pci_bus_type); +#endif +#ifdef CONFIG_IBMVIO + dma_debug_add_bus(&vio_bus_type); +#endif + + return 0; +} +fs_initcall(dma_init); + diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c new file mode 100644 index 00000000000..86e25702aac --- /dev/null +++ b/arch/powerpc/kernel/eeh.c @@ -0,0 +1,1183 @@ +/* + * Copyright IBM Corporation 2001, 2005, 2006 + * Copyright Dave Engebretsen & Todd Inglett 2001 + * Copyright Linas Vepstas 2005, 2006 + * Copyright 2001-2012 IBM Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com> + */ + +#include <linux/delay.h> +#include <linux/debugfs.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/proc_fs.h> +#include <linux/rbtree.h> +#include <linux/reboot.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/export.h> +#include <linux/of.h> + +#include <linux/atomic.h> +#include <asm/debug.h> +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/io.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> +#include <asm/rtas.h> + + +/** Overview: + * EEH, or "Extended Error Handling" is a PCI bridge technology for + * dealing with PCI bus errors that can't be dealt with within the + * usual PCI framework, except by check-stopping the CPU. Systems + * that are designed for high-availability/reliability cannot afford + * to crash due to a "mere" PCI error, thus the need for EEH. + * An EEH-capable bridge operates by converting a detected error + * into a "slot freeze", taking the PCI adapter off-line, making + * the slot behave, from the OS'es point of view, as if the slot + * were "empty": all reads return 0xff's and all writes are silently + * ignored. 
EEH slot isolation events can be triggered by parity + * errors on the address or data busses (e.g. during posted writes), + * which in turn might be caused by low voltage on the bus, dust, + * vibration, humidity, radioactivity or plain-old failed hardware. + * + * Note, however, that one of the leading causes of EEH slot + * freeze events is buggy device drivers, buggy device microcode, + * or buggy device hardware. This is because any attempt by the + * device to bus-master data to a memory address that is not + * assigned to the device will trigger a slot freeze. (The idea + * is to prevent devices-gone-wild from corrupting system memory). + * Buggy hardware/drivers will have a miserable time co-existing + * with EEH. + * + * Ideally, a PCI device driver, when suspecting that an isolation + * event has occurred (e.g. by reading 0xff's), will then ask EEH + * whether this is the case, and then take appropriate steps to + * reset the PCI slot, the PCI device, and then resume operations. + * However, until that day, the checking is done here, with the + * eeh_check_failure() routine embedded in the MMIO macros. If + * the slot is found to be isolated, an "EEH Event" is synthesized + * and sent out for processing. + */ + +/* If a device driver keeps reading an MMIO register in an interrupt + * handler after a slot isolation event, it might be broken. + * This sets the threshold for how many read attempts we allow + * before printing an error message. + */ +#define EEH_MAX_FAILS 2100000 + +/* Time to wait for a PCI slot to report status, in milliseconds */ +#define PCI_BUS_RESET_WAIT_MSEC (5*60*1000) + +/* + * EEH probe mode support, which is part of the flags, + * is to support multiple platforms for EEH. Some platforms + * like pSeries do PCI enumeration based on device tree. + * However, other platforms like powernv probe PCI devices + * from hardware. The flag is used to distinguish that. 
+ * In addition, struct eeh_ops::probe would be invoked for + * particular OF node or PCI device so that the corresponding + * PE would be created there. + */ +int eeh_subsystem_flags; +EXPORT_SYMBOL(eeh_subsystem_flags); + +/* Platform dependent EEH operations */ +struct eeh_ops *eeh_ops = NULL; + +/* Lock to avoid races due to multiple reports of an error */ +DEFINE_RAW_SPINLOCK(confirm_error_lock); + +/* Buffer for reporting pci register dumps. Its here in BSS, and + * not dynamically alloced, so that it ends up in RMO where RTAS + * can access it. + */ +#define EEH_PCI_REGS_LOG_LEN 4096 +static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN]; + +/* + * The struct is used to maintain the EEH global statistic + * information. Besides, the EEH global statistics will be + * exported to user space through procfs + */ +struct eeh_stats { + u64 no_device; /* PCI device not found */ + u64 no_dn; /* OF node not found */ + u64 no_cfg_addr; /* Config address not found */ + u64 ignored_check; /* EEH check skipped */ + u64 total_mmio_ffs; /* Total EEH checks */ + u64 false_positives; /* Unnecessary EEH checks */ + u64 slot_resets; /* PE reset */ +}; + +static struct eeh_stats eeh_stats; + +#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE) + +static int __init eeh_setup(char *str) +{ + if (!strcmp(str, "off")) + eeh_subsystem_flags |= EEH_FORCE_DISABLED; + + return 1; +} +__setup("eeh=", eeh_setup); + +/** + * eeh_gather_pci_data - Copy assorted PCI config space registers to buff + * @edev: device to report data for + * @buf: point to buffer in which to log + * @len: amount of room in buffer + * + * This routine captures assorted PCI configuration space data, + * and puts them into a buffer for RTAS error logging. 
+ */ +static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len) +{ + struct device_node *dn = eeh_dev_to_of_node(edev); + u32 cfg; + int cap, i; + int n = 0; + + n += scnprintf(buf+n, len-n, "%s\n", dn->full_name); + pr_warn("EEH: of node=%s\n", dn->full_name); + + eeh_ops->read_config(dn, PCI_VENDOR_ID, 4, &cfg); + n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); + pr_warn("EEH: PCI device/vendor: %08x\n", cfg); + + eeh_ops->read_config(dn, PCI_COMMAND, 4, &cfg); + n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg); + pr_warn("EEH: PCI cmd/status register: %08x\n", cfg); + + /* Gather bridge-specific registers */ + if (edev->mode & EEH_DEV_BRIDGE) { + eeh_ops->read_config(dn, PCI_SEC_STATUS, 2, &cfg); + n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg); + pr_warn("EEH: Bridge secondary status: %04x\n", cfg); + + eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &cfg); + n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg); + pr_warn("EEH: Bridge control: %04x\n", cfg); + } + + /* Dump out the PCI-X command and status regs */ + cap = edev->pcix_cap; + if (cap) { + eeh_ops->read_config(dn, cap, 4, &cfg); + n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg); + pr_warn("EEH: PCI-X cmd: %08x\n", cfg); + + eeh_ops->read_config(dn, cap+4, 4, &cfg); + n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg); + pr_warn("EEH: PCI-X status: %08x\n", cfg); + } + + /* If PCI-E capable, dump PCI-E cap 10 */ + cap = edev->pcie_cap; + if (cap) { + n += scnprintf(buf+n, len-n, "pci-e cap10:\n"); + pr_warn("EEH: PCI-E capabilities and status follow:\n"); + + for (i=0; i<=8; i++) { + eeh_ops->read_config(dn, cap+4*i, 4, &cfg); + n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); + pr_warn("EEH: PCI-E %02x: %08x\n", i, cfg); + } + } + + /* If AER capable, dump it */ + cap = edev->aer_cap; + if (cap) { + n += scnprintf(buf+n, len-n, "pci-e AER:\n"); + pr_warn("EEH: PCI-E AER capability register set follows:\n"); + + for (i=0; i<14; i++) { + 
eeh_ops->read_config(dn, cap+4*i, 4, &cfg); + n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); + pr_warn("EEH: PCI-E AER %02x: %08x\n", i, cfg); + } + } + + return n; +} + +/** + * eeh_slot_error_detail - Generate combined log including driver log and error log + * @pe: EEH PE + * @severity: temporary or permanent error log + * + * This routine should be called to generate the combined log, which + * is comprised of driver log and error log. The driver log is figured + * out from the config space of the corresponding PCI device, while + * the error log is fetched through platform dependent function call. + */ +void eeh_slot_error_detail(struct eeh_pe *pe, int severity) +{ + size_t loglen = 0; + struct eeh_dev *edev, *tmp; + + /* + * When the PHB is fenced or dead, it's pointless to collect + * the data from PCI config space because it should return + * 0xFF's. For ER, we still retrieve the data from the PCI + * config space. + * + * For pHyp, we have to enable IO for log retrieval. Otherwise, + * 0xFF's is always returned from PCI config space. + */ + if (!(pe->type & EEH_PE_PHB)) { + if (eeh_probe_mode_devtree()) + eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); + eeh_ops->configure_bridge(pe); + eeh_pe_restore_bars(pe); + + pci_regs_buf[0] = 0; + eeh_pe_for_each_dev(pe, edev, tmp) { + loglen += eeh_gather_pci_data(edev, pci_regs_buf + loglen, + EEH_PCI_REGS_LOG_LEN - loglen); + } + } + + eeh_ops->get_log(pe, severity, pci_regs_buf, loglen); +} + +/** + * eeh_token_to_phys - Convert EEH address token to phys address + * @token: I/O token, should be address in the form 0xA.... + * + * This routine should be called to convert virtual I/O address + * to physical one. 
+ */ +static inline unsigned long eeh_token_to_phys(unsigned long token) +{ + pte_t *ptep; + unsigned long pa; + int hugepage_shift; + + /* + * We won't find hugepages here, iomem + */ + ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift); + if (!ptep) + return token; + WARN_ON(hugepage_shift); + pa = pte_pfn(*ptep) << PAGE_SHIFT; + + return pa | (token & (PAGE_SIZE-1)); +} + +/* + * On PowerNV platform, we might already have fenced PHB there. + * For that case, it's meaningless to recover frozen PE. Intead, + * We have to handle fenced PHB firstly. + */ +static int eeh_phb_check_failure(struct eeh_pe *pe) +{ + struct eeh_pe *phb_pe; + unsigned long flags; + int ret; + + if (!eeh_probe_mode_dev()) + return -EPERM; + + /* Find the PHB PE */ + phb_pe = eeh_phb_pe_get(pe->phb); + if (!phb_pe) { + pr_warning("%s Can't find PE for PHB#%d\n", + __func__, pe->phb->global_number); + return -EEXIST; + } + + /* If the PHB has been in problematic state */ + eeh_serialize_lock(&flags); + if (phb_pe->state & EEH_PE_ISOLATED) { + ret = 0; + goto out; + } + + /* Check PHB state */ + ret = eeh_ops->get_state(phb_pe, NULL); + if ((ret < 0) || + (ret == EEH_STATE_NOT_SUPPORT) || + (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) == + (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) { + ret = 0; + goto out; + } + + /* Isolate the PHB and send event */ + eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); + eeh_serialize_unlock(flags); + + pr_err("EEH: PHB#%x failure detected, location: %s\n", + phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe)); + dump_stack(); + eeh_send_failure_event(phb_pe); + + return 1; +out: + eeh_serialize_unlock(flags); + return ret; +} + +/** + * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze + * @edev: eeh device + * + * Check for an EEH failure for the given device node. Call this + * routine if the result of a read was all 0xff's and you want to + * find out if this is due to an EEH slot freeze. 
This routine + * will query firmware for the EEH status. + * + * Returns 0 if there has not been an EEH error; otherwise returns + * a non-zero value and queues up a slot isolation event notification. + * + * It is safe to call this routine in an interrupt context. + */ +int eeh_dev_check_failure(struct eeh_dev *edev) +{ + int ret; + int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); + unsigned long flags; + struct device_node *dn; + struct pci_dev *dev; + struct eeh_pe *pe, *parent_pe, *phb_pe; + int rc = 0; + const char *location; + + eeh_stats.total_mmio_ffs++; + + if (!eeh_enabled()) + return 0; + + if (!edev) { + eeh_stats.no_dn++; + return 0; + } + dn = eeh_dev_to_of_node(edev); + dev = eeh_dev_to_pci_dev(edev); + pe = edev->pe; + + /* Access to IO BARs might get this far and still not want checking. */ + if (!pe) { + eeh_stats.ignored_check++; + pr_debug("EEH: Ignored check for %s %s\n", + eeh_pci_name(dev), dn->full_name); + return 0; + } + + if (!pe->addr && !pe->config_addr) { + eeh_stats.no_cfg_addr++; + return 0; + } + + /* + * On PowerNV platform, we might already have fenced PHB + * there and we need take care of that firstly. + */ + ret = eeh_phb_check_failure(pe); + if (ret > 0) + return ret; + + /* If we already have a pending isolation event for this + * slot, we know it's bad already, we don't need to check. + * Do this checking under a lock; as multiple PCI devices + * in one slot might report errors simultaneously, and we + * only want one error recovery routine running. 
+ */ + eeh_serialize_lock(&flags); + rc = 1; + if (pe->state & EEH_PE_ISOLATED) { + pe->check_count++; + if (pe->check_count % EEH_MAX_FAILS == 0) { + location = of_get_property(dn, "ibm,loc-code", NULL); + printk(KERN_ERR "EEH: %d reads ignored for recovering device at " + "location=%s driver=%s pci addr=%s\n", + pe->check_count, location, + eeh_driver_name(dev), eeh_pci_name(dev)); + printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n", + eeh_driver_name(dev)); + dump_stack(); + } + goto dn_unlock; + } + + /* + * Now test for an EEH failure. This is VERY expensive. + * Note that the eeh_config_addr may be a parent device + * in the case of a device behind a bridge, or it may be + * function zero of a multi-function device. + * In any case they must share a common PHB. + */ + ret = eeh_ops->get_state(pe, NULL); + + /* Note that config-io to empty slots may fail; + * they are empty when they don't have children. + * We will punt with the following conditions: Failure to get + * PE's state, EEH not support and Permanently unavailable + * state, PE is in good state. + */ + if ((ret < 0) || + (ret == EEH_STATE_NOT_SUPPORT) || + ((ret & active_flags) == active_flags)) { + eeh_stats.false_positives++; + pe->false_positives++; + rc = 0; + goto dn_unlock; + } + + /* + * It should be corner case that the parent PE has been + * put into frozen state as well. We should take care + * that at first. + */ + parent_pe = pe->parent; + while (parent_pe) { + /* Hit the ceiling ? */ + if (parent_pe->type & EEH_PE_PHB) + break; + + /* Frozen parent PE ? */ + ret = eeh_ops->get_state(parent_pe, NULL); + if (ret > 0 && + (ret & active_flags) != active_flags) + pe = parent_pe; + + /* Next parent level */ + parent_pe = parent_pe->parent; + } + + eeh_stats.slot_resets++; + + /* Avoid repeated reports of this failure, including problems + * with other functions on this device, and functions under + * bridges. 
+ */ + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_serialize_unlock(flags); + + /* Most EEH events are due to device driver bugs. Having + * a stack trace will help the device-driver authors figure + * out what happened. So print that out. + */ + phb_pe = eeh_phb_pe_get(pe->phb); + pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", + pe->phb->global_number, pe->addr); + pr_err("EEH: PE location: %s, PHB location: %s\n", + eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); + dump_stack(); + + eeh_send_failure_event(pe); + + return 1; + +dn_unlock: + eeh_serialize_unlock(flags); + return rc; +} + +EXPORT_SYMBOL_GPL(eeh_dev_check_failure); + +/** + * eeh_check_failure - Check if all 1's data is due to EEH slot freeze + * @token: I/O token, should be address in the form 0xA.... + * @val: value, should be all 1's (XXX why do we need this arg??) + * + * Check for an EEH failure at the given token address. Call this + * routine if the result of a read was all 0xff's and you want to + * find out if this is due to an EEH slot freeze event. This routine + * will query firmware for the EEH status. + * + * Note this routine is safe to call in an interrupt context. + */ +unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val) +{ + unsigned long addr; + struct eeh_dev *edev; + + /* Finding the phys addr + pci device; this is pretty quick. */ + addr = eeh_token_to_phys((unsigned long __force) token); + edev = eeh_addr_cache_get_dev(addr); + if (!edev) { + eeh_stats.no_device++; + return val; + } + + eeh_dev_check_failure(edev); + return val; +} + +EXPORT_SYMBOL(eeh_check_failure); + + +/** + * eeh_pci_enable - Enable MMIO or DMA transfers for this slot + * @pe: EEH PE + * + * This routine should be called to reenable frozen MMIO or DMA + * so that it would work correctly again. It's useful while doing + * recovery or log collection on the indicated device. 
+ */ +int eeh_pci_enable(struct eeh_pe *pe, int function) +{ + int rc, flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); + + /* + * pHyp doesn't allow to enable IO or DMA on unfrozen PE. + * Also, it's pointless to enable them on unfrozen PE. So + * we have the check here. + */ + if (function == EEH_OPT_THAW_MMIO || + function == EEH_OPT_THAW_DMA) { + rc = eeh_ops->get_state(pe, NULL); + if (rc < 0) + return rc; + + /* Needn't to enable or already enabled */ + if ((rc == EEH_STATE_NOT_SUPPORT) || + ((rc & flags) == flags)) + return 0; + } + + rc = eeh_ops->set_option(pe, function); + if (rc) + pr_warn("%s: Unexpected state change %d on " + "PHB#%d-PE#%x, err=%d\n", + __func__, function, pe->phb->global_number, + pe->addr, rc); + + rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); + if (rc <= 0) + return rc; + + if ((function == EEH_OPT_THAW_MMIO) && + (rc & EEH_STATE_MMIO_ENABLED)) + return 0; + + if ((function == EEH_OPT_THAW_DMA) && + (rc & EEH_STATE_DMA_ENABLED)) + return 0; + + return rc; +} + +/** + * pcibios_set_pcie_slot_reset - Set PCI-E reset state + * @dev: pci device struct + * @state: reset state to enter + * + * Return value: + * 0 if success + */ +int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(dev); + struct eeh_pe *pe = edev->pe; + + if (!pe) { + pr_err("%s: No PE found on PCI device %s\n", + __func__, pci_name(dev)); + return -EINVAL; + } + + switch (state) { + case pcie_deassert_reset: + eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); + break; + case pcie_hot_reset: + eeh_ops->reset(pe, EEH_RESET_HOT); + break; + case pcie_warm_reset: + eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL); + break; + default: + return -EINVAL; + }; + + return 0; +} + +/** + * eeh_set_pe_freset - Check the required reset for the indicated device + * @data: EEH device + * @flag: return value + * + * Each device might have its preferred reset type: fundamental or + * hot reset. 
The routine is used to collected the information for + * the indicated device and its children so that the bunch of the + * devices could be reset properly. + */ +static void *eeh_set_dev_freset(void *data, void *flag) +{ + struct pci_dev *dev; + unsigned int *freset = (unsigned int *)flag; + struct eeh_dev *edev = (struct eeh_dev *)data; + + dev = eeh_dev_to_pci_dev(edev); + if (dev) + *freset |= dev->needs_freset; + + return NULL; +} + +/** + * eeh_reset_pe_once - Assert the pci #RST line for 1/4 second + * @pe: EEH PE + * + * Assert the PCI #RST line for 1/4 second. + */ +static void eeh_reset_pe_once(struct eeh_pe *pe) +{ + unsigned int freset = 0; + + /* Determine type of EEH reset required for + * Partitionable Endpoint, a hot-reset (1) + * or a fundamental reset (3). + * A fundamental reset required by any device under + * Partitionable Endpoint trumps hot-reset. + */ + eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset); + + if (freset) + eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL); + else + eeh_ops->reset(pe, EEH_RESET_HOT); + + eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); +} + +/** + * eeh_reset_pe - Reset the indicated PE + * @pe: EEH PE + * + * This routine should be called to reset indicated device, including + * PE. A PE might include multiple PCI devices and sometimes PCI bridges + * might be involved as well. + */ +int eeh_reset_pe(struct eeh_pe *pe) +{ + int flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); + int i, rc; + + /* Take three shots at resetting the bus */ + for (i=0; i<3; i++) { + eeh_reset_pe_once(pe); + + /* + * EEH_PE_ISOLATED is expected to be removed after + * BAR restore. 
+ */ + rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); + if ((rc & flags) == flags) + return 0; + + if (rc < 0) { + pr_err("%s: Unrecoverable slot failure on PHB#%d-PE#%x", + __func__, pe->phb->global_number, pe->addr); + return -1; + } + pr_err("EEH: bus reset %d failed on PHB#%d-PE#%x, rc=%d\n", + i+1, pe->phb->global_number, pe->addr, rc); + } + + return -1; +} + +/** + * eeh_save_bars - Save device bars + * @edev: PCI device associated EEH device + * + * Save the values of the device bars. Unlike the restore + * routine, this routine is *not* recursive. This is because + * PCI devices are added individually; but, for the restore, + * an entire slot is reset at a time. + */ +void eeh_save_bars(struct eeh_dev *edev) +{ + int i; + struct device_node *dn; + + if (!edev) + return; + dn = eeh_dev_to_of_node(edev); + + for (i = 0; i < 16; i++) + eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]); + + /* + * For PCI bridges including root port, we need enable bus + * master explicitly. Otherwise, it can't fetch IODA table + * entries correctly. So we cache the bit in advance so that + * we can restore it after reset, either PHB range or PE range. + */ + if (edev->mode & EEH_DEV_BRIDGE) + edev->config_space[1] |= PCI_COMMAND_MASTER; +} + +/** + * eeh_ops_register - Register platform dependent EEH operations + * @ops: platform dependent EEH operations + * + * Register the platform dependent EEH operation callback + * functions. The platform should call this function before + * any other EEH operations. 
+ */ +int __init eeh_ops_register(struct eeh_ops *ops) +{ + if (!ops->name) { + pr_warning("%s: Invalid EEH ops name for %p\n", + __func__, ops); + return -EINVAL; + } + + if (eeh_ops && eeh_ops != ops) { + pr_warning("%s: EEH ops of platform %s already existing (%s)\n", + __func__, eeh_ops->name, ops->name); + return -EEXIST; + } + + eeh_ops = ops; + + return 0; +} + +/** + * eeh_ops_unregister - Unreigster platform dependent EEH operations + * @name: name of EEH platform operations + * + * Unregister the platform dependent EEH operation callback + * functions. + */ +int __exit eeh_ops_unregister(const char *name) +{ + if (!name || !strlen(name)) { + pr_warning("%s: Invalid EEH ops name\n", + __func__); + return -EINVAL; + } + + if (eeh_ops && !strcmp(eeh_ops->name, name)) { + eeh_ops = NULL; + return 0; + } + + return -EEXIST; +} + +static int eeh_reboot_notifier(struct notifier_block *nb, + unsigned long action, void *unused) +{ + eeh_set_enable(false); + return NOTIFY_DONE; +} + +static struct notifier_block eeh_reboot_nb = { + .notifier_call = eeh_reboot_notifier, +}; + +/** + * eeh_init - EEH initialization + * + * Initialize EEH by trying to enable it for all of the adapters in the system. + * As a side effect we can determine here if eeh is supported at all. + * Note that we leave EEH on so failed config cycles won't cause a machine + * check. If a user turns off EEH for a particular adapter they are really + * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't + * grant access to a slot if EEH isn't enabled, and so we always enable + * EEH for all slots/all devices. + * + * The eeh-force-off option disables EEH checking globally, for all slots. + * Even if force-off is set, the EEH hardware is still enabled, so that + * newer systems can boot. 
+ */ +int eeh_init(void) +{ + struct pci_controller *hose, *tmp; + struct device_node *phb; + static int cnt = 0; + int ret = 0; + + /* + * We have to delay the initialization on PowerNV after + * the PCI hierarchy tree has been built because the PEs + * are figured out based on PCI devices instead of device + * tree nodes + */ + if (machine_is(powernv) && cnt++ <= 0) + return ret; + + /* Register reboot notifier */ + ret = register_reboot_notifier(&eeh_reboot_nb); + if (ret) { + pr_warn("%s: Failed to register notifier (%d)\n", + __func__, ret); + return ret; + } + + /* call platform initialization function */ + if (!eeh_ops) { + pr_warning("%s: Platform EEH operation not found\n", + __func__); + return -EEXIST; + } else if ((ret = eeh_ops->init())) { + pr_warning("%s: Failed to call platform init function (%d)\n", + __func__, ret); + return ret; + } + + /* Initialize EEH event */ + ret = eeh_event_init(); + if (ret) + return ret; + + /* Enable EEH for all adapters */ + if (eeh_probe_mode_devtree()) { + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) { + phb = hose->dn; + traverse_pci_devices(phb, eeh_ops->of_probe, NULL); + } + } else if (eeh_probe_mode_dev()) { + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) + pci_walk_bus(hose->bus, eeh_ops->dev_probe, NULL); + } else { + pr_warn("%s: Invalid probe mode %x", + __func__, eeh_subsystem_flags); + return -EINVAL; + } + + /* + * Call platform post-initialization. Actually, It's good chance + * to inform platform that EEH is ready to supply service if the + * I/O cache stuff has been built up. 
+ */ + if (eeh_ops->post_init) { + ret = eeh_ops->post_init(); + if (ret) + return ret; + } + + if (eeh_enabled()) + pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); + else + pr_warning("EEH: No capable adapters found\n"); + + return ret; +} + +core_initcall_sync(eeh_init); + +/** + * eeh_add_device_early - Enable EEH for the indicated device_node + * @dn: device node for which to set up EEH + * + * This routine must be used to perform EEH initialization for PCI + * devices that were added after system boot (e.g. hotplug, dlpar). + * This routine must be called before any i/o is performed to the + * adapter (inluding any config-space i/o). + * Whether this actually enables EEH or not for this device depends + * on the CEC architecture, type of the device, on earlier boot + * command-line arguments & etc. + */ +void eeh_add_device_early(struct device_node *dn) +{ + struct pci_controller *phb; + + /* + * If we're doing EEH probe based on PCI device, we + * would delay the probe until late stage because + * the PCI device isn't available this moment. + */ + if (!eeh_probe_mode_devtree()) + return; + + if (!of_node_to_eeh_dev(dn)) + return; + phb = of_node_to_eeh_dev(dn)->phb; + + /* USB Bus children of PCI devices will not have BUID's */ + if (NULL == phb || 0 == phb->buid) + return; + + eeh_ops->of_probe(dn, NULL); +} + +/** + * eeh_add_device_tree_early - Enable EEH for the indicated device + * @dn: device node + * + * This routine must be used to perform EEH initialization for the + * indicated PCI device that was added after system boot (e.g. + * hotplug, dlpar). 
+ */ +void eeh_add_device_tree_early(struct device_node *dn) +{ + struct device_node *sib; + + for_each_child_of_node(dn, sib) + eeh_add_device_tree_early(sib); + eeh_add_device_early(dn); +} +EXPORT_SYMBOL_GPL(eeh_add_device_tree_early); + +/** + * eeh_add_device_late - Perform EEH initialization for the indicated pci device + * @dev: pci device for which to set up EEH + * + * This routine must be used to complete EEH initialization for PCI + * devices that were added after system boot (e.g. hotplug, dlpar). + */ +void eeh_add_device_late(struct pci_dev *dev) +{ + struct device_node *dn; + struct eeh_dev *edev; + + if (!dev || !eeh_enabled()) + return; + + pr_debug("EEH: Adding device %s\n", pci_name(dev)); + + dn = pci_device_to_OF_node(dev); + edev = of_node_to_eeh_dev(dn); + if (edev->pdev == dev) { + pr_debug("EEH: Already referenced !\n"); + return; + } + + /* + * The EEH cache might not be removed correctly because of + * unbalanced kref to the device during unplug time, which + * relies on pcibios_release_device(). So we have to remove + * that here explicitly. + */ + if (edev->pdev) { + eeh_rmv_from_parent_pe(edev); + eeh_addr_cache_rmv_dev(edev->pdev); + eeh_sysfs_remove_device(edev->pdev); + edev->mode &= ~EEH_DEV_SYSFS; + + /* + * We definitely should have the PCI device removed + * though it wasn't correctly. So we needn't call + * into error handler afterwards. + */ + edev->mode |= EEH_DEV_NO_HANDLER; + + edev->pdev = NULL; + dev->dev.archdata.edev = NULL; + } + + edev->pdev = dev; + dev->dev.archdata.edev = edev; + + /* + * We have to do the EEH probe here because the PCI device + * hasn't been created yet in the early stage. 
+ */ + if (eeh_probe_mode_dev()) + eeh_ops->dev_probe(dev, NULL); + + eeh_addr_cache_insert_dev(dev); +} + +/** + * eeh_add_device_tree_late - Perform EEH initialization for the indicated PCI bus + * @bus: PCI bus + * + * This routine must be used to perform EEH initialization for PCI + * devices which are attached to the indicated PCI bus. The PCI bus + * is added after system boot through hotplug or dlpar. + */ +void eeh_add_device_tree_late(struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + eeh_add_device_late(dev); + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { + struct pci_bus *subbus = dev->subordinate; + if (subbus) + eeh_add_device_tree_late(subbus); + } + } +} +EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); + +/** + * eeh_add_sysfs_files - Add EEH sysfs files for the indicated PCI bus + * @bus: PCI bus + * + * This routine must be used to add EEH sysfs files for PCI + * devices which are attached to the indicated PCI bus. The PCI bus + * is added after system boot through hotplug or dlpar. + */ +void eeh_add_sysfs_files(struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + eeh_sysfs_add_device(dev); + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { + struct pci_bus *subbus = dev->subordinate; + if (subbus) + eeh_add_sysfs_files(subbus); + } + } +} +EXPORT_SYMBOL_GPL(eeh_add_sysfs_files); + +/** + * eeh_remove_device - Undo EEH setup for the indicated pci device + * @dev: pci device to be removed + * + * This routine should be called when a device is removed from + * a running system (e.g. by hotplug or dlpar). It unregisters + * the PCI device from the EEH subsystem. I/O errors affecting + * this device will no longer be detected after this call; thus, + * i/o errors affecting this slot may leave this device unusable. 
+ */ +void eeh_remove_device(struct pci_dev *dev) +{ + struct eeh_dev *edev; + + if (!dev || !eeh_enabled()) + return; + edev = pci_dev_to_eeh_dev(dev); + + /* Unregister the device with the EEH/PCI address search system */ + pr_debug("EEH: Removing device %s\n", pci_name(dev)); + + if (!edev || !edev->pdev || !edev->pe) { + pr_debug("EEH: Not referenced !\n"); + return; + } + + /* + * During the hotplug for EEH error recovery, we need the EEH + * device attached to the parent PE in order for BAR restore + * a bit later. So we keep it for BAR restore and remove it + * from the parent PE during the BAR resotre. + */ + edev->pdev = NULL; + dev->dev.archdata.edev = NULL; + if (!(edev->pe->state & EEH_PE_KEEP)) + eeh_rmv_from_parent_pe(edev); + else + edev->mode |= EEH_DEV_DISCONNECTED; + + /* + * We're removing from the PCI subsystem, that means + * the PCI device driver can't support EEH or not + * well. So we rely on hotplug completely to do recovery + * for the specific PCI device. + */ + edev->mode |= EEH_DEV_NO_HANDLER; + + eeh_addr_cache_rmv_dev(dev); + eeh_sysfs_remove_device(dev); + edev->mode &= ~EEH_DEV_SYSFS; +} + +static int proc_eeh_show(struct seq_file *m, void *v) +{ + if (!eeh_enabled()) { + seq_printf(m, "EEH Subsystem is globally disabled\n"); + seq_printf(m, "eeh_total_mmio_ffs=%llu\n", eeh_stats.total_mmio_ffs); + } else { + seq_printf(m, "EEH Subsystem is enabled\n"); + seq_printf(m, + "no device=%llu\n" + "no device node=%llu\n" + "no config address=%llu\n" + "check not wanted=%llu\n" + "eeh_total_mmio_ffs=%llu\n" + "eeh_false_positives=%llu\n" + "eeh_slot_resets=%llu\n", + eeh_stats.no_device, + eeh_stats.no_dn, + eeh_stats.no_cfg_addr, + eeh_stats.ignored_check, + eeh_stats.total_mmio_ffs, + eeh_stats.false_positives, + eeh_stats.slot_resets); + } + + return 0; +} + +static int proc_eeh_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_eeh_show, NULL); +} + +static const struct file_operations proc_eeh_operations = 
{ + .open = proc_eeh_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef CONFIG_DEBUG_FS +static int eeh_enable_dbgfs_set(void *data, u64 val) +{ + if (val) + eeh_subsystem_flags &= ~EEH_FORCE_DISABLED; + else + eeh_subsystem_flags |= EEH_FORCE_DISABLED; + + /* Notify the backend */ + if (eeh_ops->post_init) + eeh_ops->post_init(); + + return 0; +} + +static int eeh_enable_dbgfs_get(void *data, u64 *val) +{ + if (eeh_enabled()) + *val = 0x1ul; + else + *val = 0x0ul; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get, + eeh_enable_dbgfs_set, "0x%llx\n"); +#endif + +static int __init eeh_init_proc(void) +{ + if (machine_is(pseries) || machine_is(powernv)) { + proc_create("powerpc/eeh", 0, NULL, &proc_eeh_operations); +#ifdef CONFIG_DEBUG_FS + debugfs_create_file("eeh_enable", 0600, + powerpc_debugfs_root, NULL, + &eeh_enable_dbgfs_ops); +#endif + } + + return 0; +} +__initcall(eeh_init_proc); diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c new file mode 100644 index 00000000000..e8c9fd546a5 --- /dev/null +++ b/arch/powerpc/kernel/eeh_cache.c @@ -0,0 +1,310 @@ +/* + * PCI address cache; allows the lookup of PCI devices based on I/O address + * + * Copyright IBM Corporation 2004 + * Copyright Linas Vepstas <linas@austin.ibm.com> 2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/rbtree.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> + + +/** + * The pci address cache subsystem. This subsystem places + * PCI device address resources into a red-black tree, sorted + * according to the address range, so that given only an i/o + * address, the corresponding PCI device can be **quickly** + * found. It is safe to perform an address lookup in an interrupt + * context; this ability is an important feature. + * + * Currently, the only customer of this code is the EEH subsystem; + * thus, this code has been somewhat tailored to suit EEH better. + * In particular, the cache does *not* hold the addresses of devices + * for which EEH is not enabled. + * + * (Implementation Note: The RB tree seems to be better/faster + * than any hash algo I could think of for this problem, even + * with the penalty of slow pointer chases for d-cache misses). 
+ */ +struct pci_io_addr_range { + struct rb_node rb_node; + unsigned long addr_lo; + unsigned long addr_hi; + struct eeh_dev *edev; + struct pci_dev *pcidev; + unsigned int flags; +}; + +static struct pci_io_addr_cache { + struct rb_root rb_root; + spinlock_t piar_lock; +} pci_io_addr_cache_root; + +static inline struct eeh_dev *__eeh_addr_cache_get_device(unsigned long addr) +{ + struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node; + + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + + if (addr < piar->addr_lo) + n = n->rb_left; + else if (addr > piar->addr_hi) + n = n->rb_right; + else + return piar->edev; + } + + return NULL; +} + +/** + * eeh_addr_cache_get_dev - Get device, given only address + * @addr: mmio (PIO) phys address or i/o port number + * + * Given an mmio phys address, or a port number, find a pci device + * that implements this address. Be sure to pci_dev_put the device + * when finished. I/O port numbers are assumed to be offset + * from zero (that is, they do *not* have pci_io_addr added in). + * It is safe to call this function within an interrupt. + */ +struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr) +{ + struct eeh_dev *edev; + unsigned long flags; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + edev = __eeh_addr_cache_get_device(addr); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); + return edev; +} + +#ifdef DEBUG +/* + * Handy-dandy debug print routine, does nothing more + * than print out the contents of our addr cache. + */ +static void eeh_addr_cache_print(struct pci_io_addr_cache *cache) +{ + struct rb_node *n; + int cnt = 0; + + n = rb_first(&cache->rb_root); + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + pr_debug("PCI: %s addr range %d [%lx-%lx]: %s\n", + (piar->flags & IORESOURCE_IO) ? 
"i/o" : "mem", cnt, + piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev)); + cnt++; + n = rb_next(n); + } +} +#endif + +/* Insert address range into the rb tree. */ +static struct pci_io_addr_range * +eeh_addr_cache_insert(struct pci_dev *dev, unsigned long alo, + unsigned long ahi, unsigned int flags) +{ + struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node; + struct rb_node *parent = NULL; + struct pci_io_addr_range *piar; + + /* Walk tree, find a place to insert into tree */ + while (*p) { + parent = *p; + piar = rb_entry(parent, struct pci_io_addr_range, rb_node); + if (ahi < piar->addr_lo) { + p = &parent->rb_left; + } else if (alo > piar->addr_hi) { + p = &parent->rb_right; + } else { + if (dev != piar->pcidev || + alo != piar->addr_lo || ahi != piar->addr_hi) { + pr_warning("PIAR: overlapping address range\n"); + } + return piar; + } + } + piar = kzalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC); + if (!piar) + return NULL; + + piar->addr_lo = alo; + piar->addr_hi = ahi; + piar->edev = pci_dev_to_eeh_dev(dev); + piar->pcidev = dev; + piar->flags = flags; + +#ifdef DEBUG + pr_debug("PIAR: insert range=[%lx:%lx] dev=%s\n", + alo, ahi, pci_name(dev)); +#endif + + rb_link_node(&piar->rb_node, parent, p); + rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); + + return piar; +} + +static void __eeh_addr_cache_insert_dev(struct pci_dev *dev) +{ + struct device_node *dn; + struct eeh_dev *edev; + int i; + + dn = pci_device_to_OF_node(dev); + if (!dn) { + pr_warning("PCI: no pci dn found for dev=%s\n", pci_name(dev)); + return; + } + + edev = of_node_to_eeh_dev(dn); + if (!edev) { + pr_warning("PCI: no EEH dev found for dn=%s\n", + dn->full_name); + return; + } + + /* Skip any devices for which EEH is not enabled. 
*/ + if (!eeh_probe_mode_dev() && !edev->pe) { +#ifdef DEBUG + pr_info("PCI: skip building address cache for=%s - %s\n", + pci_name(dev), dn->full_name); +#endif + return; + } + + /* Walk resources on this device, poke them into the tree */ + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + unsigned long start = pci_resource_start(dev,i); + unsigned long end = pci_resource_end(dev,i); + unsigned int flags = pci_resource_flags(dev,i); + + /* We are interested only bus addresses, not dma or other stuff */ + if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM))) + continue; + if (start == 0 || ~start == 0 || end == 0 || ~end == 0) + continue; + eeh_addr_cache_insert(dev, start, end, flags); + } +} + +/** + * eeh_addr_cache_insert_dev - Add a device to the address cache + * @dev: PCI device whose I/O addresses we are interested in. + * + * In order to support the fast lookup of devices based on addresses, + * we maintain a cache of devices that can be quickly searched. + * This routine adds a device to that cache. + */ +void eeh_addr_cache_insert_dev(struct pci_dev *dev) +{ + unsigned long flags; + + /* Ignore PCI bridges */ + if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE) + return; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + __eeh_addr_cache_insert_dev(dev); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); +} + +static inline void __eeh_addr_cache_rmv_dev(struct pci_dev *dev) +{ + struct rb_node *n; + +restart: + n = rb_first(&pci_io_addr_cache_root.rb_root); + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + + if (piar->pcidev == dev) { + rb_erase(n, &pci_io_addr_cache_root.rb_root); + kfree(piar); + goto restart; + } + n = rb_next(n); + } +} + +/** + * eeh_addr_cache_rmv_dev - remove pci device from addr cache + * @dev: device to remove + * + * Remove a device from the addr-cache tree. 
+ * This is potentially expensive, since it will walk + * the tree multiple times (once per resource). + * But so what; device removal doesn't need to be that fast. + */ +void eeh_addr_cache_rmv_dev(struct pci_dev *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + __eeh_addr_cache_rmv_dev(dev); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); +} + +/** + * eeh_addr_cache_build - Build a cache of I/O addresses + * + * Build a cache of pci i/o addresses. This cache will be used to + * find the pci device that corresponds to a given address. + * This routine scans all pci busses to build the cache. + * Must be run late in boot process, after the pci controllers + * have been scanned for devices (after all device resources are known). + */ +void eeh_addr_cache_build(void) +{ + struct device_node *dn; + struct eeh_dev *edev; + struct pci_dev *dev = NULL; + + spin_lock_init(&pci_io_addr_cache_root.piar_lock); + + for_each_pci_dev(dev) { + dn = pci_device_to_OF_node(dev); + if (!dn) + continue; + + edev = of_node_to_eeh_dev(dn); + if (!edev) + continue; + + dev->dev.archdata.edev = edev; + edev->pdev = dev; + + eeh_addr_cache_insert_dev(dev); + eeh_sysfs_add_device(dev); + } + +#ifdef DEBUG + /* Verify tree built up above, echo back the list of addrs. */ + eeh_addr_cache_print(&pci_io_addr_cache_root); +#endif +} diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c new file mode 100644 index 00000000000..1efa28f5fc5 --- /dev/null +++ b/arch/powerpc/kernel/eeh_dev.c @@ -0,0 +1,112 @@ +/* + * The file intends to implement dynamic creation of EEH device, which will + * be bound with OF node and PCI device simutaneously. The EEH devices would + * be foundamental information for EEH core components to work proerly. 
Besides, + * We have to support multiple situations where dynamic creation of EEH device + * is required: + * + * 1) Before PCI enumeration starts, we need to create EEH devices according to the + * PCI sensitive OF nodes. + * 2) When PCI enumeration is done, we need to do the binding between PCI device and + * the associated EEH device. + * 3) DR (Dynamic Reconfiguration) would create PCI sensitive OF node. EEH device + * will be created while PCI sensitive OF node is detected from DR. + * 4) PCI hotplug needs redoing the binding between PCI device and EEH device. If + * PHB is newly inserted, we also need to create EEH devices accordingly. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/export.h> +#include <linux/gfp.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> + +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> + +/** + * eeh_dev_init - Create EEH device according to OF node + * @dn: device node + * @data: PHB + * + * It will create EEH device according to the given OF node. The function + * might be called by PCI enumeration, DR, PHB hotplug. 
+ */ +void *eeh_dev_init(struct device_node *dn, void *data) +{ + struct pci_controller *phb = data; + struct eeh_dev *edev; + + /* Allocate EEH device */ + edev = kzalloc(sizeof(*edev), GFP_KERNEL); + if (!edev) { + pr_warning("%s: out of memory\n", __func__); + return NULL; + } + + /* Associate EEH device with OF node */ + PCI_DN(dn)->edev = edev; + edev->dn = dn; + edev->phb = phb; + INIT_LIST_HEAD(&edev->list); + + return NULL; +} + +/** + * eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB + * @phb: PHB + * + * Scan the PHB OF node and its child association, then create the + * EEH devices accordingly + */ +void eeh_dev_phb_init_dynamic(struct pci_controller *phb) +{ + struct device_node *dn = phb->dn; + + /* EEH PE for PHB */ + eeh_phb_pe_create(phb); + + /* EEH device for PHB */ + eeh_dev_init(dn, phb); + + /* EEH devices for children OF nodes */ + traverse_pci_devices(dn, eeh_dev_init, phb); +} + +/** + * eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs + * + * Scan all the existing PHBs and create EEH devices for their OF + * nodes and their children OF nodes + */ +static int __init eeh_dev_phb_init(void) +{ + struct pci_controller *phb, *tmp; + + list_for_each_entry_safe(phb, tmp, &hose_list, list_node) + eeh_dev_phb_init_dynamic(phb); + + pr_info("EEH: devices created\n"); + + return 0; +} + +core_initcall(eeh_dev_phb_init); diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c new file mode 100644 index 00000000000..420da61d4ce --- /dev/null +++ b/arch/powerpc/kernel/eeh_driver.c @@ -0,0 +1,870 @@ +/* + * PCI Error Recovery Driver for RPA-compliant PPC64 platform. + * Copyright IBM Corp. 2004 2005 + * Copyright Linas Vepstas <linas@linas.org> 2004, 2005 + * + * All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> + */ +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/ppc-pci.h> +#include <asm/pci-bridge.h> +#include <asm/prom.h> +#include <asm/rtas.h> + +/** + * eeh_pcid_name - Retrieve name of PCI device driver + * @pdev: PCI device + * + * This routine is used to retrieve the name of PCI device driver + * if that's valid. + */ +static inline const char *eeh_pcid_name(struct pci_dev *pdev) +{ + if (pdev && pdev->dev.driver) + return pdev->dev.driver->name; + return ""; +} + +/** + * eeh_pcid_get - Get the PCI device driver + * @pdev: PCI device + * + * The function is used to retrieve the PCI device driver for + * the indicated PCI device. Besides, we will increase the reference + * of the PCI device driver to prevent that being unloaded on + * the fly. Otherwise, kernel crash would be seen. 
+ */ +static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) +{ + if (!pdev || !pdev->driver) + return NULL; + + if (!try_module_get(pdev->driver->driver.owner)) + return NULL; + + return pdev->driver; +} + +/** + * eeh_pcid_put - Dereference on the PCI device driver + * @pdev: PCI device + * + * The function is called to do dereference on the PCI device + * driver of the indicated PCI device. + */ +static inline void eeh_pcid_put(struct pci_dev *pdev) +{ + if (!pdev || !pdev->driver) + return; + + module_put(pdev->driver->driver.owner); +} + +#if 0 +static void print_device_node_tree(struct pci_dn *pdn, int dent) +{ + int i; + struct device_node *pc; + + if (!pdn) + return; + for (i = 0; i < dent; i++) + printk(" "); + printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n", + pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr, + pdn->eeh_pe_config_addr, pdn->node->full_name); + dent += 3; + pc = pdn->node->child; + while (pc) { + print_device_node_tree(PCI_DN(pc), dent); + pc = pc->sibling; + } +} +#endif + +/** + * eeh_disable_irq - Disable interrupt for the recovering device + * @dev: PCI device + * + * This routine must be called when reporting temporary or permanent + * error to the particular PCI device to disable interrupt of that + * device. If the device has enabled MSI or MSI-X interrupt, we needn't + * do real work because EEH should freeze DMA transfers for those PCI + * devices encountering EEH errors, which includes MSI or MSI-X. + */ +static void eeh_disable_irq(struct pci_dev *dev) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(dev); + + /* Don't disable MSI and MSI-X interrupts. They are + * effectively disabled by the DMA Stopped state + * when an EEH error occurs. 
+ */ + if (dev->msi_enabled || dev->msix_enabled) + return; + + if (!irq_has_action(dev->irq)) + return; + + edev->mode |= EEH_DEV_IRQ_DISABLED; + disable_irq_nosync(dev->irq); +} + +/** + * eeh_enable_irq - Enable interrupt for the recovering device + * @dev: PCI device + * + * This routine must be called to enable interrupt while failed + * device could be resumed. + */ +static void eeh_enable_irq(struct pci_dev *dev) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(dev); + + if ((edev->mode) & EEH_DEV_IRQ_DISABLED) { + edev->mode &= ~EEH_DEV_IRQ_DISABLED; + /* + * FIXME !!!!! + * + * This is just ass backwards. This maze has + * unbalanced irq_enable/disable calls. So instead of + * finding the root cause it works around the warning + * in the irq_enable code by conditionally calling + * into it. + * + * That's just wrong.The warning in the core code is + * there to tell people to fix their assymetries in + * their own code, not by abusing the core information + * to avoid it. + * + * I so wish that the assymetry would be the other way + * round and a few more irq_disable calls render that + * shit unusable forever. + * + * tglx + */ + if (irqd_irq_disabled(irq_get_irq_data(dev->irq))) + enable_irq(dev->irq); + } +} + +static bool eeh_dev_removed(struct eeh_dev *edev) +{ + /* EEH device removed ? */ + if (!edev || (edev->mode & EEH_DEV_REMOVED)) + return true; + + return false; +} + +/** + * eeh_report_error - Report pci error to each device driver + * @data: eeh device + * @userdata: return value + * + * Report an EEH error to each device driver, collect up and + * merge the device driver responses. Cumulative response + * passed back in "userdata". 
+ */ +static void *eeh_report_error(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + enum pci_ers_result rc, *res = userdata; + struct pci_driver *driver; + + if (!dev || eeh_dev_removed(edev)) + return NULL; + dev->error_state = pci_channel_io_frozen; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + eeh_disable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->error_detected) { + eeh_pcid_put(dev); + return NULL; + } + + rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); + + /* A driver that needs a reset trumps all others */ + if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + if (*res == PCI_ERS_RESULT_NONE) *res = rc; + + eeh_pcid_put(dev); + return NULL; +} + +/** + * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled + * @data: eeh device + * @userdata: return value + * + * Tells each device driver that IO ports, MMIO and config space I/O + * are now enabled. Collects up and merges the device driver responses. + * Cumulative response passed back in "userdata". 
+ */ +static void *eeh_report_mmio_enabled(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + enum pci_ers_result rc, *res = userdata; + struct pci_driver *driver; + + if (!dev || eeh_dev_removed(edev)) + return NULL; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + if (!driver->err_handler || + !driver->err_handler->mmio_enabled || + (edev->mode & EEH_DEV_NO_HANDLER)) { + eeh_pcid_put(dev); + return NULL; + } + + rc = driver->err_handler->mmio_enabled(dev); + + /* A driver that needs a reset trumps all others */ + if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + if (*res == PCI_ERS_RESULT_NONE) *res = rc; + + eeh_pcid_put(dev); + return NULL; +} + +/** + * eeh_report_reset - Tell device that slot has been reset + * @data: eeh device + * @userdata: return value + * + * This routine must be called while EEH tries to reset particular + * PCI device so that the associated PCI device driver could take + * some actions, usually to save data the driver needs so that the + * driver can work again while the device is recovered. 
+ */ +static void *eeh_report_reset(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + enum pci_ers_result rc, *res = userdata; + struct pci_driver *driver; + + if (!dev || eeh_dev_removed(edev)) + return NULL; + dev->error_state = pci_channel_io_normal; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + eeh_enable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->slot_reset || + (edev->mode & EEH_DEV_NO_HANDLER)) { + eeh_pcid_put(dev); + return NULL; + } + + rc = driver->err_handler->slot_reset(dev); + if ((*res == PCI_ERS_RESULT_NONE) || + (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc; + if (*res == PCI_ERS_RESULT_DISCONNECT && + rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + + eeh_pcid_put(dev); + return NULL; +} + +/** + * eeh_report_resume - Tell device to resume normal operations + * @data: eeh device + * @userdata: return value + * + * This routine must be called to notify the device driver that it + * could resume so that the device driver can do some initialization + * to make the recovered device work again. + */ +static void *eeh_report_resume(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + struct pci_driver *driver; + + if (!dev || eeh_dev_removed(edev)) + return NULL; + dev->error_state = pci_channel_io_normal; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + eeh_enable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->resume || + (edev->mode & EEH_DEV_NO_HANDLER)) { + edev->mode &= ~EEH_DEV_NO_HANDLER; + eeh_pcid_put(dev); + return NULL; + } + + driver->err_handler->resume(dev); + + eeh_pcid_put(dev); + return NULL; +} + +/** + * eeh_report_failure - Tell device driver that device is dead. 
+ * @data: eeh device + * @userdata: return value + * + * This informs the device driver that the device is permanently + * dead, and that no further recovery attempts will be made on it. + */ +static void *eeh_report_failure(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + struct pci_driver *driver; + + if (!dev || eeh_dev_removed(edev)) + return NULL; + dev->error_state = pci_channel_io_perm_failure; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + eeh_disable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->error_detected) { + eeh_pcid_put(dev); + return NULL; + } + + driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); + + eeh_pcid_put(dev); + return NULL; +} + +static void *eeh_rmv_device(void *data, void *userdata) +{ + struct pci_driver *driver; + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + int *removed = (int *)userdata; + + /* + * Actually, we should remove the PCI bridges as well. + * However, that's lots of complexity to do that, + * particularly some of devices under the bridge might + * support EEH. So we just care about PCI devices for + * simplicity here. + */ + if (!dev || (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE)) + return NULL; + + /* + * We rely on count-based pcibios_release_device() to + * detach permanently offlined PEs. Unfortunately, that's + * not reliable enough. We might have the permanently + * offlined PEs attached, but we needn't take care of + * them and their child devices. 
+ */ + if (eeh_dev_removed(edev)) + return NULL; + + driver = eeh_pcid_get(dev); + if (driver) { + eeh_pcid_put(dev); + if (driver->err_handler) + return NULL; + } + + /* Remove it from PCI subsystem */ + pr_debug("EEH: Removing %s without EEH sensitive driver\n", + pci_name(dev)); + edev->bus = dev->bus; + edev->mode |= EEH_DEV_DISCONNECTED; + (*removed)++; + + pci_lock_rescan_remove(); + pci_stop_and_remove_bus_device(dev); + pci_unlock_rescan_remove(); + + return NULL; +} + +static void *eeh_pe_detach_dev(void *data, void *userdata) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + struct eeh_dev *edev, *tmp; + + eeh_pe_for_each_dev(pe, edev, tmp) { + if (!(edev->mode & EEH_DEV_DISCONNECTED)) + continue; + + edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED); + eeh_rmv_from_parent_pe(edev); + } + + return NULL; +} + +/* + * Explicitly clear PE's frozen state for PowerNV where + * we have frozen PE until BAR restore is completed. It's + * harmless to clear it for pSeries. To be consistent with + * PE reset (for 3 times), we try to clear the frozen state + * for 3 times as well. + */ +static void *__eeh_clear_pe_frozen_state(void *data, void *flag) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + int i, rc; + + for (i = 0; i < 3; i++) { + rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); + if (rc) + continue; + rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); + if (!rc) + break; + } + + /* The PE has been isolated, clear it */ + if (rc) { + pr_warn("%s: Can't clear frozen PHB#%x-PE#%x (%d)\n", + __func__, pe->phb->global_number, pe->addr, rc); + return (void *)pe; + } + + return NULL; +} + +static int eeh_clear_pe_frozen_state(struct eeh_pe *pe) +{ + void *rc; + + rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, NULL); + if (!rc) + eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + + return rc ? 
-EIO : 0; +} + +/** + * eeh_reset_device - Perform actual reset of a pci slot + * @pe: EEH PE + * @bus: PCI bus corresponding to the isolated slot + * + * This routine must be called to do reset on the indicated PE. + * During the reset, udev might be invoked because those affected + * PCI devices will be removed and then added. + * + * Returns 0 on success, or the error code from resetting the PE or + * from clearing its frozen state. + */ +static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) +{ + struct pci_bus *frozen_bus = eeh_pe_bus_get(pe); + struct timeval tstamp; + int cnt, rc, removed = 0; + + /* pcibios will clear the counter; save the value */ + cnt = pe->freeze_count; + tstamp = pe->tstamp; + + /* + * We don't remove the corresponding PE instances because + * we need the information afterwards. The attached EEH + * devices are expected to be attached soon when calling + * into pcibios_add_pci_devices(). + */ + eeh_pe_state_mark(pe, EEH_PE_KEEP); + if (bus) { + pci_lock_rescan_remove(); + pcibios_remove_pci_devices(bus); + pci_unlock_rescan_remove(); + } else if (frozen_bus) { + eeh_pe_dev_traverse(pe, eeh_rmv_device, &removed); + } + + /* + * Reset the pci controller. (Asserts RST#; resets config space). + * Reconfigure bridges and devices. Don't try to bring the system + * up if the reset failed for some reason. + * + * During the reset, it's very dangerous to have uncontrolled PCI + * config accesses. So we prefer to block them. However, controlled + * PCI config accesses initiated from EEH itself are allowed. + */ + eeh_pe_state_mark(pe, EEH_PE_RESET); + rc = eeh_reset_pe(pe); + if (rc) { + eeh_pe_state_clear(pe, EEH_PE_RESET); + return rc; + } + + pci_lock_rescan_remove(); + + /* Restore PE */ + eeh_ops->configure_bridge(pe); + eeh_pe_restore_bars(pe); + eeh_pe_state_clear(pe, EEH_PE_RESET); + + /* Clear frozen state */ + rc = eeh_clear_pe_frozen_state(pe); + if (rc) { + /* Release the rescan/remove lock taken above before bailing out */ + pci_unlock_rescan_remove(); + return rc; + } + + /* Give the system 5 seconds to finish running the user-space + * hotplug shutdown scripts, e.g. ifdown for ethernet.
Yes, + * this is a hack, but if we don't do this, and try to bring + * the device up before the scripts have taken it down, + * potentially weird things happen. + */ + if (bus) { + pr_info("EEH: Sleep 5s ahead of complete hotplug\n"); + ssleep(5); + + /* + * The EEH device is still connected with its parent + * PE. We should disconnect it so the binding can be + * rebuilt when adding PCI devices. + */ + eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); + pcibios_add_pci_devices(bus); + } else if (frozen_bus && removed) { + pr_info("EEH: Sleep 5s ahead of partial hotplug\n"); + ssleep(5); + + eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); + pcibios_add_pci_devices(frozen_bus); + } + eeh_pe_state_clear(pe, EEH_PE_KEEP); + + pe->tstamp = tstamp; + pe->freeze_count = cnt; + + pci_unlock_rescan_remove(); + return 0; +} + +/* The longest amount of time to wait for a pci device + * to come back on line, in seconds. + */ +#define MAX_WAIT_FOR_RECOVERY 300 + +static void eeh_handle_normal_event(struct eeh_pe *pe) +{ + struct pci_bus *frozen_bus; + int rc = 0; + enum pci_ers_result result = PCI_ERS_RESULT_NONE; + + frozen_bus = eeh_pe_bus_get(pe); + if (!frozen_bus) { + pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n", + __func__, pe->phb->global_number, pe->addr); + return; + } + + eeh_pe_update_time_stamp(pe); + pe->freeze_count++; + if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) + goto excess_failures; + pr_warning("EEH: This PCI device has failed %d times in the last hour\n", + pe->freeze_count); + + /* Walk the various device drivers attached to this slot through + * a reset sequence, giving each an opportunity to do what it needs + * to accomplish the reset. Each child gets a report of the + * status ... if any child can't handle the reset, then the entire + * slot is dlpar removed and added. + */ + pr_info("EEH: Notify device drivers to shutdown\n"); + eeh_pe_dev_traverse(pe, eeh_report_error, &result); + + /* Get the current PCI slot state. 
This can take a long time, + * sometimes over 3 seconds for certain systems. + */ + rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); + if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { + pr_warning("EEH: Permanent failure\n"); + goto hard_fail; + } + + /* Since rtas may enable MMIO when posting the error log, + * don't post the error log until after all dev drivers + * have been informed. + */ + pr_info("EEH: Collect temporary log\n"); + eeh_slot_error_detail(pe, EEH_LOG_TEMP); + + /* If all device drivers were EEH-unaware, then shut + * down all of the device drivers, and hope they + * go down willingly, without panicing the system. + */ + if (result == PCI_ERS_RESULT_NONE) { + pr_info("EEH: Reset with hotplug activity\n"); + rc = eeh_reset_device(pe, frozen_bus); + if (rc) { + pr_warning("%s: Unable to reset, err=%d\n", + __func__, rc); + goto hard_fail; + } + } + + /* If all devices reported they can proceed, then re-enable MMIO */ + if (result == PCI_ERS_RESULT_CAN_RECOVER) { + pr_info("EEH: Enable I/O for affected devices\n"); + rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); + + if (rc < 0) + goto hard_fail; + if (rc) { + result = PCI_ERS_RESULT_NEED_RESET; + } else { + pr_info("EEH: Notify device drivers to resume I/O\n"); + eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result); + } + } + + /* If all devices reported they can proceed, then re-enable DMA */ + if (result == PCI_ERS_RESULT_CAN_RECOVER) { + pr_info("EEH: Enabled DMA for affected devices\n"); + rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); + + if (rc < 0) + goto hard_fail; + if (rc) { + result = PCI_ERS_RESULT_NEED_RESET; + } else { + /* + * We didn't do PE reset for the case. The PE + * is still in frozen state. Clear it before + * resuming the PE. + */ + eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + result = PCI_ERS_RESULT_RECOVERED; + } + } + + /* If any device has a hard failure, then shut off everything. 
*/ + if (result == PCI_ERS_RESULT_DISCONNECT) { + pr_warning("EEH: Device driver gave up\n"); + goto hard_fail; + } + + /* If any device called out for a reset, then reset the slot */ + if (result == PCI_ERS_RESULT_NEED_RESET) { + pr_info("EEH: Reset without hotplug activity\n"); + rc = eeh_reset_device(pe, NULL); + if (rc) { + pr_warning("%s: Cannot reset, err=%d\n", + __func__, rc); + goto hard_fail; + } + + pr_info("EEH: Notify device drivers " + "the completion of reset\n"); + result = PCI_ERS_RESULT_NONE; + eeh_pe_dev_traverse(pe, eeh_report_reset, &result); + } + + /* All devices should claim they have recovered by now. */ + if ((result != PCI_ERS_RESULT_RECOVERED) && + (result != PCI_ERS_RESULT_NONE)) { + pr_warning("EEH: Not recovered\n"); + goto hard_fail; + } + + /* Tell all device drivers that they can resume operations */ + pr_info("EEH: Notify device driver to resume\n"); + eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); + + return; + +excess_failures: + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + pr_err("EEH: PHB#%d-PE#%x has failed %d times in the\n" + "last hour and has been permanently disabled.\n" + "Please try reseating or replacing it.\n", + pe->phb->global_number, pe->addr, + pe->freeze_count); + goto perm_error; + +hard_fail: + pr_err("EEH: Unable to recover from failure from PHB#%d-PE#%x.\n" + "Please try reseating or replacing it\n", + pe->phb->global_number, pe->addr); + +perm_error: + eeh_slot_error_detail(pe, EEH_LOG_PERM); + + /* Notify all devices that they're about to go down. */ + eeh_pe_dev_traverse(pe, eeh_report_failure, NULL); + + /* Mark the PE to be removed permanently */ + pe->freeze_count = EEH_MAX_ALLOWED_FREEZES + 1; + + /* + * Shut down the device drivers for good. We mark + * all removed devices correctly to avoid access + * the their PCI config any more. 
+ */ + if (frozen_bus) { + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + + pci_lock_rescan_remove(); + pcibios_remove_pci_devices(frozen_bus); + pci_unlock_rescan_remove(); + } +} + +static void eeh_handle_special_event(void) +{ + struct eeh_pe *pe, *phb_pe; + struct pci_bus *bus; + struct pci_controller *hose; + unsigned long flags; + int rc; + + + do { + rc = eeh_ops->next_error(&pe); + + switch (rc) { + case EEH_NEXT_ERR_DEAD_IOC: + /* Mark all PHBs in dead state */ + eeh_serialize_lock(&flags); + + /* Purge all events */ + eeh_remove_event(NULL, true); + + list_for_each_entry(hose, &hose_list, list_node) { + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe) continue; + + eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); + } + + eeh_serialize_unlock(flags); + + break; + case EEH_NEXT_ERR_FROZEN_PE: + case EEH_NEXT_ERR_FENCED_PHB: + case EEH_NEXT_ERR_DEAD_PHB: + /* Mark the PE in fenced state */ + eeh_serialize_lock(&flags); + + /* Purge all events of the PHB */ + eeh_remove_event(pe, true); + + if (rc == EEH_NEXT_ERR_DEAD_PHB) + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + else + eeh_pe_state_mark(pe, + EEH_PE_ISOLATED | EEH_PE_RECOVERING); + + eeh_serialize_unlock(flags); + + break; + case EEH_NEXT_ERR_NONE: + return; + default: + pr_warn("%s: Invalid value %d from next_error()\n", + __func__, rc); + return; + } + + /* + * For fenced PHB and frozen PE, it's handled as normal + * event. 
We have to remove the affected PHBs for dead + * PHB and IOC + */ + if (rc == EEH_NEXT_ERR_FROZEN_PE || + rc == EEH_NEXT_ERR_FENCED_PHB) { + eeh_handle_normal_event(pe); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + } else { + pci_lock_rescan_remove(); + list_for_each_entry(hose, &hose_list, list_node) { + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe || + !(phb_pe->state & EEH_PE_ISOLATED) || + (phb_pe->state & EEH_PE_RECOVERING)) + continue; + + /* + * Notify all devices to be down. Walk the PHB PE + * picked by this iteration, i.e. the one whose bus + * is removed below, not the PE from next_error(). + */ + bus = eeh_pe_bus_get(phb_pe); + eeh_pe_dev_traverse(phb_pe, + eeh_report_failure, NULL); + pcibios_remove_pci_devices(bus); + } + pci_unlock_rescan_remove(); + } + + /* + * If we have detected dead IOC, we needn't proceed + * any more since all PHBs would have been removed + */ + if (rc == EEH_NEXT_ERR_DEAD_IOC) + break; + } while (rc != EEH_NEXT_ERR_NONE); +} + +/** + * eeh_handle_event - Reset a PCI device after hard lockup. + * @pe: EEH PE + * + * While PHB detects address or data parity errors on particular PCI + * slot, the associated PE will be frozen. Besides, DMA's occurring + * to wild addresses (which usually happen due to bugs in device + * drivers or in PCI adapter firmware) can cause EEH error. #SERR, + * #PERR or other misc PCI-related errors also can trigger EEH errors. + * + * Recovery process consists of unplugging the device driver (which + * generated hotplug events to userspace), then issuing a PCI #RST to + * the device, then reconfiguring the PCI config space for all bridges + * & devices under this slot, and then finally restarting the device + * drivers (which cause a second set of hotplug events to go out to + * userspace).
+ */ +void eeh_handle_event(struct eeh_pe *pe) +{ + if (pe) + eeh_handle_normal_event(pe); + else + eeh_handle_special_event(); +} diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c new file mode 100644 index 00000000000..4eefb6e34db --- /dev/null +++ b/arch/powerpc/kernel/eeh_event.c @@ -0,0 +1,196 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2005 Linas Vepstas <linas@linas.org> + */ + +#include <linux/delay.h> +#include <linux/list.h> +#include <linux/sched.h> +#include <linux/semaphore.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/kthread.h> +#include <asm/eeh_event.h> +#include <asm/ppc-pci.h> + +/** Overview: + * EEH error states may be detected within exception handlers; + * however, the recovery processing needs to occur asynchronously + * in a normal kernel context and not an interrupt context. + * This pair of routines creates an event and queues it onto a + * work-queue, where a worker thread can drive recovery. + */ + +static DEFINE_SPINLOCK(eeh_eventlist_lock); +static struct semaphore eeh_eventlist_sem; +LIST_HEAD(eeh_eventlist); + +/** + * eeh_event_handler - Dispatch EEH events. 
+ * @dummy - unused + * + * The detection of a frozen slot can occur inside an interrupt, + * where it can be hard to do anything about it. The goal of this + * routine is to pull these detection events out of the context + * of the interrupt handler, and re-dispatch them for processing + * at a later time in a normal context. + */ +static int eeh_event_handler(void * dummy) +{ + unsigned long flags; + struct eeh_event *event; + struct eeh_pe *pe; + + while (!kthread_should_stop()) { + if (down_interruptible(&eeh_eventlist_sem)) + break; + + /* Fetch EEH event from the queue */ + spin_lock_irqsave(&eeh_eventlist_lock, flags); + event = NULL; + if (!list_empty(&eeh_eventlist)) { + event = list_entry(eeh_eventlist.next, + struct eeh_event, list); + list_del(&event->list); + } + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + if (!event) + continue; + + /* We might have event without binding PE */ + pe = event->pe; + if (pe) { + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + if (pe->type & EEH_PE_PHB) + pr_info("EEH: Detected error on PHB#%d\n", + pe->phb->global_number); + else + pr_info("EEH: Detected PCI bus error on " + "PHB#%d-PE#%x\n", + pe->phb->global_number, pe->addr); + eeh_handle_event(pe); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + } else { + eeh_handle_event(NULL); + } + + kfree(event); + } + + return 0; +} + +/** + * eeh_event_init - Start kernel thread to handle EEH events + * + * This routine is called to start the kernel thread for processing + * EEH event. 
+ */ +int eeh_event_init(void) +{ + struct task_struct *t; + int ret = 0; + + /* Initialize semaphore */ + sema_init(&eeh_eventlist_sem, 0); + + t = kthread_run(eeh_event_handler, NULL, "eehd"); + if (IS_ERR(t)) { + ret = PTR_ERR(t); + pr_err("%s: Failed to start EEH daemon (%d)\n", + __func__, ret); + return ret; + } + + return 0; +} + +/** + * eeh_send_failure_event - Generate a PCI error event + * @pe: EEH PE + * + * This routine can be called within an interrupt context; + * the actual event will be delivered in a normal context + * (from the "eehd" kernel thread). + */ +int eeh_send_failure_event(struct eeh_pe *pe) +{ + unsigned long flags; + struct eeh_event *event; + + event = kzalloc(sizeof(*event), GFP_ATOMIC); + if (!event) { + pr_err("EEH: out of memory, event not handled\n"); + return -ENOMEM; + } + event->pe = pe; + + /* We may or may not be called in an interrupt context */ + spin_lock_irqsave(&eeh_eventlist_lock, flags); + list_add(&event->list, &eeh_eventlist); + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + + /* Wake up the EEH daemon so that it can kick in */ + up(&eeh_eventlist_sem); + + return 0; +} + +/** + * eeh_remove_event - Remove EEH event from the queue + * @pe: Event binding to the PE + * @force: Event will be removed unconditionally + * + * On PowerNV platform, subsequent incoming events might be + * part of a former one. In that case, those subsequent + * events are duplicates and unnecessary, thus + * they should be removed. + */ +void eeh_remove_event(struct eeh_pe *pe, bool force) +{ + unsigned long flags; + struct eeh_event *event, *tmp; + + /* + * If we have NULL PE passed in, we have dead IOC + * or we're sure we can report all existing errors + * by the caller. + * + * Without "force", an event whose associated PE has + * already been isolated won't be removed, + * to avoid losing the event.
+ */ + spin_lock_irqsave(&eeh_eventlist_lock, flags); + list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) { + if (!force && event->pe && + (event->pe->state & EEH_PE_ISOLATED)) + continue; + + if (!pe) { + list_del(&event->list); + kfree(event); + } else if (pe->type & EEH_PE_PHB) { + if (event->pe && event->pe->phb == pe->phb) { + list_del(&event->list); + kfree(event); + } + } else if (event->pe == pe) { + list_del(&event->list); + kfree(event); + } + } + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); +} diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c new file mode 100644 index 00000000000..fbd01eba447 --- /dev/null +++ b/arch/powerpc/kernel/eeh_pe.c @@ -0,0 +1,887 @@ +/* + * The file intends to implement PE based on the information from + * platforms. Basically, there have 3 types of PEs: PHB/Bus/Device. + * All the PEs should be organized as hierarchy tree. The first level + * of the tree will be associated to existing PHBs since the particular + * PE is only meaningful in one PHB domain. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> + +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> + +static LIST_HEAD(eeh_phb_pe); + +/** + * eeh_pe_alloc - Allocate PE + * @phb: PCI controller + * @type: PE type + * + * Allocate PE instance dynamically. + */ +static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type) +{ + struct eeh_pe *pe; + + /* Allocate PHB PE */ + pe = kzalloc(sizeof(struct eeh_pe), GFP_KERNEL); + if (!pe) return NULL; + + /* Initialize PHB PE */ + pe->type = type; + pe->phb = phb; + INIT_LIST_HEAD(&pe->child_list); + INIT_LIST_HEAD(&pe->child); + INIT_LIST_HEAD(&pe->edevs); + + return pe; +} + +/** + * eeh_phb_pe_create - Create PHB PE + * @phb: PCI controller + * + * The function should be called while the PHB is detected during + * system boot or PCI hotplug in order to create PHB PE. + */ +int eeh_phb_pe_create(struct pci_controller *phb) +{ + struct eeh_pe *pe; + + /* Allocate PHB PE */ + pe = eeh_pe_alloc(phb, EEH_PE_PHB); + if (!pe) { + pr_err("%s: out of memory!\n", __func__); + return -ENOMEM; + } + + /* Put it into the list */ + list_add_tail(&pe->child, &eeh_phb_pe); + + pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number); + + return 0; +} + +/** + * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB + * @phb: PCI controller + * + * The overall PEs form hierarchy tree. The first layer of the + * hierarchy tree is composed of PHB PEs. The function is used + * to retrieve the corresponding PHB PE according to the given PHB. 
+ */ +struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb) +{ + struct eeh_pe *pe; + + list_for_each_entry(pe, &eeh_phb_pe, child) { + /* + * Actually, we needn't check the type since + * the PE for PHB has been determined when that + * was created. + */ + if ((pe->type & EEH_PE_PHB) && pe->phb == phb) + return pe; + } + + return NULL; +} + +/** + * eeh_pe_next - Retrieve the next PE in the tree + * @pe: current PE + * @root: root PE + * + * The function is used to retrieve the next PE in the + * hierarchy PE tree. + */ +static struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, + struct eeh_pe *root) +{ + struct list_head *next = pe->child_list.next; + + if (next == &pe->child_list) { + while (1) { + if (pe == root) + return NULL; + next = pe->child.next; + if (next != &pe->parent->child_list) + break; + pe = pe->parent; + } + } + + return list_entry(next, struct eeh_pe, child); +} + +/** + * eeh_pe_traverse - Traverse PEs in the specified PHB + * @root: root PE + * @fn: callback + * @flag: extra parameter to callback + * + * The function is used to traverse the specified PE and its + * child PEs. The traversing is to be terminated once the + * callback returns something other than NULL, or no more PEs + * to be traversed. + */ +void *eeh_pe_traverse(struct eeh_pe *root, + eeh_traverse_func fn, void *flag) +{ + struct eeh_pe *pe; + void *ret; + + for (pe = root; pe; pe = eeh_pe_next(pe, root)) { + ret = fn(pe, flag); + if (ret) return ret; + } + + return NULL; +} + +/** + * eeh_pe_dev_traverse - Traverse the devices from the PE + * @root: EEH PE + * @fn: function callback + * @flag: extra parameter to callback + * + * The function is used to traverse the devices of the specified + * PE and its child PEs. 
+ */ +void *eeh_pe_dev_traverse(struct eeh_pe *root, + eeh_traverse_func fn, void *flag) +{ + struct eeh_pe *pe; + struct eeh_dev *edev, *tmp; + void *ret; + + if (!root) { + pr_warning("%s: Invalid PE %p\n", __func__, root); + return NULL; + } + + /* Traverse root PE */ + for (pe = root; pe; pe = eeh_pe_next(pe, root)) { + eeh_pe_for_each_dev(pe, edev, tmp) { + ret = fn(edev, flag); + if (ret) + return ret; + } + } + + return NULL; +} + +/** + * __eeh_pe_get - Check the PE address + * @data: EEH PE + * @flag: EEH device + * + * For one particular PE, it can be identified by PE address + * or tranditional BDF address. BDF address is composed of + * Bus/Device/Function number. The extra data referred by flag + * indicates which type of address should be used. + */ +static void *__eeh_pe_get(void *data, void *flag) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + struct eeh_dev *edev = (struct eeh_dev *)flag; + + /* Unexpected PHB PE */ + if (pe->type & EEH_PE_PHB) + return NULL; + + /* We prefer PE address */ + if (edev->pe_config_addr && + (edev->pe_config_addr == pe->addr)) + return pe; + + /* Try BDF address */ + if (edev->config_addr && + (edev->config_addr == pe->config_addr)) + return pe; + + return NULL; +} + +/** + * eeh_pe_get - Search PE based on the given address + * @edev: EEH device + * + * Search the corresponding PE based on the specified address which + * is included in the eeh device. The function is used to check if + * the associated PE has been created against the PE address. It's + * notable that the PE address has 2 format: traditional PE address + * which is composed of PCI bus/device/function number, or unified + * PE address. 
+ */ +struct eeh_pe *eeh_pe_get(struct eeh_dev *edev) +{ + struct eeh_pe *root = eeh_phb_pe_get(edev->phb); + struct eeh_pe *pe; + + pe = eeh_pe_traverse(root, __eeh_pe_get, edev); + + return pe; +} + +/** + * eeh_pe_get_parent - Retrieve the parent PE + * @edev: EEH device + * + * The whole PEs existing in the system are organized as hierarchy + * tree. The function is used to retrieve the parent PE according + * to the parent EEH device. + */ +static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev) +{ + struct device_node *dn; + struct eeh_dev *parent; + + /* + * It might have the case for the indirect parent + * EEH device already having associated PE, but + * the direct parent EEH device doesn't have yet. + */ + dn = edev->dn->parent; + while (dn) { + /* We're poking out of PCI territory */ + if (!PCI_DN(dn)) return NULL; + + parent = of_node_to_eeh_dev(dn); + /* We're poking out of PCI territory */ + if (!parent) return NULL; + + if (parent->pe) + return parent->pe; + + dn = dn->parent; + } + + return NULL; +} + +/** + * eeh_add_to_parent_pe - Add EEH device to parent PE + * @edev: EEH device + * + * Add EEH device to the parent PE. If the parent PE already + * exists, the PE type will be changed to EEH_PE_BUS. Otherwise, + * we have to create new PE to hold the EEH device and the new + * PE will be linked to its parent PE as well. + */ +int eeh_add_to_parent_pe(struct eeh_dev *edev) +{ + struct eeh_pe *pe, *parent; + + /* + * Search the PE has been existing or not according + * to the PE address. If that has been existing, the + * PE should be composed of PCI bus and its subordinate + * components. 
+ */ + pe = eeh_pe_get(edev); + if (pe && !(pe->type & EEH_PE_INVALID)) { + if (!edev->pe_config_addr) { + pr_err("%s: PE with addr 0x%x already exists\n", + __func__, edev->config_addr); + return -EEXIST; + } + + /* Mark the PE as type of PCI bus */ + pe->type = EEH_PE_BUS; + edev->pe = pe; + + /* Put the edev to PE */ + list_add_tail(&edev->list, &pe->edevs); + pr_debug("EEH: Add %s to Bus PE#%x\n", + edev->dn->full_name, pe->addr); + + return 0; + } else if (pe && (pe->type & EEH_PE_INVALID)) { + list_add_tail(&edev->list, &pe->edevs); + edev->pe = pe; + /* + * We're running to here because of PCI hotplug caused by + * EEH recovery. We need clear EEH_PE_INVALID until the top. + */ + parent = pe; + while (parent) { + if (!(parent->type & EEH_PE_INVALID)) + break; + parent->type &= ~(EEH_PE_INVALID | EEH_PE_KEEP); + parent = parent->parent; + } + pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n", + edev->dn->full_name, pe->addr, pe->parent->addr); + + return 0; + } + + /* Create a new EEH PE */ + pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE); + if (!pe) { + pr_err("%s: out of memory!\n", __func__); + return -ENOMEM; + } + pe->addr = edev->pe_config_addr; + pe->config_addr = edev->config_addr; + + /* + * While doing PE reset, we probably hot-reset the + * upstream bridge. However, the PCI devices including + * the associated EEH devices might be removed when EEH + * core is doing recovery. So that won't safe to retrieve + * the bridge through downstream EEH device. We have to + * trace the parent PCI bus, then the upstream bridge. + */ + if (eeh_probe_mode_dev()) + pe->bus = eeh_dev_to_pci_dev(edev)->bus; + + /* + * Put the new EEH PE into hierarchy tree. If the parent + * can't be found, the newly created PE will be attached + * to PHB directly. Otherwise, we have to associate the + * PE with its parent. 
+ */ + parent = eeh_pe_get_parent(edev); + if (!parent) { + parent = eeh_phb_pe_get(edev->phb); + if (!parent) { + pr_err("%s: No PHB PE is found (PHB Domain=%d)\n", + __func__, edev->phb->global_number); + edev->pe = NULL; + kfree(pe); + return -EEXIST; + } + } + pe->parent = parent; + + /* + * Put the newly created PE into the child list and + * link the EEH device accordingly. + */ + list_add_tail(&pe->child, &parent->child_list); + list_add_tail(&edev->list, &pe->edevs); + edev->pe = pe; + pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n", + edev->dn->full_name, pe->addr, pe->parent->addr); + + return 0; +} + +/** + * eeh_rmv_from_parent_pe - Remove one EEH device from the associated PE + * @edev: EEH device + * + * The PE hierarchy tree might be changed when doing PCI hotplug. + * Also, the PCI devices or buses could be removed from the system + * during EEH recovery. So we have to call the function remove the + * corresponding PE accordingly if necessary. + */ +int eeh_rmv_from_parent_pe(struct eeh_dev *edev) +{ + struct eeh_pe *pe, *parent, *child; + int cnt; + + if (!edev->pe) { + pr_debug("%s: No PE found for EEH device %s\n", + __func__, edev->dn->full_name); + return -EEXIST; + } + + /* Remove the EEH device */ + pe = edev->pe; + edev->pe = NULL; + list_del(&edev->list); + + /* + * Check if the parent PE includes any EEH devices. + * If not, we should delete that. Also, we should + * delete the parent PE if it doesn't have associated + * child PEs and EEH devices. 
+ */ + while (1) { + parent = pe->parent; + if (pe->type & EEH_PE_PHB) + break; + + if (!(pe->state & EEH_PE_KEEP)) { + if (list_empty(&pe->edevs) && + list_empty(&pe->child_list)) { + list_del(&pe->child); + kfree(pe); + } else { + break; + } + } else { + if (list_empty(&pe->edevs)) { + cnt = 0; + list_for_each_entry(child, &pe->child_list, child) { + if (!(child->type & EEH_PE_INVALID)) { + cnt++; + break; + } + } + + if (!cnt) + pe->type |= EEH_PE_INVALID; + else + break; + } + } + + pe = parent; + } + + return 0; +} + +/** + * eeh_pe_update_time_stamp - Update PE's frozen time stamp + * @pe: EEH PE + * + * We have time stamp for each PE to trace its time of getting + * frozen in last hour. The function should be called to update + * the time stamp on first error of the specific PE. On the other + * handle, we needn't account for errors happened in last hour. + */ +void eeh_pe_update_time_stamp(struct eeh_pe *pe) +{ + struct timeval tstamp; + + if (!pe) return; + + if (pe->freeze_count <= 0) { + pe->freeze_count = 0; + do_gettimeofday(&pe->tstamp); + } else { + do_gettimeofday(&tstamp); + if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) { + pe->tstamp = tstamp; + pe->freeze_count = 0; + } + } +} + +/** + * __eeh_pe_state_mark - Mark the state for the PE + * @data: EEH PE + * @flag: state + * + * The function is used to mark the indicated state for the given + * PE. Also, the associated PCI devices will be put into IO frozen + * state as well. 
+ */ +static void *__eeh_pe_state_mark(void *data, void *flag) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + int state = *((int *)flag); + struct eeh_dev *edev, *tmp; + struct pci_dev *pdev; + + /* Keep the state of permanently removed PE intact */ + if ((pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) && + (state & (EEH_PE_ISOLATED | EEH_PE_RECOVERING))) + return NULL; + + pe->state |= state; + + /* Offline PCI devices if applicable */ + if (state != EEH_PE_ISOLATED) + return NULL; + + eeh_pe_for_each_dev(pe, edev, tmp) { + pdev = eeh_dev_to_pci_dev(edev); + if (pdev) + pdev->error_state = pci_channel_io_frozen; + } + + return NULL; +} + +/** + * eeh_pe_state_mark - Mark specified state for PE and its associated device + * @pe: EEH PE + * + * EEH error affects the current PE and its child PEs. The function + * is used to mark appropriate state for the affected PEs and the + * associated devices. + */ +void eeh_pe_state_mark(struct eeh_pe *pe, int state) +{ + eeh_pe_traverse(pe, __eeh_pe_state_mark, &state); +} + +static void *__eeh_pe_dev_mode_mark(void *data, void *flag) +{ + struct eeh_dev *edev = data; + int mode = *((int *)flag); + + edev->mode |= mode; + + return NULL; +} + +/** + * eeh_pe_dev_state_mark - Mark state for all device under the PE + * @pe: EEH PE + * + * Mark specific state for all child devices of the PE. + */ +void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode) +{ + eeh_pe_dev_traverse(pe, __eeh_pe_dev_mode_mark, &mode); +} + +/** + * __eeh_pe_state_clear - Clear state for the PE + * @data: EEH PE + * @flag: state + * + * The function is used to clear the indicated state from the + * given PE. Besides, we also clear the check count of the PE + * as well. 
+ */ +static void *__eeh_pe_state_clear(void *data, void *flag) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + int state = *((int *)flag); + + /* Keep the state of permanently removed PE intact */ + if ((pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) && + (state & EEH_PE_ISOLATED)) + return NULL; + + pe->state &= ~state; + + /* Clear check count since last isolation */ + if (state & EEH_PE_ISOLATED) + pe->check_count = 0; + + return NULL; +} + +/** + * eeh_pe_state_clear - Clear state for the PE and its children + * @pe: PE + * @state: state to be cleared + * + * When the PE and its children has been recovered from error, + * we need clear the error state for that. The function is used + * for the purpose. + */ +void eeh_pe_state_clear(struct eeh_pe *pe, int state) +{ + eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); +} + +/* + * Some PCI bridges (e.g. PLX bridges) have primary/secondary + * buses assigned explicitly by firmware, and we probably have + * lost that after reset. So we have to delay the check until + * the PCI-CFG registers have been restored for the parent + * bridge. + * + * Don't use normal PCI-CFG accessors, which probably has been + * blocked on normal path during the stage. So we need utilize + * eeh operations, which is always permitted. 
+ */ +static void eeh_bridge_check_link(struct eeh_dev *edev, + struct device_node *dn) +{ + int cap; + uint32_t val; + int timeout = 0; + + /* + * We only check root port and downstream ports of + * PCIe switches + */ + if (!(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT))) + return; + + pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n", + __func__, edev->phb->global_number, + edev->config_addr >> 8, + PCI_SLOT(edev->config_addr & 0xFF), + PCI_FUNC(edev->config_addr & 0xFF)); + + /* Check slot status */ + cap = edev->pcie_cap; + eeh_ops->read_config(dn, cap + PCI_EXP_SLTSTA, 2, &val); + if (!(val & PCI_EXP_SLTSTA_PDS)) { + pr_debug(" No card in the slot (0x%04x) !\n", val); + return; + } + + /* Check power status if we have the capability */ + eeh_ops->read_config(dn, cap + PCI_EXP_SLTCAP, 2, &val); + if (val & PCI_EXP_SLTCAP_PCP) { + eeh_ops->read_config(dn, cap + PCI_EXP_SLTCTL, 2, &val); + if (val & PCI_EXP_SLTCTL_PCC) { + pr_debug(" In power-off state, power it on ...\n"); + val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC); + val |= (0x0100 & PCI_EXP_SLTCTL_PIC); + eeh_ops->write_config(dn, cap + PCI_EXP_SLTCTL, 2, val); + msleep(2 * 1000); + } + } + + /* Enable link */ + eeh_ops->read_config(dn, cap + PCI_EXP_LNKCTL, 2, &val); + val &= ~PCI_EXP_LNKCTL_LD; + eeh_ops->write_config(dn, cap + PCI_EXP_LNKCTL, 2, val); + + /* Check link */ + eeh_ops->read_config(dn, cap + PCI_EXP_LNKCAP, 4, &val); + if (!(val & PCI_EXP_LNKCAP_DLLLARC)) { + pr_debug(" No link reporting capability (0x%08x) \n", val); + msleep(1000); + return; + } + + /* Wait the link is up until timeout (5s) */ + timeout = 0; + while (timeout < 5000) { + msleep(20); + timeout += 20; + + eeh_ops->read_config(dn, cap + PCI_EXP_LNKSTA, 2, &val); + if (val & PCI_EXP_LNKSTA_DLLLA) + break; + } + + if (val & PCI_EXP_LNKSTA_DLLLA) + pr_debug(" Link up (%s)\n", + (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? 
"2.5GB" : "5GB"); + else + pr_debug(" Link not ready (0x%04x)\n", val); +} + +#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) +#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)]) + +static void eeh_restore_bridge_bars(struct eeh_dev *edev, + struct device_node *dn) +{ + int i; + + /* + * Device BARs: 0x10 - 0x18 + * Bus numbers and windows: 0x18 - 0x30 + */ + for (i = 4; i < 13; i++) + eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]); + /* Rom: 0x38 */ + eeh_ops->write_config(dn, 14*4, 4, edev->config_space[14]); + + /* Cache line & Latency timer: 0xC 0xD */ + eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1, + SAVED_BYTE(PCI_CACHE_LINE_SIZE)); + eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1, + SAVED_BYTE(PCI_LATENCY_TIMER)); + /* Max latency, min grant, interrupt ping and line: 0x3C */ + eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]); + + /* PCI Command: 0x4 */ + eeh_ops->write_config(dn, PCI_COMMAND, 4, edev->config_space[1]); + + /* Check the PCIe link is ready */ + eeh_bridge_check_link(edev, dn); +} + +static void eeh_restore_device_bars(struct eeh_dev *edev, + struct device_node *dn) +{ + int i; + u32 cmd; + + for (i = 4; i < 10; i++) + eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]); + /* 12 == Expansion ROM Address */ + eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]); + + eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1, + SAVED_BYTE(PCI_CACHE_LINE_SIZE)); + eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1, + SAVED_BYTE(PCI_LATENCY_TIMER)); + + /* max latency, min grant, interrupt pin and line */ + eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]); + + /* + * Restore PERR & SERR bits, some devices require it, + * don't touch the other command bits + */ + eeh_ops->read_config(dn, PCI_COMMAND, 4, &cmd); + if (edev->config_space[1] & PCI_COMMAND_PARITY) + cmd |= PCI_COMMAND_PARITY; + else + cmd &= ~PCI_COMMAND_PARITY; + if (edev->config_space[1] & PCI_COMMAND_SERR) + cmd |= PCI_COMMAND_SERR; + 
else + cmd &= ~PCI_COMMAND_SERR; + eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd); +} + +/** + * eeh_restore_one_device_bars - Restore the Base Address Registers for one device + * @data: EEH device + * @flag: Unused + * + * Loads the PCI configuration space base address registers, + * the expansion ROM base address, the latency timer, and etc. + * from the saved values in the device node. + */ +static void *eeh_restore_one_device_bars(void *data, void *flag) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct device_node *dn = eeh_dev_to_of_node(edev); + + /* Do special restore for bridges */ + if (edev->mode & EEH_DEV_BRIDGE) + eeh_restore_bridge_bars(edev, dn); + else + eeh_restore_device_bars(edev, dn); + + if (eeh_ops->restore_config) + eeh_ops->restore_config(dn); + + return NULL; +} + +/** + * eeh_pe_restore_bars - Restore the PCI config space info + * @pe: EEH PE + * + * This routine performs a recursive walk to the children + * of this device as well. + */ +void eeh_pe_restore_bars(struct eeh_pe *pe) +{ + /* + * We needn't take the EEH lock since eeh_pe_dev_traverse() + * will take that. + */ + eeh_pe_dev_traverse(pe, eeh_restore_one_device_bars, NULL); +} + +/** + * eeh_pe_loc_get - Retrieve location code binding to the given PE + * @pe: EEH PE + * + * Retrieve the location code of the given PE. If the primary PE bus + * is root bus, we will grab location code from PHB device tree node + * or root port. Otherwise, the upstream bridge's device tree node + * of the primary PE bus will be checked for the location code. + */ +const char *eeh_pe_loc_get(struct eeh_pe *pe) +{ + struct pci_controller *hose; + struct pci_bus *bus = eeh_pe_bus_get(pe); + struct pci_dev *pdev; + struct device_node *dn; + const char *loc; + + if (!bus) + return "N/A"; + + /* PHB PE or root PE ? 
*/ + if (pci_is_root_bus(bus)) { + hose = pci_bus_to_host(bus); + loc = of_get_property(hose->dn, + "ibm,loc-code", NULL); + if (loc) + return loc; + loc = of_get_property(hose->dn, + "ibm,io-base-loc-code", NULL); + if (loc) + return loc; + + pdev = pci_get_slot(bus, 0x0); + } else { + pdev = bus->self; + } + + if (!pdev) { + loc = "N/A"; + goto out; + } + + dn = pci_device_to_OF_node(pdev); + if (!dn) { + loc = "N/A"; + goto out; + } + + loc = of_get_property(dn, "ibm,loc-code", NULL); + if (!loc) + loc = of_get_property(dn, "ibm,slot-location-code", NULL); + if (!loc) + loc = "N/A"; + +out: + if (pci_is_root_bus(bus) && pdev) + pci_dev_put(pdev); + return loc; +} + +/** + * eeh_pe_bus_get - Retrieve PCI bus according to the given PE + * @pe: EEH PE + * + * Retrieve the PCI bus according to the given PE. Basically, + * there're 3 types of PEs: PHB/Bus/Device. For PHB PE, the + * primary PCI bus will be retrieved. The parent bus will be + * returned for BUS PE. However, we don't have associated PCI + * bus for DEVICE PE. + */ +struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) +{ + struct pci_bus *bus = NULL; + struct eeh_dev *edev; + struct pci_dev *pdev; + + if (pe->type & EEH_PE_PHB) { + bus = pe->phb->bus; + } else if (pe->type & EEH_PE_BUS || + pe->type & EEH_PE_DEVICE) { + if (pe->bus) { + bus = pe->bus; + goto out; + } + + edev = list_first_entry(&pe->edevs, struct eeh_dev, list); + pdev = eeh_dev_to_pci_dev(edev); + if (pdev) + bus = pdev->bus; + } + +out: + return bus; +} diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c new file mode 100644 index 00000000000..e2595ba4b72 --- /dev/null +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -0,0 +1,98 @@ +/* + * Sysfs entries for PCI Error Recovery for PAPR-compliant platform. + * Copyright IBM Corporation 2007 + * Copyright Linas Vepstas <linas@austin.ibm.com> 2007 + * + * All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> + */ +#include <linux/pci.h> +#include <linux/stat.h> +#include <asm/ppc-pci.h> +#include <asm/pci-bridge.h> + +/** + * EEH_SHOW_ATTR -- Create sysfs entry for eeh statistic + * @_name: name of file in sysfs directory + * @_memb: name of member in struct pci_dn to access + * @_format: printf format for display + * + * All of the attributes look very similar, so just + * auto-gen a cut-n-paste routine to display them. 
+ */ +#define EEH_SHOW_ATTR(_name,_memb,_format) \ +static ssize_t eeh_show_##_name(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct pci_dev *pdev = to_pci_dev(dev); \ + struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); \ + \ + if (!edev) \ + return 0; \ + \ + return sprintf(buf, _format "\n", edev->_memb); \ +} \ +static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL); + +EEH_SHOW_ATTR(eeh_mode, mode, "0x%x"); +EEH_SHOW_ATTR(eeh_config_addr, config_addr, "0x%x"); +EEH_SHOW_ATTR(eeh_pe_config_addr, pe_config_addr, "0x%x"); + +void eeh_sysfs_add_device(struct pci_dev *pdev) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); + int rc=0; + + if (!eeh_enabled()) + return; + + if (edev && (edev->mode & EEH_DEV_SYSFS)) + return; + + rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode); + rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr); + rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); + + if (rc) + printk(KERN_WARNING "EEH: Unable to create sysfs entries\n"); + else if (edev) + edev->mode |= EEH_DEV_SYSFS; +} + +void eeh_sysfs_remove_device(struct pci_dev *pdev) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); + + /* + * The parent directory might have been removed. We needn't + * continue for that case. 
+ */ + if (!pdev->dev.kobj.sd) { + if (edev) + edev->mode &= ~EEH_DEV_SYSFS; + return; + } + + device_remove_file(&pdev->dev, &dev_attr_eeh_mode); + device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr); + device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); + + if (edev) + edev->mode &= ~EEH_DEV_SYSFS; +} diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 2e99ae41723..22b45a4955c 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -19,7 +19,6 @@ * */ -#include <linux/config.h> #include <linux/errno.h> #include <linux/sys.h> #include <linux/threads.h> @@ -31,6 +30,8 @@ #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> #include <asm/unistd.h> +#include <asm/ftrace.h> +#include <asm/ptrace.h> #undef SHOW_SYSCALLS #undef SHOW_SYSCALLS_TASK @@ -45,29 +46,58 @@ #endif #ifdef CONFIG_BOOKE -#include "head_booke.h" -#define TRANSFER_TO_HANDLER_EXC_LEVEL(exc_level) \ - mtspr exc_level##_SPRG,r8; \ - BOOKE_LOAD_EXC_LEVEL_STACK(exc_level); \ - lwz r0,GPR10-INT_FRAME_SIZE(r8); \ - stw r0,GPR10(r11); \ - lwz r0,GPR11-INT_FRAME_SIZE(r8); \ - stw r0,GPR11(r11); \ - mfspr r8,exc_level##_SPRG - .globl mcheck_transfer_to_handler mcheck_transfer_to_handler: - TRANSFER_TO_HANDLER_EXC_LEVEL(MCHECK) - b transfer_to_handler_full + mfspr r0,SPRN_DSRR0 + stw r0,_DSRR0(r11) + mfspr r0,SPRN_DSRR1 + stw r0,_DSRR1(r11) + /* fall through */ .globl debug_transfer_to_handler debug_transfer_to_handler: - TRANSFER_TO_HANDLER_EXC_LEVEL(DEBUG) - b transfer_to_handler_full + mfspr r0,SPRN_CSRR0 + stw r0,_CSRR0(r11) + mfspr r0,SPRN_CSRR1 + stw r0,_CSRR1(r11) + /* fall through */ .globl crit_transfer_to_handler crit_transfer_to_handler: - TRANSFER_TO_HANDLER_EXC_LEVEL(CRIT) +#ifdef CONFIG_PPC_BOOK3E_MMU + mfspr r0,SPRN_MAS0 + stw r0,MAS0(r11) + mfspr r0,SPRN_MAS1 + stw r0,MAS1(r11) + mfspr r0,SPRN_MAS2 + stw r0,MAS2(r11) + mfspr r0,SPRN_MAS3 + stw r0,MAS3(r11) + mfspr r0,SPRN_MAS6 + stw r0,MAS6(r11) +#ifdef 
CONFIG_PHYS_64BIT + mfspr r0,SPRN_MAS7 + stw r0,MAS7(r11) +#endif /* CONFIG_PHYS_64BIT */ +#endif /* CONFIG_PPC_BOOK3E_MMU */ +#ifdef CONFIG_44x + mfspr r0,SPRN_MMUCR + stw r0,MMUCR(r11) +#endif + mfspr r0,SPRN_SRR0 + stw r0,_SRR0(r11) + mfspr r0,SPRN_SRR1 + stw r0,_SRR1(r11) + + /* set the stack limit to the current stack + * and set the limit to protect the thread_info + * struct + */ + mfspr r8,SPRN_SPRG_THREAD + lwz r0,KSP_LIMIT(r8) + stw r0,SAVED_KSP_LIMIT(r11) + rlwimi r0,r1,0,0,(31-THREAD_SHIFT) + stw r0,KSP_LIMIT(r8) /* fall through */ #endif @@ -78,6 +108,20 @@ crit_transfer_to_handler: stw r0,GPR10(r11) lwz r0,crit_r11@l(0) stw r0,GPR11(r11) + mfspr r0,SPRN_SRR0 + stw r0,crit_srr0@l(0) + mfspr r0,SPRN_SRR1 + stw r0,crit_srr1@l(0) + + /* set the stack limit to the current stack + * and set the limit to protect the thread_info + * struct + */ + mfspr r8,SPRN_SPRG_THREAD + lwz r0,KSP_LIMIT(r8) + stw r0,saved_ksp_limit@l(0) + rlwimi r0,r1,0,0,(31-THREAD_SHIFT) + stw r0,KSP_LIMIT(r8) /* fall through */ #endif @@ -103,7 +147,7 @@ transfer_to_handler: mfspr r2,SPRN_XER stw r12,_CTR(r11) stw r2,_XER(r11) - mfspr r12,SPRN_SPRG3 + mfspr r12,SPRN_SPRG_THREAD addi r2,r12,-THREAD tovirt(r2,r2) /* set r2 to current */ beq 2f /* if from user, fix up THREAD.regs */ @@ -111,9 +155,9 @@ transfer_to_handler: stw r11,PT_REGS(r12) #if defined(CONFIG_40x) || defined(CONFIG_BOOKE) /* Check to see if the dbcr0 register is set up to debug. Use the - single-step bit to do this. */ + internal debug mode bit to do this. */ lwz r12,THREAD_DBCR0(r12) - andis. r12,r12,DBCR0_IC@h + andis. 
r12,r12,DBCR0_IDM@h beq+ 3f /* From user and task is ptraced - load up global dbcr0 */ li r12,-1 /* clear all pending debug events */ @@ -121,6 +165,12 @@ transfer_to_handler: lis r11,global_dbcr0@ha tophys(r11,r11) addi r11,r11,global_dbcr0@l +#ifdef CONFIG_SMP + CURRENT_THREAD_INFO(r9, r1) + lwz r9,TI_CPU(r9) + slwi r9,r9,3 + add r11,r11,r9 +#endif lwz r12,0(r11) mtspr SPRN_DBCR0,r12 lwz r12,4(r11) @@ -128,34 +178,96 @@ transfer_to_handler: stw r12,4(r11) #endif b 3f + 2: /* if from kernel, check interrupted DOZE/NAP mode and * check for stack overflow */ -#ifdef CONFIG_6xx - mfspr r11,SPRN_HID0 - mtcr r11 -BEGIN_FTR_SECTION - bt- 8,power_save_6xx_restore /* Check DOZE */ -END_FTR_SECTION_IFSET(CPU_FTR_CAN_DOZE) -BEGIN_FTR_SECTION - bt- 9,power_save_6xx_restore /* Check NAP */ -END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) -#endif /* CONFIG_6xx */ + lwz r9,KSP_LIMIT(r12) + cmplw r1,r9 /* if r1 <= ksp_limit */ + ble- stack_ovf /* then the kernel stack overflowed */ +5: +#if defined(CONFIG_6xx) || defined(CONFIG_E500) + CURRENT_THREAD_INFO(r9, r1) + tophys(r9,r9) /* check local flags */ + lwz r12,TI_LOCAL_FLAGS(r9) + mtcrf 0x01,r12 + bt- 31-TLF_NAPPING,4f + bt- 31-TLF_SLEEPING,7f +#endif /* CONFIG_6xx || CONFIG_E500 */ .globl transfer_to_handler_cont transfer_to_handler_cont: - lwz r11,THREAD_INFO-THREAD(r12) - cmplw r1,r11 /* if r1 <= current->thread_info */ - ble- stack_ovf /* then the kernel stack overflowed */ 3: mflr r9 lwz r11,0(r9) /* virtual address of handler */ lwz r9,4(r9) /* where to go when done */ - FIX_SRR1(r10,r12) +#ifdef CONFIG_TRACE_IRQFLAGS + lis r12,reenable_mmu@h + ori r12,r12,reenable_mmu@l + mtspr SPRN_SRR0,r12 + mtspr SPRN_SRR1,r10 + SYNC + RFI +reenable_mmu: /* re-enable mmu so we can */ + mfmsr r10 + lwz r12,_MSR(r1) + xor r10,r10,r12 + andi. r10,r10,MSR_EE /* Did EE change? */ + beq 1f + + /* + * The trace_hardirqs_off will use CALLER_ADDR0 and CALLER_ADDR1. 
+ * If from user mode there is only one stack frame on the stack, and + * accessing CALLER_ADDR1 will cause oops. So we need create a dummy + * stack frame to make trace_hardirqs_off happy. + * + * This is handy because we also need to save a bunch of GPRs, + * r3 can be different from GPR3(r1) at this point, r9 and r11 + * contains the old MSR and handler address respectively, + * r4 & r5 can contain page fault arguments that need to be passed + * along as well. r12, CCR, CTR, XER etc... are left clobbered as + * they aren't useful past this point (aren't syscall arguments), + * the rest is restored from the exception frame. + */ + stwu r1,-32(r1) + stw r9,8(r1) + stw r11,12(r1) + stw r3,16(r1) + stw r4,20(r1) + stw r5,24(r1) + bl trace_hardirqs_off + lwz r5,24(r1) + lwz r4,20(r1) + lwz r3,16(r1) + lwz r11,12(r1) + lwz r9,8(r1) + addi r1,r1,32 + lwz r0,GPR0(r1) + lwz r6,GPR6(r1) + lwz r7,GPR7(r1) + lwz r8,GPR8(r1) +1: mtctr r11 + mtlr r9 + bctr /* jump to handler */ +#else /* CONFIG_TRACE_IRQFLAGS */ mtspr SPRN_SRR0,r11 mtspr SPRN_SRR1,r10 mtlr r9 SYNC RFI /* jump to handler, enable MMU */ +#endif /* CONFIG_TRACE_IRQFLAGS */ + +#if defined (CONFIG_6xx) || defined(CONFIG_E500) +4: rlwinm r12,r12,0,~_TLF_NAPPING + stw r12,TI_LOCAL_FLAGS(r9) + b power_save_ppc32_restore + +7: rlwinm r12,r12,0,~_TLF_SLEEPING + stw r12,TI_LOCAL_FLAGS(r9) + lwz r9,_MSR(r11) /* if sleeping, clear MSR.EE */ + rlwinm r9,r9,0,~MSR_EE + lwz r12,_LINK(r11) /* and return to address in LR */ + b fast_exception_return +#endif /* * On kernel stack overflow, load up an initial stack pointer @@ -163,10 +275,10 @@ transfer_to_handler_cont: */ stack_ovf: /* sometimes we use a statically-allocated stack, which is OK. 
*/ - lis r11,_end@h - ori r11,r11,_end@l - cmplw r1,r11 - ble 3b /* r1 <= &_end is OK */ + lis r12,_end@h + ori r12,r12,_end@l + cmplw r1,r12 + ble 5b /* r1 <= &_end is OK */ SAVE_NVGPRS(r11) addi r3,r1,STACK_FRAME_OVERHEAD lis r1,init_thread_union@ha @@ -189,7 +301,6 @@ stack_ovf: 0: _GLOBAL(DoSyscall) - stw r0,THREAD+LAST_SYSCALL(r2) stw r3,ORIG_GPR3(r1) li r12,0 stw r12,RESULT(r1) @@ -199,9 +310,32 @@ _GLOBAL(DoSyscall) #ifdef SHOW_SYSCALLS bl do_show_syscall #endif /* SHOW_SYSCALLS */ - rlwinm r10,r1,0,0,(31-THREAD_SHIFT) /* current_thread_info() */ - li r11,0 - stb r11,TI_SC_NOERR(r10) +#ifdef CONFIG_TRACE_IRQFLAGS + /* Return from syscalls can (and generally will) hard enable + * interrupts. You aren't supposed to call a syscall with + * interrupts disabled in the first place. However, to ensure + * that we get it right vs. lockdep if it happens, we force + * that hard enable here with appropriate tracing if we see + * that we have been called with interrupts off + */ + mfmsr r11 + andi. r12,r11,MSR_EE + bne+ 1f + /* We came in with interrupts disabled, we enable them now */ + bl trace_hardirqs_on + mfmsr r11 + lwz r0,GPR0(r1) + lwz r3,GPR3(r1) + lwz r4,GPR4(r1) + ori r11,r11,MSR_EE + lwz r5,GPR5(r1) + lwz r6,GPR6(r1) + lwz r7,GPR7(r1) + lwz r8,GPR8(r1) + mtmsr r11 +1: +#endif /* CONFIG_TRACE_IRQFLAGS */ + CURRENT_THREAD_INFO(r10, r1) lwz r11,TI_FLAGS(r10) andi. 
r11,r11,_TIF_SYSCALL_T_OR_A bne- syscall_dotrace @@ -222,40 +356,61 @@ ret_from_syscall: bl do_show_syscall_exit #endif mr r6,r3 - li r11,-_LAST_ERRNO - cmplw 0,r3,r11 - rlwinm r12,r1,0,0,(31-THREAD_SHIFT) /* current_thread_info() */ - blt+ 30f - lbz r11,TI_SC_NOERR(r12) - cmpwi r11,0 - bne 30f - neg r3,r3 - lwz r10,_CCR(r1) /* Set SO bit in CR */ - oris r10,r10,0x1000 - stw r10,_CCR(r1) - + CURRENT_THREAD_INFO(r12, r1) /* disable interrupts so current_thread_info()->flags can't change */ -30: LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* doesn't include MSR_EE */ + LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* doesn't include MSR_EE */ + /* Note: We don't bother telling lockdep about it */ SYNC MTMSRD(r10) lwz r9,TI_FLAGS(r12) - andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SIGPENDING|_TIF_NEED_RESCHED) + li r8,-_LAST_ERRNO + andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) bne- syscall_exit_work + cmplw 0,r3,r8 + blt+ syscall_exit_cont + lwz r11,_CCR(r1) /* Load CR */ + neg r3,r3 + oris r11,r11,0x1000 /* Set SO bit in CR */ + stw r11,_CCR(r1) syscall_exit_cont: + lwz r8,_MSR(r1) +#ifdef CONFIG_TRACE_IRQFLAGS + /* If we are going to return from the syscall with interrupts + * off, we trace that here. It shouldn't happen though but we + * want to catch the bugger if it does right ? + */ + andi. r10,r8,MSR_EE + bne+ 1f + stw r3,GPR3(r1) + bl trace_hardirqs_off + lwz r3,GPR3(r1) +1: +#endif /* CONFIG_TRACE_IRQFLAGS */ #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - /* If the process has its own DBCR0 value, load it up. The single - step bit tells us that dbcr0 should be loaded. */ + /* If the process has its own DBCR0 value, load it up. The internal + debug mode bit tells us that dbcr0 should be loaded. */ lwz r0,THREAD+THREAD_DBCR0(r2) - andis. r10,r0,DBCR0_IC@h + andis. 
r10,r0,DBCR0_IDM@h bnel- load_dbcr0 #endif +#ifdef CONFIG_44x +BEGIN_MMU_FTR_SECTION + lis r4,icache_44x_need_flush@ha + lwz r5,icache_44x_need_flush@l(r4) + cmplwi cr0,r5,0 + bne- 2f +1: +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_47x) +#endif /* CONFIG_44x */ +BEGIN_FTR_SECTION + lwarx r7,0,r1 +END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) stwcx. r0,0,r1 /* to clear the reservation */ lwz r4,_LINK(r1) lwz r5,_CCR(r1) mtlr r4 mtcr r5 lwz r7,_NIP(r1) - lwz r8,_MSR(r1) FIX_SRR1(r8, r0) lwz r2,GPR2(r1) lwz r1,GPR1(r1) @@ -263,6 +418,12 @@ syscall_exit_cont: mtspr SPRN_SRR1,r8 SYNC RFI +#ifdef CONFIG_44x +2: li r7,0 + iccci r0,r0 + stw r7,icache_44x_need_flush@l(r4) + b 1b +#endif /* CONFIG_44x */ 66: li r3,-ENOSYS b ret_from_syscall @@ -274,6 +435,17 @@ ret_from_fork: li r3,0 b ret_from_syscall + .globl ret_from_kernel_thread +ret_from_kernel_thread: + REST_NVGPRS(r1) + bl schedule_tail + mtlr r14 + mr r3,r15 + PPC440EP_ERR42 + blrl + li r3,0 + b ret_from_syscall + /* Traced system call support */ syscall_dotrace: SAVE_NVGPRS(r1) @@ -281,7 +453,12 @@ syscall_dotrace: stw r0,_TRAP(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl do_syscall_trace_enter - lwz r0,GPR0(r1) /* Restore original registers */ + /* + * Restore argument registers possibly just changed. + * We use the return value of do_syscall_trace_enter + * for call number to look up in the table (r0). + */ + mr r0,r3 lwz r3,GPR3(r1) lwz r4,GPR4(r1) lwz r5,GPR5(r1) @@ -292,45 +469,59 @@ syscall_dotrace: b syscall_dotrace_cont syscall_exit_work: - stw r6,RESULT(r1) /* Save result */ + andi. r0,r9,_TIF_RESTOREALL + beq+ 0f + REST_NVGPRS(r1) + b 2f +0: cmplw 0,r3,r8 + blt+ 1f + andi. r0,r9,_TIF_NOERROR + bne- 1f + lwz r11,_CCR(r1) /* Load CR */ + neg r3,r3 + oris r11,r11,0x1000 /* Set SO bit in CR */ + stw r11,_CCR(r1) + +1: stw r6,RESULT(r1) /* Save result */ stw r3,GPR3(r1) /* Update return value */ - andi. r0,r9,_TIF_SYSCALL_T_OR_A - beq 5f +2: andi. 
r0,r9,(_TIF_PERSYSCALL_MASK) + beq 4f + + /* Clear per-syscall TIF flags if any are set. */ + + li r11,_TIF_PERSYSCALL_MASK + addi r12,r12,TI_FLAGS +3: lwarx r8,0,r12 + andc r8,r8,r11 +#ifdef CONFIG_IBM405_ERR77 + dcbt 0,r12 +#endif + stwcx. r8,0,r12 + bne- 3b + subi r12,r12,TI_FLAGS + +4: /* Anything which requires enabling interrupts? */ + andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SINGLESTEP) + beq ret_from_except + + /* Re-enable interrupts. There is no need to trace that with + * lockdep as we are supposed to have IRQs on at this point + */ ori r10,r10,MSR_EE SYNC - MTMSRD(r10) /* re-enable interrupts */ + MTMSRD(r10) + + /* Save NVGPRS if they're not saved already */ lwz r4,_TRAP(r1) andi. r4,r4,1 - beq 4f + beq 5f SAVE_NVGPRS(r1) li r4,0xc00 stw r4,_TRAP(r1) -4: +5: addi r3,r1,STACK_FRAME_OVERHEAD bl do_syscall_trace_leave - REST_NVGPRS(r1) -2: - lwz r3,GPR3(r1) - LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* doesn't include MSR_EE */ - SYNC - MTMSRD(r10) /* disable interrupts again */ - rlwinm r12,r1,0,0,(31-THREAD_SHIFT) /* current_thread_info() */ - lwz r9,TI_FLAGS(r12) -5: - andi. r0,r9,_TIF_NEED_RESCHED - bne 1f - lwz r5,_MSR(r1) - andi. r5,r5,MSR_PR - beq syscall_exit_cont - andi. r0,r9,_TIF_SIGPENDING - beq syscall_exit_cont - b do_user_signal -1: - ori r10,r10,MSR_EE - SYNC - MTMSRD(r10) /* re-enable interrupts */ - bl schedule - b 2b + b ret_from_except_full #ifdef SHOW_SYSCALLS do_show_syscall: @@ -401,28 +592,10 @@ show_syscalls_task: #endif /* SHOW_SYSCALLS */ /* - * The sigsuspend and rt_sigsuspend system calls can call do_signal - * and thus put the process into the stopped state where we might - * want to examine its user state with ptrace. Therefore we need - * to save all the nonvolatile registers (r13 - r31) before calling - * the C code. + * The fork/clone functions need to copy the full register set into + * the child process. Therefore we need to save all the nonvolatile + * registers (r13 - r31) before calling the C code. 
*/ - .globl ppc_sigsuspend -ppc_sigsuspend: - SAVE_NVGPRS(r1) - lwz r0,_TRAP(r1) - rlwinm r0,r0,0,0,30 /* clear LSB to indicate full */ - stw r0,_TRAP(r1) /* register set saved */ - b sys_sigsuspend - - .globl ppc_rt_sigsuspend -ppc_rt_sigsuspend: - SAVE_NVGPRS(r1) - lwz r0,_TRAP(r1) - rlwinm r0,r0,0,0,30 - stw r0,_TRAP(r1) - b sys_rt_sigsuspend - .globl ppc_fork ppc_fork: SAVE_NVGPRS(r1) @@ -515,9 +688,11 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_SPE +BEGIN_FTR_SECTION oris r0,r0,MSR_SPE@h /* Disable SPE */ mfspr r12,SPRN_SPEFSCR /* save spefscr register value */ stw r12,THREAD+THREAD_SPEFSCR(r2) +END_FTR_SECTION_IFSET(CPU_FTR_SPE) #endif /* CONFIG_SPE */ and. r0,r0,r11 /* FP or altivec or SPE enabled? */ beq+ 1f @@ -539,7 +714,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) tophys(r0,r4) CLR_TOP32(r0) - mtspr SPRN_SPRG3,r0 /* Update current THREAD phys addr */ + mtspr SPRN_SPRG_THREAD,r0 /* Update current THREAD phys addr */ lwz r1,KSP(r4) /* Load new stack pointer */ /* save the old current 'last' for return value */ @@ -553,8 +728,10 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_SPE +BEGIN_FTR_SECTION lwz r0,THREAD+THREAD_SPEFSCR(r2) mtspr SPRN_SPEFSCR,r0 /* restore SPEFSCR reg */ +END_FTR_SECTION_IFSET(CPU_FTR_SPE) #endif /* CONFIG_SPE */ lwz r0,_CCR(r1) @@ -607,7 +784,11 @@ fast_exception_return: mr r12,r4 /* restart at exc_exit_restart */ b 2b - .comm fee_restarts,4 + .section .bss + .align 2 +fee_restarts: + .space 4 + .previous /* aargh, a nonrecoverable interrupt, panic */ /* aargh, we don't know which trap this is */ @@ -626,16 +807,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_601) .long ret_from_except #endif - .globl sigreturn_exit -sigreturn_exit: - subi r1,r3,STACK_FRAME_OVERHEAD - rlwinm r12,r1,0,0,(31-THREAD_SHIFT) /* current_thread_info() */ - lwz r9,TI_FLAGS(r12) - andi. 
r0,r9,_TIF_SYSCALL_T_OR_A - beq+ ret_from_except_full - bl do_syscall_trace_leave - /* fall through */ - .globl ret_from_except_full ret_from_except_full: REST_NVGPRS(r1) @@ -646,6 +817,7 @@ ret_from_except: /* Hard-disable interrupts so that current_thread_info()->flags * can't change between when we test it and when we return * from the interrupt. */ + /* Note: We don't bother telling lockdep about it */ LOAD_MSR_KERNEL(r10,MSR_KERNEL) SYNC /* Some chip revs have problems here... */ MTMSRD(r10) /* disable interrupts */ @@ -656,46 +828,140 @@ ret_from_except: user_exc_return: /* r10 contains MSR_KERNEL here */ /* Check current_thread_info()->flags */ - rlwinm r9,r1,0,0,(31-THREAD_SHIFT) + CURRENT_THREAD_INFO(r9, r1) lwz r9,TI_FLAGS(r9) - andi. r0,r9,(_TIF_SIGPENDING|_TIF_NEED_RESCHED) + andi. r0,r9,_TIF_USER_WORK_MASK bne do_work restore_user: #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - /* Check whether this process has its own DBCR0 value. The single - step bit tells us that dbcr0 should be loaded. */ + /* Check whether this process has its own DBCR0 value. The internal + debug mode bit tells us that dbcr0 should be loaded. */ lwz r0,THREAD+THREAD_DBCR0(r2) - andis. r10,r0,DBCR0_IC@h + andis. r10,r0,DBCR0_IDM@h bnel- load_dbcr0 #endif -#ifdef CONFIG_PREEMPT b restore /* N.B. the only way to get here is from the beq following ret_from_except. */ resume_kernel: + /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ + CURRENT_THREAD_INFO(r9, r1) + lwz r8,TI_FLAGS(r9) + andis. r0,r8,_TIF_EMULATE_STACK_STORE@h + beq+ 1f + + addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */ + + lwz r3,GPR1(r1) + subi r3,r3,INT_FRAME_SIZE /* dst: Allocate a trampoline exception frame */ + mr r4,r1 /* src: current exception frame */ + mr r1,r3 /* Reroute the trampoline frame to r1 */ + + /* Copy from the original to the trampoline. 
*/ + li r5,INT_FRAME_SIZE/4 /* size: INT_FRAME_SIZE */ + li r6,0 /* start offset: 0 */ + mtctr r5 +2: lwzx r0,r6,r4 + stwx r0,r6,r3 + addi r6,r6,4 + bdnz 2b + + /* Do real store operation to complete stwu */ + lwz r5,GPR1(r1) + stw r8,0(r5) + + /* Clear _TIF_EMULATE_STACK_STORE flag */ + lis r11,_TIF_EMULATE_STACK_STORE@h + addi r5,r9,TI_FLAGS +0: lwarx r8,0,r5 + andc r8,r8,r11 +#ifdef CONFIG_IBM405_ERR77 + dcbt 0,r5 +#endif + stwcx. r8,0,r5 + bne- 0b +1: + +#ifdef CONFIG_PREEMPT /* check current_thread_info->preempt_count */ - rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r0,TI_PREEMPT(r9) cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ bne restore - lwz r0,TI_FLAGS(r9) - andi. r0,r0,_TIF_NEED_RESCHED + andi. r8,r8,_TIF_NEED_RESCHED beq+ restore + lwz r3,_MSR(r1) andi. r0,r3,MSR_EE /* interrupts off? */ beq restore /* don't schedule if so */ +#ifdef CONFIG_TRACE_IRQFLAGS + /* Lockdep thinks irqs are enabled, we need to call + * preempt_schedule_irq with IRQs off, so we inform lockdep + * now that we -did- turn them off already + */ + bl trace_hardirqs_off +#endif 1: bl preempt_schedule_irq - rlwinm r9,r1,0,0,(31-THREAD_SHIFT) + CURRENT_THREAD_INFO(r9, r1) lwz r3,TI_FLAGS(r9) andi. 
r0,r3,_TIF_NEED_RESCHED bne- 1b -#else -resume_kernel: +#ifdef CONFIG_TRACE_IRQFLAGS + /* And now, to properly rebalance the above, we tell lockdep they + * are being turned back on, which will happen when we return + */ + bl trace_hardirqs_on +#endif #endif /* CONFIG_PREEMPT */ /* interrupts are hard-disabled at this point */ restore: +#ifdef CONFIG_44x +BEGIN_MMU_FTR_SECTION + b 1f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) + lis r4,icache_44x_need_flush@ha + lwz r5,icache_44x_need_flush@l(r4) + cmplwi cr0,r5,0 + beq+ 1f + li r6,0 + iccci r0,r0 + stw r6,icache_44x_need_flush@l(r4) +1: +#endif /* CONFIG_44x */ + + lwz r9,_MSR(r1) +#ifdef CONFIG_TRACE_IRQFLAGS + /* Lockdep doesn't know about the fact that IRQs are temporarily turned + * off in this assembly code while peeking at TI_FLAGS() and such. However + * we need to inform it if the exception turned interrupts off, and we + * are about to trun them back on. + * + * The problem here sadly is that we don't know whether the exceptions was + * one that turned interrupts off or not. So we always tell lockdep about + * turning them on here when we go back to wherever we came from with EE + * on, even if that may meen some redudant calls being tracked. Maybe later + * we could encode what the exception did somewhere or test the exception + * type in the pt_regs but that sounds overkill + */ + andi. r10,r9,MSR_EE + beq 1f + /* + * Since the ftrace irqsoff latency trace checks CALLER_ADDR1, + * which is the stack frame here, we need to force a stack frame + * in case we came from user space. + */ + stwu r1,-32(r1) + mflr r0 + stw r0,4(r1) + stwu r1,-32(r1) + bl trace_hardirqs_on + lwz r1,0(r1) + lwz r1,0(r1) + lwz r9,_MSR(r1) +1: +#endif /* CONFIG_TRACE_IRQFLAGS */ + lwz r0,GPR0(r1) lwz r2,GPR2(r1) REST_4GPRS(3, r1) @@ -707,10 +973,12 @@ restore: mtctr r11 PPC405_ERR77(0,r1) +BEGIN_FTR_SECTION + lwarx r11,0,r1 +END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) stwcx. 
r0,0,r1 /* to clear the reservation */ #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) - lwz r9,_MSR(r1) andi. r10,r9,MSR_RI /* check if this exception occurred */ beql nonrecoverable /* at a bad place (MSR:RI = 0) */ @@ -733,7 +1001,6 @@ restore: MTMSRD(r10) /* clear the RI bit */ .globl exc_exit_restart exc_exit_restart: - lwz r9,_MSR(r1) lwz r12,_NIP(r1) FIX_SRR1(r9,r10) mtspr SPRN_SRR0,r12 @@ -836,18 +1103,91 @@ exc_exit_restart_end: exc_lvl_rfi; \ b .; /* prevent prefetch past exc_lvl_rfi */ +#define RESTORE_xSRR(exc_lvl_srr0, exc_lvl_srr1) \ + lwz r9,_##exc_lvl_srr0(r1); \ + lwz r10,_##exc_lvl_srr1(r1); \ + mtspr SPRN_##exc_lvl_srr0,r9; \ + mtspr SPRN_##exc_lvl_srr1,r10; + +#if defined(CONFIG_PPC_BOOK3E_MMU) +#ifdef CONFIG_PHYS_64BIT +#define RESTORE_MAS7 \ + lwz r11,MAS7(r1); \ + mtspr SPRN_MAS7,r11; +#else +#define RESTORE_MAS7 +#endif /* CONFIG_PHYS_64BIT */ +#define RESTORE_MMU_REGS \ + lwz r9,MAS0(r1); \ + lwz r10,MAS1(r1); \ + lwz r11,MAS2(r1); \ + mtspr SPRN_MAS0,r9; \ + lwz r9,MAS3(r1); \ + mtspr SPRN_MAS1,r10; \ + lwz r10,MAS6(r1); \ + mtspr SPRN_MAS2,r11; \ + mtspr SPRN_MAS3,r9; \ + mtspr SPRN_MAS6,r10; \ + RESTORE_MAS7; +#elif defined(CONFIG_44x) +#define RESTORE_MMU_REGS \ + lwz r9,MMUCR(r1); \ + mtspr SPRN_MMUCR,r9; +#else +#define RESTORE_MMU_REGS +#endif + +#ifdef CONFIG_40x .globl ret_from_crit_exc ret_from_crit_exc: - RET_FROM_EXC_LEVEL(SPRN_CSRR0, SPRN_CSRR1, RFCI) + mfspr r9,SPRN_SPRG_THREAD + lis r10,saved_ksp_limit@ha; + lwz r10,saved_ksp_limit@l(r10); + tovirt(r9,r9); + stw r10,KSP_LIMIT(r9) + lis r9,crit_srr0@ha; + lwz r9,crit_srr0@l(r9); + lis r10,crit_srr1@ha; + lwz r10,crit_srr1@l(r10); + mtspr SPRN_SRR0,r9; + mtspr SPRN_SRR1,r10; + RET_FROM_EXC_LEVEL(SPRN_CSRR0, SPRN_CSRR1, PPC_RFCI) +#endif /* CONFIG_40x */ #ifdef CONFIG_BOOKE + .globl ret_from_crit_exc +ret_from_crit_exc: + mfspr r9,SPRN_SPRG_THREAD + lwz r10,SAVED_KSP_LIMIT(r1) + stw r10,KSP_LIMIT(r9) + RESTORE_xSRR(SRR0,SRR1); + RESTORE_MMU_REGS; + 
RET_FROM_EXC_LEVEL(SPRN_CSRR0, SPRN_CSRR1, PPC_RFCI) + .globl ret_from_debug_exc ret_from_debug_exc: - RET_FROM_EXC_LEVEL(SPRN_DSRR0, SPRN_DSRR1, RFDI) + mfspr r9,SPRN_SPRG_THREAD + lwz r10,SAVED_KSP_LIMIT(r1) + stw r10,KSP_LIMIT(r9) + lwz r9,THREAD_INFO-THREAD(r9) + CURRENT_THREAD_INFO(r10, r1) + lwz r10,TI_PREEMPT(r10) + stw r10,TI_PREEMPT(r9) + RESTORE_xSRR(SRR0,SRR1); + RESTORE_xSRR(CSRR0,CSRR1); + RESTORE_MMU_REGS; + RET_FROM_EXC_LEVEL(SPRN_DSRR0, SPRN_DSRR1, PPC_RFDI) .globl ret_from_mcheck_exc ret_from_mcheck_exc: - RET_FROM_EXC_LEVEL(SPRN_MCSRR0, SPRN_MCSRR1, RFMCI) + mfspr r9,SPRN_SPRG_THREAD + lwz r10,SAVED_KSP_LIMIT(r1) + stw r10,KSP_LIMIT(r9) + RESTORE_xSRR(SRR0,SRR1); + RESTORE_xSRR(CSRR0,CSRR1); + RESTORE_xSRR(DSRR0,DSRR1); + RESTORE_MMU_REGS; + RET_FROM_EXC_LEVEL(SPRN_MCSRR0, SPRN_MCSRR1, PPC_RFMCI) #endif /* CONFIG_BOOKE */ /* @@ -863,6 +1203,12 @@ load_dbcr0: mfspr r10,SPRN_DBCR0 lis r11,global_dbcr0@ha addi r11,r11,global_dbcr0@l +#ifdef CONFIG_SMP + CURRENT_THREAD_INFO(r9, r1) + lwz r9,TI_CPU(r9) + slwi r9,r9,3 + add r11,r11,r9 +#endif stw r10,0(r11) mtspr SPRN_DBCR0,r0 lwz r10,4(r11) @@ -872,7 +1218,11 @@ load_dbcr0: mtspr SPRN_DBSR,r11 /* clear all pending debug events */ blr - .comm global_dbcr0,8 + .section .bss + .align 4 +global_dbcr0: + .space 8*NR_CPUS + .previous #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ @@ -880,19 +1230,26 @@ do_work: /* r10 contains MSR_KERNEL here */ beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ + /* Note: We don't need to inform lockdep that we are enabling + * interrupts here. As far as it knows, they are already enabled + */ ori r10,r10,MSR_EE SYNC MTMSRD(r10) /* hard-enable interrupts */ bl schedule recheck: + /* Note: And we don't tell it we are disabling them again + * neither. Those disable/enable cycles used to peek at + * TI_FLAGS aren't advertised. 
+ */ LOAD_MSR_KERNEL(r10,MSR_KERNEL) SYNC MTMSRD(r10) /* disable interrupts */ - rlwinm r9,r1,0,0,(31-THREAD_SHIFT) + CURRENT_THREAD_INFO(r9, r1) lwz r9,TI_FLAGS(r9) andi. r0,r9,_TIF_NEED_RESCHED bne- do_resched - andi. r0,r9,_TIF_SIGPENDING + andi. r0,r9,_TIF_USER_WORK_MASK beq restore_user do_user_signal: /* r10 contains MSR_KERNEL here */ ori r10,r10,MSR_EE @@ -905,9 +1262,9 @@ do_user_signal: /* r10 contains MSR_KERNEL here */ SAVE_NVGPRS(r1) rlwinm r3,r3,0,0,30 stw r3,_TRAP(r1) -2: li r3,0 - addi r4,r1,STACK_FRAME_OVERHEAD - bl do_signal +2: addi r3,r1,STACK_FRAME_OVERHEAD + mr r4,r9 + bl do_notify_resume REST_NVGPRS(r1) b recheck @@ -947,7 +1304,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_601) /* shouldn't return */ b 4b - .comm ee_restarts,4 + .section .bss + .align 2 +ee_restarts: + .space 4 + .previous /* * PROM code for specific machines follows. Put it @@ -963,7 +1324,7 @@ _GLOBAL(enter_rtas) stwu r1,-INT_FRAME_SIZE(r1) mflr r0 stw r0,INT_FRAME_SIZE+4(r1) - LOADADDR(r4, rtas) + LOAD_REG_ADDR(r4, rtas) lis r6,1f@ha /* physical return address for rtas */ addi r6,r6,1f@l tophys(r6,r6) @@ -977,7 +1338,7 @@ _GLOBAL(enter_rtas) MTMSRD(r0) /* don't get trashed */ li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) mtlr r6 - mtspr SPRN_SPRG2,r7 + mtspr SPRN_SPRG_RTAS,r7 mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 RFI @@ -987,7 +1348,7 @@ _GLOBAL(enter_rtas) FIX_SRR1(r9,r0) addi r1,r1,INT_FRAME_SIZE li r0,0 - mtspr SPRN_SPRG2,r0 + mtspr SPRN_SPRG_RTAS,r0 mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 RFI /* return to caller */ @@ -998,3 +1359,102 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_RTAS */ + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE +_GLOBAL(mcount) +_GLOBAL(_mcount) + /* + * It is required that _mcount on PPC32 must preserve the + * link register. But we have r0 to play with. 
We use r0 + * to push the return address back to the caller of mcount + * into the ctr register, restore the link register and + * then jump back using the ctr register. + */ + mflr r0 + mtctr r0 + lwz r0, 4(r1) + mtlr r0 + bctr + +_GLOBAL(ftrace_caller) + MCOUNT_SAVE_FRAME + /* r3 ends up with link register */ + subi r3, r3, MCOUNT_INSN_SIZE +.globl ftrace_call +ftrace_call: + bl ftrace_stub + nop +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + b ftrace_graph_stub +_GLOBAL(ftrace_graph_stub) +#endif + MCOUNT_RESTORE_FRAME + /* old link register ends up in ctr reg */ + bctr +#else +_GLOBAL(mcount) +_GLOBAL(_mcount) + + MCOUNT_SAVE_FRAME + + subi r3, r3, MCOUNT_INSN_SIZE + LOAD_REG_ADDR(r5, ftrace_trace_function) + lwz r5,0(r5) + + mtctr r5 + bctrl + nop + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + b ftrace_graph_caller +#endif + MCOUNT_RESTORE_FRAME + bctr +#endif + +_GLOBAL(ftrace_stub) + blr + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +_GLOBAL(ftrace_graph_caller) + /* load r4 with local address */ + lwz r4, 44(r1) + subi r4, r4, MCOUNT_INSN_SIZE + + /* get the parent address */ + addi r3, r1, 52 + + bl prepare_ftrace_return + nop + + MCOUNT_RESTORE_FRAME + /* old link register ends up in ctr reg */ + bctr + +_GLOBAL(return_to_handler) + /* need to save return values */ + stwu r1, -32(r1) + stw r3, 20(r1) + stw r4, 16(r1) + stw r31, 12(r1) + mr r31, r1 + + bl ftrace_return_to_handler + nop + + /* return value has real return address */ + mtlr r3 + + lwz r3, 20(r1) + lwz r4, 16(r1) + lwz r31,12(r1) + lwz r1, 0(r1) + + /* Jump back to real return address */ + blr +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#endif /* CONFIG_MCOUNT */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 984a1063071..6528c5e2cc4 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -1,6 +1,4 @@ /* - * arch/ppc64/kernel/entry.S - * * PowerPC version * Copyright (C) 1995-1996 Gary Thomas 
(gdt@linuxppc.org) * Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP @@ -20,7 +18,6 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/config.h> #include <linux/errno.h> #include <asm/unistd.h> #include <asm/processor.h> @@ -30,21 +27,24 @@ #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> #include <asm/cputable.h> - -#ifdef CONFIG_PPC_ISERIES -#define DO_SOFT_DISABLE -#endif +#include <asm/firmware.h> +#include <asm/bug.h> +#include <asm/ptrace.h> +#include <asm/irqflags.h> +#include <asm/ftrace.h> +#include <asm/hw_irq.h> +#include <asm/context_tracking.h> /* * System calls. */ .section ".toc","aw" -.SYS_CALL_TABLE: - .tc .sys_call_table[TC],.sys_call_table +SYS_CALL_TABLE: + .tc sys_call_table[TC],sys_call_table /* This value is used to mark exception frames on the stack. */ exception_marker: - .tc ID_72656773_68657265[TC],0x7265677368657265 + .tc ID_EXC_MARKER[TC],STACK_FRAME_REGS_MARKER .section ".text" .align 7 @@ -63,8 +63,11 @@ system_call_common: std r12,_MSR(r1) std r0,GPR0(r1) std r10,GPR1(r1) - std r2,GPR2(r1) + beq 2f /* if from kernel mode */ + ACCOUNT_CPU_USER_ENTRY(r10, r11) +2: std r2,GPR2(r1) std r3,GPR3(r1) + mfcr r2 std r4,GPR4(r1) std r5,GPR5(r1) std r6,GPR6(r1) @@ -75,50 +78,82 @@ system_call_common: std r11,GPR10(r1) std r11,GPR11(r1) std r11,GPR12(r1) + std r11,_XER(r1) + std r11,_CTR(r1) std r9,GPR13(r1) - crclr so - mfcr r9 mflr r10 + /* + * This clears CR0.SO (bit 28), which is the error indication on + * return from this system call. + */ + rldimi r2,r11,28,(63-28) li r11,0xc01 - std r9,_CCR(r1) std r10,_LINK(r1) std r11,_TRAP(r1) - mfxer r9 - mfctr r10 - std r9,_XER(r1) - std r10,_CTR(r1) std r3,ORIG_GPR3(r1) + std r2,_CCR(r1) ld r2,PACATOC(r13) addi r9,r1,STACK_FRAME_OVERHEAD ld r11,exception_marker@toc(r2) std r11,-16(r9) /* "regshere" marker */ -#ifdef CONFIG_PPC_ISERIES - /* Hack for handling interrupts when soft-enabling on iSeries */ - cmpdi cr1,r0,0x5555 /* syscall 0x5555 */ - andi. 
r10,r12,MSR_PR /* from kernel */ - crand 4*cr0+eq,4*cr1+eq,4*cr0+eq - beq hardware_interrupt_entry - lbz r10,PACAPROCENABLED(r13) - std r10,SOFTE(r1) +#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR) +BEGIN_FW_FTR_SECTION + beq 33f + /* if from user, see if there are any DTL entries to process */ + ld r10,PACALPPACAPTR(r13) /* get ptr to VPA */ + ld r11,PACA_DTL_RIDX(r13) /* get log read index */ + addi r10,r10,LPPACA_DTLIDX + LDX_BE r10,0,r10 /* get log write index */ + cmpd cr1,r11,r10 + beq+ cr1,33f + bl accumulate_stolen_time + REST_GPR(0,r1) + REST_4GPRS(3,r1) + REST_2GPRS(7,r1) + addi r9,r1,STACK_FRAME_OVERHEAD +33: +END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE && CONFIG_PPC_SPLPAR */ + + /* + * A syscall should always be called with interrupts enabled + * so we just unconditionally hard-enable here. When some kind + * of irq tracing is used, we additionally check that condition + * is correct + */ +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_BUG) + lbz r10,PACASOFTIRQEN(r13) + xori r10,r10,1 +1: tdnei r10,0 + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING #endif - mfmsr r11 + +#ifdef CONFIG_PPC_BOOK3E + wrteei 1 +#else + ld r11,PACAKMSR(r13) ori r11,r11,MSR_EE mtmsrd r11,1 +#endif /* CONFIG_PPC_BOOK3E */ + + /* We do need to set SOFTE in the stack frame or the return + * from interrupt will be painful + */ + li r10,1 + std r10,SOFTE(r1) #ifdef SHOW_SYSCALLS - bl .do_show_syscall + bl do_show_syscall REST_GPR(0,r1) REST_4GPRS(3,r1) REST_2GPRS(7,r1) addi r9,r1,STACK_FRAME_OVERHEAD #endif - clrrdi r11,r1,THREAD_SHIFT - li r12,0 + CURRENT_THREAD_INFO(r11, r1) ld r10,TI_FLAGS(r11) - stb r12,TI_SC_NOERR(r11) andi. 
r11,r10,_TIF_SYSCALL_T_OR_A - bne- syscall_dotrace -syscall_dotrace_cont: + bne syscall_dotrace +.Lsyscall_dotrace_cont: cmpldi 0,r0,NR_syscalls bge- syscall_enosys @@ -127,7 +162,7 @@ system_call: /* label this so stack traces look sane */ * Need to vector to 32 Bit or default sys_call_table here, * based on caller's run-mode / personality. */ - ld r11,.SYS_CALL_TABLE@toc(2) + ld r11,SYS_CALL_TABLE@toc(2) andi. r10,r10,_TIF_32BIT beq 15f addi r11,r11,8 /* use 32-bit syscall entries */ @@ -139,81 +174,89 @@ system_call: /* label this so stack traces look sane */ clrldi r8,r8,32 15: slwi r0,r0,4 - ldx r10,r11,r0 /* Fetch system call handler [ptr] */ - mtctr r10 + ldx r12,r11,r0 /* Fetch system call handler [ptr] */ + mtctr r12 bctrl /* Call handler */ syscall_exit: + std r3,RESULT(r1) #ifdef SHOW_SYSCALLS - std r3,GPR3(r1) - bl .do_show_syscall_exit - ld r3,GPR3(r1) + bl do_show_syscall_exit + ld r3,RESULT(r1) #endif - std r3,RESULT(r1) - ld r5,_CCR(r1) - li r10,-_LAST_ERRNO - cmpld r3,r10 - clrrdi r12,r1,THREAD_SHIFT - bge- syscall_error -syscall_error_cont: + CURRENT_THREAD_INFO(r12, r1) - /* check for syscall tracing or audit */ - ld r9,TI_FLAGS(r12) - andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SINGLESTEP) - bne- syscall_exit_trace -syscall_exit_trace_cont: - - /* disable interrupts so current_thread_info()->flags can't change, - and so that we don't get interrupted after loading SRR0/1. */ ld r8,_MSR(r1) +#ifdef CONFIG_PPC_BOOK3S + /* No MSR:RI on BookE */ andi. r10,r8,MSR_RI beq- unrecov_restore - mfmsr r10 - rldicl r10,r10,48,1 - rotldi r10,r10,16 - mtmsrd r10,1 +#endif + /* + * Disable interrupts so current_thread_info()->flags can't change, + * and so that we don't get interrupted after loading SRR0/1. + */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 0 +#else + ld r10,PACAKMSR(r13) + /* + * For performance reasons we clear RI the same time that we + * clear EE. 
We only need to clear RI just before we restore r13 + * below, but batching it with EE saves us one expensive mtmsrd call. + * We have to be careful to restore RI if we branch anywhere from + * here (eg syscall_exit_work). + */ + li r9,MSR_RI + andc r11,r10,r9 + mtmsrd r11,1 +#endif /* CONFIG_PPC_BOOK3E */ + ld r9,TI_FLAGS(r12) - andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SIGPENDING|_TIF_NEED_RESCHED) + li r11,-_LAST_ERRNO + andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) bne- syscall_exit_work + cmpld r3,r11 + ld r5,_CCR(r1) + bge- syscall_error +.Lsyscall_error_cont: ld r7,_NIP(r1) +BEGIN_FTR_SECTION stdcx. r0,0,r1 /* to clear the reservation */ +END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) andi. r6,r8,MSR_PR ld r4,_LINK(r1) - beq- 1f /* only restore r13 if */ - ld r13,GPR13(r1) /* returning to usermode */ + + beq- 1f + ACCOUNT_CPU_USER_EXIT(r11, r12) + HMT_MEDIUM_LOW_HAS_PPR + ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ 1: ld r2,GPR2(r1) - li r12,MSR_RI - andc r10,r10,r12 - mtmsrd r10,1 /* clear MSR.RI */ ld r1,GPR1(r1) mtlr r4 mtcr r5 mtspr SPRN_SRR0,r7 mtspr SPRN_SRR1,r8 - rfid + RFI b . /* prevent speculative execution */ -syscall_enosys: - li r3,-ENOSYS - std r3,RESULT(r1) - clrrdi r12,r1,THREAD_SHIFT - ld r5,_CCR(r1) - -syscall_error: - lbz r11,TI_SC_NOERR(r12) - cmpwi 0,r11,0 - bne- syscall_error_cont - neg r3,r3 +syscall_error: oris r5,r5,0x1000 /* Set SO bit in CR */ + neg r3,r3 std r5,_CCR(r1) - b syscall_error_cont - + b .Lsyscall_error_cont + /* Traced system call support */ syscall_dotrace: - bl .save_nvgprs + bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_syscall_trace_enter - ld r0,GPR0(r1) /* Restore original registers */ + bl do_syscall_trace_enter + /* + * Restore argument registers possibly just changed. + * We use the return value of do_syscall_trace_enter + * for the call number to look up in the table (r0). 
+ */ + mr r0,r3 ld r3,GPR3(r1) ld r4,GPR4(r1) ld r5,GPR5(r1) @@ -221,26 +264,65 @@ syscall_dotrace: ld r7,GPR7(r1) ld r8,GPR8(r1) addi r9,r1,STACK_FRAME_OVERHEAD - clrrdi r10,r1,THREAD_SHIFT + CURRENT_THREAD_INFO(r10, r1) ld r10,TI_FLAGS(r10) - b syscall_dotrace_cont + b .Lsyscall_dotrace_cont -syscall_exit_trace: - std r3,GPR3(r1) - bl .save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_syscall_trace_leave +syscall_enosys: + li r3,-ENOSYS + b syscall_exit + +syscall_exit_work: +#ifdef CONFIG_PPC_BOOK3S + mtmsrd r10,1 /* Restore RI */ +#endif + /* If TIF_RESTOREALL is set, don't scribble on either r3 or ccr. + If TIF_NOERROR is set, just save r3 as it is. */ + + andi. r0,r9,_TIF_RESTOREALL + beq+ 0f REST_NVGPRS(r1) - ld r3,GPR3(r1) + b 2f +0: cmpld r3,r11 /* r10 is -LAST_ERRNO */ + blt+ 1f + andi. r0,r9,_TIF_NOERROR + bne- 1f ld r5,_CCR(r1) - clrrdi r12,r1,THREAD_SHIFT - b syscall_exit_trace_cont - -/* Stuff to do on exit from a system call. */ -syscall_exit_work: - std r3,GPR3(r1) + neg r3,r3 + oris r5,r5,0x1000 /* Set SO bit in CR */ std r5,_CCR(r1) - b .ret_from_except_lite +1: std r3,GPR3(r1) +2: andi. r0,r9,(_TIF_PERSYSCALL_MASK) + beq 4f + + /* Clear per-syscall TIF flags if any are set. */ + + li r11,_TIF_PERSYSCALL_MASK + addi r12,r12,TI_FLAGS +3: ldarx r10,0,r12 + andc r10,r10,r11 + stdcx. r10,0,r12 + bne- 3b + subi r12,r12,TI_FLAGS + +4: /* Anything else left to do? */ + SET_DEFAULT_THREAD_PPR(r3, r10) /* Set thread.ppr = 3 */ + andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SINGLESTEP) + beq ret_from_except_lite + + /* Re-enable interrupts */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 1 +#else + ld r10,PACAKMSR(r13) + ori r10,r10,MSR_EE + mtmsrd r10,1 +#endif /* CONFIG_PPC_BOOK3E */ + + bl save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_syscall_trace_leave + b ret_from_except /* Save non-volatile GPRs, if not already saved. 
*/ _GLOBAL(save_nvgprs) @@ -252,6 +334,7 @@ _GLOBAL(save_nvgprs) std r0,_TRAP(r1) blr + /* * The sigsuspend and rt_sigsuspend system calls can call do_signal * and thus put the process into the stopped state where we might @@ -260,85 +343,47 @@ _GLOBAL(save_nvgprs) * the C code. Similarly, fork, vfork and clone need the full * register state on the stack so that it can be copied to the child. */ -_GLOBAL(ppc32_sigsuspend) - bl .save_nvgprs - bl .compat_sys_sigsuspend - b 70f - -_GLOBAL(ppc64_rt_sigsuspend) - bl .save_nvgprs - bl .sys_rt_sigsuspend - b 70f - -_GLOBAL(ppc32_rt_sigsuspend) - bl .save_nvgprs - bl .compat_sys_rt_sigsuspend -70: cmpdi 0,r3,0 - /* If it returned an error, we need to return via syscall_exit to set - the SO bit in cr0 and potentially stop for ptrace. */ - bne syscall_exit - /* If sigsuspend() returns zero, we are going into a signal handler. We - may need to call audit_syscall_exit() to mark the exit from sigsuspend() */ -#ifdef CONFIG_AUDIT - ld r3,PACACURRENT(r13) - ld r4,AUDITCONTEXT(r3) - cmpdi 0,r4,0 - beq .ret_from_except /* No audit_context: Leave immediately. 
*/ - li r4, 2 /* AUDITSC_FAILURE */ - li r5,-4 /* It's always -EINTR */ - bl .audit_syscall_exit -#endif - b .ret_from_except _GLOBAL(ppc_fork) - bl .save_nvgprs - bl .sys_fork + bl save_nvgprs + bl sys_fork b syscall_exit _GLOBAL(ppc_vfork) - bl .save_nvgprs - bl .sys_vfork + bl save_nvgprs + bl sys_vfork b syscall_exit _GLOBAL(ppc_clone) - bl .save_nvgprs - bl .sys_clone + bl save_nvgprs + bl sys_clone b syscall_exit _GLOBAL(ppc32_swapcontext) - bl .save_nvgprs - bl .compat_sys_swapcontext - b 80f - + bl save_nvgprs + bl compat_sys_swapcontext + b syscall_exit + _GLOBAL(ppc64_swapcontext) - bl .save_nvgprs - bl .sys_swapcontext - b 80f - -_GLOBAL(ppc32_sigreturn) - bl .compat_sys_sigreturn - b 80f - -_GLOBAL(ppc32_rt_sigreturn) - bl .compat_sys_rt_sigreturn - b 80f - -_GLOBAL(ppc64_rt_sigreturn) - bl .sys_rt_sigreturn - -80: cmpdi 0,r3,0 - blt syscall_exit - clrrdi r4,r1,THREAD_SHIFT - ld r4,TI_FLAGS(r4) - andi. r4,r4,(_TIF_SYSCALL_T_OR_A|_TIF_SINGLESTEP) - beq+ 81f - addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_syscall_trace_leave -81: b .ret_from_except + bl save_nvgprs + bl sys_swapcontext + b syscall_exit _GLOBAL(ret_from_fork) - bl .schedule_tail + bl schedule_tail + REST_NVGPRS(r1) + li r3,0 + b syscall_exit + +_GLOBAL(ret_from_kernel_thread) + bl schedule_tail REST_NVGPRS(r1) + mtlr r14 + mr r3,r15 +#if defined(_CALL_ELF) && _CALL_ELF == 2 + mr r12,r14 +#endif + blrl li r3,0 b syscall_exit @@ -358,7 +403,7 @@ _GLOBAL(ret_from_fork) * the fork code also. 
* * The code which creates the new task context is in 'copy_thread' - * in arch/ppc64/kernel/process.c + * in arch/powerpc/kernel/process.c */ .align 7 _GLOBAL(_switch) @@ -371,6 +416,11 @@ _GLOBAL(_switch) mflr r20 /* Return to switch caller */ mfmsr r22 li r0, MSR_FP +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + oris r0,r0,MSR_VSX@h /* Disable VSX */ +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif /* CONFIG_VSX */ #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION oris r0,r0,MSR_VEC@h /* Disable altivec */ @@ -381,13 +431,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) and. r0,r0,r22 beq+ 1f andc r22,r22,r0 - mtmsrd r22 + MTMSRD(r22) isync 1: std r20,_NIP(r1) mfcr r23 std r23,_CCR(r1) std r1,KSP(r3) /* Set old stack pointer */ +#ifdef CONFIG_PPC_BOOK3S_64 +BEGIN_FTR_SECTION + /* Event based branch registers */ + mfspr r0, SPRN_BESCR + std r0, THREAD_BESCR(r3) + mfspr r0, SPRN_EBBHR + std r0, THREAD_EBBHR(r3) + mfspr r0, SPRN_EBBRR + std r0, THREAD_EBBRR(r3) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +#endif + #ifdef CONFIG_SMP /* We need a sync somewhere here to make sure that if the * previous task gets rescheduled on another CPU, it sees all @@ -396,13 +458,40 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) sync #endif /* CONFIG_SMP */ + /* + * If we optimise away the clear of the reservation in system + * calls because we know the CPU tracks the address of the + * reservation, then we need to clear it here to cover the + * case that the kernel context switch path has no larx + * instructions. 
+ */ +BEGIN_FTR_SECTION + ldarx r6,0,r1 +END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS) + +#ifdef CONFIG_PPC_BOOK3S +/* Cancel all explict user streams as they will have no use after context + * switch and will stop the HW from creating streams itself + */ + DCBT_STOP_ALL_STREAM_IDS(r6) +#endif + addi r6,r4,-THREAD /* Convert THREAD to 'current' */ std r6,PACACURRENT(r13) /* Set new 'current' */ ld r8,KSP(r4) /* new stack pointer */ +#ifdef CONFIG_PPC_BOOK3S BEGIN_FTR_SECTION + BEGIN_FTR_SECTION_NESTED(95) clrrdi r6,r8,28 /* get its ESID */ clrrdi r9,r1,28 /* get current sp ESID */ + FTR_SECTION_ELSE_NESTED(95) + clrrdi r6,r8,40 /* get its 1T ESID */ + clrrdi r9,r1,40 /* get current sp 1T ESID */ + ALT_MMU_FTR_SECTION_END_NESTED_IFCLR(MMU_FTR_1T_SEGMENT, 95) +FTR_SECTION_ELSE + b 2f +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_SLB) clrldi. r0,r6,2 /* is new ESID c00000000? */ cmpd cr1,r6,r9 /* or is new ESID the same as current ESID? */ cror eq,4*cr1+eq,eq @@ -412,14 +501,38 @@ BEGIN_FTR_SECTION ld r7,KSP_VSID(r4) /* Get new stack's VSID */ oris r0,r6,(SLB_ESID_V)@h ori r0,r0,(SLB_NUM_BOLTED-1)@l +BEGIN_FTR_SECTION + li r9,MMU_SEGSIZE_1T /* insert B field */ + oris r6,r6,(MMU_SEGSIZE_1T << SLBIE_SSIZE_SHIFT)@h + rldimi r7,r9,SLB_VSID_SSIZE_SHIFT,0 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) + + /* Update the last bolted SLB. No write barriers are needed + * here, provided we only update the current CPU's SLB shadow + * buffer. + */ + ld r9,PACA_SLBSHADOWPTR(r13) + li r12,0 + std r12,SLBSHADOW_STACKESID(r9) /* Clear ESID */ + li r12,SLBSHADOW_STACKVSID + STDX_BE r7,r12,r9 /* Save VSID */ + li r12,SLBSHADOW_STACKESID + STDX_BE r0,r12,r9 /* Save ESID */ + + /* No need to check for MMU_FTR_NO_SLBIE_B here, since when + * we have 1TB segments, the only CPUs known to have the errata + * only support less than 1TB of system memory and we'll never + * actually hit this code path. 
+ */ + slbie r6 slbie r6 /* Workaround POWER5 < DD2.1 issue */ slbmte r7,r0 isync - 2: -END_FTR_SECTION_IFSET(CPU_FTR_SLB) - clrrdi r7,r8,THREAD_SHIFT /* base of new stack */ +#endif /* !CONFIG_PPC_BOOK3S */ + + CURRENT_THREAD_INFO(r7, r8) /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE because we don't need to leave the 288-byte ABI gap at the top of the kernel stack. */ @@ -428,8 +541,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_SLB) mr r1,r8 /* start using new stack pointer */ std r7,PACAKSAVE(r13) - ld r6,_CCR(r1) - mtcrf 0xFF,r6 +#ifdef CONFIG_PPC_BOOK3S_64 +BEGIN_FTR_SECTION + /* Event based branch registers */ + ld r0, THREAD_BESCR(r4) + mtspr SPRN_BESCR, r0 + ld r0, THREAD_EBBHR(r4) + mtspr SPRN_EBBHR, r0 + ld r0, THREAD_EBBRR(r4) + mtspr SPRN_EBBRR, r0 + + ld r0,THREAD_TAR(r4) + mtspr SPRN_TAR,r0 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +#endif #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION @@ -437,6 +562,28 @@ BEGIN_FTR_SECTION mtspr SPRN_VRSAVE,r0 /* if G4, restore VRSAVE reg */ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_PPC64 +BEGIN_FTR_SECTION + lwz r6,THREAD_DSCR_INHERIT(r4) + ld r0,THREAD_DSCR(r4) + cmpwi r6,0 + bne 1f + ld r0,PACA_DSCR(r13) +1: +BEGIN_FTR_SECTION_NESTED(70) + mfspr r8, SPRN_FSCR + rldimi r8, r6, FSCR_DSCR_LG, (63 - FSCR_DSCR_LG) + mtspr SPRN_FSCR, r8 +END_FTR_SECTION_NESTED(CPU_FTR_ARCH_207S, CPU_FTR_ARCH_207S, 70) + cmpd r0,r25 + beq 2f + mtspr SPRN_DSCR,r0 +2: +END_FTR_SECTION_IFSET(CPU_FTR_DSCR) +#endif + + ld r6,_CCR(r1) + mtcrf 0xFF,r6 /* r3-r13 are destroyed -- Cort */ REST_8GPRS(14, r1) @@ -453,7 +600,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) _GLOBAL(ret_from_except) ld r11,_TRAP(r1) andi. r0,r11,1 - bne .ret_from_except_lite + bne ret_from_except_lite REST_NVGPRS(r1) _GLOBAL(ret_from_except_lite) @@ -462,85 +609,245 @@ _GLOBAL(ret_from_except_lite) * can't change between when we test it and when we return * from the interrupt. 
*/ - mfmsr r10 /* Get current interrupt state */ - rldicl r9,r10,48,1 /* clear MSR_EE */ - rotldi r9,r9,16 - mtmsrd r9,1 /* Update machine state */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 0 +#else + ld r10,PACAKMSR(r13) /* Get kernel MSR without EE */ + mtmsrd r10,1 /* Update machine state */ +#endif /* CONFIG_PPC_BOOK3E */ -#ifdef CONFIG_PREEMPT - clrrdi r9,r1,THREAD_SHIFT /* current_thread_info() */ - li r0,_TIF_NEED_RESCHED /* bits to check */ + CURRENT_THREAD_INFO(r9, r1) ld r3,_MSR(r1) +#ifdef CONFIG_PPC_BOOK3E + ld r10,PACACURRENT(r13) +#endif /* CONFIG_PPC_BOOK3E */ ld r4,TI_FLAGS(r9) - /* Move MSR_PR bit in r3 to _TIF_SIGPENDING position in r0 */ - rlwimi r0,r3,32+TIF_SIGPENDING-MSR_PR_LG,_TIF_SIGPENDING - and. r0,r4,r0 /* check NEED_RESCHED and maybe SIGPENDING */ - bne do_work - -#else /* !CONFIG_PREEMPT */ - ld r3,_MSR(r1) /* Returning to user mode? */ andi. r3,r3,MSR_PR - beq restore /* if not, just restore regs and return */ + beq resume_kernel +#ifdef CONFIG_PPC_BOOK3E + lwz r3,(THREAD+THREAD_DBCR0)(r10) +#endif /* CONFIG_PPC_BOOK3E */ /* Check current_thread_info()->flags */ - clrrdi r9,r1,THREAD_SHIFT - ld r4,TI_FLAGS(r9) andi. r0,r4,_TIF_USER_WORK_MASK - bne do_work +#ifdef CONFIG_PPC_BOOK3E + bne 1f + /* + * Check to see if the dbcr0 register is set up to debug. + * Use the internal debug mode bit to do this. + */ + andis. r0,r3,DBCR0_IDM@h + beq restore + mfmsr r0 + rlwinm r0,r0,0,~MSR_DE /* Clear MSR.DE */ + mtmsr r0 + mtspr SPRN_DBCR0,r3 + li r10, -1 + mtspr SPRN_DBSR,r10 + b restore +#else + beq restore +#endif +1: andi. r0,r4,_TIF_NEED_RESCHED + beq 2f + bl restore_interrupts + SCHEDULE_USER + b ret_from_except_lite +2: +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + andi. 
r0,r4,_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM + bne 3f /* only restore TM if nothing else to do */ + addi r3,r1,STACK_FRAME_OVERHEAD + bl restore_tm_state + b restore +3: #endif + bl save_nvgprs + bl restore_interrupts + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_notify_resume + b ret_from_except + +resume_kernel: + /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ + andis. r8,r4,_TIF_EMULATE_STACK_STORE@h + beq+ 1f + + addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */ + + lwz r3,GPR1(r1) + subi r3,r3,INT_FRAME_SIZE /* dst: Allocate a trampoline exception frame */ + mr r4,r1 /* src: current exception frame */ + mr r1,r3 /* Reroute the trampoline frame to r1 */ + + /* Copy from the original to the trampoline. */ + li r5,INT_FRAME_SIZE/8 /* size: INT_FRAME_SIZE */ + li r6,0 /* start offset: 0 */ + mtctr r5 +2: ldx r0,r6,r4 + stdx r0,r6,r3 + addi r6,r6,8 + bdnz 2b + + /* Do real store operation to complete stwu */ + lwz r5,GPR1(r1) + std r8,0(r5) + + /* Clear _TIF_EMULATE_STACK_STORE flag */ + lis r11,_TIF_EMULATE_STACK_STORE@h + addi r5,r9,TI_FLAGS +0: ldarx r4,0,r5 + andc r4,r4,r11 + stdcx. r4,0,r5 + bne- 0b +1: + +#ifdef CONFIG_PREEMPT + /* Check if we need to preempt */ + andi. r0,r4,_TIF_NEED_RESCHED + beq+ restore + /* Check that preempt_count() == 0 and interrupts are enabled */ + lwz r8,TI_PREEMPT(r9) + cmpwi cr1,r8,0 + ld r0,SOFTE(r1) + cmpdi r0,0 + crandc eq,cr1*4+eq,eq + bne restore + + /* + * Here we are preempting the current task. We want to make + * sure we are soft-disabled first and reconcile irq state. + */ + RECONCILE_IRQ_STATE(r3,r4) +1: bl preempt_schedule_irq + + /* Re-test flags and eventually loop */ + CURRENT_THREAD_INFO(r9, r1) + ld r4,TI_FLAGS(r9) + andi. r0,r4,_TIF_NEED_RESCHED + bne 1b + /* + * arch_local_irq_restore() from preempt_schedule_irq above may + * enable hard interrupt but we really should disable interrupts + * when we return from the interrupt, and so that we don't get + * interrupted after loading SRR0/1. 
+ */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 0 +#else + ld r10,PACAKMSR(r13) /* Get kernel MSR without EE */ + mtmsrd r10,1 /* Update machine state */ +#endif /* CONFIG_PPC_BOOK3E */ +#endif /* CONFIG_PREEMPT */ + + .globl fast_exc_return_irq +fast_exc_return_irq: restore: -#ifdef CONFIG_PPC_ISERIES + /* + * This is the main kernel exit path. First we check if we + * are about to re-enable interrupts + */ ld r5,SOFTE(r1) - cmpdi 0,r5,0 - beq 4f - /* Check for pending interrupts (iSeries) */ - ld r3,PACALPPACA+LPPACAANYINT(r13) - cmpdi r3,0 - beq+ 4f /* skip do_IRQ if no interrupts */ + lbz r6,PACASOFTIRQEN(r13) + cmpwi cr0,r5,0 + beq restore_irq_off - li r3,0 - stb r3,PACAPROCENABLED(r13) /* ensure we are soft-disabled */ - ori r10,r10,MSR_EE - mtmsrd r10 /* hard-enable again */ - addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_IRQ - b .ret_from_except_lite /* loop back and handle more */ + /* We are enabling, were we already enabled ? Yes, just return */ + cmpwi cr0,r6,1 + beq cr0,do_restore -4: stb r5,PACAPROCENABLED(r13) -#endif + /* + * We are about to soft-enable interrupts (we are hard disabled + * at this point). We check if there's anything that needs to + * be replayed first. + */ + lbz r0,PACAIRQHAPPENED(r13) + cmpwi cr0,r0,0 + bne- restore_check_irq_replay - ld r3,_MSR(r1) - andi. r0,r3,MSR_RI - beq- unrecov_restore + /* + * Get here when nothing happened while soft-disabled, just + * soft-enable and move-on. We will hard-enable as a side + * effect of rfi + */ +restore_no_replay: + TRACE_ENABLE_INTS + li r0,1 + stb r0,PACASOFTIRQEN(r13); - andi. r0,r3,MSR_PR + /* + * Final return path. BookE is handled in a different file + */ +do_restore: +#ifdef CONFIG_PPC_BOOK3E + b exception_return_book3e +#else + /* + * Clear the reservation. If we know the CPU tracks the address of + * the reservation then we can potentially save some cycles and use + * a larx. On POWER6 and POWER7 this is significantly faster. + */ +BEGIN_FTR_SECTION + stdcx. 
r0,0,r1 /* to clear the reservation */ +FTR_SECTION_ELSE + ldarx r4,0,r1 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) /* - * r13 is our per cpu area, only restore it if we are returning to - * userspace + * Some code path such as load_up_fpu or altivec return directly + * here. They run entirely hard disabled and do not alter the + * interrupt state. They also don't use lwarx/stwcx. and thus + * are known not to leave dangling reservations. */ - beq 1f - REST_GPR(13, r1) -1: - ld r3,_CTR(r1) + .globl fast_exception_return +fast_exception_return: + ld r3,_MSR(r1) + ld r4,_CTR(r1) ld r0,_LINK(r1) - mtctr r3 + mtctr r4 mtlr r0 - ld r3,_XER(r1) - mtspr SPRN_XER,r3 + ld r4,_XER(r1) + mtspr SPRN_XER,r4 REST_8GPRS(5, r1) - stdcx. r0,0,r1 /* to clear the reservation */ + andi. r0,r3,MSR_RI + beq- unrecov_restore - mfmsr r0 - li r2, MSR_RI - andc r0,r0,r2 - mtmsrd r0,1 + /* Load PPR from thread struct before we clear MSR:RI */ +BEGIN_FTR_SECTION + ld r2,PACACURRENT(r13) + ld r2,TASKTHREADPPR(r2) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) - ld r0,_MSR(r1) - mtspr SPRN_SRR1,r0 + /* + * Clear RI before restoring r13. If we are returning to + * userspace and we take an exception after restoring r13, + * we end up corrupting the userspace r13 value. + */ + ld r4,PACAKMSR(r13) /* Get kernel MSR without EE */ + andc r4,r4,r0 /* r0 contains MSR_RI here */ + mtmsrd r4,1 + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + /* TM debug */ + std r3, PACATMSCRATCH(r13) /* Stash returned-to MSR */ +#endif + /* + * r13 is our per cpu area, only restore it if we are returning to + * userspace the value stored in the stack frame may belong to + * another CPU. + */ + andi. r0,r3,MSR_PR + beq 1f +BEGIN_FTR_SECTION + mtspr SPRN_PPR,r2 /* Restore PPR */ +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + ACCOUNT_CPU_USER_EXIT(r2, r4) + REST_GPR(13, r1) +1: + mtspr SPRN_SRR1,r3 ld r2,_CCR(r1) mtcrf 0xFF,r2 @@ -556,61 +863,88 @@ restore: rfid b . 
/* prevent speculative execution */ -/* Note: this must change if we start using the TIF_NOTIFY_RESUME bit */ -do_work: -#ifdef CONFIG_PREEMPT - andi. r0,r3,MSR_PR /* Returning to user mode? */ - bne user_work - /* Check that preempt_count() == 0 and interrupts are enabled */ - lwz r8,TI_PREEMPT(r9) - cmpwi cr1,r8,0 -#ifdef CONFIG_PPC_ISERIES - ld r0,SOFTE(r1) - cmpdi r0,0 -#else - andi. r0,r3,MSR_EE -#endif - crandc eq,cr1*4+eq,eq - bne restore - /* here we are preempting the current task */ -1: -#ifdef CONFIG_PPC_ISERIES - li r0,1 - stb r0,PACAPROCENABLED(r13) -#endif - ori r10,r10,MSR_EE - mtmsrd r10,1 /* reenable interrupts */ - bl .preempt_schedule - mfmsr r10 - clrrdi r9,r1,THREAD_SHIFT - rldicl r10,r10,48,1 /* disable interrupts again */ - rotldi r10,r10,16 - mtmsrd r10,1 - ld r4,TI_FLAGS(r9) - andi. r0,r4,_TIF_NEED_RESCHED - bne 1b - b restore - -user_work: -#endif - /* Enable interrupts */ - ori r10,r10,MSR_EE - mtmsrd r10,1 +#endif /* CONFIG_PPC_BOOK3E */ - andi. r0,r4,_TIF_NEED_RESCHED + /* + * We are returning to a context with interrupts soft disabled. + * + * However, we may also be about to hard enable, so we need to + * make sure that in this case, we also clear PACA_IRQ_HARD_DIS + * or that bit can get out of sync and bad things will happen + */ +restore_irq_off: + ld r3,_MSR(r1) + lbz r7,PACAIRQHAPPENED(r13) + andi. r0,r3,MSR_EE beq 1f - bl .schedule - b .ret_from_except_lite + rlwinm r7,r7,0,~PACA_IRQ_HARD_DIS + stb r7,PACAIRQHAPPENED(r13) +1: li r0,0 + stb r0,PACASOFTIRQEN(r13); + TRACE_DISABLE_INTS + b do_restore -1: bl .save_nvgprs - li r3,0 - addi r4,r1,STACK_FRAME_OVERHEAD - bl .do_signal - b .ret_from_except + /* + * Something did happen, check if a re-emit is needed + * (this also clears paca->irq_happened) + */ +restore_check_irq_replay: + /* XXX: We could implement a fast path here where we check + * for irq_happened being just 0x01, in which case we can + * clear it and return. 
That means that we would potentially + * miss a decrementer having wrapped all the way around. + * + * Still, this might be useful for things like hash_page + */ + bl __check_irq_replay + cmpwi cr0,r3,0 + beq restore_no_replay + + /* + * We need to re-emit an interrupt. We do so by re-using our + * existing exception frame. We first change the trap value, + * but we need to ensure we preserve the low nibble of it + */ + ld r4,_TRAP(r1) + clrldi r4,r4,60 + or r4,r4,r3 + std r4,_TRAP(r1) + /* + * Then find the right handler and call it. Interrupts are + * still soft-disabled and we keep them that way. + */ + cmpwi cr0,r3,0x500 + bne 1f + addi r3,r1,STACK_FRAME_OVERHEAD; + bl do_IRQ + b ret_from_except +1: cmpwi cr0,r3,0x900 + bne 1f + addi r3,r1,STACK_FRAME_OVERHEAD; + bl timer_interrupt + b ret_from_except +#ifdef CONFIG_PPC_DOORBELL +1: +#ifdef CONFIG_PPC_BOOK3E + cmpwi cr0,r3,0x280 +#else + BEGIN_FTR_SECTION + cmpwi cr0,r3,0xe80 + FTR_SECTION_ELSE + cmpwi cr0,r3,0xa00 + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) +#endif /* CONFIG_PPC_BOOK3E */ + bne 1f + addi r3,r1,STACK_FRAME_OVERHEAD; + bl doorbell_exception + b ret_from_except +#endif /* CONFIG_PPC_DOORBELL */ +1: b ret_from_except /* What else to do here ? */ + unrecov_restore: addi r3,r1,STACK_FRAME_OVERHEAD - bl .unrecoverable_exception + bl unrecoverable_exception b unrecov_restore #ifdef CONFIG_PPC_RTAS @@ -646,25 +980,28 @@ _GLOBAL(enter_rtas) std r7,_DAR(r1) mfdsisr r8 std r8,_DSISR(r1) - mfsrr0 r9 - std r9,_SRR0(r1) - mfsrr1 r10 - std r10,_SRR1(r1) + /* Temporary workaround to clear CR until RTAS can be modified to + * ignore all bits. + */ + li r0,0 + mtcr r0 + +#ifdef CONFIG_BUG /* There is no way it is acceptable to get here with interrupts enabled, * check it with the asm equivalent of WARN_ON */ - mfmsr r6 - andi. 
r0,r6,MSR_EE + lbz r0,PACASOFTIRQEN(r13) 1: tdnei r0,0 -.section __bug_table,"a" - .llong 1b,__LINE__ + 0x1000000, 1f, 2f -.previous -.section .rodata,"a" -1: .asciz __FILE__ -2: .asciz "enter_rtas" -.previous + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING +#endif + /* Hard-disable interrupts */ + mfmsr r6 + rldicl r7,r6,48,1 + rotldi r7,r7,16 + mtmsrd r7,1 + /* Unfortunately, the stack pointer and the MSR are also clobbered, * so they are saved in the PACA which allows us to restore * our original state after RTAS returns. @@ -673,9 +1010,8 @@ _GLOBAL(enter_rtas) std r6,PACASAVEDMSR(r13) /* Setup our real return addr */ - SET_REG_TO_LABEL(r4,.rtas_return_loc) - SET_REG_TO_CONST(r9,KERNELBASE) - sub r4,r4,r9 + LOAD_REG_ADDR(r4,rtas_return_loc) + clrldi r4,r4,2 /* convert to realmode address */ mtlr r4 li r0,0 @@ -684,13 +1020,12 @@ _GLOBAL(enter_rtas) li r9,1 rldicr r9,r9,MSR_SF_LG,(63-MSR_SF_LG) - ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP + ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI|MSR_LE andc r6,r0,r9 - ori r6,r6,MSR_RI sync /* disable interrupts so SRR0/1 */ mtmsrd r0 /* don't get trashed */ - SET_REG_TO_LABEL(r4,rtas) + LOAD_REG_ADDR(r4, rtas) ld r5,RTASENTRY(r4) /* get the rtas->entry value */ ld r4,RTASBASE(r4) /* get the rtas->base value */ @@ -699,11 +1034,16 @@ _GLOBAL(enter_rtas) rfid b . 
/* prevent speculative execution */ -_STATIC(rtas_return_loc) +rtas_return_loc: + FIXUP_ENDIAN + /* relocation is off at this point */ - mfspr r4,SPRN_SPRG3 /* Get PACA */ - SET_REG_TO_CONST(r5, KERNELBASE) - sub r4,r4,r5 /* RELOC the PACA base pointer */ + GET_PACA(r4) + clrldi r4,r4,2 /* convert to realmode address */ + + bcl 20,31,$+4 +0: mflr r3 + ld r3,(1f-0b)(r3) /* get &rtas_restore_regs */ mfmsr r6 li r0,MSR_RI @@ -712,7 +1052,6 @@ _STATIC(rtas_return_loc) mtmsrd r6 ld r1,PACAR1(r4) /* Restore our SP */ - LOADADDR(r3,.rtas_restore_regs) ld r4,PACASAVEDMSR(r4) /* Restore our MSR */ mtspr SPRN_SRR0,r3 @@ -720,14 +1059,17 @@ _STATIC(rtas_return_loc) rfid b . /* prevent speculative execution */ -_STATIC(rtas_restore_regs) + .align 3 +1: .llong rtas_restore_regs + +rtas_restore_regs: /* relocation is on at this point */ REST_GPR(2, r1) /* Restore the TOC */ REST_GPR(13, r1) /* Restore paca */ REST_8GPRS(14, r1) /* Restore the non-volatiles */ REST_10GPRS(22, r1) /* ditto */ - mfspr r13,SPRN_SPRG3 + GET_PACA(r13) ld r4,_CCR(r1) mtcr r4 @@ -739,10 +1081,6 @@ _STATIC(rtas_restore_regs) mtdar r7 ld r8,_DSISR(r1) mtdsisr r8 - ld r9,_SRR0(r1) - mtsrr0 r9 - ld r10,_SRR1(r1) - mtsrr1 r10 addi r1,r1,RTAS_FRAME_SIZE /* Unstack our frame */ ld r0,16(r1) /* get return address */ @@ -752,8 +1090,6 @@ _STATIC(rtas_restore_regs) #endif /* CONFIG_PPC_RTAS */ -#ifdef CONFIG_PPC_MULTIPLATFORM - _GLOBAL(enter_prom) mflr r0 std r0,16(r1) @@ -763,46 +1099,39 @@ _GLOBAL(enter_prom) * of all registers that it saves. We therefore save those registers * PROM might touch to the stack. 
(r0, r3-r13 are caller saved) */ - SAVE_8GPRS(2, r1) + SAVE_GPR(2, r1) SAVE_GPR(13, r1) SAVE_8GPRS(14, r1) SAVE_10GPRS(22, r1) - mfcr r4 - std r4,_CCR(r1) - mfctr r5 - std r5,_CTR(r1) - mfspr r6,SPRN_XER - std r6,_XER(r1) - mfdar r7 - std r7,_DAR(r1) - mfdsisr r8 - std r8,_DSISR(r1) - mfsrr0 r9 - std r9,_SRR0(r1) - mfsrr1 r10 - std r10,_SRR1(r1) + mfcr r10 mfmsr r11 + std r10,_CCR(r1) std r11,_MSR(r1) - /* Get the PROM entrypoint */ - ld r0,GPR4(r1) - mtlr r0 + /* Put PROM address in SRR0 */ + mtsrr0 r4 - /* Switch MSR to 32 bits mode + /* Setup our trampoline return addr in LR */ + bcl 20,31,$+4 +0: mflr r4 + addi r4,r4,(1f - 0b) + mtlr r4 + + /* Prepare a 32-bit mode big endian MSR */ - mfmsr r11 - li r12,1 - rldicr r12,r12,MSR_SF_LG,(63-MSR_SF_LG) - andc r11,r11,r12 - li r12,1 - rldicr r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG) - andc r11,r11,r12 - mtmsrd r11 - isync +#ifdef CONFIG_PPC_BOOK3E + rlwinm r11,r11,0,1,31 + mtsrr1 r11 + rfi +#else /* CONFIG_PPC_BOOK3E */ + LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE) + andc r11,r11,r12 + mtsrr1 r11 + rfid +#endif /* CONFIG_PPC_BOOK3E */ - /* Restore arguments & enter PROM here... 
*/ - ld r3,GPR3(r1) - blrl +1: /* Return from OF */ + FIXUP_ENDIAN /* Just make sure that r1 top 32 bits didn't get * corrupt by OF @@ -811,7 +1140,7 @@ _GLOBAL(enter_prom) /* Restore the MSR (back to 64 bits) */ ld r0,_MSR(r1) - mtmsrd r0 + MTMSRD(r0) isync /* Restore other registers */ @@ -821,22 +1150,139 @@ _GLOBAL(enter_prom) REST_10GPRS(22, r1) ld r4,_CCR(r1) mtcr r4 - ld r5,_CTR(r1) - mtctr r5 - ld r6,_XER(r1) - mtspr SPRN_XER,r6 - ld r7,_DAR(r1) - mtdar r7 - ld r8,_DSISR(r1) - mtdsisr r8 - ld r9,_SRR0(r1) - mtsrr0 r9 - ld r10,_SRR1(r1) - mtsrr1 r10 addi r1,r1,PROM_FRAME_SIZE ld r0,16(r1) mtlr r0 blr - -#endif /* CONFIG_PPC_MULTIPLATFORM */ + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE +_GLOBAL(mcount) +_GLOBAL(_mcount) + blr + +_GLOBAL_TOC(ftrace_caller) + /* Taken from output of objdump from lib64/glibc */ + mflr r3 + ld r11, 0(r1) + stdu r1, -112(r1) + std r3, 128(r1) + ld r4, 16(r11) + subi r3, r3, MCOUNT_INSN_SIZE +.globl ftrace_call +ftrace_call: + bl ftrace_stub + nop +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + b ftrace_graph_stub +_GLOBAL(ftrace_graph_stub) +#endif + ld r0, 128(r1) + mtlr r0 + addi r1, r1, 112 +_GLOBAL(ftrace_stub) + blr +#else +_GLOBAL_TOC(_mcount) + /* Taken from output of objdump from lib64/glibc */ + mflr r3 + ld r11, 0(r1) + stdu r1, -112(r1) + std r3, 128(r1) + ld r4, 16(r11) + + subi r3, r3, MCOUNT_INSN_SIZE + LOAD_REG_ADDR(r5,ftrace_trace_function) + ld r5,0(r5) + ld r5,0(r5) + mtctr r5 + bctrl + nop + + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + b ftrace_graph_caller +#endif + ld r0, 128(r1) + mtlr r0 + addi r1, r1, 112 +_GLOBAL(ftrace_stub) + blr + +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +_GLOBAL(ftrace_graph_caller) + /* load r4 with local address */ + ld r4, 128(r1) + subi r4, r4, MCOUNT_INSN_SIZE + + /* get the parent address */ + ld r11, 112(r1) + addi r3, r11, 16 + + bl prepare_ftrace_return + nop + + ld r0, 128(r1) + mtlr r0 + 
addi r1, r1, 112 + blr + +_GLOBAL(return_to_handler) + /* need to save return values */ + std r4, -24(r1) + std r3, -16(r1) + std r31, -8(r1) + mr r31, r1 + stdu r1, -112(r1) + + bl ftrace_return_to_handler + nop + + /* return value has real return address */ + mtlr r3 + + ld r1, 0(r1) + ld r4, -24(r1) + ld r3, -16(r1) + ld r31, -8(r1) + + /* Jump back to real return address */ + blr + +_GLOBAL(mod_return_to_handler) + /* need to save return values */ + std r4, -32(r1) + std r3, -24(r1) + /* save TOC */ + std r2, -16(r1) + std r31, -8(r1) + mr r31, r1 + stdu r1, -112(r1) + + /* + * We are in a module using the module's TOC. + * Switch to our TOC to run inside the core kernel. + */ + ld r2, PACATOC(r13) + + bl ftrace_return_to_handler + nop + + /* return value has real return address */ + mtlr r3 + + ld r1, 0(r1) + ld r4, -32(r1) + ld r3, -24(r1) + ld r2, -16(r1) + ld r31, -8(r1) + + /* Jump back to real return address */ + blr +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +#endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S new file mode 100644 index 00000000000..9f1ebf7338f --- /dev/null +++ b/arch/powerpc/kernel/epapr_hcalls.S @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2012 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/threads.h> +#include <asm/epapr_hcalls.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/thread_info.h> +#include <asm/ppc_asm.h> +#include <asm/asm-compat.h> +#include <asm/asm-offsets.h> + +#ifndef CONFIG_PPC64 +/* epapr_ev_idle() was derived from e500_idle() */ +_GLOBAL(epapr_ev_idle) + CURRENT_THREAD_INFO(r3, r1) + PPC_LL r4, TI_LOCAL_FLAGS(r3) /* set napping bit */ + ori r4, r4,_TLF_NAPPING /* so when we take an exception */ + PPC_STL r4, TI_LOCAL_FLAGS(r3) /* it will return to our caller */ + + wrteei 1 + +idle_loop: + LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE)) + +.global epapr_ev_idle_start +epapr_ev_idle_start: + li r3, -1 + nop + nop + nop + + /* + * Guard against spurious wakeups from a hypervisor -- + * only interrupt will cause us to return to LR due to + * _TLF_NAPPING. + */ + b idle_loop +#endif + +/* Hypercall entry point. Will be patched with device tree instructions. */ +.global epapr_hypercall_start +epapr_hypercall_start: + li r3, -1 + nop + nop + nop + blr diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c new file mode 100644 index 00000000000..59e4ba74975 --- /dev/null +++ b/arch/powerpc/kernel/epapr_paravirt.c @@ -0,0 +1,85 @@ +/* + * ePAPR para-virtualization support. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * Copyright (C) 2012 Freescale Semiconductor, Inc. + */ + +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <asm/epapr_hcalls.h> +#include <asm/cacheflush.h> +#include <asm/code-patching.h> +#include <asm/machdep.h> + +#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) +extern void epapr_ev_idle(void); +extern u32 epapr_ev_idle_start[]; +#endif + +bool epapr_paravirt_enabled; +static bool __maybe_unused epapr_has_idle; + +static int __init early_init_dt_scan_epapr(unsigned long node, + const char *uname, + int depth, void *data) +{ + const u32 *insts; + int len; + int i; + + insts = of_get_flat_dt_prop(node, "hcall-instructions", &len); + if (!insts) + return 0; + + if (len % 4 || len > (4 * 4)) + return -1; + + for (i = 0; i < (len / 4); i++) { + u32 inst = be32_to_cpu(insts[i]); + patch_instruction(epapr_hypercall_start + i, inst); +#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) + patch_instruction(epapr_ev_idle_start + i, inst); +#endif + } + +#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) + if (of_get_flat_dt_prop(node, "has-idle", NULL)) + epapr_has_idle = true; +#endif + + epapr_paravirt_enabled = true; + + return 1; +} + +int __init epapr_paravirt_early_init(void) +{ + of_scan_flat_dt(early_init_dt_scan_epapr, NULL); + + return 0; +} + +static int __init epapr_idle_init(void) +{ +#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) + if (epapr_has_idle) + ppc_md.power_save = epapr_ev_idle; +#endif + + return 0; +} + +postcore_initcall(epapr_idle_init); diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S new file mode 100644 index 00000000000..bb9cac6c805 --- /dev/null +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -0,0 +1,1650 @@ +/* + * Boot code and exception vectors for Book3E processors + * + * Copyright (C) 2007 Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/threads.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/cputable.h> +#include <asm/setup.h> +#include <asm/thread_info.h> +#include <asm/reg_a2.h> +#include <asm/exception-64e.h> +#include <asm/bug.h> +#include <asm/irqflags.h> +#include <asm/ptrace.h> +#include <asm/ppc-opcode.h> +#include <asm/mmu.h> +#include <asm/hw_irq.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_booke_hv_asm.h> + +/* XXX This will ultimately add space for a special exception save + * structure used to save things like SRR0/SRR1, SPRGs, MAS, etc... + * when taking special interrupts. For now we don't support that, + * special interrupts from within a non-standard level will probably + * blow you up + */ +#define SPECIAL_EXC_SRR0 0 +#define SPECIAL_EXC_SRR1 1 +#define SPECIAL_EXC_SPRG_GEN 2 +#define SPECIAL_EXC_SPRG_TLB 3 +#define SPECIAL_EXC_MAS0 4 +#define SPECIAL_EXC_MAS1 5 +#define SPECIAL_EXC_MAS2 6 +#define SPECIAL_EXC_MAS3 7 +#define SPECIAL_EXC_MAS6 8 +#define SPECIAL_EXC_MAS7 9 +#define SPECIAL_EXC_MAS5 10 /* E.HV only */ +#define SPECIAL_EXC_MAS8 11 /* E.HV only */ +#define SPECIAL_EXC_IRQHAPPENED 12 +#define SPECIAL_EXC_DEAR 13 +#define SPECIAL_EXC_ESR 14 +#define SPECIAL_EXC_SOFTE 15 +#define SPECIAL_EXC_CSRR0 16 +#define SPECIAL_EXC_CSRR1 17 +/* must be even to keep 16-byte stack alignment */ +#define SPECIAL_EXC_END 18 + +#define SPECIAL_EXC_FRAME_SIZE (INT_FRAME_SIZE + SPECIAL_EXC_END * 8) +#define SPECIAL_EXC_FRAME_OFFS (INT_FRAME_SIZE - 288) + +#define SPECIAL_EXC_STORE(reg, name) \ + std reg, (SPECIAL_EXC_##name * 8 + SPECIAL_EXC_FRAME_OFFS)(r1) + +#define SPECIAL_EXC_LOAD(reg, name) \ + ld reg, (SPECIAL_EXC_##name * 8 + 
SPECIAL_EXC_FRAME_OFFS)(r1) + +special_reg_save: + lbz r9,PACAIRQHAPPENED(r13) + RECONCILE_IRQ_STATE(r3,r4) + + /* + * We only need (or have stack space) to save this stuff if + * we interrupted the kernel. + */ + ld r3,_MSR(r1) + andi. r3,r3,MSR_PR + bnelr + + /* Copy info into temporary exception thread info */ + ld r11,PACAKSAVE(r13) + CURRENT_THREAD_INFO(r11, r11) + CURRENT_THREAD_INFO(r12, r1) + ld r10,TI_FLAGS(r11) + std r10,TI_FLAGS(r12) + ld r10,TI_PREEMPT(r11) + std r10,TI_PREEMPT(r12) + ld r10,TI_TASK(r11) + std r10,TI_TASK(r12) + + /* + * Advance to the next TLB exception frame for handler + * types that don't do it automatically. + */ + LOAD_REG_ADDR(r11,extlb_level_exc) + lwz r12,0(r11) + mfspr r10,SPRN_SPRG_TLB_EXFRAME + add r10,r10,r12 + mtspr SPRN_SPRG_TLB_EXFRAME,r10 + + /* + * Save registers needed to allow nesting of certain exceptions + * (such as TLB misses) inside special exception levels + */ + mfspr r10,SPRN_SRR0 + SPECIAL_EXC_STORE(r10,SRR0) + mfspr r10,SPRN_SRR1 + SPECIAL_EXC_STORE(r10,SRR1) + mfspr r10,SPRN_SPRG_GEN_SCRATCH + SPECIAL_EXC_STORE(r10,SPRG_GEN) + mfspr r10,SPRN_SPRG_TLB_SCRATCH + SPECIAL_EXC_STORE(r10,SPRG_TLB) + mfspr r10,SPRN_MAS0 + SPECIAL_EXC_STORE(r10,MAS0) + mfspr r10,SPRN_MAS1 + SPECIAL_EXC_STORE(r10,MAS1) + mfspr r10,SPRN_MAS2 + SPECIAL_EXC_STORE(r10,MAS2) + mfspr r10,SPRN_MAS3 + SPECIAL_EXC_STORE(r10,MAS3) + mfspr r10,SPRN_MAS6 + SPECIAL_EXC_STORE(r10,MAS6) + mfspr r10,SPRN_MAS7 + SPECIAL_EXC_STORE(r10,MAS7) +BEGIN_FTR_SECTION + mfspr r10,SPRN_MAS5 + SPECIAL_EXC_STORE(r10,MAS5) + mfspr r10,SPRN_MAS8 + SPECIAL_EXC_STORE(r10,MAS8) + + /* MAS5/8 could have inappropriate values if we interrupted KVM code */ + li r10,0 + mtspr SPRN_MAS5,r10 + mtspr SPRN_MAS8,r10 +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) + SPECIAL_EXC_STORE(r9,IRQHAPPENED) + + mfspr r10,SPRN_DEAR + SPECIAL_EXC_STORE(r10,DEAR) + mfspr r10,SPRN_ESR + SPECIAL_EXC_STORE(r10,ESR) + + lbz r10,PACASOFTIRQEN(r13) + SPECIAL_EXC_STORE(r10,SOFTE) + ld r10,_NIP(r1) + 
SPECIAL_EXC_STORE(r10,CSRR0) + ld r10,_MSR(r1) + SPECIAL_EXC_STORE(r10,CSRR1) + + blr + +ret_from_level_except: + ld r3,_MSR(r1) + andi. r3,r3,MSR_PR + beq 1f + b ret_from_except +1: + + LOAD_REG_ADDR(r11,extlb_level_exc) + lwz r12,0(r11) + mfspr r10,SPRN_SPRG_TLB_EXFRAME + sub r10,r10,r12 + mtspr SPRN_SPRG_TLB_EXFRAME,r10 + + /* + * It's possible that the special level exception interrupted a + * TLB miss handler, and inserted the same entry that the + * interrupted handler was about to insert. On CPUs without TLB + * write conditional, this can result in a duplicate TLB entry. + * Wipe all non-bolted entries to be safe. + * + * Note that this doesn't protect against any TLB misses + * we may take accessing the stack from here to the end of + * the special level exception. It's not clear how we can + * reasonably protect against that, but only CPUs with + * neither TLB write conditional nor bolted kernel memory + * are affected. Do any such CPUs even exist? + */ + PPC_TLBILX_ALL(0,R0) + + REST_NVGPRS(r1) + + SPECIAL_EXC_LOAD(r10,SRR0) + mtspr SPRN_SRR0,r10 + SPECIAL_EXC_LOAD(r10,SRR1) + mtspr SPRN_SRR1,r10 + SPECIAL_EXC_LOAD(r10,SPRG_GEN) + mtspr SPRN_SPRG_GEN_SCRATCH,r10 + SPECIAL_EXC_LOAD(r10,SPRG_TLB) + mtspr SPRN_SPRG_TLB_SCRATCH,r10 + SPECIAL_EXC_LOAD(r10,MAS0) + mtspr SPRN_MAS0,r10 + SPECIAL_EXC_LOAD(r10,MAS1) + mtspr SPRN_MAS1,r10 + SPECIAL_EXC_LOAD(r10,MAS2) + mtspr SPRN_MAS2,r10 + SPECIAL_EXC_LOAD(r10,MAS3) + mtspr SPRN_MAS3,r10 + SPECIAL_EXC_LOAD(r10,MAS6) + mtspr SPRN_MAS6,r10 + SPECIAL_EXC_LOAD(r10,MAS7) + mtspr SPRN_MAS7,r10 +BEGIN_FTR_SECTION + SPECIAL_EXC_LOAD(r10,MAS5) + mtspr SPRN_MAS5,r10 + SPECIAL_EXC_LOAD(r10,MAS8) + mtspr SPRN_MAS8,r10 +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) + + lbz r6,PACASOFTIRQEN(r13) + ld r5,SOFTE(r1) + + /* Interrupts had better not already be enabled... 
*/ + twnei r6,0 + + cmpwi cr0,r5,0 + beq 1f + + TRACE_ENABLE_INTS + stb r5,PACASOFTIRQEN(r13) +1: + /* + * Restore PACAIRQHAPPENED rather than setting it based on + * the return MSR[EE], since we could have interrupted + * __check_irq_replay() or other inconsistent transitory + * states that must remain that way. + */ + SPECIAL_EXC_LOAD(r10,IRQHAPPENED) + stb r10,PACAIRQHAPPENED(r13) + + SPECIAL_EXC_LOAD(r10,DEAR) + mtspr SPRN_DEAR,r10 + SPECIAL_EXC_LOAD(r10,ESR) + mtspr SPRN_ESR,r10 + + stdcx. r0,0,r1 /* to clear the reservation */ + + REST_4GPRS(2, r1) + REST_4GPRS(6, r1) + + ld r10,_CTR(r1) + ld r11,_XER(r1) + mtctr r10 + mtxer r11 + + blr + +.macro ret_from_level srr0 srr1 paca_ex scratch + bl ret_from_level_except + + ld r10,_LINK(r1) + ld r11,_CCR(r1) + ld r0,GPR13(r1) + mtlr r10 + mtcr r11 + + ld r10,GPR10(r1) + ld r11,GPR11(r1) + ld r12,GPR12(r1) + mtspr \scratch,r0 + + std r10,\paca_ex+EX_R10(r13); + std r11,\paca_ex+EX_R11(r13); + ld r10,_NIP(r1) + ld r11,_MSR(r1) + ld r0,GPR0(r1) + ld r1,GPR1(r1) + mtspr \srr0,r10 + mtspr \srr1,r11 + ld r10,\paca_ex+EX_R10(r13) + ld r11,\paca_ex+EX_R11(r13) + mfspr r13,\scratch +.endm + +ret_from_crit_except: + ret_from_level SPRN_CSRR0 SPRN_CSRR1 PACA_EXCRIT SPRN_SPRG_CRIT_SCRATCH + rfci + +ret_from_mc_except: + ret_from_level SPRN_MCSRR0 SPRN_MCSRR1 PACA_EXMC SPRN_SPRG_MC_SCRATCH + rfmci + +/* Exception prolog code for all exceptions */ +#define EXCEPTION_PROLOG(n, intnum, type, addition) \ + mtspr SPRN_SPRG_##type##_SCRATCH,r13; /* get spare registers */ \ + mfspr r13,SPRN_SPRG_PACA; /* get PACA */ \ + std r10,PACA_EX##type+EX_R10(r13); \ + std r11,PACA_EX##type+EX_R11(r13); \ + mfcr r10; /* save CR */ \ + mfspr r11,SPRN_##type##_SRR1;/* what are we coming from */ \ + DO_KVM intnum,SPRN_##type##_SRR1; /* KVM hook */ \ + stw r10,PACA_EX##type+EX_CR(r13); /* save old CR in the PACA */ \ + addition; /* additional code for that exc. 
*/ \ + std r1,PACA_EX##type+EX_R1(r13); /* save old r1 in the PACA */ \ + type##_SET_KSTACK; /* get special stack if necessary */\ + andi. r10,r11,MSR_PR; /* save stack pointer */ \ + beq 1f; /* branch around if supervisor */ \ + ld r1,PACAKSAVE(r13); /* get kernel stack coming from usr */\ +1: cmpdi cr1,r1,0; /* check if SP makes sense */ \ + bge- cr1,exc_##n##_bad_stack;/* bad stack (TODO: out of line) */ \ + mfspr r10,SPRN_##type##_SRR0; /* read SRR0 before touching stack */ + +/* Exception type-specific macros */ +#define GEN_SET_KSTACK \ + subi r1,r1,INT_FRAME_SIZE; /* alloc frame on kernel stack */ +#define SPRN_GEN_SRR0 SPRN_SRR0 +#define SPRN_GEN_SRR1 SPRN_SRR1 + +#define GDBELL_SET_KSTACK GEN_SET_KSTACK +#define SPRN_GDBELL_SRR0 SPRN_GSRR0 +#define SPRN_GDBELL_SRR1 SPRN_GSRR1 + +#define CRIT_SET_KSTACK \ + ld r1,PACA_CRIT_STACK(r13); \ + subi r1,r1,SPECIAL_EXC_FRAME_SIZE +#define SPRN_CRIT_SRR0 SPRN_CSRR0 +#define SPRN_CRIT_SRR1 SPRN_CSRR1 + +#define DBG_SET_KSTACK \ + ld r1,PACA_DBG_STACK(r13); \ + subi r1,r1,SPECIAL_EXC_FRAME_SIZE +#define SPRN_DBG_SRR0 SPRN_DSRR0 +#define SPRN_DBG_SRR1 SPRN_DSRR1 + +#define MC_SET_KSTACK \ + ld r1,PACA_MC_STACK(r13); \ + subi r1,r1,SPECIAL_EXC_FRAME_SIZE +#define SPRN_MC_SRR0 SPRN_MCSRR0 +#define SPRN_MC_SRR1 SPRN_MCSRR1 + +#define NORMAL_EXCEPTION_PROLOG(n, intnum, addition) \ + EXCEPTION_PROLOG(n, intnum, GEN, addition##_GEN(n)) + +#define CRIT_EXCEPTION_PROLOG(n, intnum, addition) \ + EXCEPTION_PROLOG(n, intnum, CRIT, addition##_CRIT(n)) + +#define DBG_EXCEPTION_PROLOG(n, intnum, addition) \ + EXCEPTION_PROLOG(n, intnum, DBG, addition##_DBG(n)) + +#define MC_EXCEPTION_PROLOG(n, intnum, addition) \ + EXCEPTION_PROLOG(n, intnum, MC, addition##_MC(n)) + +#define GDBELL_EXCEPTION_PROLOG(n, intnum, addition) \ + EXCEPTION_PROLOG(n, intnum, GDBELL, addition##_GDBELL(n)) + +/* Variants of the "addition" argument for the prolog + */ +#define PROLOG_ADDITION_NONE_GEN(n) +#define PROLOG_ADDITION_NONE_GDBELL(n) +#define 
PROLOG_ADDITION_NONE_CRIT(n) +#define PROLOG_ADDITION_NONE_DBG(n) +#define PROLOG_ADDITION_NONE_MC(n) + +#define PROLOG_ADDITION_MASKABLE_GEN(n) \ + lbz r10,PACASOFTIRQEN(r13); /* are irqs soft-disabled ? */ \ + cmpwi cr0,r10,0; /* yes -> go out of line */ \ + beq masked_interrupt_book3e_##n + +#define PROLOG_ADDITION_2REGS_GEN(n) \ + std r14,PACA_EXGEN+EX_R14(r13); \ + std r15,PACA_EXGEN+EX_R15(r13) + +#define PROLOG_ADDITION_1REG_GEN(n) \ + std r14,PACA_EXGEN+EX_R14(r13); + +#define PROLOG_ADDITION_2REGS_CRIT(n) \ + std r14,PACA_EXCRIT+EX_R14(r13); \ + std r15,PACA_EXCRIT+EX_R15(r13) + +#define PROLOG_ADDITION_2REGS_DBG(n) \ + std r14,PACA_EXDBG+EX_R14(r13); \ + std r15,PACA_EXDBG+EX_R15(r13) + +#define PROLOG_ADDITION_2REGS_MC(n) \ + std r14,PACA_EXMC+EX_R14(r13); \ + std r15,PACA_EXMC+EX_R15(r13) + + +/* Core exception code for all exceptions except TLB misses. */ +#define EXCEPTION_COMMON_LVL(n, scratch, excf) \ +exc_##n##_common: \ + std r0,GPR0(r1); /* save r0 in stackframe */ \ + std r2,GPR2(r1); /* save r2 in stackframe */ \ + SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ + SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ + std r9,GPR9(r1); /* save r9 in stackframe */ \ + std r10,_NIP(r1); /* save SRR0 to stackframe */ \ + std r11,_MSR(r1); /* save SRR1 to stackframe */ \ + beq 2f; /* if from kernel mode */ \ + ACCOUNT_CPU_USER_ENTRY(r10,r11);/* accounting (uses cr0+eq) */ \ +2: ld r3,excf+EX_R10(r13); /* get back r10 */ \ + ld r4,excf+EX_R11(r13); /* get back r11 */ \ + mfspr r5,scratch; /* get back r13 */ \ + std r12,GPR12(r1); /* save r12 in stackframe */ \ + ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \ + mflr r6; /* save LR in stackframe */ \ + mfctr r7; /* save CTR in stackframe */ \ + mfspr r8,SPRN_XER; /* save XER in stackframe */ \ + ld r9,excf+EX_R1(r13); /* load orig r1 back from PACA */ \ + lwz r10,excf+EX_CR(r13); /* load orig CR back from PACA */ \ + lbz r11,PACASOFTIRQEN(r13); /* get current IRQ softe */ \ + ld 
r12,exception_marker@toc(r2); \ + li r0,0; \ + std r3,GPR10(r1); /* save r10 to stackframe */ \ + std r4,GPR11(r1); /* save r11 to stackframe */ \ + std r5,GPR13(r1); /* save it to stackframe */ \ + std r6,_LINK(r1); \ + std r7,_CTR(r1); \ + std r8,_XER(r1); \ + li r3,(n)+1; /* indicate partial regs in trap */ \ + std r9,0(r1); /* store stack frame back link */ \ + std r10,_CCR(r1); /* store orig CR in stackframe */ \ + std r9,GPR1(r1); /* store stack frame back link */ \ + std r11,SOFTE(r1); /* and save it to stackframe */ \ + std r12,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ \ + std r3,_TRAP(r1); /* set trap number */ \ + std r0,RESULT(r1); /* clear regs->result */ + +#define EXCEPTION_COMMON(n) \ + EXCEPTION_COMMON_LVL(n, SPRN_SPRG_GEN_SCRATCH, PACA_EXGEN) +#define EXCEPTION_COMMON_CRIT(n) \ + EXCEPTION_COMMON_LVL(n, SPRN_SPRG_CRIT_SCRATCH, PACA_EXCRIT) +#define EXCEPTION_COMMON_MC(n) \ + EXCEPTION_COMMON_LVL(n, SPRN_SPRG_MC_SCRATCH, PACA_EXMC) +#define EXCEPTION_COMMON_DBG(n) \ + EXCEPTION_COMMON_LVL(n, SPRN_SPRG_DBG_SCRATCH, PACA_EXDBG) + +/* + * This is meant for exceptions that don't immediately hard-enable. We + * set a bit in paca->irq_happened to ensure that a subsequent call to + * arch_local_irq_restore() will properly hard-enable and avoid the + * fast-path, and then reconcile irq state. + */ +#define INTS_DISABLE RECONCILE_IRQ_STATE(r3,r4) + +/* + * This is called by exceptions that don't use INTS_DISABLE (that did not + * touch irq indicators in the PACA). 
This will restore MSR:EE to its + * previous value + * + * XXX In the long run, we may want to open-code it in order to separate the + * load from the wrtee, thus limiting the latency caused by the dependency + * but at this point, I'll favor code clarity until we have a near to final + * implementation + */ +#define INTS_RESTORE_HARD \ + ld r11,_MSR(r1); \ + wrtee r11; + +/* XXX FIXME: Restore r14/r15 when necessary */ +#define BAD_STACK_TRAMPOLINE(n) \ +exc_##n##_bad_stack: \ + li r1,(n); /* get exception number */ \ + sth r1,PACA_TRAP_SAVE(r13); /* store trap */ \ + b bad_stack_book3e; /* bad stack error */ + +/* WARNING: If you change the layout of this stub, make sure you check + * the debug exception handler which handles single stepping + * into exceptions from userspace, and the MM code in + * arch/powerpc/mm/tlb_nohash.c which patches the branch here + * and would need to be updated if that branch is moved + */ +#define EXCEPTION_STUB(loc, label) \ + . = interrupt_base_book3e + loc; \ + nop; /* To make debug interrupts happy */ \ + b exc_##label##_book3e; + +#define ACK_NONE(r) +#define ACK_DEC(r) \ + lis r,TSR_DIS@h; \ + mtspr SPRN_TSR,r +#define ACK_FIT(r) \ + lis r,TSR_FIS@h; \ + mtspr SPRN_TSR,r + +/* Used by asynchronous interrupt that may happen in the idle loop. + * + * This checks if the thread was in the idle loop, and if yes, returns + * to the caller rather than the PC. This is to avoid a race if + * interrupts happen before the wait instruction. + */ +#define CHECK_NAPPING() \ + CURRENT_THREAD_INFO(r11, r1); \ + ld r10,TI_LOCAL_FLAGS(r11); \ + andi. 
r9,r10,_TLF_NAPPING; \ + beq+ 1f; \ + ld r8,_LINK(r1); \ + rlwinm r7,r10,0,~_TLF_NAPPING; \ + std r8,_NIP(r1); \ + std r7,TI_LOCAL_FLAGS(r11); \ +1: + + +#define MASKABLE_EXCEPTION(trapnum, intnum, label, hdlr, ack) \ + START_EXCEPTION(label); \ + NORMAL_EXCEPTION_PROLOG(trapnum, intnum, PROLOG_ADDITION_MASKABLE)\ + EXCEPTION_COMMON(trapnum) \ + INTS_DISABLE; \ + ack(r8); \ + CHECK_NAPPING(); \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + bl hdlr; \ + b ret_from_except_lite; + +/* This value is used to mark exception frames on the stack. */ + .section ".toc","aw" +exception_marker: + .tc ID_EXC_MARKER[TC],STACK_FRAME_REGS_MARKER + + +/* + * And here we have the exception vectors ! + */ + + .text + .balign 0x1000 + .globl interrupt_base_book3e +interrupt_base_book3e: /* fake trap */ + EXCEPTION_STUB(0x000, machine_check) + EXCEPTION_STUB(0x020, critical_input) /* 0x0100 */ + EXCEPTION_STUB(0x040, debug_crit) /* 0x0d00 */ + EXCEPTION_STUB(0x060, data_storage) /* 0x0300 */ + EXCEPTION_STUB(0x080, instruction_storage) /* 0x0400 */ + EXCEPTION_STUB(0x0a0, external_input) /* 0x0500 */ + EXCEPTION_STUB(0x0c0, alignment) /* 0x0600 */ + EXCEPTION_STUB(0x0e0, program) /* 0x0700 */ + EXCEPTION_STUB(0x100, fp_unavailable) /* 0x0800 */ + EXCEPTION_STUB(0x120, system_call) /* 0x0c00 */ + EXCEPTION_STUB(0x140, ap_unavailable) /* 0x0f20 */ + EXCEPTION_STUB(0x160, decrementer) /* 0x0900 */ + EXCEPTION_STUB(0x180, fixed_interval) /* 0x0980 */ + EXCEPTION_STUB(0x1a0, watchdog) /* 0x09f0 */ + EXCEPTION_STUB(0x1c0, data_tlb_miss) + EXCEPTION_STUB(0x1e0, instruction_tlb_miss) + EXCEPTION_STUB(0x200, altivec_unavailable) + EXCEPTION_STUB(0x220, altivec_assist) + EXCEPTION_STUB(0x260, perfmon) + EXCEPTION_STUB(0x280, doorbell) + EXCEPTION_STUB(0x2a0, doorbell_crit) + EXCEPTION_STUB(0x2c0, guest_doorbell) + EXCEPTION_STUB(0x2e0, guest_doorbell_crit) + EXCEPTION_STUB(0x300, hypercall) + EXCEPTION_STUB(0x320, ehpriv) + EXCEPTION_STUB(0x340, lrat_error) + + .globl interrupt_end_book3e 
+interrupt_end_book3e: + +/* Critical Input Interrupt */ + START_EXCEPTION(critical_input); + CRIT_EXCEPTION_PROLOG(0x100, BOOKE_INTERRUPT_CRITICAL, + PROLOG_ADDITION_NONE) + EXCEPTION_COMMON_CRIT(0x100) + bl save_nvgprs + bl special_reg_save + CHECK_NAPPING(); + addi r3,r1,STACK_FRAME_OVERHEAD + bl unknown_exception + b ret_from_crit_except + +/* Machine Check Interrupt */ + START_EXCEPTION(machine_check); + MC_EXCEPTION_PROLOG(0x000, BOOKE_INTERRUPT_MACHINE_CHECK, + PROLOG_ADDITION_NONE) + EXCEPTION_COMMON_MC(0x000) + bl save_nvgprs + bl special_reg_save + CHECK_NAPPING(); + addi r3,r1,STACK_FRAME_OVERHEAD + bl machine_check_exception + b ret_from_mc_except + +/* Data Storage Interrupt */ + START_EXCEPTION(data_storage) + NORMAL_EXCEPTION_PROLOG(0x300, BOOKE_INTERRUPT_DATA_STORAGE, + PROLOG_ADDITION_2REGS) + mfspr r14,SPRN_DEAR + mfspr r15,SPRN_ESR + EXCEPTION_COMMON(0x300) + INTS_DISABLE + b storage_fault_common + +/* Instruction Storage Interrupt */ + START_EXCEPTION(instruction_storage); + NORMAL_EXCEPTION_PROLOG(0x400, BOOKE_INTERRUPT_INST_STORAGE, + PROLOG_ADDITION_2REGS) + li r15,0 + mr r14,r10 + EXCEPTION_COMMON(0x400) + INTS_DISABLE + b storage_fault_common + +/* External Input Interrupt */ + MASKABLE_EXCEPTION(0x500, BOOKE_INTERRUPT_EXTERNAL, + external_input, do_IRQ, ACK_NONE) + +/* Alignment */ + START_EXCEPTION(alignment); + NORMAL_EXCEPTION_PROLOG(0x600, BOOKE_INTERRUPT_ALIGNMENT, + PROLOG_ADDITION_2REGS) + mfspr r14,SPRN_DEAR + mfspr r15,SPRN_ESR + EXCEPTION_COMMON(0x600) + b alignment_more /* no room, go out of line */ + +/* Program Interrupt */ + START_EXCEPTION(program); + NORMAL_EXCEPTION_PROLOG(0x700, BOOKE_INTERRUPT_PROGRAM, + PROLOG_ADDITION_1REG) + mfspr r14,SPRN_ESR + EXCEPTION_COMMON(0x700) + INTS_DISABLE + std r14,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + ld r14,PACA_EXGEN+EX_R14(r13) + bl save_nvgprs + bl program_check_exception + b ret_from_except + +/* Floating Point Unavailable Interrupt */ + START_EXCEPTION(fp_unavailable); + 
NORMAL_EXCEPTION_PROLOG(0x800, BOOKE_INTERRUPT_FP_UNAVAIL, + PROLOG_ADDITION_NONE) + /* we can probably do a shorter exception entry for that one... */ + EXCEPTION_COMMON(0x800) + ld r12,_MSR(r1) + andi. r0,r12,MSR_PR; + beq- 1f + bl load_up_fpu + b fast_exception_return +1: INTS_DISABLE + bl save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + bl kernel_fp_unavailable_exception + b ret_from_except + +/* Altivec Unavailable Interrupt */ + START_EXCEPTION(altivec_unavailable); + NORMAL_EXCEPTION_PROLOG(0x200, BOOKE_INTERRUPT_SPE_ALTIVEC_UNAVAIL, + PROLOG_ADDITION_NONE) + /* we can probably do a shorter exception entry for that one... */ + EXCEPTION_COMMON(0x200) +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + ld r12,_MSR(r1) + andi. r0,r12,MSR_PR; + beq- 1f + bl load_up_altivec + b fast_exception_return +1: +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif + INTS_DISABLE + bl save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + bl altivec_unavailable_exception + b ret_from_except + +/* AltiVec Assist */ + START_EXCEPTION(altivec_assist); + NORMAL_EXCEPTION_PROLOG(0x220, + BOOKE_INTERRUPT_SPE_FP_DATA_ALTIVEC_ASSIST, + PROLOG_ADDITION_NONE) + EXCEPTION_COMMON(0x220) + INTS_DISABLE + bl save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + bl altivec_assist_exception +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#else + bl unknown_exception +#endif + b ret_from_except + + +/* Decrementer Interrupt */ + MASKABLE_EXCEPTION(0x900, BOOKE_INTERRUPT_DECREMENTER, + decrementer, timer_interrupt, ACK_DEC) + +/* Fixed Interval Timer Interrupt */ + MASKABLE_EXCEPTION(0x980, BOOKE_INTERRUPT_FIT, + fixed_interval, unknown_exception, ACK_FIT) + +/* Watchdog Timer Interrupt */ + START_EXCEPTION(watchdog); + CRIT_EXCEPTION_PROLOG(0x9f0, BOOKE_INTERRUPT_WATCHDOG, + PROLOG_ADDITION_NONE) + EXCEPTION_COMMON_CRIT(0x9f0) + bl save_nvgprs + bl special_reg_save + CHECK_NAPPING(); + addi r3,r1,STACK_FRAME_OVERHEAD +#ifdef CONFIG_BOOKE_WDT + bl WatchdogException +#else + bl 
unknown_exception +#endif + b ret_from_crit_except + +/* System Call Interrupt */ + START_EXCEPTION(system_call) + mr r9,r13 /* keep a copy of userland r13 */ + mfspr r11,SPRN_SRR0 /* get return address */ + mfspr r12,SPRN_SRR1 /* get previous MSR */ + mfspr r13,SPRN_SPRG_PACA /* get our PACA */ + b system_call_common + +/* Auxiliary Processor Unavailable Interrupt */ + START_EXCEPTION(ap_unavailable); + NORMAL_EXCEPTION_PROLOG(0xf20, BOOKE_INTERRUPT_AP_UNAVAIL, + PROLOG_ADDITION_NONE) + EXCEPTION_COMMON(0xf20) + INTS_DISABLE + bl save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + bl unknown_exception + b ret_from_except + +/* Debug exception as a critical interrupt*/ + START_EXCEPTION(debug_crit); + CRIT_EXCEPTION_PROLOG(0xd00, BOOKE_INTERRUPT_DEBUG, + PROLOG_ADDITION_2REGS) + + /* + * If there is a single step or branch-taken exception in an + * exception entry sequence, it was probably meant to apply to + * the code where the exception occurred (since exception entry + * doesn't turn off DE automatically). We simulate the effect + * of turning off DE on entry to an exception handler by turning + * off DE in the CSRR1 value and clearing the debug status. + */ + + mfspr r14,SPRN_DBSR /* check single-step/branch taken */ + andis. r15,r14,(DBSR_IC|DBSR_BT)@h + beq+ 1f + + LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e) + LOAD_REG_IMMEDIATE(r15,interrupt_end_book3e) + cmpld cr0,r10,r14 + cmpld cr1,r10,r15 + blt+ cr0,1f + bge+ cr1,1f + + /* here it looks like we got an inappropriate debug exception. 
*/ + lis r14,(DBSR_IC|DBSR_BT)@h /* clear the event */ + rlwinm r11,r11,0,~MSR_DE /* clear DE in the CSRR1 value */ + mtspr SPRN_DBSR,r14 + mtspr SPRN_CSRR1,r11 + lwz r10,PACA_EXCRIT+EX_CR(r13) /* restore registers */ + ld r1,PACA_EXCRIT+EX_R1(r13) + ld r14,PACA_EXCRIT+EX_R14(r13) + ld r15,PACA_EXCRIT+EX_R15(r13) + mtcr r10 + ld r10,PACA_EXCRIT+EX_R10(r13) /* restore registers */ + ld r11,PACA_EXCRIT+EX_R11(r13) + mfspr r13,SPRN_SPRG_CRIT_SCRATCH + rfci + + /* Normal debug exception */ + /* XXX We only handle coming from userspace for now since we can't + * quite save properly an interrupted kernel state yet + */ +1: andi. r14,r11,MSR_PR; /* check for userspace again */ + beq kernel_dbg_exc; /* if from kernel mode */ + + /* Now we mash up things to make it look like we are coming on a + * normal exception + */ + mfspr r14,SPRN_DBSR + EXCEPTION_COMMON_CRIT(0xd00) + std r14,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + mr r4,r14 + ld r14,PACA_EXCRIT+EX_R14(r13) + ld r15,PACA_EXCRIT+EX_R15(r13) + bl save_nvgprs + bl DebugException + b ret_from_except + +kernel_dbg_exc: + b . /* NYI */ + +/* Debug exception as a debug interrupt*/ + START_EXCEPTION(debug_debug); + DBG_EXCEPTION_PROLOG(0xd00, BOOKE_INTERRUPT_DEBUG, + PROLOG_ADDITION_2REGS) + + /* + * If there is a single step or branch-taken exception in an + * exception entry sequence, it was probably meant to apply to + * the code where the exception occurred (since exception entry + * doesn't turn off DE automatically). We simulate the effect + * of turning off DE on entry to an exception handler by turning + * off DE in the DSRR1 value and clearing the debug status. + */ + + mfspr r14,SPRN_DBSR /* check single-step/branch taken */ + andis. r15,r14,(DBSR_IC|DBSR_BT)@h + beq+ 1f + + LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e) + LOAD_REG_IMMEDIATE(r15,interrupt_end_book3e) + cmpld cr0,r10,r14 + cmpld cr1,r10,r15 + blt+ cr0,1f + bge+ cr1,1f + + /* here it looks like we got an inappropriate debug exception. 
*/ + lis r14,(DBSR_IC|DBSR_BT)@h /* clear the event */ + rlwinm r11,r11,0,~MSR_DE /* clear DE in the DSRR1 value */ + mtspr SPRN_DBSR,r14 + mtspr SPRN_DSRR1,r11 + lwz r10,PACA_EXDBG+EX_CR(r13) /* restore registers */ + ld r1,PACA_EXDBG+EX_R1(r13) + ld r14,PACA_EXDBG+EX_R14(r13) + ld r15,PACA_EXDBG+EX_R15(r13) + mtcr r10 + ld r10,PACA_EXDBG+EX_R10(r13) /* restore registers */ + ld r11,PACA_EXDBG+EX_R11(r13) + mfspr r13,SPRN_SPRG_DBG_SCRATCH + rfdi + + /* Normal debug exception */ + /* XXX We only handle coming from userspace for now since we can't + * quite save properly an interrupted kernel state yet + */ +1: andi. r14,r11,MSR_PR; /* check for userspace again */ + beq kernel_dbg_exc; /* if from kernel mode */ + + /* Now we mash up things to make it look like we are coming on a + * normal exception + */ + mfspr r14,SPRN_DBSR + EXCEPTION_COMMON_DBG(0xd08) + INTS_DISABLE + std r14,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + mr r4,r14 + ld r14,PACA_EXDBG+EX_R14(r13) + ld r15,PACA_EXDBG+EX_R15(r13) + bl save_nvgprs + bl DebugException + b ret_from_except + + START_EXCEPTION(perfmon); + NORMAL_EXCEPTION_PROLOG(0x260, BOOKE_INTERRUPT_PERFORMANCE_MONITOR, + PROLOG_ADDITION_NONE) + EXCEPTION_COMMON(0x260) + INTS_DISABLE + CHECK_NAPPING() + addi r3,r1,STACK_FRAME_OVERHEAD + bl performance_monitor_exception + b ret_from_except_lite + +/* Doorbell interrupt */ + MASKABLE_EXCEPTION(0x280, BOOKE_INTERRUPT_DOORBELL, + doorbell, doorbell_exception, ACK_NONE) + +/* Doorbell critical Interrupt */ + START_EXCEPTION(doorbell_crit); + CRIT_EXCEPTION_PROLOG(0x2a0, BOOKE_INTERRUPT_DOORBELL_CRITICAL, + PROLOG_ADDITION_NONE) + EXCEPTION_COMMON_CRIT(0x2a0) + bl save_nvgprs + bl special_reg_save + CHECK_NAPPING(); + addi r3,r1,STACK_FRAME_OVERHEAD + bl unknown_exception + b ret_from_crit_except + +/* + * Guest doorbell interrupt + * This general exception use GSRRx save/restore registers + */ + START_EXCEPTION(guest_doorbell); + GDBELL_EXCEPTION_PROLOG(0x2c0, BOOKE_INTERRUPT_GUEST_DBELL, 
+ PROLOG_ADDITION_NONE) + EXCEPTION_COMMON(0x2c0) + addi r3,r1,STACK_FRAME_OVERHEAD + bl save_nvgprs + INTS_RESTORE_HARD + bl unknown_exception + b ret_from_except + +/* Guest Doorbell critical Interrupt */ + START_EXCEPTION(guest_doorbell_crit); + CRIT_EXCEPTION_PROLOG(0x2e0, BOOKE_INTERRUPT_GUEST_DBELL_CRIT, + PROLOG_ADDITION_NONE) + EXCEPTION_COMMON_CRIT(0x2e0) + bl save_nvgprs + bl special_reg_save + CHECK_NAPPING(); + addi r3,r1,STACK_FRAME_OVERHEAD + bl unknown_exception + b ret_from_crit_except + +/* Hypervisor call */ + START_EXCEPTION(hypercall); + NORMAL_EXCEPTION_PROLOG(0x310, BOOKE_INTERRUPT_HV_SYSCALL, + PROLOG_ADDITION_NONE) + EXCEPTION_COMMON(0x310) + addi r3,r1,STACK_FRAME_OVERHEAD + bl save_nvgprs + INTS_RESTORE_HARD + bl unknown_exception + b ret_from_except + +/* Embedded Hypervisor privileged */ + START_EXCEPTION(ehpriv); + NORMAL_EXCEPTION_PROLOG(0x320, BOOKE_INTERRUPT_HV_PRIV, + PROLOG_ADDITION_NONE) + EXCEPTION_COMMON(0x320) + addi r3,r1,STACK_FRAME_OVERHEAD + bl save_nvgprs + INTS_RESTORE_HARD + bl unknown_exception + b ret_from_except + +/* LRAT Error interrupt: no dedicated handler, routed to unknown_exception */ + START_EXCEPTION(lrat_error); + NORMAL_EXCEPTION_PROLOG(0x340, BOOKE_INTERRUPT_LRAT_ERROR, + PROLOG_ADDITION_NONE) + EXCEPTION_COMMON(0x340) + addi r3,r1,STACK_FRAME_OVERHEAD /* r3 = pt_regs pointer for the C handler */ + bl save_nvgprs /* non-dotted symbols, as in every other handler here */ + INTS_RESTORE_HARD + bl unknown_exception + b ret_from_except + +/* + * An interrupt came in while soft-disabled; We mark paca->irq_happened + * accordingly and if the interrupt is level sensitive, we hard disable + */ + +.macro masked_interrupt_book3e paca_irq full_mask + lbz r10,PACAIRQHAPPENED(r13) + ori r10,r10,\paca_irq + stb r10,PACAIRQHAPPENED(r13) + + .if \full_mask == 1 + rldicl r10,r11,48,1 /* clear MSR_EE */ + rotldi r11,r10,16 + mtspr SPRN_SRR1,r11 + .endif + + lwz r11,PACA_EXGEN+EX_CR(r13) + mtcr r11 + ld r10,PACA_EXGEN+EX_R10(r13) + ld r11,PACA_EXGEN+EX_R11(r13) + mfspr r13,SPRN_SPRG_GEN_SCRATCH + rfi + b . 
+.endm + +masked_interrupt_book3e_0x500: + // XXX When adding support for EPR, use PACA_IRQ_EE_EDGE + masked_interrupt_book3e PACA_IRQ_EE 1 + +masked_interrupt_book3e_0x900: + ACK_DEC(r10); + masked_interrupt_book3e PACA_IRQ_DEC 0 + +masked_interrupt_book3e_0x980: + ACK_FIT(r10); + masked_interrupt_book3e PACA_IRQ_DEC 0 + +masked_interrupt_book3e_0x280: +masked_interrupt_book3e_0x2c0: + masked_interrupt_book3e PACA_IRQ_DBELL 0 + +/* + * Called from arch_local_irq_enable when an interrupt needs + * to be resent. r3 contains either 0x500,0x900,0x260 or 0x280 + * to indicate the kind of interrupt. MSR:EE is already off. + * We generate a stackframe like if a real interrupt had happened. + * + * Note: While MSR:EE is off, we need to make sure that _MSR + * in the generated frame has EE set to 1 or the exception + * handler will not properly re-enable them. + */ +_GLOBAL(__replay_interrupt) + /* We are going to jump to the exception common code which + * will retrieve various register values from the PACA which + * we don't give a damn about. 
+ */ + mflr r10 + mfmsr r11 + mfcr r4 + mtspr SPRN_SPRG_GEN_SCRATCH,r13; + std r1,PACA_EXGEN+EX_R1(r13); + stw r4,PACA_EXGEN+EX_CR(r13); + ori r11,r11,MSR_EE + subi r1,r1,INT_FRAME_SIZE; + cmpwi cr0,r3,0x500 + beq exc_0x500_common + cmpwi cr0,r3,0x900 + beq exc_0x900_common + cmpwi cr0,r3,0x280 + beq exc_0x280_common + blr + + +/* + * This is called from 0x300 and 0x400 handlers after the prologs with + * r14 and r15 containing the fault address and error code, with the + * original values stashed away in the PACA + */ +storage_fault_common: + std r14,_DAR(r1) + std r15,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + mr r4,r14 + mr r5,r15 + ld r14,PACA_EXGEN+EX_R14(r13) + ld r15,PACA_EXGEN+EX_R15(r13) + bl do_page_fault + cmpdi r3,0 + bne- 1f + b ret_from_except_lite +1: bl save_nvgprs + mr r5,r3 + addi r3,r1,STACK_FRAME_OVERHEAD + ld r4,_DAR(r1) + bl bad_page_fault + b ret_from_except + +/* + * Alignment exception doesn't fit entirely in the 0x100 bytes so it + * continues here. + */ +alignment_more: + std r14,_DAR(r1) + std r15,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + ld r14,PACA_EXGEN+EX_R14(r13) + ld r15,PACA_EXGEN+EX_R15(r13) + bl save_nvgprs + INTS_RESTORE_HARD + bl alignment_exception + b ret_from_except + +/* + * We branch here from entry_64.S for the last stage of the exception + * return code path. MSR:EE is expected to be off at that point + */ +_GLOBAL(exception_return_book3e) + b 1f + +/* This is the return from load_up_fpu fast path which could do with + * less GPR restores in fact, but for now we have a single return path + */ + .globl fast_exception_return +fast_exception_return: + wrteei 0 +1: mr r0,r13 + ld r10,_MSR(r1) + REST_4GPRS(2, r1) + andi. r6,r10,MSR_PR + REST_2GPRS(6, r1) + beq 1f + ACCOUNT_CPU_USER_EXIT(r10, r11) + ld r0,GPR13(r1) + +1: stdcx. 
r0,0,r1 /* to clear the reservation */ + + ld r8,_CCR(r1) + ld r9,_LINK(r1) + ld r10,_CTR(r1) + ld r11,_XER(r1) + mtcr r8 + mtlr r9 + mtctr r10 + mtxer r11 + REST_2GPRS(8, r1) + ld r10,GPR10(r1) + ld r11,GPR11(r1) + ld r12,GPR12(r1) + mtspr SPRN_SPRG_GEN_SCRATCH,r0 + + std r10,PACA_EXGEN+EX_R10(r13); + std r11,PACA_EXGEN+EX_R11(r13); + ld r10,_NIP(r1) + ld r11,_MSR(r1) + ld r0,GPR0(r1) + ld r1,GPR1(r1) + mtspr SPRN_SRR0,r10 + mtspr SPRN_SRR1,r11 + ld r10,PACA_EXGEN+EX_R10(r13) + ld r11,PACA_EXGEN+EX_R11(r13) + mfspr r13,SPRN_SPRG_GEN_SCRATCH + rfi + +/* + * Trampolines used when spotting a bad kernel stack pointer in + * the exception entry code. + * + * TODO: move some bits like SRR0 read to trampoline, pass PACA + * index around, etc... to handle crit & mcheck + */ +BAD_STACK_TRAMPOLINE(0x000) +BAD_STACK_TRAMPOLINE(0x100) +BAD_STACK_TRAMPOLINE(0x200) +BAD_STACK_TRAMPOLINE(0x220) +BAD_STACK_TRAMPOLINE(0x260) +BAD_STACK_TRAMPOLINE(0x280) +BAD_STACK_TRAMPOLINE(0x2a0) +BAD_STACK_TRAMPOLINE(0x2c0) +BAD_STACK_TRAMPOLINE(0x2e0) +BAD_STACK_TRAMPOLINE(0x300) +BAD_STACK_TRAMPOLINE(0x310) +BAD_STACK_TRAMPOLINE(0x320) +BAD_STACK_TRAMPOLINE(0x340) +BAD_STACK_TRAMPOLINE(0x400) +BAD_STACK_TRAMPOLINE(0x500) +BAD_STACK_TRAMPOLINE(0x600) +BAD_STACK_TRAMPOLINE(0x700) +BAD_STACK_TRAMPOLINE(0x800) +BAD_STACK_TRAMPOLINE(0x900) +BAD_STACK_TRAMPOLINE(0x980) +BAD_STACK_TRAMPOLINE(0x9f0) +BAD_STACK_TRAMPOLINE(0xa00) +BAD_STACK_TRAMPOLINE(0xb00) +BAD_STACK_TRAMPOLINE(0xc00) +BAD_STACK_TRAMPOLINE(0xd00) +BAD_STACK_TRAMPOLINE(0xd08) +BAD_STACK_TRAMPOLINE(0xe00) +BAD_STACK_TRAMPOLINE(0xf00) +BAD_STACK_TRAMPOLINE(0xf20) + + .globl bad_stack_book3e +bad_stack_book3e: + /* XXX: Needs to make SPRN_SPRG_GEN depend on exception type */ + mfspr r10,SPRN_SRR0; /* read SRR0 before touching stack */ + ld r1,PACAEMERGSP(r13) + subi r1,r1,64+INT_FRAME_SIZE + std r10,_NIP(r1) + std r11,_MSR(r1) + ld r10,PACA_EXGEN+EX_R1(r13) /* FIXME for crit & mcheck */ + lwz r11,PACA_EXGEN+EX_CR(r13) /* FIXME for crit & 
mcheck */ + std r10,GPR1(r1) + std r11,_CCR(r1) + mfspr r10,SPRN_DEAR + mfspr r11,SPRN_ESR + std r10,_DAR(r1) + std r11,_DSISR(r1) + std r0,GPR0(r1); /* save r0 in stackframe */ \ + std r2,GPR2(r1); /* save r2 in stackframe */ \ + SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ + SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ + std r9,GPR9(r1); /* save r9 in stackframe */ \ + ld r3,PACA_EXGEN+EX_R10(r13);/* get back r10 */ \ + ld r4,PACA_EXGEN+EX_R11(r13);/* get back r11 */ \ + mfspr r5,SPRN_SPRG_GEN_SCRATCH;/* get back r13 XXX can be wrong */ \ + std r3,GPR10(r1); /* save r10 to stackframe */ \ + std r4,GPR11(r1); /* save r11 to stackframe */ \ + std r12,GPR12(r1); /* save r12 in stackframe */ \ + std r5,GPR13(r1); /* save it to stackframe */ \ + mflr r10 + mfctr r11 + mfxer r12 + std r10,_LINK(r1) + std r11,_CTR(r1) + std r12,_XER(r1) + SAVE_10GPRS(14,r1) + SAVE_8GPRS(24,r1) + lhz r12,PACA_TRAP_SAVE(r13) + std r12,_TRAP(r1) + addi r11,r1,INT_FRAME_SIZE + std r11,0(r1) + li r12,0 + std r12,0(r11) + ld r2,PACATOC(r13) +1: addi r3,r1,STACK_FRAME_OVERHEAD + bl kernel_bad_stack + b 1b + +/* + * Setup the initial TLB for a core. This current implementation + * assume that whatever we are running off will not conflict with + * the new mapping at PAGE_OFFSET. + */ +_GLOBAL(initial_tlb_book3e) + + /* Look for the first TLB with IPROT set */ + mfspr r4,SPRN_TLB0CFG + andi. r3,r4,TLBnCFG_IPROT + lis r3,MAS0_TLBSEL(0)@h + bne found_iprot + + mfspr r4,SPRN_TLB1CFG + andi. r3,r4,TLBnCFG_IPROT + lis r3,MAS0_TLBSEL(1)@h + bne found_iprot + + mfspr r4,SPRN_TLB2CFG + andi. r3,r4,TLBnCFG_IPROT + lis r3,MAS0_TLBSEL(2)@h + bne found_iprot + + lis r3,MAS0_TLBSEL(3)@h + mfspr r4,SPRN_TLB3CFG + /* fall through */ + +found_iprot: + andi. r5,r4,TLBnCFG_HES + bne have_hes + + mflr r8 /* save LR */ +/* 1. 
Find the index of the entry we're executing in + * + * r3 = MAS0_TLBSEL (for the iprot array) + * r4 = SPRN_TLBnCFG + */ + bl invstr /* Find our address */ +invstr: mflr r6 /* Make it accessible */ + mfmsr r7 + rlwinm r5,r7,27,31,31 /* extract MSR[IS] */ + mfspr r7,SPRN_PID + slwi r7,r7,16 + or r7,r7,r5 + mtspr SPRN_MAS6,r7 + tlbsx 0,r6 /* search MSR[IS], SPID=PID */ + + mfspr r3,SPRN_MAS0 + rlwinm r5,r3,16,20,31 /* Extract MAS0(Entry) */ + + mfspr r7,SPRN_MAS1 /* Ensure IPROT set */ + oris r7,r7,MAS1_IPROT@h + mtspr SPRN_MAS1,r7 + tlbwe + +/* 2. Invalidate all entries except the entry we're executing in + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we are running in + * r4 = SPRN_TLBnCFG + * r5 = ESEL of entry we are running in + */ + andi. r4,r4,TLBnCFG_N_ENTRY /* Extract # entries */ + li r6,0 /* Set Entry counter to 0 */ +1: mr r7,r3 /* Set MAS0(TLBSEL) */ + rlwimi r7,r6,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r6) */ + mtspr SPRN_MAS0,r7 + tlbre + mfspr r7,SPRN_MAS1 + rlwinm r7,r7,0,2,31 /* Clear MAS1 Valid and IPROT */ + cmpw r5,r6 + beq skpinv /* Don't update the current execution TLB */ + mtspr SPRN_MAS1,r7 + tlbwe + isync +skpinv: addi r6,r6,1 /* Increment */ + cmpw r6,r4 /* Are we done? */ + bne 1b /* If not, repeat */ + + /* Invalidate all TLBs */ + PPC_TLBILX_ALL(0,R0) + sync + isync + +/* 3. Setup a temp mapping and jump to it + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we are running in + * r5 = ESEL of entry we are running in + */ + andi. r7,r5,0x1 /* Find an entry not used and is non-zero */ + addi r7,r7,0x1 + mr r4,r3 /* Set MAS0(TLBSEL) = 1 */ + mtspr SPRN_MAS0,r4 + tlbre + + rlwimi r4,r7,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r7) */ + mtspr SPRN_MAS0,r4 + + mfspr r7,SPRN_MAS1 + xori r6,r7,MAS1_TS /* Setup TMP mapping in the other Address space */ + mtspr SPRN_MAS1,r6 + + tlbwe + + mfmsr r6 + xori r6,r6,MSR_IS + mtspr SPRN_SRR1,r6 + bl 1f /* Find our address */ +1: mflr r6 + addi r6,r6,(2f - 1b) + mtspr SPRN_SRR0,r6 + rfi +2: + +/* 4. 
Clear out PIDs & Search info + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we started in + * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping + * r5 = MAS3 + */ + li r6,0 + mtspr SPRN_MAS6,r6 + mtspr SPRN_PID,r6 + +/* 5. Invalidate mapping we started in + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we started in + * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping + * r5 = MAS3 + */ + mtspr SPRN_MAS0,r3 + tlbre + mfspr r6,SPRN_MAS1 + rlwinm r6,r6,0,2,31 /* clear IPROT and VALID */ + mtspr SPRN_MAS1,r6 + tlbwe + sync + isync + +/* The mapping only needs to be cache-coherent on SMP */ +#ifdef CONFIG_SMP +#define M_IF_SMP MAS2_M +#else +#define M_IF_SMP 0 +#endif + +/* 6. Setup KERNELBASE mapping in TLB[0] + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we started in + * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping + * r5 = MAS3 + */ + rlwinm r3,r3,0,16,3 /* clear ESEL */ + mtspr SPRN_MAS0,r3 + lis r6,(MAS1_VALID|MAS1_IPROT)@h + ori r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_1GB))@l + mtspr SPRN_MAS1,r6 + + LOAD_REG_IMMEDIATE(r6, PAGE_OFFSET | M_IF_SMP) + mtspr SPRN_MAS2,r6 + + rlwinm r5,r5,0,0,25 + ori r5,r5,MAS3_SR | MAS3_SW | MAS3_SX + mtspr SPRN_MAS3,r5 + li r5,-1 + rlwinm r5,r5,0,0,25 + + tlbwe + +/* 7. Jump to KERNELBASE mapping + * + * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping + */ + /* Now we branch the new virtual address mapped by this entry */ + LOAD_REG_IMMEDIATE(r6,2f) + lis r7,MSR_KERNEL@h + ori r7,r7,MSR_KERNEL@l + mtspr SPRN_SRR0,r6 + mtspr SPRN_SRR1,r7 + rfi /* start execution out of TLB1[0] entry */ +2: + +/* 8. Clear out the temp mapping + * + * r4 = MAS0 w/TLBSEL & ESEL for the entry we are running in + */ + mtspr SPRN_MAS0,r4 + tlbre + mfspr r5,SPRN_MAS1 + rlwinm r5,r5,0,2,31 /* clear IPROT and VALID */ + mtspr SPRN_MAS1,r5 + tlbwe + sync + isync + + /* We translate LR and return */ + tovirt(r8,r8) + mtlr r8 + blr + +have_hes: + /* Setup MAS 0,1,2,3 and 7 for tlbwe of a 1G entry that maps the + * kernel linear mapping. 
We also set MAS8 once for all here though + * that will have to be made dependent on whether we are running under + * a hypervisor I suppose. + */ + + /* BEWARE, MAGIC + * This code is called as an ordinary function on the boot CPU. But to + * avoid duplication, this code is also used in SCOM bringup of + * secondary CPUs. We read the code between the a2_tlbinit_code_start + * and a2_tlbinit_code_end labels one instruction at a time and RAM it + * into the new core via SCOM. That doesn't process branches, so there + * must be none between those two labels. It also means if this code + * ever takes any parameters, the SCOM code must also be updated to + * provide them. + */ + .globl a2_tlbinit_code_start +a2_tlbinit_code_start: + + ori r11,r3,MAS0_WQ_ALLWAYS + oris r11,r11,MAS0_ESEL(3)@h /* Use way 3: workaround A2 erratum 376 */ + mtspr SPRN_MAS0,r11 + lis r3,(MAS1_VALID | MAS1_IPROT)@h + ori r3,r3,BOOK3E_PAGESZ_1GB << MAS1_TSIZE_SHIFT + mtspr SPRN_MAS1,r3 + LOAD_REG_IMMEDIATE(r3, PAGE_OFFSET | MAS2_M) + mtspr SPRN_MAS2,r3 + li r3,MAS3_SR | MAS3_SW | MAS3_SX + mtspr SPRN_MAS7_MAS3,r3 + li r3,0 + mtspr SPRN_MAS8,r3 + + /* Write the TLB entry */ + tlbwe + + .globl a2_tlbinit_after_linear_map +a2_tlbinit_after_linear_map: + + /* Now we branch to the new virtual address mapped by this entry */ + LOAD_REG_IMMEDIATE(r3,1f) + mtctr r3 + bctr + +1: /* We are now running at PAGE_OFFSET, clean the TLB of everything + * else (including IPROTed things left by firmware) + * r4 = TLBnCFG + * r3 = current address (more or less) + */ + + li r5,0 + mtspr SPRN_MAS6,r5 + tlbsx 0,r3 + + rlwinm r9,r4,0,TLBnCFG_N_ENTRY + rlwinm r10,r4,8,0xff + addi r10,r10,-1 /* Get inner loop mask */ + + li r3,1 + + mfspr r5,SPRN_MAS1 + rlwinm r5,r5,0,(~(MAS1_VALID|MAS1_IPROT)) + + mfspr r6,SPRN_MAS2 + rldicr r6,r6,0,51 /* Extract EPN */ + + mfspr r7,SPRN_MAS0 + rlwinm r7,r7,0,0xffff0fff /* Clear HES and WQ */ + + rlwinm r8,r7,16,0xfff /* Extract ESEL */ + +2: add r4,r3,r8 + and r4,r4,r10 + + rlwimi 
r7,r4,16,MAS0_ESEL_MASK + + mtspr SPRN_MAS0,r7 + mtspr SPRN_MAS1,r5 + mtspr SPRN_MAS2,r6 + tlbwe + + addi r3,r3,1 + and. r4,r3,r10 + + bne 3f + addis r6,r6,(1<<30)@h +3: + cmpw r3,r9 + blt 2b + + .globl a2_tlbinit_after_iprot_flush +a2_tlbinit_after_iprot_flush: + + PPC_TLBILX(0,0,R0) + sync + isync + + .globl a2_tlbinit_code_end +a2_tlbinit_code_end: + + /* We translate LR and return */ + mflr r3 + tovirt(r3,r3) + mtlr r3 + blr + +/* + * Main entry (boot CPU, thread 0) + * + * We enter here from head_64.S, possibly after the prom_init trampoline + * with r3 and r4 already saved to r31 and 30 respectively and in 64 bits + * mode. Anything else is as it was left by the bootloader + * + * Initial requirements of this port: + * + * - Kernel loaded at 0 physical + * - A good lump of memory mapped 0:0 by UTLB entry 0 + * - MSR:IS & MSR:DS set to 0 + * + * Note that some of the above requirements will be relaxed in the future + * as the kernel becomes smarter at dealing with different initial conditions + * but for now you have to be careful + */ +_GLOBAL(start_initialization_book3e) + mflr r28 + + /* First, we need to setup some initial TLBs to map the kernel + * text, data and bss at PAGE_OFFSET. We don't have a real mode + * and always use AS 0, so we just set it up to match our link + * address and never use 0 based addresses. + */ + bl initial_tlb_book3e + + /* Init global core bits */ + bl init_core_book3e + + /* Init per-thread bits */ + bl init_thread_book3e + + /* Return to common init code */ + tovirt(r28,r28) + mtlr r28 + blr + + +/* + * Secondary core/processor entry + * + * This is entered for thread 0 of a secondary core, all other threads + * are expected to be stopped. It's similar to start_initialization_book3e + * except that it's generally entered from the holding loop in head_64.S + * after CPUs have been gathered by Open Firmware. 
+ * + * We assume we are in 32 bits mode running with whatever TLB entry was + * set for us by the firmware or POR engine. + */ +_GLOBAL(book3e_secondary_core_init_tlb_set) + li r4,1 + b generic_secondary_smp_init + +_GLOBAL(book3e_secondary_core_init) + mflr r28 + + /* Do we need to setup initial TLB entry ? */ + cmplwi r4,0 + bne 2f + + /* Setup TLB for this core */ + bl initial_tlb_book3e + + /* We can return from the above running at a different + * address, so recalculate r2 (TOC) + */ + bl relative_toc + + /* Init global core bits */ +2: bl init_core_book3e + + /* Init per-thread bits */ +3: bl init_thread_book3e + + /* Return to common init code at proper virtual address. + * + * Due to various previous assumptions, we know we entered this + * function at either the final PAGE_OFFSET mapping or using a + * 1:1 mapping at 0, so we don't bother doing a complicated check + * here, we just ensure the return address has the right top bits. + * + * Note that if we ever want to be smarter about where we can be + * started from, we have to be careful that by the time we reach + * the code below we may already be running at a different location + * than the one we were called from since initial_tlb_book3e can + * have moved us already. 
+ */ + cmpdi cr0,r28,0 + blt 1f + lis r3,PAGE_OFFSET@highest + sldi r3,r3,32 + or r28,r28,r3 +1: mtlr r28 + blr + +_GLOBAL(book3e_secondary_thread_init) + mflr r28 + b 3b + +init_core_book3e: + /* Establish the interrupt vector base */ + LOAD_REG_IMMEDIATE(r3, interrupt_base_book3e) + mtspr SPRN_IVPR,r3 + sync + blr + +init_thread_book3e: + lis r3,(SPRN_EPCR_ICM | SPRN_EPCR_GICM)@h + mtspr SPRN_EPCR,r3 + + /* Make sure interrupts are off */ + wrteei 0 + + /* disable all timers and clear out status */ + li r3,0 + mtspr SPRN_TCR,r3 + mfspr r3,SPRN_TSR + mtspr SPRN_TSR,r3 + + blr + +_GLOBAL(__setup_base_ivors) + SET_IVOR(0, 0x020) /* Critical Input */ + SET_IVOR(1, 0x000) /* Machine Check */ + SET_IVOR(2, 0x060) /* Data Storage */ + SET_IVOR(3, 0x080) /* Instruction Storage */ + SET_IVOR(4, 0x0a0) /* External Input */ + SET_IVOR(5, 0x0c0) /* Alignment */ + SET_IVOR(6, 0x0e0) /* Program */ + SET_IVOR(7, 0x100) /* FP Unavailable */ + SET_IVOR(8, 0x120) /* System Call */ + SET_IVOR(9, 0x140) /* Auxiliary Processor Unavailable */ + SET_IVOR(10, 0x160) /* Decrementer */ + SET_IVOR(11, 0x180) /* Fixed Interval Timer */ + SET_IVOR(12, 0x1a0) /* Watchdog Timer */ + SET_IVOR(13, 0x1c0) /* Data TLB Error */ + SET_IVOR(14, 0x1e0) /* Instruction TLB Error */ + SET_IVOR(15, 0x040) /* Debug */ + + sync + + blr + +_GLOBAL(setup_altivec_ivors) + SET_IVOR(32, 0x200) /* AltiVec Unavailable */ + SET_IVOR(33, 0x220) /* AltiVec Assist */ + blr + +_GLOBAL(setup_perfmon_ivor) + SET_IVOR(35, 0x260) /* Performance Monitor */ + blr + +_GLOBAL(setup_doorbell_ivors) + SET_IVOR(36, 0x280) /* Processor Doorbell */ + SET_IVOR(37, 0x2a0) /* Processor Doorbell Crit */ + blr + +_GLOBAL(setup_ehv_ivors) + SET_IVOR(40, 0x300) /* Embedded Hypervisor System Call */ + SET_IVOR(41, 0x320) /* Embedded Hypervisor Privilege */ + SET_IVOR(38, 0x2c0) /* Guest Processor Doorbell */ + SET_IVOR(39, 0x2e0) /* Guest Processor Doorbell Crit/MC */ + blr + +_GLOBAL(setup_lrat_ivor) + SET_IVOR(42, 0x340) /* LRAT Error */ 
+ blr diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S new file mode 100644 index 00000000000..a7d36b19221 --- /dev/null +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -0,0 +1,1793 @@ +/* + * This file contains the 64-bit "server" PowerPC variant + * of the low level exception handling including exception + * vectors, exception return, part of the slb and stab + * handling and other fixed offset specific things. + * + * This file is meant to be #included from head_64.S due to + * position dependent assembly. + * + * Most of this originates from head_64.S and thus has the same + * copyright history. + * + */ + +#include <asm/hw_irq.h> +#include <asm/exception-64s.h> +#include <asm/ptrace.h> + +/* + * We layout physical memory as follows: + * 0x0000 - 0x00ff : Secondary processor spin code + * 0x0100 - 0x17ff : pSeries Interrupt prologs + * 0x1800 - 0x4000 : interrupt support common interrupt prologs + * 0x4000 - 0x5fff : pSeries interrupts with IR=1,DR=1 + * 0x6000 - 0x6fff : more interrupt support including for IR=1,DR=1 + * 0x7000 - 0x7fff : FWNMI data area + * 0x8000 - 0x8fff : Initial (CPU0) segment table + * 0x9000 - : Early init and support code + */ + /* Syscall routine is used twice, in reloc-off and reloc-on paths */ +#define SYSCALL_PSERIES_1 \ +BEGIN_FTR_SECTION \ + cmpdi r0,0x1ebe ; \ + beq- 1f ; \ +END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ + mr r9,r13 ; \ + GET_PACA(r13) ; \ + mfspr r11,SPRN_SRR0 ; \ +0: + +#define SYSCALL_PSERIES_2_RFID \ + mfspr r12,SPRN_SRR1 ; \ + ld r10,PACAKBASE(r13) ; \ + LOAD_HANDLER(r10, system_call_entry) ; \ + mtspr SPRN_SRR0,r10 ; \ + ld r10,PACAKMSR(r13) ; \ + mtspr SPRN_SRR1,r10 ; \ + rfid ; \ + b . ; /* prevent speculative execution */ + +#define SYSCALL_PSERIES_3 \ + /* Fast LE/BE switch system call */ \ +1: mfspr r12,SPRN_SRR1 ; \ + xori r12,r12,MSR_LE ; \ + mtspr SPRN_SRR1,r12 ; \ + rfid ; /* return to userspace */ \ + b . 
; /* prevent speculative execution */ + +#if defined(CONFIG_RELOCATABLE) + /* + * We can't branch directly; in the direct case we use LR + * and system_call_entry restores LR. (We thus need to move + * LR to r10 in the RFID case too.) + */ +#define SYSCALL_PSERIES_2_DIRECT \ + mflr r10 ; \ + ld r12,PACAKBASE(r13) ; \ + LOAD_HANDLER(r12, system_call_entry_direct) ; \ + mtctr r12 ; \ + mfspr r12,SPRN_SRR1 ; \ + /* Re-use of r13... No spare regs to do this */ \ + li r13,MSR_RI ; \ + mtmsrd r13,1 ; \ + GET_PACA(r13) ; /* get r13 back */ \ + bctr ; +#else + /* We can branch directly */ +#define SYSCALL_PSERIES_2_DIRECT \ + mfspr r12,SPRN_SRR1 ; \ + li r10,MSR_RI ; \ + mtmsrd r10,1 ; /* Set RI (EE=0) */ \ + b system_call_entry_direct ; +#endif + +/* + * This is the start of the interrupt handlers for pSeries + * This code runs with relocation off. + * Code from here to __end_interrupts gets copied down to real + * address 0x100 when we are running a relocatable kernel. + * Therefore any relative branches in this section must only + * branch to labels in this section. + */ + . = 0x100 + .globl __start_interrupts +__start_interrupts: + + .globl system_reset_pSeries; +system_reset_pSeries: + HMT_MEDIUM_PPR_DISCARD + SET_SCRATCH0(r13) +#ifdef CONFIG_PPC_P7_NAP +BEGIN_FTR_SECTION + /* Running native on arch 2.06 or later, check if we are + * waking up from nap. We only handle no state loss and + * supervisor state loss. We do -not- handle hypervisor + * state loss at this time. + */ + mfspr r13,SPRN_SRR1 + rlwinm. r13,r13,47-31,30,31 + beq 9f + + /* waking up from powersave (nap) state */ + cmpwi cr1,r13,2 + /* Total loss of HV state is fatal, we could try to use the + * PIR to locate a PACA, then use an emergency stack etc... + * OPAL v3 based powernv platforms have new idle states + * which fall in this category.
+ */ + bgt cr1,8f + GET_PACA(r13) + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + li r0,KVM_HWTHREAD_IN_KERNEL + stb r0,HSTATE_HWTHREAD_STATE(r13) + /* Order setting hwthread_state vs. testing hwthread_req */ + sync + lbz r0,HSTATE_HWTHREAD_REQ(r13) + cmpwi r0,0 + beq 1f + b kvm_start_guest +1: +#endif + + beq cr1,2f + b power7_wakeup_noloss +2: b power7_wakeup_loss + + /* Fast Sleep wakeup on PowerNV */ +8: GET_PACA(r13) + b power7_wakeup_tb_loss + +9: +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) +#endif /* CONFIG_PPC_P7_NAP */ + EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD, + NOTEST, 0x100) + + . = 0x200 +machine_check_pSeries_1: + /* This is moved out of line as it can be patched by FW, but + * some code path might still want to branch into the original + * vector + */ + HMT_MEDIUM_PPR_DISCARD + SET_SCRATCH0(r13) /* save r13 */ +#ifdef CONFIG_PPC_P7_NAP +BEGIN_FTR_SECTION + /* Running native on arch 2.06 or later, check if we are + * waking up from nap. We only handle no state loss and + * supervisor state loss. We do -not- handle hypervisor + * state loss at this time. + */ + mfspr r13,SPRN_SRR1 + rlwinm. r13,r13,47-31,30,31 + OPT_GET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR) + beq 9f + + mfspr r13,SPRN_SRR1 + rlwinm. r13,r13,47-31,30,31 + /* waking up from powersave (nap) state */ + cmpwi cr1,r13,2 + /* Total loss of HV state is fatal. let's just stay stuck here */ + OPT_GET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR) + bgt cr1,. +9: + OPT_SET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR) +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) +#endif /* CONFIG_PPC_P7_NAP */ + EXCEPTION_PROLOG_0(PACA_EXMC) +BEGIN_FTR_SECTION + b machine_check_pSeries_early +FTR_SECTION_ELSE + b machine_check_pSeries_0 +ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) + + . 
= 0x300 + .globl data_access_pSeries +data_access_pSeries: + HMT_MEDIUM_PPR_DISCARD + SET_SCRATCH0(r13) +BEGIN_FTR_SECTION + b data_access_check_stab +data_access_not_stab: +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB) + EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD, + KVMTEST, 0x300) + + . = 0x380 + .globl data_access_slb_pSeries +data_access_slb_pSeries: + HMT_MEDIUM_PPR_DISCARD + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXSLB) + EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380) + std r3,PACA_EXSLB+EX_R3(r13) + mfspr r3,SPRN_DAR +#ifdef __DISABLED__ + /* Keep that around for when we re-implement dynamic VSIDs */ + cmpdi r3,0 + bge slb_miss_user_pseries +#endif /* __DISABLED__ */ + mfspr r12,SPRN_SRR1 +#ifndef CONFIG_RELOCATABLE + b slb_miss_realmode +#else + /* + * We can't just use a direct branch to slb_miss_realmode + * because the distance from here to there depends on where + * the kernel ends up being put. + */ + mfctr r11 + ld r10,PACAKBASE(r13) + LOAD_HANDLER(r10, slb_miss_realmode) + mtctr r10 + bctr +#endif + + STD_EXCEPTION_PSERIES(0x400, 0x400, instruction_access) + + . = 0x480 + .globl instruction_access_slb_pSeries +instruction_access_slb_pSeries: + HMT_MEDIUM_PPR_DISCARD + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXSLB) + EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480) + std r3,PACA_EXSLB+EX_R3(r13) + mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ +#ifdef __DISABLED__ + /* Keep that around for when we re-implement dynamic VSIDs */ + cmpdi r3,0 + bge slb_miss_user_pseries +#endif /* __DISABLED__ */ + mfspr r12,SPRN_SRR1 +#ifndef CONFIG_RELOCATABLE + b slb_miss_realmode +#else + mfctr r11 + ld r10,PACAKBASE(r13) + LOAD_HANDLER(r10, slb_miss_realmode) + mtctr r10 + bctr +#endif + + /* We open code these as we can't have a ". = x" (even with + * x = ".") within a feature section + */ + .
= 0x500; + .globl hardware_interrupt_pSeries; + .globl hardware_interrupt_hv; +hardware_interrupt_pSeries: +hardware_interrupt_hv: + HMT_MEDIUM_PPR_DISCARD + BEGIN_FTR_SECTION + _MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt, + EXC_HV, SOFTEN_TEST_HV) + KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502) + FTR_SECTION_ELSE + _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt, + EXC_STD, SOFTEN_TEST_HV_201) + KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500) + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + + STD_EXCEPTION_PSERIES(0x600, 0x600, alignment) + KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x600) + + STD_EXCEPTION_PSERIES(0x700, 0x700, program_check) + KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x700) + + STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable) + KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800) + + . = 0x900 + .globl decrementer_pSeries +decrementer_pSeries: + _MASKABLE_EXCEPTION_PSERIES(0x900, decrementer, EXC_STD, SOFTEN_TEST_PR) + + STD_EXCEPTION_HV(0x980, 0x982, hdecrementer) + + MASKABLE_EXCEPTION_PSERIES(0xa00, 0xa00, doorbell_super) + KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xa00) + + STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b) + KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xb00) + + . = 0xc00 + .globl system_call_pSeries +system_call_pSeries: + HMT_MEDIUM +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + SET_SCRATCH0(r13) + GET_PACA(r13) + std r9,PACA_EXGEN+EX_R9(r13) + std r10,PACA_EXGEN+EX_R10(r13) + mfcr r9 + KVMTEST(0xc00) + GET_SCRATCH0(r13) +#endif + SYSCALL_PSERIES_1 + SYSCALL_PSERIES_2_RFID + SYSCALL_PSERIES_3 + KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00) + + STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step) + KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xd00) + + /* At 0xe??? we have a bunch of hypervisor exceptions, we branch + * out of line to handle them + */ + . = 0xe00 +hv_data_storage_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b h_data_storage_hv + + . 
= 0xe20 +hv_instr_storage_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b h_instr_storage_hv + + . = 0xe40 +emulation_assist_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b emulation_assist_hv + + . = 0xe60 +hv_exception_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b hmi_exception_hv + + . = 0xe80 +hv_doorbell_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b h_doorbell_hv + + /* We need to deal with the Altivec unavailable exception + * here which is at 0xf20, thus in the middle of the + * prolog code of the PerformanceMonitor one. A little + * trickery is thus necessary + */ + . = 0xf00 +performance_monitor_pseries_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b performance_monitor_pSeries + + . = 0xf20 +altivec_unavailable_pseries_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b altivec_unavailable_pSeries + + . = 0xf40 +vsx_unavailable_pseries_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b vsx_unavailable_pSeries + + . = 0xf60 +facility_unavailable_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b facility_unavailable_pSeries + + . = 0xf80 +hv_facility_unavailable_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b facility_unavailable_hv + +#ifdef CONFIG_CBE_RAS + STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error) + KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202) +#endif /* CONFIG_CBE_RAS */ + + STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint) + KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x1300) + + . = 0x1500 + .global denorm_exception_hv +denorm_exception_hv: + HMT_MEDIUM_PPR_DISCARD + mtspr SPRN_SPRG_HSCRATCH0,r13 + EXCEPTION_PROLOG_0(PACA_EXGEN) + EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x1500) + +#ifdef CONFIG_PPC_DENORMALISATION + mfspr r10,SPRN_HSRR1 + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + andis. r10,r10,(HSRR1_DENORM)@h /* denorm? 
*/ + addi r11,r11,-4 /* HSRR0 is next instruction */ + bne+ denorm_assist +#endif + + KVMTEST(0x1500) + EXCEPTION_PROLOG_PSERIES_1(denorm_common, EXC_HV) + KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1500) + +#ifdef CONFIG_CBE_RAS + STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance) + KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602) +#endif /* CONFIG_CBE_RAS */ + + STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist) + KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x1700) + +#ifdef CONFIG_CBE_RAS + STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal) + KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1802) +#else + . = 0x1800 +#endif /* CONFIG_CBE_RAS */ + + +/*** Out of line interrupts support ***/ + + .align 7 + /* moved from 0x200 */ +machine_check_pSeries_early: +BEGIN_FTR_SECTION + EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) + /* + * Register contents: + * R13 = PACA + * R9 = CR + * Original R9 to R13 is saved on PACA_EXMC + * + * Switch to mc_emergency stack and handle re-entrancy (we limit + * the nested MCE up to level 4 to avoid stack overflow). + * Save MCE registers srr1, srr0, dar and dsisr and then set ME=1 + * + * We use paca->in_mce to check whether this is the first entry or + * nested machine check. We increment paca->in_mce to track nested + * machine checks. + * + * If this is the first entry then set stack pointer to + * paca->mc_emergency_sp, otherwise r1 is already pointing to + * stack frame on mc_emergency stack. + * + * NOTE: We are here with MSR_ME=0 (off), which means we risk a + * checkstop if we get another machine check exception before we do + * rfid with MSR_ME=1. + */ + mr r11,r1 /* Save r1 */ + lhz r10,PACA_IN_MCE(r13) + cmpwi r10,0 /* Are we in nested machine check */ + bne 0f /* Yes, we are.
*/ + /* First machine check entry */ + ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */ +0: subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ + addi r10,r10,1 /* increment paca->in_mce */ + sth r10,PACA_IN_MCE(r13) + /* Limit nested MCE to level 4 to avoid stack overflow */ + cmpwi r10,4 + bgt 2f /* Check if we hit limit of 4 */ + std r11,GPR1(r1) /* Save r1 on the stack. */ + std r11,0(r1) /* make stack chain pointer */ + mfspr r11,SPRN_SRR0 /* Save SRR0 */ + std r11,_NIP(r1) + mfspr r11,SPRN_SRR1 /* Save SRR1 */ + std r11,_MSR(r1) + mfspr r11,SPRN_DAR /* Save DAR */ + std r11,_DAR(r1) + mfspr r11,SPRN_DSISR /* Save DSISR */ + std r11,_DSISR(r1) + std r9,_CCR(r1) /* Save CR in stackframe */ + /* Save r9 through r13 from EXMC save area to stack frame. */ + EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) + mfmsr r11 /* get MSR value */ + ori r11,r11,MSR_ME /* turn on ME bit */ + ori r11,r11,MSR_RI /* turn on RI bit */ + ld r12,PACAKBASE(r13) /* get high part of &label */ + LOAD_HANDLER(r12, machine_check_handle_early) +1: mtspr SPRN_SRR0,r12 + mtspr SPRN_SRR1,r11 + rfid + b . /* prevent speculative execution */ +2: + /* Stack overflow. Stay on emergency stack and panic. + * Keep the ME bit off while panic-ing, so that if we hit + * another machine check we checkstop. + */ + addi r1,r1,INT_FRAME_SIZE /* go back to previous stack frame */ + ld r11,PACAKMSR(r13) + ld r12,PACAKBASE(r13) + LOAD_HANDLER(r12, unrecover_mce) + li r10,MSR_ME + andc r11,r11,r10 /* Turn off MSR_ME */ + b 1b + b . 
/* prevent speculative execution */ +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) + +machine_check_pSeries: + .globl machine_check_fwnmi +machine_check_fwnmi: + HMT_MEDIUM_PPR_DISCARD + SET_SCRATCH0(r13) /* save r13 */ + EXCEPTION_PROLOG_0(PACA_EXMC) +machine_check_pSeries_0: + EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST, 0x200) + EXCEPTION_PROLOG_PSERIES_1(machine_check_common, EXC_STD) + KVM_HANDLER_SKIP(PACA_EXMC, EXC_STD, 0x200) + + /* moved from 0x300 */ +data_access_check_stab: + GET_PACA(r13) + std r9,PACA_EXSLB+EX_R9(r13) + std r10,PACA_EXSLB+EX_R10(r13) + mfspr r10,SPRN_DAR + mfspr r9,SPRN_DSISR + srdi r10,r10,60 + rlwimi r10,r9,16,0x20 +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + lbz r9,HSTATE_IN_GUEST(r13) + rlwimi r10,r9,8,0x300 +#endif + mfcr r9 + cmpwi r10,0x2c + beq do_stab_bolted_pSeries + mtcrf 0x80,r9 + ld r9,PACA_EXSLB+EX_R9(r13) + ld r10,PACA_EXSLB+EX_R10(r13) + b data_access_not_stab +do_stab_bolted_pSeries: + std r11,PACA_EXSLB+EX_R11(r13) + std r12,PACA_EXSLB+EX_R12(r13) + GET_SCRATCH0(r10) + std r10,PACA_EXSLB+EX_R13(r13) + EXCEPTION_PROLOG_PSERIES_1(do_stab_bolted, EXC_STD) + + KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300) + KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380) + KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400) + KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480) + KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900) + KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982) + +#ifdef CONFIG_PPC_DENORMALISATION +denorm_assist: +BEGIN_FTR_SECTION +/* + * To denormalise we need to move a copy of the register to itself. + * For POWER6 do that here for all FP regs. + */ + mfmsr r10 + ori r10,r10,(MSR_FP|MSR_FE0|MSR_FE1) + xori r10,r10,(MSR_FE0|MSR_FE1) + mtmsrd r10 + sync + +#define FMR2(n) fmr (n), (n) ; fmr n+1, n+1 +#define FMR4(n) FMR2(n) ; FMR2(n+2) +#define FMR8(n) FMR4(n) ; FMR4(n+4) +#define FMR16(n) FMR8(n) ; FMR8(n+8) +#define FMR32(n) FMR16(n) ; FMR16(n+16) + FMR32(0) + +FTR_SECTION_ELSE +/* + * To denormalise we need to move a copy of the register to itself. 
+ * For POWER7 do that here for the first 32 VSX registers only. + */ + mfmsr r10 + oris r10,r10,MSR_VSX@h + mtmsrd r10 + sync + +#define XVCPSGNDP2(n) XVCPSGNDP(n,n,n) ; XVCPSGNDP(n+1,n+1,n+1) +#define XVCPSGNDP4(n) XVCPSGNDP2(n) ; XVCPSGNDP2(n+2) +#define XVCPSGNDP8(n) XVCPSGNDP4(n) ; XVCPSGNDP4(n+4) +#define XVCPSGNDP16(n) XVCPSGNDP8(n) ; XVCPSGNDP8(n+8) +#define XVCPSGNDP32(n) XVCPSGNDP16(n) ; XVCPSGNDP16(n+16) + XVCPSGNDP32(0) + +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_206) + +BEGIN_FTR_SECTION + b denorm_done +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) +/* + * To denormalise we need to move a copy of the register to itself. + * For POWER8 we need to do that for all 64 VSX registers + */ + XVCPSGNDP32(32) +denorm_done: + mtspr SPRN_HSRR0,r11 + mtcrf 0x80,r9 + ld r9,PACA_EXGEN+EX_R9(r13) + RESTORE_PPR_PACA(PACA_EXGEN, r10) +BEGIN_FTR_SECTION + ld r10,PACA_EXGEN+EX_CFAR(r13) + mtspr SPRN_CFAR,r10 +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) + ld r10,PACA_EXGEN+EX_R10(r13) + ld r11,PACA_EXGEN+EX_R11(r13) + ld r12,PACA_EXGEN+EX_R12(r13) + ld r13,PACA_EXGEN+EX_R13(r13) + HRFID + b . +#endif + + .align 7 + /* moved from 0xe00 */ + STD_EXCEPTION_HV_OOL(0xe02, h_data_storage) + KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0xe02) + STD_EXCEPTION_HV_OOL(0xe22, h_instr_storage) + KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe22) + STD_EXCEPTION_HV_OOL(0xe42, emulation_assist) + KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe42) + STD_EXCEPTION_HV_OOL(0xe62, hmi_exception) /* need to flush cache ? 
*/ + KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62) + MASKABLE_EXCEPTION_HV_OOL(0xe82, h_doorbell) + KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe82) + + /* moved from 0xf00 */ + STD_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor) + KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf00) + STD_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable) + KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20) + STD_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) + KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40) + STD_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) + KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf60) + STD_EXCEPTION_HV_OOL(0xf82, facility_unavailable) + KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xf82) + +/* + * An interrupt came in while soft-disabled. We set paca->irq_happened, then: + * - If it was a decrementer interrupt, we bump the dec to max and return. + * - If it was a doorbell we return immediately since doorbells are edge + * triggered and won't automatically refire. + * - else we hard disable and return. + * This is called with r10 containing the value to OR to the paca field. + */ +#define MASKED_INTERRUPT(_H) \ +masked_##_H##interrupt: \ + std r11,PACA_EXGEN+EX_R11(r13); \ + lbz r11,PACAIRQHAPPENED(r13); \ + or r11,r11,r10; \ + stb r11,PACAIRQHAPPENED(r13); \ + cmpwi r10,PACA_IRQ_DEC; \ + bne 1f; \ + lis r10,0x7fff; \ + ori r10,r10,0xffff; \ + mtspr SPRN_DEC,r10; \ + b 2f; \ +1: cmpwi r10,PACA_IRQ_DBELL; \ + beq 2f; \ + mfspr r10,SPRN_##_H##SRR1; \ + rldicl r10,r10,48,1; /* clear MSR_EE */ \ + rotldi r10,r10,16; \ + mtspr SPRN_##_H##SRR1,r10; \ +2: mtcrf 0x80,r9; \ + ld r9,PACA_EXGEN+EX_R9(r13); \ + ld r10,PACA_EXGEN+EX_R10(r13); \ + ld r11,PACA_EXGEN+EX_R11(r13); \ + GET_SCRATCH0(r13); \ + ##_H##rfid; \ + b . + + MASKED_INTERRUPT() + MASKED_INTERRUPT(H) + +/* + * Called from arch_local_irq_enable when an interrupt needs + * to be resent. r3 contains 0x500, 0x900, 0xa00 or 0xe80 to indicate + * which kind of interrupt. MSR:EE is already off.
We generate a + * stackframe like if a real interrupt had happened. + * + * Note: While MSR:EE is off, we need to make sure that _MSR + * in the generated frame has EE set to 1 or the exception + * handler will not properly re-enable them. + */ +_GLOBAL(__replay_interrupt) + /* We are going to jump to the exception common code which + * will retrieve various register values from the PACA which + * we don't give a damn about, so we don't bother storing them. + */ + mfmsr r12 + mflr r11 + mfcr r9 + ori r12,r12,MSR_EE + cmpwi r3,0x900 + beq decrementer_common + cmpwi r3,0x500 + beq hardware_interrupt_common +BEGIN_FTR_SECTION + cmpwi r3,0xe80 + beq h_doorbell_common +FTR_SECTION_ELSE + cmpwi r3,0xa00 + beq doorbell_super_common +ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) + blr + +#ifdef CONFIG_PPC_PSERIES +/* + * Vectors for the FWNMI option. Share common code. + */ + .globl system_reset_fwnmi + .align 7 +system_reset_fwnmi: + HMT_MEDIUM_PPR_DISCARD + SET_SCRATCH0(r13) /* save r13 */ + EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD, + NOTEST, 0x100) + +#endif /* CONFIG_PPC_PSERIES */ + +#ifdef __DISABLED__ +/* + * This is used for when the SLB miss handler has to go virtual, + * which doesn't happen for now anymore but will once we re-implement + * dynamic VSIDs for shared page tables + */ +slb_miss_user_pseries: + std r10,PACA_EXGEN+EX_R10(r13) + std r11,PACA_EXGEN+EX_R11(r13) + std r12,PACA_EXGEN+EX_R12(r13) + GET_SCRATCH0(r10) + ld r11,PACA_EXSLB+EX_R9(r13) + ld r12,PACA_EXSLB+EX_R3(r13) + std r10,PACA_EXGEN+EX_R13(r13) + std r11,PACA_EXGEN+EX_R9(r13) + std r12,PACA_EXGEN+EX_R3(r13) + clrrdi r12,r13,32 + mfmsr r10 + mfspr r11,SRR0 /* save SRR0 */ + ori r12,r12,slb_miss_user_common@l /* virt addr of handler */ + ori r10,r10,MSR_IR|MSR_DR|MSR_RI + mtspr SRR0,r12 + mfspr r12,SRR1 /* and SRR1 */ + mtspr SRR1,r10 + rfid + b . /* prevent spec. 
execution */ +#endif /* __DISABLED__ */ + +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER +kvmppc_skip_interrupt: + /* + * Here all GPRs are unchanged from when the interrupt happened + * except for r13, which is saved in SPRG_SCRATCH0. + */ + mfspr r13, SPRN_SRR0 + addi r13, r13, 4 + mtspr SPRN_SRR0, r13 + GET_SCRATCH0(r13) + rfid + b . + +kvmppc_skip_Hinterrupt: + /* + * Here all GPRs are unchanged from when the interrupt happened + * except for r13, which is saved in SPRG_SCRATCH0. + */ + mfspr r13, SPRN_HSRR0 + addi r13, r13, 4 + mtspr SPRN_HSRR0, r13 + GET_SCRATCH0(r13) + hrfid + b . +#endif + +/* + * Code from here down to __end_handlers is invoked from the + * exception prologs above. Because the prologs assemble the + * addresses of these handlers using the LOAD_HANDLER macro, + * which uses an ori instruction, these handlers must be in + * the first 64k of the kernel image. + */ + +/*** Common interrupt handlers ***/ + + STD_EXCEPTION_COMMON(0x100, system_reset, system_reset_exception) + + STD_EXCEPTION_COMMON_ASYNC(0x500, hardware_interrupt, do_IRQ) + STD_EXCEPTION_COMMON_ASYNC(0x900, decrementer, timer_interrupt) + STD_EXCEPTION_COMMON(0x980, hdecrementer, hdec_interrupt) +#ifdef CONFIG_PPC_DOORBELL + STD_EXCEPTION_COMMON_ASYNC(0xa00, doorbell_super, doorbell_exception) +#else + STD_EXCEPTION_COMMON_ASYNC(0xa00, doorbell_super, unknown_exception) +#endif + STD_EXCEPTION_COMMON(0xb00, trap_0b, unknown_exception) + STD_EXCEPTION_COMMON(0xd00, single_step, single_step_exception) + STD_EXCEPTION_COMMON(0xe00, trap_0e, unknown_exception) + STD_EXCEPTION_COMMON(0xe40, emulation_assist, emulation_assist_interrupt) + STD_EXCEPTION_COMMON(0xe60, hmi_exception, unknown_exception) +#ifdef CONFIG_PPC_DOORBELL + STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, doorbell_exception) +#else + STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, unknown_exception) +#endif + STD_EXCEPTION_COMMON_ASYNC(0xf00, performance_monitor, performance_monitor_exception) + STD_EXCEPTION_COMMON(0x1300, 
instruction_breakpoint, instruction_breakpoint_exception) + STD_EXCEPTION_COMMON(0x1502, denorm, unknown_exception) +#ifdef CONFIG_ALTIVEC + STD_EXCEPTION_COMMON(0x1700, altivec_assist, altivec_assist_exception) +#else + STD_EXCEPTION_COMMON(0x1700, altivec_assist, unknown_exception) +#endif +#ifdef CONFIG_CBE_RAS + STD_EXCEPTION_COMMON(0x1200, cbe_system_error, cbe_system_error_exception) + STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, cbe_maintenance_exception) + STD_EXCEPTION_COMMON(0x1800, cbe_thermal, cbe_thermal_exception) +#endif /* CONFIG_CBE_RAS */ + + /* + * Relocation-on interrupts: A subset of the interrupts can be delivered + * with IR=1/DR=1, if AIL==2 and MSR.HV won't be changed by delivering + * it. Addresses are the same as the original interrupt addresses, but + * offset by 0xc000000000004000. + * It's impossible to receive interrupts below 0x300 via this mechanism. + * KVM: None of these traps are from the guest ; anything that escalated + * to HV=1 from HV=0 is delivered via real mode handlers. + */ + + /* + * This uses the standard macro, since the original 0x300 vector + * only has extra guff for STAB-based processors -- which never + * come here. + */ + STD_RELON_EXCEPTION_PSERIES(0x4300, 0x300, data_access) + . = 0x4380 + .globl data_access_slb_relon_pSeries +data_access_slb_relon_pSeries: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXSLB) + EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) + std r3,PACA_EXSLB+EX_R3(r13) + mfspr r3,SPRN_DAR + mfspr r12,SPRN_SRR1 +#ifndef CONFIG_RELOCATABLE + b slb_miss_realmode +#else + /* + * We can't just use a direct branch to slb_miss_realmode + * because the distance from here to there depends on where + * the kernel ends up being put. + */ + mfctr r11 + ld r10,PACAKBASE(r13) + LOAD_HANDLER(r10, slb_miss_realmode) + mtctr r10 + bctr +#endif + + STD_RELON_EXCEPTION_PSERIES(0x4400, 0x400, instruction_access) + . 
= 0x4480 + .globl instruction_access_slb_relon_pSeries +instruction_access_slb_relon_pSeries: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXSLB) + EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480) + std r3,PACA_EXSLB+EX_R3(r13) + mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ + mfspr r12,SPRN_SRR1 +#ifndef CONFIG_RELOCATABLE + b slb_miss_realmode +#else + mfctr r11 + ld r10,PACAKBASE(r13) + LOAD_HANDLER(r10, slb_miss_realmode) + mtctr r10 + bctr +#endif + + . = 0x4500 + .globl hardware_interrupt_relon_pSeries; + .globl hardware_interrupt_relon_hv; +hardware_interrupt_relon_pSeries: +hardware_interrupt_relon_hv: + BEGIN_FTR_SECTION + _MASKABLE_RELON_EXCEPTION_PSERIES(0x502, hardware_interrupt, EXC_HV, SOFTEN_TEST_HV) + FTR_SECTION_ELSE + _MASKABLE_RELON_EXCEPTION_PSERIES(0x500, hardware_interrupt, EXC_STD, SOFTEN_TEST_PR) + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) + STD_RELON_EXCEPTION_PSERIES(0x4600, 0x600, alignment) + STD_RELON_EXCEPTION_PSERIES(0x4700, 0x700, program_check) + STD_RELON_EXCEPTION_PSERIES(0x4800, 0x800, fp_unavailable) + MASKABLE_RELON_EXCEPTION_PSERIES(0x4900, 0x900, decrementer) + STD_RELON_EXCEPTION_HV(0x4980, 0x982, hdecrementer) + MASKABLE_RELON_EXCEPTION_PSERIES(0x4a00, 0xa00, doorbell_super) + STD_RELON_EXCEPTION_PSERIES(0x4b00, 0xb00, trap_0b) + + . = 0x4c00 + .globl system_call_relon_pSeries +system_call_relon_pSeries: + HMT_MEDIUM + SYSCALL_PSERIES_1 + SYSCALL_PSERIES_2_DIRECT + SYSCALL_PSERIES_3 + + STD_RELON_EXCEPTION_PSERIES(0x4d00, 0xd00, single_step) + + . = 0x4e00 + b . /* Can't happen, see v2.07 Book III-S section 6.5 */ + + . = 0x4e20 + b . /* Can't happen, see v2.07 Book III-S section 6.5 */ + + . = 0x4e40 +emulation_assist_relon_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b emulation_assist_relon_hv + + . = 0x4e60 + b . /* Can't happen, see v2.07 Book III-S section 6.5 */ + + . = 0x4e80 +h_doorbell_relon_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b h_doorbell_relon_hv + + . 
= 0x4f00 +performance_monitor_relon_pseries_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b performance_monitor_relon_pSeries + + . = 0x4f20 +altivec_unavailable_relon_pseries_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b altivec_unavailable_relon_pSeries + + . = 0x4f40 +vsx_unavailable_relon_pseries_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b vsx_unavailable_relon_pSeries + + . = 0x4f60 +facility_unavailable_relon_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b facility_unavailable_relon_pSeries + + . = 0x4f80 +hv_facility_unavailable_relon_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b hv_facility_unavailable_relon_hv + + STD_RELON_EXCEPTION_PSERIES(0x5300, 0x1300, instruction_breakpoint) +#ifdef CONFIG_PPC_DENORMALISATION + . = 0x5500 + b denorm_exception_hv +#endif + STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) + + /* Other future vectors */ + .align 7 + .globl __end_interrupts +__end_interrupts: + + .align 7 +system_call_entry_direct: +#if defined(CONFIG_RELOCATABLE) + /* The first level prologue may have used LR to get here, saving + * orig in r10. To save hacking/ifdeffing common code, restore here. + */ + mtlr r10 +#endif +system_call_entry: + b system_call_common + +ppc64_runlatch_on_trampoline: + b __ppc64_runlatch_on + +/* + * Here we have detected that the kernel stack pointer is bad. + * R9 contains the saved CR, r13 points to the paca, + * r10 contains the (bad) kernel stack pointer, + * r11 and r12 contain the saved SRR0 and SRR1. + * We switch to using an emergency stack, save the registers there, + * and call kernel_bad_stack(), which panics. 
+ */ +bad_stack: + ld r1,PACAEMERGSP(r13) + subi r1,r1,64+INT_FRAME_SIZE + std r9,_CCR(r1) + std r10,GPR1(r1) + std r11,_NIP(r1) + std r12,_MSR(r1) + mfspr r11,SPRN_DAR + mfspr r12,SPRN_DSISR + std r11,_DAR(r1) + std r12,_DSISR(r1) + mflr r10 + mfctr r11 + mfxer r12 + std r10,_LINK(r1) + std r11,_CTR(r1) + std r12,_XER(r1) + SAVE_GPR(0,r1) + SAVE_GPR(2,r1) + ld r10,EX_R3(r3) + std r10,GPR3(r1) + SAVE_GPR(4,r1) + SAVE_4GPRS(5,r1) + ld r9,EX_R9(r3) + ld r10,EX_R10(r3) + SAVE_2GPRS(9,r1) + ld r9,EX_R11(r3) + ld r10,EX_R12(r3) + ld r11,EX_R13(r3) + std r9,GPR11(r1) + std r10,GPR12(r1) + std r11,GPR13(r1) +BEGIN_FTR_SECTION + ld r10,EX_CFAR(r3) + std r10,ORIG_GPR3(r1) +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) + SAVE_8GPRS(14,r1) + SAVE_10GPRS(22,r1) + lhz r12,PACA_TRAP_SAVE(r13) + std r12,_TRAP(r1) + addi r11,r1,INT_FRAME_SIZE + std r11,0(r1) + li r12,0 + std r12,0(r11) + ld r2,PACATOC(r13) + ld r11,exception_marker@toc(r2) + std r12,RESULT(r1) + std r11,STACK_FRAME_OVERHEAD-16(r1) +1: addi r3,r1,STACK_FRAME_OVERHEAD + bl kernel_bad_stack + b 1b + +/* + * Here r13 points to the paca, r9 contains the saved CR, + * SRR0 and SRR1 are saved in r11 and r12, + * r9 - r13 are saved in paca->exgen. 
+ */ + .align 7 + .globl data_access_common +data_access_common: + mfspr r10,SPRN_DAR + std r10,PACA_EXGEN+EX_DAR(r13) + mfspr r10,SPRN_DSISR + stw r10,PACA_EXGEN+EX_DSISR(r13) + EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN) + DISABLE_INTS + ld r12,_MSR(r1) + ld r3,PACA_EXGEN+EX_DAR(r13) + lwz r4,PACA_EXGEN+EX_DSISR(r13) + li r5,0x300 + b do_hash_page /* Try to handle as hpte fault */ + + .align 7 + .globl h_data_storage_common +h_data_storage_common: + mfspr r10,SPRN_HDAR + std r10,PACA_EXGEN+EX_DAR(r13) + mfspr r10,SPRN_HDSISR + stw r10,PACA_EXGEN+EX_DSISR(r13) + EXCEPTION_PROLOG_COMMON(0xe00, PACA_EXGEN) + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl unknown_exception + b ret_from_except + + .align 7 + .globl instruction_access_common +instruction_access_common: + EXCEPTION_PROLOG_COMMON(0x400, PACA_EXGEN) + DISABLE_INTS + ld r12,_MSR(r1) + ld r3,_NIP(r1) + andis. r4,r12,0x5820 + li r5,0x400 + b do_hash_page /* Try to handle as hpte fault */ + + STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception) + +/* + * Here is the common SLB miss user that is used when going to virtual + * mode for SLB misses, that is currently not used + */ +#ifdef __DISABLED__ + .align 7 + .globl slb_miss_user_common +slb_miss_user_common: + mflr r10 + std r3,PACA_EXGEN+EX_DAR(r13) + stw r9,PACA_EXGEN+EX_CCR(r13) + std r10,PACA_EXGEN+EX_LR(r13) + std r11,PACA_EXGEN+EX_SRR0(r13) + bl slb_allocate_user + + ld r10,PACA_EXGEN+EX_LR(r13) + ld r3,PACA_EXGEN+EX_R3(r13) + lwz r9,PACA_EXGEN+EX_CCR(r13) + ld r11,PACA_EXGEN+EX_SRR0(r13) + mtlr r10 + beq- slb_miss_fault + + andi. 
r10,r12,MSR_RI /* check for unrecoverable exception */ + beq- unrecov_user_slb + mfmsr r10 + +.machine push +.machine "power4" + mtcrf 0x80,r9 +.machine pop + + clrrdi r10,r10,2 /* clear RI before setting SRR0/1 */ + mtmsrd r10,1 + + mtspr SRR0,r11 + mtspr SRR1,r12 + + ld r9,PACA_EXGEN+EX_R9(r13) + ld r10,PACA_EXGEN+EX_R10(r13) + ld r11,PACA_EXGEN+EX_R11(r13) + ld r12,PACA_EXGEN+EX_R12(r13) + ld r13,PACA_EXGEN+EX_R13(r13) + rfid + b . + +slb_miss_fault: + EXCEPTION_PROLOG_COMMON(0x380, PACA_EXGEN) + ld r4,PACA_EXGEN+EX_DAR(r13) + li r5,0 + std r4,_DAR(r1) + std r5,_DSISR(r1) + b handle_page_fault + +unrecov_user_slb: + EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN) + DISABLE_INTS + bl save_nvgprs +1: addi r3,r1,STACK_FRAME_OVERHEAD + bl unrecoverable_exception + b 1b + +#endif /* __DISABLED__ */ + + + /* + * Machine check is different because we use a different + * save area: PACA_EXMC instead of PACA_EXGEN. + */ + .align 7 + .globl machine_check_common +machine_check_common: + + mfspr r10,SPRN_DAR + std r10,PACA_EXGEN+EX_DAR(r13) + mfspr r10,SPRN_DSISR + stw r10,PACA_EXGEN+EX_DSISR(r13) + EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC) + FINISH_NAP + DISABLE_INTS + ld r3,PACA_EXGEN+EX_DAR(r13) + lwz r4,PACA_EXGEN+EX_DSISR(r13) + std r3,_DAR(r1) + std r4,_DSISR(r1) + bl save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + bl machine_check_exception + b ret_from_except + + .align 7 + .globl alignment_common +alignment_common: + mfspr r10,SPRN_DAR + std r10,PACA_EXGEN+EX_DAR(r13) + mfspr r10,SPRN_DSISR + stw r10,PACA_EXGEN+EX_DSISR(r13) + EXCEPTION_PROLOG_COMMON(0x600, PACA_EXGEN) + ld r3,PACA_EXGEN+EX_DAR(r13) + lwz r4,PACA_EXGEN+EX_DSISR(r13) + std r3,_DAR(r1) + std r4,_DSISR(r1) + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl alignment_exception + b ret_from_except + + .align 7 + .globl program_check_common +program_check_common: + EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl 
program_check_exception + b ret_from_except + + .align 7 + .globl fp_unavailable_common +fp_unavailable_common: + EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN) + bne 1f /* if from user, just load it up */ + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl kernel_fp_unavailable_exception + BUG_OPCODE +1: +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + /* Test if 2 TM state bits are zero. If non-zero (ie. userspace was in + * transaction), go do TM stuff + */ + rldicl. r0, r12, (64-MSR_TS_LG), (64-2) + bne- 2f +END_FTR_SECTION_IFSET(CPU_FTR_TM) +#endif + bl load_up_fpu + b fast_exception_return +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +2: /* User process was in a transaction */ + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl fp_unavailable_tm + b ret_from_except +#endif + .align 7 + .globl altivec_unavailable_common +altivec_unavailable_common: + EXCEPTION_PROLOG_COMMON(0xf20, PACA_EXGEN) +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + beq 1f +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + BEGIN_FTR_SECTION_NESTED(69) + /* Test if 2 TM state bits are zero. If non-zero (ie. userspace was in + * transaction), go do TM stuff + */ + rldicl. r0, r12, (64-MSR_TS_LG), (64-2) + bne- 2f + END_FTR_SECTION_NESTED(CPU_FTR_TM, CPU_FTR_TM, 69) +#endif + bl load_up_altivec + b fast_exception_return +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +2: /* User process was in a transaction */ + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl altivec_unavailable_tm + b ret_from_except +#endif +1: +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl altivec_unavailable_exception + b ret_from_except + + .align 7 + .globl vsx_unavailable_common +vsx_unavailable_common: + EXCEPTION_PROLOG_COMMON(0xf40, PACA_EXGEN) +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + beq 1f +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + BEGIN_FTR_SECTION_NESTED(69) + /* Test if 2 TM state bits are zero. If non-zero (ie. 
userspace was in + * transaction), go do TM stuff + */ + rldicl. r0, r12, (64-MSR_TS_LG), (64-2) + bne- 2f + END_FTR_SECTION_NESTED(CPU_FTR_TM, CPU_FTR_TM, 69) +#endif + b load_up_vsx +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +2: /* User process was in a transaction */ + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl vsx_unavailable_tm + b ret_from_except +#endif +1: +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl vsx_unavailable_exception + b ret_from_except + + STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) + STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) + + .align 7 + .globl __end_handlers +__end_handlers: + + /* Equivalents to the above handlers for relocation-on interrupt vectors */ + STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) + MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) + + STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) + STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) + +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) +/* + * Data area reserved for FWNMI option. + * This address (0x7000) is fixed by the RPA. + */ + .= 0x7000 + .globl fwnmi_data_area +fwnmi_data_area: + + /* pseries and powernv need to keep the whole page from + * 0x7000 to 0x8000 free for use by the firmware + */ + . 
= 0x8000 +#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ + +/* Space for CPU0's segment table */ + .balign 4096 + .globl initial_stab +initial_stab: + .space 4096 + +#ifdef CONFIG_PPC_POWERNV +_GLOBAL(opal_mc_secondary_handler) + HMT_MEDIUM_PPR_DISCARD + SET_SCRATCH0(r13) + GET_PACA(r13) + clrldi r3,r3,2 + tovirt(r3,r3) + std r3,PACA_OPAL_MC_EVT(r13) + ld r13,OPAL_MC_SRR0(r3) + mtspr SPRN_SRR0,r13 + ld r13,OPAL_MC_SRR1(r3) + mtspr SPRN_SRR1,r13 + ld r3,OPAL_MC_GPR3(r3) + GET_SCRATCH0(r13) + b machine_check_pSeries +#endif /* CONFIG_PPC_POWERNV */ + + +#define MACHINE_CHECK_HANDLER_WINDUP \ + /* Clear MSR_RI before setting SRR0 and SRR1. */\ + li r0,MSR_RI; \ + mfmsr r9; /* get MSR value */ \ + andc r9,r9,r0; \ + mtmsrd r9,1; /* Clear MSR_RI */ \ + /* Move original SRR0 and SRR1 into the respective regs */ \ + ld r9,_MSR(r1); \ + mtspr SPRN_SRR1,r9; \ + ld r3,_NIP(r1); \ + mtspr SPRN_SRR0,r3; \ + ld r9,_CTR(r1); \ + mtctr r9; \ + ld r9,_XER(r1); \ + mtxer r9; \ + ld r9,_LINK(r1); \ + mtlr r9; \ + REST_GPR(0, r1); \ + REST_8GPRS(2, r1); \ + REST_GPR(10, r1); \ + ld r11,_CCR(r1); \ + mtcr r11; \ + /* Decrement paca->in_mce. */ \ + lhz r12,PACA_IN_MCE(r13); \ + subi r12,r12,1; \ + sth r12,PACA_IN_MCE(r13); \ + REST_GPR(11, r1); \ + REST_2GPRS(12, r1); \ + /* restore original r1. */ \ + ld r1,GPR1(r1) + + /* + * Handle machine check early in real mode. We come here with + * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack. + */ + .align 7 + .globl machine_check_handle_early +machine_check_handle_early: + std r0,GPR0(r1) /* Save r0 */ + EXCEPTION_PROLOG_COMMON_3(0x200) + bl save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + bl machine_check_early + std r3,RESULT(r1) /* Save result */ + ld r12,_MSR(r1) +#ifdef CONFIG_PPC_P7_NAP + /* + * Check if thread was in power saving mode. We come here when any + * of the following is true: + * a. thread wasn't in power saving mode + * b. 
thread was in power saving mode with no state loss or + * supervisor state loss + * + * Go back to nap again if (b) is true. + */ + rlwinm. r11,r12,47-31,30,31 /* Was it in power saving mode? */ + beq 4f /* No, it wasn;t */ + /* Thread was in power saving mode. Go back to nap again. */ + cmpwi r11,2 + bne 3f + /* Supervisor state loss */ + li r0,1 + stb r0,PACA_NAPSTATELOST(r13) +3: bl machine_check_queue_event + MACHINE_CHECK_HANDLER_WINDUP + GET_PACA(r13) + ld r1,PACAR1(r13) + b power7_enter_nap_mode +4: +#endif + /* + * Check if we are coming from hypervisor userspace. If yes then we + * continue in host kernel in V mode to deliver the MC event. + */ + rldicl. r11,r12,4,63 /* See if MC hit while in HV mode. */ + beq 5f + andi. r11,r12,MSR_PR /* See if coming from user. */ + bne 9f /* continue in V mode if we are. */ + +5: +#ifdef CONFIG_KVM_BOOK3S_64_HV + /* + * We are coming from kernel context. Check if we are coming from + * guest. if yes, then we can continue. We will fall through + * do_kvm_200->kvmppc_interrupt to deliver the MC event to guest. + */ + lbz r11,HSTATE_IN_GUEST(r13) + cmpwi r11,0 /* Check if coming from guest */ + bne 9f /* continue if we are. */ +#endif + /* + * At this point we are not sure about what context we come from. + * Queue up the MCE event and return from the interrupt. + * But before that, check if this is an un-recoverable exception. + * If yes, then stay on emergency stack and panic. + */ + andi. r11,r12,MSR_RI + bne 2f +1: mfspr r11,SPRN_SRR0 + ld r10,PACAKBASE(r13) + LOAD_HANDLER(r10,unrecover_mce) + mtspr SPRN_SRR0,r10 + ld r10,PACAKMSR(r13) + /* + * We are going down. But there are chances that we might get hit by + * another MCE during panic path and we may run into unstable state + * with no way out. Hence, turn ME bit off while going down, so that + * when another MCE is hit during panic path, system will checkstop + * and hypervisor will get restarted cleanly by SP. 
+ */ + li r3,MSR_ME + andc r10,r10,r3 /* Turn off MSR_ME */ + mtspr SPRN_SRR1,r10 + rfid + b . +2: + /* + * Check if we have successfully handled/recovered from error, if not + * then stay on emergency stack and panic. + */ + ld r3,RESULT(r1) /* Load result */ + cmpdi r3,0 /* see if we handled MCE successfully */ + + beq 1b /* if !handled then panic */ + /* + * Return from MC interrupt. + * Queue up the MCE event so that we can log it later, while + * returning from kernel or opal call. + */ + bl machine_check_queue_event + MACHINE_CHECK_HANDLER_WINDUP + rfid +9: + /* Deliver the machine check to host kernel in V mode. */ + MACHINE_CHECK_HANDLER_WINDUP + b machine_check_pSeries + +unrecover_mce: + /* Invoke machine_check_exception to print MCE event and panic. */ + addi r3,r1,STACK_FRAME_OVERHEAD + bl machine_check_exception + /* + * We will not reach here. Even if we did, there is no way out. Call + * unrecoverable_exception and die. + */ +1: addi r3,r1,STACK_FRAME_OVERHEAD + bl unrecoverable_exception + b 1b +/* + * r13 points to the PACA, r9 contains the saved CR, + * r12 contain the saved SRR1, SRR0 is still ready for return + * r3 has the faulting address + * r9 - r13 are saved in paca->exslb. + * r3 is saved in paca->slb_r3 + * We assume we aren't going to take any exceptions during this procedure. + */ +slb_miss_realmode: + mflr r10 +#ifdef CONFIG_RELOCATABLE + mtctr r11 +#endif + + stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ + std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ + + bl slb_allocate_realmode + + /* All done -- return from exception. */ + + ld r10,PACA_EXSLB+EX_LR(r13) + ld r3,PACA_EXSLB+EX_R3(r13) + lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ + + mtlr r10 + + andi. 
r10,r12,MSR_RI /* check for unrecoverable exception */ + beq- 2f + +.machine push +.machine "power4" + mtcrf 0x80,r9 + mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ +.machine pop + + RESTORE_PPR_PACA(PACA_EXSLB, r9) + ld r9,PACA_EXSLB+EX_R9(r13) + ld r10,PACA_EXSLB+EX_R10(r13) + ld r11,PACA_EXSLB+EX_R11(r13) + ld r12,PACA_EXSLB+EX_R12(r13) + ld r13,PACA_EXSLB+EX_R13(r13) + rfid + b . /* prevent speculative execution */ + +2: mfspr r11,SPRN_SRR0 + ld r10,PACAKBASE(r13) + LOAD_HANDLER(r10,unrecov_slb) + mtspr SPRN_SRR0,r10 + ld r10,PACAKMSR(r13) + mtspr SPRN_SRR1,r10 + rfid + b . + +unrecov_slb: + EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB) + DISABLE_INTS + bl save_nvgprs +1: addi r3,r1,STACK_FRAME_OVERHEAD + bl unrecoverable_exception + b 1b + + +#ifdef CONFIG_PPC_970_NAP +power4_fixup_nap: + andc r9,r9,r10 + std r9,TI_LOCAL_FLAGS(r11) + ld r10,_LINK(r1) /* make idle task do the */ + std r10,_NIP(r1) /* equivalent of a blr */ + blr +#endif + +/* + * Hash table stuff + */ + .align 7 +do_hash_page: + std r3,_DAR(r1) + std r4,_DSISR(r1) + + andis. r0,r4,0xa410 /* weird error? */ + bne- handle_page_fault /* if not, try to insert a HPTE */ + andis. r0,r4,DSISR_DABRMATCH@h + bne- handle_dabr_fault + +BEGIN_FTR_SECTION + andis. r0,r4,0x0020 /* Is it a segment table fault? */ + bne- do_ste_alloc /* If so handle it */ +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB) + + CURRENT_THREAD_INFO(r11, r1) + lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ + andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */ + bne 77f /* then don't call hash_page now */ + /* + * We need to set the _PAGE_USER bit if MSR_PR is set or if we are + * accessing a userspace segment (even from the kernel). We assume + * kernel addresses always have the high bit set. 
+ */ + rlwinm r4,r4,32-25+9,31-9,31-9 /* DSISR_STORE -> _PAGE_RW */ + rotldi r0,r3,15 /* Move high bit into MSR_PR posn */ + orc r0,r12,r0 /* MSR_PR | ~high_bit */ + rlwimi r4,r0,32-13,30,30 /* becomes _PAGE_USER access bit */ + ori r4,r4,1 /* add _PAGE_PRESENT */ + rlwimi r4,r5,22+2,31-2,31-2 /* Set _PAGE_EXEC if trap is 0x400 */ + + /* + * r3 contains the faulting address + * r4 contains the required access permissions + * r5 contains the trap number + * + * at return r3 = 0 for success, 1 for page fault, negative for error + */ + bl hash_page /* build HPTE if possible */ + cmpdi r3,0 /* see if hash_page succeeded */ + + /* Success */ + beq fast_exc_return_irq /* Return from exception on success */ + + /* Error */ + blt- 13f + +/* Here we have a page fault that hash_page can't handle. */ +handle_page_fault: +11: ld r4,_DAR(r1) + ld r5,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_page_fault + cmpdi r3,0 + beq+ 12f + bl save_nvgprs + mr r5,r3 + addi r3,r1,STACK_FRAME_OVERHEAD + lwz r4,_DAR(r1) + bl bad_page_fault + b ret_from_except + +/* We have a data breakpoint exception - handle it */ +handle_dabr_fault: + bl save_nvgprs + ld r4,_DAR(r1) + ld r5,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_break +12: b ret_from_except_lite + + +/* We have a page fault that hash_page could handle but HV refused + * the PTE insertion + */ +13: bl save_nvgprs + mr r5,r3 + addi r3,r1,STACK_FRAME_OVERHEAD + ld r4,_DAR(r1) + bl low_hash_fault + b ret_from_except + +/* + * We come here as a result of a DSI at a point where we don't want + * to call hash_page, such as when we are accessing memory (possibly + * user memory) inside a PMU interrupt that occurred while interrupts + * were soft-disabled. We want to invoke the exception handler for + * the access, or panic if there isn't a handler. 
+ */ +77: bl save_nvgprs + mr r4,r3 + addi r3,r1,STACK_FRAME_OVERHEAD + li r5,SIGSEGV + bl bad_page_fault + b ret_from_except + + /* here we have a segment miss */ +do_ste_alloc: + bl ste_allocate /* try to insert stab entry */ + cmpdi r3,0 + bne- handle_page_fault + b fast_exception_return + +/* + * r13 points to the PACA, r9 contains the saved CR, + * r11 and r12 contain the saved SRR0 and SRR1. + * r9 - r13 are saved in paca->exslb. + * We assume we aren't going to take any exceptions during this procedure. + * We assume (DAR >> 60) == 0xc. + */ + .align 7 +do_stab_bolted: + stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ + std r11,PACA_EXSLB+EX_SRR0(r13) /* save SRR0 in exc. frame */ + mfspr r11,SPRN_DAR /* ea */ + + /* + * check for bad kernel/user address + * (ea & ~REGION_MASK) >= PGTABLE_RANGE + */ + rldicr. r9,r11,4,(63 - 46 - 4) + li r9,0 /* VSID = 0 for bad address */ + bne- 0f + + /* + * Calculate VSID: + * This is the kernel vsid, we take the top for context from + * the range. context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 + * Here we know that (ea >> 60) == 0xc + */ + lis r9,(MAX_USER_CONTEXT + 1)@ha + addi r9,r9,(MAX_USER_CONTEXT + 1)@l + + srdi r10,r11,SID_SHIFT + rldimi r10,r9,ESID_BITS,0 /* proto vsid */ + ASM_VSID_SCRAMBLE(r10, r9, 256M) + rldic r9,r10,12,16 /* r9 = vsid << 12 */ + +0: + /* Hash to the primary group */ + ld r10,PACASTABVIRT(r13) + srdi r11,r11,SID_SHIFT + rldimi r10,r11,7,52 /* r10 = first ste of the group */ + + /* Search the primary group for a free entry */ +1: ld r11,0(r10) /* Test valid bit of the current ste */ + andi. r11,r11,0x80 + beq 2f + addi r10,r10,16 + andi. r11,r10,0x70 + bne 1b + + /* Stick for only searching the primary group for now. 
*/ + /* At least for now, we use a very simple random castout scheme */ + /* Use the TB as a random number ; OR in 1 to avoid entry 0 */ + mftb r11 + rldic r11,r11,4,57 /* r11 = (r11 << 4) & 0x70 */ + ori r11,r11,0x10 + + /* r10 currently points to an ste one past the group of interest */ + /* make it point to the randomly selected entry */ + subi r10,r10,128 + or r10,r10,r11 /* r10 is the entry to invalidate */ + + isync /* mark the entry invalid */ + ld r11,0(r10) + rldicl r11,r11,56,1 /* clear the valid bit */ + rotldi r11,r11,8 + std r11,0(r10) + sync + + clrrdi r11,r11,28 /* Get the esid part of the ste */ + slbie r11 + +2: std r9,8(r10) /* Store the vsid part of the ste */ + eieio + + mfspr r11,SPRN_DAR /* Get the new esid */ + clrrdi r11,r11,28 /* Permits a full 32b of ESID */ + ori r11,r11,0x90 /* Turn on valid and kp */ + std r11,0(r10) /* Put new entry back into the stab */ + + sync + + /* All done -- return from exception. */ + lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ + ld r11,PACA_EXSLB+EX_SRR0(r13) /* get saved SRR0 */ + + andi. r10,r12,MSR_RI + beq- unrecov_slb + + mtcrf 0x80,r9 /* restore CR */ + + mfmsr r10 + clrrdi r10,r10,2 + mtmsrd r10,1 + + mtspr SPRN_SRR0,r11 + mtspr SPRN_SRR1,r12 + ld r9,PACA_EXSLB+EX_R9(r13) + ld r10,PACA_EXSLB+EX_R10(r13) + ld r11,PACA_EXSLB+EX_R11(r13) + ld r12,PACA_EXSLB+EX_R12(r13) + ld r13,PACA_EXSLB+EX_R13(r13) + rfid + b . /* prevent speculative execution */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c new file mode 100644 index 00000000000..742694c1d85 --- /dev/null +++ b/arch/powerpc/kernel/fadump.c @@ -0,0 +1,1316 @@ +/* + * Firmware Assisted dump: A robust mechanism to get reliable kernel crash + * dump with assistance from firmware. This approach does not use kexec, + * instead firmware assists in booting the kdump kernel while preserving + * memory contents. 
The most of the code implementation has been adapted + * from phyp assisted dump implementation written by Linas Vepstas and + * Manish Ahuja + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright 2011 IBM Corporation + * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> + */ + +#undef DEBUG +#define pr_fmt(fmt) "fadump: " fmt + +#include <linux/string.h> +#include <linux/memblock.h> +#include <linux/delay.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/crash_dump.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> + +#include <asm/page.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/fadump.h> +#include <asm/debug.h> +#include <asm/setup.h> + +static struct fw_dump fw_dump; +static struct fadump_mem_struct fdm; +static const struct fadump_mem_struct *fdm_active; + +static DEFINE_MUTEX(fadump_mutex); +struct fad_crash_memory_ranges crash_memory_ranges[INIT_CRASHMEM_RANGES]; +int crash_mem_ranges; + +/* Scan the Firmware Assisted dump configuration details. 
*/ +int __init early_init_dt_scan_fw_dump(unsigned long node, + const char *uname, int depth, void *data) +{ + const __be32 *sections; + int i, num_sections; + int size; + const int *token; + + if (depth != 1 || strcmp(uname, "rtas") != 0) + return 0; + + /* + * Check if Firmware Assisted dump is supported. if yes, check + * if dump has been initiated on last reboot. + */ + token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL); + if (!token) + return 1; + + fw_dump.fadump_supported = 1; + fw_dump.ibm_configure_kernel_dump = *token; + + /* + * The 'ibm,kernel-dump' rtas node is present only if there is + * dump data waiting for us. + */ + fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL); + if (fdm_active) + fw_dump.dump_active = 1; + + /* Get the sizes required to store dump data for the firmware provided + * dump sections. + * For each dump section type supported, a 32bit cell which defines + * the ID of a supported section followed by two 32 bit cells which + * gives the size of the section in bytes. + */ + sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes", + &size); + + if (!sections) + return 1; + + num_sections = size / (3 * sizeof(u32)); + + for (i = 0; i < num_sections; i++, sections += 3) { + u32 type = (u32)of_read_number(sections, 1); + + switch (type) { + case FADUMP_CPU_STATE_DATA: + fw_dump.cpu_state_data_size = + of_read_ulong(&sections[1], 2); + break; + case FADUMP_HPTE_REGION: + fw_dump.hpte_region_size = + of_read_ulong(&sections[1], 2); + break; + } + } + + return 1; +} + +int is_fadump_active(void) +{ + return fw_dump.dump_active; +} + +/* Print firmware assisted dump configurations for debugging purpose. */ +static void fadump_show_config(void) +{ + pr_debug("Support for firmware-assisted dump (fadump): %s\n", + (fw_dump.fadump_supported ? "present" : "no support")); + + if (!fw_dump.fadump_supported) + return; + + pr_debug("Fadump enabled : %s\n", + (fw_dump.fadump_enabled ? 
"yes" : "no")); + pr_debug("Dump Active : %s\n", + (fw_dump.dump_active ? "yes" : "no")); + pr_debug("Dump section sizes:\n"); + pr_debug(" CPU state data size: %lx\n", fw_dump.cpu_state_data_size); + pr_debug(" HPTE region size : %lx\n", fw_dump.hpte_region_size); + pr_debug("Boot memory size : %lx\n", fw_dump.boot_memory_size); +} + +static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, + unsigned long addr) +{ + if (!fdm) + return 0; + + memset(fdm, 0, sizeof(struct fadump_mem_struct)); + addr = addr & PAGE_MASK; + + fdm->header.dump_format_version = 0x00000001; + fdm->header.dump_num_sections = 3; + fdm->header.dump_status_flag = 0; + fdm->header.offset_first_dump_section = + (u32)offsetof(struct fadump_mem_struct, cpu_state_data); + + /* + * Fields for disk dump option. + * We are not using disk dump option, hence set these fields to 0. + */ + fdm->header.dd_block_size = 0; + fdm->header.dd_block_offset = 0; + fdm->header.dd_num_blocks = 0; + fdm->header.dd_offset_disk_path = 0; + + /* set 0 to disable an automatic dump-reboot. */ + fdm->header.max_time_auto = 0; + + /* Kernel dump sections */ + /* cpu state data section. 
*/ + fdm->cpu_state_data.request_flag = FADUMP_REQUEST_FLAG; + fdm->cpu_state_data.source_data_type = FADUMP_CPU_STATE_DATA; + fdm->cpu_state_data.source_address = 0; + fdm->cpu_state_data.source_len = fw_dump.cpu_state_data_size; + fdm->cpu_state_data.destination_address = addr; + addr += fw_dump.cpu_state_data_size; + + /* hpte region section */ + fdm->hpte_region.request_flag = FADUMP_REQUEST_FLAG; + fdm->hpte_region.source_data_type = FADUMP_HPTE_REGION; + fdm->hpte_region.source_address = 0; + fdm->hpte_region.source_len = fw_dump.hpte_region_size; + fdm->hpte_region.destination_address = addr; + addr += fw_dump.hpte_region_size; + + /* RMA region section */ + fdm->rmr_region.request_flag = FADUMP_REQUEST_FLAG; + fdm->rmr_region.source_data_type = FADUMP_REAL_MODE_REGION; + fdm->rmr_region.source_address = RMA_START; + fdm->rmr_region.source_len = fw_dump.boot_memory_size; + fdm->rmr_region.destination_address = addr; + addr += fw_dump.boot_memory_size; + + return addr; +} + +/** + * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM + * + * Function to find the largest memory size we need to reserve during early + * boot process. This will be the size of the memory that is required for a + * kernel to boot successfully. + * + * This function has been taken from phyp-assisted dump feature implementation. + * + * returns larger of 256MB or 5% rounded down to multiples of 256MB. + * + * TODO: Come up with better approach to find out more accurate memory size + * that is required for a kernel to boot successfully. + * + */ +static inline unsigned long fadump_calculate_reserve_size(void) +{ + unsigned long size; + + /* + * Check if the size is specified through fadump_reserve_mem= cmdline + * option. If yes, then use that. 
+ */ + if (fw_dump.reserve_bootvar) + return fw_dump.reserve_bootvar; + + /* divide by 20 to get 5% of value */ + size = memblock_end_of_DRAM() / 20; + + /* round it down in multiples of 256 */ + size = size & ~0x0FFFFFFFUL; + + /* Truncate to memory_limit. We don't want to over reserve the memory.*/ + if (memory_limit && size > memory_limit) + size = memory_limit; + + return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM); +} + +/* + * Calculate the total memory size required to be reserved for + * firmware-assisted dump registration. + */ +static unsigned long get_fadump_area_size(void) +{ + unsigned long size = 0; + + size += fw_dump.cpu_state_data_size; + size += fw_dump.hpte_region_size; + size += fw_dump.boot_memory_size; + size += sizeof(struct fadump_crash_info_header); + size += sizeof(struct elfhdr); /* ELF core header.*/ + size += sizeof(struct elf_phdr); /* place holder for cpu notes */ + /* Program headers for crash memory regions. */ + size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2); + + size = PAGE_ALIGN(size); + return size; +} + +int __init fadump_reserve_mem(void) +{ + unsigned long base, size, memory_boundary; + + if (!fw_dump.fadump_enabled) + return 0; + + if (!fw_dump.fadump_supported) { + printk(KERN_INFO "Firmware-assisted dump is not supported on" + " this hardware\n"); + fw_dump.fadump_enabled = 0; + return 0; + } + /* + * Initialize boot memory size + * If dump is active then we have already calculated the size during + * first kernel. + */ + if (fdm_active) + fw_dump.boot_memory_size = fdm_active->rmr_region.source_len; + else + fw_dump.boot_memory_size = fadump_calculate_reserve_size(); + + /* + * Calculate the memory boundary. + * If memory_limit is less than actual memory boundary then reserve + * the memory for fadump beyond the memory_limit and adjust the + * memory_limit accordingly, so that the running kernel can run with + * specified memory_limit. 
+ */ + if (memory_limit && memory_limit < memblock_end_of_DRAM()) { + size = get_fadump_area_size(); + if ((memory_limit + size) < memblock_end_of_DRAM()) + memory_limit += size; + else + memory_limit = memblock_end_of_DRAM(); + printk(KERN_INFO "Adjusted memory_limit for firmware-assisted" + " dump, now %#016llx\n", memory_limit); + } + if (memory_limit) + memory_boundary = memory_limit; + else + memory_boundary = memblock_end_of_DRAM(); + + if (fw_dump.dump_active) { + printk(KERN_INFO "Firmware-assisted dump is active.\n"); + /* + * If last boot has crashed then reserve all the memory + * above boot_memory_size so that we don't touch it until + * dump is written to disk by userspace tool. This memory + * will be released for general use once the dump is saved. + */ + base = fw_dump.boot_memory_size; + size = memory_boundary - base; + memblock_reserve(base, size); + printk(KERN_INFO "Reserved %ldMB of memory at %ldMB " + "for saving crash dump\n", + (unsigned long)(size >> 20), + (unsigned long)(base >> 20)); + + fw_dump.fadumphdr_addr = + fdm_active->rmr_region.destination_address + + fdm_active->rmr_region.source_len; + pr_debug("fadumphdr_addr = %p\n", + (void *) fw_dump.fadumphdr_addr); + } else { + /* Reserve the memory at the top of memory. */ + size = get_fadump_area_size(); + base = memory_boundary - size; + memblock_reserve(base, size); + printk(KERN_INFO "Reserved %ldMB of memory at %ldMB " + "for firmware-assisted dump\n", + (unsigned long)(size >> 20), + (unsigned long)(base >> 20)); + } + fw_dump.reserve_dump_area_start = base; + fw_dump.reserve_dump_area_size = size; + return 1; +} + +/* Look for fadump= cmdline option. 
*/ +static int __init early_fadump_param(char *p) +{ + if (!p) + return 1; + + if (strncmp(p, "on", 2) == 0) + fw_dump.fadump_enabled = 1; + else if (strncmp(p, "off", 3) == 0) + fw_dump.fadump_enabled = 0; + + return 0; +} +early_param("fadump", early_fadump_param); + +/* Look for fadump_reserve_mem= cmdline option */ +static int __init early_fadump_reserve_mem(char *p) +{ + if (p) + fw_dump.reserve_bootvar = memparse(p, &p); + return 0; +} +early_param("fadump_reserve_mem", early_fadump_reserve_mem); + +static void register_fw_dump(struct fadump_mem_struct *fdm) +{ + int rc; + unsigned int wait_time; + + pr_debug("Registering for firmware-assisted kernel dump...\n"); + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, + FADUMP_REGISTER, fdm, + sizeof(struct fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + + } while (wait_time); + + switch (rc) { + case -1: + printk(KERN_ERR "Failed to register firmware-assisted kernel" + " dump. Hardware Error(%d).\n", rc); + break; + case -3: + printk(KERN_ERR "Failed to register firmware-assisted kernel" + " dump. 
Parameter Error(%d).\n", rc); + break; + case -9: + printk(KERN_ERR "firmware-assisted kernel dump is already " + " registered."); + fw_dump.dump_registered = 1; + break; + case 0: + printk(KERN_INFO "firmware-assisted kernel dump registration" + " is successful\n"); + fw_dump.dump_registered = 1; + break; + } +} + +void crash_fadump(struct pt_regs *regs, const char *str) +{ + struct fadump_crash_info_header *fdh = NULL; + + if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr) + return; + + fdh = __va(fw_dump.fadumphdr_addr); + crashing_cpu = smp_processor_id(); + fdh->crashing_cpu = crashing_cpu; + crash_save_vmcoreinfo(); + + if (regs) + fdh->regs = *regs; + else + ppc_save_regs(&fdh->regs); + + fdh->cpu_online_mask = *cpu_online_mask; + + /* Call ibm,os-term rtas call to trigger firmware assisted dump */ + rtas_os_term((char *)str); +} + +#define GPR_MASK 0xffffff0000000000 +static inline int fadump_gpr_index(u64 id) +{ + int i = -1; + char str[3]; + + if ((id & GPR_MASK) == REG_ID("GPR")) { + /* get the digits at the end */ + id &= ~GPR_MASK; + id >>= 24; + str[2] = '\0'; + str[1] = id & 0xff; + str[0] = (id >> 8) & 0xff; + sscanf(str, "%d", &i); + if (i > 31) + i = -1; + } + return i; +} + +static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id, + u64 reg_val) +{ + int i; + + i = fadump_gpr_index(reg_id); + if (i >= 0) + regs->gpr[i] = (unsigned long)reg_val; + else if (reg_id == REG_ID("NIA")) + regs->nip = (unsigned long)reg_val; + else if (reg_id == REG_ID("MSR")) + regs->msr = (unsigned long)reg_val; + else if (reg_id == REG_ID("CTR")) + regs->ctr = (unsigned long)reg_val; + else if (reg_id == REG_ID("LR")) + regs->link = (unsigned long)reg_val; + else if (reg_id == REG_ID("XER")) + regs->xer = (unsigned long)reg_val; + else if (reg_id == REG_ID("CR")) + regs->ccr = (unsigned long)reg_val; + else if (reg_id == REG_ID("DAR")) + regs->dar = (unsigned long)reg_val; + else if (reg_id == REG_ID("DSISR")) + regs->dsisr = (unsigned 
long)reg_val; +} + +static struct fadump_reg_entry* +fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + + while (reg_entry->reg_id != REG_ID("CPUEND")) { + fadump_set_regval(regs, reg_entry->reg_id, + reg_entry->reg_value); + reg_entry++; + } + reg_entry++; + return reg_entry; +} + +static u32 *fadump_append_elf_note(u32 *buf, char *name, unsigned type, + void *data, size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, &note, sizeof(note)); + buf += (sizeof(note) + 3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void fadump_final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, &note, sizeof(note)); +} + +static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) +{ + struct elf_prstatus prstatus; + + memset(&prstatus, 0, sizeof(prstatus)); + /* + * FIXME: How do i get PID? Do I really need it? + * prstatus.pr_pid = ???? + */ + elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); + buf = fadump_append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + return buf; +} + +static void fadump_update_elfcore_header(char *bufp) +{ + struct elfhdr *elf; + struct elf_phdr *phdr; + + elf = (struct elfhdr *)bufp; + bufp += sizeof(struct elfhdr); + + /* First note is a place holder for cpu notes info. 
*/ + phdr = (struct elf_phdr *)bufp; + + if (phdr->p_type == PT_NOTE) { + phdr->p_paddr = fw_dump.cpu_notes_buf; + phdr->p_offset = phdr->p_paddr; + phdr->p_filesz = fw_dump.cpu_notes_buf_size; + phdr->p_memsz = fw_dump.cpu_notes_buf_size; + } + return; +} + +static void *fadump_cpu_notes_buf_alloc(unsigned long size) +{ + void *vaddr; + struct page *page; + unsigned long order, count, i; + + order = get_order(size); + vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order); + if (!vaddr) + return NULL; + + count = 1 << order; + page = virt_to_page(vaddr); + for (i = 0; i < count; i++) + SetPageReserved(page + i); + return vaddr; +} + +static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size) +{ + struct page *page; + unsigned long order, count, i; + + order = get_order(size); + count = 1 << order; + page = virt_to_page(vaddr); + for (i = 0; i < count; i++) + ClearPageReserved(page + i); + __free_pages(page, order); +} + +/* + * Read CPU state dump data and convert it into ELF notes. + * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be + * used to access the data to allow for additional fields to be added without + * affecting compatibility. Each list of registers for a CPU starts with + * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes, + * 8 Byte ASCII identifier and 8 Byte register value. The register entry + * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part + * of register value. For more details refer to PAPR document. + * + * Only for the crashing cpu we ignore the CPU dump data and get exact + * state from fadump crash info structure populated by first kernel at the + * time of crash. 
+ */ +static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) +{ + struct fadump_reg_save_area_header *reg_header; + struct fadump_reg_entry *reg_entry; + struct fadump_crash_info_header *fdh = NULL; + void *vaddr; + unsigned long addr; + u32 num_cpus, *note_buf; + struct pt_regs regs; + int i, rc = 0, cpu = 0; + + if (!fdm->cpu_state_data.bytes_dumped) + return -EINVAL; + + addr = fdm->cpu_state_data.destination_address; + vaddr = __va(addr); + + reg_header = vaddr; + if (reg_header->magic_number != REGSAVE_AREA_MAGIC) { + printk(KERN_ERR "Unable to read register save area.\n"); + return -ENOENT; + } + pr_debug("--------CPU State Data------------\n"); + pr_debug("Magic Number: %llx\n", reg_header->magic_number); + pr_debug("NumCpuOffset: %x\n", reg_header->num_cpu_offset); + + vaddr += reg_header->num_cpu_offset; + num_cpus = *((u32 *)(vaddr)); + pr_debug("NumCpus : %u\n", num_cpus); + vaddr += sizeof(u32); + reg_entry = (struct fadump_reg_entry *)vaddr; + + /* Allocate buffer to hold cpu crash notes. 
*/ + fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t); + fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size); + note_buf = fadump_cpu_notes_buf_alloc(fw_dump.cpu_notes_buf_size); + if (!note_buf) { + printk(KERN_ERR "Failed to allocate 0x%lx bytes for " + "cpu notes buffer\n", fw_dump.cpu_notes_buf_size); + return -ENOMEM; + } + fw_dump.cpu_notes_buf = __pa(note_buf); + + pr_debug("Allocated buffer for cpu notes of size %ld at %p\n", + (num_cpus * sizeof(note_buf_t)), note_buf); + + if (fw_dump.fadumphdr_addr) + fdh = __va(fw_dump.fadumphdr_addr); + + for (i = 0; i < num_cpus; i++) { + if (reg_entry->reg_id != REG_ID("CPUSTRT")) { + printk(KERN_ERR "Unable to read CPU state data\n"); + rc = -ENOENT; + goto error_out; + } + /* Lower 4 bytes of reg_value contains logical cpu id */ + cpu = reg_entry->reg_value & FADUMP_CPU_ID_MASK; + if (fdh && !cpumask_test_cpu(cpu, &fdh->cpu_online_mask)) { + SKIP_TO_NEXT_CPU(reg_entry); + continue; + } + pr_debug("Reading register data for cpu %d...\n", cpu); + if (fdh && fdh->crashing_cpu == cpu) { + regs = fdh->regs; + note_buf = fadump_regs_to_elf_notes(note_buf, &regs); + SKIP_TO_NEXT_CPU(reg_entry); + } else { + reg_entry++; + reg_entry = fadump_read_registers(reg_entry, &regs); + note_buf = fadump_regs_to_elf_notes(note_buf, &regs); + } + } + fadump_final_note(note_buf); + + if (fdh) { + pr_debug("Updating elfcore header (%llx) with cpu notes\n", + fdh->elfcorehdr_addr); + fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr)); + } + return 0; + +error_out: + fadump_cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf), + fw_dump.cpu_notes_buf_size); + fw_dump.cpu_notes_buf = 0; + fw_dump.cpu_notes_buf_size = 0; + return rc; + +} + +/* + * Validate and process the dump data stored by firmware before exporting + * it through '/proc/vmcore'. 
+ */ +static int __init process_fadump(const struct fadump_mem_struct *fdm_active) +{ + struct fadump_crash_info_header *fdh; + int rc = 0; + + if (!fdm_active || !fw_dump.fadumphdr_addr) + return -EINVAL; + + /* Check if the dump data is valid. */ + if ((fdm_active->header.dump_status_flag == FADUMP_ERROR_FLAG) || + (fdm_active->cpu_state_data.error_flags != 0) || + (fdm_active->rmr_region.error_flags != 0)) { + printk(KERN_ERR "Dump taken by platform is not valid\n"); + return -EINVAL; + } + if ((fdm_active->rmr_region.bytes_dumped != + fdm_active->rmr_region.source_len) || + !fdm_active->cpu_state_data.bytes_dumped) { + printk(KERN_ERR "Dump taken by platform is incomplete\n"); + return -EINVAL; + } + + /* Validate the fadump crash info header */ + fdh = __va(fw_dump.fadumphdr_addr); + if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { + printk(KERN_ERR "Crash info header is not valid.\n"); + return -EINVAL; + } + + rc = fadump_build_cpu_notes(fdm_active); + if (rc) + return rc; + + /* + * We are done validating dump info and elfcore header is now ready + * to be exported. set elfcorehdr_addr so that vmcore module will + * export the elfcore header through '/proc/vmcore'. 
+ */ + elfcorehdr_addr = fdh->elfcorehdr_addr; + + return 0; +} + +static inline void fadump_add_crash_memory(unsigned long long base, + unsigned long long end) +{ + if (base == end) + return; + + pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n", + crash_mem_ranges, base, end - 1, (end - base)); + crash_memory_ranges[crash_mem_ranges].base = base; + crash_memory_ranges[crash_mem_ranges].size = end - base; + crash_mem_ranges++; +} + +static void fadump_exclude_reserved_area(unsigned long long start, + unsigned long long end) +{ + unsigned long long ra_start, ra_end; + + ra_start = fw_dump.reserve_dump_area_start; + ra_end = ra_start + fw_dump.reserve_dump_area_size; + + if ((ra_start < end) && (ra_end > start)) { + if ((start < ra_start) && (end > ra_end)) { + fadump_add_crash_memory(start, ra_start); + fadump_add_crash_memory(ra_end, end); + } else if (start < ra_start) { + fadump_add_crash_memory(start, ra_start); + } else if (ra_end < end) { + fadump_add_crash_memory(ra_end, end); + } + } else + fadump_add_crash_memory(start, end); +} + +static int fadump_init_elfcore_header(char *bufp) +{ + struct elfhdr *elf; + + elf = (struct elfhdr *) bufp; + bufp += sizeof(struct elfhdr); + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + elf->e_type = ET_CORE; + elf->e_machine = ELF_ARCH; + elf->e_version = EV_CURRENT; + elf->e_entry = 0; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_shoff = 0; + elf->e_flags = ELF_CORE_EFLAGS; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize = sizeof(struct elf_phdr); + elf->e_phnum = 0; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + + return 0; +} + +/* + * Traverse through memblock structure and setup crash memory ranges. 
These + * ranges will be used create PT_LOAD program headers in elfcore header. + */ +static void fadump_setup_crash_memory_ranges(void) +{ + struct memblock_region *reg; + unsigned long long start, end; + + pr_debug("Setup crash memory ranges.\n"); + crash_mem_ranges = 0; + /* + * add the first memory chunk (RMA_START through boot_memory_size) as + * a separate memory chunk. The reason is, at the time crash firmware + * will move the content of this memory chunk to different location + * specified during fadump registration. We need to create a separate + * program header for this chunk with the correct offset. + */ + fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size); + + for_each_memblock(memory, reg) { + start = (unsigned long long)reg->base; + end = start + (unsigned long long)reg->size; + if (start == RMA_START && end >= fw_dump.boot_memory_size) + start = fw_dump.boot_memory_size; + + /* add this range excluding the reserved dump area. */ + fadump_exclude_reserved_area(start, end); + } +} + +/* + * If the given physical address falls within the boot memory region then + * return the relocated address that points to the dump region reserved + * for saving initial boot memory contents. + */ +static inline unsigned long fadump_relocate(unsigned long paddr) +{ + if (paddr > RMA_START && paddr < fw_dump.boot_memory_size) + return fdm.rmr_region.destination_address + paddr; + else + return paddr; +} + +static int fadump_create_elfcore_headers(char *bufp) +{ + struct elfhdr *elf; + struct elf_phdr *phdr; + int i; + + fadump_init_elfcore_header(bufp); + elf = (struct elfhdr *)bufp; + bufp += sizeof(struct elfhdr); + + /* + * setup ELF PT_NOTE, place holder for cpu notes info. The notes info + * will be populated during second kernel boot after crash. Hence + * this PT_NOTE will always be the first elf note. + * + * NOTE: Any new ELF note addition should be placed after this note. 
+ */ + phdr = (struct elf_phdr *)bufp; + bufp += sizeof(struct elf_phdr); + phdr->p_type = PT_NOTE; + phdr->p_flags = 0; + phdr->p_vaddr = 0; + phdr->p_align = 0; + + phdr->p_offset = 0; + phdr->p_paddr = 0; + phdr->p_filesz = 0; + phdr->p_memsz = 0; + + (elf->e_phnum)++; + + /* setup ELF PT_NOTE for vmcoreinfo */ + phdr = (struct elf_phdr *)bufp; + bufp += sizeof(struct elf_phdr); + phdr->p_type = PT_NOTE; + phdr->p_flags = 0; + phdr->p_vaddr = 0; + phdr->p_align = 0; + + phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note()); + phdr->p_offset = phdr->p_paddr; + phdr->p_memsz = vmcoreinfo_max_size; + phdr->p_filesz = vmcoreinfo_max_size; + + /* Increment number of program headers. */ + (elf->e_phnum)++; + + /* setup PT_LOAD sections. */ + + for (i = 0; i < crash_mem_ranges; i++) { + unsigned long long mbase, msize; + mbase = crash_memory_ranges[i].base; + msize = crash_memory_ranges[i].size; + + if (!msize) + continue; + + phdr = (struct elf_phdr *)bufp; + bufp += sizeof(struct elf_phdr); + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mbase; + + if (mbase == RMA_START) { + /* + * The entire RMA region will be moved by firmware + * to the specified destination_address. Hence set + * the correct offset. + */ + phdr->p_offset = fdm.rmr_region.destination_address; + } + + phdr->p_paddr = mbase; + phdr->p_vaddr = (unsigned long)__va(mbase); + phdr->p_filesz = msize; + phdr->p_memsz = msize; + phdr->p_align = 0; + + /* Increment number of program headers. */ + (elf->e_phnum)++; + } + return 0; +} + +static unsigned long init_fadump_header(unsigned long addr) +{ + struct fadump_crash_info_header *fdh; + + if (!addr) + return 0; + + fw_dump.fadumphdr_addr = addr; + fdh = __va(addr); + addr += sizeof(struct fadump_crash_info_header); + + memset(fdh, 0, sizeof(struct fadump_crash_info_header)); + fdh->magic_number = FADUMP_CRASH_INFO_MAGIC; + fdh->elfcorehdr_addr = addr; + /* We will set the crashing cpu id in crash_fadump() during crash. 
*/ + fdh->crashing_cpu = CPU_UNKNOWN; + + return addr; +} + +static void register_fadump(void) +{ + unsigned long addr; + void *vaddr; + + /* + * If no memory is reserved then we can not register for firmware- + * assisted dump. + */ + if (!fw_dump.reserve_dump_area_size) + return; + + fadump_setup_crash_memory_ranges(); + + addr = fdm.rmr_region.destination_address + fdm.rmr_region.source_len; + /* Initialize fadump crash info header. */ + addr = init_fadump_header(addr); + vaddr = __va(addr); + + pr_debug("Creating ELF core headers at %#016lx\n", addr); + fadump_create_elfcore_headers(vaddr); + + /* register the future kernel dump with firmware. */ + register_fw_dump(&fdm); +} + +static int fadump_unregister_dump(struct fadump_mem_struct *fdm) +{ + int rc = 0; + unsigned int wait_time; + + pr_debug("Un-register firmware-assisted dump\n"); + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, + FADUMP_UNREGISTER, fdm, + sizeof(struct fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + } while (wait_time); + + if (rc) { + printk(KERN_ERR "Failed to un-register firmware-assisted dump." + " unexpected error(%d).\n", rc); + return rc; + } + fw_dump.dump_registered = 0; + return 0; +} + +static int fadump_invalidate_dump(struct fadump_mem_struct *fdm) +{ + int rc = 0; + unsigned int wait_time; + + pr_debug("Invalidating firmware-assisted dump registration\n"); + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, + FADUMP_INVALIDATE, fdm, + sizeof(struct fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + } while (wait_time); + + if (rc) { + printk(KERN_ERR "Failed to invalidate firmware-assisted dump " + "rgistration. 
unexpected error(%d).\n", rc); + return rc; + } + fw_dump.dump_active = 0; + fdm_active = NULL; + return 0; +} + +void fadump_cleanup(void) +{ + /* Invalidate the registration only if dump is active. */ + if (fw_dump.dump_active) { + init_fadump_mem_struct(&fdm, + fdm_active->cpu_state_data.destination_address); + fadump_invalidate_dump(&fdm); + } +} + +/* + * Release the memory that was reserved in early boot to preserve the memory + * contents. The released memory will be available for general use. + */ +static void fadump_release_memory(unsigned long begin, unsigned long end) +{ + unsigned long addr; + unsigned long ra_start, ra_end; + + ra_start = fw_dump.reserve_dump_area_start; + ra_end = ra_start + fw_dump.reserve_dump_area_size; + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + /* + * exclude the dump reserve area. Will reuse it for next + * fadump registration. + */ + if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start)) + continue; + + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); + } +} + +static void fadump_invalidate_release_mem(void) +{ + unsigned long reserved_area_start, reserved_area_end; + unsigned long destination_address; + + mutex_lock(&fadump_mutex); + if (!fw_dump.dump_active) { + mutex_unlock(&fadump_mutex); + return; + } + + destination_address = fdm_active->cpu_state_data.destination_address; + fadump_cleanup(); + mutex_unlock(&fadump_mutex); + + /* + * Save the current reserved memory bounds we will require them + * later for releasing the memory for general use. + */ + reserved_area_start = fw_dump.reserve_dump_area_start; + reserved_area_end = reserved_area_start + + fw_dump.reserve_dump_area_size; + /* + * Setup reserve_dump_area_start and its size so that we can + * reuse this reserved memory for Re-registration. 
+ */ + fw_dump.reserve_dump_area_start = destination_address; + fw_dump.reserve_dump_area_size = get_fadump_area_size(); + + fadump_release_memory(reserved_area_start, reserved_area_end); + if (fw_dump.cpu_notes_buf) { + fadump_cpu_notes_buf_free( + (unsigned long)__va(fw_dump.cpu_notes_buf), + fw_dump.cpu_notes_buf_size); + fw_dump.cpu_notes_buf = 0; + fw_dump.cpu_notes_buf_size = 0; + } + /* Initialize the kernel dump memory structure for FAD registration. */ + init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); +} + +static ssize_t fadump_release_memory_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!fw_dump.dump_active) + return -EPERM; + + if (buf[0] == '1') { + /* + * Take away the '/proc/vmcore'. We are releasing the dump + * memory, hence it will not be valid anymore. + */ + vmcore_cleanup(); + fadump_invalidate_release_mem(); + + } else + return -EINVAL; + return count; +} + +static ssize_t fadump_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", fw_dump.fadump_enabled); +} + +static ssize_t fadump_register_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", fw_dump.dump_registered); +} + +static ssize_t fadump_register_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret = 0; + + if (!fw_dump.fadump_enabled || fdm_active) + return -EPERM; + + mutex_lock(&fadump_mutex); + + switch (buf[0]) { + case '0': + if (fw_dump.dump_registered == 0) { + ret = -EINVAL; + goto unlock_out; + } + /* Un-register Firmware-assisted dump */ + fadump_unregister_dump(&fdm); + break; + case '1': + if (fw_dump.dump_registered == 1) { + ret = -EINVAL; + goto unlock_out; + } + /* Register Firmware-assisted dump */ + register_fadump(); + break; + default: + ret = -EINVAL; + break; + } + +unlock_out: + mutex_unlock(&fadump_mutex); + return ret < 0 ? 
ret : count; +} + +static int fadump_region_show(struct seq_file *m, void *private) +{ + const struct fadump_mem_struct *fdm_ptr; + + if (!fw_dump.fadump_enabled) + return 0; + + mutex_lock(&fadump_mutex); + if (fdm_active) + fdm_ptr = fdm_active; + else { + mutex_unlock(&fadump_mutex); + fdm_ptr = &fdm; + } + + seq_printf(m, + "CPU : [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + fdm_ptr->cpu_state_data.destination_address, + fdm_ptr->cpu_state_data.destination_address + + fdm_ptr->cpu_state_data.source_len - 1, + fdm_ptr->cpu_state_data.source_len, + fdm_ptr->cpu_state_data.bytes_dumped); + seq_printf(m, + "HPTE: [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + fdm_ptr->hpte_region.destination_address, + fdm_ptr->hpte_region.destination_address + + fdm_ptr->hpte_region.source_len - 1, + fdm_ptr->hpte_region.source_len, + fdm_ptr->hpte_region.bytes_dumped); + seq_printf(m, + "DUMP: [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + fdm_ptr->rmr_region.destination_address, + fdm_ptr->rmr_region.destination_address + + fdm_ptr->rmr_region.source_len - 1, + fdm_ptr->rmr_region.source_len, + fdm_ptr->rmr_region.bytes_dumped); + + if (!fdm_active || + (fw_dump.reserve_dump_area_start == + fdm_ptr->cpu_state_data.destination_address)) + goto out; + + /* Dump is active. Show reserved memory region. 
*/ + seq_printf(m, + " : [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + (unsigned long long)fw_dump.reserve_dump_area_start, + fdm_ptr->cpu_state_data.destination_address - 1, + fdm_ptr->cpu_state_data.destination_address - + fw_dump.reserve_dump_area_start, + fdm_ptr->cpu_state_data.destination_address - + fw_dump.reserve_dump_area_start); +out: + if (fdm_active) + mutex_unlock(&fadump_mutex); + return 0; +} + +static struct kobj_attribute fadump_release_attr = __ATTR(fadump_release_mem, + 0200, NULL, + fadump_release_memory_store); +static struct kobj_attribute fadump_attr = __ATTR(fadump_enabled, + 0444, fadump_enabled_show, + NULL); +static struct kobj_attribute fadump_register_attr = __ATTR(fadump_registered, + 0644, fadump_register_show, + fadump_register_store); + +static int fadump_region_open(struct inode *inode, struct file *file) +{ + return single_open(file, fadump_region_show, inode->i_private); +} + +static const struct file_operations fadump_region_fops = { + .open = fadump_region_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void fadump_init_files(void) +{ + struct dentry *debugfs_file; + int rc = 0; + + rc = sysfs_create_file(kernel_kobj, &fadump_attr.attr); + if (rc) + printk(KERN_ERR "fadump: unable to create sysfs file" + " fadump_enabled (%d)\n", rc); + + rc = sysfs_create_file(kernel_kobj, &fadump_register_attr.attr); + if (rc) + printk(KERN_ERR "fadump: unable to create sysfs file" + " fadump_registered (%d)\n", rc); + + debugfs_file = debugfs_create_file("fadump_region", 0444, + powerpc_debugfs_root, NULL, + &fadump_region_fops); + if (!debugfs_file) + printk(KERN_ERR "fadump: unable to create debugfs file" + " fadump_region\n"); + + if (fw_dump.dump_active) { + rc = sysfs_create_file(kernel_kobj, &fadump_release_attr.attr); + if (rc) + printk(KERN_ERR "fadump: unable to create sysfs file" + " fadump_release_mem (%d)\n", rc); + } + return; +} + +/* + * Prepare for firmware-assisted 
dump. + */ +int __init setup_fadump(void) +{ + if (!fw_dump.fadump_enabled) + return 0; + + if (!fw_dump.fadump_supported) { + printk(KERN_ERR "Firmware-assisted dump is not supported on" + " this hardware\n"); + return 0; + } + + fadump_show_config(); + /* + * If dump data is available then see if it is valid and prepare for + * saving it to the disk. + */ + if (fw_dump.dump_active) { + /* + * if dump process fails then invalidate the registration + * and release memory before proceeding for re-registration. + */ + if (process_fadump(fdm_active) < 0) + fadump_invalidate_release_mem(); + } + /* Initialize the kernel dump memory structure for FAD registration. */ + else if (fw_dump.reserve_dump_area_size) + init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); + fadump_init_files(); + + return 1; +} +subsys_initcall(setup_fadump); diff --git a/arch/powerpc/kernel/firmware.c b/arch/powerpc/kernel/firmware.c new file mode 100644 index 00000000000..2eae4478f7a --- /dev/null +++ b/arch/powerpc/kernel/firmware.c @@ -0,0 +1,22 @@ +/* + * Extracted from cputable.c + * + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + * + * Modifications for ppc64: + * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com> + * Copyright (C) 2005 Stephen Rothwell, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/export.h> +#include <linux/cache.h> + +#include <asm/firmware.h> + +unsigned long powerpc_firmware_features __read_mostly; +EXPORT_SYMBOL_GPL(powerpc_firmware_features); diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 4d6001fa1cf..9ad236e5d2c 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -2,6 +2,11 @@ * FPU support code, moved here from head.S so that it can be used * by chips which use other head-whatever.S files. * + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu> + * Copyright (C) 1996 Paul Mackerras. + * Copyright (C) 1997 Dan Malek (dmalek@jlc.net). + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -9,7 +14,6 @@ * */ -#include <linux/config.h> #include <asm/reg.h> #include <asm/page.h> #include <asm/mmu.h> @@ -19,6 +23,98 @@ #include <asm/thread_info.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> +#include <asm/ptrace.h> + +#ifdef CONFIG_VSX +#define __REST_32FPVSRS(n,c,base) \ +BEGIN_FTR_SECTION \ + b 2f; \ +END_FTR_SECTION_IFSET(CPU_FTR_VSX); \ + REST_32FPRS(n,base); \ + b 3f; \ +2: REST_32VSRS(n,c,base); \ +3: + +#define __SAVE_32FPVSRS(n,c,base) \ +BEGIN_FTR_SECTION \ + b 2f; \ +END_FTR_SECTION_IFSET(CPU_FTR_VSX); \ + SAVE_32FPRS(n,base); \ + b 3f; \ +2: SAVE_32VSRS(n,c,base); \ +3: +#else +#define __REST_32FPVSRS(n,b,base) REST_32FPRS(n, base) +#define __SAVE_32FPVSRS(n,b,base) SAVE_32FPRS(n, base) +#endif +#define REST_32FPVSRS(n,c,base) __REST_32FPVSRS(n,__REG_##c,__REG_##base) +#define SAVE_32FPVSRS(n,c,base) __SAVE_32FPVSRS(n,__REG_##c,__REG_##base) + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* void do_load_up_transact_fpu(struct thread_struct *thread) + * + * This is similar to load_up_fpu but for the transactional version of the FP + * register set. 
It doesn't mess with the task MSR or valid flags. + * Furthermore, we don't do lazy FP with TM currently. + */ +_GLOBAL(do_load_up_transact_fpu) + mfmsr r6 + ori r5,r6,MSR_FP +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + oris r5,r5,MSR_VSX@h +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif + SYNC + MTMSRD(r5) + + addi r7,r3,THREAD_TRANSACT_FPSTATE + lfd fr0,FPSTATE_FPSCR(r7) + MTFSF_L(fr0) + REST_32FPVSRS(0, R4, R7) + + /* FP/VSX off again */ + MTMSRD(r6) + SYNC + + blr +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + +/* + * Enable use of the FPU, and VSX if possible, for the caller. + */ +_GLOBAL(fp_enable) + mfmsr r3 + ori r3,r3,MSR_FP +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + oris r3,r3,MSR_VSX@h +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif + SYNC + MTMSRD(r3) + isync /* (not necessary for arch 2.02 and later) */ + blr + +/* + * Load state from memory into FP registers including FPSCR. + * Assumes the caller has enabled FP in the MSR. + */ +_GLOBAL(load_fp_state) + lfd fr0,FPSTATE_FPSCR(r3) + MTFSF_L(fr0) + REST_32FPVSRS(0, R4, R3) + blr + +/* + * Store FP state into memory, including FPSCR + * Assumes the caller has enabled FP in the MSR. + */ +_GLOBAL(store_fp_state) + SAVE_32FPVSRS(0, R4, R3) + mffs fr0 + stfd fr0,FPSTATE_FPSCR(r3) + blr /* * This task wants to use the FPU now. @@ -26,10 +122,17 @@ * and save its floating-point registers in its thread_struct. * Load up this task's FP registers from its thread_struct, * enable the FPU for the current task and return to the task. + * Note that on 32-bit this can only use registers that will be + * restored by fast_exception_return, i.e. r3 - r6, r10 and r11. */ _GLOBAL(load_up_fpu) mfmsr r5 ori r5,r5,MSR_FP +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + oris r5,r5,MSR_VSX@h +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif SYNC MTMSRD(r5) /* enable use of fpu now */ isync @@ -39,49 +142,51 @@ _GLOBAL(load_up_fpu) * to another. Instead we call giveup_fpu in switch_to. 
*/ #ifndef CONFIG_SMP - LOADBASE(r3, last_task_used_math) + LOAD_REG_ADDRBASE(r3, last_task_used_math) toreal(r3) - LDL r4,OFF(last_task_used_math)(r3) - CMPI 0,r4,0 + PPC_LL r4,ADDROFF(last_task_used_math)(r3) + PPC_LCMPI 0,r4,0 beq 1f toreal(r4) addi r4,r4,THREAD /* want last_task_used_math->thread */ - SAVE_32FPRS(0, r4) + addi r10,r4,THREAD_FPSTATE + SAVE_32FPVSRS(0, R5, R10) mffs fr0 - stfd fr0,THREAD_FPSCR(r4) - LDL r5,PT_REGS(r4) + stfd fr0,FPSTATE_FPSCR(r10) + PPC_LL r5,PT_REGS(r4) toreal(r5) - LDL r4,_MSR-STACK_FRAME_OVERHEAD(r5) + PPC_LL r4,_MSR-STACK_FRAME_OVERHEAD(r5) li r10,MSR_FP|MSR_FE0|MSR_FE1 andc r4,r4,r10 /* disable FP for previous task */ - STL r4,_MSR-STACK_FRAME_OVERHEAD(r5) + PPC_STL r4,_MSR-STACK_FRAME_OVERHEAD(r5) 1: #endif /* CONFIG_SMP */ /* enable use of FP after return */ #ifdef CONFIG_PPC32 - mfspr r5,SPRN_SPRG3 /* current task's THREAD (phys) */ + mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ lwz r4,THREAD_FPEXC_MODE(r5) ori r9,r9,MSR_FP /* enable FP for current */ or r9,r9,r4 #else ld r4,PACACURRENT(r13) addi r5,r4,THREAD /* Get THREAD */ - ld r4,THREAD_FPEXC_MODE(r5) + lwz r4,THREAD_FPEXC_MODE(r5) ori r12,r12,MSR_FP or r12,r12,r4 std r12,_MSR(r1) #endif - lfd fr0,THREAD_FPSCR(r5) - mtfsf 0xff,fr0 - REST_32FPRS(0, r5) + addi r10,r5,THREAD_FPSTATE + lfd fr0,FPSTATE_FPSCR(r10) + MTFSF_L(fr0) + REST_32FPVSRS(0, R4, R10) #ifndef CONFIG_SMP subi r4,r5,THREAD fromreal(r4) - STL r4,OFF(last_task_used_math)(r3) + PPC_STL r4,ADDROFF(last_task_used_math)(r3) #endif /* CONFIG_SMP */ /* restore registers and return */ /* we haven't used ctr or xer or lr */ - b fast_exception_return + blr /* * giveup_fpu(tsk) @@ -92,53 +197,57 @@ _GLOBAL(load_up_fpu) _GLOBAL(giveup_fpu) mfmsr r5 ori r5,r5,MSR_FP +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + oris r5,r5,MSR_VSX@h +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif SYNC_601 ISYNC_601 MTMSRD(r5) /* enable use of fpu now */ SYNC_601 isync - CMPI 0,r3,0 + PPC_LCMPI 0,r3,0 beqlr- /* if no previous 
owner, done */ addi r3,r3,THREAD /* want THREAD of task */ - LDL r5,PT_REGS(r3) - CMPI 0,r5,0 - SAVE_32FPRS(0, r3) + PPC_LL r6,THREAD_FPSAVEAREA(r3) + PPC_LL r5,PT_REGS(r3) + PPC_LCMPI 0,r6,0 + bne 2f + addi r6,r3,THREAD_FPSTATE +2: PPC_LCMPI 0,r5,0 + SAVE_32FPVSRS(0, R4, R6) mffs fr0 - stfd fr0,THREAD_FPSCR(r3) + stfd fr0,FPSTATE_FPSCR(r6) beq 1f - LDL r4,_MSR-STACK_FRAME_OVERHEAD(r5) + PPC_LL r4,_MSR-STACK_FRAME_OVERHEAD(r5) li r3,MSR_FP|MSR_FE0|MSR_FE1 +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + oris r3,r3,MSR_VSX@h +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif andc r4,r4,r3 /* disable FP for previous task */ - STL r4,_MSR-STACK_FRAME_OVERHEAD(r5) + PPC_STL r4,_MSR-STACK_FRAME_OVERHEAD(r5) 1: #ifndef CONFIG_SMP li r5,0 - LOADBASE(r4,last_task_used_math) - STL r5,OFF(last_task_used_math)(r4) + LOAD_REG_ADDRBASE(r4,last_task_used_math) + PPC_STL r5,ADDROFF(last_task_used_math)(r4) #endif /* CONFIG_SMP */ blr /* * These are used in the alignment trap handler when emulating * single-precision loads and stores. - * We restore and save the fpscr so the task gets the same result - * and exceptions as if the cpu had performed the load or store. */ _GLOBAL(cvt_fd) - lfd 0,THREAD_FPSCR(r5) /* load up fpscr value */ - mtfsf 0xff,0 lfs 0,0(r3) stfd 0,0(r4) - mffs 0 - stfd 0,THREAD_FPSCR(r5) /* save new fpscr value */ blr _GLOBAL(cvt_df) - lfd 0,THREAD_FPSCR(r5) /* load up fpscr value */ - mtfsf 0xff,0 lfd 0,0(r3) stfs 0,0(r4) - mffs 0 - stfd 0,THREAD_FPSCR(r5) /* save new fpscr value */ blr diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/fsl_booke_entry_mapping.S new file mode 100644 index 00000000000..f22e7e44fbf --- /dev/null +++ b/arch/powerpc/kernel/fsl_booke_entry_mapping.S @@ -0,0 +1,237 @@ + +/* 1. 
Find the index of the entry we're executing in */ + bl invstr /* Find our address */ +invstr: mflr r6 /* Make it accessible */ + mfmsr r7 + rlwinm r4,r7,27,31,31 /* extract MSR[IS] */ + mfspr r7, SPRN_PID0 + slwi r7,r7,16 + or r7,r7,r4 + mtspr SPRN_MAS6,r7 + tlbsx 0,r6 /* search MSR[IS], SPID=PID0 */ + mfspr r7,SPRN_MAS1 + andis. r7,r7,MAS1_VALID@h + bne match_TLB + + mfspr r7,SPRN_MMUCFG + rlwinm r7,r7,21,28,31 /* extract MMUCFG[NPIDS] */ + cmpwi r7,3 + bne match_TLB /* skip if NPIDS != 3 */ + + mfspr r7,SPRN_PID1 + slwi r7,r7,16 + or r7,r7,r4 + mtspr SPRN_MAS6,r7 + tlbsx 0,r6 /* search MSR[IS], SPID=PID1 */ + mfspr r7,SPRN_MAS1 + andis. r7,r7,MAS1_VALID@h + bne match_TLB + mfspr r7, SPRN_PID2 + slwi r7,r7,16 + or r7,r7,r4 + mtspr SPRN_MAS6,r7 + tlbsx 0,r6 /* Fall through, we had to match */ + +match_TLB: + mfspr r7,SPRN_MAS0 + rlwinm r3,r7,16,20,31 /* Extract MAS0(Entry) */ + + mfspr r7,SPRN_MAS1 /* Insure IPROT set */ + oris r7,r7,MAS1_IPROT@h + mtspr SPRN_MAS1,r7 + tlbwe + +/* 2. Invalidate all entries except the entry we're executing in */ + mfspr r9,SPRN_TLB1CFG + andi. r9,r9,0xfff + li r6,0 /* Set Entry counter to 0 */ +1: lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r7,r6,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r6) */ + mtspr SPRN_MAS0,r7 + tlbre + mfspr r7,SPRN_MAS1 + rlwinm r7,r7,0,2,31 /* Clear MAS1 Valid and IPROT */ + cmpw r3,r6 + beq skpinv /* Dont update the current execution TLB */ + mtspr SPRN_MAS1,r7 + tlbwe + isync +skpinv: addi r6,r6,1 /* Increment */ + cmpw r6,r9 /* Are we done? */ + bne 1b /* If not, repeat */ + + /* Invalidate TLB0 */ + li r6,0x04 + tlbivax 0,r6 + TLBSYNC + /* Invalidate TLB1 */ + li r6,0x0c + tlbivax 0,r6 + TLBSYNC + +/* 3. Setup a temp mapping and jump to it */ + andi. 
r5, r3, 0x1 /* Find an entry not used and is non-zero */ + addi r5, r5, 0x1 + lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r7,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ + mtspr SPRN_MAS0,r7 + tlbre + + /* grab and fixup the RPN */ + mfspr r6,SPRN_MAS1 /* extract MAS1[SIZE] */ + rlwinm r6,r6,25,27,31 + li r8,-1 + addi r6,r6,10 + slw r6,r8,r6 /* convert to mask */ + + bl 1f /* Find our address */ +1: mflr r7 + + mfspr r8,SPRN_MAS3 +#ifdef CONFIG_PHYS_64BIT + mfspr r23,SPRN_MAS7 +#endif + and r8,r6,r8 + subfic r9,r6,-4096 + and r9,r9,r7 + + or r25,r8,r9 + ori r8,r25,(MAS3_SX|MAS3_SW|MAS3_SR) + + /* Just modify the entry ID and EPN for the temp mapping */ + lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r7,r5,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r5) */ + mtspr SPRN_MAS0,r7 + xori r6,r4,1 /* Setup TMP mapping in the other Address space */ + slwi r6,r6,12 + oris r6,r6,(MAS1_VALID|MAS1_IPROT)@h + ori r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_4K))@l + mtspr SPRN_MAS1,r6 + mfspr r6,SPRN_MAS2 + li r7,0 /* temp EPN = 0 */ + rlwimi r7,r6,0,20,31 + mtspr SPRN_MAS2,r7 + mtspr SPRN_MAS3,r8 + tlbwe + + xori r6,r4,1 + slwi r6,r6,5 /* setup new context with other address space */ + bl 1f /* Find our address */ +1: mflr r9 + rlwimi r7,r9,0,20,31 + addi r7,r7,(2f - 1b) + mtspr SPRN_SRR0,r7 + mtspr SPRN_SRR1,r6 + rfi +2: +/* 4. Clear out PIDs & Search info */ + li r6,0 + mtspr SPRN_MAS6,r6 + mtspr SPRN_PID0,r6 + + mfspr r7,SPRN_MMUCFG + rlwinm r7,r7,21,28,31 /* extract MMUCFG[NPIDS] */ + cmpwi r7,3 + bne 2f /* skip if NPIDS != 3 */ + + mtspr SPRN_PID1,r6 + mtspr SPRN_PID2,r6 + +/* 5. 
Invalidate mapping we started in */ +2: + lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r7,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ + mtspr SPRN_MAS0,r7 + tlbre + mfspr r6,SPRN_MAS1 + rlwinm r6,r6,0,2,0 /* clear IPROT */ + mtspr SPRN_MAS1,r6 + tlbwe + /* Invalidate TLB1 */ + li r9,0x0c + tlbivax 0,r9 + TLBSYNC + +/* The mapping only needs to be cache-coherent on SMP */ +#ifdef CONFIG_SMP +#define M_IF_SMP MAS2_M +#else +#define M_IF_SMP 0 +#endif + +#if defined(ENTRY_MAPPING_BOOT_SETUP) + +/* 6. Setup KERNELBASE mapping in TLB1[0] */ + lis r6,0x1000 /* Set MAS0(TLBSEL) = TLB1(1), ESEL = 0 */ + mtspr SPRN_MAS0,r6 + lis r6,(MAS1_VALID|MAS1_IPROT)@h + ori r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_64M))@l + mtspr SPRN_MAS1,r6 + lis r6,MAS2_VAL(PAGE_OFFSET, BOOK3E_PAGESZ_64M, M_IF_SMP)@h + ori r6,r6,MAS2_VAL(PAGE_OFFSET, BOOK3E_PAGESZ_64M, M_IF_SMP)@l + mtspr SPRN_MAS2,r6 + mtspr SPRN_MAS3,r8 + tlbwe + +/* 7. Jump to KERNELBASE mapping */ + lis r6,(KERNELBASE & ~0xfff)@h + ori r6,r6,(KERNELBASE & ~0xfff)@l + rlwinm r7,r25,0,0x03ffffff + add r6,r7,r6 + +#elif defined(ENTRY_MAPPING_KEXEC_SETUP) +/* + * 6. Setup a 1:1 mapping in TLB1. Esel 0 is unsued, 1 or 2 contains the tmp + * mapping so we start at 3. We setup 8 mappings, each 256MiB in size. This + * will cover the first 2GiB of memory. + */ + + lis r10, (MAS1_VALID|MAS1_IPROT)@h + ori r10,r10, (MAS1_TSIZE(BOOK3E_PAGESZ_256M))@l + li r11, 0 + li r0, 8 + mtctr r0 + +next_tlb_setup: + addi r0, r11, 3 + rlwinm r0, r0, 16, 4, 15 // Compute esel + rlwinm r9, r11, 28, 0, 3 // Compute [ER]PN + oris r0, r0, (MAS0_TLBSEL(1))@h + mtspr SPRN_MAS0,r0 + mtspr SPRN_MAS1,r10 + mtspr SPRN_MAS2,r9 + ori r9, r9, (MAS3_SX|MAS3_SW|MAS3_SR) + mtspr SPRN_MAS3,r9 + tlbwe + addi r11, r11, 1 + bdnz+ next_tlb_setup + +/* 7. Jump to our 1:1 mapping */ + mr r6, r25 +#else + #error You need to specify the mapping or not use this at all. 
+#endif + + lis r7,MSR_KERNEL@h + ori r7,r7,MSR_KERNEL@l + bl 1f /* Find our address */ +1: mflr r9 + rlwimi r6,r9,0,20,31 + addi r6,r6,(2f - 1b) + mtspr SPRN_SRR0,r6 + mtspr SPRN_SRR1,r7 + rfi /* start execution out of TLB1[0] entry */ + +/* 8. Clear out the temp mapping */ +2: lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r7,r5,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r5) */ + mtspr SPRN_MAS0,r7 + tlbre + mfspr r8,SPRN_MAS1 + rlwinm r8,r8,0,2,0 /* clear IPROT */ + mtspr SPRN_MAS1,r8 + tlbwe + /* Invalidate TLB1 */ + li r9,0x0c + tlbivax 0,r9 + TLBSYNC diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c new file mode 100644 index 00000000000..d178834fe50 --- /dev/null +++ b/arch/powerpc/kernel/ftrace.c @@ -0,0 +1,591 @@ +/* + * Code for replacing ftrace calls with jumps. + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * + * Thanks goes out to P.A. Semi, Inc for supplying me with a PPC64 box. + * + * Added function graph tracer code, taken from x86 that was written + * by Frederic Weisbecker, and ported to PPC by Steven Rostedt. + * + */ + +#define pr_fmt(fmt) "ftrace-powerpc: " fmt + +#include <linux/spinlock.h> +#include <linux/hardirq.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/list.h> + +#include <asm/cacheflush.h> +#include <asm/code-patching.h> +#include <asm/ftrace.h> +#include <asm/syscall.h> + + +#ifdef CONFIG_DYNAMIC_FTRACE +static unsigned int +ftrace_call_replace(unsigned long ip, unsigned long addr, int link) +{ + unsigned int op; + + addr = ppc_function_entry((void *)addr); + + /* if (link) set op to 'bl' else 'b' */ + op = create_branch((unsigned int *)ip, addr, link ? 
1 : 0); + + return op; +} + +static int +ftrace_modify_code(unsigned long ip, unsigned int old, unsigned int new) +{ + unsigned int replaced; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. We do this by using the + * probe_kernel_* functions. + * + * No real locking needed, this code is run through + * kstop_machine, or before SMP starts. + */ + + /* read the text we want to modify */ + if (probe_kernel_read(&replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (replaced != old) + return -EINVAL; + + /* replace the text with the new text */ + if (patch_instruction((unsigned int *)ip, new)) + return -EPERM; + + return 0; +} + +/* + * Helper functions that are the same for both PPC64 and PPC32. + */ +static int test_24bit_addr(unsigned long ip, unsigned long addr) +{ + addr = ppc_function_entry((void *)addr); + + /* use the create_branch to verify that this offset can be branched */ + return create_branch((unsigned int *)ip, addr, 0); +} + +#ifdef CONFIG_MODULES + +static int is_bl_op(unsigned int op) +{ + return (op & 0xfc000003) == 0x48000001; +} + +static unsigned long find_bl_target(unsigned long ip, unsigned int op) +{ + static int offset; + + offset = (op & 0x03fffffc); + /* make it signed */ + if (offset & 0x02000000) + offset |= 0xfe000000; + + return ip + (long)offset; +} + +#ifdef CONFIG_PPC64 +static int +__ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned int op; + unsigned long entry, ptr; + unsigned long ip = rec->ip; + void *tramp; + + /* read where this goes */ + if (probe_kernel_read(&op, (void *)ip, sizeof(int))) + return -EFAULT; + + /* Make sure that that this is still a 24bit jump */ + if (!is_bl_op(op)) { + pr_err("Not expected bl: opcode is %x\n", op); + return -EINVAL; + } + + /* lets find where the pointer goes */ + tramp = (void 
*)find_bl_target(ip, op); + + pr_devel("ip:%lx jumps to %p", ip, tramp); + + if (!is_module_trampoline(tramp)) { + pr_err("Not a trampoline\n"); + return -EINVAL; + } + + if (module_trampoline_target(mod, tramp, &ptr)) { + pr_err("Failed to get trampoline target\n"); + return -EFAULT; + } + + pr_devel("trampoline target %lx", ptr); + + entry = ppc_global_function_entry((void *)addr); + /* This should match what was called */ + if (ptr != entry) { + pr_err("addr %lx does not match expected %lx\n", ptr, entry); + return -EINVAL; + } + + /* + * Our original call site looks like: + * + * bl <tramp> + * ld r2,XX(r1) + * + * Milton Miller pointed out that we can not simply nop the branch. + * If a task was preempted when calling a trace function, the nops + * will remove the way to restore the TOC in r2 and the r2 TOC will + * get corrupted. + * + * Use a b +8 to jump over the load. + */ + op = 0x48000008; /* b +8 */ + + if (patch_instruction((unsigned int *)ip, op)) + return -EPERM; + + return 0; +} + +#else /* !PPC64 */ +static int +__ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned int op; + unsigned int jmp[4]; + unsigned long ip = rec->ip; + unsigned long tramp; + + if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure that that this is still a 24bit jump */ + if (!is_bl_op(op)) { + pr_err("Not expected bl: opcode is %x\n", op); + return -EINVAL; + } + + /* lets find where the pointer goes */ + tramp = find_bl_target(ip, op); + + /* + * On PPC32 the trampoline looks like: + * 0x3d, 0x80, 0x00, 0x00 lis r12,sym@ha + * 0x39, 0x8c, 0x00, 0x00 addi r12,r12,sym@l + * 0x7d, 0x89, 0x03, 0xa6 mtctr r12 + * 0x4e, 0x80, 0x04, 0x20 bctr + */ + + pr_devel("ip:%lx jumps to %lx", ip, tramp); + + /* Find where the trampoline jumps to */ + if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) { + pr_err("Failed to read %lx\n", tramp); + return -EFAULT; + } + + pr_devel(" %08x %08x ", jmp[0], 
jmp[1]); + + /* verify that this is what we expect it to be */ + if (((jmp[0] & 0xffff0000) != 0x3d800000) || + ((jmp[1] & 0xffff0000) != 0x398c0000) || + (jmp[2] != 0x7d8903a6) || + (jmp[3] != 0x4e800420)) { + pr_err("Not a trampoline\n"); + return -EINVAL; + } + + tramp = (jmp[1] & 0xffff) | + ((jmp[0] & 0xffff) << 16); + if (tramp & 0x8000) + tramp -= 0x10000; + + pr_devel(" %lx ", tramp); + + if (tramp != addr) { + pr_err("Trampoline location %08lx does not match addr\n", + tramp); + return -EINVAL; + } + + op = PPC_INST_NOP; + + if (patch_instruction((unsigned int *)ip, op)) + return -EPERM; + + return 0; +} +#endif /* PPC64 */ +#endif /* CONFIG_MODULES */ + +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned int old, new; + + /* + * If the calling address is more that 24 bits away, + * then we had to use a trampoline to make the call. + * Otherwise just update the call site. + */ + if (test_24bit_addr(ip, addr)) { + /* within range */ + old = ftrace_call_replace(ip, addr, 1); + new = PPC_INST_NOP; + return ftrace_modify_code(ip, old, new); + } + +#ifdef CONFIG_MODULES + /* + * Out of range jumps are called from modules. + * We should either already have a pointer to the module + * or it has been passed in. 
+ */ + if (!rec->arch.mod) { + if (!mod) { + pr_err("No module loaded addr=%lx\n", addr); + return -EFAULT; + } + rec->arch.mod = mod; + } else if (mod) { + if (mod != rec->arch.mod) { + pr_err("Record mod %p not equal to passed in mod %p\n", + rec->arch.mod, mod); + return -EINVAL; + } + /* nothing to do if mod == rec->arch.mod */ + } else + mod = rec->arch.mod; + + return __ftrace_make_nop(mod, rec, addr); +#else + /* We should not get here without modules */ + return -EINVAL; +#endif /* CONFIG_MODULES */ +} + +#ifdef CONFIG_MODULES +#ifdef CONFIG_PPC64 +static int +__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned int op[2]; + void *ip = (void *)rec->ip; + + /* read where this goes */ + if (probe_kernel_read(op, ip, sizeof(op))) + return -EFAULT; + + /* + * We expect to see: + * + * b +8 + * ld r2,XX(r1) + * + * The load offset is different depending on the ABI. For simplicity + * just mask it out when doing the compare. + */ + if ((op[0] != 0x48000008) || ((op[1] & 0xffff0000) != 0xe8410000)) { + pr_err("Unexpected call sequence: %x %x\n", op[0], op[1]); + return -EINVAL; + } + + /* If we never set up a trampoline to ftrace_caller, then bail */ + if (!rec->arch.mod->arch.tramp) { + pr_err("No ftrace trampoline\n"); + return -EINVAL; + } + + /* Ensure branch is within 24 bits */ + if (!create_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) { + pr_err("Branch out of range\n"); + return -EINVAL; + } + + if (patch_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) { + pr_err("REL24 out of range!\n"); + return -EINVAL; + } + + return 0; +} +#else +static int +__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned int op; + unsigned long ip = rec->ip; + + /* read where this goes */ + if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* It should be pointing to a nop */ + if (op != PPC_INST_NOP) { + pr_err("Expected NOP but have %x\n", op); + return -EINVAL; + } + + /* If we 
never set up a trampoline to ftrace_caller, then bail */ + if (!rec->arch.mod->arch.tramp) { + pr_err("No ftrace trampoline\n"); + return -EINVAL; + } + + /* create the branch to the trampoline */ + op = create_branch((unsigned int *)ip, + rec->arch.mod->arch.tramp, BRANCH_SET_LINK); + if (!op) { + pr_err("REL24 out of range!\n"); + return -EINVAL; + } + + pr_devel("write to %lx\n", rec->ip); + + if (patch_instruction((unsigned int *)ip, op)) + return -EPERM; + + return 0; +} +#endif /* CONFIG_PPC64 */ +#endif /* CONFIG_MODULES */ + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned int old, new; + + /* + * If the calling address is more that 24 bits away, + * then we had to use a trampoline to make the call. + * Otherwise just update the call site. + */ + if (test_24bit_addr(ip, addr)) { + /* within range */ + old = PPC_INST_NOP; + new = ftrace_call_replace(ip, addr, 1); + return ftrace_modify_code(ip, old, new); + } + +#ifdef CONFIG_MODULES + /* + * Out of range jumps are called from modules. + * Being that we are converting from nop, it had better + * already have a module defined. 
+ */ + if (!rec->arch.mod) { + pr_err("No module loaded\n"); + return -EINVAL; + } + + return __ftrace_make_call(rec, addr); +#else + /* We should not get here without modules */ + return -EINVAL; +#endif /* CONFIG_MODULES */ +} + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned int old, new; + int ret; + + old = *(unsigned int *)&ftrace_call; + new = ftrace_call_replace(ip, (unsigned long)func, 1); + ret = ftrace_modify_code(ip, old, new); + + return ret; +} + +static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr = (unsigned long)FTRACE_ADDR; + int ret; + + ret = ftrace_update_record(rec, enable); + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + case FTRACE_UPDATE_MAKE_CALL: + return ftrace_make_call(rec, ftrace_addr); + case FTRACE_UPDATE_MAKE_NOP: + return ftrace_make_nop(NULL, rec, ftrace_addr); + } + + return 0; +} + +void ftrace_replace_code(int enable) +{ + struct ftrace_rec_iter *iter; + struct dyn_ftrace *rec; + int ret; + + for (iter = ftrace_rec_iter_start(); iter; + iter = ftrace_rec_iter_next(iter)) { + rec = ftrace_rec_iter_record(iter); + ret = __ftrace_replace_code(rec, enable); + if (ret) { + ftrace_bug(ret, rec->ip); + return; + } + } +} + +void arch_ftrace_update_code(int command) +{ + if (command & FTRACE_UPDATE_CALLS) + ftrace_replace_code(1); + else if (command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (command & FTRACE_UPDATE_TRACE_FUNC) + ftrace_update_ftrace_func(ftrace_trace_function); + + if (command & FTRACE_START_FUNC_RET) + ftrace_enable_ftrace_graph_caller(); + else if (command & FTRACE_STOP_FUNC_RET) + ftrace_disable_ftrace_graph_caller(); +} + +int __init ftrace_dyn_arch_init(void) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_graph_call(void); +extern void ftrace_graph_stub(void); + +int 
ftrace_enable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + unsigned long addr = (unsigned long)(&ftrace_graph_caller); + unsigned long stub = (unsigned long)(&ftrace_graph_stub); + unsigned int old, new; + + old = ftrace_call_replace(ip, stub, 0); + new = ftrace_call_replace(ip, addr, 0); + + return ftrace_modify_code(ip, old, new); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + unsigned long addr = (unsigned long)(&ftrace_graph_caller); + unsigned long stub = (unsigned long)(&ftrace_graph_stub); + unsigned int old, new; + + old = ftrace_call_replace(ip, addr, 0); + new = ftrace_call_replace(ip, stub, 0); + + return ftrace_modify_code(ip, old, new); +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_PPC64 +extern void mod_return_to_handler(void); +#endif + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +{ + unsigned long old; + int faulted; + struct ftrace_graph_ent trace; + unsigned long return_hooker = (unsigned long)&return_to_handler; + + if (unlikely(atomic_read(&current->tracing_graph_pause))) + return; + +#ifdef CONFIG_PPC64 + /* non core kernel code needs to save and restore the TOC */ + if (REGION_ID(self_addr) != KERNEL_REGION_ID) + return_hooker = (unsigned long)&mod_return_to_handler; +#endif + + return_hooker = ppc_function_entry((void *)return_hooker); + + /* + * Protect against fault, even if it shouldn't + * happen. This tool is too much intrusive to + * ignore such a protection.
+ */ + asm volatile( + "1: " PPC_LL "%[old], 0(%[parent])\n" + "2: " PPC_STL "%[return_hooker], 0(%[parent])\n" + " li %[faulted], 0\n" + "3:\n" + + ".section .fixup, \"ax\"\n" + "4: li %[faulted], 1\n" + " b 3b\n" + ".previous\n" + + ".section __ex_table,\"a\"\n" + PPC_LONG_ALIGN "\n" + PPC_LONG "1b,4b\n" + PPC_LONG "2b,4b\n" + ".previous" + + : [old] "=&r" (old), [faulted] "=r" (faulted) + : [parent] "r" (parent), [return_hooker] "r" (return_hooker) + : "memory" + ); + + if (unlikely(faulted)) { + ftrace_graph_stop(); + WARN_ON(1); + return; + } + + trace.func = self_addr; + trace.depth = current->curr_ret_stack + 1; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) { + *parent = old; + return; + } + + if (ftrace_push_return_trace(old, self_addr, &trace.depth, 0) == -EBUSY) + *parent = old; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_PPC64) +unsigned long __init arch_syscall_addr(int nr) +{ + return sys_call_table[nr*2]; +} +#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index b102e3a2415..dc0488b6f6e 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -9,7 +9,6 @@ * rewritten by Paul Mackerras. * Copyright (C) 1996 Paul Mackerras. * MPC8xx modifications Copyright (C) 1997 Dan Malek (dmalek@jlc.net). - * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk). * * This file contains the low-level support and setup for the * PowerPC platform, including trap and interrupt dispatch. 
@@ -22,7 +21,7 @@ * */ -#include <linux/config.h> +#include <linux/init.h> #include <asm/reg.h> #include <asm/page.h> #include <asm/mmu.h> @@ -32,10 +31,9 @@ #include <asm/thread_info.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> - -#ifdef CONFIG_APUS -#include <asm/amigappc.h> -#endif +#include <asm/ptrace.h> +#include <asm/bug.h> +#include <asm/kvm_book3s_asm.h> /* 601 only have IBAT; cr0.eq is set on 601 when using this macro */ #define LOAD_BAT(n, reg, RA, RB) \ @@ -54,20 +52,17 @@ mtspr SPRN_DBAT##n##L,RB; \ 1: - .text + __HEAD .stabs "arch/powerpc/kernel/",N_SO,0,0,0f .stabs "head_32.S",N_SO,0,0,0f 0: - .globl _stext -_stext: +_ENTRY(_stext); /* * _start is defined this way because the XCOFF loader in the OpenFirmware * on the powermac expects the entry point to be a procedure descriptor. */ - .text - .globl _start -_start: +_ENTRY(_start); /* * These are here for legacy reasons, the kernel used to * need to look like a coff function entry for the pmac @@ -93,11 +88,6 @@ _start: * r4: virtual address of boot_infos_t * r5: 0 * - * APUS - * r3: 'APUS' - * r4: physical address of memory base - * Linux/m68k style BootInfo structure at &_end. - * * PREP * This is jumped to on prep systems right after the kernel is relocated * to its proper place in memory by the boot loader. The expected layout @@ -122,11 +112,34 @@ __start: */ cmpwi 0,r5,0 beq 1f + +#ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE + /* find out where we are now */ + bcl 20,31,$+4 +0: mflr r8 /* r8 = runtime addr here */ + addis r8,r8,(_stext - 0b)@ha + addi r8,r8,(_stext - 0b)@l /* current runtime base addr */ bl prom_init +#endif /* CONFIG_PPC_OF_BOOT_TRAMPOLINE */ + + /* We never return. 
We also hit that trap if trying to boot + * from OF while CONFIG_PPC_OF_BOOT_TRAMPOLINE isn't selected */ + trap + +/* + * Check for BootX signature when supporting PowerMac and branch to + * appropriate trampoline if it's present + */ +#ifdef CONFIG_PPC_PMAC +1: lis r31,0x426f + ori r31,r31,0x6f58 + cmpw 0,r3,r31 + bne 1f + bl bootx_init trap +#endif /* CONFIG_PPC_PMAC */ -1: mr r31,r3 /* save parameters */ - mr r30,r4 +1: mr r31,r3 /* save device tree ptr */ li r24,0 /* cpu # */ /* @@ -136,14 +149,6 @@ __start: */ bl early_init -#ifdef CONFIG_APUS -/* On APUS the __va/__pa constants need to be set to the correct - * values before continuing. - */ - mr r4,r30 - bl fix_mem_constants -#endif /* CONFIG_APUS */ - /* Switch MMU off, clear BATs and flush TLB. At this point, r3 contains * the physical address we are running at, returned by early_init() */ @@ -153,6 +158,15 @@ __after_mmu_off: bl flush_tlbs bl initial_bats +#if defined(CONFIG_BOOTX_TEXT) + bl setup_disp_bat +#endif +#ifdef CONFIG_PPC_EARLY_DEBUG_CPM + bl setup_cpm_bat +#endif +#ifdef CONFIG_PPC_EARLY_DEBUG_USBGECKO + bl setup_usbgecko_bat +#endif /* * Call setup_cpu for CPU 0 and initialize 6xx Idle @@ -166,7 +180,6 @@ __after_mmu_off: #endif /* CONFIG_6xx */ -#ifndef CONFIG_APUS /* * We need to run with _start at physical address 0. * On CHRP, we are loaded at 0x10000 since OF on CHRP uses @@ -177,9 +190,9 @@ __after_mmu_off: bl reloc_offset mr r26,r3 addis r4,r3,KERNELBASE@h /* current address of _start */ - cmpwi 0,r4,0 /* are we already running at 0? */ + lis r5,PHYSICAL_START@h + cmplw 0,r4,r5 /* already running at PHYSICAL_START? */ bne relocate_kernel -#endif /* CONFIG_APUS */ /* * we now have the 1st 16M of ram mapped with the bats. * prep needs the mmu to be turned on here, but pmac already has it on. @@ -234,8 +247,8 @@ __secondary_hold_acknowledge: * task's thread_struct. 
*/ #define EXCEPTION_PROLOG \ - mtspr SPRN_SPRG0,r10; \ - mtspr SPRN_SPRG1,r11; \ + mtspr SPRN_SPRG_SCRATCH0,r10; \ + mtspr SPRN_SPRG_SCRATCH1,r11; \ mfcr r10; \ EXCEPTION_PROLOG_1; \ EXCEPTION_PROLOG_2 @@ -245,7 +258,7 @@ __secondary_hold_acknowledge: andi. r11,r11,MSR_PR; \ tophys(r11,r1); /* use tophys(r1) if kernel */ \ beq 1f; \ - mfspr r11,SPRN_SPRG3; \ + mfspr r11,SPRN_SPRG_THREAD; \ lwz r11,THREAD_INFO-THREAD(r11); \ addi r11,r11,THREAD_SIZE; \ tophys(r11,r11); \ @@ -257,9 +270,9 @@ __secondary_hold_acknowledge: stw r10,_CCR(r11); /* save registers */ \ stw r12,GPR12(r11); \ stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG0; \ + mfspr r10,SPRN_SPRG_SCRATCH0; \ stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG1; \ + mfspr r12,SPRN_SPRG_SCRATCH1; \ stw r12,GPR11(r11); \ mflr r10; \ stw r10,_LINK(r11); \ @@ -271,8 +284,8 @@ __secondary_hold_acknowledge: li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR); /* can take exceptions */ \ MTMSRD(r10); /* (except for mach check in rtas) */ \ stw r0,GPR0(r11); \ - lis r10,0x7265; /* put exception frame marker */ \ - addi r10,r10,0x6773; \ + lis r10,STACK_FRAME_REGS_MARKER@ha; /* exception frame marker */ \ + addi r10,r10,STACK_FRAME_REGS_MARKER@l; \ stw r10,8(r11); \ SAVE_4GPRS(3, r11); \ SAVE_2GPRS(7, r11) @@ -290,6 +303,7 @@ __secondary_hold_acknowledge: */ #define EXCEPTION(n, label, hdlr, xfer) \ . = n; \ + DO_KVM n; \ label: \ EXCEPTION_PROLOG; \ addi r3,r1,STACK_FRAME_OVERHEAD; \ @@ -327,12 +341,7 @@ i##n: \ /* System reset */ /* core99 pmac starts the seconary here by changing the vector, and putting it back to what it was (unknown_exception) when done. */ -#if defined(CONFIG_GEMINI) && defined(CONFIG_SMP) - . = 0x100 - b __secondary_start_gemini -#else EXCEPTION(0x100, Reset, unknown_exception, EXC_XFER_STD) -#endif /* Machine check */ /* @@ -350,11 +359,12 @@ i##n: \ * -- paulus. */ . 
= 0x200 - mtspr SPRN_SPRG0,r10 - mtspr SPRN_SPRG1,r11 + DO_KVM 0x200 + mtspr SPRN_SPRG_SCRATCH0,r10 + mtspr SPRN_SPRG_SCRATCH1,r11 mfcr r10 #ifdef CONFIG_PPC_CHRP - mfspr r11,SPRN_SPRG2 + mfspr r11,SPRN_SPRG_RTAS cmpwi 0,r11,0 bne 7f #endif /* CONFIG_PPC_CHRP */ @@ -362,7 +372,7 @@ i##n: \ 7: EXCEPTION_PROLOG_2 addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC_CHRP - mfspr r4,SPRN_SPRG2 + mfspr r4,SPRN_SPRG_RTAS cmpwi cr1,r4,0 bne cr1,1f #endif @@ -373,22 +383,24 @@ i##n: \ /* Data access exception. */ . = 0x300 + DO_KVM 0x300 DataAccess: EXCEPTION_PROLOG mfspr r10,SPRN_DSISR + stw r10,_DSISR(r11) andis. r0,r10,0xa470 /* weird error? */ bne 1f /* if not, try to put a PTE */ mfspr r4,SPRN_DAR /* into the hash table */ rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */ bl hash_page -1: stw r10,_DSISR(r11) - mr r5,r10 +1: lwz r5,_DSISR(r11) /* get DSISR value */ mfspr r4,SPRN_DAR - EXC_XFER_EE_LITE(0x300, handle_page_fault) + EXC_XFER_LITE(0x300, handle_page_fault) /* Instruction access exception. */ . = 0x400 + DO_KVM 0x400 InstructionAccess: EXCEPTION_PROLOG andis. r0,r9,0x4000 /* no pte found? */ @@ -398,13 +410,14 @@ InstructionAccess: bl hash_page 1: mr r4,r12 mr r5,r9 - EXC_XFER_EE_LITE(0x400, handle_page_fault) + EXC_XFER_LITE(0x400, handle_page_fault) /* External interrupt */ EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE) /* Alignment exception */ . = 0x600 + DO_KVM 0x600 Alignment: EXCEPTION_PROLOG mfspr r4,SPRN_DAR @@ -419,10 +432,20 @@ Alignment: /* Floating-point unavailable */ . = 0x800 + DO_KVM 0x800 FPUnavailable: +BEGIN_FTR_SECTION +/* + * Certain Freescale cores don't have a FPU and treat fp instructions + * as a FP Unavailable exception. Redirect to illegal/emulation handling. 
+ */ + b ProgramCheck +END_FTR_SECTION_IFSET(CPU_FTR_FPU_UNAVAILABLE) EXCEPTION_PROLOG - bne load_up_fpu /* if from user, just load it up */ - addi r3,r1,STACK_FRAME_OVERHEAD + beq 1f + bl load_up_fpu /* if from user, just load it up */ + b fast_exception_return +1: addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_EE_LITE(0x800, kernel_fp_unavailable_exception) /* Decrementer */ @@ -433,6 +456,7 @@ FPUnavailable: /* System call */ . = 0xc00 + DO_KVM 0xc00 SystemCall: EXCEPTION_PROLOG EXC_XFER_EE_LITE(0xc00, DoSyscall) @@ -450,16 +474,13 @@ SystemCall: * by executing an altivec instruction. */ . = 0xf00 - b Trap_0f + DO_KVM 0xf00 + b PerformanceMonitor . = 0xf20 + DO_KVM 0xf20 b AltiVecUnavailable -Trap_0f: - EXCEPTION_PROLOG - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE(0xf00, unknown_exception) - /* * Handle TLB miss for instruction on 603/603e. * Note: we get an alternate set of r0 - r3 to use automatically. @@ -467,49 +488,50 @@ Trap_0f: . = 0x1000 InstructionTLBMiss: /* - * r0: stored ctr + * r0: scratch * r1: linux style pte ( later becomes ppc hardware pte ) * r2: ptr to linux-style pte * r3: scratch */ - mfctr r0 /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_IMISS - lis r1,KERNELBASE@h /* check if kernel address */ - cmplw 0,r3,r1 - mfspr r2,SPRN_SPRG3 + lis r1,PAGE_OFFSET@h /* check if kernel address */ + cmplw 0,r1,r3 + mfspr r2,SPRN_SPRG_THREAD li r1,_PAGE_USER|_PAGE_PRESENT /* low addresses tested as user */ lwz r2,PGDIR(r2) - blt+ 112f + bge- 112f + mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ + rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ lis r2,swapper_pg_dir@ha /* if kernel address, use */ addi r2,r2,swapper_pg_dir@l /* kernel page table */ - mfspr r1,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ - rlwinm r1,r1,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ 112: tophys(r2,r2) rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. 
r2,r2,0,0,19 /* extract address of pte page */ beq- InstructionAddressInvalid /* return if no mapping */ rlwimi r2,r3,22,20,29 /* insert next 10 bits of address */ - lwz r3,0(r2) /* get linux-style pte */ - andc. r1,r1,r3 /* check access & ~permission */ + lwz r0,0(r2) /* get linux-style pte */ + andc. r1,r1,r0 /* check access & ~permission */ bne- InstructionAddressInvalid /* return if access not permitted */ - ori r3,r3,_PAGE_ACCESSED /* set _PAGE_ACCESSED in pte */ + ori r0,r0,_PAGE_ACCESSED /* set _PAGE_ACCESSED in pte */ /* * NOTE! We are assuming this is not an SMP system, otherwise * we would need to update the pte atomically with lwarx/stwcx. */ - stw r3,0(r2) /* update PTE (accessed bit) */ + stw r0,0(r2) /* update PTE (accessed bit) */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwinm r1,r3,32-10,31,31 /* _PAGE_RW -> PP lsb */ - rlwinm r2,r3,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ + rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ + rlwinm r2,r0,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ and r1,r1,r2 /* writable if _RW and _DIRTY */ - rlwimi r3,r3,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r3,r3,32-1,31,31 /* _PAGE_USER -> PP lsb */ - ori r1,r1,0xe14 /* clear out reserved bits and M */ - andc r1,r3,r1 /* PP = user? (rw&dirty? 2: 3): 0 */ + rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ + rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ + ori r1,r1,0xe04 /* clear out reserved bits */ + andc r1,r0,r1 /* PP = user? (rw&dirty? 2: 3): 0 */ +BEGIN_FTR_SECTION + rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ +END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) mtspr SPRN_RPA,r1 - mfspr r3,SPRN_IMISS tlbli r3 mfspr r3,SPRN_SRR1 /* Need to restore CR0 */ mtcrf 0x80,r3 @@ -520,7 +542,6 @@ InstructionAddressInvalid: addis r1,r1,0x2000 mtspr SPRN_DSISR,r1 /* (shouldn't be needed) */ - mtctr r0 /* Restore CTR */ andi. 
r2,r3,0xFFFF /* Clear upper bits of SRR1 */ or r2,r2,r1 mtspr SPRN_SRR1,r2 @@ -541,59 +562,71 @@ InstructionAddressInvalid: . = 0x1100 DataLoadTLBMiss: /* - * r0: stored ctr + * r0: scratch * r1: linux style pte ( later becomes ppc hardware pte ) * r2: ptr to linux-style pte * r3: scratch */ - mfctr r0 /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_DMISS - lis r1,KERNELBASE@h /* check if kernel address */ - cmplw 0,r3,r1 - mfspr r2,SPRN_SPRG3 + lis r1,PAGE_OFFSET@h /* check if kernel address */ + cmplw 0,r1,r3 + mfspr r2,SPRN_SPRG_THREAD li r1,_PAGE_USER|_PAGE_PRESENT /* low addresses tested as user */ lwz r2,PGDIR(r2) - blt+ 112f + bge- 112f + mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ + rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ lis r2,swapper_pg_dir@ha /* if kernel address, use */ addi r2,r2,swapper_pg_dir@l /* kernel page table */ - mfspr r1,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ - rlwinm r1,r1,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ 112: tophys(r2,r2) rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- DataAddressInvalid /* return if no mapping */ rlwimi r2,r3,22,20,29 /* insert next 10 bits of address */ - lwz r3,0(r2) /* get linux-style pte */ - andc. r1,r1,r3 /* check access & ~permission */ + lwz r0,0(r2) /* get linux-style pte */ + andc. r1,r1,r0 /* check access & ~permission */ bne- DataAddressInvalid /* return if access not permitted */ - ori r3,r3,_PAGE_ACCESSED /* set _PAGE_ACCESSED in pte */ + ori r0,r0,_PAGE_ACCESSED /* set _PAGE_ACCESSED in pte */ /* * NOTE! We are assuming this is not an SMP system, otherwise * we would need to update the pte atomically with lwarx/stwcx. 
*/ - stw r3,0(r2) /* update PTE (accessed bit) */ + stw r0,0(r2) /* update PTE (accessed bit) */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwinm r1,r3,32-10,31,31 /* _PAGE_RW -> PP lsb */ - rlwinm r2,r3,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ + rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ + rlwinm r2,r0,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ and r1,r1,r2 /* writable if _RW and _DIRTY */ - rlwimi r3,r3,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r3,r3,32-1,31,31 /* _PAGE_USER -> PP lsb */ - ori r1,r1,0xe14 /* clear out reserved bits and M */ - andc r1,r3,r1 /* PP = user? (rw&dirty? 2: 3): 0 */ + rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ + rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ + ori r1,r1,0xe04 /* clear out reserved bits */ + andc r1,r0,r1 /* PP = user? (rw&dirty? 2: 3): 0 */ +BEGIN_FTR_SECTION + rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ +END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) mtspr SPRN_RPA,r1 - mfspr r3,SPRN_DMISS + mfspr r2,SPRN_SRR1 /* Need to restore CR0 */ + mtcrf 0x80,r2 +BEGIN_MMU_FTR_SECTION + li r0,1 + mfspr r1,SPRN_SPRG_603_LRU + rlwinm r2,r3,20,27,31 /* Get Address bits 15:19 */ + slw r0,r0,r2 + xor r1,r0,r1 + srw r0,r1,r2 + mtspr SPRN_SPRG_603_LRU,r1 + mfspr r2,SPRN_SRR1 + rlwimi r2,r0,31-14,14,14 + mtspr SPRN_SRR1,r2 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) tlbld r3 - mfspr r3,SPRN_SRR1 /* Need to restore CR0 */ - mtcrf 0x80,r3 rfi DataAddressInvalid: mfspr r3,SPRN_SRR1 rlwinm r1,r3,9,6,6 /* Get load/store bit */ addis r1,r1,0x2000 mtspr SPRN_DSISR,r1 - mtctr r0 /* Restore CTR */ andi. r2,r3,0xFFFF /* Clear upper bits of SRR1 */ mtspr SPRN_SRR1,r2 mfspr r1,SPRN_DMISS /* Get failing address */ @@ -613,48 +646,61 @@ DataAddressInvalid: . 
= 0x1200 DataStoreTLBMiss: /* - * r0: stored ctr + * r0: scratch * r1: linux style pte ( later becomes ppc hardware pte ) * r2: ptr to linux-style pte * r3: scratch */ - mfctr r0 /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_DMISS - lis r1,KERNELBASE@h /* check if kernel address */ - cmplw 0,r3,r1 - mfspr r2,SPRN_SPRG3 + lis r1,PAGE_OFFSET@h /* check if kernel address */ + cmplw 0,r1,r3 + mfspr r2,SPRN_SPRG_THREAD li r1,_PAGE_RW|_PAGE_USER|_PAGE_PRESENT /* access flags */ lwz r2,PGDIR(r2) - blt+ 112f + bge- 112f + mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ + rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ lis r2,swapper_pg_dir@ha /* if kernel address, use */ addi r2,r2,swapper_pg_dir@l /* kernel page table */ - mfspr r1,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ - rlwinm r1,r1,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ 112: tophys(r2,r2) rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- DataAddressInvalid /* return if no mapping */ rlwimi r2,r3,22,20,29 /* insert next 10 bits of address */ - lwz r3,0(r2) /* get linux-style pte */ - andc. r1,r1,r3 /* check access & ~permission */ + lwz r0,0(r2) /* get linux-style pte */ + andc. r1,r1,r0 /* check access & ~permission */ bne- DataAddressInvalid /* return if access not permitted */ - ori r3,r3,_PAGE_ACCESSED|_PAGE_DIRTY + ori r0,r0,_PAGE_ACCESSED|_PAGE_DIRTY /* * NOTE! We are assuming this is not an SMP system, otherwise * we would need to update the pte atomically with lwarx/stwcx. */ - stw r3,0(r2) /* update PTE (accessed/dirty bits) */ + stw r0,0(r2) /* update PTE (accessed/dirty bits) */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r3,r3,32-1,30,30 /* _PAGE_USER -> PP msb */ - li r1,0xe15 /* clear out reserved bits and M */ - andc r1,r3,r1 /* PP = user? 
2: 0 */ + rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ + li r1,0xe05 /* clear out reserved bits & PP lsb */ + andc r1,r0,r1 /* PP = user? 2: 0 */ +BEGIN_FTR_SECTION + rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ +END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) mtspr SPRN_RPA,r1 - mfspr r3,SPRN_DMISS + mfspr r2,SPRN_SRR1 /* Need to restore CR0 */ + mtcrf 0x80,r2 +BEGIN_MMU_FTR_SECTION + li r0,1 + mfspr r1,SPRN_SPRG_603_LRU + rlwinm r2,r3,20,27,31 /* Get Address bits 15:19 */ + slw r0,r0,r2 + xor r1,r0,r1 + srw r0,r1,r2 + mtspr SPRN_SPRG_603_LRU,r1 + mfspr r2,SPRN_SRR1 + rlwimi r2,r0,31-14,14,14 + mtspr SPRN_SRR1,r2 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) tlbld r3 - mfspr r3,SPRN_SRR1 /* Need to restore CR0 */ - mtcrf 0x80,r3 rfi #ifndef CONFIG_ALTIVEC @@ -699,132 +745,28 @@ DataStoreTLBMiss: AltiVecUnavailable: EXCEPTION_PROLOG #ifdef CONFIG_ALTIVEC - bne load_up_altivec /* if from user, just load it up */ -#endif /* CONFIG_ALTIVEC */ - EXC_XFER_EE_LITE(0xf20, altivec_unavailable_exception) - -#ifdef CONFIG_ALTIVEC -/* Note that the AltiVec support is closely modeled after the FP - * support. Changes to one are likely to be applicable to the - * other! */ -load_up_altivec: -/* - * Disable AltiVec for the task which had AltiVec previously, - * and save its AltiVec registers in its thread_struct. - * Enables AltiVec for use in the kernel on return. - * On SMP we know the AltiVec units are free, since we give it up every - * switch. -- Kumar - */ - mfmsr r5 - oris r5,r5,MSR_VEC@h - MTMSRD(r5) /* enable use of AltiVec now */ - isync -/* - * For SMP, we don't do lazy AltiVec switching because it just gets too - * horrendously complex, especially when a task switches from one CPU - * to another. Instead we call giveup_altivec in switch_to. 
- */ -#ifndef CONFIG_SMP - tophys(r6,0) - addis r3,r6,last_task_used_altivec@ha - lwz r4,last_task_used_altivec@l(r3) - cmpwi 0,r4,0 beq 1f - add r4,r4,r6 - addi r4,r4,THREAD /* want THREAD of last_task_used_altivec */ - SAVE_32VRS(0,r10,r4) - mfvscr vr0 - li r10,THREAD_VSCR - stvx vr0,r10,r4 - lwz r5,PT_REGS(r4) - add r5,r5,r6 - lwz r4,_MSR-STACK_FRAME_OVERHEAD(r5) - lis r10,MSR_VEC@h - andc r4,r4,r10 /* disable altivec for previous task */ - stw r4,_MSR-STACK_FRAME_OVERHEAD(r5) -1: -#endif /* CONFIG_SMP */ - /* enable use of AltiVec after return */ - oris r9,r9,MSR_VEC@h - mfspr r5,SPRN_SPRG3 /* current task's THREAD (phys) */ - li r4,1 - li r10,THREAD_VSCR - stw r4,THREAD_USED_VR(r5) - lvx vr0,r10,r5 - mtvscr vr0 - REST_32VRS(0,r10,r5) -#ifndef CONFIG_SMP - subi r4,r5,THREAD - sub r4,r4,r6 - stw r4,last_task_used_altivec@l(r3) -#endif /* CONFIG_SMP */ - /* restore registers and return */ - /* we haven't used ctr or xer or lr */ + bl load_up_altivec /* if from user, just load it up */ b fast_exception_return +#endif /* CONFIG_ALTIVEC */ +1: addi r3,r1,STACK_FRAME_OVERHEAD + EXC_XFER_EE_LITE(0xf20, altivec_unavailable_exception) -/* - * AltiVec unavailable trap from kernel - print a message, but let - * the task use AltiVec in the kernel until it returns to user mode. - */ -KernelAltiVec: - lwz r3,_MSR(r1) - oris r3,r3,MSR_VEC@h - stw r3,_MSR(r1) /* enable use of AltiVec after return */ - lis r3,87f@h - ori r3,r3,87f@l - mr r4,r2 /* current */ - lwz r5,_NIP(r1) - bl printk - b ret_from_except -87: .string "AltiVec used in kernel (task=%p, pc=%x) \n" - .align 4,0 - -/* - * giveup_altivec(tsk) - * Disable AltiVec for the task given as the argument, - * and save the AltiVec registers in its thread_struct. - * Enables AltiVec for use in the kernel on return. 
- */ +PerformanceMonitor: + EXCEPTION_PROLOG + addi r3,r1,STACK_FRAME_OVERHEAD + EXC_XFER_STD(0xf00, performance_monitor_exception) - .globl giveup_altivec -giveup_altivec: - mfmsr r5 - oris r5,r5,MSR_VEC@h - SYNC - MTMSRD(r5) /* enable use of AltiVec now */ - isync - cmpwi 0,r3,0 - beqlr- /* if no previous owner, done */ - addi r3,r3,THREAD /* want THREAD of task */ - lwz r5,PT_REGS(r3) - cmpwi 0,r5,0 - SAVE_32VRS(0, r4, r3) - mfvscr vr0 - li r4,THREAD_VSCR - stvx vr0,r4,r3 - beq 1f - lwz r4,_MSR-STACK_FRAME_OVERHEAD(r5) - lis r3,MSR_VEC@h - andc r4,r4,r3 /* disable AltiVec for previous task */ - stw r4,_MSR-STACK_FRAME_OVERHEAD(r5) -1: -#ifndef CONFIG_SMP - li r5,0 - lis r4,last_task_used_altivec@ha - stw r5,last_task_used_altivec@l(r4) -#endif /* CONFIG_SMP */ - blr -#endif /* CONFIG_ALTIVEC */ /* * This code is jumped to from the startup code to copy - * the kernel image to physical address 0. + * the kernel image to physical address PHYSICAL_START. */ relocate_kernel: addis r9,r26,klimit@ha /* fetch klimit */ lwz r25,klimit@l(r9) addis r25,r25,-KERNELBASE@h - li r3,0 /* Destination base address */ + lis r3,PHYSICAL_START@h /* Destination base address */ li r6,0 /* Destination offset */ li r5,0x4000 /* # bytes of memory to copy */ bl copy_and_flush /* copy the first 0x4000 bytes */ @@ -841,7 +783,7 @@ relocate_kernel: * r3 = dest addr, r4 = source addr, r5 = copy limit, r6 = start offset * on exit, r3, r4, r5 are unchanged, r6 is updated to be >= r5. */ -_GLOBAL(copy_and_flush) +_ENTRY(copy_and_flush) addi r5,r5,-4 addi r6,r6,-4 4: li r0,L1_CACHE_BYTES/4 @@ -861,98 +803,13 @@ _GLOBAL(copy_and_flush) addi r6,r6,4 blr -#ifdef CONFIG_APUS -/* - * On APUS the physical base address of the kernel is not known at compile - * time, which means the __pa/__va constants used are incorrect. In the - * __init section is recorded the virtual addresses of instructions using - * these constants, so all that has to be done is fix these before - * continuing the kernel boot. 
- * - * r4 = The physical address of the kernel base. - */ -fix_mem_constants: - mr r10,r4 - addis r10,r10,-KERNELBASE@h /* virt_to_phys constant */ - neg r11,r10 /* phys_to_virt constant */ - - lis r12,__vtop_table_begin@h - ori r12,r12,__vtop_table_begin@l - add r12,r12,r10 /* table begin phys address */ - lis r13,__vtop_table_end@h - ori r13,r13,__vtop_table_end@l - add r13,r13,r10 /* table end phys address */ - subi r12,r12,4 - subi r13,r13,4 -1: lwzu r14,4(r12) /* virt address of instruction */ - add r14,r14,r10 /* phys address of instruction */ - lwz r15,0(r14) /* instruction, now insert top */ - rlwimi r15,r10,16,16,31 /* half of vp const in low half */ - stw r15,0(r14) /* of instruction and restore. */ - dcbst r0,r14 /* write it to memory */ - sync - icbi r0,r14 /* flush the icache line */ - cmpw r12,r13 - bne 1b - sync /* additional sync needed on g4 */ - isync - -/* - * Map the memory where the exception handlers will - * be copied to when hash constants have been patched. - */ -#ifdef CONFIG_APUS_FAST_EXCEPT - lis r8,0xfff0 -#else - lis r8,0 -#endif - ori r8,r8,0x2 /* 128KB, supervisor */ - mtspr SPRN_DBAT3U,r8 - mtspr SPRN_DBAT3L,r8 - - lis r12,__ptov_table_begin@h - ori r12,r12,__ptov_table_begin@l - add r12,r12,r10 /* table begin phys address */ - lis r13,__ptov_table_end@h - ori r13,r13,__ptov_table_end@l - add r13,r13,r10 /* table end phys address */ - subi r12,r12,4 - subi r13,r13,4 -1: lwzu r14,4(r12) /* virt address of instruction */ - add r14,r14,r10 /* phys address of instruction */ - lwz r15,0(r14) /* instruction, now insert top */ - rlwimi r15,r11,16,16,31 /* half of pv const in low half*/ - stw r15,0(r14) /* of instruction and restore. 
*/ - dcbst r0,r14 /* write it to memory */ - sync - icbi r0,r14 /* flush the icache line */ - cmpw r12,r13 - bne 1b - - sync /* additional sync needed on g4 */ - isync /* No speculative loading until now */ - blr - -/*********************************************************************** - * Please note that on APUS the exception handlers are located at the - * physical address 0xfff0000. For this reason, the exception handlers - * cannot use relative branches to access the code below. - ***********************************************************************/ -#endif /* CONFIG_APUS */ - #ifdef CONFIG_SMP -#ifdef CONFIG_GEMINI - .globl __secondary_start_gemini -__secondary_start_gemini: - mfspr r4,SPRN_HID0 - ori r4,r4,HID0_ICFI - li r3,0 - ori r3,r3,HID0_ICE - andc r4,r4,r3 - mtspr SPRN_HID0,r4 - sync - b __secondary_start -#endif /* CONFIG_GEMINI */ + .globl __secondary_start_mpc86xx +__secondary_start_mpc86xx: + mfspr r3, SPRN_PIR + stw r3, __secondary_hold_acknowledge@l(0) + mr r24, r3 /* cpu # */ + b __secondary_start .globl __secondary_start_pmac_0 __secondary_start_pmac_0: @@ -1006,9 +863,9 @@ __secondary_start: tophys(r4,r2) addi r4,r4,THREAD /* phys address of our thread_struct */ CLR_TOP32(r4) - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 li r3,0 - mtspr SPRN_SPRG2,r3 /* 0 => not in RTAS */ + mtspr SPRN_SPRG_RTAS,r3 /* 0 => not in RTAS */ /* enable MMU and jump to start_secondary */ li r4,MSR_KERNEL @@ -1021,14 +878,18 @@ __secondary_start: RFI #endif /* CONFIG_SMP */ +#ifdef CONFIG_KVM_BOOK3S_HANDLER +#include "../kvm/book3s_rmhandlers.S" +#endif + /* * Those generic dummy functions are kept for CPUs not * included in CONFIG_6xx */ #if !defined(CONFIG_6xx) -_GLOBAL(__save_cpu_setup) +_ENTRY(__save_cpu_setup) blr -_GLOBAL(__restore_cpu_setup) +_ENTRY(__restore_cpu_setup) blr #endif /* !defined(CONFIG_6xx) */ @@ -1069,7 +930,12 @@ load_up_mmu: LOAD_BAT(1,r3,r4,r5) LOAD_BAT(2,r3,r4,r5) LOAD_BAT(3,r3,r4,r5) - +BEGIN_MMU_FTR_SECTION + LOAD_BAT(4,r3,r4,r5) 
+ LOAD_BAT(5,r3,r4,r5) + LOAD_BAT(6,r3,r4,r5) + LOAD_BAT(7,r3,r4,r5) +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) blr /* @@ -1084,9 +950,9 @@ start_here: tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ CLR_TOP32(r4) - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 li r3,0 - mtspr SPRN_SPRG2,r3 /* 0 => not in RTAS */ + mtspr SPRN_SPRG_RTAS,r3 /* 0 => not in RTAS */ /* stack */ lis r1,init_thread_union@ha @@ -1097,24 +963,12 @@ start_here: * Do early platform-specific initialization, * and set up the MMU. */ - mr r3,r31 - mr r4,r30 + li r3,0 + mr r4,r31 bl machine_init + bl __save_cpu_setup bl MMU_init -#ifdef CONFIG_APUS - /* Copy exception code to exception vector base on APUS. */ - lis r4,KERNELBASE@h -#ifdef CONFIG_APUS_FAST_EXCEPT - lis r3,0xfff0 /* Copy to 0xfff00000 */ -#else - lis r3,0 /* Copy to 0x00000000 */ -#endif - li r5,0x4000 /* # bytes of memory to copy */ - li r6,0 - bl copy_and_flush /* copy the first 0x4000 bytes */ -#endif /* CONFIG_APUS */ - /* * Go back to running unmapped so we can load up new values * for SDR1 (hash table pointer) and the segment registers @@ -1157,9 +1011,14 @@ start_here: RFI /* + * void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next); + * * Set up the segment registers for a new context. */ -_GLOBAL(set_context) +_ENTRY(switch_mmu_context) + lwz r3,MMCONTEXTID(r4) + cmpwi cr0,r3,0 + blt- 4f mulli r3,r3,897 /* multiply context by skew factor */ rlwinm r3,r3,4,8,27 /* VSID = (context & 0xfffff) << 4 */ addis r3,r3,0x6000 /* Set Ks, Ku bits */ @@ -1170,6 +1029,7 @@ _GLOBAL(set_context) /* Context switch the PTE pointer for the Abatron BDI2000. * The PGDIR is passed as second argument. 
*/ + lwz r4,MM_PGD(r4) lis r5, KERNELBASE@h lwz r5, 0xf0(r5) stw r4, 0x4(r5) @@ -1185,6 +1045,9 @@ _GLOBAL(set_context) sync isync blr +4: trap + EMIT_BUG_ENTRY 4b,__FILE__,__LINE__,0 + blr /* * An undocumented "feature" of 604e requires that the v bit @@ -1218,7 +1081,7 @@ clear_bats: mtspr SPRN_IBAT2L,r10 mtspr SPRN_IBAT3U,r10 mtspr SPRN_IBAT3L,r10 -BEGIN_FTR_SECTION +BEGIN_MMU_FTR_SECTION /* Here's a tweak: at this point, CPU setup have * not been called yet, so HIGH_BAT_EN may not be * set in HID0 for the 745x processors. However, it @@ -1241,14 +1104,14 @@ BEGIN_FTR_SECTION mtspr SPRN_IBAT6L,r10 mtspr SPRN_IBAT7U,r10 mtspr SPRN_IBAT7L,r10 -END_FTR_SECTION_IFSET(CPU_FTR_HAS_HIGH_BATS) +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) blr flush_tlbs: lis r10, 0x40 1: addic. r10, r10, -0x1000 tlbie r10 - blt 1b + bgt 1b sync blr @@ -1264,24 +1127,27 @@ mmu_off: RFI /* - * Use the first pair of BAT registers to map the 1st 16MB - * of RAM to KERNELBASE. From this point on we can't safely - * call OF any more. + * On 601, we use 3 BATs to map up to 24M of RAM at _PAGE_OFFSET + * (we keep one for debugging) and on others, we use one 256M BAT. */ initial_bats: - lis r11,KERNELBASE@h + lis r11,PAGE_OFFSET@h mfspr r9,SPRN_PVR rlwinm r9,r9,16,16,31 /* r9 = 1 for 601, 4 for 604 */ cmpwi 0,r9,1 bne 4f ori r11,r11,4 /* set up BAT registers for 601 */ li r8,0x7f /* valid, block length = 8MB */ - oris r9,r11,0x800000@h /* set up BAT reg for 2nd 8M */ - oris r10,r8,0x800000@h /* set up BAT reg for 2nd 8M */ mtspr SPRN_IBAT0U,r11 /* N.B. 
601 has valid bit in */ mtspr SPRN_IBAT0L,r8 /* lower BAT register */ - mtspr SPRN_IBAT1U,r9 - mtspr SPRN_IBAT1L,r10 + addis r11,r11,0x800000@h + addis r8,r8,0x800000@h + mtspr SPRN_IBAT1U,r11 + mtspr SPRN_IBAT1L,r8 + addis r11,r11,0x800000@h + addis r8,r8,0x800000@h + mtspr SPRN_IBAT2U,r11 + mtspr SPRN_IBAT2L,r8 isync blr @@ -1291,11 +1157,7 @@ initial_bats: #else ori r8,r8,2 /* R/W access */ #endif /* CONFIG_SMP */ -#ifdef CONFIG_APUS - ori r11,r11,BL_8M<<2|0x2 /* set up 8MB BAT registers for 604 */ -#else ori r11,r11,BL_256M<<2|0x2 /* set up BAT registers for 604 */ -#endif /* CONFIG_APUS */ mtspr SPRN_DBAT0L,r8 /* N.B. 6xx (not 601) have valid */ mtspr SPRN_DBAT0U,r11 /* bit in upper BAT register */ @@ -1305,6 +1167,67 @@ initial_bats: blr +#ifdef CONFIG_BOOTX_TEXT +setup_disp_bat: + /* + * setup the display bat prepared for us in prom.c + */ + mflr r8 + bl reloc_offset + mtlr r8 + addis r8,r3,disp_BAT@ha + addi r8,r8,disp_BAT@l + cmpwi cr0,r8,0 + beqlr + lwz r11,0(r8) + lwz r8,4(r8) + mfspr r9,SPRN_PVR + rlwinm r9,r9,16,16,31 /* r9 = 1 for 601, 4 for 604 */ + cmpwi 0,r9,1 + beq 1f + mtspr SPRN_DBAT3L,r8 + mtspr SPRN_DBAT3U,r11 + blr +1: mtspr SPRN_IBAT3L,r8 + mtspr SPRN_IBAT3U,r11 + blr +#endif /* CONFIG_BOOTX_TEXT */ + +#ifdef CONFIG_PPC_EARLY_DEBUG_CPM +setup_cpm_bat: + lis r8, 0xf000 + ori r8, r8, 0x002a + mtspr SPRN_DBAT1L, r8 + + lis r11, 0xf000 + ori r11, r11, (BL_1M << 2) | 2 + mtspr SPRN_DBAT1U, r11 + + blr +#endif + +#ifdef CONFIG_PPC_EARLY_DEBUG_USBGECKO +setup_usbgecko_bat: + /* prepare a BAT for early io */ +#if defined(CONFIG_GAMECUBE) + lis r8, 0x0c00 +#elif defined(CONFIG_WII) + lis r8, 0x0d00 +#else +#error Invalid platform for USB Gecko based early debugging. +#endif + /* + * The virtual address used must match the virtual address + * associated to the fixmap entry FIX_EARLY_DEBUG_BASE. 
+ */ + lis r11, 0xfffe /* top 128K */ + ori r8, r8, 0x002a /* uncached, guarded ,rw */ + ori r11, r11, 0x2 /* 128K, Vs=1, Vp=0 */ + mtspr SPRN_DBAT1L, r8 + mtspr SPRN_DBAT1U, r11 + blr +#endif + #ifdef CONFIG_8260 /* Jump into the system reset for the rom. * We first disable the MMU, and then jump to the ROM reset address. @@ -1355,15 +1278,7 @@ empty_zero_page: .globl swapper_pg_dir swapper_pg_dir: - .space 4096 - -/* - * This space gets a copy of optional info passed to us by the bootstrap - * Used to pass parameters into the kernel like root=/dev/sda1, etc. - */ - .globl cmd_line -cmd_line: - .space 512 + .space PGD_TABLE_SIZE .globl intercept_table intercept_table: diff --git a/arch/powerpc/kernel/head_4xx.S b/arch/powerpc/kernel/head_40x.S index 2590e97f553..7d7d8635227 100644 --- a/arch/powerpc/kernel/head_4xx.S +++ b/arch/powerpc/kernel/head_40x.S @@ -31,16 +31,16 @@ * */ -#include <linux/config.h> +#include <linux/init.h> #include <asm/processor.h> #include <asm/page.h> #include <asm/mmu.h> #include <asm/pgtable.h> -#include <asm/ibm4xx.h> #include <asm/cputable.h> #include <asm/thread_info.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> +#include <asm/ptrace.h> /* As with the other PowerPC ports, it is expected that when code * execution begins here, the following registers contain valid, yet @@ -54,17 +54,11 @@ * * This is all going to change RSN when we add bi_recs....... -- Dan */ - .text -_GLOBAL(_stext) -_GLOBAL(_start) + __HEAD +_ENTRY(_stext); +_ENTRY(_start); - /* Save parameters we are passed. - */ - mr r31,r3 - mr r30,r4 - mr r29,r5 - mr r28,r6 - mr r27,r7 + mr r31,r3 /* save device tree ptr */ /* We have to turn on the MMU right away so we get cache modes * set correctly. @@ -91,28 +85,34 @@ turn_on_mmu: */ . 
= 0xc0 crit_save: -_GLOBAL(crit_r10) +_ENTRY(crit_r10) + .space 4 +_ENTRY(crit_r11) + .space 4 +_ENTRY(crit_srr0) .space 4 -_GLOBAL(crit_r11) +_ENTRY(crit_srr1) + .space 4 +_ENTRY(saved_ksp_limit) .space 4 /* * Exception vector entry code. This code runs with address translation - * turned off (i.e. using physical addresses). We assume SPRG3 has the - * physical address of the current task thread_struct. + * turned off (i.e. using physical addresses). We assume SPRG_THREAD has + * the physical address of the current task thread_struct. * Note that we have to have decremented r1 before we write to any fields * of the exception frame, since a critical interrupt could occur at any * time, and it will write to the area immediately below the current r1. */ #define NORMAL_EXCEPTION_PROLOG \ - mtspr SPRN_SPRG0,r10; /* save two registers to work with */\ - mtspr SPRN_SPRG1,r11; \ - mtspr SPRN_SPRG2,r1; \ + mtspr SPRN_SPRG_SCRATCH0,r10; /* save two registers to work with */\ + mtspr SPRN_SPRG_SCRATCH1,r11; \ + mtspr SPRN_SPRG_SCRATCH2,r1; \ mfcr r10; /* save CR in r10 for now */\ mfspr r11,SPRN_SRR1; /* check whether user or kernel */\ andi. 
r11,r11,MSR_PR; \ beq 1f; \ - mfspr r1,SPRN_SPRG3; /* if from user, start at top of */\ + mfspr r1,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r1,THREAD_INFO-THREAD(r1); /* this thread's kernel stack */\ addi r1,r1,THREAD_SIZE; \ 1: subi r1,r1,INT_FRAME_SIZE; /* Allocate an exception frame */\ @@ -120,13 +120,13 @@ _GLOBAL(crit_r11) stw r10,_CCR(r11); /* save various registers */\ stw r12,GPR12(r11); \ stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG0; \ + mfspr r10,SPRN_SPRG_SCRATCH0; \ stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG1; \ + mfspr r12,SPRN_SPRG_SCRATCH1; \ stw r12,GPR11(r11); \ mflr r10; \ stw r10,_LINK(r11); \ - mfspr r10,SPRN_SPRG2; \ + mfspr r10,SPRN_SPRG_SCRATCH2; \ mfspr r12,SPRN_SRR0; \ stw r10,GPR1(r11); \ mfspr r9,SPRN_SRR1; \ @@ -150,14 +150,14 @@ _GLOBAL(crit_r11) mfcr r10; /* save CR in r10 for now */\ mfspr r11,SPRN_SRR3; /* check whether user or kernel */\ andi. r11,r11,MSR_PR; \ - lis r11,critical_stack_top@h; \ - ori r11,r11,critical_stack_top@l; \ + lis r11,critirq_ctx@ha; \ + tophys(r11,r11); \ + lwz r11,critirq_ctx@l(r11); \ beq 1f; \ /* COMING FROM USER MODE */ \ - mfspr r11,SPRN_SPRG3; /* if from user, start at top of */\ + mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ - addi r11,r11,THREAD_SIZE; \ -1: subi r11,r11,INT_FRAME_SIZE; /* Allocate an exception frame */\ +1: addi r11,r11,THREAD_SIZE-INT_FRAME_SIZE; /* Alloc an excpt frm */\ tophys(r11,r11); \ stw r10,_CCR(r11); /* save various registers */\ stw r12,GPR12(r11); \ @@ -260,8 +260,8 @@ label: * and exit. Otherwise, we call heavywight functions to do the work. 
*/ START_EXCEPTION(0x0300, DataStorage) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 + mtspr SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_SCRATCH1, r11 #ifdef CONFIG_403GCX stw r12, 0(r0) stw r9, 4(r0) @@ -270,12 +270,12 @@ label: stw r11, 8(r0) stw r12, 12(r0) #else - mtspr SPRN_SPRG4, r12 - mtspr SPRN_SPRG5, r9 + mtspr SPRN_SPRG_SCRATCH3, r12 + mtspr SPRN_SPRG_SCRATCH4, r9 mfcr r11 mfspr r12, SPRN_PID - mtspr SPRN_SPRG7, r11 - mtspr SPRN_SPRG6, r12 + mtspr SPRN_SPRG_SCRATCH6, r11 + mtspr SPRN_SPRG_SCRATCH5, r12 #endif /* First, check if it was a zone fault (which means a user @@ -291,7 +291,7 @@ label: /* If we are faulting a kernel address, we have to use the * kernel page tables. */ - lis r11, TASK_SIZE@h + lis r11, PAGE_OFFSET@h cmplw r10, r11 blt+ 3f lis r11, swapper_pg_dir@h @@ -303,7 +303,7 @@ label: /* Get the PGD for the current thread. */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) 4: tophys(r11, r11) @@ -350,15 +350,15 @@ label: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 PPC405_ERR77_SYNC rfi /* Should sync shadow TLBs */ b . 
/* prevent prefetch past rfi */ @@ -375,15 +375,15 @@ label: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 b DataAccess /* @@ -394,7 +394,7 @@ label: NORMAL_EXCEPTION_PROLOG mr r4,r12 /* Pass SRR0 as arg2 */ li r5,0 /* Pass zero as arg3 */ - EXC_XFER_EE_LITE(0x400, handle_page_fault) + EXC_XFER_LITE(0x400, handle_page_fault) /* 0x0500 - External Interrupt Exception */ EXCEPTION(0x0500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE) @@ -430,30 +430,18 @@ label: EXCEPTION(0x0F00, Trap_0F, unknown_exception, EXC_XFER_EE) /* 0x1000 - Programmable Interval Timer (PIT) Exception */ - START_EXCEPTION(0x1000, Decrementer) - NORMAL_EXCEPTION_PROLOG - lis r0,TSR_PIS@h - mtspr SPRN_TSR,r0 /* Clear the PIT exception */ - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_LITE(0x1000, timer_interrupt) - -#if 0 -/* NOTE: - * FIT and WDT handlers are not implemented yet. - */ + . = 0x1000 + b Decrementer /* 0x1010 - Fixed Interval Timer (FIT) Exception */ - STND_EXCEPTION(0x1010, FITException, unknown_exception) + . = 0x1010 + b FITException /* 0x1020 - Watchdog Timer (WDT) Exception */ -#ifdef CONFIG_BOOKE_WDT - CRITICAL_EXCEPTION(0x1020, WDTException, WatchdogException) -#else - CRITICAL_EXCEPTION(0x1020, WDTException, unknown_exception) -#endif -#endif + . = 0x1020 + b WDTException /* 0x1100 - Data TLB Miss Exception * As the name implies, translation is not in the MMU, so search the @@ -461,8 +449,8 @@ label: * load TLB entries from the page table if they exist. 
*/ START_EXCEPTION(0x1100, DTLBMiss) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 + mtspr SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_SCRATCH1, r11 #ifdef CONFIG_403GCX stw r12, 0(r0) stw r9, 4(r0) @@ -471,19 +459,19 @@ label: stw r11, 8(r0) stw r12, 12(r0) #else - mtspr SPRN_SPRG4, r12 - mtspr SPRN_SPRG5, r9 + mtspr SPRN_SPRG_SCRATCH3, r12 + mtspr SPRN_SPRG_SCRATCH4, r9 mfcr r11 mfspr r12, SPRN_PID - mtspr SPRN_SPRG7, r11 - mtspr SPRN_SPRG6, r12 + mtspr SPRN_SPRG_SCRATCH6, r11 + mtspr SPRN_SPRG_SCRATCH5, r12 #endif mfspr r10, SPRN_DEAR /* Get faulting address */ /* If we are faulting a kernel address, we have to use the * kernel page tables. */ - lis r11, TASK_SIZE@h + lis r11, PAGE_OFFSET@h cmplw r10, r11 blt+ 3f lis r11, swapper_pg_dir@h @@ -495,7 +483,7 @@ label: /* Get the PGD for the current thread. */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) 4: tophys(r11, r11) @@ -545,15 +533,15 @@ label: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 b DataAccess /* 0x1200 - Instruction TLB Miss Exception @@ -561,8 +549,8 @@ label: * registers and bailout to a different point. 
*/ START_EXCEPTION(0x1200, ITLBMiss) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 + mtspr SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_SCRATCH1, r11 #ifdef CONFIG_403GCX stw r12, 0(r0) stw r9, 4(r0) @@ -571,19 +559,19 @@ label: stw r11, 8(r0) stw r12, 12(r0) #else - mtspr SPRN_SPRG4, r12 - mtspr SPRN_SPRG5, r9 + mtspr SPRN_SPRG_SCRATCH3, r12 + mtspr SPRN_SPRG_SCRATCH4, r9 mfcr r11 mfspr r12, SPRN_PID - mtspr SPRN_SPRG7, r11 - mtspr SPRN_SPRG6, r12 + mtspr SPRN_SPRG_SCRATCH6, r11 + mtspr SPRN_SPRG_SCRATCH5, r12 #endif mfspr r10, SPRN_SRR0 /* Get faulting address */ /* If we are faulting a kernel address, we have to use the * kernel page tables. */ - lis r11, TASK_SIZE@h + lis r11, PAGE_OFFSET@h cmplw r10, r11 blt+ 3f lis r11, swapper_pg_dir@h @@ -595,7 +583,7 @@ label: /* Get the PGD for the current thread. */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) 4: tophys(r11, r11) @@ -645,15 +633,15 @@ label: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 b InstructionAccess EXCEPTION(0x1300, Trap_13, unknown_exception, EXC_XFER_EE) @@ -738,6 +726,29 @@ label: (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ NOCOPY, crit_transfer_to_handler, ret_from_crit_exc) + /* Programmable Interval Timer (PIT) Exception. (from 0x1000) */ +Decrementer: + NORMAL_EXCEPTION_PROLOG + lis r0,TSR_PIS@h + mtspr SPRN_TSR,r0 /* Clear the PIT exception */ + addi r3,r1,STACK_FRAME_OVERHEAD + EXC_XFER_LITE(0x1000, timer_interrupt) + + /* Fixed Interval Timer (FIT) Exception. 
(from 0x1010) */ +FITException: + NORMAL_EXCEPTION_PROLOG + addi r3,r1,STACK_FRAME_OVERHEAD; + EXC_XFER_EE(0x1010, unknown_exception) + + /* Watchdog Timer (WDT) Exception. (from 0x1020) */ +WDTException: + CRITICAL_EXCEPTION_PROLOG; + addi r3,r1,STACK_FRAME_OVERHEAD; + EXC_XFER_TEMPLATE(WatchdogException, 0x1020+2, + (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), + NOCOPY, crit_transfer_to_handler, + ret_from_crit_exc) + /* * The other Data TLB exceptions bail out to this point * if they can't resolve the lightweight TLB fault. @@ -747,7 +758,7 @@ DataAccess: mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */ stw r5,_ESR(r11) mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ - EXC_XFER_EE_LITE(0x300, handle_page_fault) + EXC_XFER_LITE(0x300, handle_page_fault) /* Other PowerPC processors, namely those derived from the 6xx-series * have vectors from 0x2100 through 0x2F00 defined, but marked as reserved. @@ -760,7 +771,7 @@ DataAccess: * miss get to this point to load the TLB. * r10 - TLB_TAG value * r11 - Linux PTE - * r12, r9 - avilable to use + * r12, r9 - available to use * PID - loaded with proper value when we get here * Upon exit, we reload everything and RFI. * Actually, it will fit now, but oh well.....a common place @@ -773,7 +784,7 @@ finish_tlb_load: */ lwz r9, tlb_4xx_index@l(0) addi r9, r9, 1 - andi. r9, r9, (PPC4XX_TLB_SIZE-1) + andi. r9, r9, (PPC40X_TLB_SIZE-1) stw r9, tlb_4xx_index@l(0) 6: @@ -798,27 +809,19 @@ finish_tlb_load: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 PPC405_ERR77_SYNC rfi /* Should sync shadow TLBs */ b . 
/* prevent prefetch past rfi */ -/* extern void giveup_fpu(struct task_struct *prev) - * - * The PowerPC 4xx family of processors do not have an FPU, so this just - * returns. - */ -_GLOBAL(giveup_fpu) - blr - /* This is where the main kernel code starts. */ start_here: @@ -830,7 +833,7 @@ start_here: /* ptr to phys current thread */ tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 /* stack */ lis r1,init_thread_union@ha @@ -843,11 +846,8 @@ start_here: /* * Decide what sort of machine this is and initialize the MMU. */ - mr r3,r31 - mr r4,r30 - mr r5,r29 - mr r6,r28 - mr r7,r27 + li r3,0 + mr r4,r31 bl machine_init bl MMU_init @@ -918,11 +918,7 @@ initial_mmu: mtspr SPRN_PID,r0 sync - /* Configure and load two entries into TLB slots 62 and 63. - * In case we are pinning TLBs, these are reserved in by the - * other TLB functions. If not reserving, then it doesn't - * matter where they are loaded. - */ + /* Configure and load one entry into TLB slots 63 */ clrrwi r4,r4,10 /* Mask off the real page number */ ori r4,r4,(TLB_WR | TLB_EX) /* Set the write and execute bits */ @@ -934,25 +930,6 @@ initial_mmu: tlbwe r4,r0,TLB_DATA /* Load the data portion of the entry */ tlbwe r3,r0,TLB_TAG /* Load the tag portion of the entry */ -#if defined(CONFIG_SERIAL_TEXT_DEBUG) && defined(SERIAL_DEBUG_IO_BASE) - - /* Load a TLB entry for the UART, so that ppc4xx_progress() can use - * the UARTs nice and early. We use a 4k real==virtual mapping. 
*/ - - lis r3,SERIAL_DEBUG_IO_BASE@h - ori r3,r3,SERIAL_DEBUG_IO_BASE@l - mr r4,r3 - clrrwi r4,r4,12 - ori r4,r4,(TLB_WR|TLB_I|TLB_M|TLB_G) - - clrrwi r3,r3,12 - ori r3,r3,(TLB_VALID | TLB_PAGESZ(PAGESZ_4K)) - - li r0,0 /* TLB slot 0 */ - tlbwe r4,r0,TLB_DATA - tlbwe r3,r0,TLB_TAG -#endif /* CONFIG_SERIAL_DEBUG_TEXT && SERIAL_DEBUG_IO_BASE */ - isync /* Establish the exception vector base @@ -996,24 +973,7 @@ empty_zero_page: .space 4096 .globl swapper_pg_dir swapper_pg_dir: - .space 4096 - - -/* Stack for handling critical exceptions from kernel mode */ - .section .bss - .align 12 -exception_stack_bottom: - .space 4096 -critical_stack_top: - .globl exception_stack_top -exception_stack_top: - -/* This space gets a copy of optional info passed to us by the bootstrap - * which is used to pass parameters into the kernel like root=/dev/sda1, etc. - */ - .globl cmd_line -cmd_line: - .space 512 + .space PGD_TABLE_SIZE /* Room for two PTE pointers, usually the kernel and current user pointers * to their respective root page table. diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 8b49679fad5..c334f53453f 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -1,6 +1,4 @@ /* - * arch/ppc/kernel/head_44x.S - * * Kernel execution entry point code. * * Copyright (c) 1995-1996 Gary Thomas <gdt@linuxppc.org> @@ -30,17 +28,17 @@ * option) any later version. 
*/ -#include <linux/config.h> +#include <linux/init.h> #include <asm/processor.h> #include <asm/page.h> #include <asm/mmu.h> #include <asm/pgtable.h> -#include <asm/ibm4xx.h> -#include <asm/ibm44x.h> #include <asm/cputable.h> #include <asm/thread_info.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> +#include <asm/ptrace.h> +#include <asm/synch.h> #include "head_booke.h" @@ -55,190 +53,47 @@ * r7 - End of kernel command line string * */ - .text -_GLOBAL(_stext) -_GLOBAL(_start) + __HEAD +_ENTRY(_stext); +_ENTRY(_start); /* * Reserve a word at a fixed location to store the address * of abatron_pteptrs */ nop -/* - * Save parameters we are passed - */ - mr r31,r3 - mr r30,r4 - mr r29,r5 - mr r28,r6 - mr r27,r7 + mr r31,r3 /* save device tree ptr */ li r24,0 /* CPU number */ +#ifdef CONFIG_RELOCATABLE /* - * Set up the initial MMU state - * - * We are still executing code at the virtual address - * mappings set by the firmware for the base of RAM. - * - * We first invalidate all TLB entries but the one - * we are running from. We then load the KERNELBASE - * mappings so we can begin to use kernel addresses - * natively and so the interrupt vector locations are - * permanently pinned (necessary since Book E - * implementations always have translation enabled). - * - * TODO: Use the known TLB entry we are running from to - * determine which physical region we are located - * in. This can be used to determine where in RAM - * (on a shared CPU system) or PCI memory space - * (on a DRAMless system) we are located. - * For now, we assume a perfect world which means - * we are located at the base of DRAM (physical 0). - */ - -/* - * Search TLB for entry that we are currently using. - * Invalidate all entries but the one we are using. - */ - /* Load our current PID->MMUCR TID and MSR IS->MMUCR STS */ - mfspr r3,SPRN_PID /* Get PID */ - mfmsr r4 /* Get MSR */ - andi. r4,r4,MSR_IS@l /* TS=1? 
*/ - beq wmmucr /* If not, leave STS=0 */ - oris r3,r3,PPC44x_MMUCR_STS@h /* Set STS=1 */ -wmmucr: mtspr SPRN_MMUCR,r3 /* Put MMUCR */ - sync - - bl invstr /* Find our address */ -invstr: mflr r5 /* Make it accessible */ - tlbsx r23,0,r5 /* Find entry we are in */ - li r4,0 /* Start at TLB entry 0 */ - li r3,0 /* Set PAGEID inval value */ -1: cmpw r23,r4 /* Is this our entry? */ - beq skpinv /* If so, skip the inval */ - tlbwe r3,r4,PPC44x_TLB_PAGEID /* If not, inval the entry */ -skpinv: addi r4,r4,1 /* Increment */ - cmpwi r4,64 /* Are we done? */ - bne 1b /* If not, repeat */ - isync /* If so, context change */ - -/* - * Configure and load pinned entry into TLB slot 63. + * Relocate ourselves to the current runtime address. + * This is called only by the Boot CPU. + * "relocate" is called with our current runtime virutal + * address. + * r21 will be loaded with the physical runtime address of _stext */ + bl 0f /* Get our runtime address */ +0: mflr r21 /* Make it accessible */ + addis r21,r21,(_stext - 0b)@ha + addi r21,r21,(_stext - 0b)@l /* Get our current runtime base */ - lis r3,KERNELBASE@h /* Load the kernel virtual address */ - ori r3,r3,KERNELBASE@l - - /* Kernel is at the base of RAM */ - li r4, 0 /* Load the kernel physical address */ - - /* Load the kernel PID = 0 */ - li r0,0 - mtspr SPRN_PID,r0 - sync - - /* Initialize MMUCR */ - li r5,0 - mtspr SPRN_MMUCR,r5 - sync - - /* pageid fields */ - clrrwi r3,r3,10 /* Mask off the effective page number */ - ori r3,r3,PPC44x_TLB_VALID | PPC44x_TLB_256M - - /* xlat fields */ - clrrwi r4,r4,10 /* Mask off the real page number */ - /* ERPN is 0 for first 4GB page */ - - /* attrib fields */ - /* Added guarded bit to protect against speculative loads/stores */ - li r5,0 - ori r5,r5,(PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G) - - li r0,63 /* TLB slot 63 */ - - tlbwe r3,r0,PPC44x_TLB_PAGEID /* Load the pageid fields */ - tlbwe r4,r0,PPC44x_TLB_XLAT /* Load the translation fields */ - tlbwe 
r5,r0,PPC44x_TLB_ATTRIB /* Load the attrib/access fields */ - - /* Force context change */ - mfmsr r0 - mtspr SPRN_SRR1, r0 - lis r0,3f@h - ori r0,r0,3f@l - mtspr SPRN_SRR0,r0 - sync - rfi - - /* If necessary, invalidate original entry we used */ -3: cmpwi r23,63 - beq 4f - li r6,0 - tlbwe r6,r23,PPC44x_TLB_PAGEID - isync - -4: -#ifdef CONFIG_SERIAL_TEXT_DEBUG /* - * Add temporary UART mapping for early debug. - * We can map UART registers wherever we want as long as they don't - * interfere with other system mappings (e.g. with pinned entries). - * For an example of how we handle this - see ocotea.h. --ebs + * We have the runtime (virutal) address of our base. + * We calculate our shift of offset from a 256M page. + * We could map the 256M page we belong to at PAGE_OFFSET and + * get going from there. */ - /* pageid fields */ - lis r3,UART0_IO_BASE@h - ori r3,r3,PPC44x_TLB_VALID | PPC44x_TLB_4K - - /* xlat fields */ - lis r4,UART0_PHYS_IO_BASE@h /* RPN depends on SoC */ -#ifndef CONFIG_440EP - ori r4,r4,0x0001 /* ERPN is 1 for second 4GB page */ + lis r4,KERNELBASE@h + ori r4,r4,KERNELBASE@l + rlwinm r6,r21,0,4,31 /* r6 = PHYS_START % 256M */ + rlwinm r5,r4,0,4,31 /* r5 = KERNELBASE % 256M */ + subf r3,r5,r6 /* r3 = r6 - r5 */ + add r3,r4,r3 /* Required Virutal Address */ + + bl relocate #endif - /* attrib fields */ - li r5,0 - ori r5,r5,(PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_I | PPC44x_TLB_G) - - li r0,0 /* TLB slot 0 */ - - tlbwe r3,r0,PPC44x_TLB_PAGEID /* Load the pageid fields */ - tlbwe r4,r0,PPC44x_TLB_XLAT /* Load the translation fields */ - tlbwe r5,r0,PPC44x_TLB_ATTRIB /* Load the attrib/access fields */ - - /* Force context change */ - isync -#endif /* CONFIG_SERIAL_TEXT_DEBUG */ - - /* Establish the interrupt vector offsets */ - SET_IVOR(0, CriticalInput); - SET_IVOR(1, MachineCheck); - SET_IVOR(2, DataStorage); - SET_IVOR(3, InstructionStorage); - SET_IVOR(4, ExternalInput); - SET_IVOR(5, Alignment); - SET_IVOR(6, Program); - SET_IVOR(7, 
FloatingPointUnavailable); - SET_IVOR(8, SystemCall); - SET_IVOR(9, AuxillaryProcessorUnavailable); - SET_IVOR(10, Decrementer); - SET_IVOR(11, FixedIntervalTimer); - SET_IVOR(12, WatchdogTimer); - SET_IVOR(13, DataTLBError); - SET_IVOR(14, InstructionTLBError); - SET_IVOR(15, Debug); - - /* Establish the interrupt vector base */ - lis r4,interrupt_base@h /* IVPR only uses the high 16-bits */ - mtspr SPRN_IVPR,r4 - -#ifdef CONFIG_440EP - /* Clear DAPUIB flag in CCR0 (enable APU between CPU and FPU) */ - mfspr r2,SPRN_CCR0 - lis r3,0xffef - ori r3,r3,0xffff - and r2,r2,r3 - mtspr SPRN_CCR0,r2 - isync -#endif + bl init_cpu_state /* * This is where the main kernel code starts. @@ -250,7 +105,7 @@ skpinv: addi r4,r4,1 /* Increment */ /* ptr to current thread */ addi r4,r2,THREAD /* init task's THREAD */ - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 /* stack */ lis r1,init_thread_union@h @@ -260,14 +115,94 @@ skpinv: addi r4,r4,1 /* Increment */ bl early_init +#ifdef CONFIG_RELOCATABLE + /* + * Relocatable kernel support based on processing of dynamic + * relocation entries. + * + * r25 will contain RPN/ERPN for the start address of memory + * r21 will contain the current offset of _stext + */ + lis r3,kernstart_addr@ha + la r3,kernstart_addr@l(r3) + + /* + * Compute the kernstart_addr. + * kernstart_addr => (r6,r8) + * kernstart_addr & ~0xfffffff => (r6,r7) + */ + rlwinm r6,r25,0,28,31 /* ERPN. 
Bits 32-35 of Address */ + rlwinm r7,r25,0,0,3 /* RPN - assuming 256 MB page size */ + rlwinm r8,r21,0,4,31 /* r8 = (_stext & 0xfffffff) */ + or r8,r7,r8 /* Compute the lower 32bit of kernstart_addr */ + + /* Store kernstart_addr */ + stw r6,0(r3) /* higher 32bit */ + stw r8,4(r3) /* lower 32bit */ + + /* + * Compute the virt_phys_offset : + * virt_phys_offset = stext.run - kernstart_addr + * + * stext.run = (KERNELBASE & ~0xfffffff) + (kernstart_addr & 0xfffffff) + * When we relocate, we have : + * + * (kernstart_addr & 0xfffffff) = (stext.run & 0xfffffff) + * + * hence: + * virt_phys_offset = (KERNELBASE & ~0xfffffff) - (kernstart_addr & ~0xfffffff) + * + */ + + /* KERNELBASE&~0xfffffff => (r4,r5) */ + li r4, 0 /* higer 32bit */ + lis r5,KERNELBASE@h + rlwinm r5,r5,0,0,3 /* Align to 256M, lower 32bit */ + + /* + * 64bit subtraction. + */ + subfc r5,r7,r5 + subfe r4,r6,r4 + + /* Store virt_phys_offset */ + lis r3,virt_phys_offset@ha + la r3,virt_phys_offset@l(r3) + + stw r4,0(r3) + stw r5,4(r3) + +#elif defined(CONFIG_DYNAMIC_MEMSTART) + /* + * Mapping based, page aligned dynamic kernel loading. + * + * r25 will contain RPN/ERPN for the start address of memory + * + * Add the difference between KERNELBASE and PAGE_OFFSET to the + * start of physical memory to get kernstart_addr. + */ + lis r3,kernstart_addr@ha + la r3,kernstart_addr@l(r3) + + lis r4,KERNELBASE@h + ori r4,r4,KERNELBASE@l + lis r5,PAGE_OFFSET@h + ori r5,r5,PAGE_OFFSET@l + subf r4,r5,r4 + + rlwinm r6,r25,0,28,31 /* ERPN */ + rlwinm r7,r25,0,0,3 /* RPN - assuming 256 MB page size */ + add r7,r7,r4 + + stw r6,0(r3) + stw r7,4(r3) +#endif + /* * Decide what sort of machine this is and initialize the MMU. 
*/ - mr r3,r31 - mr r4,r30 - mr r5,r29 - mr r6,r28 - mr r7,r27 + li r3,0 + mr r4,r31 bl machine_init bl MMU_init @@ -281,6 +216,10 @@ skpinv: addi r4,r4,1 /* Increment */ stw r5, 0(r4) /* Save abatron_pteptrs at a fixed location */ stw r6, 0(r5) + /* Clear the Machine Check Syndrome Register */ + li r0,0 + mtspr SPRN_MCSR,r0 + /* Let's move on */ lis r4,start_kernel@h ori r4,r4,start_kernel@l @@ -309,128 +248,22 @@ skpinv: addi r4,r4,1 /* Increment */ interrupt_base: /* Critical Input Interrupt */ - CRITICAL_EXCEPTION(0x0100, CriticalInput, unknown_exception) + CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception) /* Machine Check Interrupt */ -#ifdef CONFIG_440A - MCHECK_EXCEPTION(0x0200, MachineCheck, machine_check_exception) -#else - CRITICAL_EXCEPTION(0x0200, MachineCheck, machine_check_exception) -#endif + CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \ + machine_check_exception) + MCHECK_EXCEPTION(0x0210, MachineCheckA, machine_check_exception) /* Data Storage Interrupt */ - START_EXCEPTION(DataStorage) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 - mtspr SPRN_SPRG4W, r12 - mtspr SPRN_SPRG5W, r13 - mfcr r11 - mtspr SPRN_SPRG7W, r11 - - /* - * Check if it was a store fault, if not then bail - * because a user tried to access a kernel or - * read-protected page. Otherwise, get the - * offending address and handle it. - */ - mfspr r10, SPRN_ESR - andis. r10, r10, ESR_ST@h - beq 2f - - mfspr r10, SPRN_DEAR /* Get faulting address */ - - /* If we are faulting a kernel address, we have to use the - * kernel page tables. 
- */ - lis r11, TASK_SIZE@h - cmplw r10, r11 - blt+ 3f - lis r11, swapper_pg_dir@h - ori r11, r11, swapper_pg_dir@l - - mfspr r12,SPRN_MMUCR - rlwinm r12,r12,0,0,23 /* Clear TID */ - - b 4f - - /* Get the PGD for the current thread */ -3: - mfspr r11,SPRN_SPRG3 - lwz r11,PGDIR(r11) + DATA_STORAGE_EXCEPTION - /* Load PID into MMUCR TID */ - mfspr r12,SPRN_MMUCR /* Get MMUCR */ - mfspr r13,SPRN_PID /* Get PID */ - rlwimi r12,r13,0,24,31 /* Set TID */ - -4: - mtspr SPRN_MMUCR,r12 - - rlwinm r12, r10, 13, 19, 29 /* Compute pgdir/pmd offset */ - lwzx r11, r12, r11 /* Get pgd/pmd entry */ - rlwinm. r12, r11, 0, 0, 20 /* Extract pt base address */ - beq 2f /* Bail if no table */ - - rlwimi r12, r10, 23, 20, 28 /* Compute pte address */ - lwz r11, 4(r12) /* Get pte entry */ - - andi. r13, r11, _PAGE_RW /* Is it writeable? */ - beq 2f /* Bail if not */ - - /* Update 'changed'. - */ - ori r11, r11, _PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_HWWRITE - stw r11, 4(r12) /* Update Linux page table */ - - li r13, PPC44x_TLB_SR@l /* Set SR */ - rlwimi r13, r11, 29, 29, 29 /* SX = _PAGE_HWEXEC */ - rlwimi r13, r11, 0, 30, 30 /* SW = _PAGE_RW */ - rlwimi r13, r11, 29, 28, 28 /* UR = _PAGE_USER */ - rlwimi r12, r11, 31, 26, 26 /* (_PAGE_USER>>1)->r12 */ - rlwimi r12, r11, 29, 30, 30 /* (_PAGE_USER>>3)->r12 */ - and r12, r12, r11 /* HWEXEC/RW & USER */ - rlwimi r13, r12, 0, 26, 26 /* UX = HWEXEC & USER */ - rlwimi r13, r12, 3, 27, 27 /* UW = RW & USER */ - - rlwimi r11,r13,0,26,31 /* Insert static perms */ - - rlwinm r11,r11,0,20,15 /* Clear U0-U3 */ - - /* find the TLB index that caused the fault. It has to be here. */ - tlbsx r10, 0, r10 - - tlbwe r11, r10, PPC44x_TLB_ATTRIB /* Write ATTRIB */ - - /* Done...restore registers and get out of here. - */ - mfspr r11, SPRN_SPRG7R - mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 - rfi /* Force context change */ - -2: - /* - * The bailout. 
Restore registers to pre-exception conditions - * and call the heavyweights to help us out. - */ - mfspr r11, SPRN_SPRG7R - mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 - b data_access - - /* Instruction Storage Interrupt */ + /* Instruction Storage Interrupt */ INSTRUCTION_STORAGE_EXCEPTION /* External Input Interrupt */ - EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE) + EXCEPTION(0x0500, BOOKE_INTERRUPT_EXTERNAL, ExternalInput, \ + do_IRQ, EXC_XFER_LITE) /* Alignment Interrupt */ ALIGNMENT_EXCEPTION @@ -442,46 +275,48 @@ interrupt_base: #ifdef CONFIG_PPC_FPU FP_UNAVAILABLE_EXCEPTION #else - EXCEPTION(0x2010, FloatingPointUnavailable, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x2010, BOOKE_INTERRUPT_FP_UNAVAIL, \ + FloatingPointUnavailable, unknown_exception, EXC_XFER_EE) #endif - /* System Call Interrupt */ START_EXCEPTION(SystemCall) - NORMAL_EXCEPTION_PROLOG + NORMAL_EXCEPTION_PROLOG(BOOKE_INTERRUPT_SYSCALL) EXC_XFER_EE_LITE(0x0c00, DoSyscall) - /* Auxillary Processor Unavailable Interrupt */ - EXCEPTION(0x2020, AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE) + /* Auxiliary Processor Unavailable Interrupt */ + EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \ + AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE) /* Decrementer Interrupt */ DECREMENTER_EXCEPTION /* Fixed Internal Timer Interrupt */ /* TODO: Add FIT support */ - EXCEPTION(0x1010, FixedIntervalTimer, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1010, BOOKE_INTERRUPT_FIT, FixedIntervalTimer, \ + unknown_exception, EXC_XFER_EE) /* Watchdog Timer Interrupt */ /* TODO: Add watchdog support */ #ifdef CONFIG_BOOKE_WDT - CRITICAL_EXCEPTION(0x1020, WatchdogTimer, WatchdogException) + CRITICAL_EXCEPTION(0x1020, WATCHDOG, WatchdogTimer, WatchdogException) #else - CRITICAL_EXCEPTION(0x1020, WatchdogTimer, unknown_exception) + CRITICAL_EXCEPTION(0x1020, WATCHDOG, WatchdogTimer, unknown_exception) #endif /* Data 
TLB Error Interrupt */ - START_EXCEPTION(DataTLBError) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 - mtspr SPRN_SPRG4W, r12 - mtspr SPRN_SPRG5W, r13 + START_EXCEPTION(DataTLBError44x) + mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_WSCRATCH1, r11 + mtspr SPRN_SPRG_WSCRATCH2, r12 + mtspr SPRN_SPRG_WSCRATCH3, r13 mfcr r11 - mtspr SPRN_SPRG7W, r11 + mtspr SPRN_SPRG_WSCRATCH4, r11 mfspr r10, SPRN_DEAR /* Get faulting address */ /* If we are faulting a kernel address, we have to use the * kernel page tables. */ - lis r11, TASK_SIZE@h + lis r11, PAGE_OFFSET@h cmplw r10, r11 blt+ 3f lis r11, swapper_pg_dir@h @@ -494,7 +329,7 @@ interrupt_base: /* Get the PGD for the current thread */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) /* Load PID into MMUCR TID */ @@ -505,33 +340,74 @@ interrupt_base: 4: mtspr SPRN_MMUCR,r12 - rlwinm r12, r10, 13, 19, 29 /* Compute pgdir/pmd offset */ + /* Mask of required permission bits. Note that while we + * do copy ESR:ST to _PAGE_RW position as trying to write + * to an RO page is pretty common, we don't do it with + * _PAGE_DIRTY. We could do it, but it's a fairly rare + * event so I'd rather take the overhead when it happens + * rather than adding an instruction here. We should measure + * whether the whole thing is worth it in the first place + * as we could avoid loading SPRN_ESR completely in the first + * place... + * + * TODO: Is it worth doing that mfspr & rlwimi in the first + * place or can we save a couple of instructions here ? + */ + mfspr r12,SPRN_ESR + li r13,_PAGE_PRESENT|_PAGE_ACCESSED + rlwimi r13,r12,10,30,30 + + /* Load the PTE */ + /* Compute pgdir/pmd offset */ + rlwinm r12, r10, PPC44x_PGD_OFF_SHIFT, PPC44x_PGD_OFF_MASK_BIT, 29 lwzx r11, r12, r11 /* Get pgd/pmd entry */ rlwinm. 
r12, r11, 0, 0, 20 /* Extract pt base address */ beq 2f /* Bail if no table */ - rlwimi r12, r10, 23, 20, 28 /* Compute pte address */ - lwz r11, 4(r12) /* Get pte entry */ - andi. r13, r11, _PAGE_PRESENT /* Is the page present? */ - beq 2f /* Bail if not present */ + /* Compute pte address */ + rlwimi r12, r10, PPC44x_PTE_ADD_SHIFT, PPC44x_PTE_ADD_MASK_BIT, 28 + lwz r11, 0(r12) /* Get high word of pte entry */ + lwz r12, 4(r12) /* Get low word of pte entry */ + + lis r10,tlb_44x_index@ha + + andc. r13,r13,r12 /* Check permission */ + + /* Load the next available TLB index */ + lwz r13,tlb_44x_index@l(r10) + + bne 2f /* Bail if permission mismach */ + + /* Increment, rollover, and store TLB index */ + addi r13,r13,1 + + /* Compare with watermark (instruction gets patched) */ + .globl tlb_44x_patch_hwater_D +tlb_44x_patch_hwater_D: + cmpwi 0,r13,1 /* reserve entries */ + ble 5f + li r13,0 +5: + /* Store the next available TLB index */ + stw r13,tlb_44x_index@l(r10) - ori r11, r11, _PAGE_ACCESSED - stw r11, 4(r12) + /* Re-load the faulting address */ + mfspr r10,SPRN_DEAR /* Jump to common tlb load */ - b finish_tlb_load + b finish_tlb_load_44x 2: /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 - b data_access + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 + b DataStorage /* Instruction TLB Error Interrupt */ /* @@ -539,19 +415,19 @@ interrupt_base: * information from different registers and bailout * to a different point. 
*/ - START_EXCEPTION(InstructionTLBError) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 - mtspr SPRN_SPRG4W, r12 - mtspr SPRN_SPRG5W, r13 + START_EXCEPTION(InstructionTLBError44x) + mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_WSCRATCH1, r11 + mtspr SPRN_SPRG_WSCRATCH2, r12 + mtspr SPRN_SPRG_WSCRATCH3, r13 mfcr r11 - mtspr SPRN_SPRG7W, r11 + mtspr SPRN_SPRG_WSCRATCH4, r11 mfspr r10, SPRN_SRR0 /* Get faulting address */ /* If we are faulting a kernel address, we have to use the * kernel page tables. */ - lis r11, TASK_SIZE@h + lis r11, PAGE_OFFSET@h cmplw r10, r11 blt+ 3f lis r11, swapper_pg_dir@h @@ -564,7 +440,7 @@ interrupt_base: /* Get the PGD for the current thread */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) /* Load PID into MMUCR TID */ @@ -575,154 +451,338 @@ interrupt_base: 4: mtspr SPRN_MMUCR,r12 - rlwinm r12, r10, 13, 19, 29 /* Compute pgdir/pmd offset */ + /* Make up the required permissions */ + li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC + + /* Compute pgdir/pmd offset */ + rlwinm r12, r10, PPC44x_PGD_OFF_SHIFT, PPC44x_PGD_OFF_MASK_BIT, 29 lwzx r11, r12, r11 /* Get pgd/pmd entry */ rlwinm. r12, r11, 0, 0, 20 /* Extract pt base address */ beq 2f /* Bail if no table */ - rlwimi r12, r10, 23, 20, 28 /* Compute pte address */ - lwz r11, 4(r12) /* Get pte entry */ - andi. r13, r11, _PAGE_PRESENT /* Is the page present? */ - beq 2f /* Bail if not present */ + /* Compute pte address */ + rlwimi r12, r10, PPC44x_PTE_ADD_SHIFT, PPC44x_PTE_ADD_MASK_BIT, 28 + lwz r11, 0(r12) /* Get high word of pte entry */ + lwz r12, 4(r12) /* Get low word of pte entry */ + + lis r10,tlb_44x_index@ha + + andc. 
r13,r13,r12 /* Check permission */ - ori r11, r11, _PAGE_ACCESSED - stw r11, 4(r12) + /* Load the next available TLB index */ + lwz r13,tlb_44x_index@l(r10) + + bne 2f /* Bail if permission mismach */ + + /* Increment, rollover, and store TLB index */ + addi r13,r13,1 + + /* Compare with watermark (instruction gets patched) */ + .globl tlb_44x_patch_hwater_I +tlb_44x_patch_hwater_I: + cmpwi 0,r13,1 /* reserve entries */ + ble 5f + li r13,0 +5: + /* Store the next available TLB index */ + stw r13,tlb_44x_index@l(r10) + + /* Re-load the faulting address */ + mfspr r10,SPRN_SRR0 /* Jump to common TLB load point */ - b finish_tlb_load + b finish_tlb_load_44x 2: /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 b InstructionStorage - /* Debug Interrupt */ - DEBUG_EXCEPTION - /* - * Local functions - */ - /* - * Data TLB exceptions will bail out to this point - * if they can't resolve the lightweight TLB fault. - */ -data_access: - NORMAL_EXCEPTION_PROLOG - mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */ - stw r5,_ESR(r11) - mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ - EXC_XFER_EE_LITE(0x0300, handle_page_fault) - -/* - * Both the instruction and data TLB miss get to this * point to load the TLB. * r10 - EA of fault - * r11 - available to use - * r12 - Pointer to the 64-bit PTE - * r13 - available to use + * r11 - PTE high word value + * r12 - PTE low word value + * r13 - TLB index * MMUCR - loaded with proper value when we get here * Upon exit, we reload everything and RFI. 
*/ -finish_tlb_load: - /* - * We set execute, because we don't have the granularity to - * properly set this at the page level (Linux problem). - * If shared is set, we cause a zero PID->TID load. - * Many of these bits are software only. Bits we don't set - * here we (properly should) assume have the appropriate value. - */ - - /* Load the next available TLB index */ - lis r13, tlb_44x_index@ha - lwz r13, tlb_44x_index@l(r13) - /* Load the TLB high watermark */ - lis r11, tlb_44x_hwater@ha - lwz r11, tlb_44x_hwater@l(r11) - - /* Increment, rollover, and store TLB index */ - addi r13, r13, 1 - cmpw 0, r13, r11 /* reserve entries */ - ble 7f - li r13, 0 -7: - /* Store the next available TLB index */ - lis r11, tlb_44x_index@ha - stw r13, tlb_44x_index@l(r11) - - lwz r11, 0(r12) /* Get MS word of PTE */ - lwz r12, 4(r12) /* Get LS word of PTE */ - rlwimi r11, r12, 0, 0 , 19 /* Insert RPN */ - tlbwe r11, r13, PPC44x_TLB_XLAT /* Write XLAT */ +finish_tlb_load_44x: + /* Combine RPN & ERPN an write WS 0 */ + rlwimi r11,r12,0,0,31-PAGE_SHIFT + tlbwe r11,r13,PPC44x_TLB_XLAT /* - * Create PAGEID. This is the faulting address, + * Create WS1. This is the faulting address (EPN), * page size, and valid flag. 
*/ - li r11, PPC44x_TLB_VALID | PPC44x_TLB_4K - rlwimi r10, r11, 0, 20, 31 /* Insert valid and page size */ - tlbwe r10, r13, PPC44x_TLB_PAGEID /* Write PAGEID */ - - li r10, PPC44x_TLB_SR@l /* Set SR */ - rlwimi r10, r12, 0, 30, 30 /* Set SW = _PAGE_RW */ - rlwimi r10, r12, 29, 29, 29 /* SX = _PAGE_HWEXEC */ - rlwimi r10, r12, 29, 28, 28 /* UR = _PAGE_USER */ - rlwimi r11, r12, 31, 26, 26 /* (_PAGE_USER>>1)->r12 */ - and r11, r12, r11 /* HWEXEC & USER */ - rlwimi r10, r11, 0, 26, 26 /* UX = HWEXEC & USER */ - - rlwimi r12, r10, 0, 26, 31 /* Insert static perms */ - rlwinm r12, r12, 0, 20, 15 /* Clear U0-U3 */ - tlbwe r12, r13, PPC44x_TLB_ATTRIB /* Write ATTRIB */ + li r11,PPC44x_TLB_VALID | PPC44x_TLBE_SIZE + /* Insert valid and page size */ + rlwimi r10,r11,0,PPC44x_PTE_ADD_MASK_BIT,31 + tlbwe r10,r13,PPC44x_TLB_PAGEID /* Write PAGEID */ + + /* And WS 2 */ + li r10,0xf85 /* Mask to apply from PTE */ + rlwimi r10,r12,29,30,30 /* DIRTY -> SW position */ + and r11,r12,r10 /* Mask PTE bits to keep */ + andi. r10,r12,_PAGE_USER /* User page ? */ + beq 1f /* nope, leave U bits empty */ + rlwimi r11,r11,3,26,28 /* yes, copy S bits to U */ +1: tlbwe r11,r13,PPC44x_TLB_ATTRIB /* Write ATTRIB */ /* Done...restore registers and get out of here. 
*/ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 rfi /* Force context change */ -/* - * Global functions +/* TLB error interrupts for 476 */ +#ifdef CONFIG_PPC_47x + START_EXCEPTION(DataTLBError47x) + mtspr SPRN_SPRG_WSCRATCH0,r10 /* Save some working registers */ + mtspr SPRN_SPRG_WSCRATCH1,r11 + mtspr SPRN_SPRG_WSCRATCH2,r12 + mtspr SPRN_SPRG_WSCRATCH3,r13 + mfcr r11 + mtspr SPRN_SPRG_WSCRATCH4,r11 + mfspr r10,SPRN_DEAR /* Get faulting address */ + + /* If we are faulting a kernel address, we have to use the + * kernel page tables. + */ + lis r11,PAGE_OFFSET@h + cmplw cr0,r10,r11 + blt+ 3f + lis r11,swapper_pg_dir@h + ori r11,r11, swapper_pg_dir@l + li r12,0 /* MMUCR = 0 */ + b 4f + + /* Get the PGD for the current thread and setup MMUCR */ +3: mfspr r11,SPRN_SPRG3 + lwz r11,PGDIR(r11) + mfspr r12,SPRN_PID /* Get PID */ +4: mtspr SPRN_MMUCR,r12 /* Set MMUCR */ + + /* Mask of required permission bits. Note that while we + * do copy ESR:ST to _PAGE_RW position as trying to write + * to an RO page is pretty common, we don't do it with + * _PAGE_DIRTY. We could do it, but it's a fairly rare + * event so I'd rather take the overhead when it happens + * rather than adding an instruction here. We should measure + * whether the whole thing is worth it in the first place + * as we could avoid loading SPRN_ESR completely in the first + * place... + * + * TODO: Is it worth doing that mfspr & rlwimi in the first + * place or can we save a couple of instructions here ? 
+ */ + mfspr r12,SPRN_ESR + li r13,_PAGE_PRESENT|_PAGE_ACCESSED + rlwimi r13,r12,10,30,30 + + /* Load the PTE */ + /* Compute pgdir/pmd offset */ + rlwinm r12,r10,PPC44x_PGD_OFF_SHIFT,PPC44x_PGD_OFF_MASK_BIT,29 + lwzx r11,r12,r11 /* Get pgd/pmd entry */ + + /* Word 0 is EPN,V,TS,DSIZ */ + li r12,PPC47x_TLB0_VALID | PPC47x_TLBE_SIZE + rlwimi r10,r12,0,32-PAGE_SHIFT,31 /* Insert valid and page size*/ + li r12,0 + tlbwe r10,r12,0 + + /* XXX can we do better ? Need to make sure tlbwe has established + * latch V bit in MMUCR0 before the PTE is loaded further down */ +#ifdef CONFIG_SMP + isync +#endif + + rlwinm. r12,r11,0,0,20 /* Extract pt base address */ + /* Compute pte address */ + rlwimi r12,r10,PPC44x_PTE_ADD_SHIFT,PPC44x_PTE_ADD_MASK_BIT,28 + beq 2f /* Bail if no table */ + lwz r11,0(r12) /* Get high word of pte entry */ + + /* XXX can we do better ? maybe insert a known 0 bit from r11 into the + * bottom of r12 to create a data dependency... We can also use r10 + * as destination nowadays + */ +#ifdef CONFIG_SMP + lwsync +#endif + lwz r12,4(r12) /* Get low word of pte entry */ + + andc. r13,r13,r12 /* Check permission */ + + /* Jump to common tlb load */ + beq finish_tlb_load_47x + +2: /* The bailout. Restore registers to pre-exception conditions + * and call the heavyweights to help us out. + */ + mfspr r11,SPRN_SPRG_RSCRATCH4 + mtcr r11 + mfspr r13,SPRN_SPRG_RSCRATCH3 + mfspr r12,SPRN_SPRG_RSCRATCH2 + mfspr r11,SPRN_SPRG_RSCRATCH1 + mfspr r10,SPRN_SPRG_RSCRATCH0 + b DataStorage + + /* Instruction TLB Error Interrupt */ + /* + * Nearly the same as above, except we get our + * information from different registers and bailout + * to a different point. 
+ */ + START_EXCEPTION(InstructionTLBError47x) + mtspr SPRN_SPRG_WSCRATCH0,r10 /* Save some working registers */ + mtspr SPRN_SPRG_WSCRATCH1,r11 + mtspr SPRN_SPRG_WSCRATCH2,r12 + mtspr SPRN_SPRG_WSCRATCH3,r13 + mfcr r11 + mtspr SPRN_SPRG_WSCRATCH4,r11 + mfspr r10,SPRN_SRR0 /* Get faulting address */ + + /* If we are faulting a kernel address, we have to use the + * kernel page tables. + */ + lis r11,PAGE_OFFSET@h + cmplw cr0,r10,r11 + blt+ 3f + lis r11,swapper_pg_dir@h + ori r11,r11, swapper_pg_dir@l + li r12,0 /* MMUCR = 0 */ + b 4f + + /* Get the PGD for the current thread and setup MMUCR */ +3: mfspr r11,SPRN_SPRG_THREAD + lwz r11,PGDIR(r11) + mfspr r12,SPRN_PID /* Get PID */ +4: mtspr SPRN_MMUCR,r12 /* Set MMUCR */ + + /* Make up the required permissions */ + li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC + + /* Load PTE */ + /* Compute pgdir/pmd offset */ + rlwinm r12,r10,PPC44x_PGD_OFF_SHIFT,PPC44x_PGD_OFF_MASK_BIT,29 + lwzx r11,r12,r11 /* Get pgd/pmd entry */ + + /* Word 0 is EPN,V,TS,DSIZ */ + li r12,PPC47x_TLB0_VALID | PPC47x_TLBE_SIZE + rlwimi r10,r12,0,32-PAGE_SHIFT,31 /* Insert valid and page size*/ + li r12,0 + tlbwe r10,r12,0 + + /* XXX can we do better ? Need to make sure tlbwe has established + * latch V bit in MMUCR0 before the PTE is loaded further down */ +#ifdef CONFIG_SMP + isync +#endif + + rlwinm. r12,r11,0,0,20 /* Extract pt base address */ + /* Compute pte address */ + rlwimi r12,r10,PPC44x_PTE_ADD_SHIFT,PPC44x_PTE_ADD_MASK_BIT,28 + beq 2f /* Bail if no table */ + + lwz r11,0(r12) /* Get high word of pte entry */ + /* XXX can we do better ? maybe insert a known 0 bit from r11 into the + * bottom of r12 to create a data dependency... We can also use r10 + * as destination nowadays + */ +#ifdef CONFIG_SMP + lwsync +#endif + lwz r12,4(r12) /* Get low word of pte entry */ + + andc. r13,r13,r12 /* Check permission */ + + /* Jump to common TLB load point */ + beq finish_tlb_load_47x + +2: /* The bailout. 
Restore registers to pre-exception conditions + * and call the heavyweights to help us out. + */ + mfspr r11, SPRN_SPRG_RSCRATCH4 + mtcr r11 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 + b InstructionStorage /* - * extern void giveup_altivec(struct task_struct *prev) - * - * The 44x core does not have an AltiVec unit. + * Both the instruction and data TLB miss get to this + * point to load the TLB. + * r10 - free to use + * r11 - PTE high word value + * r12 - PTE low word value + * r13 - free to use + * MMUCR - loaded with proper value when we get here + * Upon exit, we reload everything and RFI. */ -_GLOBAL(giveup_altivec) - blr +finish_tlb_load_47x: + /* Combine RPN & ERPN an write WS 1 */ + rlwimi r11,r12,0,0,31-PAGE_SHIFT + tlbwe r11,r13,1 + + /* And make up word 2 */ + li r10,0xf85 /* Mask to apply from PTE */ + rlwimi r10,r12,29,30,30 /* DIRTY -> SW position */ + and r11,r12,r10 /* Mask PTE bits to keep */ + andi. r10,r12,_PAGE_USER /* User page ? */ + beq 1f /* nope, leave U bits empty */ + rlwimi r11,r11,3,26,28 /* yes, copy S bits to U */ +1: tlbwe r11,r13,2 + + /* Done...restore registers and get out of here. + */ + mfspr r11, SPRN_SPRG_RSCRATCH4 + mtcr r11 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 + rfi + +#endif /* CONFIG_PPC_47x */ + + /* Debug Interrupt */ + /* + * This statement needs to exist at the end of the IVPR + * definition just in case you end up taking a debug + * exception within another exception. + */ + DEBUG_CRIT_EXCEPTION + +interrupt_end: /* - * extern void giveup_fpu(struct task_struct *prev) - * - * The 44x core does not have an FPU. + * Global functions */ -#ifndef CONFIG_PPC_FPU -_GLOBAL(giveup_fpu) - blr -#endif /* - * extern void abort(void) - * - * At present, this routine just applies a system reset. 
+ * Adjust the machine check IVOR on 440A cores */ -_GLOBAL(abort) - mfspr r13,SPRN_DBCR0 - oris r13,r13,DBCR0_RST_SYSTEM@h - mtspr SPRN_DBCR0,r13 +_GLOBAL(__fixup_440A_mcheck) + li r3,MachineCheckA@l + mtspr SPRN_IVOR1,r3 + sync + blr _GLOBAL(set_context) @@ -739,40 +799,466 @@ _GLOBAL(set_context) blr /* + * Init CPU state. This is called at boot time or for secondary CPUs + * to setup initial TLB entries, setup IVORs, etc... + * + */ +_GLOBAL(init_cpu_state) + mflr r22 +#ifdef CONFIG_PPC_47x + /* We use the PVR to differenciate 44x cores from 476 */ + mfspr r3,SPRN_PVR + srwi r3,r3,16 + cmplwi cr0,r3,PVR_476FPE@h + beq head_start_47x + cmplwi cr0,r3,PVR_476@h + beq head_start_47x + cmplwi cr0,r3,PVR_476_ISS@h + beq head_start_47x +#endif /* CONFIG_PPC_47x */ + +/* + * In case the firmware didn't do it, we apply some workarounds + * that are good for all 440 core variants here + */ + mfspr r3,SPRN_CCR0 + rlwinm r3,r3,0,0,27 /* disable icache prefetch */ + isync + mtspr SPRN_CCR0,r3 + isync + sync + +/* + * Set up the initial MMU state for 44x + * + * We are still executing code at the virtual address + * mappings set by the firmware for the base of RAM. + * + * We first invalidate all TLB entries but the one + * we are running from. We then load the KERNELBASE + * mappings so we can begin to use kernel addresses + * natively and so the interrupt vector locations are + * permanently pinned (necessary since Book E + * implementations always have translation enabled). + * + * TODO: Use the known TLB entry we are running from to + * determine which physical region we are located + * in. This can be used to determine where in RAM + * (on a shared CPU system) or PCI memory space + * (on a DRAMless system) we are located. + * For now, we assume a perfect world which means + * we are located at the base of DRAM (physical 0). + */ + +/* + * Search TLB for entry that we are currently using. + * Invalidate all entries but the one we are using. 
+ */ + /* Load our current PID->MMUCR TID and MSR IS->MMUCR STS */ + mfspr r3,SPRN_PID /* Get PID */ + mfmsr r4 /* Get MSR */ + andi. r4,r4,MSR_IS@l /* TS=1? */ + beq wmmucr /* If not, leave STS=0 */ + oris r3,r3,PPC44x_MMUCR_STS@h /* Set STS=1 */ +wmmucr: mtspr SPRN_MMUCR,r3 /* Put MMUCR */ + sync + + bl invstr /* Find our address */ +invstr: mflr r5 /* Make it accessible */ + tlbsx r23,0,r5 /* Find entry we are in */ + li r4,0 /* Start at TLB entry 0 */ + li r3,0 /* Set PAGEID inval value */ +1: cmpw r23,r4 /* Is this our entry? */ + beq skpinv /* If so, skip the inval */ + tlbwe r3,r4,PPC44x_TLB_PAGEID /* If not, inval the entry */ +skpinv: addi r4,r4,1 /* Increment */ + cmpwi r4,64 /* Are we done? */ + bne 1b /* If not, repeat */ + isync /* If so, context change */ + +/* + * Configure and load pinned entry into TLB slot 63. + */ +#ifdef CONFIG_NONSTATIC_KERNEL + /* + * In case of a NONSTATIC_KERNEL we reuse the TLB XLAT + * entries of the initial mapping set by the boot loader. + * The XLAT entry is stored in r25 + */ + + /* Read the XLAT entry for our current mapping */ + tlbre r25,r23,PPC44x_TLB_XLAT + + lis r3,KERNELBASE@h + ori r3,r3,KERNELBASE@l + + /* Use our current RPN entry */ + mr r4,r25 +#else + + lis r3,PAGE_OFFSET@h + ori r3,r3,PAGE_OFFSET@l + + /* Kernel is at the base of RAM */ + li r4, 0 /* Load the kernel physical address */ +#endif + + /* Load the kernel PID = 0 */ + li r0,0 + mtspr SPRN_PID,r0 + sync + + /* Initialize MMUCR */ + li r5,0 + mtspr SPRN_MMUCR,r5 + sync + + /* pageid fields */ + clrrwi r3,r3,10 /* Mask off the effective page number */ + ori r3,r3,PPC44x_TLB_VALID | PPC44x_TLB_256M + + /* xlat fields */ + clrrwi r4,r4,10 /* Mask off the real page number */ + /* ERPN is 0 for first 4GB page */ + + /* attrib fields */ + /* Added guarded bit to protect against speculative loads/stores */ + li r5,0 + ori r5,r5,(PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G) + + li r0,63 /* TLB slot 63 */ + + tlbwe r3,r0,PPC44x_TLB_PAGEID 
/* Load the pageid fields */ + tlbwe r4,r0,PPC44x_TLB_XLAT /* Load the translation fields */ + tlbwe r5,r0,PPC44x_TLB_ATTRIB /* Load the attrib/access fields */ + + /* Force context change */ + mfmsr r0 + mtspr SPRN_SRR1, r0 + lis r0,3f@h + ori r0,r0,3f@l + mtspr SPRN_SRR0,r0 + sync + rfi + + /* If necessary, invalidate original entry we used */ +3: cmpwi r23,63 + beq 4f + li r6,0 + tlbwe r6,r23,PPC44x_TLB_PAGEID + isync + +4: +#ifdef CONFIG_PPC_EARLY_DEBUG_44x + /* Add UART mapping for early debug. */ + + /* pageid fields */ + lis r3,PPC44x_EARLY_DEBUG_VIRTADDR@h + ori r3,r3,PPC44x_TLB_VALID|PPC44x_TLB_TS|PPC44x_TLB_64K + + /* xlat fields */ + lis r4,CONFIG_PPC_EARLY_DEBUG_44x_PHYSLOW@h + ori r4,r4,CONFIG_PPC_EARLY_DEBUG_44x_PHYSHIGH + + /* attrib fields */ + li r5,(PPC44x_TLB_SW|PPC44x_TLB_SR|PPC44x_TLB_I|PPC44x_TLB_G) + li r0,62 /* TLB slot 0 */ + + tlbwe r3,r0,PPC44x_TLB_PAGEID + tlbwe r4,r0,PPC44x_TLB_XLAT + tlbwe r5,r0,PPC44x_TLB_ATTRIB + + /* Force context change */ + isync +#endif /* CONFIG_PPC_EARLY_DEBUG_44x */ + + /* Establish the interrupt vector offsets */ + SET_IVOR(0, CriticalInput); + SET_IVOR(1, MachineCheck); + SET_IVOR(2, DataStorage); + SET_IVOR(3, InstructionStorage); + SET_IVOR(4, ExternalInput); + SET_IVOR(5, Alignment); + SET_IVOR(6, Program); + SET_IVOR(7, FloatingPointUnavailable); + SET_IVOR(8, SystemCall); + SET_IVOR(9, AuxillaryProcessorUnavailable); + SET_IVOR(10, Decrementer); + SET_IVOR(11, FixedIntervalTimer); + SET_IVOR(12, WatchdogTimer); + SET_IVOR(13, DataTLBError44x); + SET_IVOR(14, InstructionTLBError44x); + SET_IVOR(15, DebugCrit); + + b head_start_common + + +#ifdef CONFIG_PPC_47x + +#ifdef CONFIG_SMP + +/* Entry point for secondary 47x processors */ +_GLOBAL(start_secondary_47x) + mr r24,r3 /* CPU number */ + + bl init_cpu_state + + /* Now we need to bolt the rest of kernel memory which + * is done in C code. 
We must be careful because our task + * struct or our stack can (and will probably) be out + * of reach of the initial 256M TLB entry, so we use a + * small temporary stack in .bss for that. This works + * because only one CPU at a time can be in this code + */ + lis r1,temp_boot_stack@h + ori r1,r1,temp_boot_stack@l + addi r1,r1,1024-STACK_FRAME_OVERHEAD + li r0,0 + stw r0,0(r1) + bl mmu_init_secondary + + /* Now we can get our task struct and real stack pointer */ + + /* Get current_thread_info and current */ + lis r1,secondary_ti@ha + lwz r1,secondary_ti@l(r1) + lwz r2,TI_TASK(r1) + + /* Current stack pointer */ + addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD + li r0,0 + stw r0,0(r1) + + /* Kernel stack for exception entry in SPRG3 */ + addi r4,r2,THREAD /* init task's THREAD */ + mtspr SPRN_SPRG3,r4 + + b start_secondary + +#endif /* CONFIG_SMP */ + +/* + * Set up the initial MMU state for 44x + * + * We are still executing code at the virtual address + * mappings set by the firmware for the base of RAM. + */ + +head_start_47x: + /* Load our current PID->MMUCR TID and MSR IS->MMUCR STS */ + mfspr r3,SPRN_PID /* Get PID */ + mfmsr r4 /* Get MSR */ + andi. r4,r4,MSR_IS@l /* TS=1? */ + beq 1f /* If not, leave STS=0 */ + oris r3,r3,PPC47x_MMUCR_STS@h /* Set STS=1 */ +1: mtspr SPRN_MMUCR,r3 /* Put MMUCR */ + sync + + /* Find the entry we are running from */ + bl 1f +1: mflr r23 + tlbsx r23,0,r23 + tlbre r24,r23,0 + tlbre r25,r23,1 + tlbre r26,r23,2 + +/* + * Cleanup time + */ + + /* Initialize MMUCR */ + li r5,0 + mtspr SPRN_MMUCR,r5 + sync + +clear_all_utlb_entries: + + #; Set initial values. + + addis r3,0,0x8000 + addi r4,0,0 + addi r5,0,0 + b clear_utlb_entry + + #; Align the loop to speed things up. + + .align 6 + +clear_utlb_entry: + + tlbwe r4,r3,0 + tlbwe r5,r3,1 + tlbwe r5,r3,2 + addis r3,r3,0x2000 + cmpwi r3,0 + bne clear_utlb_entry + addis r3,0,0x8000 + addis r4,r4,0x100 + cmpwi r4,0 + bne clear_utlb_entry + + #; Restore original entry. 
+ + oris r23,r23,0x8000 /* specify the way */ + tlbwe r24,r23,0 + tlbwe r25,r23,1 + tlbwe r26,r23,2 + +/* + * Configure and load pinned entry into TLB for the kernel core + */ + + lis r3,PAGE_OFFSET@h + ori r3,r3,PAGE_OFFSET@l + + /* Load the kernel PID = 0 */ + li r0,0 + mtspr SPRN_PID,r0 + sync + + /* Word 0 */ + clrrwi r3,r3,12 /* Mask off the effective page number */ + ori r3,r3,PPC47x_TLB0_VALID | PPC47x_TLB0_256M + + /* Word 1 - use r25. RPN is the same as the original entry */ + + /* Word 2 */ + li r5,0 + ori r5,r5,PPC47x_TLB2_S_RWX +#ifdef CONFIG_SMP + ori r5,r5,PPC47x_TLB2_M +#endif + + /* We write to way 0 and bolted 0 */ + lis r0,0x8800 + tlbwe r3,r0,0 + tlbwe r25,r0,1 + tlbwe r5,r0,2 + +/* + * Configure SSPCR, ISPCR and USPCR for now to search everything, we can fix + * them up later + */ + LOAD_REG_IMMEDIATE(r3, 0x9abcdef0) + mtspr SPRN_SSPCR,r3 + mtspr SPRN_USPCR,r3 + LOAD_REG_IMMEDIATE(r3, 0x12345670) + mtspr SPRN_ISPCR,r3 + + /* Force context change */ + mfmsr r0 + mtspr SPRN_SRR1, r0 + lis r0,3f@h + ori r0,r0,3f@l + mtspr SPRN_SRR0,r0 + sync + rfi + + /* Invalidate original entry we used */ +3: + rlwinm r24,r24,0,21,19 /* clear the "valid" bit */ + tlbwe r24,r23,0 + addi r24,0,0 + tlbwe r24,r23,1 + tlbwe r24,r23,2 + isync /* Clear out the shadow TLB entries */ + +#ifdef CONFIG_PPC_EARLY_DEBUG_44x + /* Add UART mapping for early debug. 
*/ + + /* Word 0 */ + lis r3,PPC44x_EARLY_DEBUG_VIRTADDR@h + ori r3,r3,PPC47x_TLB0_VALID | PPC47x_TLB0_TS | PPC47x_TLB0_1M + + /* Word 1 */ + lis r4,CONFIG_PPC_EARLY_DEBUG_44x_PHYSLOW@h + ori r4,r4,CONFIG_PPC_EARLY_DEBUG_44x_PHYSHIGH + + /* Word 2 */ + li r5,(PPC47x_TLB2_S_RW | PPC47x_TLB2_IMG) + + /* Bolted in way 0, bolt slot 5, we -hope- we don't hit the same + * congruence class as the kernel, we need to make sure of it at + * some point + */ + lis r0,0x8d00 + tlbwe r3,r0,0 + tlbwe r4,r0,1 + tlbwe r5,r0,2 + + /* Force context change */ + isync +#endif /* CONFIG_PPC_EARLY_DEBUG_44x */ + + /* Establish the interrupt vector offsets */ + SET_IVOR(0, CriticalInput); + SET_IVOR(1, MachineCheckA); + SET_IVOR(2, DataStorage); + SET_IVOR(3, InstructionStorage); + SET_IVOR(4, ExternalInput); + SET_IVOR(5, Alignment); + SET_IVOR(6, Program); + SET_IVOR(7, FloatingPointUnavailable); + SET_IVOR(8, SystemCall); + SET_IVOR(9, AuxillaryProcessorUnavailable); + SET_IVOR(10, Decrementer); + SET_IVOR(11, FixedIntervalTimer); + SET_IVOR(12, WatchdogTimer); + SET_IVOR(13, DataTLBError47x); + SET_IVOR(14, InstructionTLBError47x); + SET_IVOR(15, DebugCrit); + + /* We configure icbi to invalidate 128 bytes at a time since the + * current 32-bit kernel code isn't too happy with icache != dcache + * block size + */ + mfspr r3,SPRN_CCR0 + oris r3,r3,0x0020 + mtspr SPRN_CCR0,r3 + isync + +#endif /* CONFIG_PPC_47x */ + +/* + * Here we are back to code that is common between 44x and 47x + * + * We proceed to further kernel initialization and return to the + * main kernel entry + */ +head_start_common: + /* Establish the interrupt vector base */ + lis r4,interrupt_base@h /* IVPR only uses the high 16-bits */ + mtspr SPRN_IVPR,r4 + + /* + * If the kernel was loaded at a non-zero 256 MB page, we need to + * mask off the most significant 4 bits to get the relative address + * from the start of physical memory + */ + rlwinm r22,r22,0,4,31 + addis r22,r22,PAGE_OFFSET@h + mtlr r22 + isync + blr + 
+/* * We put a few things here that have to be page-aligned. This stuff * goes at the beginning of the data segment, which is page-aligned. */ .data - .align 12 + .align PAGE_SHIFT .globl sdata sdata: .globl empty_zero_page empty_zero_page: - .space 4096 + .space PAGE_SIZE /* * To support >32-bit physical addresses, we use an 8KB pgdir. */ .globl swapper_pg_dir swapper_pg_dir: - .space 8192 - -/* Reserved 4k for the critical exception stack & 4k for the machine - * check stack per CPU for kernel mode exceptions */ - .section .bss - .align 12 -exception_stack_bottom: - .space BOOKE_EXCEPTION_STACK_SIZE - .globl exception_stack_top -exception_stack_top: - -/* - * This space gets a copy of optional info passed to us by the bootstrap - * which is used to pass parameters into the kernel like root=/dev/sda1, etc. - */ - .globl cmd_line -cmd_line: - .space 512 + .space PGD_TABLE_SIZE /* * Room for two PTE pointers, usually the kernel and current user pointers @@ -780,3 +1266,9 @@ cmd_line: */ abatron_pteptrs: .space 8 + +#ifdef CONFIG_SMP + .align 12 +temp_boot_stack: + .space 1024 +#endif /* CONFIG_SMP */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 147215a0d6c..a95145d7f61 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -1,6 +1,4 @@ /* - * arch/ppc64/kernel/head.S - * * PowerPC version * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) * @@ -14,8 +12,9 @@ * Adapted for 64bit PowerPC by Dave Engebretsen, Peter Bergner, and * Mike Corrigan {engebret|bergner|mikejc}@us.ibm.com * - * This file contains the low-level support and setup for the - * PowerPC-64 platform, including trap and interrupt dispatch. + * This file contains the entry point for the 64-bit kernel along + * with some early initialization code common to all 64-bit powerpc + * variants. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -23,100 +22,65 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/config.h> #include <linux/threads.h> +#include <linux/init.h> #include <asm/reg.h> #include <asm/page.h> #include <asm/mmu.h> -#include <asm/systemcfg.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> #include <asm/bug.h> #include <asm/cputable.h> #include <asm/setup.h> #include <asm/hvcall.h> -#include <asm/iSeries/LparMap.h> #include <asm/thread_info.h> +#include <asm/firmware.h> +#include <asm/page_64.h> +#include <asm/irqflags.h> +#include <asm/kvm_book3s_asm.h> +#include <asm/ptrace.h> +#include <asm/hw_irq.h> -#ifdef CONFIG_PPC_ISERIES -#define DO_SOFT_DISABLE -#endif - -/* - * We layout physical memory as follows: - * 0x0000 - 0x00ff : Secondary processor spin code - * 0x0100 - 0x2fff : pSeries Interrupt prologs - * 0x3000 - 0x5fff : interrupt support, iSeries and common interrupt prologs - * 0x6000 - 0x6fff : Initial (CPU0) segment table - * 0x7000 - 0x7fff : FWNMI data area - * 0x8000 - : Early init and support code - */ - -/* - * SPRG Usage - * - * Register Definition - * - * SPRG0 reserved for hypervisor - * SPRG1 temp - used to save gpr - * SPRG2 temp - used to save gpr - * SPRG3 virt addr of paca +/* The physical memory is laid out such that the secondary processor + * spin code sits at 0x0000...0x00ff. On server, the vectors follow + * using the layout described in exceptions-64s.S */ /* * Entering into this code we make the following assumptions: - * For pSeries: + * + * For pSeries or server processors: * 1. The MMU is off & open firmware is running in real mode. * 2. The kernel is entered at __start + * -or- For OPAL entry: + * 1. The MMU is off, processor in HV mode, primary CPU enters at 0 + * with device-tree in gpr3. We also get OPAL base in r8 and + * entry in r9 for debugging purposes + * 2. 
Secondary processors enter at 0x60 with PIR in gpr3 * - * For iSeries: - * 1. The MMU is on (as it always is for iSeries) - * 2. The kernel is entered at system_reset_iSeries + * For Book3E processors: + * 1. The MMU is on running in AS0 in a state defined in ePAPR + * 2. The kernel is entered at __start */ .text .globl _stext _stext: -#ifdef CONFIG_PPC_MULTIPLATFORM _GLOBAL(__start) /* NOP this out unconditionally */ BEGIN_FTR_SECTION - b .__start_initialization_multiplatform + FIXUP_ENDIAN + b __start_initialization_multiplatform END_FTR_SECTION(0, 1) -#endif /* CONFIG_PPC_MULTIPLATFORM */ /* Catch branch to 0 in real mode */ trap -#ifdef CONFIG_PPC_ISERIES - /* - * At offset 0x20, there is a pointer to iSeries LPAR data. - * This is required by the hypervisor + /* Secondary processors spin on this value until it becomes non-zero. + * When non-zero, it contains the real address of the function the cpu + * should jump to. */ - . = 0x20 - .llong hvReleaseData-KERNELBASE - - /* - * At offset 0x28 and 0x30 are offsets to the mschunks_map - * array (used by the iSeries LPAR debugger to do translation - * between physical addresses and absolute addresses) and - * to the pidhash table (also used by the debugger) - */ - .llong mschunks_map-KERNELBASE - .llong 0 /* pidhash-KERNELBASE SFRXXX */ - - /* Offset 0x38 - Pointer to start of embedded System.map */ - .globl embedded_sysmap_start -embedded_sysmap_start: - .llong 0 - /* Offset 0x40 - Pointer to end of embedded System.map */ - .globl embedded_sysmap_end -embedded_sysmap_end: - .llong 0 - -#endif /* CONFIG_PPC_ISERIES */ - - /* Secondary processors spin on this value until it goes to 1. */ + .balign 8 .globl __secondary_hold_spinloop __secondary_hold_spinloop: .llong 0x0 @@ -127,42 +91,78 @@ __secondary_hold_spinloop: __secondary_hold_acknowledge: .llong 0x0 +#ifdef CONFIG_RELOCATABLE + /* This flag is set to 1 by a loader if the kernel should run + * at the loaded address instead of the linked address. 
This + * is used by kexec-tools to keep the kdump kernel in the + * crash_kernel region. The loader is responsible for + * observing the alignment requirement. + */ + /* Do not move this variable as kexec-tools knows about it. */ + . = 0x5c + .globl __run_at_load +__run_at_load: + .long 0x72756e30 /* "run0" -- relocate to 0 by default */ +#endif + . = 0x60 /* - * The following code is used on pSeries to hold secondary processors - * in a spin loop after they have been freed from OpenFirmware, but + * The following code is used to hold secondary processors + * in a spin loop after they have entered the kernel, but * before the bulk of the kernel has been relocated. This code * is relocated to physical address 0x60 before prom_init is run. * All of it must fit below the first exception vector at 0x100. + * Use .globl here not _GLOBAL because we want __secondary_hold + * to be the actual text address, not a descriptor. */ -_GLOBAL(__secondary_hold) + .globl __secondary_hold +__secondary_hold: + FIXUP_ENDIAN +#ifndef CONFIG_PPC_BOOK3E mfmsr r24 ori r24,r24,MSR_RI mtmsrd r24 /* RI on */ - - /* Grab our linux cpu number */ +#endif + /* Grab our physical cpu number */ mr r24,r3 + /* stash r4 for book3e */ + mr r25,r4 /* Tell the master cpu we're here */ /* Relocation is off & we are located at an address less */ /* than 0x100, so only need to grab low order offset. */ - std r24,__secondary_hold_acknowledge@l(0) + std r24,__secondary_hold_acknowledge-_stext(0) sync + li r26,0 +#ifdef CONFIG_PPC_BOOK3E + tovirt(r26,r26) +#endif /* All secondary cpus wait here until told to start.
*/ -100: ld r4,__secondary_hold_spinloop@l(0) - cmpdi 0,r4,1 - bne 100b +100: ld r12,__secondary_hold_spinloop-_stext(r26) + cmpdi 0,r12,0 + beq 100b -#ifdef CONFIG_HMT - b .hmt_init -#else -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_KEXEC) +#ifdef CONFIG_PPC_BOOK3E + tovirt(r12,r12) +#endif + mtctr r12 mr r3,r24 - b .pSeries_secondary_smp_init + /* + * it may be the case that other platforms have r4 right to + * begin with, this gives us some safety in case it is not + */ +#ifdef CONFIG_PPC_BOOK3E + mr r4,r25 #else - BUG_OPCODE + li r4,0 #endif + /* Make sure that patched code is visible */ + isync + bctr +#else + BUG_OPCODE #endif /* This value is used to mark exception frames on the stack. */ @@ -172,1163 +172,145 @@ exception_marker: .text /* - * The following macros define the code that appears as - * the prologue to each of the exception handlers. They - * are split into two parts to allow a single kernel binary - * to be used for pSeries and iSeries. - * LOL. One day... - paulus - */ - -/* - * We make as much of the exception code common between native - * exception handlers (including pSeries LPAR) and iSeries LPAR - * implementations as possible. - */ - -/* - * This is the start of the interrupt handlers for pSeries - * This code runs with relocation off. 
- */ -#define EX_R9 0 -#define EX_R10 8 -#define EX_R11 16 -#define EX_R12 24 -#define EX_R13 32 -#define EX_SRR0 40 -#define EX_R3 40 /* SLB miss saves R3, but not SRR0 */ -#define EX_DAR 48 -#define EX_LR 48 /* SLB miss saves LR, but not DAR */ -#define EX_DSISR 56 -#define EX_CCR 60 - -#define EXCEPTION_PROLOG_PSERIES(area, label) \ - mfspr r13,SPRN_SPRG3; /* get paca address into r13 */ \ - std r9,area+EX_R9(r13); /* save r9 - r12 */ \ - std r10,area+EX_R10(r13); \ - std r11,area+EX_R11(r13); \ - std r12,area+EX_R12(r13); \ - mfspr r9,SPRN_SPRG1; \ - std r9,area+EX_R13(r13); \ - mfcr r9; \ - clrrdi r12,r13,32; /* get high part of &label */ \ - mfmsr r10; \ - mfspr r11,SPRN_SRR0; /* save SRR0 */ \ - ori r12,r12,(label)@l; /* virt addr of handler */ \ - ori r10,r10,MSR_IR|MSR_DR|MSR_RI; \ - mtspr SPRN_SRR0,r12; \ - mfspr r12,SPRN_SRR1; /* and SRR1 */ \ - mtspr SPRN_SRR1,r10; \ - rfid; \ - b . /* prevent speculative execution */ - -/* - * This is the start of the interrupt handlers for iSeries - * This code runs with relocation on. - */ -#define EXCEPTION_PROLOG_ISERIES_1(area) \ - mfspr r13,SPRN_SPRG3; /* get paca address into r13 */ \ - std r9,area+EX_R9(r13); /* save r9 - r12 */ \ - std r10,area+EX_R10(r13); \ - std r11,area+EX_R11(r13); \ - std r12,area+EX_R12(r13); \ - mfspr r9,SPRN_SPRG1; \ - std r9,area+EX_R13(r13); \ - mfcr r9 - -#define EXCEPTION_PROLOG_ISERIES_2 \ - mfmsr r10; \ - ld r11,PACALPPACA+LPPACASRR0(r13); \ - ld r12,PACALPPACA+LPPACASRR1(r13); \ - ori r10,r10,MSR_RI; \ - mtmsrd r10,1 - -/* - * The common exception prolog is used for all except a few exceptions - * such as a segment miss on a kernel address. We have to be prepared - * to take another exception from the point where we first touch the - * kernel stack onwards. - * - * On entry r13 points to the paca, r9-r13 are saved in the paca, - * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and - * SRR1, and relocation is on. 
- */ -#define EXCEPTION_PROLOG_COMMON(n, area) \ - andi. r10,r12,MSR_PR; /* See if coming from user */ \ - mr r10,r1; /* Save r1 */ \ - subi r1,r1,INT_FRAME_SIZE; /* alloc frame on kernel stack */ \ - beq- 1f; \ - ld r1,PACAKSAVE(r13); /* kernel stack to use */ \ -1: cmpdi cr1,r1,0; /* check if r1 is in userspace */ \ - bge- cr1,bad_stack; /* abort if it is */ \ - std r9,_CCR(r1); /* save CR in stackframe */ \ - std r11,_NIP(r1); /* save SRR0 in stackframe */ \ - std r12,_MSR(r1); /* save SRR1 in stackframe */ \ - std r10,0(r1); /* make stack chain pointer */ \ - std r0,GPR0(r1); /* save r0 in stackframe */ \ - std r10,GPR1(r1); /* save r1 in stackframe */ \ - std r2,GPR2(r1); /* save r2 in stackframe */ \ - SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ - SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ - ld r9,area+EX_R9(r13); /* move r9, r10 to stackframe */ \ - ld r10,area+EX_R10(r13); \ - std r9,GPR9(r1); \ - std r10,GPR10(r1); \ - ld r9,area+EX_R11(r13); /* move r11 - r13 to stackframe */ \ - ld r10,area+EX_R12(r13); \ - ld r11,area+EX_R13(r13); \ - std r9,GPR11(r1); \ - std r10,GPR12(r1); \ - std r11,GPR13(r1); \ - ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \ - mflr r9; /* save LR in stackframe */ \ - std r9,_LINK(r1); \ - mfctr r10; /* save CTR in stackframe */ \ - std r10,_CTR(r1); \ - mfspr r11,SPRN_XER; /* save XER in stackframe */ \ - std r11,_XER(r1); \ - li r9,(n)+1; \ - std r9,_TRAP(r1); /* set trap number */ \ - li r10,0; \ - ld r11,exception_marker@toc(r2); \ - std r10,RESULT(r1); /* clear regs->result */ \ - std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ - -/* - * Exception vectors. - */ -#define STD_EXCEPTION_PSERIES(n, label) \ - . 
= n; \ - .globl label##_pSeries; \ -label##_pSeries: \ - HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ - RUNLATCH_ON(r13); \ - EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common) - -#define STD_EXCEPTION_ISERIES(n, label, area) \ - .globl label##_iSeries; \ -label##_iSeries: \ - HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ - RUNLATCH_ON(r13); \ - EXCEPTION_PROLOG_ISERIES_1(area); \ - EXCEPTION_PROLOG_ISERIES_2; \ - b label##_common - -#define MASKABLE_EXCEPTION_ISERIES(n, label) \ - .globl label##_iSeries; \ -label##_iSeries: \ - HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ - RUNLATCH_ON(r13); \ - EXCEPTION_PROLOG_ISERIES_1(PACA_EXGEN); \ - lbz r10,PACAPROCENABLED(r13); \ - cmpwi 0,r10,0; \ - beq- label##_iSeries_masked; \ - EXCEPTION_PROLOG_ISERIES_2; \ - b label##_common; \ - -#ifdef DO_SOFT_DISABLE -#define DISABLE_INTS \ - lbz r10,PACAPROCENABLED(r13); \ - li r11,0; \ - std r10,SOFTE(r1); \ - mfmsr r10; \ - stb r11,PACAPROCENABLED(r13); \ - ori r10,r10,MSR_EE; \ - mtmsrd r10,1 - -#define ENABLE_INTS \ - lbz r10,PACAPROCENABLED(r13); \ - mfmsr r11; \ - std r10,SOFTE(r1); \ - ori r11,r11,MSR_EE; \ - mtmsrd r11,1 - -#else /* hard enable/disable interrupts */ -#define DISABLE_INTS - -#define ENABLE_INTS \ - ld r12,_MSR(r1); \ - mfmsr r11; \ - rlwimi r11,r12,0,MSR_EE; \ - mtmsrd r11,1 - -#endif - -#define STD_EXCEPTION_COMMON(trap, label, hdlr) \ - .align 7; \ - .globl label##_common; \ -label##_common: \ - EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN); \ - DISABLE_INTS; \ - bl .save_nvgprs; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - bl hdlr; \ - b .ret_from_except - -#define STD_EXCEPTION_COMMON_LITE(trap, label, hdlr) \ - .align 7; \ - .globl label##_common; \ -label##_common: \ - EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN); \ - DISABLE_INTS; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - bl hdlr; \ - b .ret_from_except_lite - -/* - * Start of pSeries system interrupt routines - */ - . 
= 0x100 - .globl __start_interrupts -__start_interrupts: - - STD_EXCEPTION_PSERIES(0x100, system_reset) - - . = 0x200 -_machine_check_pSeries: - HMT_MEDIUM - mtspr SPRN_SPRG1,r13 /* save r13 */ - RUNLATCH_ON(r13) - EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common) - - . = 0x300 - .globl data_access_pSeries -data_access_pSeries: - HMT_MEDIUM - mtspr SPRN_SPRG1,r13 -BEGIN_FTR_SECTION - mtspr SPRN_SPRG2,r12 - mfspr r13,SPRN_DAR - mfspr r12,SPRN_DSISR - srdi r13,r13,60 - rlwimi r13,r12,16,0x20 - mfcr r12 - cmpwi r13,0x2c - beq .do_stab_bolted_pSeries - mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG2 -END_FTR_SECTION_IFCLR(CPU_FTR_SLB) - EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common) - - . = 0x380 - .globl data_access_slb_pSeries -data_access_slb_pSeries: - HMT_MEDIUM - mtspr SPRN_SPRG1,r13 - RUNLATCH_ON(r13) - mfspr r13,SPRN_SPRG3 /* get paca address into r13 */ - std r9,PACA_EXSLB+EX_R9(r13) /* save r9 - r12 */ - std r10,PACA_EXSLB+EX_R10(r13) - std r11,PACA_EXSLB+EX_R11(r13) - std r12,PACA_EXSLB+EX_R12(r13) - std r3,PACA_EXSLB+EX_R3(r13) - mfspr r9,SPRN_SPRG1 - std r9,PACA_EXSLB+EX_R13(r13) - mfcr r9 - mfspr r12,SPRN_SRR1 /* and SRR1 */ - mfspr r3,SPRN_DAR - b .do_slb_miss /* Rel. branch works in real mode */ - - STD_EXCEPTION_PSERIES(0x400, instruction_access) - - . = 0x480 - .globl instruction_access_slb_pSeries -instruction_access_slb_pSeries: - HMT_MEDIUM - mtspr SPRN_SPRG1,r13 - RUNLATCH_ON(r13) - mfspr r13,SPRN_SPRG3 /* get paca address into r13 */ - std r9,PACA_EXSLB+EX_R9(r13) /* save r9 - r12 */ - std r10,PACA_EXSLB+EX_R10(r13) - std r11,PACA_EXSLB+EX_R11(r13) - std r12,PACA_EXSLB+EX_R12(r13) - std r3,PACA_EXSLB+EX_R3(r13) - mfspr r9,SPRN_SPRG1 - std r9,PACA_EXSLB+EX_R13(r13) - mfcr r9 - mfspr r12,SPRN_SRR1 /* and SRR1 */ - mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - b .do_slb_miss /* Rel. 
branch works in real mode */ - - STD_EXCEPTION_PSERIES(0x500, hardware_interrupt) - STD_EXCEPTION_PSERIES(0x600, alignment) - STD_EXCEPTION_PSERIES(0x700, program_check) - STD_EXCEPTION_PSERIES(0x800, fp_unavailable) - STD_EXCEPTION_PSERIES(0x900, decrementer) - STD_EXCEPTION_PSERIES(0xa00, trap_0a) - STD_EXCEPTION_PSERIES(0xb00, trap_0b) - - . = 0xc00 - .globl system_call_pSeries -system_call_pSeries: - HMT_MEDIUM - RUNLATCH_ON(r9) - mr r9,r13 - mfmsr r10 - mfspr r13,SPRN_SPRG3 - mfspr r11,SPRN_SRR0 - clrrdi r12,r13,32 - oris r12,r12,system_call_common@h - ori r12,r12,system_call_common@l - mtspr SPRN_SRR0,r12 - ori r10,r10,MSR_IR|MSR_DR|MSR_RI - mfspr r12,SPRN_SRR1 - mtspr SPRN_SRR1,r10 - rfid - b . /* prevent speculative execution */ - - STD_EXCEPTION_PSERIES(0xd00, single_step) - STD_EXCEPTION_PSERIES(0xe00, trap_0e) - - /* We need to deal with the Altivec unavailable exception - * here which is at 0xf20, thus in the middle of the - * prolog code of the PerformanceMonitor one. A little - * trickery is thus necessary - */ - . = 0xf00 - b performance_monitor_pSeries - - STD_EXCEPTION_PSERIES(0xf20, altivec_unavailable) - - STD_EXCEPTION_PSERIES(0x1300, instruction_breakpoint) - STD_EXCEPTION_PSERIES(0x1700, altivec_assist) - - . = 0x3000 - -/*** pSeries interrupt support ***/ - - /* moved from 0xf00 */ - STD_EXCEPTION_PSERIES(., performance_monitor) - - .align 7 -_GLOBAL(do_stab_bolted_pSeries) - mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG2 - EXCEPTION_PROLOG_PSERIES(PACA_EXSLB, .do_stab_bolted) - -/* - * Vectors for the FWNMI option. Share common code. 
- */ - .globl system_reset_fwnmi -system_reset_fwnmi: - HMT_MEDIUM - mtspr SPRN_SPRG1,r13 /* save r13 */ - RUNLATCH_ON(r13) - EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common) - - .globl machine_check_fwnmi -machine_check_fwnmi: - HMT_MEDIUM - mtspr SPRN_SPRG1,r13 /* save r13 */ - RUNLATCH_ON(r13) - EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common) - -#ifdef CONFIG_PPC_ISERIES -/*** ISeries-LPAR interrupt handlers ***/ - - STD_EXCEPTION_ISERIES(0x200, machine_check, PACA_EXMC) - - .globl data_access_iSeries -data_access_iSeries: - mtspr SPRN_SPRG1,r13 -BEGIN_FTR_SECTION - mtspr SPRN_SPRG2,r12 - mfspr r13,SPRN_DAR - mfspr r12,SPRN_DSISR - srdi r13,r13,60 - rlwimi r13,r12,16,0x20 - mfcr r12 - cmpwi r13,0x2c - beq .do_stab_bolted_iSeries - mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG2 -END_FTR_SECTION_IFCLR(CPU_FTR_SLB) - EXCEPTION_PROLOG_ISERIES_1(PACA_EXGEN) - EXCEPTION_PROLOG_ISERIES_2 - b data_access_common - -.do_stab_bolted_iSeries: - mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG2 - EXCEPTION_PROLOG_ISERIES_1(PACA_EXSLB) - EXCEPTION_PROLOG_ISERIES_2 - b .do_stab_bolted - - .globl data_access_slb_iSeries -data_access_slb_iSeries: - mtspr SPRN_SPRG1,r13 /* save r13 */ - EXCEPTION_PROLOG_ISERIES_1(PACA_EXSLB) - std r3,PACA_EXSLB+EX_R3(r13) - ld r12,PACALPPACA+LPPACASRR1(r13) - mfspr r3,SPRN_DAR - b .do_slb_miss - - STD_EXCEPTION_ISERIES(0x400, instruction_access, PACA_EXGEN) - - .globl instruction_access_slb_iSeries -instruction_access_slb_iSeries: - mtspr SPRN_SPRG1,r13 /* save r13 */ - EXCEPTION_PROLOG_ISERIES_1(PACA_EXSLB) - std r3,PACA_EXSLB+EX_R3(r13) - ld r12,PACALPPACA+LPPACASRR1(r13) - ld r3,PACALPPACA+LPPACASRR0(r13) - b .do_slb_miss - - MASKABLE_EXCEPTION_ISERIES(0x500, hardware_interrupt) - STD_EXCEPTION_ISERIES(0x600, alignment, PACA_EXGEN) - STD_EXCEPTION_ISERIES(0x700, program_check, PACA_EXGEN) - STD_EXCEPTION_ISERIES(0x800, fp_unavailable, PACA_EXGEN) - MASKABLE_EXCEPTION_ISERIES(0x900, decrementer) - STD_EXCEPTION_ISERIES(0xa00, trap_0a, PACA_EXGEN) 
- STD_EXCEPTION_ISERIES(0xb00, trap_0b, PACA_EXGEN) - - .globl system_call_iSeries -system_call_iSeries: - mr r9,r13 - mfspr r13,SPRN_SPRG3 - EXCEPTION_PROLOG_ISERIES_2 - b system_call_common - - STD_EXCEPTION_ISERIES( 0xd00, single_step, PACA_EXGEN) - STD_EXCEPTION_ISERIES( 0xe00, trap_0e, PACA_EXGEN) - STD_EXCEPTION_ISERIES( 0xf00, performance_monitor, PACA_EXGEN) - - .globl system_reset_iSeries -system_reset_iSeries: - mfspr r13,SPRN_SPRG3 /* Get paca address */ - mfmsr r24 - ori r24,r24,MSR_RI - mtmsrd r24 /* RI on */ - lhz r24,PACAPACAINDEX(r13) /* Get processor # */ - cmpwi 0,r24,0 /* Are we processor 0? */ - beq .__start_initialization_iSeries /* Start up the first processor */ - mfspr r4,SPRN_CTRLF - li r5,CTRL_RUNLATCH /* Turn off the run light */ - andc r4,r4,r5 - mtspr SPRN_CTRLT,r4 - -1: - HMT_LOW -#ifdef CONFIG_SMP - lbz r23,PACAPROCSTART(r13) /* Test if this processor - * should start */ - sync - LOADADDR(r3,current_set) - sldi r28,r24,3 /* get current_set[cpu#] */ - ldx r3,r3,r28 - addi r1,r3,THREAD_SIZE - subi r1,r1,STACK_FRAME_OVERHEAD - - cmpwi 0,r23,0 - beq iSeries_secondary_smp_loop /* Loop until told to go */ - bne .__secondary_start /* Loop until told to go */ -iSeries_secondary_smp_loop: - /* Let the Hypervisor know we are alive */ - /* 8002 is a call to HvCallCfg::getLps, a harmless Hypervisor function */ - lis r3,0x8002 - rldicr r3,r3,32,15 /* r0 = (r3 << 32) & 0xffff000000000000 */ -#else /* CONFIG_SMP */ - /* Yield the processor. This is required for non-SMP kernels - which are running on multi-threaded machines. */ - lis r3,0x8000 - rldicr r3,r3,32,15 /* r3 = (r3 << 32) & 0xffff000000000000 */ - addi r3,r3,18 /* r3 = 0x8000000000000012 which is "yield" */ - li r4,0 /* "yield timed" */ - li r5,-1 /* "yield forever" */ -#endif /* CONFIG_SMP */ - li r0,-1 /* r0=-1 indicates a Hypervisor call */ - sc /* Invoke the hypervisor via a system call */ - mfspr r13,SPRN_SPRG3 /* Put r13 back ???? 
*/ - b 1b /* If SMP not configured, secondaries - * loop forever */ - - .globl decrementer_iSeries_masked -decrementer_iSeries_masked: - li r11,1 - stb r11,PACALPPACA+LPPACADECRINT(r13) - lwz r12,PACADEFAULTDECR(r13) - mtspr SPRN_DEC,r12 - /* fall through */ - - .globl hardware_interrupt_iSeries_masked -hardware_interrupt_iSeries_masked: - mtcrf 0x80,r9 /* Restore regs */ - ld r11,PACALPPACA+LPPACASRR0(r13) - ld r12,PACALPPACA+LPPACASRR1(r13) - mtspr SPRN_SRR0,r11 - mtspr SPRN_SRR1,r12 - ld r9,PACA_EXGEN+EX_R9(r13) - ld r10,PACA_EXGEN+EX_R10(r13) - ld r11,PACA_EXGEN+EX_R11(r13) - ld r12,PACA_EXGEN+EX_R12(r13) - ld r13,PACA_EXGEN+EX_R13(r13) - rfid - b . /* prevent speculative execution */ -#endif /* CONFIG_PPC_ISERIES */ - -/*** Common interrupt handlers ***/ - - STD_EXCEPTION_COMMON(0x100, system_reset, .system_reset_exception) - - /* - * Machine check is different because we use a different - * save area: PACA_EXMC instead of PACA_EXGEN. - */ - .align 7 - .globl machine_check_common -machine_check_common: - EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC) - DISABLE_INTS - bl .save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl .machine_check_exception - b .ret_from_except - - STD_EXCEPTION_COMMON_LITE(0x900, decrementer, .timer_interrupt) - STD_EXCEPTION_COMMON(0xa00, trap_0a, .unknown_exception) - STD_EXCEPTION_COMMON(0xb00, trap_0b, .unknown_exception) - STD_EXCEPTION_COMMON(0xd00, single_step, .single_step_exception) - STD_EXCEPTION_COMMON(0xe00, trap_0e, .unknown_exception) - STD_EXCEPTION_COMMON(0xf00, performance_monitor, .performance_monitor_exception) - STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, .instruction_breakpoint_exception) -#ifdef CONFIG_ALTIVEC - STD_EXCEPTION_COMMON(0x1700, altivec_assist, .altivec_assist_exception) -#else - STD_EXCEPTION_COMMON(0x1700, altivec_assist, .unknown_exception) -#endif - -/* - * Here we have detected that the kernel stack pointer is bad. 
- * R9 contains the saved CR, r13 points to the paca, - * r10 contains the (bad) kernel stack pointer, - * r11 and r12 contain the saved SRR0 and SRR1. - * We switch to using an emergency stack, save the registers there, - * and call kernel_bad_stack(), which panics. + * On server, we include the exception vectors code here as it + * relies on absolute addressing which is only possible within + * this compilation unit */ -bad_stack: - ld r1,PACAEMERGSP(r13) - subi r1,r1,64+INT_FRAME_SIZE - std r9,_CCR(r1) - std r10,GPR1(r1) - std r11,_NIP(r1) - std r12,_MSR(r1) - mfspr r11,SPRN_DAR - mfspr r12,SPRN_DSISR - std r11,_DAR(r1) - std r12,_DSISR(r1) - mflr r10 - mfctr r11 - mfxer r12 - std r10,_LINK(r1) - std r11,_CTR(r1) - std r12,_XER(r1) - SAVE_GPR(0,r1) - SAVE_GPR(2,r1) - SAVE_4GPRS(3,r1) - SAVE_2GPRS(7,r1) - SAVE_10GPRS(12,r1) - SAVE_10GPRS(22,r1) - addi r11,r1,INT_FRAME_SIZE - std r11,0(r1) - li r12,0 - std r12,0(r11) - ld r2,PACATOC(r13) -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl .kernel_bad_stack - b 1b - -/* - * Return from an exception with minimal checks. - * The caller is assumed to have done EXCEPTION_PROLOG_COMMON. - * If interrupts have been enabled, or anything has been - * done that might have changed the scheduling status of - * any task or sent any task a signal, you should use - * ret_from_except or ret_from_except_lite instead of this. - */ - .globl fast_exception_return -fast_exception_return: - ld r12,_MSR(r1) - ld r11,_NIP(r1) - andi. r3,r12,MSR_RI /* check if RI is set */ - beq- unrecov_fer - ld r3,_CCR(r1) - ld r4,_LINK(r1) - ld r5,_CTR(r1) - ld r6,_XER(r1) - mtcr r3 - mtlr r4 - mtctr r5 - mtxer r6 - REST_GPR(0, r1) - REST_8GPRS(2, r1) - - mfmsr r10 - clrrdi r10,r10,2 /* clear RI (LE is 0 already) */ - mtmsrd r10,1 - - mtspr SPRN_SRR1,r12 - mtspr SPRN_SRR0,r11 - REST_4GPRS(10, r1) - ld r1,GPR1(r1) - rfid - b . 
/* prevent speculative execution */ - -unrecov_fer: - bl .save_nvgprs -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl .unrecoverable_exception - b 1b - -/* - * Here r13 points to the paca, r9 contains the saved CR, - * SRR0 and SRR1 are saved in r11 and r12, - * r9 - r13 are saved in paca->exgen. - */ - .align 7 - .globl data_access_common -data_access_common: - RUNLATCH_ON(r10) /* It wont fit in the 0x300 handler */ - mfspr r10,SPRN_DAR - std r10,PACA_EXGEN+EX_DAR(r13) - mfspr r10,SPRN_DSISR - stw r10,PACA_EXGEN+EX_DSISR(r13) - EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN) - ld r3,PACA_EXGEN+EX_DAR(r13) - lwz r4,PACA_EXGEN+EX_DSISR(r13) - li r5,0x300 - b .do_hash_page /* Try to handle as hpte fault */ - - .align 7 - .globl instruction_access_common -instruction_access_common: - EXCEPTION_PROLOG_COMMON(0x400, PACA_EXGEN) - ld r3,_NIP(r1) - andis. r4,r12,0x5820 - li r5,0x400 - b .do_hash_page /* Try to handle as hpte fault */ - - .align 7 - .globl hardware_interrupt_common - .globl hardware_interrupt_entry -hardware_interrupt_common: - EXCEPTION_PROLOG_COMMON(0x500, PACA_EXGEN) -hardware_interrupt_entry: - DISABLE_INTS - addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_IRQ - b .ret_from_except_lite - - .align 7 - .globl alignment_common -alignment_common: - mfspr r10,SPRN_DAR - std r10,PACA_EXGEN+EX_DAR(r13) - mfspr r10,SPRN_DSISR - stw r10,PACA_EXGEN+EX_DSISR(r13) - EXCEPTION_PROLOG_COMMON(0x600, PACA_EXGEN) - ld r3,PACA_EXGEN+EX_DAR(r13) - lwz r4,PACA_EXGEN+EX_DSISR(r13) - std r3,_DAR(r1) - std r4,_DSISR(r1) - bl .save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - ENABLE_INTS - bl .alignment_exception - b .ret_from_except - - .align 7 - .globl program_check_common -program_check_common: - EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) - bl .save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - ENABLE_INTS - bl .program_check_exception - b .ret_from_except - - .align 7 - .globl fp_unavailable_common -fp_unavailable_common: - EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN) - bne .load_up_fpu /* if 
from user, just load it up */ - bl .save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - ENABLE_INTS - bl .kernel_fp_unavailable_exception - BUG_OPCODE - - .align 7 - .globl altivec_unavailable_common -altivec_unavailable_common: - EXCEPTION_PROLOG_COMMON(0xf20, PACA_EXGEN) -#ifdef CONFIG_ALTIVEC -BEGIN_FTR_SECTION - bne .load_up_altivec /* if from user, just load it up */ -END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#ifdef CONFIG_PPC_BOOK3S +#include "exceptions-64s.S" #endif - bl .save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - ENABLE_INTS - bl .altivec_unavailable_exception - b .ret_from_except - -#ifdef CONFIG_ALTIVEC -/* - * load_up_altivec(unused, unused, tsk) - * Disable VMX for the task which had it previously, - * and save its vector registers in its thread_struct. - * Enables the VMX for use in the kernel on return. - * On SMP we know the VMX is free, since we give it up every - * switch (ie, no lazy save of the vector registers). - * On entry: r13 == 'current' && last_task_used_altivec != 'current' - */ -_STATIC(load_up_altivec) - mfmsr r5 /* grab the current MSR */ - oris r5,r5,MSR_VEC@h - mtmsrd r5 /* enable use of VMX now */ - isync - -/* - * For SMP, we don't do lazy VMX switching because it just gets too - * horrendously complex, especially when a task switches from one CPU - * to another. Instead we call giveup_altvec in switch_to. - * VRSAVE isn't dealt with here, that is done in the normal context - * switch code. Note that we could rely on vrsave value to eventually - * avoid saving all of the VREGs here... 
- */ -#ifndef CONFIG_SMP - ld r3,last_task_used_altivec@got(r2) - ld r4,0(r3) - cmpdi 0,r4,0 - beq 1f - /* Save VMX state to last_task_used_altivec's THREAD struct */ - addi r4,r4,THREAD - SAVE_32VRS(0,r5,r4) - mfvscr vr0 - li r10,THREAD_VSCR - stvx vr0,r10,r4 - /* Disable VMX for last_task_used_altivec */ - ld r5,PT_REGS(r4) - ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) - lis r6,MSR_VEC@h - andc r4,r4,r6 - std r4,_MSR-STACK_FRAME_OVERHEAD(r5) -1: -#endif /* CONFIG_SMP */ - /* Hack: if we get an altivec unavailable trap with VRSAVE - * set to all zeros, we assume this is a broken application - * that fails to set it properly, and thus we switch it to - * all 1's - */ - mfspr r4,SPRN_VRSAVE - cmpdi 0,r4,0 - bne+ 1f - li r4,-1 - mtspr SPRN_VRSAVE,r4 -1: - /* enable use of VMX after return */ - ld r4,PACACURRENT(r13) - addi r5,r4,THREAD /* Get THREAD */ - oris r12,r12,MSR_VEC@h - std r12,_MSR(r1) - li r4,1 - li r10,THREAD_VSCR - stw r4,THREAD_USED_VR(r5) - lvx vr0,r10,r5 - mtvscr vr0 - REST_32VRS(0,r4,r5) -#ifndef CONFIG_SMP - /* Update last_task_used_math to 'current' */ - subi r4,r5,THREAD /* Back to 'current' */ - std r4,0(r3) -#endif /* CONFIG_SMP */ - /* restore registers and return */ - b fast_exception_return -#endif /* CONFIG_ALTIVEC */ - -/* - * Hash table stuff - */ - .align 7 -_GLOBAL(do_hash_page) - std r3,_DAR(r1) - std r4,_DSISR(r1) - - andis. r0,r4,0xa450 /* weird error? */ - bne- .handle_page_fault /* if not, try to insert a HPTE */ -BEGIN_FTR_SECTION - andis. r0,r4,0x0020 /* Is it a segment table fault? */ - bne- .do_ste_alloc /* If so handle it */ -END_FTR_SECTION_IFCLR(CPU_FTR_SLB) - - /* - * We need to set the _PAGE_USER bit if MSR_PR is set or if we are - * accessing a userspace segment (even from the kernel). We assume - * kernel addresses always have the high bit set. 
- */ - rlwinm r4,r4,32-25+9,31-9,31-9 /* DSISR_STORE -> _PAGE_RW */ - rotldi r0,r3,15 /* Move high bit into MSR_PR posn */ - orc r0,r12,r0 /* MSR_PR | ~high_bit */ - rlwimi r4,r0,32-13,30,30 /* becomes _PAGE_USER access bit */ - ori r4,r4,1 /* add _PAGE_PRESENT */ - rlwimi r4,r5,22+2,31-2,31-2 /* Set _PAGE_EXEC if trap is 0x400 */ - /* - * On iSeries, we soft-disable interrupts here, then - * hard-enable interrupts so that the hash_page code can spin on - * the hash_table_lock without problems on a shared processor. - */ - DISABLE_INTS - - /* - * r3 contains the faulting address - * r4 contains the required access permissions - * r5 contains the trap number - * - * at return r3 = 0 for success - */ - bl .hash_page /* build HPTE if possible */ - cmpdi r3,0 /* see if hash_page succeeded */ +_GLOBAL(generic_secondary_thread_init) + mr r24,r3 -#ifdef DO_SOFT_DISABLE - /* - * If we had interrupts soft-enabled at the point where the - * DSI/ISI occurred, and an interrupt came in during hash_page, - * handle it now. - * We jump to ret_from_except_lite rather than fast_exception_return - * because ret_from_except_lite will check for and handle pending - * interrupts if necessary. - */ - beq .ret_from_except_lite - /* For a hash failure, we don't bother re-enabling interrupts */ - ble- 12f + /* turn on 64-bit mode */ + bl enable_64b_mode - /* - * hash_page couldn't handle it, set soft interrupt enable back - * to what it was before the trap. Note that .local_irq_restore - * handles any interrupts pending at this point. - */ - ld r3,SOFTE(r1) - bl .local_irq_restore - b 11f -#else - beq fast_exception_return /* Return from exception on success */ - ble- 12f /* Failure return from hash_page */ + /* get a valid TOC pointer, wherever we're mapped at */ + bl relative_toc + tovirt(r2,r2) - /* fall through */ +#ifdef CONFIG_PPC_BOOK3E + /* Book3E initialization */ + mr r3,r24 + bl book3e_secondary_thread_init #endif - -/* Here we have a page fault that hash_page can't handle. 
*/ -_GLOBAL(handle_page_fault) - ENABLE_INTS -11: ld r4,_DAR(r1) - ld r5,_DSISR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_page_fault - cmpdi r3,0 - beq+ .ret_from_except_lite - bl .save_nvgprs - mr r5,r3 - addi r3,r1,STACK_FRAME_OVERHEAD - lwz r4,_DAR(r1) - bl .bad_page_fault - b .ret_from_except - -/* We have a page fault that hash_page could handle but HV refused - * the PTE insertion - */ -12: bl .save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - lwz r4,_DAR(r1) - bl .low_hash_fault - b .ret_from_except - - /* here we have a segment miss */ -_GLOBAL(do_ste_alloc) - bl .ste_allocate /* try to insert stab entry */ - cmpdi r3,0 - beq+ fast_exception_return - b .handle_page_fault - -/* - * r13 points to the PACA, r9 contains the saved CR, - * r11 and r12 contain the saved SRR0 and SRR1. - * r9 - r13 are saved in paca->exslb. - * We assume we aren't going to take any exceptions during this procedure. - * We assume (DAR >> 60) == 0xc. - */ - .align 7 -_GLOBAL(do_stab_bolted) - stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ - std r11,PACA_EXSLB+EX_SRR0(r13) /* save SRR0 in exc. frame */ - - /* Hash to the primary group */ - ld r10,PACASTABVIRT(r13) - mfspr r11,SPRN_DAR - srdi r11,r11,28 - rldimi r10,r11,7,52 /* r10 = first ste of the group */ - - /* Calculate VSID */ - /* This is a kernel address, so protovsid = ESID */ - ASM_VSID_SCRAMBLE(r11, r9) - rldic r9,r11,12,16 /* r9 = vsid << 12 */ - - /* Search the primary group for a free entry */ -1: ld r11,0(r10) /* Test valid bit of the current ste */ - andi. r11,r11,0x80 - beq 2f - addi r10,r10,16 - andi. r11,r10,0x70 - bne 1b - - /* Stick for only searching the primary group for now. 
*/ - /* At least for now, we use a very simple random castout scheme */ - /* Use the TB as a random number ; OR in 1 to avoid entry 0 */ - mftb r11 - rldic r11,r11,4,57 /* r11 = (r11 << 4) & 0x70 */ - ori r11,r11,0x10 - - /* r10 currently points to an ste one past the group of interest */ - /* make it point to the randomly selected entry */ - subi r10,r10,128 - or r10,r10,r11 /* r10 is the entry to invalidate */ - - isync /* mark the entry invalid */ - ld r11,0(r10) - rldicl r11,r11,56,1 /* clear the valid bit */ - rotldi r11,r11,8 - std r11,0(r10) - sync - - clrrdi r11,r11,28 /* Get the esid part of the ste */ - slbie r11 - -2: std r9,8(r10) /* Store the vsid part of the ste */ - eieio - - mfspr r11,SPRN_DAR /* Get the new esid */ - clrrdi r11,r11,28 /* Permits a full 32b of ESID */ - ori r11,r11,0x90 /* Turn on valid and kp */ - std r11,0(r10) /* Put new entry back into the stab */ - - sync - - /* All done -- return from exception. */ - lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ - ld r11,PACA_EXSLB+EX_SRR0(r13) /* get saved SRR0 */ - - andi. r10,r12,MSR_RI - beq- unrecov_slb - - mtcrf 0x80,r9 /* restore CR */ - - mfmsr r10 - clrrdi r10,r10,2 - mtmsrd r10,1 - - mtspr SPRN_SRR0,r11 - mtspr SPRN_SRR1,r12 - ld r9,PACA_EXSLB+EX_R9(r13) - ld r10,PACA_EXSLB+EX_R10(r13) - ld r11,PACA_EXSLB+EX_R11(r13) - ld r12,PACA_EXSLB+EX_R12(r13) - ld r13,PACA_EXSLB+EX_R13(r13) - rfid - b . /* prevent speculative execution */ + b generic_secondary_common_init /* - * r13 points to the PACA, r9 contains the saved CR, - * r11 and r12 contain the saved SRR0 and SRR1. - * r3 has the faulting address - * r9 - r13 are saved in paca->exslb. - * r3 is saved in paca->slb_r3 - * We assume we aren't going to take any exceptions during this procedure. - */ -_GLOBAL(do_slb_miss) - mflr r10 - - stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ - std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ - - bl .slb_allocate /* handle it */ - - /* All done -- return from exception. 
*/ - - ld r10,PACA_EXSLB+EX_LR(r13) - ld r3,PACA_EXSLB+EX_R3(r13) - lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ -#ifdef CONFIG_PPC_ISERIES - ld r11,PACALPPACA+LPPACASRR0(r13) /* get SRR0 value */ -#endif /* CONFIG_PPC_ISERIES */ - - mtlr r10 - - andi. r10,r12,MSR_RI /* check for unrecoverable exception */ - beq- unrecov_slb - -.machine push -.machine "power4" - mtcrf 0x80,r9 - mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ -.machine pop - -#ifdef CONFIG_PPC_ISERIES - mtspr SPRN_SRR0,r11 - mtspr SPRN_SRR1,r12 -#endif /* CONFIG_PPC_ISERIES */ - ld r9,PACA_EXSLB+EX_R9(r13) - ld r10,PACA_EXSLB+EX_R10(r13) - ld r11,PACA_EXSLB+EX_R11(r13) - ld r12,PACA_EXSLB+EX_R12(r13) - ld r13,PACA_EXSLB+EX_R13(r13) - rfid - b . /* prevent speculative execution */ - -unrecov_slb: - EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB) - DISABLE_INTS - bl .save_nvgprs -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl .unrecoverable_exception - b 1b - -/* - * Space for CPU0's segment table. - * - * On iSeries, the hypervisor must fill in at least one entry before - * we get control (with relocate on). The address is give to the hv - * as a page number (see xLparMap in lpardata.c), so this must be at a - * fixed address (the linker can't compute (u64)&initial_stab >> - * PAGE_SHIFT). - */ - . = STAB0_PHYS_ADDR /* 0x6000 */ - .globl initial_stab -initial_stab: - .space 4096 - -/* - * Data area reserved for FWNMI option. - * This address (0x7000) is fixed by the RPA. - */ - .= 0x7000 - .globl fwnmi_data_area -fwnmi_data_area: - - /* iSeries does not use the FWNMI stuff, so it is safe to put - * this here, even if we later allow kernels that will boot on - * both pSeries and iSeries */ -#ifdef CONFIG_PPC_ISERIES - . = LPARMAP_PHYS -#include "lparmap.s" -/* - * This ".text" is here for old compilers that generate a trailing - * .note section when compiling .c files to .s - */ - .text -#endif /* CONFIG_PPC_ISERIES */ - - . = 0x8000 - -/* - * On pSeries, secondary processors spin in the following code. 
+ * On pSeries and most other platforms, secondary processors spin + * in the following code. * At entry, r3 = this processor's number (physical cpu id) + * + * On Book3E, r4 = 1 to indicate that the initial TLB entry for + * this core already exists (setup via some other mechanism such + * as SCOM before entry). */ -_GLOBAL(pSeries_secondary_smp_init) +_GLOBAL(generic_secondary_smp_init) + FIXUP_ENDIAN mr r24,r3 - + mr r25,r4 + /* turn on 64-bit mode */ - bl .enable_64b_mode - isync + bl enable_64b_mode - /* Copy some CPU settings from CPU 0 */ - bl .__restore_cpu_setup + /* get a valid TOC pointer, wherever we're mapped at */ + bl relative_toc + tovirt(r2,r2) +#ifdef CONFIG_PPC_BOOK3E + /* Book3E initialization */ + mr r3,r24 + mr r4,r25 + bl book3e_secondary_core_init +#endif + +generic_secondary_common_init: /* Set up a paca value for this processor. Since we have the * physical cpu id in r24, we need to search the pacas to find * which logical id maps to our physical one. */ - LOADADDR(r13, paca) /* Get base vaddr of paca array */ + LOAD_REG_ADDR(r13, paca) /* Load paca pointer */ + ld r13,0(r13) /* Get base vaddr of paca array */ +#ifndef CONFIG_SMP + addi r13,r13,PACA_SIZE /* know r13 if used accidentally */ + b kexec_wait /* wait for next kernel if !SMP */ +#else + LOAD_REG_ADDR(r7, nr_cpu_ids) /* Load nr_cpu_ids address */ + lwz r7,0(r7) /* also the max paca allocated */ li r5,0 /* logical cpu id */ 1: lhz r6,PACAHWCPUID(r13) /* Load HW procid from paca */ cmpw r6,r24 /* Compare to our id */ beq 2f addi r13,r13,PACA_SIZE /* Loop to next PACA on miss */ addi r5,r5,1 - cmpwi r5,NR_CPUS + cmpw r5,r7 /* Check if more pacas exist */ blt 1b mr r3,r24 /* not found, copy phys to r3 */ - b .kexec_wait /* next kernel might do better */ + b kexec_wait /* next kernel might do better */ + +2: SET_PACA(r13) +#ifdef CONFIG_PPC_BOOK3E + addi r12,r13,PACA_EXTLB /* and TLB exc frame in another */ + mtspr SPRN_SPRG_TLB_EXFRAME,r12 +#endif -2: mtspr SPRN_SPRG3,r13 /* Save 
vaddr of paca in SPRG3 */ /* From now on, r24 is expected to be logical cpuid */ mr r24,r5 -3: HMT_LOW + + /* See if we need to call a cpu state restore handler */ + LOAD_REG_ADDR(r23, cur_cpu_spec) + ld r23,0(r23) + ld r12,CPU_SPEC_RESTORE(r23) + cmpdi 0,r12,0 + beq 3f +#if !defined(_CALL_ELF) || _CALL_ELF != 2 + ld r12,0(r12) +#endif + mtctr r12 + bctrl + +3: LOAD_REG_ADDR(r3, spinning_secondaries) /* Decrement spinning_secondaries */ + lwarx r4,0,r3 + subi r4,r4,1 + stwcx. r4,0,r3 + bne 3b + isync + +4: HMT_LOW lbz r23,PACAPROCSTART(r13) /* Test if this processor should */ /* start. */ - sync + cmpwi 0,r23,0 + beq 4b /* Loop until told to go */ + + sync /* order paca.run and cur_cpu_spec */ + isync /* In case code patching happened */ /* Create a temp kernel stack for use before relocation is on. */ ld r1,PACAEMERGSP(r13) subi r1,r1,STACK_FRAME_OVERHEAD - cmpwi 0,r23,0 -#ifdef CONFIG_SMP - bne .__secondary_start -#endif - b 3b /* Loop until told to go */ - -#ifdef CONFIG_PPC_ISERIES -_STATIC(__start_initialization_iSeries) - /* Clear out the BSS */ - LOADADDR(r11,__bss_stop) - LOADADDR(r8,__bss_start) - sub r11,r11,r8 /* bss size */ - addi r11,r11,7 /* round up to an even double word */ - rldicl. r11,r11,61,3 /* shift right by 3 */ - beq 4f - addi r8,r8,-8 - li r0,0 - mtctr r11 /* zero this many doublewords */ -3: stdu r0,8(r8) - bdnz 3b -4: - LOADADDR(r1,init_thread_union) - addi r1,r1,THREAD_SIZE - li r0,0 - stdu r0,-STACK_FRAME_OVERHEAD(r1) - - LOADADDR(r3,cpu_specs) - LOADADDR(r4,cur_cpu_spec) - li r5,0 - bl .identify_cpu - - LOADADDR(r2,__toc_start) - addi r2,r2,0x4000 - addi r2,r2,0x4000 - - bl .iSeries_early_setup - bl .early_setup + b __secondary_start +#endif /* SMP */ - /* relocation is on at this point */ - - b .start_here_common -#endif /* CONFIG_PPC_ISERIES */ - -#ifdef CONFIG_PPC_MULTIPLATFORM - -_STATIC(__mmu_off) +/* + * Turn the MMU off. + * Assumes we're mapped EA == RA if the MMU is on. + */ +#ifdef CONFIG_PPC_BOOK3S +__mmu_off: mfmsr r3 andi. 
r0,r3,MSR_IR|MSR_DR beqlr + mflr r4 andc r3,r3,r0 mtspr SPRN_SRR0,r4 mtspr SPRN_SRR1,r3 sync rfid b . /* prevent speculative execution */ +#endif /* @@ -1342,33 +324,60 @@ _STATIC(__mmu_off) * DT block, r4 is a physical pointer to the kernel itself * */ -_GLOBAL(__start_initialization_multiplatform) +__start_initialization_multiplatform: + /* Make sure we are running in 64 bits mode */ + bl enable_64b_mode + + /* Get TOC pointer (current runtime address) */ + bl relative_toc + + /* find out where we are now */ + bcl 20,31,$+4 +0: mflr r26 /* r26 = runtime addr here */ + addis r26,r26,(_stext - 0b)@ha + addi r26,r26,(_stext - 0b)@l /* current runtime base addr */ + /* * Are we booted from a PROM Of-type client-interface ? */ cmpldi cr0,r5,0 - bne .__boot_from_prom /* yes -> prom */ - + beq 1f + b __boot_from_prom /* yes -> prom */ +1: /* Save parameters */ mr r31,r3 mr r30,r4 +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL + /* Save OPAL entry */ + mr r28,r8 + mr r29,r9 +#endif - /* Make sure we are running in 64 bits mode */ - bl .enable_64b_mode - +#ifdef CONFIG_PPC_BOOK3E + bl start_initialization_book3e + b __after_prom_start +#else /* Setup some critical 970 SPRs before switching MMU off */ - bl .__970_cpu_preinit - - /* cpu # */ - li r24,0 - - /* Switch off MMU if not already */ - LOADADDR(r4, .__after_prom_start - KERNELBASE) - add r4,r4,r30 - bl .__mmu_off - b .__after_prom_start - -_STATIC(__boot_from_prom) + mfspr r0,SPRN_PVR + srwi r0,r0,16 + cmpwi r0,0x39 /* 970 */ + beq 1f + cmpwi r0,0x3c /* 970FX */ + beq 1f + cmpwi r0,0x44 /* 970MP */ + beq 1f + cmpwi r0,0x45 /* 970GX */ + bne 2f +1: bl __cpu_preinit_ppc970 +2: + + /* Switch off MMU if not already off */ + bl __mmu_off + b __after_prom_start +#endif /* CONFIG_PPC_BOOK3E */ + +__boot_from_prom: +#ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE /* Save parameters */ mr r31,r3 mr r30,r4 @@ -1376,18 +385,18 @@ _STATIC(__boot_from_prom) mr r28,r6 mr r27,r7 - /* Make sure we are running in 64 bits mode */ - bl .enable_64b_mode - - 
/* put a relocation offset into r3 */ - bl .reloc_offset - - LOADADDR(r2,__toc_start) - addi r2,r2,0x4000 - addi r2,r2,0x4000 + /* + * Align the stack to 16-byte boundary + * Depending on the size and layout of the ELF sections in the initial + * boot binary, the stack pointer may be unaligned on PowerMac + */ + rldicr r1,r1,0,59 - /* Relocate the TOC from a virt addr to a real addr */ - add r2,r2,r3 +#ifdef CONFIG_RELOCATABLE + /* Relocate code for where we are now */ + mr r3,r26 + bl relocate +#endif /* Restore parameters */ mr r3,r31 @@ -1397,62 +406,81 @@ _STATIC(__boot_from_prom) mr r7,r27 /* Do all of the interaction with OF client interface */ - bl .prom_init - /* We never return */ + mr r8,r26 + bl prom_init +#endif /* #CONFIG_PPC_OF_BOOT_TRAMPOLINE */ + + /* We never return. We also hit that trap if trying to boot + * from OF while CONFIG_PPC_OF_BOOT_TRAMPOLINE isn't selected */ trap -/* - * At this point, r3 contains the physical address we are running at, - * returned by prom_init() - */ -_STATIC(__after_prom_start) +__after_prom_start: +#ifdef CONFIG_RELOCATABLE + /* process relocations for the final address of the kernel */ + lis r25,PAGE_OFFSET@highest /* compute virtual base of kernel */ + sldi r25,r25,32 + lwz r7,__run_at_load-_stext(r26) + cmplwi cr0,r7,1 /* flagged to stay where we are ? */ + bne 1f + add r25,r25,r26 +1: mr r3,r25 + bl relocate +#endif /* - * We need to run with __start at physical address 0. + * We need to run with _stext at physical address PHYSICAL_START. * This will leave some code in the first 256B of * real memory, which are reserved for software use. - * The remainder of the first page is loaded with the fixed - * interrupt vectors. The next two pages are filled with - * unknown exception placeholders. * * Note: This process overwrites the OF exception vectors. 
- * r26 == relocation offset - * r27 == KERNELBASE */ - bl .reloc_offset - mr r26,r3 - SET_REG_TO_CONST(r27,KERNELBASE) - li r3,0 /* target addr */ - - // XXX FIXME: Use phys returned by OF (r30) - add r4,r27,r26 /* source addr */ - /* current address of _start */ - /* i.e. where we are running */ - /* the source addr */ - - LOADADDR(r5,copy_to_here) /* # bytes of memory to copy */ - sub r5,r5,r27 - +#ifdef CONFIG_PPC_BOOK3E + tovirt(r3,r3) /* on booke, we already run at PAGE_OFFSET */ +#endif + mr. r4,r26 /* In some cases the loader may */ + beq 9f /* have already put us at zero */ li r6,0x100 /* Start offset, the first 0x100 */ /* bytes were copied earlier. */ +#ifdef CONFIG_PPC_BOOK3E + tovirt(r6,r6) /* on booke, we already run at PAGE_OFFSET */ +#endif + +#ifdef CONFIG_RELOCATABLE +/* + * Check if the kernel has to be running as relocatable kernel based on the + * variable __run_at_load, if it is set the kernel is treated as relocatable + * kernel, otherwise it will be moved to PHYSICAL_START + */ + lwz r7,__run_at_load-_stext(r26) + cmplwi cr0,r7,1 + bne 3f + + /* just copy interrupts */ + LOAD_REG_IMMEDIATE(r5, __end_interrupts - _stext) + b 5f +3: +#endif + lis r5,(copy_to_here - _stext)@ha + addi r5,r5,(copy_to_here - _stext)@l /* # bytes of memory to copy */ - bl .copy_and_flush /* copy the first n bytes */ + bl copy_and_flush /* copy the first n bytes */ /* this includes the code being */ /* executed here. 
*/ - - LOADADDR(r0, 4f) /* Jump to the copy of this code */ - mtctr r0 /* that we just made/relocated */ + addis r8,r3,(4f - _stext)@ha /* Jump to the copy of this code */ + addi r12,r8,(4f - _stext)@l /* that we just made */ + mtctr r12 bctr -4: LOADADDR(r5,klimit) - add r5,r5,r26 - ld r5,0(r5) /* get the value of klimit */ - sub r5,r5,r27 - bl .copy_and_flush /* copy the rest */ - b .start_here_multiplatform +.balign 8 +p_end: .llong _end - _stext + +4: /* Now copy the rest of the kernel up to _end */ + addis r5,r26,(p_end - _stext)@ha + ld r5,(p_end - _stext)@l(r5) /* get _end */ +5: bl copy_and_flush /* copy the rest */ -#endif /* CONFIG_PPC_MULTIPLATFORM */ +9: b start_here_multiplatform /* * Copy routine used to copy the kernel to start at physical address 0 @@ -1465,7 +493,7 @@ _STATIC(__after_prom_start) _GLOBAL(copy_and_flush) addi r5,r5,-8 addi r6,r6,-8 -4: li r0,16 /* Use the least common */ +4: li r0,8 /* Use the smallest common */ /* denominator cache line */ /* size. This results in */ /* extra cache line flushes */ @@ -1487,6 +515,7 @@ _GLOBAL(copy_and_flush) sync addi r5,r5,8 addi r6,r6,8 + isync blr .align 8 @@ -1515,11 +544,23 @@ __secondary_start_pmac_0: _GLOBAL(pmac_secondary_start) /* turn on 64-bit mode */ - bl .enable_64b_mode + bl enable_64b_mode + + li r0,0 + mfspr r3,SPRN_HID4 + rldimi r3,r0,40,23 /* clear bit 23 (rm_ci) */ + sync + mtspr SPRN_HID4,r3 isync + sync + slbia + + /* get TOC pointer (real address) */ + bl relative_toc + tovirt(r2,r2) /* Copy some CPU settings from CPU 0 */ - bl .__restore_cpu_setup + bl __restore_cpu_ppc970 /* pSeries do that early though I don't think we really need it */ mfmsr r3 @@ -1527,16 +568,25 @@ _GLOBAL(pmac_secondary_start) mtmsrd r3 /* RI on */ /* Set up a paca value for this processor. 
*/ - LOADADDR(r4, paca) /* Get base vaddr of paca array */ - mulli r13,r24,PACA_SIZE /* Calculate vaddr of right paca */ + LOAD_REG_ADDR(r4,paca) /* Load paca pointer */ + ld r4,0(r4) /* Get base vaddr of paca array */ + mulli r13,r24,PACA_SIZE /* Calculate vaddr of right paca */ add r13,r13,r4 /* for this processor. */ - mtspr SPRN_SPRG3,r13 /* Save vaddr of paca in SPRG3 */ + SET_PACA(r13) /* Save vaddr of paca in an SPRG*/ + + /* Mark interrupts soft and hard disabled (they might be enabled + * in the PACA when doing hotplug) + */ + li r0,0 + stb r0,PACASOFTIRQEN(r13) + li r0,PACA_IRQ_HARD_DIS + stb r0,PACAIRQHAPPENED(r13) /* Create a temp kernel stack for use before relocation is on. */ ld r1,PACAEMERGSP(r13) subi r1,r1,STACK_FRAME_OVERHEAD - b .__secondary_start + b __secondary_start #endif /* CONFIG_PPC_PMAC */ @@ -1548,125 +598,138 @@ _GLOBAL(pmac_secondary_start) * 1. Processor number * 2. Segment table pointer (virtual address) * On entry the following are set: - * r1 = stack pointer. vaddr for iSeries, raddr (temp stack) for pSeries - * r24 = cpu# (in Linux terms) - * r13 = paca virtual address - * SPRG3 = paca virtual address + * r1 = stack pointer (real addr of temp stack) + * r24 = cpu# (in Linux terms) + * r13 = paca virtual address + * SPRG_PACA = paca virtual address */ -_GLOBAL(__secondary_start) - - HMT_MEDIUM /* Set thread priority to MEDIUM */ + .section ".text"; + .align 2 ; - ld r2,PACATOC(r13) - li r6,0 - stb r6,PACAPROCENABLED(r13) - -#ifndef CONFIG_PPC_ISERIES - /* Initialize the page table pointer register. */ - LOADADDR(r6,_SDR1) - ld r6,0(r6) /* get the value of _SDR1 */ - mtspr SPRN_SDR1,r6 /* set the htab location */ -#endif - /* Initialize the first segment table (or SLB) entry */ - ld r3,PACASTABVIRT(r13) /* get addr of segment table */ - bl .stab_initialize + .globl __secondary_start +__secondary_start: + /* Set thread priority to MEDIUM */ + HMT_MEDIUM - /* Initialize the kernel stack. Just a repeat for iSeries. 
*/ - LOADADDR(r3,current_set) + /* Initialize the kernel stack */ + LOAD_REG_ADDR(r3, current_set) sldi r28,r24,3 /* get current_set[cpu#] */ - ldx r1,r3,r28 - addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD - std r1,PACAKSAVE(r13) + ldx r14,r3,r28 + addi r14,r14,THREAD_SIZE-STACK_FRAME_OVERHEAD + std r14,PACAKSAVE(r13) - ld r3,PACASTABREAL(r13) /* get raddr of segment table */ - ori r4,r3,1 /* turn on valid bit */ + /* Do early setup for that CPU (stab, slb, hash table pointer) */ + bl early_setup_secondary -#ifdef CONFIG_PPC_ISERIES - li r0,-1 /* hypervisor call */ - li r3,1 - sldi r3,r3,63 /* 0x8000000000000000 */ - ori r3,r3,4 /* 0x8000000000000004 */ - sc /* HvCall_setASR */ -#else - /* set the ASR */ - ld r3,systemcfg@got(r2) /* r3 = ptr to systemcfg */ - ld r3,0(r3) - lwz r3,PLATFORM(r3) /* r3 = platform flags */ - andi. r3,r3,PLATFORM_LPAR /* Test if bit 0 is set (LPAR bit) */ - beq 98f /* branch if result is 0 */ - mfspr r3,SPRN_PVR - srwi r3,r3,16 - cmpwi r3,0x37 /* SStar */ - beq 97f - cmpwi r3,0x36 /* IStar */ - beq 97f - cmpwi r3,0x34 /* Pulsar */ - bne 98f -97: li r3,H_SET_ASR /* hcall = H_SET_ASR */ - HVSC /* Invoking hcall */ - b 99f -98: /* !(rpa hypervisor) || !(star) */ - mtasr r4 /* set the stab location */ -99: -#endif + /* + * setup the new stack pointer, but *don't* use this until + * translation is on. + */ + mr r1, r14 + + /* Clear backchain so we get nice backtraces */ li r7,0 mtlr r7 + /* Mark interrupts soft and hard disabled (they might be enabled + * in the PACA when doing hotplug) + */ + stb r7,PACASOFTIRQEN(r13) + li r0,PACA_IRQ_HARD_DIS + stb r0,PACAIRQHAPPENED(r13) + /* enable MMU and jump to start_secondary */ - LOADADDR(r3,.start_secondary_prolog) - SET_REG_TO_CONST(r4, MSR_KERNEL) -#ifdef DO_SOFT_DISABLE - ori r4,r4,MSR_EE -#endif + LOAD_REG_ADDR(r3, start_secondary_prolog) + LOAD_REG_IMMEDIATE(r4, MSR_KERNEL) + mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - rfid + RFI b . 
/* prevent speculative execution */ /* * Running with relocation on at this point. All we want to do is - * zero the stack back-chain pointer before going into C code. + * zero the stack back-chain pointer and get the TOC virtual address + * before going into C code. + */ +start_secondary_prolog: + ld r2,PACATOC(r13) + li r3,0 + std r3,0(r1) /* Zero the stack frame pointer */ + bl start_secondary + b . +/* + * Reset stack pointer and call start_secondary + * to continue with online operation when woken up + * from cede in cpu offline. */ -_GLOBAL(start_secondary_prolog) +_GLOBAL(start_secondary_resume) + ld r1,PACAKSAVE(r13) /* Reload kernel stack pointer */ li r3,0 std r3,0(r1) /* Zero the stack frame pointer */ - bl .start_secondary + bl start_secondary + b . #endif /* * This subroutine clobbers r11 and r12 */ -_GLOBAL(enable_64b_mode) +enable_64b_mode: mfmsr r11 /* grab the current MSR */ - li r12,1 - rldicr r12,r12,MSR_SF_LG,(63-MSR_SF_LG) - or r11,r11,r12 - li r12,1 - rldicr r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG) +#ifdef CONFIG_PPC_BOOK3E + oris r11,r11,0x8000 /* CM bit set, we'll set ICM later */ + mtmsr r11 +#else /* CONFIG_PPC_BOOK3E */ + li r12,(MSR_64BIT | MSR_ISF)@highest + sldi r12,r12,48 or r11,r11,r12 mtmsrd r11 isync +#endif blr -#ifdef CONFIG_PPC_MULTIPLATFORM +/* + * This puts the TOC pointer into r2, offset by 0x8000 (as expected + * by the toolchain). It computes the correct value for wherever we + * are running at the moment, using position-independent code. + * + * Note: The compiler constructs pointers using offsets from the + * TOC in -mcmodel=medium mode. After we relocate to 0 but before + * the MMU is on we need our TOC to be a virtual address otherwise + * these pointers will be real addresses which may get stored and + * accessed later with the MMU on. We use tovirt() at the call + * sites to handle this. 
+ */ +_GLOBAL(relative_toc) + mflr r0 + bcl 20,31,$+4 +0: mflr r11 + ld r2,(p_toc - 0b)(r11) + add r2,r2,r11 + mtlr r0 + blr + +.balign 8 +p_toc: .llong __toc_start + 0x8000 - 0b + /* * This is where the main kernel code starts. */ -_STATIC(start_here_multiplatform) - /* get a new offset, now that the kernel has moved. */ - bl .reloc_offset - mr r26,r3 +start_here_multiplatform: + /* set up the TOC */ + bl relative_toc + tovirt(r2,r2) /* Clear out the BSS. It may have been done in prom_init, * already but that's irrelevant since prom_init will soon * be detached from the kernel completely. Besides, we need * to clear it now for kexec-style entry. */ - LOADADDR(r11,__bss_stop) - LOADADDR(r8,__bss_start) + LOAD_REG_ADDR(r11,__bss_stop) + LOAD_REG_ADDR(r8,__bss_start) sub r11,r11,r8 /* bss size */ addi r11,r11,7 /* round up to an even double word */ - rldicl. r11,r11,61,3 /* shift right by 3 */ + srdi. r11,r11,3 /* shift right by 3 */ beq 4f addi r8,r8,-8 li r0,0 @@ -1675,262 +738,78 @@ _STATIC(start_here_multiplatform) bdnz 3b 4: +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL + /* Setup OPAL entry */ + LOAD_REG_ADDR(r11, opal) + std r28,0(r11); + std r29,8(r11); +#endif + +#ifndef CONFIG_PPC_BOOK3E mfmsr r6 ori r6,r6,MSR_RI mtmsrd r6 /* RI on */ +#endif -#ifdef CONFIG_HMT - /* Start up the second thread on cpu 0 */ - mfspr r3,SPRN_PVR - srwi r3,r3,16 - cmpwi r3,0x34 /* Pulsar */ - beq 90f - cmpwi r3,0x36 /* Icestar */ - beq 90f - cmpwi r3,0x37 /* SStar */ - beq 90f - b 91f /* HMT not supported */ -90: li r3,0 - bl .hmt_start_secondary -91: +#ifdef CONFIG_RELOCATABLE + /* Save the physical address we're running at in kernstart_addr */ + LOAD_REG_ADDR(r4, kernstart_addr) + clrldi r0,r25,2 + std r0,0(r4) #endif - /* The following gets the stack and TOC set up with the regs */ + /* The following gets the stack set up with the regs */ /* pointing to the real addr of the kernel stack. This is */ /* all done to support the C function call below which sets */ /* up the htab. 
This is done because we have relocated the */ /* kernel but are still running in real mode. */ - LOADADDR(r3,init_thread_union) - add r3,r3,r26 + LOAD_REG_ADDR(r3,init_thread_union) - /* set up a stack pointer (physical address) */ + /* set up a stack pointer */ addi r1,r3,THREAD_SIZE li r0,0 stdu r0,-STACK_FRAME_OVERHEAD(r1) - /* set up the TOC (physical address) */ - LOADADDR(r2,__toc_start) - addi r2,r2,0x4000 - addi r2,r2,0x4000 - add r2,r2,r26 - - LOADADDR(r3,cpu_specs) - add r3,r3,r26 - LOADADDR(r4,cur_cpu_spec) - add r4,r4,r26 - mr r5,r26 - bl .identify_cpu - - /* Save some low level config HIDs of CPU0 to be copied to - * other CPUs later on, or used for suspend/resume - */ - bl .__save_cpu_setup - sync - - /* Setup a valid physical PACA pointer in SPRG3 for early_setup - * note that boot_cpuid can always be 0 nowadays since there is - * nowhere it can be initialized differently before we reach this - * code - */ - LOADADDR(r27, boot_cpuid) - add r27,r27,r26 - lwz r27,0(r27) - - LOADADDR(r24, paca) /* Get base vaddr of paca array */ - mulli r13,r27,PACA_SIZE /* Calculate vaddr of right paca */ - add r13,r13,r24 /* for this processor. */ - add r13,r13,r26 /* convert to physical addr */ - mtspr SPRN_SPRG3,r13 /* PPPBBB: Temp... -Peter */ - /* Do very early kernel initializations, including initial hash table, * stab and slb setup before we turn on relocation. */ /* Restore parameters passed from prom_init/kexec */ mr r3,r31 - bl .early_setup - - /* set the ASR */ - ld r3,PACASTABREAL(r13) - ori r4,r3,1 /* turn on valid bit */ - ld r3,systemcfg@got(r2) /* r3 = ptr to systemcfg */ - ld r3,0(r3) - lwz r3,PLATFORM(r3) /* r3 = platform flags */ - andi. 
r3,r3,PLATFORM_LPAR /* Test if bit 0 is set (LPAR bit) */ - beq 98f /* branch if result is 0 */ - mfspr r3,SPRN_PVR - srwi r3,r3,16 - cmpwi r3,0x37 /* SStar */ - beq 97f - cmpwi r3,0x36 /* IStar */ - beq 97f - cmpwi r3,0x34 /* Pulsar */ - bne 98f -97: li r3,H_SET_ASR /* hcall = H_SET_ASR */ - HVSC /* Invoking hcall */ - b 99f -98: /* !(rpa hypervisor) || !(star) */ - mtasr r4 /* set the stab location */ -99: - /* Set SDR1 (hash table pointer) */ - ld r3,systemcfg@got(r2) /* r3 = ptr to systemcfg */ - ld r3,0(r3) - lwz r3,PLATFORM(r3) /* r3 = platform flags */ - /* Test if bit 0 is set (LPAR bit) */ - andi. r3,r3,PLATFORM_LPAR - bne 98f /* branch if result is !0 */ - LOADADDR(r6,_SDR1) /* Only if NOT LPAR */ - add r6,r6,r26 - ld r6,0(r6) /* get the value of _SDR1 */ - mtspr SPRN_SDR1,r6 /* set the htab location */ -98: - LOADADDR(r3,.start_here_common) - SET_REG_TO_CONST(r4, MSR_KERNEL) + bl early_setup /* also sets r13 and SPRG_PACA */ + + LOAD_REG_ADDR(r3, start_here_common) + ld r4,PACAKMSR(r13) mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - rfid + RFI b . /* prevent speculative execution */ -#endif /* CONFIG_PPC_MULTIPLATFORM */ /* This is where all platforms converge execution */ -_STATIC(start_here_common) - /* relocation is on at this point */ - - /* The following code sets up the SP and TOC now that we are */ - /* running with translation enabled. */ - - LOADADDR(r3,init_thread_union) - - /* set up the stack */ - addi r1,r3,THREAD_SIZE - li r0,0 - stdu r0,-STACK_FRAME_OVERHEAD(r1) - - /* Apply the CPUs-specific fixups (nop out sections not relevant - * to this CPU - */ - li r3,0 - bl .do_cpu_ftr_fixups - LOADADDR(r26, boot_cpuid) - lwz r26,0(r26) - - LOADADDR(r24, paca) /* Get base vaddr of paca array */ - mulli r13,r26,PACA_SIZE /* Calculate vaddr of right paca */ - add r13,r13,r24 /* for this processor. 
*/ - mtspr SPRN_SPRG3,r13 - - /* ptr to current */ - LOADADDR(r4,init_task) - std r4,PACACURRENT(r13) - - /* Load the TOC */ - ld r2,PACATOC(r13) +start_here_common: + /* relocation is on at this point */ std r1,PACAKSAVE(r13) - bl .setup_system - - /* Load up the kernel context */ -5: -#ifdef DO_SOFT_DISABLE - li r5,0 - stb r5,PACAPROCENABLED(r13) /* Soft Disabled */ - mfmsr r5 - ori r5,r5,MSR_EE /* Hard Enabled */ - mtmsrd r5 -#endif + /* Load the TOC (virtual address) */ + ld r2,PACATOC(r13) - bl .start_kernel - -_GLOBAL(hmt_init) -#ifdef CONFIG_HMT - LOADADDR(r5, hmt_thread_data) - mfspr r7,SPRN_PVR - srwi r7,r7,16 - cmpwi r7,0x34 /* Pulsar */ - beq 90f - cmpwi r7,0x36 /* Icestar */ - beq 91f - cmpwi r7,0x37 /* SStar */ - beq 91f - b 101f -90: mfspr r6,SPRN_PIR - andi. r6,r6,0x1f - b 92f -91: mfspr r6,SPRN_PIR - andi. r6,r6,0x3ff -92: sldi r4,r24,3 - stwx r6,r5,r4 - bl .hmt_start_secondary - b 101f - -__hmt_secondary_hold: - LOADADDR(r5, hmt_thread_data) - clrldi r5,r5,4 - li r7,0 - mfspr r6,SPRN_PIR - mfspr r8,SPRN_PVR - srwi r8,r8,16 - cmpwi r8,0x34 - bne 93f - andi. r6,r6,0x1f - b 103f -93: andi. 
r6,r6,0x3f - -103: lwzx r8,r5,r7 - cmpw r8,r6 - beq 104f - addi r7,r7,8 - b 103b - -104: addi r7,r7,4 - lwzx r9,r5,r7 - mr r24,r9 -101: -#endif - mr r3,r24 - b .pSeries_secondary_smp_init - -#ifdef CONFIG_HMT -_GLOBAL(hmt_start_secondary) - LOADADDR(r4,__hmt_secondary_hold) - clrldi r4,r4,4 - mtspr SPRN_NIADORM, r4 - mfspr r4, SPRN_MSRDORM - li r5, -65 - and r4, r4, r5 - mtspr SPRN_MSRDORM, r4 - lis r4,0xffef - ori r4,r4,0x7403 - mtspr SPRN_TSC, r4 - li r4,0x1f4 - mtspr SPRN_TST, r4 - mfspr r4, SPRN_HID0 - ori r4, r4, 0x1 - mtspr SPRN_HID0, r4 - mfspr r4, SPRN_CTRLF - oris r4, r4, 0x40 - mtspr SPRN_CTRLT, r4 - blr -#endif + /* Do more system initializations in virtual mode */ + bl setup_system -#if defined(CONFIG_KEXEC) || defined(CONFIG_SMP) -_GLOBAL(smp_release_cpus) - /* All secondary cpus are spinning on a common - * spinloop, release them all now so they can start - * to spin on their individual paca spinloops. - * For non SMP kernels, the secondary cpus never - * get out of the common spinloop. - * XXX This does nothing useful on iSeries, secondaries are - * already waiting on their paca. + /* Mark interrupts soft and hard disabled (they might be enabled + * in the PACA when doing hotplug) */ - li r3,1 - LOADADDR(r5,__secondary_hold_spinloop) - std r3,0(r5) - sync - blr -#endif /* CONFIG_SMP */ + li r0,0 + stb r0,PACASOFTIRQEN(r13) + li r0,PACA_IRQ_HARD_DIS + stb r0,PACAIRQHAPPENED(r13) + /* Generic kernel entry */ + bl start_kernel + + /* Not reached */ + BUG_OPCODE /* * We put a few things here that have to be page-aligned. @@ -1946,12 +825,4 @@ empty_zero_page: .globl swapper_pg_dir swapper_pg_dir: - .space PAGE_SIZE - -/* - * This space gets a copy of optional info passed to us by the bootstrap - * Used to pass parameters into the kernel like root=/dev/sda1, etc. 
- */ - .globl cmd_line -cmd_line: - .space COMMAND_LINE_SIZE + .space PGD_TABLE_SIZE diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index bc6d1ac5523..7ee876d2adb 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -1,6 +1,4 @@ /* - * arch/ppc/kernel/except_8xx.S - * * PowerPC version * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) * Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP @@ -21,7 +19,7 @@ * */ -#include <linux/config.h> +#include <linux/init.h> #include <asm/processor.h> #include <asm/page.h> #include <asm/mmu.h> @@ -31,6 +29,7 @@ #include <asm/thread_info.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> +#include <asm/ptrace.h> /* Macro to make the code more readable. */ #ifdef CONFIG_8xx_CPU6 @@ -41,12 +40,9 @@ #else #define DO_8xx_CPU6(val, reg) #endif - .text - .globl _stext -_stext: - .text - .globl _start -_start: + __HEAD +_ENTRY(_stext); +_ENTRY(_start); /* MPC8xx * This port was done on an MBX board with an 860. Right now I only @@ -76,18 +72,11 @@ _start: * in the first level table, but that would require many changes to the * Linux page directory/table functions that I don't want to do right now. * - * I used to use SPRG2 for a temporary register in the TLB handler, but it - * has since been put to other uses. I now use a hack to save a register - * and the CCR at memory location 0.....Someday I'll fix this..... * -- Dan */ .globl __start __start: - mr r31,r3 /* save parameters */ - mr r30,r4 - mr r29,r5 - mr r28,r6 - mr r27,r7 + mr r31,r3 /* save device tree ptr */ /* We have to turn on the MMU right away so we get cache modes * set correctly. @@ -115,8 +104,8 @@ turn_on_mmu: * task's thread_struct. */ #define EXCEPTION_PROLOG \ - mtspr SPRN_SPRG0,r10; \ - mtspr SPRN_SPRG1,r11; \ + mtspr SPRN_SPRG_SCRATCH0,r10; \ + mtspr SPRN_SPRG_SCRATCH1,r11; \ mfcr r10; \ EXCEPTION_PROLOG_1; \ EXCEPTION_PROLOG_2 @@ -126,7 +115,7 @@ turn_on_mmu: andi. 
r11,r11,MSR_PR; \ tophys(r11,r1); /* use tophys(r1) if kernel */ \ beq 1f; \ - mfspr r11,SPRN_SPRG3; \ + mfspr r11,SPRN_SPRG_THREAD; \ lwz r11,THREAD_INFO-THREAD(r11); \ addi r11,r11,THREAD_SIZE; \ tophys(r11,r11); \ @@ -138,9 +127,9 @@ turn_on_mmu: stw r10,_CCR(r11); /* save registers */ \ stw r12,GPR12(r11); \ stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG0; \ + mfspr r10,SPRN_SPRG_SCRATCH0; \ stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG1; \ + mfspr r12,SPRN_SPRG_SCRATCH1; \ stw r12,GPR11(r11); \ mflr r10; \ stw r10,_LINK(r11); \ @@ -211,6 +200,8 @@ MachineCheck: EXCEPTION_PROLOG mfspr r4,SPRN_DAR stw r4,_DAR(r11) + li r5,0x00f0 + mtspr SPRN_DAR,r5 /* Tag DAR, to be used in DTLB Error */ mfspr r5,SPRN_DSISR stw r5,_DSISR(r11) addi r3,r1,STACK_FRAME_OVERHEAD @@ -227,7 +218,9 @@ DataAccess: stw r10,_DSISR(r11) mr r5,r10 mfspr r4,SPRN_DAR - EXC_XFER_EE_LITE(0x300, handle_page_fault) + li r10,0x00f0 + mtspr SPRN_DAR,r10 /* Tag DAR, to be used in DTLB Error */ + EXC_XFER_LITE(0x300, handle_page_fault) /* Instruction access exception. * This is "never generated" by the MPC8xx. 
We jump to it for other @@ -238,7 +231,7 @@ InstructionAccess: EXCEPTION_PROLOG mr r4,r12 mr r5,r9 - EXC_XFER_EE_LITE(0x400, handle_page_fault) + EXC_XFER_LITE(0x400, handle_page_fault) /* External interrupt */ EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE) @@ -249,6 +242,8 @@ Alignment: EXCEPTION_PROLOG mfspr r4,SPRN_DAR stw r4,_DAR(r11) + li r5,0x00f0 + mtspr SPRN_DAR,r5 /* Tag DAR, to be used in DTLB Error */ mfspr r5,SPRN_DSISR stw r5,_DSISR(r11) addi r3,r1,STACK_FRAME_OVERHEAD @@ -301,9 +296,20 @@ InstructionTLBMiss: DO_8xx_CPU6(0x3f80, r3) mtspr SPRN_M_TW, r10 /* Save a couple of working registers */ mfcr r10 +#ifdef CONFIG_8xx_CPU6 stw r10, 0(r0) stw r11, 4(r0) +#else + mtspr SPRN_DAR, r10 + mtspr SPRN_SPRG2, r11 +#endif mfspr r10, SPRN_SRR0 /* Get effective address of fault */ +#ifdef CONFIG_8xx_CPU15 + addi r11, r10, 0x1000 + tlbie r11 + addi r11, r10, -0x1000 + tlbie r11 +#endif DO_8xx_CPU6(0x3780, r3) mtspr SPRN_MD_EPN, r10 /* Have to use MD_EPN for walk, MI_EPN can't */ mfspr r10, SPRN_M_TWB /* Get level 1 table entry address */ @@ -311,12 +317,16 @@ InstructionTLBMiss: /* If we are faulting a kernel address, we have to use the * kernel page tables. */ +#ifdef CONFIG_MODULES + /* Only modules will cause ITLB Misses as we always + * pin the first 8MB of kernel memory */ andi. r11, r10, 0x0800 /* Address >= 0x80000000 */ beq 3f lis r11, swapper_pg_dir@h ori r11, r11, swapper_pg_dir@l rlwimi r10, r11, 0, 2, 19 3: +#endif lwz r11, 0(r10) /* Get the level 1 entry */ rlwinm. r10, r11,0,0,19 /* Extract page descriptor page address */ beq 2f /* If zero, don't try to find a pte */ @@ -332,28 +342,59 @@ InstructionTLBMiss: mfspr r11, SPRN_MD_TWC /* ....and get the pte address */ lwz r10, 0(r11) /* Get the pte */ - ori r10, r10, _PAGE_ACCESSED - stw r10, 0(r11) - +#ifdef CONFIG_SWAP + andi. r11, r10, _PAGE_ACCESSED | _PAGE_PRESENT + cmpwi cr0, r11, _PAGE_ACCESSED | _PAGE_PRESENT + bne- cr0, 2f +#endif /* The Linux PTE won't go exactly into the MMU TLB. 
- * Software indicator bits 21, 22 and 28 must be clear. + * Software indicator bits 21 and 28 must be clear. * Software indicator bits 24, 25, 26, and 27 must be * set. All other Linux PTE bits control the behavior * of the MMU. */ -2: li r11, 0x00f0 - rlwimi r10, r11, 0, 24, 28 /* Set 24-27, clear 28 */ + li r11, 0x00f0 + rlwimi r10, r11, 0, 0x07f8 /* Set 24-27, clear 21-23,28 */ DO_8xx_CPU6(0x2d80, r3) mtspr SPRN_MI_RPN, r10 /* Update TLB entry */ - mfspr r10, SPRN_M_TW /* Restore registers */ + /* Restore registers */ +#ifndef CONFIG_8xx_CPU6 + mfspr r10, SPRN_DAR + mtcr r10 + mtspr SPRN_DAR, r11 /* Tag DAR */ + mfspr r11, SPRN_SPRG2 +#else lwz r11, 0(r0) mtcr r11 lwz r11, 4(r0) -#ifdef CONFIG_8xx_CPU6 lwz r3, 8(r0) #endif + mfspr r10, SPRN_M_TW rfi +2: + mfspr r11, SPRN_SRR1 + /* clear all error bits as TLB Miss + * sets a few unconditionally + */ + rlwinm r11, r11, 0, 0xffff + mtspr SPRN_SRR1, r11 + + /* Restore registers */ +#ifndef CONFIG_8xx_CPU6 + mfspr r10, SPRN_DAR + mtcr r10 + li r11, 0x00f0 + mtspr SPRN_DAR, r11 /* Tag DAR */ + mfspr r11, SPRN_SPRG2 +#else + lwz r11, 0(r0) + mtcr r11 + lwz r11, 4(r0) + lwz r3, 8(r0) +#endif + mfspr r10, SPRN_M_TW + b InstructionAccess . = 0x1200 DataStoreTLBMiss: @@ -363,8 +404,13 @@ DataStoreTLBMiss: DO_8xx_CPU6(0x3f80, r3) mtspr SPRN_M_TW, r10 /* Save a couple of working registers */ mfcr r10 +#ifdef CONFIG_8xx_CPU6 stw r10, 0(r0) stw r11, 4(r0) +#else + mtspr SPRN_DAR, r10 + mtspr SPRN_SPRG2, r11 +#endif mfspr r10, SPRN_M_TWB /* Get level 1 table entry address */ /* If we are faulting a kernel address, we have to use the @@ -395,15 +441,38 @@ DataStoreTLBMiss: * above. */ rlwimi r11, r10, 0, 27, 27 + /* Insert the WriteThru flag into the TWC from the Linux PTE. 
+ * It is bit 25 in the Linux PTE and bit 30 in the TWC + */ + rlwimi r11, r10, 32-5, 30, 30 DO_8xx_CPU6(0x3b80, r3) mtspr SPRN_MD_TWC, r11 - mfspr r11, SPRN_MD_TWC /* get the pte address again */ - ori r10, r10, _PAGE_ACCESSED - stw r10, 0(r11) + /* Both _PAGE_ACCESSED and _PAGE_PRESENT have to be set. + * We also need to know if the insn is a load/store, so: + * Clear _PAGE_PRESENT and load that which will + * trap into DTLB Error with store bit set accordingly. + */ + /* PRESENT=0x1, ACCESSED=0x20 + * r11 = ((r10 & PRESENT) & ((r10 & ACCESSED) >> 5)); + * r10 = (r10 & ~PRESENT) | r11; + */ +#ifdef CONFIG_SWAP + rlwinm r11, r10, 32-5, _PAGE_PRESENT + and r11, r11, r10 + rlwimi r10, r11, 0, _PAGE_PRESENT +#endif + /* Honour kernel RO, User NA */ + /* 0x200 == Extended encoding, bit 22 */ + rlwimi r10, r10, 32-2, 0x200 /* Copy USER to bit 22, 0x200 */ + /* r11 = (r10 & _PAGE_RW) >> 1 */ + rlwinm r11, r10, 32-1, 0x200 + or r10, r11, r10 + /* invert RW and 0x200 bits */ + xori r10, r10, _PAGE_RW | 0x200 /* The Linux PTE won't go exactly into the MMU TLB. - * Software indicator bits 21, 22 and 28 must be clear. + * Software indicator bits 22 and 28 must be clear. * Software indicator bits 24, 25, 26, and 27 must be * set. All other Linux PTE bits control the behavior * of the MMU. @@ -413,13 +482,20 @@ DataStoreTLBMiss: DO_8xx_CPU6(0x3d80, r3) mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ - mfspr r10, SPRN_M_TW /* Restore registers */ + /* Restore registers */ +#ifndef CONFIG_8xx_CPU6 + mfspr r10, SPRN_DAR + mtcr r10 + mtspr SPRN_DAR, r11 /* Tag DAR */ + mfspr r11, SPRN_SPRG2 +#else + mtspr SPRN_DAR, r11 /* Tag DAR */ lwz r11, 0(r0) mtcr r11 lwz r11, 4(r0) -#ifdef CONFIG_8xx_CPU6 lwz r3, 8(r0) #endif + mfspr r10, SPRN_M_TW rfi /* This is an instruction TLB error on the MPC8xx. This could be due @@ -449,88 +525,10 @@ DataTLBError: stw r10, 0(r0) stw r11, 4(r0) - /* First, make sure this was a store operation. - */ - mfspr r10, SPRN_DSISR - andis.
r11, r10, 0x0200 /* If set, indicates store op */ - beq 2f - - /* The EA of a data TLB miss is automatically stored in the MD_EPN - * register. The EA of a data TLB error is automatically stored in - * the DAR, but not the MD_EPN register. We must copy the 20 most - * significant bits of the EA from the DAR to MD_EPN before we - * start walking the page tables. We also need to copy the CASID - * value from the M_CASID register. - * Addendum: The EA of a data TLB error is _supposed_ to be stored - * in DAR, but it seems that this doesn't happen in some cases, such - * as when the error is due to a dcbi instruction to a page with a - * TLB that doesn't have the changed bit set. In such cases, there - * does not appear to be any way to recover the EA of the error - * since it is neither in DAR nor MD_EPN. As a workaround, the - * _PAGE_HWWRITE bit is set for all kernel data pages when the PTEs - * are initialized in mapin_ram(). This will avoid the problem, - * assuming we only use the dcbi instruction on kernel addresses. - */ mfspr r10, SPRN_DAR - rlwinm r11, r10, 0, 0, 19 - ori r11, r11, MD_EVALID - mfspr r10, SPRN_M_CASID - rlwimi r11, r10, 0, 28, 31 - DO_8xx_CPU6(0x3780, r3) - mtspr SPRN_MD_EPN, r11 - - mfspr r10, SPRN_M_TWB /* Get level 1 table entry address */ - - /* If we are faulting a kernel address, we have to use the - * kernel page tables. - */ - andi. r11, r10, 0x0800 - beq 3f - lis r11, swapper_pg_dir@h - ori r11, r11, swapper_pg_dir@l - rlwimi r10, r11, 0, 2, 19 -3: - lwz r11, 0(r10) /* Get the level 1 entry */ - rlwinm. r10, r11,0,0,19 /* Extract page descriptor page address */ - beq 2f /* If zero, bail */ - - /* We have a pte table, so fetch the pte from the table. - */ - ori r11, r11, 1 /* Set valid bit in physical L2 page */ - DO_8xx_CPU6(0x3b80, r3) - mtspr SPRN_MD_TWC, r11 /* Load pte table base address */ - mfspr r11, SPRN_MD_TWC /* ....and get the pte address */ - lwz r10, 0(r11) /* Get the pte */ - - andi. 
r11, r10, _PAGE_RW /* Is it writeable? */ - beq 2f /* Bail out if not */ - - /* Update 'changed', among others. - */ - ori r10, r10, _PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_HWWRITE - mfspr r11, SPRN_MD_TWC /* Get pte address again */ - stw r10, 0(r11) /* and update pte in table */ - - /* The Linux PTE won't go exactly into the MMU TLB. - * Software indicator bits 21, 22 and 28 must be clear. - * Software indicator bits 24, 25, 26, and 27 must be - * set. All other Linux PTE bits control the behavior - * of the MMU. - */ - li r11, 0x00f0 - rlwimi r10, r11, 0, 24, 28 /* Set 24-27, clear 28 */ - DO_8xx_CPU6(0x3d80, r3) - mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ - - mfspr r10, SPRN_M_TW /* Restore registers */ - lwz r11, 0(r0) - mtcr r11 - lwz r11, 4(r0) -#ifdef CONFIG_8xx_CPU6 - lwz r3, 8(r0) -#endif - rfi -2: + cmpwi cr0, r10, 0x00f0 + beq- FixupDAR /* must be a buggy dcbX, icbi insn. */ +DARFixed:/* Return from dcbx instruction bug workaround, r10 holds value of DAR */ mfspr r10, SPRN_M_TW /* Restore registers */ lwz r11, 0(r0) mtcr r11 @@ -559,9 +557,139 @@ DataTLBError: . = 0x2000 - .globl giveup_fpu -giveup_fpu: - blr +/* This is the procedure to calculate the data EA for buggy dcbx,dcbi instructions + * by decoding the registers used by the dcbx instruction and adding them. + * DAR is set to the calculated address and r10 also holds the EA on exit. + */ + /* define if you don't want to use self modifying code */ +#define NO_SELF_MODIFYING_CODE +FixupDAR:/* Entry point for dcbx workaround. */ + /* fetch instruction from memory. */ + mfspr r10, SPRN_SRR0 + andis. 
r11, r10, 0x8000 /* Address >= 0x80000000 */ + DO_8xx_CPU6(0x3780, r3) + mtspr SPRN_MD_EPN, r10 + mfspr r11, SPRN_M_TWB /* Get level 1 table entry address */ + beq- 3f /* Branch if user space */ + lis r11, (swapper_pg_dir-PAGE_OFFSET)@h + ori r11, r11, (swapper_pg_dir-PAGE_OFFSET)@l + rlwimi r11, r10, 32-20, 0xffc /* r11 = r11&~0xffc|(r10>>20)&0xffc */ +3: lwz r11, 0(r11) /* Get the level 1 entry */ + DO_8xx_CPU6(0x3b80, r3) + mtspr SPRN_MD_TWC, r11 /* Load pte table base address */ + mfspr r11, SPRN_MD_TWC /* ....and get the pte address */ + lwz r11, 0(r11) /* Get the pte */ + /* concat physical page address(r11) and page offset(r10) */ + rlwimi r11, r10, 0, 20, 31 + lwz r11,0(r11) +/* Check if it really is a dcbx instruction. */ +/* dcbt and dcbtst does not generate DTLB Misses/Errors, + * no need to include them here */ + srwi r10, r11, 26 /* check if major OP code is 31 */ + cmpwi cr0, r10, 31 + bne- 141f + rlwinm r10, r11, 0, 21, 30 + cmpwi cr0, r10, 2028 /* Is dcbz? */ + beq+ 142f + cmpwi cr0, r10, 940 /* Is dcbi? */ + beq+ 142f + cmpwi cr0, r10, 108 /* Is dcbst? */ + beq+ 144f /* Fix up store bit! */ + cmpwi cr0, r10, 172 /* Is dcbf? */ + beq+ 142f + cmpwi cr0, r10, 1964 /* Is icbi? */ + beq+ 142f +141: mfspr r10, SPRN_DAR /* r10 must hold DAR at exit */ + b DARFixed /* Nope, go back to normal TLB processing */ + +144: mfspr r10, SPRN_DSISR + rlwinm r10, r10,0,7,5 /* Clear store bit for buggy dcbst insn */ + mtspr SPRN_DSISR, r10 +142: /* continue, it was a dcbx, dcbi instruction. */ +#ifdef CONFIG_8xx_CPU6 + lwz r3, 8(r0) /* restore r3 from memory */ +#endif +#ifndef NO_SELF_MODIFYING_CODE + andis. r10,r11,0x1f /* test if reg RA is r0 */ + li r10,modified_instr@l + dcbtst r0,r10 /* touch for store */ + rlwinm r11,r11,0,0,20 /* Zero lower 10 bits */ + oris r11,r11,640 /* Transform instr. to a "add r10,RA,RB" */ + ori r11,r11,532 + stw r11,0(r10) /* store add/and instruction */ + dcbf 0,r10 /* flush new instr. to memory. */ + icbi 0,r10 /* invalidate instr. 
cache line */ + lwz r11, 4(r0) /* restore r11 from memory */ + mfspr r10, SPRN_M_TW /* restore r10 from M_TW */ + isync /* Wait until new instr is loaded from memory */ +modified_instr: + .space 4 /* this is where the add instr. is stored */ + bne+ 143f + subf r10,r0,r10 /* r10=r10-r0, only if reg RA is r0 */ +143: mtdar r10 /* store faulting EA in DAR */ + b DARFixed /* Go back to normal TLB handling */ +#else + mfctr r10 + mtdar r10 /* save ctr reg in DAR */ + rlwinm r10, r11, 24, 24, 28 /* offset into jump table for reg RB */ + addi r10, r10, 150f@l /* add start of table */ + mtctr r10 /* load ctr with jump address */ + xor r10, r10, r10 /* sum starts at zero */ + bctr /* jump into table */ +150: + add r10, r10, r0 ;b 151f + add r10, r10, r1 ;b 151f + add r10, r10, r2 ;b 151f + add r10, r10, r3 ;b 151f + add r10, r10, r4 ;b 151f + add r10, r10, r5 ;b 151f + add r10, r10, r6 ;b 151f + add r10, r10, r7 ;b 151f + add r10, r10, r8 ;b 151f + add r10, r10, r9 ;b 151f + mtctr r11 ;b 154f /* r10 needs special handling */ + mtctr r11 ;b 153f /* r11 needs special handling */ + add r10, r10, r12 ;b 151f + add r10, r10, r13 ;b 151f + add r10, r10, r14 ;b 151f + add r10, r10, r15 ;b 151f + add r10, r10, r16 ;b 151f + add r10, r10, r17 ;b 151f + add r10, r10, r18 ;b 151f + add r10, r10, r19 ;b 151f + add r10, r10, r20 ;b 151f + add r10, r10, r21 ;b 151f + add r10, r10, r22 ;b 151f + add r10, r10, r23 ;b 151f + add r10, r10, r24 ;b 151f + add r10, r10, r25 ;b 151f + add r10, r10, r26 ;b 151f + add r10, r10, r27 ;b 151f + add r10, r10, r28 ;b 151f + add r10, r10, r29 ;b 151f + add r10, r10, r30 ;b 151f + add r10, r10, r31 +151: + rlwinm. 
r11,r11,19,24,28 /* offset into jump table for reg RA */ + beq 152f /* if reg RA is zero, don't add it */ + addi r11, r11, 150b@l /* add start of table */ + mtctr r11 /* load ctr with jump address */ + rlwinm r11,r11,0,16,10 /* make sure we don't execute this more than once */ + bctr /* jump into table */ +152: + mfdar r11 + mtctr r11 /* restore ctr reg from DAR */ + mtdar r10 /* save fault EA to DAR */ + b DARFixed /* Go back to normal TLB handling */ + + /* special handling for r10,r11 since these are modified already */ +153: lwz r11, 4(r0) /* load r11 from memory */ + b 155f +154: mfspr r11, SPRN_M_TW /* load r10 from M_TW */ +155: add r10, r10, r11 /* add it */ + mfctr r11 /* restore r11 */ + b 151b +#endif /* * This is where the main kernel code starts. @@ -574,9 +702,7 @@ start_here: /* ptr to phys current thread */ tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ - mtspr SPRN_SPRG3,r4 - li r3,0 - mtspr SPRN_SPRG2,r3 /* 0 => r1 has kernel sp */ + mtspr SPRN_SPRG_THREAD,r4 /* stack */ lis r1,init_thread_union@ha @@ -589,11 +715,8 @@ start_here: /* * Decide what sort of machine this is and initialize the MMU. 
*/ - mr r3,r31 - mr r4,r30 - mr r5,r29 - mr r6,r28 - mr r7,r27 + li r3,0 + mr r4,r31 bl machine_init bl MMU_init @@ -659,12 +782,12 @@ start_here: */ initial_mmu: tlbia /* Invalidate all TLB entries */ -#ifdef CONFIG_PIN_TLB +/* Always pin the first 8 MB ITLB to prevent ITLB + misses while mucking around with SRR0/SRR1 in asm +*/ lis r8, MI_RSV4I@h ori r8, r8, 0x1c00 -#else - li r8, 0 -#endif + mtspr SPRN_MI_CTR, r8 /* Set instruction MMU control */ #ifdef CONFIG_PIN_TLB @@ -733,13 +856,16 @@ initial_mmu: mtspr SPRN_MD_TWC, r9 li r11, MI_BOOTINIT /* Create RPN for address 0 */ addis r11, r11, 0x0080 /* Add 8M */ - mtspr SPRN_MD_RPN, r8 + mtspr SPRN_MD_RPN, r11 + + addi r10, r10, 0x0100 + mtspr SPRN_MD_CTR, r10 addis r8, r8, 0x0080 /* Add 8M */ mtspr SPRN_MD_EPN, r8 mtspr SPRN_MD_TWC, r9 addis r11, r11, 0x0080 /* Add 8M */ - mtspr SPRN_MD_RPN, r8 + mtspr SPRN_MD_RPN, r11 #endif /* Since the cache is enabled according to the information we @@ -838,14 +964,6 @@ empty_zero_page: swapper_pg_dir: .space 4096 -/* - * This space gets a copy of optional info passed to us by the bootstrap - * Used to pass parameters into the kernel like root=/dev/sda1, etc. - */ - .globl cmd_line -cmd_line: - .space 512 - /* Room for two PTE table pointers, usually the kernel and current user * pointer to their respective root page table (pgdir).
*/ diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h new file mode 100644 index 00000000000..a620203f7de --- /dev/null +++ b/arch/powerpc/kernel/head_booke.h @@ -0,0 +1,445 @@ +#ifndef __HEAD_BOOKE_H__ +#define __HEAD_BOOKE_H__ + +#include <asm/ptrace.h> /* for STACK_FRAME_REGS_MARKER */ +#include <asm/kvm_asm.h> +#include <asm/kvm_booke_hv_asm.h> + +/* + * Macros used for common Book-e exception handling + */ + +#define SET_IVOR(vector_number, vector_label) \ + li r26,vector_label@l; \ + mtspr SPRN_IVOR##vector_number,r26; \ + sync + +#if (THREAD_SHIFT < 15) +#define ALLOC_STACK_FRAME(reg, val) \ + addi reg,reg,val +#else +#define ALLOC_STACK_FRAME(reg, val) \ + addis reg,reg,val@ha; \ + addi reg,reg,val@l +#endif + +/* + * Macro used to get to thread save registers. + * Note that entries 0-3 are used for the prolog code, and the remaining + * entries are available for specific exception use in the event a handler + * requires more than 4 scratch registers. + */ +#define THREAD_NORMSAVE(offset) (THREAD_NORMSAVES + (offset * 4)) + +#define NORMAL_EXCEPTION_PROLOG(intno) \ + mtspr SPRN_SPRG_WSCRATCH0, r10; /* save one register */ \ + mfspr r10, SPRN_SPRG_THREAD; \ + stw r11, THREAD_NORMSAVE(0)(r10); \ + stw r13, THREAD_NORMSAVE(2)(r10); \ + mfcr r13; /* save CR in r13 for now */\ + mfspr r11, SPRN_SRR1; \ + DO_KVM BOOKE_INTERRUPT_##intno SPRN_SRR1; \ + andi. 
r11, r11, MSR_PR; /* check whether user or kernel */\ + mr r11, r1; \ + beq 1f; \ + /* if from user, start at top of this thread's kernel stack */ \ + lwz r11, THREAD_INFO-THREAD(r10); \ + ALLOC_STACK_FRAME(r11, THREAD_SIZE); \ +1 : subi r11, r11, INT_FRAME_SIZE; /* Allocate exception frame */ \ + stw r13, _CCR(r11); /* save various registers */ \ + stw r12,GPR12(r11); \ + stw r9,GPR9(r11); \ + mfspr r13, SPRN_SPRG_RSCRATCH0; \ + stw r13, GPR10(r11); \ + lwz r12, THREAD_NORMSAVE(0)(r10); \ + stw r12,GPR11(r11); \ + lwz r13, THREAD_NORMSAVE(2)(r10); /* restore r13 */ \ + mflr r10; \ + stw r10,_LINK(r11); \ + mfspr r12,SPRN_SRR0; \ + stw r1, GPR1(r11); \ + mfspr r9,SPRN_SRR1; \ + stw r1, 0(r11); \ + mr r1, r11; \ + rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\ + stw r0,GPR0(r11); \ + lis r10, STACK_FRAME_REGS_MARKER@ha;/* exception frame marker */ \ + addi r10, r10, STACK_FRAME_REGS_MARKER@l; \ + stw r10, 8(r11); \ + SAVE_4GPRS(3, r11); \ + SAVE_2GPRS(7, r11) + +/* To handle the additional exception priority levels on 40x and Book-E + * processors we allocate a stack per additional priority level. + * + * On 40x critical is the only additional level + * On 44x/e500 we have critical and machine check + * On e200 we have critical and debug (machine check occurs via critical) + * + * Additionally we reserve a SPRG for each priority level so we can free up a + * GPR to use as the base for indirect access to the exception stacks. This + * is necessary since the MMU is always on, for Book-E parts, and the stacks + * are offset from KERNELBASE. + * + * There is some space optimization to be had here if desired. However + * to allow for a common kernel with support for debug exceptions either + * going to critical or their own debug level we aren't currently + * providing configurations that micro-optimize space usage. 
+ */ + +#define MC_STACK_BASE mcheckirq_ctx +#define CRIT_STACK_BASE critirq_ctx + +/* only on e500mc/e200 */ +#define DBG_STACK_BASE dbgirq_ctx + +#define EXC_LVL_FRAME_OVERHEAD (THREAD_SIZE - INT_FRAME_SIZE - EXC_LVL_SIZE) + +#ifdef CONFIG_SMP +#define BOOKE_LOAD_EXC_LEVEL_STACK(level) \ + mfspr r8,SPRN_PIR; \ + slwi r8,r8,2; \ + addis r8,r8,level##_STACK_BASE@ha; \ + lwz r8,level##_STACK_BASE@l(r8); \ + addi r8,r8,EXC_LVL_FRAME_OVERHEAD; +#else +#define BOOKE_LOAD_EXC_LEVEL_STACK(level) \ + lis r8,level##_STACK_BASE@ha; \ + lwz r8,level##_STACK_BASE@l(r8); \ + addi r8,r8,EXC_LVL_FRAME_OVERHEAD; +#endif + +/* + * Exception prolog for critical/machine check exceptions. This is a + * little different from the normal exception prolog above since a + * critical/machine check exception can potentially occur at any point + * during normal exception processing. Thus we cannot use the same SPRG + * registers as the normal prolog above. Instead we use a portion of the + * critical/machine check exception stack at low physical addresses. + */ +#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, intno, exc_level_srr0, exc_level_srr1) \ + mtspr SPRN_SPRG_WSCRATCH_##exc_level,r8; \ + BOOKE_LOAD_EXC_LEVEL_STACK(exc_level);/* r8 points to the exc_level stack*/ \ + stw r9,GPR9(r8); /* save various registers */\ + mfcr r9; /* save CR in r9 for now */\ + stw r10,GPR10(r8); \ + stw r11,GPR11(r8); \ + stw r9,_CCR(r8); /* save CR on stack */\ + mfspr r11,exc_level_srr1; /* check whether user or kernel */\ + DO_KVM BOOKE_INTERRUPT_##intno exc_level_srr1; \ + andi. 
r11,r11,MSR_PR; \ + mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ + lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ + addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\ + beq 1f; \ + /* COMING FROM USER MODE */ \ + stw r9,_CCR(r11); /* save CR */\ + lwz r10,GPR10(r8); /* copy regs from exception stack */\ + lwz r9,GPR9(r8); \ + stw r10,GPR10(r11); \ + lwz r10,GPR11(r8); \ + stw r9,GPR9(r11); \ + stw r10,GPR11(r11); \ + b 2f; \ + /* COMING FROM PRIV MODE */ \ +1: lwz r9,TI_FLAGS-EXC_LVL_FRAME_OVERHEAD(r11); \ + lwz r10,TI_PREEMPT-EXC_LVL_FRAME_OVERHEAD(r11); \ + stw r9,TI_FLAGS-EXC_LVL_FRAME_OVERHEAD(r8); \ + stw r10,TI_PREEMPT-EXC_LVL_FRAME_OVERHEAD(r8); \ + lwz r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r11); \ + stw r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r8); \ + mr r11,r8; \ +2: mfspr r8,SPRN_SPRG_RSCRATCH_##exc_level; \ + stw r12,GPR12(r11); /* save various registers */\ + mflr r10; \ + stw r10,_LINK(r11); \ + mfspr r12,SPRN_DEAR; /* save DEAR and ESR in the frame */\ + stw r12,_DEAR(r11); /* since they may have had stuff */\ + mfspr r9,SPRN_ESR; /* in them at the point where the */\ + stw r9,_ESR(r11); /* exception was taken */\ + mfspr r12,exc_level_srr0; \ + stw r1,GPR1(r11); \ + mfspr r9,exc_level_srr1; \ + stw r1,0(r11); \ + mr r1,r11; \ + rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\ + stw r0,GPR0(r11); \ + SAVE_4GPRS(3, r11); \ + SAVE_2GPRS(7, r11) + +#define CRITICAL_EXCEPTION_PROLOG(intno) \ + EXC_LEVEL_EXCEPTION_PROLOG(CRIT, intno, SPRN_CSRR0, SPRN_CSRR1) +#define DEBUG_EXCEPTION_PROLOG \ + EXC_LEVEL_EXCEPTION_PROLOG(DBG, DEBUG, SPRN_DSRR0, SPRN_DSRR1) +#define MCHECK_EXCEPTION_PROLOG \ + EXC_LEVEL_EXCEPTION_PROLOG(MC, MACHINE_CHECK, \ + SPRN_MCSRR0, SPRN_MCSRR1) + +/* + * Guest Doorbell -- this is a bit odd in that uses GSRR0/1 despite + * being delivered to the host. 
This exception can only happen + * inside a KVM guest -- so we just handle up to the DO_KVM rather + * than try to fit this into one of the existing prolog macros. + */ +#define GUEST_DOORBELL_EXCEPTION \ + START_EXCEPTION(GuestDoorbell); \ + mtspr SPRN_SPRG_WSCRATCH0, r10; /* save one register */ \ + mfspr r10, SPRN_SPRG_THREAD; \ + stw r11, THREAD_NORMSAVE(0)(r10); \ + mfspr r11, SPRN_SRR1; \ + stw r13, THREAD_NORMSAVE(2)(r10); \ + mfcr r13; /* save CR in r13 for now */\ + DO_KVM BOOKE_INTERRUPT_GUEST_DBELL SPRN_GSRR1; \ + trap + +/* + * Exception vectors. + */ +#define START_EXCEPTION(label) \ + .align 5; \ +label: + +#define EXCEPTION(n, intno, label, hdlr, xfer) \ + START_EXCEPTION(label); \ + NORMAL_EXCEPTION_PROLOG(intno); \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + xfer(n, hdlr) + +#define CRITICAL_EXCEPTION(n, intno, label, hdlr) \ + START_EXCEPTION(label); \ + CRITICAL_EXCEPTION_PROLOG(intno); \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ + NOCOPY, crit_transfer_to_handler, \ + ret_from_crit_exc) + +#define MCHECK_EXCEPTION(n, label, hdlr) \ + START_EXCEPTION(label); \ + MCHECK_EXCEPTION_PROLOG; \ + mfspr r5,SPRN_ESR; \ + stw r5,_ESR(r11); \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + EXC_XFER_TEMPLATE(hdlr, n+4, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ + NOCOPY, mcheck_transfer_to_handler, \ + ret_from_mcheck_exc) + +#define EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret) \ + li r10,trap; \ + stw r10,_TRAP(r11); \ + lis r10,msr@h; \ + ori r10,r10,msr@l; \ + copyee(r10, r9); \ + bl tfer; \ + .long hdlr; \ + .long ret + +#define COPY_EE(d, s) rlwimi d,s,0,16,16 +#define NOCOPY(d, s) + +#define EXC_XFER_STD(n, hdlr) \ + EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, NOCOPY, transfer_to_handler_full, \ + ret_from_except_full) + +#define EXC_XFER_LITE(n, hdlr) \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \ + ret_from_except) + +#define EXC_XFER_EE(n, hdlr) \ + 
EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \ + ret_from_except_full) + +#define EXC_XFER_EE_LITE(n, hdlr) \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, COPY_EE, transfer_to_handler, \ + ret_from_except) + +/* Check for a single step debug exception while in an exception + * handler before state has been saved. This is to catch the case + * where an instruction that we are trying to single step causes + * an exception (eg ITLB/DTLB miss) and thus the first instruction of + * the exception handler generates a single step debug exception. + * + * If we get a debug trap on the first instruction of an exception handler, + * we reset the MSR_DE in the _exception handler's_ MSR (the debug trap is + * a critical exception, so we are using SPRN_CSRR1 to manipulate the MSR). + * The exception handler was handling a non-critical interrupt, so it will + * save (and later restore) the MSR via SPRN_CSRR1, which will still have + * the MSR_DE bit set. + */ +#define DEBUG_DEBUG_EXCEPTION \ + START_EXCEPTION(DebugDebug); \ + DEBUG_EXCEPTION_PROLOG; \ + \ + /* \ + * If there is a single step or branch-taken exception in an \ + * exception entry sequence, it was probably meant to apply to \ + * the code where the exception occurred (since exception entry \ + * doesn't turn off DE automatically). We simulate the effect \ + * of turning off DE on entry to an exception handler by turning \ + * off DE in the DSRR1 value and clearing the debug status. \ + */ \ + mfspr r10,SPRN_DBSR; /* check single-step/branch taken */ \ + andis. r10,r10,(DBSR_IC|DBSR_BT)@h; \ + beq+ 2f; \ + \ + lis r10,interrupt_base@h; /* check if exception in vectors */ \ + ori r10,r10,interrupt_base@l; \ + cmplw r12,r10; \ + blt+ 2f; /* addr below exception vectors */ \ + \ + lis r10,interrupt_end@h; \ + ori r10,r10,interrupt_end@l; \ + cmplw r12,r10; \ + bgt+ 2f; /* addr above exception vectors */ \ + \ + /* here it looks like we got an inappropriate debug exception. 
*/ \ +1: rlwinm r9,r9,0,~MSR_DE; /* clear DE in the CDRR1 value */ \ + lis r10,(DBSR_IC|DBSR_BT)@h; /* clear the IC event */ \ + mtspr SPRN_DBSR,r10; \ + /* restore state and get out */ \ + lwz r10,_CCR(r11); \ + lwz r0,GPR0(r11); \ + lwz r1,GPR1(r11); \ + mtcrf 0x80,r10; \ + mtspr SPRN_DSRR0,r12; \ + mtspr SPRN_DSRR1,r9; \ + lwz r9,GPR9(r11); \ + lwz r12,GPR12(r11); \ + mtspr SPRN_SPRG_WSCRATCH_DBG,r8; \ + BOOKE_LOAD_EXC_LEVEL_STACK(DBG); /* r8 points to the debug stack */ \ + lwz r10,GPR10(r8); \ + lwz r11,GPR11(r8); \ + mfspr r8,SPRN_SPRG_RSCRATCH_DBG; \ + \ + PPC_RFDI; \ + b .; \ + \ + /* continue normal handling for a debug exception... */ \ +2: mfspr r4,SPRN_DBSR; \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + EXC_XFER_TEMPLATE(DebugException, 0x2008, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), NOCOPY, debug_transfer_to_handler, ret_from_debug_exc) + +#define DEBUG_CRIT_EXCEPTION \ + START_EXCEPTION(DebugCrit); \ + CRITICAL_EXCEPTION_PROLOG(DEBUG); \ + \ + /* \ + * If there is a single step or branch-taken exception in an \ + * exception entry sequence, it was probably meant to apply to \ + * the code where the exception occurred (since exception entry \ + * doesn't turn off DE automatically). We simulate the effect \ + * of turning off DE on entry to an exception handler by turning \ + * off DE in the CSRR1 value and clearing the debug status. \ + */ \ + mfspr r10,SPRN_DBSR; /* check single-step/branch taken */ \ + andis. r10,r10,(DBSR_IC|DBSR_BT)@h; \ + beq+ 2f; \ + \ + lis r10,interrupt_base@h; /* check if exception in vectors */ \ + ori r10,r10,interrupt_base@l; \ + cmplw r12,r10; \ + blt+ 2f; /* addr below exception vectors */ \ + \ + lis r10,interrupt_end@h; \ + ori r10,r10,interrupt_end@l; \ + cmplw r12,r10; \ + bgt+ 2f; /* addr above exception vectors */ \ + \ + /* here it looks like we got an inappropriate debug exception. 
*/ \ +1: rlwinm r9,r9,0,~MSR_DE; /* clear DE in the CSRR1 value */ \ + lis r10,(DBSR_IC|DBSR_BT)@h; /* clear the IC event */ \ + mtspr SPRN_DBSR,r10; \ + /* restore state and get out */ \ + lwz r10,_CCR(r11); \ + lwz r0,GPR0(r11); \ + lwz r1,GPR1(r11); \ + mtcrf 0x80,r10; \ + mtspr SPRN_CSRR0,r12; \ + mtspr SPRN_CSRR1,r9; \ + lwz r9,GPR9(r11); \ + lwz r12,GPR12(r11); \ + mtspr SPRN_SPRG_WSCRATCH_CRIT,r8; \ + BOOKE_LOAD_EXC_LEVEL_STACK(CRIT); /* r8 points to the debug stack */ \ + lwz r10,GPR10(r8); \ + lwz r11,GPR11(r8); \ + mfspr r8,SPRN_SPRG_RSCRATCH_CRIT; \ + \ + rfci; \ + b .; \ + \ + /* continue normal handling for a critical exception... */ \ +2: mfspr r4,SPRN_DBSR; \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + EXC_XFER_TEMPLATE(DebugException, 0x2002, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), NOCOPY, crit_transfer_to_handler, ret_from_crit_exc) + +#define DATA_STORAGE_EXCEPTION \ + START_EXCEPTION(DataStorage) \ + NORMAL_EXCEPTION_PROLOG(DATA_STORAGE); \ + mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \ + stw r5,_ESR(r11); \ + mfspr r4,SPRN_DEAR; /* Grab the DEAR */ \ + EXC_XFER_LITE(0x0300, handle_page_fault) + +#define INSTRUCTION_STORAGE_EXCEPTION \ + START_EXCEPTION(InstructionStorage) \ + NORMAL_EXCEPTION_PROLOG(INST_STORAGE); \ + mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \ + stw r5,_ESR(r11); \ + mr r4,r12; /* Pass SRR0 as arg2 */ \ + li r5,0; /* Pass zero as arg3 */ \ + EXC_XFER_LITE(0x0400, handle_page_fault) + +#define ALIGNMENT_EXCEPTION \ + START_EXCEPTION(Alignment) \ + NORMAL_EXCEPTION_PROLOG(ALIGNMENT); \ + mfspr r4,SPRN_DEAR; /* Grab the DEAR and save it */ \ + stw r4,_DEAR(r11); \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + EXC_XFER_EE(0x0600, alignment_exception) + +#define PROGRAM_EXCEPTION \ + START_EXCEPTION(Program) \ + NORMAL_EXCEPTION_PROLOG(PROGRAM); \ + mfspr r4,SPRN_ESR; /* Grab the ESR and save it */ \ + stw r4,_ESR(r11); \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + EXC_XFER_STD(0x0700, program_check_exception) + +#define 
DECREMENTER_EXCEPTION \ + START_EXCEPTION(Decrementer) \ + NORMAL_EXCEPTION_PROLOG(DECREMENTER); \ + lis r0,TSR_DIS@h; /* Setup the DEC interrupt mask */ \ + mtspr SPRN_TSR,r0; /* Clear the DEC interrupt */ \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + EXC_XFER_LITE(0x0900, timer_interrupt) + +#define FP_UNAVAILABLE_EXCEPTION \ + START_EXCEPTION(FloatingPointUnavailable) \ + NORMAL_EXCEPTION_PROLOG(FP_UNAVAIL); \ + beq 1f; \ + bl load_up_fpu; /* if from user, just load it up */ \ + b fast_exception_return; \ +1: addi r3,r1,STACK_FRAME_OVERHEAD; \ + EXC_XFER_EE_LITE(0x800, kernel_fp_unavailable_exception) + +#ifndef __ASSEMBLY__ +struct exception_regs { + unsigned long mas0; + unsigned long mas1; + unsigned long mas2; + unsigned long mas3; + unsigned long mas6; + unsigned long mas7; + unsigned long srr0; + unsigned long srr1; + unsigned long csrr0; + unsigned long csrr1; + unsigned long dsrr0; + unsigned long dsrr1; + unsigned long saved_ksp_limit; +}; + +/* ensure this structure is always sized to a multiple of the stack alignment */ +#define STACK_EXC_LVL_FRAME_SIZE _ALIGN_UP(sizeof (struct exception_regs), 16) + +#endif /* __ASSEMBLY__ */ +#endif /* __HEAD_BOOKE_H__ */ diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 5063c603fad..b497188a94a 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -1,30 +1,28 @@ /* - * arch/ppc/kernel/head_fsl_booke.S - * * Kernel execution entry point code. * * Copyright (c) 1995-1996 Gary Thomas <gdt@linuxppc.org> - * Initial PowerPC version. + * Initial PowerPC version. * Copyright (c) 1996 Cort Dougan <cort@cs.nmt.edu> - * Rewritten for PReP + * Rewritten for PReP * Copyright (c) 1996 Paul Mackerras <paulus@cs.anu.edu.au> - * Low-level exception handers, MMU support, and rewrite. + * Low-level exception handers, MMU support, and rewrite. * Copyright (c) 1997 Dan Malek <dmalek@jlc.net> - * PowerPC 8xx modifications. + * PowerPC 8xx modifications. 
* Copyright (c) 1998-1999 TiVo, Inc. - * PowerPC 403GCX modifications. + * PowerPC 403GCX modifications. * Copyright (c) 1999 Grant Erickson <grant@lcse.umn.edu> - * PowerPC 403GCX/405GP modifications. + * PowerPC 403GCX/405GP modifications. * Copyright 2000 MontaVista Software Inc. * PPC405 modifications - * PowerPC 403GCX/405GP modifications. - * Author: MontaVista Software, Inc. - * frank_rowand@mvista.com or source@mvista.com - * debbie_chu@mvista.com + * PowerPC 403GCX/405GP modifications. + * Author: MontaVista Software, Inc. + * frank_rowand@mvista.com or source@mvista.com + * debbie_chu@mvista.com * Copyright 2002-2004 MontaVista Software, Inc. - * PowerPC 44x support, Matt Porter <mporter@kernel.crashing.org> + * PowerPC 44x support, Matt Porter <mporter@kernel.crashing.org> * Copyright 2004 Freescale Semiconductor, Inc - * PowerPC e500 modifications, Kumar Gala <kumar.gala@freescale.com> + * PowerPC e500 modifications, Kumar Gala <galak@kernel.crashing.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -32,7 +30,7 @@ * option) any later version. 
*/ -#include <linux/config.h> +#include <linux/init.h> #include <linux/threads.h> #include <asm/processor.h> #include <asm/page.h> @@ -42,6 +40,8 @@ #include <asm/thread_info.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> +#include <asm/cache.h> +#include <asm/ptrace.h> #include "head_booke.h" /* As with the other PowerPC ports, it is expected that when code @@ -55,29 +55,93 @@ * r7 - End of kernel command line string * */ - .text -_GLOBAL(_stext) -_GLOBAL(_start) + __HEAD +_ENTRY(_stext); +_ENTRY(_start); /* * Reserve a word at a fixed location to store the address * of abatron_pteptrs */ nop -/* - * Save parameters we are passed - */ - mr r31,r3 - mr r30,r4 - mr r29,r5 - mr r28,r6 - mr r27,r7 - li r24,0 /* CPU number */ + + /* Translate device tree address to physical, save in r30/r31 */ + bl get_phys_addr + mr r30,r3 + mr r31,r4 + + li r25,0 /* phys kernel start (low) */ + li r24,0 /* CPU number */ + li r23,0 /* phys kernel start (high) */ + +#ifdef CONFIG_RELOCATABLE + LOAD_REG_ADDR_PIC(r3, _stext) /* Get our current runtime base */ + + /* Translate _stext address to physical, save in r23/r25 */ + bl get_phys_addr + mr r23,r3 + mr r25,r4 + + bl 0f +0: mflr r8 + addis r3,r8,(is_second_reloc - 0b)@ha + lwz r19,(is_second_reloc - 0b)@l(r3) + + /* Check if this is the second relocation. */ + cmpwi r19,1 + bne 1f + + /* + * For the second relocation, we already get the real memstart_addr + * from device tree. So we will map PAGE_OFFSET to memstart_addr, + * then the virtual address of start kernel should be: + * PAGE_OFFSET + (kernstart_addr - memstart_addr) + * Since the offset between kernstart_addr and memstart_addr should + * never be beyond 1G, so we can just use the lower 32bit of them + * for the calculation. 
+ */ + lis r3,PAGE_OFFSET@h + + addis r4,r8,(kernstart_addr - 0b)@ha + addi r4,r4,(kernstart_addr - 0b)@l + lwz r5,4(r4) + + addis r6,r8,(memstart_addr - 0b)@ha + addi r6,r6,(memstart_addr - 0b)@l + lwz r7,4(r6) + + subf r5,r7,r5 + add r3,r3,r5 + b 2f + +1: + /* + * We have the runtime (virutal) address of our base. + * We calculate our shift of offset from a 64M page. + * We could map the 64M page we belong to at PAGE_OFFSET and + * get going from there. + */ + lis r4,KERNELBASE@h + ori r4,r4,KERNELBASE@l + rlwinm r6,r25,0,0x3ffffff /* r6 = PHYS_START % 64M */ + rlwinm r5,r4,0,0x3ffffff /* r5 = KERNELBASE % 64M */ + subf r3,r5,r6 /* r3 = r6 - r5 */ + add r3,r4,r3 /* Required Virtual Address */ + +2: bl relocate + + /* + * For the second relocation, we already set the right tlb entries + * for the kernel space, so skip the code in fsl_booke_entry_mapping.S + */ + cmpwi r19,1 + beq set_ivor +#endif /* We try to not make any assumptions about how the boot loader * setup or used the TLBs. We invalidate all mappings from the * boot loader and load a single entry in TLB1[0] to map the - * first 16M of kernel memory. Any boot info passed from the - * bootloader needs to live in this first 16M. + * first 64M of kernel memory. Any boot info passed from the + * bootloader needs to live in this first 64M. * * Requirement on bootloader: * - The page we're executing in needs to reside in TLB1 and @@ -92,175 +156,13 @@ _GLOBAL(_start) * if needed */ -/* 1. Find the index of the entry we're executing in */ - bl invstr /* Find our address */ -invstr: mflr r6 /* Make it accessible */ - mfmsr r7 - rlwinm r4,r7,27,31,31 /* extract MSR[IS] */ - mfspr r7, SPRN_PID0 - slwi r7,r7,16 - or r7,r7,r4 - mtspr SPRN_MAS6,r7 - tlbsx 0,r6 /* search MSR[IS], SPID=PID0 */ -#ifndef CONFIG_E200 - mfspr r7,SPRN_MAS1 - andis. 
r7,r7,MAS1_VALID@h - bne match_TLB - mfspr r7,SPRN_PID1 - slwi r7,r7,16 - or r7,r7,r4 - mtspr SPRN_MAS6,r7 - tlbsx 0,r6 /* search MSR[IS], SPID=PID1 */ - mfspr r7,SPRN_MAS1 - andis. r7,r7,MAS1_VALID@h - bne match_TLB - mfspr r7, SPRN_PID2 - slwi r7,r7,16 - or r7,r7,r4 - mtspr SPRN_MAS6,r7 - tlbsx 0,r6 /* Fall through, we had to match */ -#endif -match_TLB: - mfspr r7,SPRN_MAS0 - rlwinm r3,r7,16,20,31 /* Extract MAS0(Entry) */ - - mfspr r7,SPRN_MAS1 /* Insure IPROT set */ - oris r7,r7,MAS1_IPROT@h - mtspr SPRN_MAS1,r7 - tlbwe +_ENTRY(__early_start) -/* 2. Invalidate all entries except the entry we're executing in */ - mfspr r9,SPRN_TLB1CFG - andi. r9,r9,0xfff - li r6,0 /* Set Entry counter to 0 */ -1: lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ - rlwimi r7,r6,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r6) */ - mtspr SPRN_MAS0,r7 - tlbre - mfspr r7,SPRN_MAS1 - rlwinm r7,r7,0,2,31 /* Clear MAS1 Valid and IPROT */ - cmpw r3,r6 - beq skpinv /* Dont update the current execution TLB */ - mtspr SPRN_MAS1,r7 - tlbwe - isync -skpinv: addi r6,r6,1 /* Increment */ - cmpw r6,r9 /* Are we done? */ - bne 1b /* If not, repeat */ - - /* Invalidate TLB0 */ - li r6,0x04 - tlbivax 0,r6 -#ifdef CONFIG_SMP - tlbsync -#endif - /* Invalidate TLB1 */ - li r6,0x0c - tlbivax 0,r6 -#ifdef CONFIG_SMP - tlbsync -#endif - msync - -/* 3. Setup a temp mapping and jump to it */ - andi. 
r5, r3, 0x1 /* Find an entry not used and is non-zero */ - addi r5, r5, 0x1 - lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ - rlwimi r7,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ - mtspr SPRN_MAS0,r7 - tlbre - - /* Just modify the entry ID and EPN for the temp mapping */ - lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ - rlwimi r7,r5,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r5) */ - mtspr SPRN_MAS0,r7 - xori r6,r4,1 /* Setup TMP mapping in the other Address space */ - slwi r6,r6,12 - oris r6,r6,(MAS1_VALID|MAS1_IPROT)@h - ori r6,r6,(MAS1_TSIZE(BOOKE_PAGESZ_4K))@l - mtspr SPRN_MAS1,r6 - mfspr r6,SPRN_MAS2 - li r7,0 /* temp EPN = 0 */ - rlwimi r7,r6,0,20,31 - mtspr SPRN_MAS2,r7 - tlbwe - - xori r6,r4,1 - slwi r6,r6,5 /* setup new context with other address space */ - bl 1f /* Find our address */ -1: mflr r9 - rlwimi r7,r9,0,20,31 - addi r7,r7,24 - mtspr SPRN_SRR0,r7 - mtspr SPRN_SRR1,r6 - rfi - -/* 4. Clear out PIDs & Search info */ - li r6,0 - mtspr SPRN_PID0,r6 -#ifndef CONFIG_E200 - mtspr SPRN_PID1,r6 - mtspr SPRN_PID2,r6 -#endif - mtspr SPRN_MAS6,r6 - -/* 5. Invalidate mapping we started in */ - lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ - rlwimi r7,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ - mtspr SPRN_MAS0,r7 - tlbre - li r6,0 - mtspr SPRN_MAS1,r6 - tlbwe - /* Invalidate TLB1 */ - li r9,0x0c - tlbivax 0,r9 -#ifdef CONFIG_SMP - tlbsync -#endif - msync - -/* 6. Setup KERNELBASE mapping in TLB1[0] */ - lis r6,0x1000 /* Set MAS0(TLBSEL) = TLB1(1), ESEL = 0 */ - mtspr SPRN_MAS0,r6 - lis r6,(MAS1_VALID|MAS1_IPROT)@h - ori r6,r6,(MAS1_TSIZE(BOOKE_PAGESZ_16M))@l - mtspr SPRN_MAS1,r6 - li r7,0 - lis r6,KERNELBASE@h - ori r6,r6,KERNELBASE@l - rlwimi r6,r7,0,20,31 - mtspr SPRN_MAS2,r6 - li r7,(MAS3_SX|MAS3_SW|MAS3_SR) - mtspr SPRN_MAS3,r7 - tlbwe - -/* 7. 
Jump to KERNELBASE mapping */ - lis r7,MSR_KERNEL@h - ori r7,r7,MSR_KERNEL@l - bl 1f /* Find our address */ -1: mflr r9 - rlwimi r6,r9,0,20,31 - addi r6,r6,24 - mtspr SPRN_SRR0,r6 - mtspr SPRN_SRR1,r7 - rfi /* start execution out of TLB1[0] entry */ - -/* 8. Clear out the temp mapping */ - lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ - rlwimi r7,r5,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r5) */ - mtspr SPRN_MAS0,r7 - tlbre - mtspr SPRN_MAS1,r8 - tlbwe - /* Invalidate TLB1 */ - li r9,0x0c - tlbivax 0,r9 -#ifdef CONFIG_SMP - tlbsync -#endif - msync +#define ENTRY_MAPPING_BOOT_SETUP +#include "fsl_booke_entry_mapping.S" +#undef ENTRY_MAPPING_BOOT_SETUP +set_ivor: /* Establish the interrupt vector offsets */ SET_IVOR(0, CriticalInput); SET_IVOR(1, MachineCheck); @@ -277,24 +179,18 @@ skpinv: addi r6,r6,1 /* Increment */ SET_IVOR(12, WatchdogTimer); SET_IVOR(13, DataTLBError); SET_IVOR(14, InstructionTLBError); - SET_IVOR(15, Debug); - SET_IVOR(32, SPEUnavailable); - SET_IVOR(33, SPEFloatingPointData); - SET_IVOR(34, SPEFloatingPointRound); -#ifndef CONFIG_E200 - SET_IVOR(35, PerformanceMonitor); -#endif + SET_IVOR(15, DebugCrit); /* Establish the interrupt vector base */ lis r4,interrupt_base@h /* IVPR only uses the high 16-bits */ mtspr SPRN_IVPR,r4 /* Setup the defaults for TLB entries */ - li r2,(MAS4_TSIZED(BOOKE_PAGESZ_4K))@l + li r2,(MAS4_TSIZED(BOOK3E_PAGESZ_4K))@l #ifdef CONFIG_E200 oris r2,r2,MAS4_TLBSELD(1)@h #endif - mtspr SPRN_MAS4, r2 + mtspr SPRN_MAS4, r2 #if 0 /* Enable DOZE */ @@ -302,12 +198,6 @@ skpinv: addi r6,r6,1 /* Increment */ oris r2,r2,HID0_DOZE@h mtspr SPRN_HID0, r2 #endif -#ifdef CONFIG_E200 - /* enable dedicated debug exception handling resources (Debug APU) */ - mfspr r2,SPRN_HID0 - ori r2,r2,HID0_DAPUEN@l - mtspr SPRN_HID0,r2 -#endif #if !defined(CONFIG_BDI_SWITCH) /* @@ -316,11 +206,23 @@ skpinv: addi r6,r6,1 /* Increment */ */ lis r2,DBCR0_IDM@h mtspr SPRN_DBCR0,r2 + isync /* clear any residual debug events */ li r2,-1 mtspr SPRN_DBSR,r2 
#endif +#ifdef CONFIG_SMP + /* Check to see if we're the second processor, and jump + * to the secondary_start code if so + */ + LOAD_REG_ADDR_PIC(r24, boot_cpuid) + lwz r24, 0(r24) + cmpwi r24, -1 + mfspr r24,SPRN_PIR + bne __secondary_start +#endif + /* * This is where the main kernel code starts. */ @@ -331,7 +233,7 @@ skpinv: addi r6,r6,1 /* Increment */ /* ptr to current thread */ addi r4,r2,THREAD /* init task's THREAD */ - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 /* stack */ lis r1,init_thread_union@h @@ -339,20 +241,39 @@ skpinv: addi r6,r6,1 /* Increment */ li r0,0 stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) + CURRENT_THREAD_INFO(r22, r1) + stw r24, TI_CPU(r22) + bl early_init - mfspr r3,SPRN_TLB1CFG - andi. r3,r3,0xfff - lis r4,num_tlbcam_entries@ha - stw r3,num_tlbcam_entries@l(r4) +#ifdef CONFIG_RELOCATABLE + mr r3,r30 + mr r4,r31 +#ifdef CONFIG_PHYS_64BIT + mr r5,r23 + mr r6,r25 +#else + mr r5,r25 +#endif + bl relocate_init +#endif + +#ifdef CONFIG_DYNAMIC_MEMSTART + lis r3,kernstart_addr@ha + la r3,kernstart_addr@l(r3) +#ifdef CONFIG_PHYS_64BIT + stw r23,0(r3) + stw r25,4(r3) +#else + stw r25,0(r3) +#endif +#endif + /* * Decide what sort of machine this is and initialize the MMU. */ - mr r3,r31 - mr r4,r30 - mr r5,r29 - mr r6,r28 - mr r7,r27 + mr r3,r30 + mr r4,r31 bl machine_init bl MMU_init @@ -386,18 +307,33 @@ skpinv: addi r6,r6,1 /* Increment */ * if we find the pte (fall through): * r11 is low pte word * r12 is pointer to the pte + * r10 is the pshift from the PGD, if we're a hugepage */ #ifdef CONFIG_PTE_64BIT -#define PTE_FLAGS_OFFSET 4 +#ifdef CONFIG_HUGETLB_PAGE +#define FIND_PTE \ + rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \ + lwzx r11, r12, r11; /* Get pgd/pmd entry */ \ + rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */ \ + blt 1000f; /* Normal non-huge page */ \ + beq 2f; /* Bail if no table */ \ + oris r11, r11, PD_HUGE@h; /* Put back address bit */ \ + andi. 
r10, r11, HUGEPD_SHIFT_MASK@l; /* extract size field */ \ + xor r12, r10, r11; /* drop size bits from pointer */ \ + b 1001f; \ +1000: rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \ + li r10, 0; /* clear r10 */ \ +1001: lwz r11, 4(r12); /* Get pte entry */ +#else #define FIND_PTE \ - rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \ + rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \ lwzx r11, r12, r11; /* Get pgd/pmd entry */ \ rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */ \ beq 2f; /* Bail if no table */ \ rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \ lwz r11, 4(r12); /* Get pte entry */ -#else -#define PTE_FLAGS_OFFSET 0 +#endif /* HUGEPAGE */ +#else /* !PTE_64BIT */ #define FIND_PTE \ rlwimi r11, r10, 12, 20, 29; /* Create L1 (pgdir/pmd) address */ \ lwz r11, 0(r11); /* Get L1 entry */ \ @@ -426,109 +362,35 @@ skpinv: addi r6,r6,1 /* Increment */ interrupt_base: /* Critical Input Interrupt */ - CRITICAL_EXCEPTION(0x0100, CriticalInput, unknown_exception) + CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception) /* Machine Check Interrupt */ #ifdef CONFIG_E200 /* no RFMCI, MCSRRs on E200 */ - CRITICAL_EXCEPTION(0x0200, MachineCheck, machine_check_exception) + CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \ + machine_check_exception) #else MCHECK_EXCEPTION(0x0200, MachineCheck, machine_check_exception) #endif /* Data Storage Interrupt */ START_EXCEPTION(DataStorage) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 - mtspr SPRN_SPRG4W, r12 - mtspr SPRN_SPRG5W, r13 - mfcr r11 - mtspr SPRN_SPRG7W, r11 - - /* - * Check if it was a store fault, if not then bail - * because a user tried to access a kernel or - * read-protected page. Otherwise, get the - * offending address and handle it. - */ - mfspr r10, SPRN_ESR - andis. 
r10, r10, ESR_ST@h - beq 2f - - mfspr r10, SPRN_DEAR /* Get faulting address */ - - /* If we are faulting a kernel address, we have to use the - * kernel page tables. - */ - lis r11, TASK_SIZE@h - ori r11, r11, TASK_SIZE@l - cmplw 0, r10, r11 - bge 2f - - /* Get the PGD for the current thread */ -3: - mfspr r11,SPRN_SPRG3 - lwz r11,PGDIR(r11) -4: - FIND_PTE - - /* Are _PAGE_USER & _PAGE_RW set & _PAGE_HWWRITE not? */ - andi. r13, r11, _PAGE_RW|_PAGE_USER|_PAGE_HWWRITE - cmpwi 0, r13, _PAGE_RW|_PAGE_USER - bne 2f /* Bail if not */ - - /* Update 'changed'. */ - ori r11, r11, _PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_HWWRITE - stw r11, PTE_FLAGS_OFFSET(r12) /* Update Linux page table */ - - /* MAS2 not updated as the entry does exist in the tlb, this - fault taken to detect state transition (eg: COW -> DIRTY) - */ - andi. r11, r11, _PAGE_HWEXEC - rlwimi r11, r11, 31, 27, 27 /* SX <- _PAGE_HWEXEC */ - ori r11, r11, (MAS3_UW|MAS3_SW|MAS3_UR|MAS3_SR)@l /* set static perms */ - - /* update search PID in MAS6, AS = 0 */ - mfspr r12, SPRN_PID0 - slwi r12, r12, 16 - mtspr SPRN_MAS6, r12 - - /* find the TLB index that caused the fault. It has to be here. */ - tlbsx 0, r10 - - /* only update the perm bits, assume the RPN is fine */ - mfspr r12, SPRN_MAS3 - rlwimi r12, r11, 0, 20, 31 - mtspr SPRN_MAS3,r12 - tlbwe - - /* Done...restore registers and get out of here. */ - mfspr r11, SPRN_SPRG7R - mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 - rfi /* Force context change */ - -2: - /* - * The bailout. Restore registers to pre-exception conditions - * and call the heavyweights to help us out. - */ - mfspr r11, SPRN_SPRG7R - mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 - b data_access + NORMAL_EXCEPTION_PROLOG(DATA_STORAGE) + mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */ + stw r5,_ESR(r11) + mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ + andis. 
r10,r5,(ESR_ILK|ESR_DLK)@h + bne 1f + EXC_XFER_LITE(0x0300, handle_page_fault) +1: + addi r3,r1,STACK_FRAME_OVERHEAD + EXC_XFER_EE_LITE(0x0300, CacheLockingException) /* Instruction Storage Interrupt */ INSTRUCTION_STORAGE_EXCEPTION /* External Input Interrupt */ - EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE) + EXCEPTION(0x0500, EXTERNAL, ExternalInput, do_IRQ, EXC_XFER_LITE) /* Alignment Interrupt */ ALIGNMENT_EXCEPTION @@ -542,49 +404,59 @@ interrupt_base: #else #ifdef CONFIG_E200 /* E200 treats 'normal' floating point instructions as FP Unavail exception */ - EXCEPTION(0x0800, FloatingPointUnavailable, program_check_exception, EXC_XFER_EE) + EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \ + program_check_exception, EXC_XFER_EE) #else - EXCEPTION(0x0800, FloatingPointUnavailable, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \ + unknown_exception, EXC_XFER_EE) #endif #endif /* System Call Interrupt */ START_EXCEPTION(SystemCall) - NORMAL_EXCEPTION_PROLOG + NORMAL_EXCEPTION_PROLOG(SYSCALL) EXC_XFER_EE_LITE(0x0c00, DoSyscall) - /* Auxillary Processor Unavailable Interrupt */ - EXCEPTION(0x2900, AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE) + /* Auxiliary Processor Unavailable Interrupt */ + EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \ + unknown_exception, EXC_XFER_EE) /* Decrementer Interrupt */ DECREMENTER_EXCEPTION /* Fixed Internal Timer Interrupt */ /* TODO: Add FIT support */ - EXCEPTION(0x3100, FixedIntervalTimer, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x3100, FIT, FixedIntervalTimer, \ + unknown_exception, EXC_XFER_EE) /* Watchdog Timer Interrupt */ #ifdef CONFIG_BOOKE_WDT - CRITICAL_EXCEPTION(0x3200, WatchdogTimer, WatchdogException) + CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, WatchdogException) #else - CRITICAL_EXCEPTION(0x3200, WatchdogTimer, unknown_exception) + CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, unknown_exception) #endif /* 
Data TLB Error Interrupt */ START_EXCEPTION(DataTLBError) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 - mtspr SPRN_SPRG4W, r12 - mtspr SPRN_SPRG5W, r13 - mfcr r11 - mtspr SPRN_SPRG7W, r11 + mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ + mfspr r10, SPRN_SPRG_THREAD + stw r11, THREAD_NORMSAVE(0)(r10) +#ifdef CONFIG_KVM_BOOKE_HV +BEGIN_FTR_SECTION + mfspr r11, SPRN_SRR1 +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) +#endif + stw r12, THREAD_NORMSAVE(1)(r10) + stw r13, THREAD_NORMSAVE(2)(r10) + mfcr r13 + stw r13, THREAD_NORMSAVE(3)(r10) + DO_KVM BOOKE_INTERRUPT_DTLB_MISS SPRN_SRR1 mfspr r10, SPRN_DEAR /* Get faulting address */ /* If we are faulting a kernel address, we have to use the * kernel page tables. */ - lis r11, TASK_SIZE@h - ori r11, r11, TASK_SIZE@l + lis r11, PAGE_OFFSET@h cmplw 5, r10, r11 blt 5, 3f lis r11, swapper_pg_dir@h @@ -598,33 +470,60 @@ interrupt_base: /* Get the PGD for the current thread */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) 4: + /* Mask of required permission bits. Note that while we + * do copy ESR:ST to _PAGE_RW position as trying to write + * to an RO page is pretty common, we don't do it with + * _PAGE_DIRTY. We could do it, but it's a fairly rare + * event so I'd rather take the overhead when it happens + * rather than adding an instruction here. We should measure + * whether the whole thing is worth it in the first place + * as we could avoid loading SPRN_ESR completely in the first + * place... + * + * TODO: Is it worth doing that mfspr & rlwimi in the first + * place or can we save a couple of instructions here ? + */ + mfspr r12,SPRN_ESR +#ifdef CONFIG_PTE_64BIT + li r13,_PAGE_PRESENT + oris r13,r13,_PAGE_ACCESSED@h +#else + li r13,_PAGE_PRESENT|_PAGE_ACCESSED +#endif + rlwimi r13,r12,11,29,29 + FIND_PTE - andi. r13, r11, _PAGE_PRESENT /* Is the page present? */ - beq 2f /* Bail if not present */ + andc. 
r13,r13,r11 /* Check permission */ #ifdef CONFIG_PTE_64BIT - lwz r13, 0(r12) +#ifdef CONFIG_SMP + subf r13,r11,r12 /* create false data dep */ + lwzx r13,r11,r13 /* Get upper pte bits */ +#else + lwz r13,0(r12) /* Get upper pte bits */ +#endif #endif - ori r11, r11, _PAGE_ACCESSED - stw r11, PTE_FLAGS_OFFSET(r12) - /* Jump to common tlb load */ + bne 2f /* Bail if permission/valid mismach */ + + /* Jump to common tlb load */ b finish_tlb_load 2: /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ - mfspr r11, SPRN_SPRG7R + mfspr r10, SPRN_SPRG_THREAD + lwz r11, THREAD_NORMSAVE(3)(r10) mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 - b data_access + lwz r13, THREAD_NORMSAVE(2)(r10) + lwz r12, THREAD_NORMSAVE(1)(r10) + lwz r11, THREAD_NORMSAVE(0)(r10) + mfspr r10, SPRN_SPRG_RSCRATCH0 + b DataStorage /* Instruction TLB Error Interrupt */ /* @@ -633,19 +532,25 @@ interrupt_base: * to a different point. */ START_EXCEPTION(InstructionTLBError) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 - mtspr SPRN_SPRG4W, r12 - mtspr SPRN_SPRG5W, r13 - mfcr r11 - mtspr SPRN_SPRG7W, r11 + mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ + mfspr r10, SPRN_SPRG_THREAD + stw r11, THREAD_NORMSAVE(0)(r10) +#ifdef CONFIG_KVM_BOOKE_HV +BEGIN_FTR_SECTION + mfspr r11, SPRN_SRR1 +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) +#endif + stw r12, THREAD_NORMSAVE(1)(r10) + stw r13, THREAD_NORMSAVE(2)(r10) + mfcr r13 + stw r13, THREAD_NORMSAVE(3)(r10) + DO_KVM BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR1 mfspr r10, SPRN_SRR0 /* Get faulting address */ /* If we are faulting a kernel address, we have to use the * kernel page tables. 
*/ - lis r11, TASK_SIZE@h - ori r11, r11, TASK_SIZE@l + lis r11, PAGE_OFFSET@h cmplw 5, r10, r11 blt 5, 3f lis r11, swapper_pg_dir@h @@ -655,23 +560,42 @@ interrupt_base: rlwinm r12,r12,0,16,1 mtspr SPRN_MAS1,r12 + /* Make up the required permissions for kernel code */ +#ifdef CONFIG_PTE_64BIT + li r13,_PAGE_PRESENT | _PAGE_BAP_SX + oris r13,r13,_PAGE_ACCESSED@h +#else + li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC +#endif b 4f /* Get the PGD for the current thread */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) + /* Make up the required permissions for user code */ +#ifdef CONFIG_PTE_64BIT + li r13,_PAGE_PRESENT | _PAGE_BAP_UX + oris r13,r13,_PAGE_ACCESSED@h +#else + li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC +#endif + 4: FIND_PTE - andi. r13, r11, _PAGE_PRESENT /* Is the page present? */ - beq 2f /* Bail if not present */ + andc. r13,r13,r11 /* Check permission */ #ifdef CONFIG_PTE_64BIT - lwz r13, 0(r12) +#ifdef CONFIG_SMP + subf r13,r11,r12 /* create false data dep */ + lwzx r13,r11,r13 /* Get upper pte bits */ +#else + lwz r13,0(r12) /* Get upper pte bits */ +#endif #endif - ori r11, r11, _PAGE_ACCESSED - stw r11, PTE_FLAGS_OFFSET(r12) + + bne 2f /* Bail if permission mismach */ /* Jump to common TLB load point */ b finish_tlb_load @@ -680,116 +604,196 @@ interrupt_base: /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. 
*/ - mfspr r11, SPRN_SPRG7R + mfspr r10, SPRN_SPRG_THREAD + lwz r11, THREAD_NORMSAVE(3)(r10) mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + lwz r13, THREAD_NORMSAVE(2)(r10) + lwz r12, THREAD_NORMSAVE(1)(r10) + lwz r11, THREAD_NORMSAVE(0)(r10) + mfspr r10, SPRN_SPRG_RSCRATCH0 b InstructionStorage #ifdef CONFIG_SPE /* SPE Unavailable */ START_EXCEPTION(SPEUnavailable) - NORMAL_EXCEPTION_PROLOG - bne load_up_spe - addi r3,r1,STACK_FRAME_OVERHEAD + NORMAL_EXCEPTION_PROLOG(SPE_ALTIVEC_UNAVAIL) + beq 1f + bl load_up_spe + b fast_exception_return +1: addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_EE_LITE(0x2010, KernelSPE) #else - EXCEPTION(0x2020, SPEUnavailable, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x2020, SPE_ALTIVEC_UNAVAIL, SPEUnavailable, \ + unknown_exception, EXC_XFER_EE) #endif /* CONFIG_SPE */ /* SPE Floating Point Data */ #ifdef CONFIG_SPE - EXCEPTION(0x2030, SPEFloatingPointData, SPEFloatingPointException, EXC_XFER_EE); -#else - EXCEPTION(0x2040, SPEFloatingPointData, unknown_exception, EXC_XFER_EE) -#endif /* CONFIG_SPE */ + EXCEPTION(0x2030, SPE_FP_DATA_ALTIVEC_ASSIST, SPEFloatingPointData, + SPEFloatingPointException, EXC_XFER_EE) /* SPE Floating Point Round */ - EXCEPTION(0x2050, SPEFloatingPointRound, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ + SPEFloatingPointRoundException, EXC_XFER_EE) +#else + EXCEPTION(0x2040, SPE_FP_DATA_ALTIVEC_ASSIST, SPEFloatingPointData, + unknown_exception, EXC_XFER_EE) + EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ + unknown_exception, EXC_XFER_EE) +#endif /* CONFIG_SPE */ /* Performance Monitor */ - EXCEPTION(0x2060, PerformanceMonitor, performance_monitor_exception, EXC_XFER_STD) + EXCEPTION(0x2060, PERFORMANCE_MONITOR, PerformanceMonitor, \ + performance_monitor_exception, EXC_XFER_STD) + EXCEPTION(0x2070, DOORBELL, Doorbell, doorbell_exception, EXC_XFER_STD) + + CRITICAL_EXCEPTION(0x2080, 
DOORBELL_CRITICAL, \ + CriticalDoorbell, unknown_exception) /* Debug Interrupt */ - DEBUG_EXCEPTION + DEBUG_DEBUG_EXCEPTION + DEBUG_CRIT_EXCEPTION + + GUEST_DOORBELL_EXCEPTION + + CRITICAL_EXCEPTION(0, GUEST_DBELL_CRIT, CriticalGuestDoorbell, \ + unknown_exception) + + /* Hypercall */ + EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception, EXC_XFER_EE) + + /* Embedded Hypervisor Privilege */ + EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception, EXC_XFER_EE) + +interrupt_end: /* * Local functions */ - /* - * Data TLB exceptions will bail out to this point - * if they can't resolve the lightweight TLB fault. - */ -data_access: - NORMAL_EXCEPTION_PROLOG - mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */ - stw r5,_ESR(r11) - mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ - andis. r10,r5,(ESR_ILK|ESR_DLK)@h - bne 1f - EXC_XFER_EE_LITE(0x0300, handle_page_fault) -1: - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE_LITE(0x0300, CacheLockingException) - /* - * Both the instruction and data TLB miss get to this * point to load the TLB. - * r10 - EA of fault - * r11 - TLB (info from Linux PTE) - * r12, r13 - available to use - * CR5 - results of addr < TASK_SIZE + * r10 - tsize encoding (if HUGETLB_PAGE) or available to use + * r11 - TLB (info from Linux PTE) + * r12 - available to use + * r13 - upper bits of PTE (if PTE_64BIT) or available to use + * CR5 - results of addr >= PAGE_OFFSET * MAS0, MAS1 - loaded with proper value when we get here * MAS2, MAS3 - will need additional info from Linux PTE * Upon exit, we reload everything and RFI. 
*/ finish_tlb_load: +#ifdef CONFIG_HUGETLB_PAGE + cmpwi 6, r10, 0 /* check for huge page */ + beq 6, finish_tlb_load_cont /* !huge */ + + /* Alas, we need more scratch registers for hugepages */ + mfspr r12, SPRN_SPRG_THREAD + stw r14, THREAD_NORMSAVE(4)(r12) + stw r15, THREAD_NORMSAVE(5)(r12) + stw r16, THREAD_NORMSAVE(6)(r12) + stw r17, THREAD_NORMSAVE(7)(r12) + + /* Get the next_tlbcam_idx percpu var */ +#ifdef CONFIG_SMP + lwz r12, THREAD_INFO-THREAD(r12) + lwz r15, TI_CPU(r12) + lis r14, __per_cpu_offset@h + ori r14, r14, __per_cpu_offset@l + rlwinm r15, r15, 2, 0, 29 + lwzx r16, r14, r15 +#else + li r16, 0 +#endif + lis r17, next_tlbcam_idx@h + ori r17, r17, next_tlbcam_idx@l + add r17, r17, r16 /* r17 = *next_tlbcam_idx */ + lwz r15, 0(r17) /* r15 = next_tlbcam_idx */ + + lis r14, MAS0_TLBSEL(1)@h /* select TLB1 (TLBCAM) */ + rlwimi r14, r15, 16, 4, 15 /* next_tlbcam_idx entry */ + mtspr SPRN_MAS0, r14 + + /* Extract TLB1CFG(NENTRY) */ + mfspr r16, SPRN_TLB1CFG + andi. r16, r16, 0xfff + + /* Update next_tlbcam_idx, wrapping when necessary */ + addi r15, r15, 1 + cmpw r15, r16 + blt 100f + lis r14, tlbcam_index@h + ori r14, r14, tlbcam_index@l + lwz r15, 0(r14) +100: stw r15, 0(r17) + + /* + * Calc MAS1_TSIZE from r10 (which has pshift encoded) + * tlb_enc = (pshift - 10). + */ + subi r15, r10, 10 + mfspr r16, SPRN_MAS1 + rlwimi r16, r15, 7, 20, 24 + mtspr SPRN_MAS1, r16 + + /* copy the pshift for use later */ + mr r14, r10 + + /* fall through */ + +#endif /* CONFIG_HUGETLB_PAGE */ + /* * We set execute, because we don't have the granularity to * properly set this at the page level (Linux problem). * Many of these bits are software only. Bits we don't set * here we (properly should) assume have the appropriate value. */ - - mfspr r12, SPRN_MAS2 +finish_tlb_load_cont: #ifdef CONFIG_PTE_64BIT - rlwimi r12, r11, 26, 24, 31 /* extract ...WIMGE from pte */ + rlwinm r12, r11, 32-2, 26, 31 /* Move in perm bits */ + andi. 
r10, r11, _PAGE_DIRTY + bne 1f + li r10, MAS3_SW | MAS3_UW + andc r12, r12, r10 +1: rlwimi r12, r13, 20, 0, 11 /* grab RPN[32:43] */ + rlwimi r12, r11, 20, 12, 19 /* grab RPN[44:51] */ +2: mtspr SPRN_MAS3, r12 +BEGIN_MMU_FTR_SECTION + srwi r10, r13, 12 /* grab RPN[12:31] */ + mtspr SPRN_MAS7, r10 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) #else - rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */ -#endif - mtspr SPRN_MAS2, r12 - - bge 5, 1f - - /* is user addr */ - andi. r12, r11, (_PAGE_USER | _PAGE_HWWRITE | _PAGE_HWEXEC) + li r10, (_PAGE_EXEC | _PAGE_PRESENT) + mr r13, r11 + rlwimi r10, r11, 31, 29, 29 /* extract _PAGE_DIRTY into SW */ + and r12, r11, r10 andi. r10, r11, _PAGE_USER /* Test for _PAGE_USER */ - srwi r10, r12, 1 - or r12, r12, r10 /* Copy user perms into supervisor */ - iseleq r12, 0, r12 - b 2f - - /* is kernel addr */ -1: rlwinm r12, r11, 31, 29, 29 /* Extract _PAGE_HWWRITE into SW */ - ori r12, r12, (MAS3_SX | MAS3_SR) + slwi r10, r12, 1 + or r10, r10, r12 + iseleq r12, r12, r10 + rlwimi r13, r12, 0, 20, 31 /* Get RPN from PTE, merge w/ perms */ + mtspr SPRN_MAS3, r13 +#endif + mfspr r12, SPRN_MAS2 #ifdef CONFIG_PTE_64BIT -2: rlwimi r12, r13, 24, 0, 7 /* grab RPN[32:39] */ - rlwimi r12, r11, 24, 8, 19 /* grab RPN[40:51] */ - mtspr SPRN_MAS3, r12 -BEGIN_FTR_SECTION - srwi r10, r13, 8 /* grab RPN[8:31] */ - mtspr SPRN_MAS7, r10 -END_FTR_SECTION_IFSET(CPU_FTR_BIG_PHYS) + rlwimi r12, r11, 32-19, 27, 31 /* extract WIMGE from pte */ #else -2: rlwimi r11, r12, 0, 20, 31 /* Extract RPN from PTE and merge with perms */ - mtspr SPRN_MAS3, r11 + rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */ +#endif +#ifdef CONFIG_HUGETLB_PAGE + beq 6, 3f /* don't mask if page isn't huge */ + li r13, 1 + slw r13, r13, r14 + subi r13, r13, 1 + rlwinm r13, r13, 0, 0, 19 /* bottom bits used for WIMGE/etc */ + andc r12, r12, r13 /* mask off ea bits within the page */ #endif +3: mtspr SPRN_MAS2, r12 + #ifdef CONFIG_E200 /* Round robin TLB1 entries assignment 
*/ mfspr r12, SPRN_MAS0 @@ -812,25 +816,34 @@ END_FTR_SECTION_IFSET(CPU_FTR_BIG_PHYS) lwz r13, tlbcam_index@l(r13) rlwimi r12, r13, 0, 20, 31 7: - mtspr SPRN_MAS0,r12 + mtspr SPRN_MAS0,r12 #endif /* CONFIG_E200 */ +tlb_write_entry: tlbwe /* Done...restore registers and get out of here. */ - mfspr r11, SPRN_SPRG7R + mfspr r10, SPRN_SPRG_THREAD +#ifdef CONFIG_HUGETLB_PAGE + beq 6, 8f /* skip restore for 4k page faults */ + lwz r14, THREAD_NORMSAVE(4)(r10) + lwz r15, THREAD_NORMSAVE(5)(r10) + lwz r16, THREAD_NORMSAVE(6)(r10) + lwz r17, THREAD_NORMSAVE(7)(r10) +#endif +8: lwz r11, THREAD_NORMSAVE(3)(r10) mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + lwz r13, THREAD_NORMSAVE(2)(r10) + lwz r12, THREAD_NORMSAVE(1)(r10) + lwz r11, THREAD_NORMSAVE(0)(r10) + mfspr r10, SPRN_SPRG_RSCRATCH0 rfi /* Force context change */ #ifdef CONFIG_SPE /* Note that the SPE support is closely modeled after the AltiVec * support. Changes to one are likely to be applicable to the * other! */ -load_up_spe: +_GLOBAL(load_up_spe) /* * Disable SPE for the task which had SPE previously, * and save its SPE registers in its thread_struct. 
@@ -853,46 +866,32 @@ load_up_spe: cmpi 0,r4,0 beq 1f addi r4,r4,THREAD /* want THREAD of last_task_used_spe */ - SAVE_32EVRS(0,r10,r4) - evxor evr10, evr10, evr10 /* clear out evr10 */ + SAVE_32EVRS(0,r10,r4,THREAD_EVR0) + evxor evr10, evr10, evr10 /* clear out evr10 */ evmwumiaa evr10, evr10, evr10 /* evr10 <- ACC = 0 * 0 + ACC */ li r5,THREAD_ACC - evstddx evr10, r4, r5 /* save off accumulator */ + evstddx evr10, r4, r5 /* save off accumulator */ lwz r5,PT_REGS(r4) lwz r4,_MSR-STACK_FRAME_OVERHEAD(r5) lis r10,MSR_SPE@h andc r4,r4,r10 /* disable SPE for previous task */ stw r4,_MSR-STACK_FRAME_OVERHEAD(r5) 1: -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ /* enable use of SPE after return */ oris r9,r9,MSR_SPE@h - mfspr r5,SPRN_SPRG3 /* current task's THREAD (phys) */ + mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ li r4,1 li r10,THREAD_ACC stw r4,THREAD_USED_SPE(r5) evlddx evr4,r10,r5 evmra evr4,evr4 - REST_32EVRS(0,r10,r5) + REST_32EVRS(0,r10,r5,THREAD_EVR0) #ifndef CONFIG_SMP subi r4,r5,THREAD stw r4,last_task_used_spe@l(r3) -#endif /* CONFIG_SMP */ - /* restore registers and return */ -2: REST_4GPRS(3, r11) - lwz r10,_CCR(r11) - REST_GPR(1, r11) - mtcr r10 - lwz r10,_LINK(r11) - mtlr r10 - REST_GPR(10, r11) - mtspr SPRN_SRR1,r9 - mtspr SPRN_SRR0,r12 - REST_GPR(9, r11) - REST_GPR(12, r11) - lwz r11,GPR11(r11) - SYNC - rfi +#endif /* !CONFIG_SMP */ + blr /* * SPE unavailable trap from kernel - print a message, but let @@ -902,49 +901,104 @@ KernelSPE: lwz r3,_MSR(r1) oris r3,r3,MSR_SPE@h stw r3,_MSR(r1) /* enable use of SPE after return */ +#ifdef CONFIG_PRINTK lis r3,87f@h ori r3,r3,87f@l mr r4,r2 /* current */ lwz r5,_NIP(r1) bl printk +#endif b ret_from_except +#ifdef CONFIG_PRINTK 87: .string "SPE used in kernel (task=%p, pc=%x) \n" +#endif .align 4,0 #endif /* CONFIG_SPE */ /* - * Global functions + * Translate the effec addr in r3 to phys addr. 
The phys addr will be put + * into r3(higher 32bit) and r4(lower 32bit) */ +get_phys_addr: + mfmsr r8 + mfspr r9,SPRN_PID + rlwinm r9,r9,16,0x3fff0000 /* turn PID into MAS6[SPID] */ + rlwimi r9,r8,28,0x00000001 /* turn MSR[DS] into MAS6[SAS] */ + mtspr SPRN_MAS6,r9 + + tlbsx 0,r3 /* must succeed */ + + mfspr r8,SPRN_MAS1 + mfspr r12,SPRN_MAS3 + rlwinm r9,r8,25,0x1f /* r9 = log2(page size) */ + li r10,1024 + slw r10,r10,r9 /* r10 = page size */ + addi r10,r10,-1 + and r11,r3,r10 /* r11 = page offset */ + andc r4,r12,r10 /* r4 = page base */ + or r4,r4,r11 /* r4 = devtree phys addr */ +#ifdef CONFIG_PHYS_64BIT + mfspr r3,SPRN_MAS7 +#endif + blr /* - * extern void loadcam_entry(unsigned int index) - * - * Load TLBCAM[index] entry in to the L2 CAM MMU + * Global functions */ -_GLOBAL(loadcam_entry) - lis r4,TLBCAM@ha - addi r4,r4,TLBCAM@l - mulli r5,r3,20 - add r3,r5,r4 - lwz r4,0(r3) - mtspr SPRN_MAS0,r4 - lwz r4,4(r3) - mtspr SPRN_MAS1,r4 - lwz r4,8(r3) - mtspr SPRN_MAS2,r4 - lwz r4,12(r3) - mtspr SPRN_MAS3,r4 - tlbwe - isync + +/* Adjust or setup IVORs for e200 */ +_GLOBAL(__setup_e200_ivors) + li r3,DebugDebug@l + mtspr SPRN_IVOR15,r3 + li r3,SPEUnavailable@l + mtspr SPRN_IVOR32,r3 + li r3,SPEFloatingPointData@l + mtspr SPRN_IVOR33,r3 + li r3,SPEFloatingPointRound@l + mtspr SPRN_IVOR34,r3 + sync blr -/* - * extern void giveup_altivec(struct task_struct *prev) - * - * The e500 core does not have an AltiVec unit. 
- */ -_GLOBAL(giveup_altivec) +/* Adjust or setup IVORs for e500v1/v2 */ +_GLOBAL(__setup_e500_ivors) + li r3,DebugCrit@l + mtspr SPRN_IVOR15,r3 + li r3,SPEUnavailable@l + mtspr SPRN_IVOR32,r3 + li r3,SPEFloatingPointData@l + mtspr SPRN_IVOR33,r3 + li r3,SPEFloatingPointRound@l + mtspr SPRN_IVOR34,r3 + li r3,PerformanceMonitor@l + mtspr SPRN_IVOR35,r3 + sync + blr + +/* Adjust or setup IVORs for e500mc */ +_GLOBAL(__setup_e500mc_ivors) + li r3,DebugDebug@l + mtspr SPRN_IVOR15,r3 + li r3,PerformanceMonitor@l + mtspr SPRN_IVOR35,r3 + li r3,Doorbell@l + mtspr SPRN_IVOR36,r3 + li r3,CriticalDoorbell@l + mtspr SPRN_IVOR37,r3 + sync + blr + +/* setup ehv ivors for */ +_GLOBAL(__setup_ehv_ivors) + li r3,GuestDoorbell@l + mtspr SPRN_IVOR38,r3 + li r3,CriticalGuestDoorbell@l + mtspr SPRN_IVOR39,r3 + li r3,Hypercall@l + mtspr SPRN_IVOR40,r3 + li r3,Ehvpriv@l + mtspr SPRN_IVOR41,r3 + sync blr #ifdef CONFIG_SPE @@ -955,7 +1009,6 @@ _GLOBAL(giveup_altivec) _GLOBAL(giveup_spe) mfmsr r5 oris r5,r5,MSR_SPE@h - SYNC mtmsr r5 /* enable use of SPE now */ isync cmpi 0,r3,0 @@ -963,13 +1016,11 @@ _GLOBAL(giveup_spe) addi r3,r3,THREAD /* want THREAD of task */ lwz r5,PT_REGS(r3) cmpi 0,r5,0 - SAVE_32EVRS(0, r4, r3) - evxor evr6, evr6, evr6 /* clear out evr6 */ + SAVE_32EVRS(0, r4, r3, THREAD_EVR0) + evxor evr6, evr6, evr6 /* clear out evr6 */ evmwumiaa evr6, evr6, evr6 /* evr6 <- ACC = 0 * 0 + ACC */ li r4,THREAD_ACC - evstddx evr6, r4, r3 /* save off accumulator */ - mfspr r6,SPRN_SPEFSCR - stw r6,THREAD_SPEFSCR(r3) /* save spefscr register value */ + evstddx evr6, r4, r3 /* save off accumulator */ beq 1f lwz r4,_MSR-STACK_FRAME_OVERHEAD(r5) lis r3,MSR_SPE@h @@ -980,34 +1031,27 @@ _GLOBAL(giveup_spe) li r5,0 lis r4,last_task_used_spe@ha stw r5,last_task_used_spe@l(r4) -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ blr #endif /* CONFIG_SPE */ /* - * extern void giveup_fpu(struct task_struct *prev) - * - * Not all FSL Book-E cores have an FPU - */ -#ifndef CONFIG_PPC_FPU 
-_GLOBAL(giveup_fpu) - blr -#endif - -/* * extern void abort(void) * * At present, this routine just applies a system reset. */ _GLOBAL(abort) li r13,0 - mtspr SPRN_DBCR0,r13 /* disable all debug events */ + mtspr SPRN_DBCR0,r13 /* disable all debug events */ + isync mfmsr r13 ori r13,r13,MSR_DE@l /* Enable Debug Events */ mtmsr r13 - mfspr r13,SPRN_DBCR0 - lis r13,(DBCR0_IDM|DBCR0_RST_CHIP)@h - mtspr SPRN_DBCR0,r13 + isync + mfspr r13,SPRN_DBCR0 + lis r13,(DBCR0_IDM|DBCR0_RST_CHIP)@h + mtspr SPRN_DBCR0,r13 + isync _GLOBAL(set_context) @@ -1023,6 +1067,255 @@ _GLOBAL(set_context) isync /* Force context change */ blr +_GLOBAL(flush_dcache_L1) + mfspr r3,SPRN_L1CFG0 + + rlwinm r5,r3,9,3 /* Extract cache block size */ + twlgti r5,1 /* Only 32 and 64 byte cache blocks + * are currently defined. + */ + li r4,32 + subfic r6,r5,2 /* r6 = log2(1KiB / cache block size) - + * log2(number of ways) + */ + slw r5,r4,r5 /* r5 = cache block size */ + + rlwinm r7,r3,0,0xff /* Extract number of KiB in the cache */ + mulli r7,r7,13 /* An 8-way cache will require 13 + * loads per set. + */ + slw r7,r7,r6 + + /* save off HID0 and set DCFA */ + mfspr r8,SPRN_HID0 + ori r9,r8,HID0_DCFA@l + mtspr SPRN_HID0,r9 + isync + + lis r4,KERNELBASE@h + mtctr r7 + +1: lwz r3,0(r4) /* Load... */ + add r4,r4,r5 + bdnz 1b + + msync + lis r4,KERNELBASE@h + mtctr r7 + +1: dcbf 0,r4 /* ...and flush. */ + add r4,r4,r5 + bdnz 1b + + /* restore HID0 */ + mtspr SPRN_HID0,r8 + isync + + blr + +/* Flush L1 d-cache, invalidate and disable d-cache and i-cache */ +_GLOBAL(__flush_disable_L1) + mflr r10 + bl flush_dcache_L1 /* Flush L1 d-cache */ + mtlr r10 + + mfspr r4, SPRN_L1CSR0 /* Invalidate and disable d-cache */ + li r5, 2 + rlwimi r4, r5, 0, 3 + + msync + isync + mtspr SPRN_L1CSR0, r4 + isync + +1: mfspr r4, SPRN_L1CSR0 /* Wait for the invalidate to finish */ + andi. 
r4, r4, 2 + bne 1b + + mfspr r4, SPRN_L1CSR1 /* Invalidate and disable i-cache */ + li r5, 2 + rlwimi r4, r5, 0, 3 + + mtspr SPRN_L1CSR1, r4 + isync + + blr + +#ifdef CONFIG_SMP +/* When we get here, r24 needs to hold the CPU # */ + .globl __secondary_start +__secondary_start: + LOAD_REG_ADDR_PIC(r3, tlbcam_index) + lwz r3,0(r3) + mtctr r3 + li r26,0 /* r26 safe? */ + + bl switch_to_as1 + mr r27,r3 /* tlb entry */ + /* Load each CAM entry */ +1: mr r3,r26 + bl loadcam_entry + addi r26,r26,1 + bdnz 1b + mr r3,r27 /* tlb entry */ + LOAD_REG_ADDR_PIC(r4, memstart_addr) + lwz r4,0(r4) + mr r5,r25 /* phys kernel start */ + rlwinm r5,r5,0,~0x3ffffff /* aligned 64M */ + subf r4,r5,r4 /* memstart_addr - phys kernel start */ + li r5,0 /* no device tree */ + li r6,0 /* not boot cpu */ + bl restore_to_as0 + + + lis r3,__secondary_hold_acknowledge@h + ori r3,r3,__secondary_hold_acknowledge@l + stw r24,0(r3) + + li r3,0 + mr r4,r24 /* Why? */ + bl call_setup_cpu + + /* get current_thread_info and current */ + lis r1,secondary_ti@ha + lwz r1,secondary_ti@l(r1) + lwz r2,TI_TASK(r1) + + /* stack */ + addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD + li r0,0 + stw r0,0(r1) + + /* ptr to current thread */ + addi r4,r2,THREAD /* address of our thread_struct */ + mtspr SPRN_SPRG_THREAD,r4 + + /* Setup the defaults for TLB entries */ + li r4,(MAS4_TSIZED(BOOK3E_PAGESZ_4K))@l + mtspr SPRN_MAS4,r4 + + /* Jump to start_secondary */ + lis r4,MSR_KERNEL@h + ori r4,r4,MSR_KERNEL@l + lis r3,start_secondary@h + ori r3,r3,start_secondary@l + mtspr SPRN_SRR0,r3 + mtspr SPRN_SRR1,r4 + sync + rfi + sync + + .globl __secondary_hold_acknowledge +__secondary_hold_acknowledge: + .long -1 +#endif + +/* + * Create a tlb entry with the same effective and physical address as + * the tlb entry used by the current running code. But set the TS to 1. + * Then switch to the address space 1. It will return with the r3 set to + * the ESEL of the new created tlb. 
+ */ +_GLOBAL(switch_to_as1) + mflr r5 + + /* Find a entry not used */ + mfspr r3,SPRN_TLB1CFG + andi. r3,r3,0xfff + mfspr r4,SPRN_PID + rlwinm r4,r4,16,0x3fff0000 /* turn PID into MAS6[SPID] */ + mtspr SPRN_MAS6,r4 +1: lis r4,0x1000 /* Set MAS0(TLBSEL) = 1 */ + addi r3,r3,-1 + rlwimi r4,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ + mtspr SPRN_MAS0,r4 + tlbre + mfspr r4,SPRN_MAS1 + andis. r4,r4,MAS1_VALID@h + bne 1b + + /* Get the tlb entry used by the current running code */ + bl 0f +0: mflr r4 + tlbsx 0,r4 + + mfspr r4,SPRN_MAS1 + ori r4,r4,MAS1_TS /* Set the TS = 1 */ + mtspr SPRN_MAS1,r4 + + mfspr r4,SPRN_MAS0 + rlwinm r4,r4,0,~MAS0_ESEL_MASK + rlwimi r4,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ + mtspr SPRN_MAS0,r4 + tlbwe + isync + sync + + mfmsr r4 + ori r4,r4,MSR_IS | MSR_DS + mtspr SPRN_SRR0,r5 + mtspr SPRN_SRR1,r4 + sync + rfi + +/* + * Restore to the address space 0 and also invalidate the tlb entry created + * by switch_to_as1. + * r3 - the tlb entry which should be invalidated + * r4 - __pa(PAGE_OFFSET in AS1) - __pa(PAGE_OFFSET in AS0) + * r5 - device tree virtual address. If r4 is 0, r5 is ignored. + * r6 - boot cpu +*/ +_GLOBAL(restore_to_as0) + mflr r0 + + bl 0f +0: mflr r9 + addi r9,r9,1f - 0b + + /* + * We may map the PAGE_OFFSET in AS0 to a different physical address, + * so we need calculate the right jump and device tree address based + * on the offset passed by r4. 
+ */ + add r9,r9,r4 + add r5,r5,r4 + add r0,r0,r4 + +2: mfmsr r7 + li r8,(MSR_IS | MSR_DS) + andc r7,r7,r8 + + mtspr SPRN_SRR0,r9 + mtspr SPRN_SRR1,r7 + sync + rfi + + /* Invalidate the temporary tlb entry for AS1 */ +1: lis r9,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r9,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ + mtspr SPRN_MAS0,r9 + tlbre + mfspr r9,SPRN_MAS1 + rlwinm r9,r9,0,2,31 /* Clear MAS1 Valid and IPPROT */ + mtspr SPRN_MAS1,r9 + tlbwe + isync + + cmpwi r4,0 + cmpwi cr1,r6,0 + cror eq,4*cr1+eq,eq + bne 3f /* offset != 0 && is_boot_cpu */ + mtlr r0 + blr + + /* + * The PAGE_OFFSET will map to a different physical address, + * jump to _start to do another relocation again. + */ +3: mr r3,r5 + bl _start + /* * We put a few things here that have to be page-aligned. This stuff * goes at the beginning of the data segment, which is page-aligned. @@ -1036,24 +1329,7 @@ empty_zero_page: .space 4096 .globl swapper_pg_dir swapper_pg_dir: - .space 4096 - -/* Reserved 4k for the critical exception stack & 4k for the machine - * check stack per CPU for kernel mode exceptions */ - .section .bss - .align 12 -exception_stack_bottom: - .space BOOKE_EXCEPTION_STACK_SIZE * NR_CPUS - .globl exception_stack_top -exception_stack_top: - -/* - * This space gets a copy of optional info passed to us by the bootstrap - * which is used to pass parameters into the kernel like root=/dev/sda1, etc. - */ - .globl cmd_line -cmd_line: - .space 512 + .space PGD_TABLE_SIZE /* * Room for two PTE pointers, usually the kernel and current user pointers diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c new file mode 100644 index 00000000000..0bb5918faaa --- /dev/null +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -0,0 +1,366 @@ +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. 
Derived from + * "arch/x86/kernel/hw_breakpoint.c" + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright 2010 IBM Corporation + * Author: K.Prasad <prasad@linux.vnet.ibm.com> + * + */ + +#include <linux/hw_breakpoint.h> +#include <linux/notifier.h> +#include <linux/kprobes.h> +#include <linux/percpu.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/smp.h> + +#include <asm/hw_breakpoint.h> +#include <asm/processor.h> +#include <asm/sstep.h> +#include <asm/uaccess.h> + +/* + * Stores the breakpoints currently in use on each breakpoint address + * register for every cpu + */ +static DEFINE_PER_CPU(struct perf_event *, bp_per_reg); + +/* + * Returns total number of data or instruction breakpoints available. + */ +int hw_breakpoint_slots(int type) +{ + if (type == TYPE_DATA) + return HBP_NUM; + return 0; /* no instruction breakpoints available */ +} + +/* + * Install a perf counter breakpoint. + * + * We seek a free debug address register and use it for this + * breakpoint. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. 
+ */ +int arch_install_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + struct perf_event **slot = &__get_cpu_var(bp_per_reg); + + *slot = bp; + + /* + * Do not install DABR values if the instruction must be single-stepped. + * If so, DABR will be populated in single_step_dabr_instruction(). + */ + if (current->thread.last_hit_ubp != bp) + __set_breakpoint(info); + + return 0; +} + +/* + * Uninstall the breakpoint contained in the given counter. + * + * First we search the debug address register it uses and then we disable + * it. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. + */ +void arch_uninstall_hw_breakpoint(struct perf_event *bp) +{ + struct perf_event **slot = &__get_cpu_var(bp_per_reg); + + if (*slot != bp) { + WARN_ONCE(1, "Can't find the breakpoint"); + return; + } + + *slot = NULL; + hw_breakpoint_disable(); +} + +/* + * Perform cleanup of arch-specific counters during unregistration + * of the perf-event + */ +void arch_unregister_hw_breakpoint(struct perf_event *bp) +{ + /* + * If the breakpoint is unregistered between a hw_breakpoint_handler() + * and the single_step_dabr_instruction(), then cleanup the breakpoint + * restoration variables to prevent dangling pointers. + */ + if (bp->ctx && bp->ctx->task) + bp->ctx->task->thread.last_hit_ubp = NULL; +} + +/* + * Check for virtual address in kernel space. 
+ */ +int arch_check_bp_in_kernelspace(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + + return is_kernel_addr(info->address); +} + +int arch_bp_generic_fields(int type, int *gen_bp_type) +{ + *gen_bp_type = 0; + if (type & HW_BRK_TYPE_READ) + *gen_bp_type |= HW_BREAKPOINT_R; + if (type & HW_BRK_TYPE_WRITE) + *gen_bp_type |= HW_BREAKPOINT_W; + if (*gen_bp_type == 0) + return -EINVAL; + return 0; +} + +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int arch_validate_hwbkpt_settings(struct perf_event *bp) +{ + int ret = -EINVAL, length_max; + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + + if (!bp) + return ret; + + info->type = HW_BRK_TYPE_TRANSLATE; + if (bp->attr.bp_type & HW_BREAKPOINT_R) + info->type |= HW_BRK_TYPE_READ; + if (bp->attr.bp_type & HW_BREAKPOINT_W) + info->type |= HW_BRK_TYPE_WRITE; + if (info->type == HW_BRK_TYPE_TRANSLATE) + /* must set alteast read or write */ + return ret; + if (!(bp->attr.exclude_user)) + info->type |= HW_BRK_TYPE_USER; + if (!(bp->attr.exclude_kernel)) + info->type |= HW_BRK_TYPE_KERNEL; + if (!(bp->attr.exclude_hv)) + info->type |= HW_BRK_TYPE_HYP; + info->address = bp->attr.bp_addr; + info->len = bp->attr.bp_len; + + /* + * Since breakpoint length can be a maximum of HW_BREAKPOINT_LEN(8) + * and breakpoint addresses are aligned to nearest double-word + * HW_BREAKPOINT_ALIGN by rounding off to the lower address, the + * 'symbolsize' should satisfy the check below. + */ + length_max = 8; /* DABR */ + if (cpu_has_feature(CPU_FTR_DAWR)) { + length_max = 512 ; /* 64 doublewords */ + /* DAWR region can't cross 512 boundary */ + if ((bp->attr.bp_addr >> 10) != + ((bp->attr.bp_addr + bp->attr.bp_len - 1) >> 10)) + return -EINVAL; + } + if (info->len > + (length_max - (info->address & HW_BREAKPOINT_ALIGN))) + return -EINVAL; + return 0; +} + +/* + * Restores the breakpoint on the debug registers. 
+ * Invoke this function if it is known that the execution context is + * about to change to cause loss of MSR_SE settings. + */ +void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs) +{ + struct arch_hw_breakpoint *info; + + if (likely(!tsk->thread.last_hit_ubp)) + return; + + info = counter_arch_bp(tsk->thread.last_hit_ubp); + regs->msr &= ~MSR_SE; + __set_breakpoint(info); + tsk->thread.last_hit_ubp = NULL; +} + +/* + * Handle debug exception notifications. + */ +int __kprobes hw_breakpoint_handler(struct die_args *args) +{ + int rc = NOTIFY_STOP; + struct perf_event *bp; + struct pt_regs *regs = args->regs; + int stepped = 1; + struct arch_hw_breakpoint *info; + unsigned int instr; + unsigned long dar = regs->dar; + + /* Disable breakpoints during exception handling */ + hw_breakpoint_disable(); + + /* + * The counter may be concurrently released but that can only + * occur from a call_rcu() path. We can then safely fetch + * the breakpoint, use its callback, touch its counter + * while we are in an rcu_read_lock() path. + */ + rcu_read_lock(); + + bp = __get_cpu_var(bp_per_reg); + if (!bp) + goto out; + info = counter_arch_bp(bp); + + /* + * Return early after invoking user-callback function without restoring + * DABR if the breakpoint is from ptrace which always operates in + * one-shot mode. The ptrace-ed process will receive the SIGTRAP signal + * generated in do_dabr(). + */ + if (bp->overflow_handler == ptrace_triggered) { + perf_bp_event(bp, regs); + rc = NOTIFY_DONE; + goto out; + } + + /* + * Verify if dar lies within the address range occupied by the symbol + * being watched to filter extraneous exceptions. If it doesn't, + * we still need to single-step the instruction, but we don't + * generate an event. 
+ */ + info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ; + if (!((bp->attr.bp_addr <= dar) && + (dar - bp->attr.bp_addr < bp->attr.bp_len))) + info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; + + /* Do not emulate user-space instructions, instead single-step them */ + if (user_mode(regs)) { + current->thread.last_hit_ubp = bp; + regs->msr |= MSR_SE; + goto out; + } + + stepped = 0; + instr = 0; + if (!__get_user_inatomic(instr, (unsigned int *) regs->nip)) + stepped = emulate_step(regs, instr); + + /* + * emulate_step() could not execute it. We've failed in reliably + * handling the hw-breakpoint. Unregister it and throw a warning + * message to let the user know about it. + */ + if (!stepped) { + WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " + "0x%lx will be disabled.", info->address); + perf_event_disable(bp); + goto out; + } + /* + * As a policy, the callback is invoked in a 'trigger-after-execute' + * fashion + */ + if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ)) + perf_bp_event(bp, regs); + + __set_breakpoint(info); +out: + rcu_read_unlock(); + return rc; +} + +/* + * Handle single-step exceptions following a DABR hit. + */ +int __kprobes single_step_dabr_instruction(struct die_args *args) +{ + struct pt_regs *regs = args->regs; + struct perf_event *bp = NULL; + struct arch_hw_breakpoint *info; + + bp = current->thread.last_hit_ubp; + /* + * Check if we are single-stepping as a result of a + * previous HW Breakpoint exception + */ + if (!bp) + return NOTIFY_DONE; + + info = counter_arch_bp(bp); + + /* + * We shall invoke the user-defined callback function in the single + * stepping handler to confirm to 'trigger-after-execute' semantics + */ + if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ)) + perf_bp_event(bp, regs); + + __set_breakpoint(info); + current->thread.last_hit_ubp = NULL; + + /* + * If the process was being single-stepped by ptrace, let the + * other single-step actions occur (e.g. generate SIGTRAP). 
+ */ + if (test_thread_flag(TIF_SINGLESTEP)) + return NOTIFY_DONE; + + return NOTIFY_STOP; +} + +/* + * Handle debug exception notifications. + */ +int __kprobes hw_breakpoint_exceptions_notify( + struct notifier_block *unused, unsigned long val, void *data) +{ + int ret = NOTIFY_DONE; + + switch (val) { + case DIE_DABR_MATCH: + ret = hw_breakpoint_handler(data); + break; + case DIE_SSTEP: + ret = single_step_dabr_instruction(data); + break; + } + + return ret; +} + +/* + * Release the user breakpoints used by ptrace + */ +void flush_ptrace_hw_breakpoint(struct task_struct *tsk) +{ + struct thread_struct *t = &tsk->thread; + + unregister_hw_breakpoint(t->ptrace_bps[0]); + t->ptrace_bps[0] = NULL; +} + +void hw_breakpoint_pmu_read(struct perf_event *bp) +{ + /* TODO */ +} diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c new file mode 100644 index 00000000000..1114d13ac19 --- /dev/null +++ b/arch/powerpc/kernel/ibmebus.c @@ -0,0 +1,759 @@ +/* + * IBM PowerPC IBM eBus Infrastructure Support. + * + * Copyright (c) 2005 IBM Corporation + * Joachim Fenkes <fenkes@de.ibm.com> + * Heiko J Schick <schickhj@de.ibm.com> + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/init.h> +#include <linux/export.h> +#include <linux/console.h> +#include <linux/kobject.h> +#include <linux/dma-mapping.h> +#include <linux/interrupt.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/of_platform.h> +#include <asm/ibmebus.h> + +static struct device ibmebus_bus_device = { /* fake "parent" device */ + .init_name = "ibmebus", +}; + +struct bus_type ibmebus_bus_type; + +/* These devices will automatically be added to the bus during init */ +static struct of_device_id __initdata ibmebus_matches[] = { + { .compatible = "IBM,lhca" }, + { .compatible = "IBM,lhea" }, + {}, +}; + +static void *ibmebus_alloc_coherent(struct device *dev, + size_t size, + dma_addr_t *dma_handle, + gfp_t flag, + struct dma_attrs *attrs) +{ + void *mem; + + mem = kmalloc(size, flag); + *dma_handle = (dma_addr_t)mem; + + return mem; +} + +static void ibmebus_free_coherent(struct device *dev, + size_t size, void *vaddr, + dma_addr_t dma_handle, + struct dma_attrs *attrs) +{ + kfree(vaddr); +} + +static dma_addr_t ibmebus_map_page(struct device *dev, + struct page *page, + unsigned long offset, + size_t size, + enum 
dma_data_direction direction, + struct dma_attrs *attrs) +{ + return (dma_addr_t)(page_address(page) + offset); +} + +static void ibmebus_unmap_page(struct device *dev, + dma_addr_t dma_addr, + size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + return; +} + +static int ibmebus_map_sg(struct device *dev, + struct scatterlist *sgl, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) { + sg->dma_address = (dma_addr_t) sg_virt(sg); + sg->dma_length = sg->length; + } + + return nents; +} + +static void ibmebus_unmap_sg(struct device *dev, + struct scatterlist *sg, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + return; +} + +static int ibmebus_dma_supported(struct device *dev, u64 mask) +{ + return mask == DMA_BIT_MASK(64); +} + +static u64 ibmebus_dma_get_required_mask(struct device *dev) +{ + return DMA_BIT_MASK(64); +} + +static struct dma_map_ops ibmebus_dma_ops = { + .alloc = ibmebus_alloc_coherent, + .free = ibmebus_free_coherent, + .map_sg = ibmebus_map_sg, + .unmap_sg = ibmebus_unmap_sg, + .dma_supported = ibmebus_dma_supported, + .get_required_mask = ibmebus_dma_get_required_mask, + .map_page = ibmebus_map_page, + .unmap_page = ibmebus_unmap_page, +}; + +static int ibmebus_match_path(struct device *dev, void *data) +{ + struct device_node *dn = to_platform_device(dev)->dev.of_node; + return (dn->full_name && + (strcasecmp((char *)data, dn->full_name) == 0)); +} + +static int ibmebus_match_node(struct device *dev, void *data) +{ + return to_platform_device(dev)->dev.of_node == data; +} + +static int ibmebus_create_device(struct device_node *dn) +{ + struct platform_device *dev; + int ret; + + dev = of_device_alloc(dn, NULL, &ibmebus_bus_device); + if (!dev) + return -ENOMEM; + + dev->dev.bus = &ibmebus_bus_type; + dev->dev.archdata.dma_ops = &ibmebus_dma_ops; + + ret = of_device_add(dev); + if (ret) + 
platform_device_put(dev); + return ret; +} + +static int ibmebus_create_devices(const struct of_device_id *matches) +{ + struct device_node *root, *child; + int ret = 0; + + root = of_find_node_by_path("/"); + + for_each_child_of_node(root, child) { + if (!of_match_node(matches, child)) + continue; + + if (bus_find_device(&ibmebus_bus_type, NULL, child, + ibmebus_match_node)) + continue; + + ret = ibmebus_create_device(child); + if (ret) { + printk(KERN_ERR "%s: failed to create device (%i)", + __func__, ret); + of_node_put(child); + break; + } + } + + of_node_put(root); + return ret; +} + +int ibmebus_register_driver(struct platform_driver *drv) +{ + /* If the driver uses devices that ibmebus doesn't know, add them */ + ibmebus_create_devices(drv->driver.of_match_table); + + drv->driver.bus = &ibmebus_bus_type; + return driver_register(&drv->driver); +} +EXPORT_SYMBOL(ibmebus_register_driver); + +void ibmebus_unregister_driver(struct platform_driver *drv) +{ + driver_unregister(&drv->driver); +} +EXPORT_SYMBOL(ibmebus_unregister_driver); + +int ibmebus_request_irq(u32 ist, irq_handler_t handler, + unsigned long irq_flags, const char *devname, + void *dev_id) +{ + unsigned int irq = irq_create_mapping(NULL, ist); + + if (irq == NO_IRQ) + return -EINVAL; + + return request_irq(irq, handler, irq_flags, devname, dev_id); +} +EXPORT_SYMBOL(ibmebus_request_irq); + +void ibmebus_free_irq(u32 ist, void *dev_id) +{ + unsigned int irq = irq_find_mapping(NULL, ist); + + free_irq(irq, dev_id); + irq_dispose_mapping(irq); +} +EXPORT_SYMBOL(ibmebus_free_irq); + +static char *ibmebus_chomp(const char *in, size_t count) +{ + char *out = kmalloc(count + 1, GFP_KERNEL); + + if (!out) + return NULL; + + memcpy(out, in, count); + out[count] = '\0'; + if (out[count - 1] == '\n') + out[count - 1] = '\0'; + + return out; +} + +static ssize_t ibmebus_store_probe(struct bus_type *bus, + const char *buf, size_t count) +{ + struct device_node *dn = NULL; + char *path; + ssize_t rc = 0; + + 
path = ibmebus_chomp(buf, count); + if (!path) + return -ENOMEM; + + if (bus_find_device(&ibmebus_bus_type, NULL, path, + ibmebus_match_path)) { + printk(KERN_WARNING "%s: %s has already been probed\n", + __func__, path); + rc = -EEXIST; + goto out; + } + + if ((dn = of_find_node_by_path(path))) { + rc = ibmebus_create_device(dn); + of_node_put(dn); + } else { + printk(KERN_WARNING "%s: no such device node: %s\n", + __func__, path); + rc = -ENODEV; + } + +out: + kfree(path); + if (rc) + return rc; + return count; +} +static BUS_ATTR(probe, S_IWUSR, NULL, ibmebus_store_probe); + +static ssize_t ibmebus_store_remove(struct bus_type *bus, + const char *buf, size_t count) +{ + struct device *dev; + char *path; + + path = ibmebus_chomp(buf, count); + if (!path) + return -ENOMEM; + + if ((dev = bus_find_device(&ibmebus_bus_type, NULL, path, + ibmebus_match_path))) { + of_device_unregister(to_platform_device(dev)); + + kfree(path); + return count; + } else { + printk(KERN_WARNING "%s: %s not on the bus\n", + __func__, path); + + kfree(path); + return -ENODEV; + } +} +static BUS_ATTR(remove, S_IWUSR, NULL, ibmebus_store_remove); + +static struct attribute *ibmbus_bus_attrs[] = { + &bus_attr_probe.attr, + &bus_attr_remove.attr, + NULL, +}; +ATTRIBUTE_GROUPS(ibmbus_bus); + +static int ibmebus_bus_bus_match(struct device *dev, struct device_driver *drv) +{ + const struct of_device_id *matches = drv->of_match_table; + + if (!matches) + return 0; + + return of_match_device(matches, dev) != NULL; +} + +static int ibmebus_bus_device_probe(struct device *dev) +{ + int error = -ENODEV; + struct platform_driver *drv; + struct platform_device *of_dev; + + drv = to_platform_driver(dev->driver); + of_dev = to_platform_device(dev); + + if (!drv->probe) + return error; + + of_dev_get(of_dev); + + if (of_driver_match_device(dev, dev->driver)) + error = drv->probe(of_dev); + if (error) + of_dev_put(of_dev); + + return error; +} + +static int ibmebus_bus_device_remove(struct device *dev) +{ 
+ struct platform_device *of_dev = to_platform_device(dev); + struct platform_driver *drv = to_platform_driver(dev->driver); + + if (dev->driver && drv->remove) + drv->remove(of_dev); + return 0; +} + +static void ibmebus_bus_device_shutdown(struct device *dev) +{ + struct platform_device *of_dev = to_platform_device(dev); + struct platform_driver *drv = to_platform_driver(dev->driver); + + if (dev->driver && drv->shutdown) + drv->shutdown(of_dev); +} + +/* + * ibmebus_bus_device_attrs + */ +static ssize_t devspec_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct platform_device *ofdev; + + ofdev = to_platform_device(dev); + return sprintf(buf, "%s\n", ofdev->dev.of_node->full_name); +} + +static ssize_t name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct platform_device *ofdev; + + ofdev = to_platform_device(dev); + return sprintf(buf, "%s\n", ofdev->dev.of_node->name); +} + +static ssize_t modalias_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t len = of_device_get_modalias(dev, buf, PAGE_SIZE - 2); + buf[len] = '\n'; + buf[len+1] = 0; + return len+1; +} + +struct device_attribute ibmebus_bus_device_attrs[] = { + __ATTR_RO(devspec), + __ATTR_RO(name), + __ATTR_RO(modalias), + __ATTR_NULL +}; + +#ifdef CONFIG_PM_SLEEP +static int ibmebus_bus_legacy_suspend(struct device *dev, pm_message_t mesg) +{ + struct platform_device *of_dev = to_platform_device(dev); + struct platform_driver *drv = to_platform_driver(dev->driver); + int ret = 0; + + if (dev->driver && drv->suspend) + ret = drv->suspend(of_dev, mesg); + return ret; +} + +static int ibmebus_bus_legacy_resume(struct device *dev) +{ + struct platform_device *of_dev = to_platform_device(dev); + struct platform_driver *drv = to_platform_driver(dev->driver); + int ret = 0; + + if (dev->driver && drv->resume) + ret = drv->resume(of_dev); + return ret; +} + +static int ibmebus_bus_pm_prepare(struct device *dev) +{ + 
struct device_driver *drv = dev->driver; + int ret = 0; + + if (drv && drv->pm && drv->pm->prepare) + ret = drv->pm->prepare(dev); + + return ret; +} + +static void ibmebus_bus_pm_complete(struct device *dev) +{ + struct device_driver *drv = dev->driver; + + if (drv && drv->pm && drv->pm->complete) + drv->pm->complete(dev); +} + +#ifdef CONFIG_SUSPEND + +static int ibmebus_bus_pm_suspend(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (!drv) + return 0; + + if (drv->pm) { + if (drv->pm->suspend) + ret = drv->pm->suspend(dev); + } else { + ret = ibmebus_bus_legacy_suspend(dev, PMSG_SUSPEND); + } + + return ret; +} + +static int ibmebus_bus_pm_suspend_noirq(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (!drv) + return 0; + + if (drv->pm) { + if (drv->pm->suspend_noirq) + ret = drv->pm->suspend_noirq(dev); + } + + return ret; +} + +static int ibmebus_bus_pm_resume(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (!drv) + return 0; + + if (drv->pm) { + if (drv->pm->resume) + ret = drv->pm->resume(dev); + } else { + ret = ibmebus_bus_legacy_resume(dev); + } + + return ret; +} + +static int ibmebus_bus_pm_resume_noirq(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (!drv) + return 0; + + if (drv->pm) { + if (drv->pm->resume_noirq) + ret = drv->pm->resume_noirq(dev); + } + + return ret; +} + +#else /* !CONFIG_SUSPEND */ + +#define ibmebus_bus_pm_suspend NULL +#define ibmebus_bus_pm_resume NULL +#define ibmebus_bus_pm_suspend_noirq NULL +#define ibmebus_bus_pm_resume_noirq NULL + +#endif /* !CONFIG_SUSPEND */ + +#ifdef CONFIG_HIBERNATE_CALLBACKS + +static int ibmebus_bus_pm_freeze(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (!drv) + return 0; + + if (drv->pm) { + if (drv->pm->freeze) + ret = drv->pm->freeze(dev); + } else { + ret = ibmebus_bus_legacy_suspend(dev, 
PMSG_FREEZE); + } + + return ret; +} + +static int ibmebus_bus_pm_freeze_noirq(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (!drv) + return 0; + + if (drv->pm) { + if (drv->pm->freeze_noirq) + ret = drv->pm->freeze_noirq(dev); + } + + return ret; +} + +static int ibmebus_bus_pm_thaw(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (!drv) + return 0; + + if (drv->pm) { + if (drv->pm->thaw) + ret = drv->pm->thaw(dev); + } else { + ret = ibmebus_bus_legacy_resume(dev); + } + + return ret; +} + +static int ibmebus_bus_pm_thaw_noirq(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (!drv) + return 0; + + if (drv->pm) { + if (drv->pm->thaw_noirq) + ret = drv->pm->thaw_noirq(dev); + } + + return ret; +} + +static int ibmebus_bus_pm_poweroff(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (!drv) + return 0; + + if (drv->pm) { + if (drv->pm->poweroff) + ret = drv->pm->poweroff(dev); + } else { + ret = ibmebus_bus_legacy_suspend(dev, PMSG_HIBERNATE); + } + + return ret; +} + +static int ibmebus_bus_pm_poweroff_noirq(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (!drv) + return 0; + + if (drv->pm) { + if (drv->pm->poweroff_noirq) + ret = drv->pm->poweroff_noirq(dev); + } + + return ret; +} + +static int ibmebus_bus_pm_restore(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (!drv) + return 0; + + if (drv->pm) { + if (drv->pm->restore) + ret = drv->pm->restore(dev); + } else { + ret = ibmebus_bus_legacy_resume(dev); + } + + return ret; +} + +static int ibmebus_bus_pm_restore_noirq(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (!drv) + return 0; + + if (drv->pm) { + if (drv->pm->restore_noirq) + ret = drv->pm->restore_noirq(dev); + } + + return ret; +} + +#else /* !CONFIG_HIBERNATE_CALLBACKS */ + 
+#define ibmebus_bus_pm_freeze NULL +#define ibmebus_bus_pm_thaw NULL +#define ibmebus_bus_pm_poweroff NULL +#define ibmebus_bus_pm_restore NULL +#define ibmebus_bus_pm_freeze_noirq NULL +#define ibmebus_bus_pm_thaw_noirq NULL +#define ibmebus_bus_pm_poweroff_noirq NULL +#define ibmebus_bus_pm_restore_noirq NULL + +#endif /* !CONFIG_HIBERNATE_CALLBACKS */ + +static struct dev_pm_ops ibmebus_bus_dev_pm_ops = { + .prepare = ibmebus_bus_pm_prepare, + .complete = ibmebus_bus_pm_complete, + .suspend = ibmebus_bus_pm_suspend, + .resume = ibmebus_bus_pm_resume, + .freeze = ibmebus_bus_pm_freeze, + .thaw = ibmebus_bus_pm_thaw, + .poweroff = ibmebus_bus_pm_poweroff, + .restore = ibmebus_bus_pm_restore, + .suspend_noirq = ibmebus_bus_pm_suspend_noirq, + .resume_noirq = ibmebus_bus_pm_resume_noirq, + .freeze_noirq = ibmebus_bus_pm_freeze_noirq, + .thaw_noirq = ibmebus_bus_pm_thaw_noirq, + .poweroff_noirq = ibmebus_bus_pm_poweroff_noirq, + .restore_noirq = ibmebus_bus_pm_restore_noirq, +}; + +#define IBMEBUS_BUS_PM_OPS_PTR (&ibmebus_bus_dev_pm_ops) + +#else /* !CONFIG_PM_SLEEP */ + +#define IBMEBUS_BUS_PM_OPS_PTR NULL + +#endif /* !CONFIG_PM_SLEEP */ + +struct bus_type ibmebus_bus_type = { + .name = "ibmebus", + .uevent = of_device_uevent_modalias, + .bus_groups = ibmbus_bus_groups, + .match = ibmebus_bus_bus_match, + .probe = ibmebus_bus_device_probe, + .remove = ibmebus_bus_device_remove, + .shutdown = ibmebus_bus_device_shutdown, + .dev_attrs = ibmebus_bus_device_attrs, + .pm = IBMEBUS_BUS_PM_OPS_PTR, +}; +EXPORT_SYMBOL(ibmebus_bus_type); + +static int __init ibmebus_bus_init(void) +{ + int err; + + printk(KERN_INFO "IBM eBus Device Driver\n"); + + err = bus_register(&ibmebus_bus_type); + if (err) { + printk(KERN_ERR "%s: failed to register IBM eBus.\n", + __func__); + return err; + } + + err = device_register(&ibmebus_bus_device); + if (err) { + printk(KERN_WARNING "%s: device_register returned %i\n", + __func__, err); + bus_unregister(&ibmebus_bus_type); + + return err; + 
} + + err = ibmebus_create_devices(ibmebus_matches); + if (err) { + device_unregister(&ibmebus_bus_device); + bus_unregister(&ibmebus_bus_type); + return err; + } + + return 0; +} +postcore_initcall(ibmebus_bus_init); diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c new file mode 100644 index 00000000000..d7216c9abda --- /dev/null +++ b/arch/powerpc/kernel/idle.c @@ -0,0 +1,115 @@ +/* + * Idle daemon for PowerPC. Idle daemon will handle any action + * that needs to be taken when the system becomes idle. + * + * Originally written by Cort Dougan (cort@cs.nmt.edu). + * Subsequent 32-bit hacking by Tom Rini, Armin Kuster, + * Paul Mackerras and others. + * + * iSeries supported added by Mike Corrigan <mikejc@us.ibm.com> + * + * Additional shared processor, SMT, and firmware support + * Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com> + * + * 32-bit and 64-bit versions merged by Paul Mackerras <paulus@samba.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/cpu.h> +#include <linux/sysctl.h> +#include <linux/tick.h> + +#include <asm/processor.h> +#include <asm/cputable.h> +#include <asm/time.h> +#include <asm/machdep.h> +#include <asm/runlatch.h> +#include <asm/smp.h> + + +unsigned long cpuidle_disable = IDLE_NO_OVERRIDE; +EXPORT_SYMBOL(cpuidle_disable); + +static int __init powersave_off(char *arg) +{ + ppc_md.power_save = NULL; + cpuidle_disable = IDLE_POWERSAVE_OFF; + return 0; +} +__setup("powersave=off", powersave_off); + +#ifdef CONFIG_HOTPLUG_CPU +void arch_cpu_idle_dead(void) +{ + sched_preempt_enable_no_resched(); + cpu_die(); +} +#endif + +void arch_cpu_idle(void) +{ + ppc64_runlatch_off(); + + if (ppc_md.power_save) { + ppc_md.power_save(); + /* + * Some power_save functions return with + * interrupts enabled, some don't. + */ + if (irqs_disabled()) + local_irq_enable(); + } else { + local_irq_enable(); + /* + * Go into low thread priority and possibly + * low power mode. + */ + HMT_low(); + HMT_very_low(); + } + + HMT_medium(); + ppc64_runlatch_on(); +} + +int powersave_nap; + +#ifdef CONFIG_SYSCTL +/* + * Register the sysctl to set/clear powersave_nap. 
+ */ +static struct ctl_table powersave_nap_ctl_table[] = { + { + .procname = "powersave-nap", + .data = &powersave_nap, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; +static struct ctl_table powersave_nap_sysctl_root[] = { + { + .procname = "kernel", + .mode = 0555, + .child = powersave_nap_ctl_table, + }, + {} +}; + +static int __init +register_powersave_nap_sysctl(void) +{ + register_sysctl_table(powersave_nap_sysctl_root); + + return 0; +} +__initcall(register_powersave_nap_sysctl); +#endif diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S index 444fdcc769f..1686916cc7f 100644 --- a/arch/powerpc/kernel/idle_6xx.S +++ b/arch/powerpc/kernel/idle_6xx.S @@ -13,7 +13,6 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/config.h> #include <linux/threads.h> #include <asm/reg.h> #include <asm/page.h> @@ -22,8 +21,6 @@ #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> -#undef DEBUG - .text /* @@ -87,19 +84,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) cmpwi 0,r3,0 beqlr - /* Clear MSR:EE */ - mfmsr r7 - rlwinm r0,r7,0,17,15 - mtmsr r0 - - /* Check current_thread_info()->flags */ - rlwinm r4,r1,0,0,18 - lwz r4,TI_FLAGS(r4) - andi. r0,r4,_TIF_NEED_RESCHED - beq 1f - mtmsr r7 /* out of line this ? */ - blr -1: /* Some pre-nap cleanups needed on some CPUs */ andis. 
r0,r3,HID0_NAP@h beq 2f @@ -122,12 +106,6 @@ BEGIN_FTR_SECTION dcbf 0,r4 dcbf 0,r4 END_FTR_SECTION_IFSET(CPU_FTR_NAP_DISABLE_L2_PR) -#ifdef DEBUG - lis r6,nap_enter_count@ha - lwz r4,nap_enter_count@l(r6) - addi r4,r4,1 - stw r4,nap_enter_count@l(r6) -#endif 2: BEGIN_FTR_SECTION /* Go to low speed mode on some 750FX */ @@ -157,47 +135,42 @@ BEGIN_FTR_SECTION DSSALL sync END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) - ori r7,r7,MSR_EE /* Could be ommited (already set) */ + CURRENT_THREAD_INFO(r9, r1) + lwz r8,TI_LOCAL_FLAGS(r9) /* set napping bit */ + ori r8,r8,_TLF_NAPPING /* so when we take an exception */ + stw r8,TI_LOCAL_FLAGS(r9) /* it will return to our caller */ + mfmsr r7 + ori r7,r7,MSR_EE oris r7,r7,MSR_POW@h - sync - isync +1: sync mtmsr r7 isync - sync - blr - + b 1b + /* * Return from NAP/DOZE mode, restore some CPU specific registers, * we are called with DR/IR still off and r2 containing physical - * address of current. + * address of current. R11 points to the exception frame (physical + * address). We have to preserve r10. */ -_GLOBAL(power_save_6xx_restore) - mfspr r11,SPRN_HID0 - rlwinm. 
r11,r11,0,10,8 /* Clear NAP & copy NAP bit !state to cr1 EQ */ - cror 4*cr1+eq,4*cr0+eq,4*cr0+eq -BEGIN_FTR_SECTION - rlwinm r11,r11,0,9,7 /* Clear DOZE */ -END_FTR_SECTION_IFSET(CPU_FTR_CAN_DOZE) - mtspr SPRN_HID0, r11 +_GLOBAL(power_save_ppc32_restore) + lwz r9,_LINK(r11) /* interrupted in ppc6xx_idle: */ + stw r9,_NIP(r11) /* make it do a blr */ -#ifdef DEBUG - beq cr1,1f - lis r11,(nap_return_count-KERNELBASE)@ha - lwz r9,nap_return_count@l(r11) - addi r9,r9,1 - stw r9,nap_return_count@l(r11) -1: -#endif - - rlwinm r9,r1,0,0,18 - tophys(r9,r9) - lwz r11,TI_CPU(r9) +#ifdef CONFIG_SMP + CURRENT_THREAD_INFO(r12, r11) + lwz r11,TI_CPU(r12) /* get cpu number * 4 */ slwi r11,r11,2 +#else + li r11,0 +#endif /* Todo make sure all these are in the same page - * and load r22 (@ha part + CPU offset) only once + * and load r11 (@ha part + CPU offset) only once */ BEGIN_FTR_SECTION - beq cr1,1f + mfspr r9,SPRN_HID0 + andis. r9,r9,HID0_NAP@h + beq 1f addis r9,r11,(nap_save_msscr0-KERNELBASE)@ha lwz r9,nap_save_msscr0@l(r9) mtspr SPRN_MSSCR0, r9 @@ -220,14 +193,5 @@ _GLOBAL(nap_save_msscr0) _GLOBAL(nap_save_hid1) .space 4*NR_CPUS -_GLOBAL(powersave_nap) - .long 0 _GLOBAL(powersave_lowspeed) .long 0 - -#ifdef DEBUG -_GLOBAL(nap_enter_count) - .space 4 -_GLOBAL(nap_return_count) - .space 4 -#endif diff --git a/arch/powerpc/kernel/idle_book3e.S b/arch/powerpc/kernel/idle_book3e.S new file mode 100644 index 00000000000..48c21acef91 --- /dev/null +++ b/arch/powerpc/kernel/idle_book3e.S @@ -0,0 +1,101 @@ +/* + * Copyright 2010 IBM Corp, Benjamin Herrenschmidt <benh@kernel.crashing.org> + * + * Generic idle routine for Book3E processors + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/threads.h> +#include <asm/reg.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/ppc-opcode.h> +#include <asm/processor.h> +#include <asm/thread_info.h> +#include <asm/epapr_hcalls.h> + +/* 64-bit version only for now */ +#ifdef CONFIG_PPC64 + +.macro BOOK3E_IDLE name loop +_GLOBAL(\name) + /* Save LR for later */ + mflr r0 + std r0,16(r1) + + /* Hard disable interrupts */ + wrteei 0 + + /* Now check if an interrupt came in while we were soft disabled + * since we may otherwise lose it (doorbells etc...). + */ + lbz r3,PACAIRQHAPPENED(r13) + cmpwi cr0,r3,0 + bnelr + + /* Now we are going to mark ourselves as soft and hard enabled in + * order to be able to take interrupts while asleep. We inform lockdep + * of that. We don't actually turn interrupts on just yet tho. + */ +#ifdef CONFIG_TRACE_IRQFLAGS + stdu r1,-128(r1) + bl trace_hardirqs_on + addi r1,r1,128 +#endif + li r0,1 + stb r0,PACASOFTIRQEN(r13) + + /* Interrupts will make use return to LR, so get something we want + * in there + */ + bl 1f + + /* And return (interrupts are on) */ + ld r0,16(r1) + mtlr r0 + blr + +1: /* Let's set the _TLF_NAPPING flag so interrupts make us return + * to the right spot + */ + CURRENT_THREAD_INFO(r11, r1) + ld r10,TI_LOCAL_FLAGS(r11) + ori r10,r10,_TLF_NAPPING + std r10,TI_LOCAL_FLAGS(r11) + + /* We can now re-enable hard interrupts and go to sleep */ + wrteei 1 + \loop + +.endm + +.macro BOOK3E_IDLE_LOOP +1: + PPC_WAIT(0) + b 1b +.endm + +/* epapr_ev_idle_start below is patched with the proper hcall + opcodes during kernel initialization */ +.macro EPAPR_EV_IDLE_LOOP +idle_loop: + LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE)) + +.global epapr_ev_idle_start +epapr_ev_idle_start: + li r3, -1 + nop + nop + nop + b idle_loop +.endm + +BOOK3E_IDLE epapr_ev_idle EPAPR_EV_IDLE_LOOP + +BOOK3E_IDLE book3e_idle BOOK3E_IDLE_LOOP + +#endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S 
new file mode 100644 index 00000000000..15448668988 --- /dev/null +++ b/arch/powerpc/kernel/idle_e500.S @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved. + * Dave Liu <daveliu@freescale.com> + * copy from idle_6xx.S and modify for e500 based processor, + * implement the power_save function in idle. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/threads.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/thread_info.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + + .text + +_GLOBAL(e500_idle) + CURRENT_THREAD_INFO(r3, r1) + lwz r4,TI_LOCAL_FLAGS(r3) /* set napping bit */ + ori r4,r4,_TLF_NAPPING /* so when we take an exception */ + stw r4,TI_LOCAL_FLAGS(r3) /* it will return to our caller */ + +#ifdef CONFIG_PPC_E500MC + wrteei 1 +1: wait + + /* + * Guard against spurious wakeups (e.g. from a hypervisor) -- + * any real interrupt will cause us to return to LR due to + * _TLF_NAPPING. + */ + b 1b +#else + /* Check if we can nap or doze, put HID0 mask in r3 */ + lis r3,0 +BEGIN_FTR_SECTION + lis r3,HID0_DOZE@h +END_FTR_SECTION_IFSET(CPU_FTR_CAN_DOZE) + +BEGIN_FTR_SECTION + /* Now check if user enabled NAP mode */ + lis r4,powersave_nap@ha + lwz r4,powersave_nap@l(r4) + cmpwi 0,r4,0 + beq 1f + stwu r1,-16(r1) + mflr r0 + stw r0,20(r1) + bl flush_dcache_L1 + lwz r0,20(r1) + addi r1,r1,16 + mtlr r0 + lis r3,HID0_NAP@h +END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) +BEGIN_FTR_SECTION + msync + li r7,L2CSR0_L2FL@l + mtspr SPRN_L2CSR0,r7 +2: + mfspr r7,SPRN_L2CSR0 + andi. 
r4,r7,L2CSR0_L2FL@l + bne 2b +END_FTR_SECTION_IFSET(CPU_FTR_L2CSR|CPU_FTR_CAN_NAP) +1: + /* Go to NAP or DOZE now */ + mfspr r4,SPRN_HID0 + rlwinm r4,r4,0,~(HID0_DOZE|HID0_NAP|HID0_SLEEP) + or r4,r4,r3 + isync + mtspr SPRN_HID0,r4 + isync + + mfmsr r7 + oris r7,r7,MSR_WE@h + ori r7,r7,MSR_EE + msync + mtmsr r7 + isync +2: b 2b +#endif /* !E500MC */ + +/* + * Return from NAP/DOZE mode, restore some CPU specific registers, + * r2 containing physical address of current. + * r11 points to the exception frame (physical address). + * We have to preserve r10. + */ +_GLOBAL(power_save_ppc32_restore) + lwz r9,_LINK(r11) /* interrupted in e500_idle */ + stw r9,_NIP(r11) /* make it do a blr */ + +#ifdef CONFIG_SMP + CURRENT_THREAD_INFO(r12, r1) + lwz r11,TI_CPU(r12) /* get cpu number * 4 */ + slwi r11,r11,2 +#else + li r11,0 +#endif + + b transfer_to_handler_cont diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S index 1494e2f177f..f57a19348bd 100644 --- a/arch/powerpc/kernel/idle_power4.S +++ b/arch/powerpc/kernel/idle_power4.S @@ -1,11 +1,5 @@ /* - * This file contains the power_save function for 6xx & 7xxx CPUs - * rewritten in assembler - * - * Warning ! This code assumes that if your machine has a 750fx - * it will have PLL 1 set to low speed mode (used during NAP/DOZE). - * if this is not the case some additional changes will have to - * be done to check a runtime var (a bit like powersave-nap) + * This file contains the power_save function for 970-family CPUs. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -13,7 +7,6 @@ * 2 of the License, or (at your option) any later version. 
*/ -#include <linux/config.h> #include <linux/threads.h> #include <asm/processor.h> #include <asm/page.h> @@ -21,58 +14,60 @@ #include <asm/thread_info.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> +#include <asm/irqflags.h> #undef DEBUG .text -/* - * Here is the power_save_6xx function. This could eventually be - * split into several functions & changing the function pointer - * depending on the various features. - */ _GLOBAL(power4_idle) BEGIN_FTR_SECTION blr END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP) - /* We must dynamically check for the NAP feature as it - * can be cleared by CPU init after the fixups are done - */ - LOADBASE(r3,cur_cpu_spec) - ld r4,OFF(cur_cpu_spec)(r3) - ld r4,CPU_SPEC_FEATURES(r4) - andi. r0,r4,CPU_FTR_CAN_NAP - beqlr /* Now check if user or arch enabled NAP mode */ - LOADBASE(r3,powersave_nap) - lwz r4,OFF(powersave_nap)(r3) + LOAD_REG_ADDRBASE(r3,powersave_nap) + lwz r4,ADDROFF(powersave_nap)(r3) cmpwi 0,r4,0 beqlr - /* Clear MSR:EE */ + /* Hard disable interrupts */ mfmsr r7 - li r4,0 - ori r4,r4,MSR_EE - andc r0,r7,r4 - mtmsrd r0 + rldicl r0,r7,48,1 + rotldi r0,r0,16 + mtmsrd r0,1 - /* Check current_thread_info()->flags */ - clrrdi r4,r1,THREAD_SHIFT - ld r4,TI_FLAGS(r4) - andi. r0,r4,_TIF_NEED_RESCHED - beq 1f - mtmsrd r7 /* out of line this ? 
*/ - blr -1: - /* Go to NAP now */ + /* Check if something happened while soft-disabled */ + lbz r0,PACAIRQHAPPENED(r13) + cmpwi cr0,r0,0 + bnelr + + /* Soft-enable interrupts */ +#ifdef CONFIG_TRACE_IRQFLAGS + mflr r0 + std r0,16(r1) + stdu r1,-128(r1) + bl trace_hardirqs_on + addi r1,r1,128 + ld r0,16(r1) + mtlr r0 + mfmsr r7 +#endif /* CONFIG_TRACE_IRQFLAGS */ + + li r0,1 + stb r0,PACASOFTIRQEN(r13) /* we'll hard-enable shortly */ BEGIN_FTR_SECTION DSSALL sync END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) + CURRENT_THREAD_INFO(r9, r1) + ld r8,TI_LOCAL_FLAGS(r9) /* set napping bit */ + ori r8,r8,_TLF_NAPPING /* so when we take an exception */ + std r8,TI_LOCAL_FLAGS(r9) /* it will return to our caller */ + ori r7,r7,MSR_EE oris r7,r7,MSR_POW@h - sync +1: sync isync mtmsrd r7 isync - sync - blr + b 1b + diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S new file mode 100644 index 00000000000..5cf3d367190 --- /dev/null +++ b/arch/powerpc/kernel/idle_power7.S @@ -0,0 +1,187 @@ +/* + * This file contains the power_save function for Power7 CPUs. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/threads.h> +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/thread_info.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/ppc-opcode.h> +#include <asm/hw_irq.h> +#include <asm/kvm_book3s_asm.h> +#include <asm/opal.h> + +#undef DEBUG + +/* Idle state entry routines */ + +#define IDLE_STATE_ENTER_SEQ(IDLE_INST) \ + /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \ + std r0,0(r1); \ + ptesync; \ + ld r0,0(r1); \ +1: cmp cr0,r0,r0; \ + bne 1b; \ + IDLE_INST; \ + b . 
+ + .text + +/* + * Pass requested state in r3: + * 0 - nap + * 1 - sleep + * + * To check IRQ_HAPPENED in r4 + * 0 - don't check + * 1 - check + */ +_GLOBAL(power7_powersave_common) + /* Use r3 to pass state nap/sleep/winkle */ + /* NAP is a state loss, we create a regs frame on the + * stack, fill it up with the state we care about and + * stick a pointer to it in PACAR1. We really only + * need to save PC, some CR bits and the NV GPRs, + * but for now an interrupt frame will do. + */ + mflr r0 + std r0,16(r1) + stdu r1,-INT_FRAME_SIZE(r1) + std r0,_LINK(r1) + std r0,_NIP(r1) + +#ifndef CONFIG_SMP + /* Make sure FPU, VSX etc... are flushed as we may lose + * state when going to nap mode + */ + bl discard_lazy_cpu_state +#endif /* CONFIG_SMP */ + + /* Hard disable interrupts */ + mfmsr r9 + rldicl r9,r9,48,1 + rotldi r9,r9,16 + mtmsrd r9,1 /* hard-disable interrupts */ + + /* Check if something happened while soft-disabled */ + lbz r0,PACAIRQHAPPENED(r13) + cmpwi cr0,r0,0 + beq 1f + cmpwi cr0,r4,0 + beq 1f + addi r1,r1,INT_FRAME_SIZE + ld r0,16(r1) + mtlr r0 + blr + +1: /* We mark irqs hard disabled as this is the state we'll + * be in when returning and we need to tell arch_local_irq_restore() + * about it + */ + li r0,PACA_IRQ_HARD_DIS + stb r0,PACAIRQHAPPENED(r13) + + /* We haven't lost state ... 
yet */ + li r0,0 + stb r0,PACA_NAPSTATELOST(r13) + + /* Continue saving state */ + SAVE_GPR(2, r1) + SAVE_NVGPRS(r1) + mfcr r4 + std r4,_CCR(r1) + std r9,_MSR(r1) + std r1,PACAR1(r13) + +_GLOBAL(power7_enter_nap_mode) +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* Tell KVM we're napping */ + li r4,KVM_HWTHREAD_IN_NAP + stb r4,HSTATE_HWTHREAD_STATE(r13) +#endif + cmpwi cr0,r3,1 + beq 2f + IDLE_STATE_ENTER_SEQ(PPC_NAP) + /* No return */ +2: IDLE_STATE_ENTER_SEQ(PPC_SLEEP) + /* No return */ + +_GLOBAL(power7_idle) + /* Now check if user or arch enabled NAP mode */ + LOAD_REG_ADDRBASE(r3,powersave_nap) + lwz r4,ADDROFF(powersave_nap)(r3) + cmpwi 0,r4,0 + beqlr + li r3, 1 + /* fall through */ + +_GLOBAL(power7_nap) + mr r4,r3 + li r3,0 + b power7_powersave_common + /* No return */ + +_GLOBAL(power7_sleep) + li r3,1 + li r4,1 + b power7_powersave_common + /* No return */ + +_GLOBAL(power7_wakeup_tb_loss) + ld r2,PACATOC(r13); + ld r1,PACAR1(r13) + + /* Time base re-sync */ + li r0,OPAL_RESYNC_TIMEBASE + LOAD_REG_ADDR(r11,opal); + ld r12,8(r11); + ld r2,0(r11); + mtctr r12 + bctrl + + /* TODO: Check r3 for failure */ + + REST_NVGPRS(r1) + REST_GPR(2, r1) + ld r3,_CCR(r1) + ld r4,_MSR(r1) + ld r5,_NIP(r1) + addi r1,r1,INT_FRAME_SIZE + mtcr r3 + mfspr r3,SPRN_SRR1 /* Return SRR1 */ + mtspr SPRN_SRR1,r4 + mtspr SPRN_SRR0,r5 + rfid + +_GLOBAL(power7_wakeup_loss) + ld r1,PACAR1(r13) + REST_NVGPRS(r1) + REST_GPR(2, r1) + ld r3,_CCR(r1) + ld r4,_MSR(r1) + ld r5,_NIP(r1) + addi r1,r1,INT_FRAME_SIZE + mtcr r3 + mtspr SPRN_SRR1,r4 + mtspr SPRN_SRR0,r5 + rfid + +_GLOBAL(power7_wakeup_noloss) + lbz r0,PACA_NAPSTATELOST(r13) + cmpwi r0,0 + bne power7_wakeup_loss + ld r1,PACAR1(r13) + ld r4,_MSR(r1) + ld r5,_NIP(r1) + addi r1,r1,INT_FRAME_SIZE + mtspr SPRN_SRR1,r4 + mtspr SPRN_SRR0,r5 + rfid diff --git a/arch/powerpc/kernel/init_task.c b/arch/powerpc/kernel/init_task.c deleted file mode 100644 index 941043ae040..00000000000 --- a/arch/powerpc/kernel/init_task.c +++ /dev/null @@ -1,36 +0,0 
@@ -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/init_task.h> -#include <linux/fs.h> -#include <linux/mqueue.h> -#include <asm/uaccess.h> - -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; -static struct signal_struct init_signals = INIT_SIGNALS(init_signals); -static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - -/* - * Initial thread structure. - * - * We need to make sure that this is 16384-byte aligned due to the - * way process stacks are handled. This is done by having a special - * "init_task" linker map entry.. - */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; - -/* - * Initial task structure. - * - * All other task structs will be allocated on slabs in fork.c - */ -struct task_struct init_task = INIT_TASK(init_task); - -EXPORT_SYMBOL(init_task); diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c new file mode 100644 index 00000000000..24b968f8e4d --- /dev/null +++ b/arch/powerpc/kernel/io-workarounds.c @@ -0,0 +1,212 @@ +/* + * Support PCI IO workaround + * + * Copyright (C) 2006 Benjamin Herrenschmidt <benh@kernel.crashing.org> + * IBM, Corp. + * (C) Copyright 2007-2008 TOSHIBA CORPORATION + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/sched.h> /* for init_mm */ + +#include <asm/io.h> +#include <asm/machdep.h> +#include <asm/pgtable.h> +#include <asm/ppc-pci.h> +#include <asm/io-workarounds.h> + +#define IOWA_MAX_BUS 8 + +static struct iowa_bus iowa_busses[IOWA_MAX_BUS]; +static unsigned int iowa_bus_count; + +static struct iowa_bus *iowa_pci_find(unsigned long vaddr, unsigned long paddr) +{ + int i, j; + struct resource *res; + unsigned long vstart, vend; + + for (i = 0; i < iowa_bus_count; i++) { + struct iowa_bus *bus = &iowa_busses[i]; + struct pci_controller *phb = bus->phb; + + if (vaddr) { + vstart = (unsigned long)phb->io_base_virt; + vend = vstart + phb->pci_io_size - 1; + if ((vaddr >= vstart) && (vaddr <= vend)) + return bus; + } + + if (paddr) + for (j = 0; j < 3; j++) { + res = &phb->mem_resources[j]; + if (paddr >= res->start && paddr <= res->end) + return bus; + } + } + + return NULL; +} + +#ifdef CONFIG_PPC_INDIRECT_MMIO +struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) +{ + unsigned hugepage_shift; + struct iowa_bus *bus; + int token; + + token = PCI_GET_ADDR_TOKEN(addr); + + if (token && token <= iowa_bus_count) + bus = &iowa_busses[token - 1]; + else { + unsigned long vaddr, paddr; + pte_t *ptep; + + vaddr = (unsigned long)PCI_FIX_ADDR(addr); + if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END) + return NULL; + + ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr, + &hugepage_shift); + if (ptep == NULL) + paddr = 0; + else { + /* + * we don't have hugepages backing iomem + */ + WARN_ON(hugepage_shift); + paddr = pte_pfn(*ptep) << PAGE_SHIFT; + } + bus = iowa_pci_find(vaddr, paddr); + + if (bus == NULL) + return NULL; + } + + return bus; +} +#else /* CONFIG_PPC_INDIRECT_MMIO */ +struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) +{ + return NULL; +} +#endif /* !CONFIG_PPC_INDIRECT_MMIO */ + +#ifdef CONFIG_PPC_INDIRECT_PIO +struct iowa_bus *iowa_pio_find_bus(unsigned long port) +{ + unsigned long 
vaddr = (unsigned long)pci_io_base + port; + return iowa_pci_find(vaddr, 0); +} +#else +struct iowa_bus *iowa_pio_find_bus(unsigned long port) +{ + return NULL; +} +#endif + +#define DEF_PCI_AC_RET(name, ret, at, al, space, aa) \ +static ret iowa_##name at \ +{ \ + struct iowa_bus *bus; \ + bus = iowa_##space##_find_bus(aa); \ + if (bus && bus->ops && bus->ops->name) \ + return bus->ops->name al; \ + return __do_##name al; \ +} + +#define DEF_PCI_AC_NORET(name, at, al, space, aa) \ +static void iowa_##name at \ +{ \ + struct iowa_bus *bus; \ + bus = iowa_##space##_find_bus(aa); \ + if (bus && bus->ops && bus->ops->name) { \ + bus->ops->name al; \ + return; \ + } \ + __do_##name al; \ +} + +#include <asm/io-defs.h> + +#undef DEF_PCI_AC_RET +#undef DEF_PCI_AC_NORET + +static const struct ppc_pci_io iowa_pci_io = { + +#define DEF_PCI_AC_RET(name, ret, at, al, space, aa) .name = iowa_##name, +#define DEF_PCI_AC_NORET(name, at, al, space, aa) .name = iowa_##name, + +#include <asm/io-defs.h> + +#undef DEF_PCI_AC_RET +#undef DEF_PCI_AC_NORET + +}; + +#ifdef CONFIG_PPC_INDIRECT_MMIO +static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, + unsigned long flags, void *caller) +{ + struct iowa_bus *bus; + void __iomem *res = __ioremap_caller(addr, size, flags, caller); + int busno; + + bus = iowa_pci_find(0, (unsigned long)addr); + if (bus != NULL) { + busno = bus - iowa_busses; + PCI_SET_ADDR_TOKEN(res, busno + 1); + } + return res; +} +#else /* CONFIG_PPC_INDIRECT_MMIO */ +#define iowa_ioremap NULL +#endif /* !CONFIG_PPC_INDIRECT_MMIO */ + +/* Enable IO workaround */ +static void io_workaround_init(void) +{ + static int io_workaround_inited; + + if (io_workaround_inited) + return; + ppc_pci_io = iowa_pci_io; + ppc_md.ioremap = iowa_ioremap; + io_workaround_inited = 1; +} + +/* Register new bus to support workaround */ +void iowa_register_bus(struct pci_controller *phb, struct ppc_pci_io *ops, + int (*initfunc)(struct iowa_bus *, void *), void *data) +{ + 
struct iowa_bus *bus; + struct device_node *np = phb->dn; + + io_workaround_init(); + + if (iowa_bus_count >= IOWA_MAX_BUS) { + pr_err("IOWA:Too many pci bridges, " + "workarounds disabled for %s\n", np->full_name); + return; + } + + bus = &iowa_busses[iowa_bus_count]; + bus->phb = phb; + bus->ops = ops; + bus->private = data; + + if (initfunc) + if ((*initfunc)(bus, data)) + return; + + iowa_bus_count++; + + pr_debug("IOWA:[%d]Add bus, %s.\n", iowa_bus_count-1, np->full_name); +} + diff --git a/arch/powerpc/kernel/io.c b/arch/powerpc/kernel/io.c new file mode 100644 index 00000000000..2a2b4aeab80 --- /dev/null +++ b/arch/powerpc/kernel/io.c @@ -0,0 +1,210 @@ +/* + * I/O string operations + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * Copyright (C) 2006 IBM Corporation + * + * Largely rewritten by Cort Dougan (cort@cs.nmt.edu) + * and Paul Mackerras. + * + * Adapted for iSeries by Mike Corrigan (mikejc@us.ibm.com) + * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com) + * + * Rewritten in C by Stephen Rothwell. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/compiler.h> +#include <linux/export.h> + +#include <asm/io.h> +#include <asm/firmware.h> +#include <asm/bug.h> + +/* See definition in io.h */ +bool isa_io_special; + +void _insb(const volatile u8 __iomem *port, void *buf, long count) +{ + u8 *tbuf = buf; + u8 tmp; + + if (unlikely(count <= 0)) + return; + asm volatile("sync"); + do { + tmp = *port; + eieio(); + *tbuf++ = tmp; + } while (--count != 0); + asm volatile("twi 0,%0,0; isync" : : "r" (tmp)); +} +EXPORT_SYMBOL(_insb); + +void _outsb(volatile u8 __iomem *port, const void *buf, long count) +{ + const u8 *tbuf = buf; + + if (unlikely(count <= 0)) + return; + asm volatile("sync"); + do { + *port = *tbuf++; + } while (--count != 0); + asm volatile("sync"); +} +EXPORT_SYMBOL(_outsb); + +void _insw_ns(const volatile u16 __iomem *port, void *buf, long count) +{ + u16 *tbuf = buf; + u16 tmp; + + if (unlikely(count <= 0)) + return; + asm volatile("sync"); + do { + tmp = *port; + eieio(); + *tbuf++ = tmp; + } while (--count != 0); + asm volatile("twi 0,%0,0; isync" : : "r" (tmp)); +} +EXPORT_SYMBOL(_insw_ns); + +void _outsw_ns(volatile u16 __iomem *port, const void *buf, long count) +{ + const u16 *tbuf = buf; + + if (unlikely(count <= 0)) + return; + asm volatile("sync"); + do { + *port = *tbuf++; + } while (--count != 0); + asm volatile("sync"); +} +EXPORT_SYMBOL(_outsw_ns); + +void _insl_ns(const volatile u32 __iomem *port, void *buf, long count) +{ + u32 *tbuf = buf; + u32 tmp; + + if (unlikely(count <= 0)) + return; + asm volatile("sync"); + do { + tmp = *port; + eieio(); + *tbuf++ = tmp; + } while (--count != 0); + asm volatile("twi 0,%0,0; isync" : : "r" (tmp)); +} +EXPORT_SYMBOL(_insl_ns); + +void _outsl_ns(volatile u32 __iomem *port, const void *buf, long count) +{ + const u32 *tbuf = buf; + + if (unlikely(count <= 0)) + return; + asm volatile("sync"); + do { + *port = *tbuf++; + } while (--count != 0); + asm volatile("sync"); +} 
+EXPORT_SYMBOL(_outsl_ns); + +#define IO_CHECK_ALIGN(v,a) ((((unsigned long)(v)) & ((a) - 1)) == 0) + +notrace void +_memset_io(volatile void __iomem *addr, int c, unsigned long n) +{ + void *p = (void __force *)addr; + u32 lc = c; + lc |= lc << 8; + lc |= lc << 16; + + __asm__ __volatile__ ("sync" : : : "memory"); + while(n && !IO_CHECK_ALIGN(p, 4)) { + *((volatile u8 *)p) = c; + p++; + n--; + } + while(n >= 4) { + *((volatile u32 *)p) = lc; + p += 4; + n -= 4; + } + while(n) { + *((volatile u8 *)p) = c; + p++; + n--; + } + __asm__ __volatile__ ("sync" : : : "memory"); +} +EXPORT_SYMBOL(_memset_io); + +void _memcpy_fromio(void *dest, const volatile void __iomem *src, + unsigned long n) +{ + void *vsrc = (void __force *) src; + + __asm__ __volatile__ ("sync" : : : "memory"); + while(n && (!IO_CHECK_ALIGN(vsrc, 4) || !IO_CHECK_ALIGN(dest, 4))) { + *((u8 *)dest) = *((volatile u8 *)vsrc); + eieio(); + vsrc++; + dest++; + n--; + } + while(n >= 4) { + *((u32 *)dest) = *((volatile u32 *)vsrc); + eieio(); + vsrc += 4; + dest += 4; + n -= 4; + } + while(n) { + *((u8 *)dest) = *((volatile u8 *)vsrc); + eieio(); + vsrc++; + dest++; + n--; + } + __asm__ __volatile__ ("sync" : : : "memory"); +} +EXPORT_SYMBOL(_memcpy_fromio); + +void _memcpy_toio(volatile void __iomem *dest, const void *src, unsigned long n) +{ + void *vdest = (void __force *) dest; + + __asm__ __volatile__ ("sync" : : : "memory"); + while(n && (!IO_CHECK_ALIGN(vdest, 4) || !IO_CHECK_ALIGN(src, 4))) { + *((volatile u8 *)vdest) = *((u8 *)src); + src++; + vdest++; + n--; + } + while(n >= 4) { + *((volatile u32 *)vdest) = *((volatile u32 *)src); + src += 4; + vdest += 4; + n-=4; + } + while(n) { + *((volatile u8 *)vdest) = *((u8 *)src); + src++; + vdest++; + n--; + } + __asm__ __volatile__ ("sync" : : : "memory"); +} +EXPORT_SYMBOL(_memcpy_toio); diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c new file mode 100644 index 00000000000..12e48d56f77 --- /dev/null +++ 
b/arch/powerpc/kernel/iomap.c @@ -0,0 +1,131 @@ +/* + * ppc64 "iomap" interface implementation. + * + * (C) Copyright 2004 Linus Torvalds + */ +#include <linux/pci.h> +#include <linux/mm.h> +#include <linux/export.h> +#include <asm/io.h> +#include <asm/pci-bridge.h> + +/* + * Here comes the ppc64 implementation of the IOMAP + * interfaces. + */ +unsigned int ioread8(void __iomem *addr) +{ + return readb(addr); +} +unsigned int ioread16(void __iomem *addr) +{ + return readw(addr); +} +unsigned int ioread16be(void __iomem *addr) +{ + return readw_be(addr); +} +unsigned int ioread32(void __iomem *addr) +{ + return readl(addr); +} +unsigned int ioread32be(void __iomem *addr) +{ + return readl_be(addr); +} +EXPORT_SYMBOL(ioread8); +EXPORT_SYMBOL(ioread16); +EXPORT_SYMBOL(ioread16be); +EXPORT_SYMBOL(ioread32); +EXPORT_SYMBOL(ioread32be); + +void iowrite8(u8 val, void __iomem *addr) +{ + writeb(val, addr); +} +void iowrite16(u16 val, void __iomem *addr) +{ + writew(val, addr); +} +void iowrite16be(u16 val, void __iomem *addr) +{ + writew_be(val, addr); +} +void iowrite32(u32 val, void __iomem *addr) +{ + writel(val, addr); +} +void iowrite32be(u32 val, void __iomem *addr) +{ + writel_be(val, addr); +} +EXPORT_SYMBOL(iowrite8); +EXPORT_SYMBOL(iowrite16); +EXPORT_SYMBOL(iowrite16be); +EXPORT_SYMBOL(iowrite32); +EXPORT_SYMBOL(iowrite32be); + +/* + * These are the "repeat read/write" functions. Note the + * non-CPU byte order. We do things in "IO byteorder" + * here. + * + * FIXME! We could make these do EEH handling if we really + * wanted. Not clear if we do. 
+ */ +void ioread8_rep(void __iomem *addr, void *dst, unsigned long count) +{ + readsb(addr, dst, count); +} +void ioread16_rep(void __iomem *addr, void *dst, unsigned long count) +{ + readsw(addr, dst, count); +} +void ioread32_rep(void __iomem *addr, void *dst, unsigned long count) +{ + readsl(addr, dst, count); +} +EXPORT_SYMBOL(ioread8_rep); +EXPORT_SYMBOL(ioread16_rep); +EXPORT_SYMBOL(ioread32_rep); + +void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count) +{ + writesb(addr, src, count); +} +void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count) +{ + writesw(addr, src, count); +} +void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count) +{ + writesl(addr, src, count); +} +EXPORT_SYMBOL(iowrite8_rep); +EXPORT_SYMBOL(iowrite16_rep); +EXPORT_SYMBOL(iowrite32_rep); + +void __iomem *ioport_map(unsigned long port, unsigned int len) +{ + return (void __iomem *) (port + _IO_BASE); +} + +void ioport_unmap(void __iomem *addr) +{ + /* Nothing to do */ +} +EXPORT_SYMBOL(ioport_map); +EXPORT_SYMBOL(ioport_unmap); + +#ifdef CONFIG_PCI +void pci_iounmap(struct pci_dev *dev, void __iomem *addr) +{ + if (isa_vaddr_is_ioport(addr)) + return; + if (pcibios_vaddr_is_ioport(addr)) + return; + iounmap(addr); +} + +EXPORT_SYMBOL(pci_iounmap); +#endif /* CONFIG_PCI */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c new file mode 100644 index 00000000000..88e3ec6e1d9 --- /dev/null +++ b/arch/powerpc/kernel/iommu.c @@ -0,0 +1,1174 @@ +/* + * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation + * + * Rewrite, cleanup, new allocation schemes, virtual merging: + * Copyright (C) 2004 Olof Johansson, IBM Corporation + * and Ben. Herrenschmidt, IBM Corporation + * + * Dynamic DMA mapping support, bus-independent parts. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/dma-mapping.h> +#include <linux/bitmap.h> +#include <linux/iommu-helper.h> +#include <linux/crash_dump.h> +#include <linux/hash.h> +#include <linux/fault-inject.h> +#include <linux/pci.h> +#include <linux/iommu.h> +#include <linux/sched.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/iommu.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/kdump.h> +#include <asm/fadump.h> +#include <asm/vio.h> +#include <asm/tce.h> + +#define DBG(...) + +static int novmerge; + +static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int); + +static int __init setup_iommu(char *str) +{ + if (!strcmp(str, "novmerge")) + novmerge = 1; + else if (!strcmp(str, "vmerge")) + novmerge = 0; + return 1; +} + +__setup("iommu=", setup_iommu); + +static DEFINE_PER_CPU(unsigned int, iommu_pool_hash); + +/* + * We precalculate the hash to avoid doing it on every allocation. + * + * The hash is important to spread CPUs across all the pools. 
For example, + * on a POWER7 with 4 way SMT we want interrupts on the primary threads and + * with 4 pools all primary threads would map to the same pool. + */ +static int __init setup_iommu_pool_hash(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS); + + return 0; +} +subsys_initcall(setup_iommu_pool_hash); + +#ifdef CONFIG_FAIL_IOMMU + +static DECLARE_FAULT_ATTR(fail_iommu); + +static int __init setup_fail_iommu(char *str) +{ + return setup_fault_attr(&fail_iommu, str); +} +__setup("fail_iommu=", setup_fail_iommu); + +static bool should_fail_iommu(struct device *dev) +{ + return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1); +} + +static int __init fail_iommu_debugfs(void) +{ + struct dentry *dir = fault_create_debugfs_attr("fail_iommu", + NULL, &fail_iommu); + + return PTR_ERR_OR_ZERO(dir); +} +late_initcall(fail_iommu_debugfs); + +static ssize_t fail_iommu_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", dev->archdata.fail_iommu); +} + +static ssize_t fail_iommu_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + int i; + + if (count > 0 && sscanf(buf, "%d", &i) > 0) + dev->archdata.fail_iommu = (i == 0) ? 
0 : 1; + + return count; +} + +static DEVICE_ATTR(fail_iommu, S_IRUGO|S_IWUSR, fail_iommu_show, + fail_iommu_store); + +static int fail_iommu_bus_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + + if (action == BUS_NOTIFY_ADD_DEVICE) { + if (device_create_file(dev, &dev_attr_fail_iommu)) + pr_warn("Unable to create IOMMU fault injection sysfs " + "entries\n"); + } else if (action == BUS_NOTIFY_DEL_DEVICE) { + device_remove_file(dev, &dev_attr_fail_iommu); + } + + return 0; +} + +static struct notifier_block fail_iommu_bus_notifier = { + .notifier_call = fail_iommu_bus_notify +}; + +static int __init fail_iommu_setup(void) +{ +#ifdef CONFIG_PCI + bus_register_notifier(&pci_bus_type, &fail_iommu_bus_notifier); +#endif +#ifdef CONFIG_IBMVIO + bus_register_notifier(&vio_bus_type, &fail_iommu_bus_notifier); +#endif + + return 0; +} +/* + * Must execute after PCI and VIO subsystem have initialised but before + * devices are probed. + */ +arch_initcall(fail_iommu_setup); +#else +static inline bool should_fail_iommu(struct device *dev) +{ + return false; +} +#endif + +static unsigned long iommu_range_alloc(struct device *dev, + struct iommu_table *tbl, + unsigned long npages, + unsigned long *handle, + unsigned long mask, + unsigned int align_order) +{ + unsigned long n, end, start; + unsigned long limit; + int largealloc = npages > 15; + int pass = 0; + unsigned long align_mask; + unsigned long boundary_size; + unsigned long flags; + unsigned int pool_nr; + struct iommu_pool *pool; + + align_mask = 0xffffffffffffffffl >> (64 - align_order); + + /* This allocator was derived from x86_64's bit string search */ + + /* Sanity check */ + if (unlikely(npages == 0)) { + if (printk_ratelimit()) + WARN_ON(1); + return DMA_ERROR_CODE; + } + + if (should_fail_iommu(dev)) + return DMA_ERROR_CODE; + + /* + * We don't need to disable preemption here because any CPU can + * safely use any IOMMU pool. 
+ */ + pool_nr = __raw_get_cpu_var(iommu_pool_hash) & (tbl->nr_pools - 1); + + if (largealloc) + pool = &(tbl->large_pool); + else + pool = &(tbl->pools[pool_nr]); + + spin_lock_irqsave(&(pool->lock), flags); + +again: + if ((pass == 0) && handle && *handle && + (*handle >= pool->start) && (*handle < pool->end)) + start = *handle; + else + start = pool->hint; + + limit = pool->end; + + /* The case below can happen if we have a small segment appended + * to a large, or when the previous alloc was at the very end of + * the available space. If so, go back to the initial start. + */ + if (start >= limit) + start = pool->start; + + if (limit + tbl->it_offset > mask) { + limit = mask - tbl->it_offset + 1; + /* If we're constrained on address range, first try + * at the masked hint to avoid O(n) search complexity, + * but on second pass, start at 0 in pool 0. + */ + if ((start & mask) >= limit || pass > 0) { + spin_unlock(&(pool->lock)); + pool = &(tbl->pools[0]); + spin_lock(&(pool->lock)); + start = pool->start; + } else { + start &= mask; + } + } + + if (dev) + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + 1 << tbl->it_page_shift); + else + boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift); + /* 4GB boundary for iseries_hv_alloc and iseries_hv_map */ + + n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset, + boundary_size >> tbl->it_page_shift, align_mask); + if (n == -1) { + if (likely(pass == 0)) { + /* First try the pool from the start */ + pool->hint = pool->start; + pass++; + goto again; + + } else if (pass <= tbl->nr_pools) { + /* Now try scanning all the other pools */ + spin_unlock(&(pool->lock)); + pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1); + pool = &tbl->pools[pool_nr]; + spin_lock(&(pool->lock)); + pool->hint = pool->start; + pass++; + goto again; + + } else { + /* Give up */ + spin_unlock_irqrestore(&(pool->lock), flags); + return DMA_ERROR_CODE; + } + } + + end = n + npages; + + /* Bump the hint to a new block for 
small allocs. */ + if (largealloc) { + /* Don't bump to new block to avoid fragmentation */ + pool->hint = end; + } else { + /* Overflow will be taken care of at the next allocation */ + pool->hint = (end + tbl->it_blocksize - 1) & + ~(tbl->it_blocksize - 1); + } + + /* Update handle for SG allocations */ + if (handle) + *handle = end; + + spin_unlock_irqrestore(&(pool->lock), flags); + + return n; +} + +static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, + void *page, unsigned int npages, + enum dma_data_direction direction, + unsigned long mask, unsigned int align_order, + struct dma_attrs *attrs) +{ + unsigned long entry; + dma_addr_t ret = DMA_ERROR_CODE; + int build_fail; + + entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order); + + if (unlikely(entry == DMA_ERROR_CODE)) + return DMA_ERROR_CODE; + + entry += tbl->it_offset; /* Offset into real TCE table */ + ret = entry << tbl->it_page_shift; /* Set the return dma address */ + + /* Put the TCEs in the HW table */ + build_fail = ppc_md.tce_build(tbl, entry, npages, + (unsigned long)page & + IOMMU_PAGE_MASK(tbl), direction, attrs); + + /* ppc_md.tce_build() only returns non-zero for transient errors. + * Clean up the table bitmap in this case and return + * DMA_ERROR_CODE. For all other errors the functionality is + * not altered. 
+ */ + if (unlikely(build_fail)) { + __iommu_free(tbl, ret, npages); + return DMA_ERROR_CODE; + } + + /* Flush/invalidate TLB caches if necessary */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + /* Make sure updates are seen by hardware */ + mb(); + + return ret; +} + +/* + * Check that the npages entries starting at dma_addr fall inside the + * window described by tbl (it_offset .. it_offset + it_size). Returns true + * when the range is valid; otherwise dumps the table parameters + * (rate-limited) and returns false. + */ +static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long entry, free_entry; + + entry = dma_addr >> tbl->it_page_shift; + free_entry = entry - tbl->it_offset; + + if (((free_entry + npages) > tbl->it_size) || + (entry < tbl->it_offset)) { + if (printk_ratelimit()) { + printk(KERN_INFO "iommu_free: invalid entry\n"); + printk(KERN_INFO "\tentry = 0x%lx\n", entry); + printk(KERN_INFO "\tdma_addr = 0x%llx\n", (u64)dma_addr); + printk(KERN_INFO "\tTable = 0x%llx\n", (u64)tbl); + printk(KERN_INFO "\tbus# = 0x%llx\n", (u64)tbl->it_busno); + printk(KERN_INFO "\tsize = 0x%llx\n", (u64)tbl->it_size); + printk(KERN_INFO "\tstartOff = 0x%llx\n", (u64)tbl->it_offset); + printk(KERN_INFO "\tindex = 0x%llx\n", (u64)tbl->it_index); + WARN_ON(1); + } + + return false; + } + + return true; +} + +/* + * Return the pool that owns table-relative index 'entry': the large pool + * when entry is at or above large_pool.start, otherwise the small pool + * selected by entry / poolsize. + */ +static struct iommu_pool *get_pool(struct iommu_table *tbl, + unsigned long entry) +{ + struct iommu_pool *p; + unsigned long largepool_start = tbl->large_pool.start; + + /* The large pool is the last pool at the top of the table */ + if (entry >= largepool_start) { + p = &tbl->large_pool; + } else { + unsigned int pool_nr = entry / tbl->poolsize; + + /* Valid indices into pools[] are 0 .. nr_pools - 1 */ + BUG_ON(pool_nr >= tbl->nr_pools); + p = &tbl->pools[pool_nr]; + } + + return p; +} + +static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long entry, free_entry; + unsigned long flags; + struct iommu_pool *pool; + + entry = dma_addr >> tbl->it_page_shift; + free_entry = entry - tbl->it_offset; + + pool = get_pool(tbl, free_entry); + + if (!iommu_free_check(tbl, dma_addr, npages)) + return; + + ppc_md.tce_free(tbl, entry, npages); + + 
spin_lock_irqsave(&(pool->lock), flags); + bitmap_clear(tbl->it_map, free_entry, npages); + spin_unlock_irqrestore(&(pool->lock), flags); +} + +static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + __iommu_free(tbl, dma_addr, npages); + + /* Make sure TLB cache is flushed if the HW needs it. We do + * not do an mb() here on purpose, it is not needed on any of + * the current platforms. + */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); +} + +int iommu_map_sg(struct device *dev, struct iommu_table *tbl, + struct scatterlist *sglist, int nelems, + unsigned long mask, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + dma_addr_t dma_next = 0, dma_addr; + struct scatterlist *s, *outs, *segstart; + int outcount, incount, i, build_fail = 0; + unsigned int align; + unsigned long handle; + unsigned int max_seg_size; + + BUG_ON(direction == DMA_NONE); + + if ((nelems == 0) || !tbl) + return 0; + + outs = s = segstart = &sglist[0]; + outcount = 1; + incount = nelems; + handle = 0; + + /* Init first segment length for backout at failure */ + outs->dma_length = 0; + + DBG("sg mapping %d elements:\n", nelems); + + max_seg_size = dma_get_max_seg_size(dev); + for_each_sg(sglist, s, nelems, i) { + unsigned long vaddr, npages, entry, slen; + + slen = s->length; + /* Sanity check */ + if (slen == 0) { + dma_next = 0; + continue; + } + /* Allocate iommu entries for that segment */ + vaddr = (unsigned long) sg_virt(s); + npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl)); + align = 0; + if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE && + (vaddr & ~PAGE_MASK) == 0) + align = PAGE_SHIFT - tbl->it_page_shift; + entry = iommu_range_alloc(dev, tbl, npages, &handle, + mask >> tbl->it_page_shift, align); + + DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen); + + /* Handle failure */ + if (unlikely(entry == DMA_ERROR_CODE)) { + if (printk_ratelimit()) + dev_info(dev, "iommu_alloc failed, tbl %p " + "vaddr %lx npages 
%lu\n", tbl, vaddr, + npages); + goto failure; + } + + /* Convert entry to a dma_addr_t */ + entry += tbl->it_offset; + dma_addr = entry << tbl->it_page_shift; + dma_addr |= (s->offset & ~IOMMU_PAGE_MASK(tbl)); + + DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n", + npages, entry, dma_addr); + + /* Insert into HW table */ + build_fail = ppc_md.tce_build(tbl, entry, npages, + vaddr & IOMMU_PAGE_MASK(tbl), + direction, attrs); + if(unlikely(build_fail)) + goto failure; + + /* If we are in an open segment, try merging */ + if (segstart != s) { + DBG(" - trying merge...\n"); + /* We cannot merge if: + * - allocated dma_addr isn't contiguous to previous allocation + */ + if (novmerge || (dma_addr != dma_next) || + (outs->dma_length + s->length > max_seg_size)) { + /* Can't merge: create a new segment */ + segstart = s; + outcount++; + outs = sg_next(outs); + DBG(" can't merge, new segment.\n"); + } else { + outs->dma_length += s->length; + DBG(" merged, new len: %ux\n", outs->dma_length); + } + } + + if (segstart == s) { + /* This is a new segment, fill entries */ + DBG(" - filling new segment.\n"); + outs->dma_address = dma_addr; + outs->dma_length = slen; + } + + /* Calculate next page pointer for contiguous check */ + dma_next = dma_addr + slen; + + DBG(" - dma next is: %lx\n", dma_next); + } + + /* Flush/invalidate TLB caches if necessary */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + DBG("mapped %d elements:\n", outcount); + + /* For the sake of iommu_unmap_sg, we clear out the length in the + * next entry of the sglist if we didn't fill the list completely + */ + if (outcount < incount) { + outs = sg_next(outs); + outs->dma_address = DMA_ERROR_CODE; + outs->dma_length = 0; + } + + /* Make sure updates are seen by hardware */ + mb(); + + return outcount; + + failure: + for_each_sg(sglist, s, nelems, i) { + if (s->dma_length != 0) { + unsigned long vaddr, npages; + + vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl); + npages = iommu_num_pages(s->dma_address, 
s->dma_length, + IOMMU_PAGE_SIZE(tbl)); + __iommu_free(tbl, vaddr, npages); + s->dma_address = DMA_ERROR_CODE; + s->dma_length = 0; + } + if (s == outs) + break; + } + return 0; +} + + +void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + + BUG_ON(direction == DMA_NONE); + + if (!tbl) + return; + + sg = sglist; + while (nelems--) { + unsigned int npages; + dma_addr_t dma_handle = sg->dma_address; + + if (sg->dma_length == 0) + break; + npages = iommu_num_pages(dma_handle, sg->dma_length, + IOMMU_PAGE_SIZE(tbl)); + __iommu_free(tbl, dma_handle, npages); + sg = sg_next(sg); + } + + /* Flush/invalidate TLBs if necessary. As for iommu_free(), we + * do not do an mb() here, the affected platforms do not need it + * when freeing. + */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); +} + +static void iommu_table_clear(struct iommu_table *tbl) +{ + /* + * In case of firmware assisted dump system goes through clean + * reboot process at the time of system crash. Hence it's safe to + * clear the TCE entries if firmware assisted dump is active. + */ + if (!is_kdump_kernel() || is_fadump_active()) { + /* Clear the table in case firmware left allocations in it */ + ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size); + return; + } + +#ifdef CONFIG_CRASH_DUMP + if (ppc_md.tce_get) { + unsigned long index, tceval, tcecount = 0; + + /* Reserve the existing mappings left by the first kernel. 
*/ + for (index = 0; index < tbl->it_size; index++) { + tceval = ppc_md.tce_get(tbl, index + tbl->it_offset); + /* + * Freed TCE entry contains 0x7fffffffffffffff on JS20 + */ + if (tceval && (tceval != 0x7fffffffffffffffUL)) { + __set_bit(index, tbl->it_map); + tcecount++; + } + } + + if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) { + printk(KERN_WARNING "TCE table is full; freeing "); + printk(KERN_WARNING "%d entries for the kdump boot\n", + KDUMP_MIN_TCE_ENTRIES); + for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES; + index < tbl->it_size; index++) + __clear_bit(index, tbl->it_map); + } + } +#endif +} + +/* + * Build a iommu_table structure. This contains a bit map which + * is used to manage allocation of the tce space. + */ +struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) +{ + unsigned long sz; + static int welcomed = 0; + struct page *page; + unsigned int i; + struct iommu_pool *p; + + /* number of bytes needed for the bitmap */ + sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); + + page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz)); + if (!page) + panic("iommu_init_table: Can't allocate %ld bytes\n", sz); + tbl->it_map = page_address(page); + memset(tbl->it_map, 0, sz); + + /* + * Reserve page 0 so it will not be used for any mappings. + * This avoids buggy drivers that consider page 0 to be invalid + * to crash the machine or even lose data. 
+ */ + if (tbl->it_offset == 0) + set_bit(0, tbl->it_map); + + /* We only split the IOMMU table if we have 1GB or more of space */ + if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024)) + tbl->nr_pools = IOMMU_NR_POOLS; + else + tbl->nr_pools = 1; + + /* We reserve the top 1/4 of the table for large allocations */ + tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools; + + for (i = 0; i < tbl->nr_pools; i++) { + p = &tbl->pools[i]; + spin_lock_init(&(p->lock)); + p->start = tbl->poolsize * i; + p->hint = p->start; + p->end = p->start + tbl->poolsize; + } + + p = &tbl->large_pool; + spin_lock_init(&(p->lock)); + p->start = tbl->poolsize * i; + p->hint = p->start; + p->end = tbl->it_size; + + iommu_table_clear(tbl); + + if (!welcomed) { + printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n", + novmerge ? "disabled" : "enabled"); + welcomed = 1; + } + + return tbl; +} + +void iommu_free_table(struct iommu_table *tbl, const char *node_name) +{ + unsigned long bitmap_sz; + unsigned int order; + + if (!tbl || !tbl->it_map) { + printk(KERN_ERR "%s: expected TCE map for %s\n", __func__, + node_name); + return; + } + + /* + * In case we have reserved the first bit, we should not emit + * the warning below. + */ + if (tbl->it_offset == 0) + clear_bit(0, tbl->it_map); + +#ifdef CONFIG_IOMMU_API + if (tbl->it_group) { + iommu_group_put(tbl->it_group); + BUG_ON(tbl->it_group); + } +#endif + + /* verify that table contains no entries */ + if (!bitmap_empty(tbl->it_map, tbl->it_size)) + pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name); + + /* calculate bitmap size in bytes */ + bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); + + /* free bitmap */ + order = get_order(bitmap_sz); + free_pages((unsigned long) tbl->it_map, order); + + /* free table */ + kfree(tbl); +} + +/* Creates TCEs for a user provided buffer. The user buffer must be + * contiguous real kernel storage (not vmalloc). 
The address passed here + * comprises a page address and offset into that page. The dma_addr_t + * returned will point to the same byte within the page as was passed in. + */ +dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, + struct page *page, unsigned long offset, size_t size, + unsigned long mask, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + dma_addr_t dma_handle = DMA_ERROR_CODE; + void *vaddr; + unsigned long uaddr; + unsigned int npages, align; + + BUG_ON(direction == DMA_NONE); + + vaddr = page_address(page) + offset; + uaddr = (unsigned long)vaddr; + npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl)); + + if (tbl) { + align = 0; + if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE && + ((unsigned long)vaddr & ~PAGE_MASK) == 0) + align = PAGE_SHIFT - tbl->it_page_shift; + + dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction, + mask >> tbl->it_page_shift, align, + attrs); + if (dma_handle == DMA_ERROR_CODE) { + if (printk_ratelimit()) { + dev_info(dev, "iommu_alloc failed, tbl %p " + "vaddr %p npages %d\n", tbl, vaddr, + npages); + } + } else + dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl)); + } + + return dma_handle; +} + +void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + unsigned int npages; + + BUG_ON(direction == DMA_NONE); + + if (tbl) { + npages = iommu_num_pages(dma_handle, size, + IOMMU_PAGE_SIZE(tbl)); + iommu_free(tbl, dma_handle, npages); + } +} + +/* Allocates a contiguous real buffer and creates mappings over it. + * Returns the virtual address of the buffer and sets dma_handle + * to the dma address (mapping) of the first page. 
+ */ +void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, + size_t size, dma_addr_t *dma_handle, + unsigned long mask, gfp_t flag, int node) +{ + void *ret = NULL; + dma_addr_t mapping; + unsigned int order; + unsigned int nio_pages, io_order; + struct page *page; + + size = PAGE_ALIGN(size); + order = get_order(size); + + /* + * Client asked for way too much space. This is checked later + * anyway. It is easier to debug here for the drivers than in + * the tce tables. + */ + if (order >= IOMAP_MAX_ORDER) { + dev_info(dev, "iommu_alloc_consistent size too large: 0x%lx\n", + size); + return NULL; + } + + if (!tbl) + return NULL; + + /* Alloc enough pages (and possibly more) */ + page = alloc_pages_node(node, flag, order); + if (!page) + return NULL; + ret = page_address(page); + memset(ret, 0, size); + + /* Set up tces to cover the allocated range */ + nio_pages = size >> tbl->it_page_shift; + io_order = get_iommu_order(size, tbl); + mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL, + mask >> tbl->it_page_shift, io_order, NULL); + if (mapping == DMA_ERROR_CODE) { + free_pages((unsigned long)ret, order); + return NULL; + } + *dma_handle = mapping; + return ret; +} + +void iommu_free_coherent(struct iommu_table *tbl, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + if (tbl) { + unsigned int nio_pages; + + size = PAGE_ALIGN(size); + nio_pages = size >> tbl->it_page_shift; + iommu_free(tbl, dma_handle, nio_pages); + size = PAGE_ALIGN(size); + free_pages((unsigned long)vaddr, get_order(size)); + } +} + +#ifdef CONFIG_IOMMU_API +/* + * SPAPR TCE API + */ +static void group_release(void *iommu_data) +{ + struct iommu_table *tbl = iommu_data; + tbl->it_group = NULL; +} + +void iommu_register_group(struct iommu_table *tbl, + int pci_domain_number, unsigned long pe_num) +{ + struct iommu_group *grp; + char *name; + + grp = iommu_group_alloc(); + if (IS_ERR(grp)) { + pr_warn("powerpc iommu api: cannot create new group, err=%ld\n", + 
PTR_ERR(grp)); + return; + } + tbl->it_group = grp; + iommu_group_set_iommudata(grp, tbl, group_release); + name = kasprintf(GFP_KERNEL, "domain%d-pe%lx", + pci_domain_number, pe_num); + if (!name) + return; + iommu_group_set_name(grp, name); + kfree(name); +} + +enum dma_data_direction iommu_tce_direction(unsigned long tce) +{ + if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE)) + return DMA_BIDIRECTIONAL; + else if (tce & TCE_PCI_READ) + return DMA_TO_DEVICE; + else if (tce & TCE_PCI_WRITE) + return DMA_FROM_DEVICE; + else + return DMA_NONE; +} +EXPORT_SYMBOL_GPL(iommu_tce_direction); + +void iommu_flush_tce(struct iommu_table *tbl) +{ + /* Flush/invalidate TLB caches if necessary */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + /* Make sure updates are seen by hardware */ + mb(); +} +EXPORT_SYMBOL_GPL(iommu_flush_tce); + +int iommu_tce_clear_param_check(struct iommu_table *tbl, + unsigned long ioba, unsigned long tce_value, + unsigned long npages) +{ + /* ppc_md.tce_free() does not support any value but 0 */ + if (tce_value) + return -EINVAL; + + if (ioba & ~IOMMU_PAGE_MASK(tbl)) + return -EINVAL; + + ioba >>= tbl->it_page_shift; + if (ioba < tbl->it_offset) + return -EINVAL; + + if ((ioba + npages) > (tbl->it_offset + tbl->it_size)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check); + +int iommu_tce_put_param_check(struct iommu_table *tbl, + unsigned long ioba, unsigned long tce) +{ + if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ))) + return -EINVAL; + + if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ)) + return -EINVAL; + + if (ioba & ~IOMMU_PAGE_MASK(tbl)) + return -EINVAL; + + ioba >>= tbl->it_page_shift; + if (ioba < tbl->it_offset) + return -EINVAL; + + if ((ioba + 1) > (tbl->it_offset + tbl->it_size)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); + +unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) +{ + unsigned long oldtce; + struct 
iommu_pool *pool = get_pool(tbl, entry); + + spin_lock(&(pool->lock)); + + oldtce = ppc_md.tce_get(tbl, entry); + if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) + ppc_md.tce_free(tbl, entry, 1); + else + oldtce = 0; + + spin_unlock(&(pool->lock)); + + return oldtce; +} +EXPORT_SYMBOL_GPL(iommu_clear_tce); + +int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, + unsigned long entry, unsigned long pages) +{ + unsigned long oldtce; + struct page *page; + + for ( ; pages; --pages, ++entry) { + oldtce = iommu_clear_tce(tbl, entry); + if (!oldtce) + continue; + + page = pfn_to_page(oldtce >> PAGE_SHIFT); + WARN_ON(!page); + if (page) { + if (oldtce & TCE_PCI_WRITE) + SetPageDirty(page); + put_page(page); + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); + +/* + * hwaddr is a kernel virtual address here (0xc... bazillion), + * tce_build converts it to a physical address. + */ +int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, + unsigned long hwaddr, enum dma_data_direction direction) +{ + int ret = -EBUSY; + unsigned long oldtce; + struct iommu_pool *pool = get_pool(tbl, entry); + + spin_lock(&(pool->lock)); + + oldtce = ppc_md.tce_get(tbl, entry); + /* Add new entry if it is not busy */ + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))) + ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL); + + spin_unlock(&(pool->lock)); + + /* if (unlikely(ret)) + pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n", + __func__, hwaddr, entry << IOMMU_PAGE_SHIFT(tbl), + hwaddr, ret); */ + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_tce_build); + +int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, + unsigned long tce) +{ + int ret; + struct page *page = NULL; + unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; + enum dma_data_direction direction = iommu_tce_direction(tce); + + ret = get_user_pages_fast(tce & PAGE_MASK, 1, + direction != DMA_TO_DEVICE, &page); + if 
(unlikely(ret != 1)) {
+		/* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT(tbl), ret); */
+		return -EFAULT;
+	}
+	hwaddr = (unsigned long) page_address(page) + offset;
+
+	ret = iommu_tce_build(tbl, entry, hwaddr, direction);
+	if (ret)
+		put_page(page);
+
+	if (ret < 0)
+		pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
+			__func__, entry << tbl->it_page_shift, tce, ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
+
+/*
+ * Hand the whole table over to an external user.
+ *
+ * Succeeds only if no entry other than the bit #0 reservation is marked
+ * in it_map. On success every bit of it_map is set, all TCEs are cleared
+ * and, if the table has a set_bypass hook, the bypass window is disabled.
+ *
+ * Returns 0 on success or -EBUSY if the table still has entries in use;
+ * in the -EBUSY case the table is left exactly as it was found.
+ */
+int iommu_take_ownership(struct iommu_table *tbl)
+{
+	unsigned long sz = (tbl->it_size + 7) >> 3;
+
+	/* The bit #0 reservation must not count as an in-use entry below */
+	if (tbl->it_offset == 0)
+		clear_bit(0, tbl->it_map);
+
+	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+		pr_err("iommu_tce: it_map is not empty\n");
+		/*
+		 * Ownership was not taken, so put back the bit #0
+		 * reservation made by iommu_init_table(); otherwise
+		 * page 0 would become allocatable after a failed call.
+		 */
+		if (tbl->it_offset == 0)
+			set_bit(0, tbl->it_map);
+		return -EBUSY;
+	}
+
+	memset(tbl->it_map, 0xff, sz);
+	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+
+	/*
+	 * Disable iommu bypass, otherwise the user can DMA to all of
+	 * our physical memory via the bypass window instead of just
+	 * the pages that has been explicitly mapped into the iommu
+	 */
+	if (tbl->set_bypass)
+		tbl->set_bypass(tbl, false);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_take_ownership);
+
+void iommu_release_ownership(struct iommu_table *tbl)
+{
+	unsigned long sz = (tbl->it_size + 7) >> 3;
+
+	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+	memset(tbl->it_map, 0, sz);
+
+	/* Restore bit#0 set by iommu_init_table() */
+	if (tbl->it_offset == 0)
+		set_bit(0, tbl->it_map);
+
+	/* The kernel owns the device now, we can restore the iommu bypass */
+	if (tbl->set_bypass)
+		tbl->set_bypass(tbl, true);
+}
+EXPORT_SYMBOL_GPL(iommu_release_ownership);
+
+int iommu_add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		pr_warn("iommu_tce: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+ if (!tbl || !tbl->it_group) { + pr_debug("iommu_tce: skipping device %s with no tbl\n", + dev_name(dev)); + return 0; + } + + pr_debug("iommu_tce: adding %s to iommu group %d\n", + dev_name(dev), iommu_group_id(tbl->it_group)); + + if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) { + pr_err("iommu_tce: unsupported iommu page size."); + pr_err("%s has not been added\n", dev_name(dev)); + return -EINVAL; + } + + ret = iommu_group_add_device(tbl->it_group, dev); + if (ret < 0) + pr_err("iommu_tce: %s has not been added, ret=%d\n", + dev_name(dev), ret); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_add_device); + +void iommu_del_device(struct device *dev) +{ + /* + * Some devices might not have IOMMU table and group + * and we needn't detach them from the associated + * IOMMU groups + */ + if (!dev->iommu_group) { + pr_debug("iommu_tce: skipping device %s with no tbl\n", + dev_name(dev)); + return; + } + + iommu_group_remove_device(dev); +} +EXPORT_SYMBOL_GPL(iommu_del_device); + +#endif /* CONFIG_IOMMU_API */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c new file mode 100644 index 00000000000..248ee7e5beb --- /dev/null +++ b/arch/powerpc/kernel/irq.c @@ -0,0 +1,678 @@ +/* + * Derived from arch/i386/kernel/irq.c + * Copyright (C) 1992 Linus Torvalds + * Adapted from arch/i386 by Gary Thomas + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * Updated and modified by Cort Dougan <cort@fsmlabs.com> + * Copyright (C) 1996-2001 Cort Dougan + * Adapted for Power Macintosh by Paul Mackerras + * Copyright (C) 1996 Paul Mackerras (paulus@cs.anu.edu.au) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * This file contains the code used by various IRQ handling routines: + * asking for different IRQ's should be done through these routines + * instead of just grabbing them. Thus setups with different IRQ numbers + * shouldn't result in any weird surprises, and installing new handlers + * should be easier. + * + * The MPC8xx has an interrupt mask in the SIU. If a bit is set, the + * interrupt is _enabled_. As expected, IRQ0 is bit 0 in the 32-bit + * mask register (of which only 16 are defined), hence the weird shifting + * and complement of the cached_irq_mask. I want to be able to stuff + * this right into the SIU SMASK register. + * Many of the prep/chrp functions are conditional compiled on CONFIG_8xx + * to reduce code space and undefined function references. + */ + +#undef DEBUG + +#include <linux/export.h> +#include <linux/threads.h> +#include <linux/kernel_stat.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ptrace.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/timex.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/delay.h> +#include <linux/irq.h> +#include <linux/seq_file.h> +#include <linux/cpumask.h> +#include <linux/profile.h> +#include <linux/bitops.h> +#include <linux/list.h> +#include <linux/radix-tree.h> +#include <linux/mutex.h> +#include <linux/bootmem.h> +#include <linux/pci.h> +#include <linux/debugfs.h> +#include <linux/of.h> +#include <linux/of_irq.h> + +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/pgtable.h> +#include <asm/irq.h> +#include <asm/cache.h> +#include <asm/prom.h> +#include <asm/ptrace.h> +#include <asm/machdep.h> +#include <asm/udbg.h> +#include <asm/smp.h> +#include <asm/debug.h> + +#ifdef CONFIG_PPC64 +#include <asm/paca.h> +#include <asm/firmware.h> +#include <asm/lv1call.h> +#endif +#define CREATE_TRACE_POINTS +#include <asm/trace.h> + +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); + 
+int __irq_offset_value; + +#ifdef CONFIG_PPC32 +EXPORT_SYMBOL(__irq_offset_value); +atomic_t ppc_n_lost_interrupts; + +#ifdef CONFIG_TAU_INT +extern int tau_initialized; +extern int tau_interrupts(int); +#endif +#endif /* CONFIG_PPC32 */ + +#ifdef CONFIG_PPC64 + +int distribute_irqs = 1; + +static inline notrace unsigned long get_irq_happened(void) +{ + unsigned long happened; + + __asm__ __volatile__("lbz %0,%1(13)" + : "=r" (happened) : "i" (offsetof(struct paca_struct, irq_happened))); + + return happened; +} + +static inline notrace void set_soft_enabled(unsigned long enable) +{ + __asm__ __volatile__("stb %0,%1(13)" + : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); +} + +static inline notrace int decrementer_check_overflow(void) +{ + u64 now = get_tb_or_rtc(); + u64 *next_tb = &__get_cpu_var(decrementers_next_tb); + + return now >= *next_tb; +} + +/* This is called whenever we are re-enabling interrupts + * and returns either 0 (nothing to do) or 500/900/280/a00/e80 if + * there's an EE, DEC or DBELL to generate. + * + * This is called in two contexts: From arch_local_irq_restore() + * before soft-enabling interrupts, and from the exception exit + * path when returning from an interrupt from a soft-disabled to + * a soft enabled context. In both case we have interrupts hard + * disabled. + * + * We take care of only clearing the bits we handled in the + * PACA irq_happened field since we can only re-emit one at a + * time and we don't want to "lose" one. + */ +notrace unsigned int __check_irq_replay(void) +{ + /* + * We use local_paca rather than get_paca() to avoid all + * the debug_smp_processor_id() business in this low level + * function + */ + unsigned char happened = local_paca->irq_happened; + + /* Clear bit 0 which we wouldn't clear otherwise */ + local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; + + /* + * Force the delivery of pending soft-disabled interrupts on PS3. + * Any HV call will have this side effect. 
+ */ + if (firmware_has_feature(FW_FEATURE_PS3_LV1)) { + u64 tmp, tmp2; + lv1_get_version_info(&tmp, &tmp2); + } + + /* + * We may have missed a decrementer interrupt. We check the + * decrementer itself rather than the paca irq_happened field + * in case we also had a rollover while hard disabled + */ + local_paca->irq_happened &= ~PACA_IRQ_DEC; + if ((happened & PACA_IRQ_DEC) || decrementer_check_overflow()) + return 0x900; + + /* Finally check if an external interrupt happened */ + local_paca->irq_happened &= ~PACA_IRQ_EE; + if (happened & PACA_IRQ_EE) + return 0x500; + +#ifdef CONFIG_PPC_BOOK3E + /* Finally check if an EPR external interrupt happened + * this bit is typically set if we need to handle another + * "edge" interrupt from within the MPIC "EPR" handler + */ + local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE; + if (happened & PACA_IRQ_EE_EDGE) + return 0x500; + + local_paca->irq_happened &= ~PACA_IRQ_DBELL; + if (happened & PACA_IRQ_DBELL) + return 0x280; +#else + local_paca->irq_happened &= ~PACA_IRQ_DBELL; + if (happened & PACA_IRQ_DBELL) { + if (cpu_has_feature(CPU_FTR_HVMODE)) + return 0xe80; + return 0xa00; + } +#endif /* CONFIG_PPC_BOOK3E */ + + /* There should be nothing left ! */ + BUG_ON(local_paca->irq_happened != 0); + + return 0; +} + +notrace void arch_local_irq_restore(unsigned long en) +{ + unsigned char irq_happened; + unsigned int replay; + + /* Write the new soft-enabled value */ + set_soft_enabled(en); + if (!en) + return; + /* + * From this point onward, we can take interrupts, preempt, + * etc... unless we got hard-disabled. We check if an event + * happened. If none happened, we know we can just return. + * + * We may have preempted before the check below, in which case + * we are checking the "new" CPU instead of the old one. This + * is only a problem if an event happened on the "old" CPU. + * + * External interrupt events will have caused interrupts to + * be hard-disabled, so there is no problem, we + * cannot have preempted. 
+ */ + irq_happened = get_irq_happened(); + if (!irq_happened) + return; + + /* + * We need to hard disable to get a trusted value from + * __check_irq_replay(). We also need to soft-disable + * again to avoid warnings in there due to the use of + * per-cpu variables. + * + * We know that if the value in irq_happened is exactly 0x01 + * then we are already hard disabled (there are other less + * common cases that we'll ignore for now), so we skip the + * (expensive) mtmsrd. + */ + if (unlikely(irq_happened != PACA_IRQ_HARD_DIS)) + __hard_irq_disable(); +#ifdef CONFIG_TRACE_IRQFLAGS + else { + /* + * We should already be hard disabled here. We had bugs + * where that wasn't the case so let's dbl check it and + * warn if we are wrong. Only do that when IRQ tracing + * is enabled as mfmsr() can be costly. + */ + if (WARN_ON(mfmsr() & MSR_EE)) + __hard_irq_disable(); + } +#endif /* CONFIG_TRACE_IRQFLAG */ + + set_soft_enabled(0); + + /* + * Check if anything needs to be re-emitted. We haven't + * soft-enabled yet to avoid warnings in decrementer_check_overflow + * accessing per-cpu variables + */ + replay = __check_irq_replay(); + + /* We can soft-enable now */ + set_soft_enabled(1); + + /* + * And replay if we have to. This will return with interrupts + * hard-enabled. + */ + if (replay) { + __replay_interrupt(replay); + return; + } + + /* Finally, let's ensure we are hard enabled */ + __hard_irq_enable(); +} +EXPORT_SYMBOL(arch_local_irq_restore); + +/* + * This is specifically called by assembly code to re-enable interrupts + * if they are currently disabled. This is typically called before + * schedule() or do_signal() when returning to userspace. We do it + * in C to avoid the burden of dealing with lockdep etc... + * + * NOTE: This is called with interrupts hard disabled but not marked + * as such in paca->irq_happened, so we need to resync this. 
+ */ +void notrace restore_interrupts(void) +{ + if (irqs_disabled()) { + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + local_irq_enable(); + } else + __hard_irq_enable(); +} + +/* + * This is a helper to use when about to go into idle low-power + * when the latter has the side effect of re-enabling interrupts + * (such as calling H_CEDE under pHyp). + * + * You call this function with interrupts soft-disabled (this is + * already the case when ppc_md.power_save is called). The function + * will return whether to enter power save or just return. + * + * In the former case, it will have notified lockdep of interrupts + * being re-enabled and generally sanitized the lazy irq state, + * and in the latter case it will leave with interrupts hard + * disabled and marked as such, so the local_irq_enable() call + * in arch_cpu_idle() will properly re-enable everything. + */ +bool prep_irq_for_idle(void) +{ + /* + * First we need to hard disable to ensure no interrupt + * occurs before we effectively enter the low power state + */ + hard_irq_disable(); + + /* + * If anything happened while we were soft-disabled, + * we return now and do not enter the low power state. + */ + if (lazy_irq_pending()) + return false; + + /* Tell lockdep we are about to re-enable */ + trace_hardirqs_on(); + + /* + * Mark interrupts as soft-enabled and clear the + * PACA_IRQ_HARD_DIS from the pending mask since we + * are about to hard enable as well as a side effect + * of entering the low power state. 
+ */ + local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; + local_paca->soft_enabled = 1; + + /* Tell the caller to enter the low power state */ + return true; +} + +#endif /* CONFIG_PPC64 */ + +int arch_show_interrupts(struct seq_file *p, int prec) +{ + int j; + +#if defined(CONFIG_PPC32) && defined(CONFIG_TAU_INT) + if (tau_initialized) { + seq_printf(p, "%*s: ", prec, "TAU"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", tau_interrupts(j)); + seq_puts(p, " PowerPC Thermal Assist (cpu temp)\n"); + } +#endif /* CONFIG_PPC32 && CONFIG_TAU_INT */ + + seq_printf(p, "%*s: ", prec, "LOC"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_event); + seq_printf(p, " Local timer interrupts for timer event device\n"); + + seq_printf(p, "%*s: ", prec, "LOC"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_others); + seq_printf(p, " Local timer interrupts for others\n"); + + seq_printf(p, "%*s: ", prec, "SPU"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).spurious_irqs); + seq_printf(p, " Spurious interrupts\n"); + + seq_printf(p, "%*s: ", prec, "PMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).pmu_irqs); + seq_printf(p, " Performance monitoring interrupts\n"); + + seq_printf(p, "%*s: ", prec, "MCE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).mce_exceptions); + seq_printf(p, " Machine check exceptions\n"); + +#ifdef CONFIG_PPC_DOORBELL + if (cpu_has_feature(CPU_FTR_DBELL)) { + seq_printf(p, "%*s: ", prec, "DBL"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).doorbell_irqs); + seq_printf(p, " Doorbell interrupts\n"); + } +#endif + + return 0; +} + +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = per_cpu(irq_stat, cpu).timer_irqs_event; + + sum += per_cpu(irq_stat, cpu).pmu_irqs; + sum += per_cpu(irq_stat, cpu).mce_exceptions; + sum += per_cpu(irq_stat, 
cpu).spurious_irqs; + sum += per_cpu(irq_stat, cpu).timer_irqs_others; +#ifdef CONFIG_PPC_DOORBELL + sum += per_cpu(irq_stat, cpu).doorbell_irqs; +#endif + + return sum; +} + +#ifdef CONFIG_HOTPLUG_CPU +void migrate_irqs(void) +{ + struct irq_desc *desc; + unsigned int irq; + static int warned; + cpumask_var_t mask; + const struct cpumask *map = cpu_online_mask; + + alloc_cpumask_var(&mask, GFP_KERNEL); + + for_each_irq_desc(irq, desc) { + struct irq_data *data; + struct irq_chip *chip; + + data = irq_desc_get_irq_data(desc); + if (irqd_is_per_cpu(data)) + continue; + + chip = irq_data_get_irq_chip(data); + + cpumask_and(mask, data->affinity, map); + if (cpumask_any(mask) >= nr_cpu_ids) { + printk("Breaking affinity for irq %i\n", irq); + cpumask_copy(mask, map); + } + if (chip->irq_set_affinity) + chip->irq_set_affinity(data, mask, true); + else if (desc->action && !(warned++)) + printk("Cannot set affinity for irq %i\n", irq); + } + + free_cpumask_var(mask); + + local_irq_enable(); + mdelay(1); + local_irq_disable(); +} +#endif + +static inline void check_stack_overflow(void) +{ +#ifdef CONFIG_DEBUG_STACKOVERFLOW + long sp; + + sp = __get_SP() & (THREAD_SIZE-1); + + /* check for stack overflow: is there less than 2KB free? */ + if (unlikely(sp < (sizeof(struct thread_info) + 2048))) { + printk("do_IRQ: stack overflow: %ld\n", + sp - sizeof(struct thread_info)); + dump_stack(); + } +#endif +} + +void __do_irq(struct pt_regs *regs) +{ + unsigned int irq; + + irq_enter(); + + trace_irq_entry(regs); + + check_stack_overflow(); + + /* + * Query the platform PIC for the interrupt & ack it. 
+ * + * This will typically lower the interrupt line to the CPU + */ + irq = ppc_md.get_irq(); + + /* We can hard enable interrupts now to allow perf interrupts */ + may_hard_irq_enable(); + + /* And finally process it */ + if (unlikely(irq == NO_IRQ)) + __get_cpu_var(irq_stat).spurious_irqs++; + else + generic_handle_irq(irq); + + trace_irq_exit(regs); + + irq_exit(); +} + +void do_IRQ(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + struct thread_info *curtp, *irqtp, *sirqtp; + + /* Switch to the irq stack to handle this */ + curtp = current_thread_info(); + irqtp = hardirq_ctx[raw_smp_processor_id()]; + sirqtp = softirq_ctx[raw_smp_processor_id()]; + + /* Already there ? */ + if (unlikely(curtp == irqtp || curtp == sirqtp)) { + __do_irq(regs); + set_irq_regs(old_regs); + return; + } + + /* Prepare the thread_info in the irq stack */ + irqtp->task = curtp->task; + irqtp->flags = 0; + + /* Copy the preempt_count so that the [soft]irq checks work. */ + irqtp->preempt_count = curtp->preempt_count; + + /* Switch stack and call */ + call_do_irq(regs, irqtp); + + /* Restore stack limit */ + irqtp->task = NULL; + + /* Copy back updates to the thread_info */ + if (irqtp->flags) + set_bits(irqtp->flags, &curtp->flags); + + set_irq_regs(old_regs); +} + +void __init init_IRQ(void) +{ + if (ppc_md.init_IRQ) + ppc_md.init_IRQ(); + + exc_lvl_ctx_init(); + + irq_ctx_init(); +} + +#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +struct thread_info *critirq_ctx[NR_CPUS] __read_mostly; +struct thread_info *dbgirq_ctx[NR_CPUS] __read_mostly; +struct thread_info *mcheckirq_ctx[NR_CPUS] __read_mostly; + +void exc_lvl_ctx_init(void) +{ + struct thread_info *tp; + int i, cpu_nr; + + for_each_possible_cpu(i) { +#ifdef CONFIG_PPC64 + cpu_nr = i; +#else +#ifdef CONFIG_SMP + cpu_nr = get_hard_smp_processor_id(i); +#else + cpu_nr = 0; +#endif +#endif + + memset((void *)critirq_ctx[cpu_nr], 0, THREAD_SIZE); + tp = critirq_ctx[cpu_nr]; + tp->cpu = cpu_nr; + 
tp->preempt_count = 0; + +#ifdef CONFIG_BOOKE + memset((void *)dbgirq_ctx[cpu_nr], 0, THREAD_SIZE); + tp = dbgirq_ctx[cpu_nr]; + tp->cpu = cpu_nr; + tp->preempt_count = 0; + + memset((void *)mcheckirq_ctx[cpu_nr], 0, THREAD_SIZE); + tp = mcheckirq_ctx[cpu_nr]; + tp->cpu = cpu_nr; + tp->preempt_count = HARDIRQ_OFFSET; +#endif + } +} +#endif + +struct thread_info *softirq_ctx[NR_CPUS] __read_mostly; +struct thread_info *hardirq_ctx[NR_CPUS] __read_mostly; + +void irq_ctx_init(void) +{ + struct thread_info *tp; + int i; + + for_each_possible_cpu(i) { + memset((void *)softirq_ctx[i], 0, THREAD_SIZE); + tp = softirq_ctx[i]; + tp->cpu = i; + + memset((void *)hardirq_ctx[i], 0, THREAD_SIZE); + tp = hardirq_ctx[i]; + tp->cpu = i; + } +} + +void do_softirq_own_stack(void) +{ + struct thread_info *curtp, *irqtp; + + curtp = current_thread_info(); + irqtp = softirq_ctx[smp_processor_id()]; + irqtp->task = curtp->task; + irqtp->flags = 0; + call_do_softirq(irqtp); + irqtp->task = NULL; + + /* Set any flag that may have been set on the + * alternate stack + */ + if (irqtp->flags) + set_bits(irqtp->flags, &curtp->flags); +} + +irq_hw_number_t virq_to_hw(unsigned int virq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + return WARN_ON(!irq_data) ? 0 : irq_data->hwirq; +} +EXPORT_SYMBOL_GPL(virq_to_hw); + +#ifdef CONFIG_SMP +int irq_choose_cpu(const struct cpumask *mask) +{ + int cpuid; + + if (cpumask_equal(mask, cpu_online_mask)) { + static int irq_rover; + static DEFINE_RAW_SPINLOCK(irq_rover_lock); + unsigned long flags; + + /* Round-robin distribution... 
*/ +do_round_robin: + raw_spin_lock_irqsave(&irq_rover_lock, flags); + + irq_rover = cpumask_next(irq_rover, cpu_online_mask); + if (irq_rover >= nr_cpu_ids) + irq_rover = cpumask_first(cpu_online_mask); + + cpuid = irq_rover; + + raw_spin_unlock_irqrestore(&irq_rover_lock, flags); + } else { + cpuid = cpumask_first_and(mask, cpu_online_mask); + if (cpuid >= nr_cpu_ids) + goto do_round_robin; + } + + return get_hard_smp_processor_id(cpuid); +} +#else +int irq_choose_cpu(const struct cpumask *mask) +{ + return hard_smp_processor_id(); +} +#endif + +int arch_early_irq_init(void) +{ + return 0; +} + +#ifdef CONFIG_PPC64 +static int __init setup_noirqdistrib(char *str) +{ + distribute_irqs = 0; + return 1; +} + +__setup("noirqdistrib", setup_noirqdistrib); +#endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c new file mode 100644 index 00000000000..0f199709796 --- /dev/null +++ b/arch/powerpc/kernel/isa-bridge.c @@ -0,0 +1,266 @@ +/* + * Routines for tracking a legacy ISA bridge + * + * Copyrigh 2007 Benjamin Herrenschmidt <benh@kernel.crashing.org>, IBM Corp. + * + * Some bits and pieces moved over from pci_64.c + * + * Copyrigh 2003 Anton Blanchard <anton@au.ibm.com>, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define DEBUG + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/notifier.h> + +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> + +unsigned long isa_io_base; /* NULL if no ISA bus */ +EXPORT_SYMBOL(isa_io_base); + +/* Cached ISA bridge dev. 
*/ +static struct device_node *isa_bridge_devnode; +struct pci_dev *isa_bridge_pcidev; +EXPORT_SYMBOL_GPL(isa_bridge_pcidev); + +#define ISA_SPACE_MASK 0x1 +#define ISA_SPACE_IO 0x1 + +static void pci_process_ISA_OF_ranges(struct device_node *isa_node, + unsigned long phb_io_base_phys) +{ + /* We should get some saner parsing here and remove these structs */ + struct pci_address { + u32 a_hi; + u32 a_mid; + u32 a_lo; + }; + + struct isa_address { + u32 a_hi; + u32 a_lo; + }; + + struct isa_range { + struct isa_address isa_addr; + struct pci_address pci_addr; + unsigned int size; + }; + + const struct isa_range *range; + unsigned long pci_addr; + unsigned int isa_addr; + unsigned int size; + int rlen = 0; + + range = of_get_property(isa_node, "ranges", &rlen); + if (range == NULL || (rlen < sizeof(struct isa_range))) + goto inval_range; + + /* From "ISA Binding to 1275" + * The ranges property is laid out as an array of elements, + * each of which comprises: + * cells 0 - 1: an ISA address + * cells 2 - 4: a PCI address + * (size depending on dev->n_addr_cells) + * cell 5: the size of the range + */ + if ((range->isa_addr.a_hi & ISA_SPACE_MASK) != ISA_SPACE_IO) { + range++; + rlen -= sizeof(struct isa_range); + if (rlen < sizeof(struct isa_range)) + goto inval_range; + } + if ((range->isa_addr.a_hi & ISA_SPACE_MASK) != ISA_SPACE_IO) + goto inval_range; + + isa_addr = range->isa_addr.a_lo; + pci_addr = (unsigned long) range->pci_addr.a_mid << 32 | + range->pci_addr.a_lo; + + /* Assume these are both zero. Note: We could fix that and + * do a proper parsing instead ... 
oh well, that will do for + * now as nobody uses fancy mappings for ISA bridges + */ + if ((pci_addr != 0) || (isa_addr != 0)) { + printk(KERN_ERR "unexpected isa to pci mapping: %s\n", + __func__); + return; + } + + /* Align size and make sure it's cropped to 64K */ + size = PAGE_ALIGN(range->size); + if (size > 0x10000) + size = 0x10000; + + __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, + size, _PAGE_NO_CACHE|_PAGE_GUARDED); + return; + +inval_range: + printk(KERN_ERR "no ISA IO ranges or unexpected isa range, " + "mapping 64k\n"); + __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, + 0x10000, _PAGE_NO_CACHE|_PAGE_GUARDED); +} + + +/** + * isa_bridge_find_early - Find and map the ISA IO space early before + * main PCI discovery. This is optionally called by + * the arch code when adding PCI PHBs to get early + * access to ISA IO ports + */ +void __init isa_bridge_find_early(struct pci_controller *hose) +{ + struct device_node *np, *parent = NULL, *tmp; + + /* If we already have an ISA bridge, bail off */ + if (isa_bridge_devnode != NULL) + return; + + /* For each "isa" node in the system. Note : we do a search by + * type and not by name. It might be better to do by name but that's + * what the code used to do and I don't want to break too much at + * once. 
We can look into changing that separately + */ + for_each_node_by_type(np, "isa") { + /* Look for our hose being a parent */ + for (parent = of_get_parent(np); parent;) { + if (parent == hose->dn) { + of_node_put(parent); + break; + } + tmp = parent; + parent = of_get_parent(parent); + of_node_put(tmp); + } + if (parent != NULL) + break; + } + if (np == NULL) + return; + isa_bridge_devnode = np; + + /* Now parse the "ranges" property and setup the ISA mapping */ + pci_process_ISA_OF_ranges(np, hose->io_base_phys); + + /* Set the global ISA io base to indicate we have an ISA bridge */ + isa_io_base = ISA_IO_BASE; + + pr_debug("ISA bridge (early) is %s\n", np->full_name); +} + +/** + * isa_bridge_find_late - Find and map the ISA IO space upon discovery of + * a new ISA bridge + */ +static void isa_bridge_find_late(struct pci_dev *pdev, + struct device_node *devnode) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + + /* Store ISA device node and PCI device */ + isa_bridge_devnode = of_node_get(devnode); + isa_bridge_pcidev = pdev; + + /* Now parse the "ranges" property and setup the ISA mapping */ + pci_process_ISA_OF_ranges(devnode, hose->io_base_phys); + + /* Set the global ISA io base to indicate we have an ISA bridge */ + isa_io_base = ISA_IO_BASE; + + pr_debug("ISA bridge (late) is %s on %s\n", + devnode->full_name, pci_name(pdev)); +} + +/** + * isa_bridge_remove - Remove/unmap an ISA bridge + */ +static void isa_bridge_remove(void) +{ + pr_debug("ISA bridge removed !\n"); + + /* Clear the global ISA io base to indicate that we have no more + * ISA bridge. Note that drivers don't quite handle that, though + * we should probably do something about it. But do we ever really + * have ISA bridges being removed on machines using legacy devices ? 
+ */ + isa_io_base = ISA_IO_BASE; + + /* Clear references to the bridge */ + of_node_put(isa_bridge_devnode); + isa_bridge_devnode = NULL; + isa_bridge_pcidev = NULL; + + /* Unmap the ISA area */ + __iounmap_at((void *)ISA_IO_BASE, 0x10000); +} + +/** + * isa_bridge_notify - Get notified of PCI devices addition/removal + */ +static int isa_bridge_notify(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + struct device_node *devnode = pci_device_to_OF_node(pdev); + + switch(action) { + case BUS_NOTIFY_ADD_DEVICE: + /* Check if we have an early ISA device, without PCI dev */ + if (isa_bridge_devnode && isa_bridge_devnode == devnode && + !isa_bridge_pcidev) { + pr_debug("ISA bridge PCI attached: %s\n", + pci_name(pdev)); + isa_bridge_pcidev = pdev; + } + + /* Check if we have no ISA device, and this happens to be one, + * register it as such if it has an OF device + */ + if (!isa_bridge_devnode && devnode && devnode->type && + !strcmp(devnode->type, "isa")) + isa_bridge_find_late(pdev, devnode); + + return 0; + case BUS_NOTIFY_DEL_DEVICE: + /* Check if this our existing ISA device */ + if (pdev == isa_bridge_pcidev || + (devnode && devnode == isa_bridge_devnode)) + isa_bridge_remove(); + return 0; + } + return 0; +} + +static struct notifier_block isa_bridge_notifier = { + .notifier_call = isa_bridge_notify +}; + +/** + * isa_bridge_init - register to be notified of ISA bridge addition/removal + * + */ +static int __init isa_bridge_init(void) +{ + bus_register_notifier(&pci_bus_type, &isa_bridge_notifier); + return 0; +} +arch_initcall(isa_bridge_init); diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c new file mode 100644 index 00000000000..a1ed8a8c7cb --- /dev/null +++ b/arch/powerpc/kernel/jump_label.c @@ -0,0 +1,25 @@ +/* + * Copyright 2010 Michael Ellerman, IBM Corp. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/jump_label.h> +#include <asm/code-patching.h> + +#ifdef HAVE_JUMP_LABEL +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + u32 *addr = (u32 *)(unsigned long)entry->code; + + if (type == JUMP_LABEL_ENABLE) + patch_branch(addr, entry->target, 0); + else + patch_instruction(addr, PPC_INST_NOP); +} +#endif diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c new file mode 100644 index 00000000000..8504657379f --- /dev/null +++ b/arch/powerpc/kernel/kgdb.c @@ -0,0 +1,494 @@ +/* + * PowerPC backend to the KGDB stub. + * + * 1998 (c) Michael AK Tesch (tesch@cs.wisc.edu) + * Copyright (C) 2003 Timesys Corporation. + * Copyright (C) 2004-2006 MontaVista Software, Inc. + * PPC64 Mods (C) 2005 Frank Rowand (frowand@mvista.com) + * PPC32 support restored by Vitaly Wool <vwool@ru.mvista.com> and + * Sergei Shtylyov <sshtylyov@ru.mvista.com> + * Copyright (C) 2007-2008 Wind River Systems, Inc. + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program as licensed "as is" without any warranty of any + * kind, whether express or implied. + */ + +#include <linux/kernel.h> +#include <linux/kgdb.h> +#include <linux/smp.h> +#include <linux/signal.h> +#include <linux/ptrace.h> +#include <linux/kdebug.h> +#include <asm/current.h> +#include <asm/processor.h> +#include <asm/machdep.h> +#include <asm/debug.h> +#include <linux/slab.h> + +/* + * This table contains the mapping between PowerPC hardware trap types, and + * signals, which are primarily what GDB understands. GDB and the kernel + * don't always agree on values, so we use constants taken from gdb-6.2. 
+ */ +static struct hard_trap_info +{ + unsigned int tt; /* Trap type code for powerpc */ + unsigned char signo; /* Signal that we map this trap into */ +} hard_trap_info[] = { + { 0x0100, 0x02 /* SIGINT */ }, /* system reset */ + { 0x0200, 0x0b /* SIGSEGV */ }, /* machine check */ + { 0x0300, 0x0b /* SIGSEGV */ }, /* data access */ + { 0x0400, 0x0b /* SIGSEGV */ }, /* instruction access */ + { 0x0500, 0x02 /* SIGINT */ }, /* external interrupt */ + { 0x0600, 0x0a /* SIGBUS */ }, /* alignment */ + { 0x0700, 0x05 /* SIGTRAP */ }, /* program check */ + { 0x0800, 0x08 /* SIGFPE */ }, /* fp unavailable */ + { 0x0900, 0x0e /* SIGALRM */ }, /* decrementer */ + { 0x0c00, 0x14 /* SIGCHLD */ }, /* system call */ +#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) + { 0x2002, 0x05 /* SIGTRAP */ }, /* debug */ +#if defined(CONFIG_FSL_BOOKE) + { 0x2010, 0x08 /* SIGFPE */ }, /* spe unavailable */ + { 0x2020, 0x08 /* SIGFPE */ }, /* spe unavailable */ + { 0x2030, 0x08 /* SIGFPE */ }, /* spe fp data */ + { 0x2040, 0x08 /* SIGFPE */ }, /* spe fp data */ + { 0x2050, 0x08 /* SIGFPE */ }, /* spe fp round */ + { 0x2060, 0x0e /* SIGILL */ }, /* performance monitor */ + { 0x2900, 0x08 /* SIGFPE */ }, /* apu unavailable */ + { 0x3100, 0x0e /* SIGALRM */ }, /* fixed interval timer */ + { 0x3200, 0x02 /* SIGINT */ }, /* watchdog */ +#else /* ! CONFIG_FSL_BOOKE */ + { 0x1000, 0x0e /* SIGALRM */ }, /* prog interval timer */ + { 0x1010, 0x0e /* SIGALRM */ }, /* fixed interval timer */ + { 0x1020, 0x02 /* SIGINT */ }, /* watchdog */ + { 0x2010, 0x08 /* SIGFPE */ }, /* fp unavailable */ + { 0x2020, 0x08 /* SIGFPE */ }, /* ap unavailable */ +#endif +#else /* ! (defined(CONFIG_40x) || defined(CONFIG_BOOKE)) */ + { 0x0d00, 0x05 /* SIGTRAP */ }, /* single-step */ +#if defined(CONFIG_8xx) + { 0x1000, 0x04 /* SIGILL */ }, /* software emulation */ +#else /* ! 
CONFIG_8xx */ + { 0x0f00, 0x04 /* SIGILL */ }, /* performance monitor */ + { 0x0f20, 0x08 /* SIGFPE */ }, /* altivec unavailable */ + { 0x1300, 0x05 /* SIGTRAP */ }, /* instruction address break */ +#if defined(CONFIG_PPC64) + { 0x1200, 0x05 /* SIGILL */ }, /* system error */ + { 0x1500, 0x04 /* SIGILL */ }, /* soft patch */ + { 0x1600, 0x04 /* SIGILL */ }, /* maintenance */ + { 0x1700, 0x08 /* SIGFPE */ }, /* altivec assist */ + { 0x1800, 0x04 /* SIGILL */ }, /* thermal */ +#else /* ! CONFIG_PPC64 */ + { 0x1400, 0x02 /* SIGINT */ }, /* SMI */ + { 0x1600, 0x08 /* SIGFPE */ }, /* altivec assist */ + { 0x1700, 0x04 /* SIGILL */ }, /* TAU */ + { 0x2000, 0x05 /* SIGTRAP */ }, /* run mode */ +#endif +#endif +#endif + { 0x0000, 0x00 } /* Must be last */ +}; + +static int computeSignal(unsigned int tt) +{ + struct hard_trap_info *ht; + + for (ht = hard_trap_info; ht->tt && ht->signo; ht++) + if (ht->tt == tt) + return ht->signo; + + return SIGHUP; /* default for things we don't know about */ +} + +/** + * + * kgdb_skipexception - Bail out of KGDB when we've been triggered. + * @exception: Exception vector number + * @regs: Current &struct pt_regs. + * + * On some architectures we need to skip a breakpoint exception when + * it occurs after a breakpoint has been removed. + * + */ +int kgdb_skipexception(int exception, struct pt_regs *regs) +{ + return kgdb_isremovedbreak(regs->nip); +} + +static int kgdb_call_nmi_hook(struct pt_regs *regs) +{ + kgdb_nmicallback(raw_smp_processor_id(), regs); + return 0; +} + +#ifdef CONFIG_SMP +void kgdb_roundup_cpus(unsigned long flags) +{ + smp_send_debugger_break(); +} +#endif + +/* KGDB functions to use existing PowerPC64 hooks. 
*/ +static int kgdb_debugger(struct pt_regs *regs) +{ + return !kgdb_handle_exception(1, computeSignal(TRAP(regs)), + DIE_OOPS, regs); +} + +static int kgdb_handle_breakpoint(struct pt_regs *regs) +{ + if (user_mode(regs)) + return 0; + + if (kgdb_handle_exception(1, SIGTRAP, 0, regs) != 0) + return 0; + + if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr)) + regs->nip += BREAK_INSTR_SIZE; + + return 1; +} + +static DEFINE_PER_CPU(struct thread_info, kgdb_thread_info); +static int kgdb_singlestep(struct pt_regs *regs) +{ + struct thread_info *thread_info, *exception_thread_info; + struct thread_info *backup_current_thread_info = + &__get_cpu_var(kgdb_thread_info); + + if (user_mode(regs)) + return 0; + + /* + * On Book E and perhaps other processors, singlestep is handled on + * the critical exception stack. This causes current_thread_info() + * to fail, since it locates the thread_info by masking off + * the low bits of the current stack pointer. We work around + * this issue by copying the thread_info from the kernel stack + * before calling kgdb_handle_exception, and copying it back + * afterwards. On most processors the copy is avoided since + * exception_thread_info == thread_info. + */ + thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1)); + exception_thread_info = current_thread_info(); + + if (thread_info != exception_thread_info) { + /* Save the original current_thread_info. */ + memcpy(backup_current_thread_info, exception_thread_info, sizeof *thread_info); + memcpy(exception_thread_info, thread_info, sizeof *thread_info); + } + + kgdb_handle_exception(0, SIGTRAP, 0, regs); + + if (thread_info != exception_thread_info) + /* Restore current_thread_info lastly. 
*/ + memcpy(exception_thread_info, backup_current_thread_info, sizeof *thread_info); + + return 1; +} + +static int kgdb_iabr_match(struct pt_regs *regs) +{ + if (user_mode(regs)) + return 0; + + if (kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs) != 0) + return 0; + return 1; +} + +static int kgdb_break_match(struct pt_regs *regs) +{ + if (user_mode(regs)) + return 0; + + if (kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs) != 0) + return 0; + return 1; +} + +#define PACK64(ptr, src) do { *(ptr++) = (src); } while (0) + +#define PACK32(ptr, src) do { \ + u32 *ptr32; \ + ptr32 = (u32 *)ptr; \ + *(ptr32++) = (src); \ + ptr = (unsigned long *)ptr32; \ + } while (0) + +void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) +{ + struct pt_regs *regs = (struct pt_regs *)(p->thread.ksp + + STACK_FRAME_OVERHEAD); + unsigned long *ptr = gdb_regs; + int reg; + + memset(gdb_regs, 0, NUMREGBYTES); + + /* Regs GPR0-2 */ + for (reg = 0; reg < 3; reg++) + PACK64(ptr, regs->gpr[reg]); + + /* Regs GPR3-13 are caller saved, not in regs->gpr[] */ + ptr += 11; + + /* Regs GPR14-31 */ + for (reg = 14; reg < 32; reg++) + PACK64(ptr, regs->gpr[reg]); + +#ifdef CONFIG_FSL_BOOKE +#ifdef CONFIG_SPE + for (reg = 0; reg < 32; reg++) + PACK64(ptr, p->thread.evr[reg]); +#else + ptr += 32; +#endif +#else + /* fp registers not used by kernel, leave zero */ + ptr += 32 * 8 / sizeof(long); +#endif + + PACK64(ptr, regs->nip); + PACK64(ptr, regs->msr); + PACK32(ptr, regs->ccr); + PACK64(ptr, regs->link); + PACK64(ptr, regs->ctr); + PACK32(ptr, regs->xer); + + BUG_ON((unsigned long)ptr > + (unsigned long)(((void *)gdb_regs) + NUMREGBYTES)); +} + +#define GDB_SIZEOF_REG sizeof(unsigned long) +#define GDB_SIZEOF_REG_U32 sizeof(u32) + +#ifdef CONFIG_FSL_BOOKE +#define GDB_SIZEOF_FLOAT_REG sizeof(unsigned long) +#else +#define GDB_SIZEOF_FLOAT_REG sizeof(u64) +#endif + +struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = +{ + { "r0", GDB_SIZEOF_REG, 
offsetof(struct pt_regs, gpr[0]) }, + { "r1", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[1]) }, + { "r2", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[2]) }, + { "r3", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[3]) }, + { "r4", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[4]) }, + { "r5", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[5]) }, + { "r6", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[6]) }, + { "r7", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[7]) }, + { "r8", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[8]) }, + { "r9", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[9]) }, + { "r10", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[10]) }, + { "r11", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[11]) }, + { "r12", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[12]) }, + { "r13", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[13]) }, + { "r14", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[14]) }, + { "r15", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[15]) }, + { "r16", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[16]) }, + { "r17", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[17]) }, + { "r18", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[18]) }, + { "r19", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[19]) }, + { "r20", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[20]) }, + { "r21", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[21]) }, + { "r22", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[22]) }, + { "r23", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[23]) }, + { "r24", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[24]) }, + { "r25", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[25]) }, + { "r26", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[26]) }, + { "r27", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[27]) }, + { "r28", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[28]) }, + { "r29", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[29]) }, + { "r30", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[30]) }, + { "r31", GDB_SIZEOF_REG, offsetof(struct pt_regs, gpr[31]) 
}, + + { "f0", GDB_SIZEOF_FLOAT_REG, 0 }, + { "f1", GDB_SIZEOF_FLOAT_REG, 1 }, + { "f2", GDB_SIZEOF_FLOAT_REG, 2 }, + { "f3", GDB_SIZEOF_FLOAT_REG, 3 }, + { "f4", GDB_SIZEOF_FLOAT_REG, 4 }, + { "f5", GDB_SIZEOF_FLOAT_REG, 5 }, + { "f6", GDB_SIZEOF_FLOAT_REG, 6 }, + { "f7", GDB_SIZEOF_FLOAT_REG, 7 }, + { "f8", GDB_SIZEOF_FLOAT_REG, 8 }, + { "f9", GDB_SIZEOF_FLOAT_REG, 9 }, + { "f10", GDB_SIZEOF_FLOAT_REG, 10 }, + { "f11", GDB_SIZEOF_FLOAT_REG, 11 }, + { "f12", GDB_SIZEOF_FLOAT_REG, 12 }, + { "f13", GDB_SIZEOF_FLOAT_REG, 13 }, + { "f14", GDB_SIZEOF_FLOAT_REG, 14 }, + { "f15", GDB_SIZEOF_FLOAT_REG, 15 }, + { "f16", GDB_SIZEOF_FLOAT_REG, 16 }, + { "f17", GDB_SIZEOF_FLOAT_REG, 17 }, + { "f18", GDB_SIZEOF_FLOAT_REG, 18 }, + { "f19", GDB_SIZEOF_FLOAT_REG, 19 }, + { "f20", GDB_SIZEOF_FLOAT_REG, 20 }, + { "f21", GDB_SIZEOF_FLOAT_REG, 21 }, + { "f22", GDB_SIZEOF_FLOAT_REG, 22 }, + { "f23", GDB_SIZEOF_FLOAT_REG, 23 }, + { "f24", GDB_SIZEOF_FLOAT_REG, 24 }, + { "f25", GDB_SIZEOF_FLOAT_REG, 25 }, + { "f26", GDB_SIZEOF_FLOAT_REG, 26 }, + { "f27", GDB_SIZEOF_FLOAT_REG, 27 }, + { "f28", GDB_SIZEOF_FLOAT_REG, 28 }, + { "f29", GDB_SIZEOF_FLOAT_REG, 29 }, + { "f30", GDB_SIZEOF_FLOAT_REG, 30 }, + { "f31", GDB_SIZEOF_FLOAT_REG, 31 }, + + { "pc", GDB_SIZEOF_REG, offsetof(struct pt_regs, nip) }, + { "msr", GDB_SIZEOF_REG, offsetof(struct pt_regs, msr) }, + { "cr", GDB_SIZEOF_REG_U32, offsetof(struct pt_regs, ccr) }, + { "lr", GDB_SIZEOF_REG, offsetof(struct pt_regs, link) }, + { "ctr", GDB_SIZEOF_REG_U32, offsetof(struct pt_regs, ctr) }, + { "xer", GDB_SIZEOF_REG, offsetof(struct pt_regs, xer) }, +}; + +char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) +{ + if (regno >= DBG_MAX_REG_NUM || regno < 0) + return NULL; + + if (regno < 32 || regno >= 64) + /* First 0 -> 31 gpr registers*/ + /* pc, msr, ls... 
registers 64 -> 69 */ + memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, + dbg_reg_def[regno].size); + + if (regno >= 32 && regno < 64) { + /* FP registers 32 -> 63 */ +#if defined(CONFIG_FSL_BOOKE) && defined(CONFIG_SPE) + if (current) + memcpy(mem, &current->thread.evr[regno-32], + dbg_reg_def[regno].size); +#else + /* fp registers not used by kernel, leave zero */ + memset(mem, 0, dbg_reg_def[regno].size); +#endif + } + + return dbg_reg_def[regno].name; +} + +int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) +{ + if (regno >= DBG_MAX_REG_NUM || regno < 0) + return -EINVAL; + + if (regno < 32 || regno >= 64) + /* First 0 -> 31 gpr registers*/ + /* pc, msr, ls... registers 64 -> 69 */ + memcpy((void *)regs + dbg_reg_def[regno].offset, mem, + dbg_reg_def[regno].size); + + if (regno >= 32 && regno < 64) { + /* FP registers 32 -> 63 */ +#if defined(CONFIG_FSL_BOOKE) && defined(CONFIG_SPE) + memcpy(&current->thread.evr[regno-32], mem, + dbg_reg_def[regno].size); +#else + /* fp registers not used by kernel, leave zero */ + return 0; +#endif + } + + return 0; +} + +void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc) +{ + regs->nip = pc; +} + +/* + * This function does PowerPC specific processing for interfacing to gdb. + */ +int kgdb_arch_handle_exception(int vector, int signo, int err_code, + char *remcom_in_buffer, char *remcom_out_buffer, + struct pt_regs *linux_regs) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long addr; + + switch (remcom_in_buffer[0]) { + /* + * sAA..AA Step one instruction from AA..AA + * This will return an error to gdb .. 
+ */ + case 's': + case 'c': + /* handle the optional parameter */ + if (kgdb_hex2long(&ptr, &addr)) + linux_regs->nip = addr; + + atomic_set(&kgdb_cpu_doing_single_step, -1); + /* set the trace bit if we're stepping */ + if (remcom_in_buffer[0] == 's') { +#ifdef CONFIG_PPC_ADV_DEBUG_REGS + mtspr(SPRN_DBCR0, + mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM); + linux_regs->msr |= MSR_DE; +#else + linux_regs->msr |= MSR_SE; +#endif + atomic_set(&kgdb_cpu_doing_single_step, + raw_smp_processor_id()); + } + return 0; + } + + return -1; +} + +/* + * Global data + */ +struct kgdb_arch arch_kgdb_ops = { + .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08}, +}; + +static int kgdb_not_implemented(struct pt_regs *regs) +{ + return 0; +} + +static void *old__debugger_ipi; +static void *old__debugger; +static void *old__debugger_bpt; +static void *old__debugger_sstep; +static void *old__debugger_iabr_match; +static void *old__debugger_break_match; +static void *old__debugger_fault_handler; + +int kgdb_arch_init(void) +{ + old__debugger_ipi = __debugger_ipi; + old__debugger = __debugger; + old__debugger_bpt = __debugger_bpt; + old__debugger_sstep = __debugger_sstep; + old__debugger_iabr_match = __debugger_iabr_match; + old__debugger_break_match = __debugger_break_match; + old__debugger_fault_handler = __debugger_fault_handler; + + __debugger_ipi = kgdb_call_nmi_hook; + __debugger = kgdb_debugger; + __debugger_bpt = kgdb_handle_breakpoint; + __debugger_sstep = kgdb_singlestep; + __debugger_iabr_match = kgdb_iabr_match; + __debugger_break_match = kgdb_break_match; + __debugger_fault_handler = kgdb_not_implemented; + + return 0; +} + +void kgdb_arch_exit(void) +{ + __debugger_ipi = old__debugger_ipi; + __debugger = old__debugger; + __debugger_bpt = old__debugger_bpt; + __debugger_sstep = old__debugger_sstep; + __debugger_iabr_match = old__debugger_iabr_match; + __debugger_break_match = old__debugger_break_match; + __debugger_fault_handler = old__debugger_fault_handler; +} diff --git 
a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c new file mode 100644 index 00000000000..2f72af82513 --- /dev/null +++ b/arch/powerpc/kernel/kprobes.c @@ -0,0 +1,559 @@ +/* + * Kernel Probes (KProbes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes + * interface to access function arguments. 
+ * 2004-Nov Ananth N Mavinakayanahalli <ananth@in.ibm.com> kprobes port + * for PPC64 + */ + +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/preempt.h> +#include <linux/module.h> +#include <linux/kdebug.h> +#include <linux/slab.h> +#include <asm/code-patching.h> +#include <asm/cacheflush.h> +#include <asm/sstep.h> +#include <asm/uaccess.h> + +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}}; + +int __kprobes arch_prepare_kprobe(struct kprobe *p) +{ + int ret = 0; + kprobe_opcode_t insn = *p->addr; + + if ((unsigned long)p->addr & 0x03) { + printk("Attempt to register kprobe at an unaligned address\n"); + ret = -EINVAL; + } else if (IS_MTMSRD(insn) || IS_RFID(insn) || IS_RFI(insn)) { + printk("Cannot register a kprobe on rfi/rfid or mtmsr[d]\n"); + ret = -EINVAL; + } + + /* insn must be on a special executable page on ppc64. This is + * not explicitly required on ppc32 (right now), but it doesn't hurt */ + if (!ret) { + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) + ret = -ENOMEM; + } + + if (!ret) { + memcpy(p->ainsn.insn, p->addr, + MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + p->opcode = *p->addr; + flush_icache_range((unsigned long)p->ainsn.insn, + (unsigned long)p->ainsn.insn + sizeof(kprobe_opcode_t)); + } + + p->ainsn.boostable = 0; + return ret; +} + +void __kprobes arch_arm_kprobe(struct kprobe *p) +{ + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void __kprobes arch_disarm_kprobe(struct kprobe *p) +{ + *p->addr = p->opcode; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void __kprobes arch_remove_kprobe(struct kprobe *p) +{ + if (p->ainsn.insn) { + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; + } +} + +static void __kprobes 
prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +{ + enable_single_step(regs); + + /* + * On powerpc we should single step on the original + * instruction even if the probed insn is a trap + * variant as values in regs could play a part in + * if the trap is taken or not + */ + regs->nip = (unsigned long)p->ainsn.insn; +} + +static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; + kcb->prev_kprobe.saved_msr = kcb->kprobe_saved_msr; +} + +static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->kprobe_saved_msr = kcb->prev_kprobe.saved_msr; +} + +static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = p; + kcb->kprobe_saved_msr = regs->msr; +} + +void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + ri->ret_addr = (kprobe_opcode_t *)regs->link; + + /* Replace the return addr with trampoline addr */ + regs->link = (unsigned long)kretprobe_trampoline; +} + +static int __kprobes kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *p; + int ret = 0; + unsigned int *addr = (unsigned int *)regs->nip; + struct kprobe_ctlblk *kcb; + + /* + * We don't want to be preempted for the entire + * duration of kprobe processing + */ + preempt_disable(); + kcb = get_kprobe_ctlblk(); + + /* Check we're not actually recursing */ + if (kprobe_running()) { + p = get_kprobe(addr); + if (p) { + kprobe_opcode_t insn = *p->ainsn.insn; + if (kcb->kprobe_status == KPROBE_HIT_SS && + is_trap(insn)) { + /* Turn off 'trace' bits */ + regs->msr &= ~MSR_SINGLESTEP; + regs->msr |= kcb->kprobe_saved_msr; + goto no_kprobe; + } + /* We have reentered the kprobe_handler(), since + * another probe was hit while within the 
handler. + * We here save the original kprobes variables and + * just single step on the instruction of the new probe + * without calling any user handlers. + */ + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_saved_msr = regs->msr; + kprobes_inc_nmissed_count(p); + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_REENTER; + return 1; + } else { + if (*addr != BREAKPOINT_INSTRUCTION) { + /* If trap variant, then it belongs not to us */ + kprobe_opcode_t cur_insn = *addr; + if (is_trap(cur_insn)) + goto no_kprobe; + /* The breakpoint instruction was removed by + * another cpu right after we hit, no further + * handling of this interrupt is appropriate + */ + ret = 1; + goto no_kprobe; + } + p = __get_cpu_var(current_kprobe); + if (p->break_handler && p->break_handler(p, regs)) { + goto ss_probe; + } + } + goto no_kprobe; + } + + p = get_kprobe(addr); + if (!p) { + if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * PowerPC has multiple variants of the "trap" + * instruction. If the current instruction is a + * trap variant, it could belong to someone else + */ + kprobe_opcode_t cur_insn = *addr; + if (is_trap(cur_insn)) + goto no_kprobe; + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. 
+ */ + ret = 1; + } + /* Not one of ours: let kernel handle it */ + goto no_kprobe; + } + + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + set_current_kprobe(p, regs, kcb); + if (p->pre_handler && p->pre_handler(p, regs)) + /* handler has already set things up, so skip ss setup */ + return 1; + +ss_probe: + if (p->ainsn.boostable >= 0) { + unsigned int insn = *p->ainsn.insn; + + /* regs->nip is also adjusted if emulate_step returns 1 */ + ret = emulate_step(regs, insn); + if (ret > 0) { + /* + * Once this instruction has been boosted + * successfully, set the boostable flag + */ + if (unlikely(p->ainsn.boostable == 0)) + p->ainsn.boostable = 1; + + if (p->post_handler) + p->post_handler(p, regs, 0); + + kcb->kprobe_status = KPROBE_HIT_SSDONE; + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } else if (ret < 0) { + /* + * We don't allow kprobes on mtmsr(d)/rfi(d), etc. + * So, we should never get here... but, its still + * good to catch them, just in case... + */ + printk("Can't step on instruction %x\n", insn); + BUG(); + } else if (ret == 0) + /* This instruction can't be boosted */ + p->ainsn.boostable = -1; + } + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +/* + * Function return probe trampoline: + * - init_kprobes() establishes a probepoint here + * - When the probed function returns, this probe + * causes the handlers to fire + */ +static void __used kretprobe_trampoline_holder(void) +{ + asm volatile(".global kretprobe_trampoline\n" + "kretprobe_trampoline:\n" + "nop\n"); +} + +/* + * Called when the probe at kretprobe trampoline is hit + */ +static int __kprobes trampoline_probe_handler(struct kprobe *p, + struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head, empty_rp; + struct hlist_node *tmp; + unsigned long flags, orig_ret_address = 0; + unsigned long trampoline_address =(unsigned 
long)&kretprobe_trampoline; + + INIT_HLIST_HEAD(&empty_rp); + kretprobe_hash_lock(current, &head, &flags); + + /* + * It is possible to have multiple instances associated with a given + * task either because multiple functions in the call path + * have a return probe installed on them, and/or more than one + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri, &empty_rp); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + regs->nip = orig_ret_address; + + reset_current_kprobe(); + kretprobe_hash_unlock(current, &flags); + preempt_enable_no_resched(); + + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we don't want the post_handler + * to run (and have re-enabled preemption) + */ + return 1; +} + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "breakpoint" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. 
The address of this + * copy is p->ainsn.insn. + */ +static int __kprobes post_kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) + return 0; + + /* make sure we got here for instruction we have a kprobe on */ + if (((unsigned long)cur->ainsn.insn + 4) != regs->nip) + return 0; + + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); + } + + /* Adjust nip to after the single-stepped instruction */ + regs->nip = (unsigned long)cur->addr + 4; + regs->msr |= kcb->kprobe_saved_msr; + + /*Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + goto out; + } + reset_current_kprobe(); +out: + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, msr + * will have DE/SE set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->msr & MSR_SINGLESTEP) + return 0; + + return 1; +} + +int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + const struct exception_table_entry *entry; + + switch(kcb->kprobe_status) { + case KPROBE_HIT_SS: + case KPROBE_REENTER: + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the nip points back to the probe address + * and allow the page fault handler to continue as a + * normal page fault. 
+ */ + regs->nip = (unsigned long)cur->addr; + regs->msr &= ~MSR_SINGLESTEP; /* Turn off 'trace' bits */ + regs->msr |= kcb->kprobe_saved_msr; + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); + preempt_enable_no_resched(); + break; + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SSDONE: + /* + * We increment the nmissed count for accounting, + * we can also use npre/npostfault count for accounting + * these specific fault cases. + */ + kprobes_inc_nmissed_count(cur); + + /* + * We come here because instructions in the pre/post + * handler caused the page_fault, this could happen + * if handler tries to access user space by + * copy_from_user(), get_user() etc. Let the + * user-specified handler try to fix it first. + */ + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + + /* + * In case the user-specified fault handler returned + * zero, try to fix up. + */ + if ((entry = search_exception_tables(regs->nip)) != NULL) { + regs->nip = entry->fixup; + return 1; + } + + /* + * fixup_exception() could not handle it, + * Let do_page_fault() fix it. + */ + break; + default: + break; + } + return 0; +} + +/* + * Wrapper routine for handling exceptions. 
+ */ +int __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + + if (args->regs && user_mode(args->regs)) + return ret; + + switch (val) { + case DIE_BPT: + if (kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_SSTEP: + if (post_kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + default: + break; + } + return ret; +} + +unsigned long arch_deref_entry_point(void *entry) +{ + return ppc_global_function_entry(entry); +} + +int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + memcpy(&kcb->jprobe_saved_regs, regs, sizeof(struct pt_regs)); + + /* setup return addr to the jprobe handler routine */ + regs->nip = arch_deref_entry_point(jp->entry); +#ifdef CONFIG_PPC64 +#if defined(_CALL_ELF) && _CALL_ELF == 2 + regs->gpr[12] = (unsigned long)jp->entry; +#else + regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc); +#endif +#endif + + return 1; +} + +void __used __kprobes jprobe_return(void) +{ + asm volatile("trap" ::: "memory"); +} + +static void __used __kprobes jprobe_return_end(void) +{ +}; + +int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + /* + * FIXME - we should ideally be validating that we got here 'cos + * of the "trap" in jprobe_return() above, before restoring the + * saved regs... 
+ */ + memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs)); + preempt_enable_no_resched(); + return 1; +} + +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init_kprobes(void) +{ + return register_kprobe(&trampoline_p); +} + +int __kprobes arch_trampoline_kprobe(struct kprobe *p) +{ + if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) + return 1; + + return 0; +} diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c new file mode 100644 index 00000000000..33aa4ddf597 --- /dev/null +++ b/arch/powerpc/kernel/kvm.c @@ -0,0 +1,742 @@ +/* + * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved. + * Copyright 2010-2011 Freescale Semiconductor, Inc. + * + * Authors: + * Alexander Graf <agraf@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include <linux/kvm_host.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/kvm_para.h> +#include <linux/slab.h> +#include <linux/of.h> + +#include <asm/reg.h> +#include <asm/sections.h> +#include <asm/cacheflush.h> +#include <asm/disassemble.h> +#include <asm/ppc-opcode.h> +#include <asm/epapr_hcalls.h> + +#define KVM_MAGIC_PAGE (-4096L) +#define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x) + +#define KVM_INST_LWZ 0x80000000 +#define KVM_INST_STW 0x90000000 +#define KVM_INST_LD 0xe8000000 +#define KVM_INST_STD 0xf8000000 +#define KVM_INST_NOP 0x60000000 +#define KVM_INST_B 0x48000000 +#define KVM_INST_B_MASK 0x03ffffff +#define KVM_INST_B_MAX 0x01ffffff +#define KVM_INST_LI 0x38000000 + +#define KVM_MASK_RT 0x03e00000 +#define KVM_RT_30 0x03c00000 +#define KVM_MASK_RB 0x0000f800 +#define KVM_INST_MFMSR 0x7c0000a6 + +#define SPR_FROM 0 +#define SPR_TO 0x100 + +#define KVM_INST_SPR(sprn, moveto) (0x7c0002a6 | \ + (((sprn) & 0x1f) << 16) | \ + (((sprn) & 0x3e0) << 6) | \ + (moveto)) + +#define KVM_INST_MFSPR(sprn) KVM_INST_SPR(sprn, SPR_FROM) +#define KVM_INST_MTSPR(sprn) KVM_INST_SPR(sprn, SPR_TO) + +#define KVM_INST_TLBSYNC 0x7c00046c +#define KVM_INST_MTMSRD_L0 0x7c000164 +#define KVM_INST_MTMSRD_L1 0x7c010164 +#define KVM_INST_MTMSR 0x7c000124 + +#define KVM_INST_WRTEE 0x7c000106 +#define KVM_INST_WRTEEI_0 0x7c000146 +#define KVM_INST_WRTEEI_1 0x7c008146 + +#define KVM_INST_MTSRIN 0x7c0001e4 + +static bool kvm_patching_worked = true; +char kvm_tmp[1024 * 1024]; +static int kvm_tmp_index; + +static inline void kvm_patch_ins(u32 *inst, u32 new_inst) +{ + *inst = new_inst; + flush_icache_range((ulong)inst, (ulong)inst + 4); +} + +static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt) +{ +#ifdef CONFIG_64BIT + kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc)); +#else + kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000fffc)); +#endif +} + +static void kvm_patch_ins_ld(u32 *inst, long addr, u32 
rt) +{ +#ifdef CONFIG_64BIT + kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc)); +#else + kvm_patch_ins(inst, KVM_INST_LWZ | rt | ((addr + 4) & 0x0000fffc)); +#endif +} + +static void kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt) +{ + kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000ffff)); +} + +static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt) +{ +#ifdef CONFIG_64BIT + kvm_patch_ins(inst, KVM_INST_STD | rt | (addr & 0x0000fffc)); +#else + kvm_patch_ins(inst, KVM_INST_STW | rt | ((addr + 4) & 0x0000fffc)); +#endif +} + +static void kvm_patch_ins_stw(u32 *inst, long addr, u32 rt) +{ + kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc)); +} + +static void kvm_patch_ins_nop(u32 *inst) +{ + kvm_patch_ins(inst, KVM_INST_NOP); +} + +static void kvm_patch_ins_b(u32 *inst, int addr) +{ +#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_PPC_BOOK3S) + /* On relocatable kernels interrupts handlers and our code + can be in different regions, so we don't patch them */ + + if ((ulong)inst < (ulong)&__end_interrupts) + return; +#endif + + kvm_patch_ins(inst, KVM_INST_B | (addr & KVM_INST_B_MASK)); +} + +static u32 *kvm_alloc(int len) +{ + u32 *p; + + if ((kvm_tmp_index + len) > ARRAY_SIZE(kvm_tmp)) { + printk(KERN_ERR "KVM: No more space (%d + %d)\n", + kvm_tmp_index, len); + kvm_patching_worked = false; + return NULL; + } + + p = (void*)&kvm_tmp[kvm_tmp_index]; + kvm_tmp_index += len; + + return p; +} + +extern u32 kvm_emulate_mtmsrd_branch_offs; +extern u32 kvm_emulate_mtmsrd_reg_offs; +extern u32 kvm_emulate_mtmsrd_orig_ins_offs; +extern u32 kvm_emulate_mtmsrd_len; +extern u32 kvm_emulate_mtmsrd[]; + +static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt) +{ + u32 *p; + int distance_start; + int distance_end; + ulong next_inst; + + p = kvm_alloc(kvm_emulate_mtmsrd_len * 4); + if (!p) + return; + + /* Find out where we are and put everything there */ + distance_start = (ulong)p - (ulong)inst; + next_inst = ((ulong)inst + 4); + distance_end = 
next_inst - (ulong)&p[kvm_emulate_mtmsrd_branch_offs]; + + /* Make sure we only write valid b instructions */ + if (distance_start > KVM_INST_B_MAX) { + kvm_patching_worked = false; + return; + } + + /* Modify the chunk to fit the invocation */ + memcpy(p, kvm_emulate_mtmsrd, kvm_emulate_mtmsrd_len * 4); + p[kvm_emulate_mtmsrd_branch_offs] |= distance_end & KVM_INST_B_MASK; + switch (get_rt(rt)) { + case 30: + kvm_patch_ins_ll(&p[kvm_emulate_mtmsrd_reg_offs], + magic_var(scratch2), KVM_RT_30); + break; + case 31: + kvm_patch_ins_ll(&p[kvm_emulate_mtmsrd_reg_offs], + magic_var(scratch1), KVM_RT_30); + break; + default: + p[kvm_emulate_mtmsrd_reg_offs] |= rt; + break; + } + + p[kvm_emulate_mtmsrd_orig_ins_offs] = *inst; + flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsrd_len * 4); + + /* Patch the invocation */ + kvm_patch_ins_b(inst, distance_start); +} + +extern u32 kvm_emulate_mtmsr_branch_offs; +extern u32 kvm_emulate_mtmsr_reg1_offs; +extern u32 kvm_emulate_mtmsr_reg2_offs; +extern u32 kvm_emulate_mtmsr_orig_ins_offs; +extern u32 kvm_emulate_mtmsr_len; +extern u32 kvm_emulate_mtmsr[]; + +static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt) +{ + u32 *p; + int distance_start; + int distance_end; + ulong next_inst; + + p = kvm_alloc(kvm_emulate_mtmsr_len * 4); + if (!p) + return; + + /* Find out where we are and put everything there */ + distance_start = (ulong)p - (ulong)inst; + next_inst = ((ulong)inst + 4); + distance_end = next_inst - (ulong)&p[kvm_emulate_mtmsr_branch_offs]; + + /* Make sure we only write valid b instructions */ + if (distance_start > KVM_INST_B_MAX) { + kvm_patching_worked = false; + return; + } + + /* Modify the chunk to fit the invocation */ + memcpy(p, kvm_emulate_mtmsr, kvm_emulate_mtmsr_len * 4); + p[kvm_emulate_mtmsr_branch_offs] |= distance_end & KVM_INST_B_MASK; + + /* Make clobbered registers work too */ + switch (get_rt(rt)) { + case 30: + kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg1_offs], + magic_var(scratch2), KVM_RT_30); + 
kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg2_offs], + magic_var(scratch2), KVM_RT_30); + break; + case 31: + kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg1_offs], + magic_var(scratch1), KVM_RT_30); + kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg2_offs], + magic_var(scratch1), KVM_RT_30); + break; + default: + p[kvm_emulate_mtmsr_reg1_offs] |= rt; + p[kvm_emulate_mtmsr_reg2_offs] |= rt; + break; + } + + p[kvm_emulate_mtmsr_orig_ins_offs] = *inst; + flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsr_len * 4); + + /* Patch the invocation */ + kvm_patch_ins_b(inst, distance_start); +} + +#ifdef CONFIG_BOOKE + +extern u32 kvm_emulate_wrtee_branch_offs; +extern u32 kvm_emulate_wrtee_reg_offs; +extern u32 kvm_emulate_wrtee_orig_ins_offs; +extern u32 kvm_emulate_wrtee_len; +extern u32 kvm_emulate_wrtee[]; + +static void kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one) +{ + u32 *p; + int distance_start; + int distance_end; + ulong next_inst; + + p = kvm_alloc(kvm_emulate_wrtee_len * 4); + if (!p) + return; + + /* Find out where we are and put everything there */ + distance_start = (ulong)p - (ulong)inst; + next_inst = ((ulong)inst + 4); + distance_end = next_inst - (ulong)&p[kvm_emulate_wrtee_branch_offs]; + + /* Make sure we only write valid b instructions */ + if (distance_start > KVM_INST_B_MAX) { + kvm_patching_worked = false; + return; + } + + /* Modify the chunk to fit the invocation */ + memcpy(p, kvm_emulate_wrtee, kvm_emulate_wrtee_len * 4); + p[kvm_emulate_wrtee_branch_offs] |= distance_end & KVM_INST_B_MASK; + + if (imm_one) { + p[kvm_emulate_wrtee_reg_offs] = + KVM_INST_LI | __PPC_RT(R30) | MSR_EE; + } else { + /* Make clobbered registers work too */ + switch (get_rt(rt)) { + case 30: + kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs], + magic_var(scratch2), KVM_RT_30); + break; + case 31: + kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs], + magic_var(scratch1), KVM_RT_30); + break; + default: + p[kvm_emulate_wrtee_reg_offs] |= rt; + break; + } + } + + 
p[kvm_emulate_wrtee_orig_ins_offs] = *inst; + flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrtee_len * 4); + + /* Patch the invocation */ + kvm_patch_ins_b(inst, distance_start); +} + +extern u32 kvm_emulate_wrteei_0_branch_offs; +extern u32 kvm_emulate_wrteei_0_len; +extern u32 kvm_emulate_wrteei_0[]; + +static void kvm_patch_ins_wrteei_0(u32 *inst) +{ + u32 *p; + int distance_start; + int distance_end; + ulong next_inst; + + p = kvm_alloc(kvm_emulate_wrteei_0_len * 4); + if (!p) + return; + + /* Find out where we are and put everything there */ + distance_start = (ulong)p - (ulong)inst; + next_inst = ((ulong)inst + 4); + distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_0_branch_offs]; + + /* Make sure we only write valid b instructions */ + if (distance_start > KVM_INST_B_MAX) { + kvm_patching_worked = false; + return; + } + + memcpy(p, kvm_emulate_wrteei_0, kvm_emulate_wrteei_0_len * 4); + p[kvm_emulate_wrteei_0_branch_offs] |= distance_end & KVM_INST_B_MASK; + flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_0_len * 4); + + /* Patch the invocation */ + kvm_patch_ins_b(inst, distance_start); +} + +#endif + +#ifdef CONFIG_PPC_BOOK3S_32 + +extern u32 kvm_emulate_mtsrin_branch_offs; +extern u32 kvm_emulate_mtsrin_reg1_offs; +extern u32 kvm_emulate_mtsrin_reg2_offs; +extern u32 kvm_emulate_mtsrin_orig_ins_offs; +extern u32 kvm_emulate_mtsrin_len; +extern u32 kvm_emulate_mtsrin[]; + +static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb) +{ + u32 *p; + int distance_start; + int distance_end; + ulong next_inst; + + p = kvm_alloc(kvm_emulate_mtsrin_len * 4); + if (!p) + return; + + /* Find out where we are and put everything there */ + distance_start = (ulong)p - (ulong)inst; + next_inst = ((ulong)inst + 4); + distance_end = next_inst - (ulong)&p[kvm_emulate_mtsrin_branch_offs]; + + /* Make sure we only write valid b instructions */ + if (distance_start > KVM_INST_B_MAX) { + kvm_patching_worked = false; + return; + } + + /* Modify the chunk 
to fit the invocation */ + memcpy(p, kvm_emulate_mtsrin, kvm_emulate_mtsrin_len * 4); + p[kvm_emulate_mtsrin_branch_offs] |= distance_end & KVM_INST_B_MASK; + p[kvm_emulate_mtsrin_reg1_offs] |= (rb << 10); + p[kvm_emulate_mtsrin_reg2_offs] |= rt; + p[kvm_emulate_mtsrin_orig_ins_offs] = *inst; + flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtsrin_len * 4); + + /* Patch the invocation */ + kvm_patch_ins_b(inst, distance_start); +} + +#endif + +static void kvm_map_magic_page(void *data) +{ + u32 *features = data; + + ulong in[8] = {0}; + ulong out[8]; + + in[0] = KVM_MAGIC_PAGE; + in[1] = KVM_MAGIC_PAGE | MAGIC_PAGE_FLAG_NOT_MAPPED_NX; + + epapr_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE)); + + *features = out[0]; +} + +static void kvm_check_ins(u32 *inst, u32 features) +{ + u32 _inst = *inst; + u32 inst_no_rt = _inst & ~KVM_MASK_RT; + u32 inst_rt = _inst & KVM_MASK_RT; + + switch (inst_no_rt) { + /* Loads */ + case KVM_INST_MFMSR: + kvm_patch_ins_ld(inst, magic_var(msr), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SPRG0): + kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SPRG1): + kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SPRG2): + kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SPRG3): + kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SRR0): + kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SRR1): + kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt); + break; +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_DEAR): +#else + case KVM_INST_MFSPR(SPRN_DAR): +#endif + kvm_patch_ins_ld(inst, magic_var(dar), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_DSISR): + kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt); + break; + +#ifdef CONFIG_PPC_BOOK3E_MMU + case KVM_INST_MFSPR(SPRN_MAS0): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + 
kvm_patch_ins_lwz(inst, magic_var(mas0), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS1): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas1), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS2): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(mas2), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS3): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas7_3) + 4, inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS4): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas4), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS6): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas6), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS7): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas7_3), inst_rt); + break; +#endif /* CONFIG_PPC_BOOK3E_MMU */ + + case KVM_INST_MFSPR(SPRN_SPRG4): +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_SPRG4R): +#endif + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(sprg4), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SPRG5): +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_SPRG5R): +#endif + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(sprg5), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SPRG6): +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_SPRG6R): +#endif + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(sprg6), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SPRG7): +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_SPRG7R): +#endif + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(sprg7), inst_rt); + break; + +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_ESR): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(esr), inst_rt); + break; +#endif + + case 
KVM_INST_MFSPR(SPRN_PIR): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(pir), inst_rt); + break; + + + /* Stores */ + case KVM_INST_MTSPR(SPRN_SPRG0): + kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_SPRG1): + kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_SPRG2): + kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_SPRG3): + kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_SRR0): + kvm_patch_ins_std(inst, magic_var(srr0), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_SRR1): + kvm_patch_ins_std(inst, magic_var(srr1), inst_rt); + break; +#ifdef CONFIG_BOOKE + case KVM_INST_MTSPR(SPRN_DEAR): +#else + case KVM_INST_MTSPR(SPRN_DAR): +#endif + kvm_patch_ins_std(inst, magic_var(dar), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_DSISR): + kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt); + break; +#ifdef CONFIG_PPC_BOOK3E_MMU + case KVM_INST_MTSPR(SPRN_MAS0): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas0), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS1): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas1), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS2): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(mas2), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS3): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas7_3) + 4, inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS4): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas4), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS6): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas6), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS7): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, 
magic_var(mas7_3), inst_rt); + break; +#endif /* CONFIG_PPC_BOOK3E_MMU */ + + case KVM_INST_MTSPR(SPRN_SPRG4): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(sprg4), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_SPRG5): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(sprg5), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_SPRG6): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(sprg6), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_SPRG7): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(sprg7), inst_rt); + break; + +#ifdef CONFIG_BOOKE + case KVM_INST_MTSPR(SPRN_ESR): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(esr), inst_rt); + break; +#endif + + /* Nops */ + case KVM_INST_TLBSYNC: + kvm_patch_ins_nop(inst); + break; + + /* Rewrites */ + case KVM_INST_MTMSRD_L1: + kvm_patch_ins_mtmsrd(inst, inst_rt); + break; + case KVM_INST_MTMSR: + case KVM_INST_MTMSRD_L0: + kvm_patch_ins_mtmsr(inst, inst_rt); + break; +#ifdef CONFIG_BOOKE + case KVM_INST_WRTEE: + kvm_patch_ins_wrtee(inst, inst_rt, 0); + break; +#endif + } + + switch (inst_no_rt & ~KVM_MASK_RB) { +#ifdef CONFIG_PPC_BOOK3S_32 + case KVM_INST_MTSRIN: + if (features & KVM_MAGIC_FEAT_SR) { + u32 inst_rb = _inst & KVM_MASK_RB; + kvm_patch_ins_mtsrin(inst, inst_rt, inst_rb); + } + break; + break; +#endif + } + + switch (_inst) { +#ifdef CONFIG_BOOKE + case KVM_INST_WRTEEI_0: + kvm_patch_ins_wrteei_0(inst); + break; + + case KVM_INST_WRTEEI_1: + kvm_patch_ins_wrtee(inst, 0, 1); + break; +#endif + } +} + +extern u32 kvm_template_start[]; +extern u32 kvm_template_end[]; + +static void kvm_use_magic_page(void) +{ + u32 *p; + u32 *start, *end; + u32 tmp; + u32 features; + + /* Tell the host to map the magic page to -4096 on all CPUs */ + on_each_cpu(kvm_map_magic_page, &features, 1); + + /* Quick self-test to see if the mapping works */ + if 
(__get_user(tmp, (u32*)KVM_MAGIC_PAGE)) { + kvm_patching_worked = false; + return; + } + + /* Now loop through all code and find instructions */ + start = (void*)_stext; + end = (void*)_etext; + + /* + * Being interrupted in the middle of patching would + * be bad for SPRG4-7, which KVM can't keep in sync + * with emulated accesses because reads don't trap. + */ + local_irq_disable(); + + for (p = start; p < end; p++) { + /* Avoid patching the template code */ + if (p >= kvm_template_start && p < kvm_template_end) { + p = kvm_template_end - 1; + continue; + } + kvm_check_ins(p, features); + } + + local_irq_enable(); + + printk(KERN_INFO "KVM: Live patching for a fast VM %s\n", + kvm_patching_worked ? "worked" : "failed"); +} + +static __init void kvm_free_tmp(void) +{ + free_reserved_area(&kvm_tmp[kvm_tmp_index], + &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL); +} + +static int __init kvm_guest_init(void) +{ + if (!kvm_para_available()) + goto free_tmp; + + if (!epapr_paravirt_enabled) + goto free_tmp; + + if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE)) + kvm_use_magic_page(); + +#ifdef CONFIG_PPC_BOOK3S_64 + /* Enable napping */ + powersave_nap = 1; +#endif + +free_tmp: + kvm_free_tmp(); + + return 0; +} + +postcore_initcall(kvm_guest_init); diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S new file mode 100644 index 00000000000..e100ff324a8 --- /dev/null +++ b/arch/powerpc/kernel/kvm_emul.S @@ -0,0 +1,348 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright SUSE Linux Products GmbH 2010 + * Copyright 2010-2011 Freescale Semiconductor, Inc. + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/asm-offsets.h> + +#define KVM_MAGIC_PAGE (-4096) + +#ifdef CONFIG_64BIT +#define LL64(reg, offs, reg2) ld reg, (offs)(reg2) +#define STL64(reg, offs, reg2) std reg, (offs)(reg2) +#else +#define LL64(reg, offs, reg2) lwz reg, (offs + 4)(reg2) +#define STL64(reg, offs, reg2) stw reg, (offs + 4)(reg2) +#endif + +#define SCRATCH_SAVE \ + /* Enable critical section. We are critical if \ + shared->critical == r1 */ \ + STL64(r1, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0); \ + \ + /* Save state */ \ + PPC_STL r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH1)(0); \ + PPC_STL r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH2)(0); \ + mfcr r31; \ + stw r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH3)(0); + +#define SCRATCH_RESTORE \ + /* Restore state */ \ + PPC_LL r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH1)(0); \ + lwz r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH3)(0); \ + mtcr r30; \ + PPC_LL r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH2)(0); \ + \ + /* Disable critical section. We are critical if \ + shared->critical == r1 and r2 is always != r1 */ \ + STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0); + +.global kvm_template_start +kvm_template_start: + +.global kvm_emulate_mtmsrd +kvm_emulate_mtmsrd: + + SCRATCH_SAVE + + /* Put MSR & ~(MSR_EE|MSR_RI) in r31 */ + LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + lis r30, (~(MSR_EE | MSR_RI))@h + ori r30, r30, (~(MSR_EE | MSR_RI))@l + and r31, r31, r30 + + /* OR the register's (MSR_EE|MSR_RI) on MSR */ +kvm_emulate_mtmsrd_reg: + ori r30, r0, 0 + andi. 
r30, r30, (MSR_EE|MSR_RI) + or r31, r31, r30 + + /* Put MSR back into magic page */ + STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + /* Check if we have to fetch an interrupt */ + lwz r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0) + cmpwi r31, 0 + beq+ no_check + + /* Check if we may trigger an interrupt */ + andi. r30, r30, MSR_EE + beq no_check + + SCRATCH_RESTORE + + /* Nag hypervisor */ +kvm_emulate_mtmsrd_orig_ins: + tlbsync + + b kvm_emulate_mtmsrd_branch + +no_check: + + SCRATCH_RESTORE + + /* Go back to caller */ +kvm_emulate_mtmsrd_branch: + b . +kvm_emulate_mtmsrd_end: + +.global kvm_emulate_mtmsrd_branch_offs +kvm_emulate_mtmsrd_branch_offs: + .long (kvm_emulate_mtmsrd_branch - kvm_emulate_mtmsrd) / 4 + +.global kvm_emulate_mtmsrd_reg_offs +kvm_emulate_mtmsrd_reg_offs: + .long (kvm_emulate_mtmsrd_reg - kvm_emulate_mtmsrd) / 4 + +.global kvm_emulate_mtmsrd_orig_ins_offs +kvm_emulate_mtmsrd_orig_ins_offs: + .long (kvm_emulate_mtmsrd_orig_ins - kvm_emulate_mtmsrd) / 4 + +.global kvm_emulate_mtmsrd_len +kvm_emulate_mtmsrd_len: + .long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4 + + +#define MSR_SAFE_BITS (MSR_EE | MSR_RI) +#define MSR_CRITICAL_BITS ~MSR_SAFE_BITS + +.global kvm_emulate_mtmsr +kvm_emulate_mtmsr: + + SCRATCH_SAVE + + /* Fetch old MSR in r31 */ + LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + /* Find the changed bits between old and new MSR */ +kvm_emulate_mtmsr_reg1: + ori r30, r0, 0 + xor r31, r30, r31 + + /* Check if we need to really do mtmsr */ + LOAD_REG_IMMEDIATE(r30, MSR_CRITICAL_BITS) + and. r31, r31, r30 + + /* No critical bits changed? Maybe we can stay in the guest. 
*/ + beq maybe_stay_in_guest + +do_mtmsr: + + SCRATCH_RESTORE + + /* Just fire off the mtmsr if it's critical */ +kvm_emulate_mtmsr_orig_ins: + mtmsr r0 + + b kvm_emulate_mtmsr_branch + +maybe_stay_in_guest: + + /* Get the target register in r30 */ +kvm_emulate_mtmsr_reg2: + ori r30, r0, 0 + + /* Put MSR into magic page because we don't call mtmsr */ + STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + /* Check if we have to fetch an interrupt */ + lwz r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0) + cmpwi r31, 0 + beq+ no_mtmsr + + /* Check if we may trigger an interrupt */ + andi. r31, r30, MSR_EE + bne do_mtmsr + +no_mtmsr: + + SCRATCH_RESTORE + + /* Go back to caller */ +kvm_emulate_mtmsr_branch: + b . +kvm_emulate_mtmsr_end: + +.global kvm_emulate_mtmsr_branch_offs +kvm_emulate_mtmsr_branch_offs: + .long (kvm_emulate_mtmsr_branch - kvm_emulate_mtmsr) / 4 + +.global kvm_emulate_mtmsr_reg1_offs +kvm_emulate_mtmsr_reg1_offs: + .long (kvm_emulate_mtmsr_reg1 - kvm_emulate_mtmsr) / 4 + +.global kvm_emulate_mtmsr_reg2_offs +kvm_emulate_mtmsr_reg2_offs: + .long (kvm_emulate_mtmsr_reg2 - kvm_emulate_mtmsr) / 4 + +.global kvm_emulate_mtmsr_orig_ins_offs +kvm_emulate_mtmsr_orig_ins_offs: + .long (kvm_emulate_mtmsr_orig_ins - kvm_emulate_mtmsr) / 4 + +.global kvm_emulate_mtmsr_len +kvm_emulate_mtmsr_len: + .long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4 + +/* also used for wrteei 1 */ +.global kvm_emulate_wrtee +kvm_emulate_wrtee: + + SCRATCH_SAVE + + /* Fetch old MSR in r31 */ + LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + /* Insert new MSR[EE] */ +kvm_emulate_wrtee_reg: + ori r30, r0, 0 + rlwimi r31, r30, 0, MSR_EE + + /* + * If MSR[EE] is now set, check for a pending interrupt. + * We could skip this if MSR[EE] was already on, but that + * should be rare, so don't bother. + */ + andi. 
r30, r30, MSR_EE + + /* Put MSR into magic page because we don't call wrtee */ + STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + beq no_wrtee + + /* Check if we have to fetch an interrupt */ + lwz r30, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0) + cmpwi r30, 0 + bne do_wrtee + +no_wrtee: + SCRATCH_RESTORE + + /* Go back to caller */ +kvm_emulate_wrtee_branch: + b . + +do_wrtee: + SCRATCH_RESTORE + + /* Just fire off the wrtee if it's critical */ +kvm_emulate_wrtee_orig_ins: + wrtee r0 + + b kvm_emulate_wrtee_branch + +kvm_emulate_wrtee_end: + +.global kvm_emulate_wrtee_branch_offs +kvm_emulate_wrtee_branch_offs: + .long (kvm_emulate_wrtee_branch - kvm_emulate_wrtee) / 4 + +.global kvm_emulate_wrtee_reg_offs +kvm_emulate_wrtee_reg_offs: + .long (kvm_emulate_wrtee_reg - kvm_emulate_wrtee) / 4 + +.global kvm_emulate_wrtee_orig_ins_offs +kvm_emulate_wrtee_orig_ins_offs: + .long (kvm_emulate_wrtee_orig_ins - kvm_emulate_wrtee) / 4 + +.global kvm_emulate_wrtee_len +kvm_emulate_wrtee_len: + .long (kvm_emulate_wrtee_end - kvm_emulate_wrtee) / 4 + +.global kvm_emulate_wrteei_0 +kvm_emulate_wrteei_0: + SCRATCH_SAVE + + /* Fetch old MSR in r31 */ + LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + /* Remove MSR_EE from old MSR */ + rlwinm r31, r31, 0, ~MSR_EE + + /* Write new MSR value back */ + STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + SCRATCH_RESTORE + + /* Go back to caller */ +kvm_emulate_wrteei_0_branch: + b . +kvm_emulate_wrteei_0_end: + +.global kvm_emulate_wrteei_0_branch_offs +kvm_emulate_wrteei_0_branch_offs: + .long (kvm_emulate_wrteei_0_branch - kvm_emulate_wrteei_0) / 4 + +.global kvm_emulate_wrteei_0_len +kvm_emulate_wrteei_0_len: + .long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4 + +.global kvm_emulate_mtsrin +kvm_emulate_mtsrin: + + SCRATCH_SAVE + + LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + andi. 
r31, r31, MSR_DR | MSR_IR + beq kvm_emulate_mtsrin_reg1 + + SCRATCH_RESTORE + +kvm_emulate_mtsrin_orig_ins: + nop + b kvm_emulate_mtsrin_branch + +kvm_emulate_mtsrin_reg1: + /* rX >> 26 */ + rlwinm r30,r0,6,26,29 + +kvm_emulate_mtsrin_reg2: + stw r0, (KVM_MAGIC_PAGE + KVM_MAGIC_SR)(r30) + + SCRATCH_RESTORE + + /* Go back to caller */ +kvm_emulate_mtsrin_branch: + b . +kvm_emulate_mtsrin_end: + +.global kvm_emulate_mtsrin_branch_offs +kvm_emulate_mtsrin_branch_offs: + .long (kvm_emulate_mtsrin_branch - kvm_emulate_mtsrin) / 4 + +.global kvm_emulate_mtsrin_reg1_offs +kvm_emulate_mtsrin_reg1_offs: + .long (kvm_emulate_mtsrin_reg1 - kvm_emulate_mtsrin) / 4 + +.global kvm_emulate_mtsrin_reg2_offs +kvm_emulate_mtsrin_reg2_offs: + .long (kvm_emulate_mtsrin_reg2 - kvm_emulate_mtsrin) / 4 + +.global kvm_emulate_mtsrin_orig_ins_offs +kvm_emulate_mtsrin_orig_ins_offs: + .long (kvm_emulate_mtsrin_orig_ins - kvm_emulate_mtsrin) / 4 + +.global kvm_emulate_mtsrin_len +kvm_emulate_mtsrin_len: + .long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4 + +.global kvm_template_end +kvm_template_end: diff --git a/arch/powerpc/kernel/l2cr_6xx.S b/arch/powerpc/kernel/l2cr_6xx.S new file mode 100644 index 00000000000..97ec8557f97 --- /dev/null +++ b/arch/powerpc/kernel/l2cr_6xx.S @@ -0,0 +1,470 @@ +/* + L2CR functions + Copyright © 1997-1998 by PowerLogix R & D, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ +/* + Thur, Dec. 12, 1998. + - First public release, contributed by PowerLogix. + *********** + Sat, Aug. 7, 1999. + - Terry: Made sure code disabled interrupts before running. (Previously + it was assumed interrupts were already disabled). + - Terry: Updated for tentative G4 support. 4MB of memory is now flushed + instead of 2MB. (Prob. only 3 is necessary). + - Terry: Updated for workaround to HID0[DPM] processor bug + during global invalidates. + *********** + Thu, July 13, 2000. + - Terry: Added isync to correct for an errata. + + 22 August 2001. + - DanM: Finally added the 7450 patch I've had for the past + several months. The L2CR is similar, but I'm going + to assume the user of this functions knows what they + are doing. + + Author: Terry Greeniaus (tgree@phys.ualberta.ca) + Please e-mail updates to this file to me, thanks! +*/ +#include <asm/processor.h> +#include <asm/cputable.h> +#include <asm/ppc_asm.h> +#include <asm/cache.h> +#include <asm/page.h> + +/* Usage: + + When setting the L2CR register, you must do a few special + things. If you are enabling the cache, you must perform a + global invalidate. If you are disabling the cache, you must + flush the cache contents first. This routine takes care of + doing these things. When first enabling the cache, make sure + you pass in the L2CR you want, as well as passing in the + global invalidate bit set. A global invalidate will only be + performed if the L2I bit is set in applyThis. When enabling + the cache, you should also set the L2E bit in applyThis. If + you want to modify the L2CR contents after the cache has been + enabled, the recommended procedure is to first call + __setL2CR(0) to disable the cache and then call it again with + the new values for L2CR. 
/*
 * Summary: this procedure ignores the L2I bit in the value passed in,
 * flushes the cache if it was already enabled, always invalidates the
 * cache, then enables the cache if the L2E bit is set in the value
 * passed in.
 *   -- paulus.
 *
 * In:   r3 = requested L2CR value.
 * Out:  r3 = -1 when the CPU lacks CPU_FTR_L2CR; otherwise r3 holds the
 *       last value written to / read back from L2CR.
 * Uses: r0, r4, r5, r7 (saved MSR), r8 (saved HID0), r9 (saved LR),
 *       ctr, cr0.
 */
_GLOBAL(_set_L2CR)
	/* Make sure this is a 750 or 7400 chip */
BEGIN_FTR_SECTION
	li	r3,-1
	blr
END_FTR_SECTION_IFCLR(CPU_FTR_L2CR)

	mflr	r9

	/* Stop DST streams */
BEGIN_FTR_SECTION
	DSSALL
	sync
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)

	/* Turn off interrupts and data relocation. */
	mfmsr	r7		/* Save MSR in r7 */
	rlwinm	r4,r7,0,17,15	/* Turn off EE bit */
	rlwinm	r4,r4,0,28,26	/* Turn off DR bit */
	sync
	mtmsr	r4
	isync

	/* Before we perform the global invalidation, we must disable dynamic
	 * power management via HID0[DPM] to work around a processor bug where
	 * DPM can possibly interfere with the state machine in the processor
	 * that invalidates the L2 cache tags.
	 */
	mfspr	r8,SPRN_HID0		/* Save HID0 in r8 */
	rlwinm	r4,r8,0,12,10		/* Turn off HID0[DPM] */
	sync
	mtspr	SPRN_HID0,r4		/* Disable DPM */
	sync

	/* Get the current enable bit of the L2CR into r4 */
	mfspr	r4,SPRN_L2CR

	/* Tweak some bits */
	rlwinm	r5,r3,0,0,0		/* r5 contains the new enable bit */
	rlwinm	r3,r3,0,11,9		/* Turn off the invalidate bit */
	rlwinm	r3,r3,0,1,31		/* Turn off the enable bit */

	/* Check to see if we need to flush */
	rlwinm.	r4,r4,0,0,0
	beq	2f

	/* Flush the cache. First, read the first 4MB of memory (physical) to
	 * put new data in the cache.  (Actually we only need
	 * the size of the L2 cache plus the size of the L1 cache, but 4MB will
	 * cover everything just to be safe).
	 */

	 /**** Might be a good idea to set L2DO here - to prevent instructions
	       from getting into the cache.  But since we invalidate
	       the next time we enable the cache it doesn't really matter.
	       Don't do this unless you accommodate all processor variations.
	       The bit moved on the 7450.....
	  ****/

BEGIN_FTR_SECTION
	/* Disable L2 prefetch on some 745x and try to ensure
	 * L2 prefetch engines are idle. As explained by errata
	 * text, we can't be sure they are, we just hope very hard
	 * that well be enough (sic !). At least I noticed Apple
	 * doesn't even bother doing the dcbf's here...
	 */
	mfspr	r4,SPRN_MSSCR0
	rlwinm	r4,r4,0,0,29
	sync
	mtspr	SPRN_MSSCR0,r4
	sync
	isync
	lis	r4,KERNELBASE@h
	dcbf	0,r4
	dcbf	0,r4
	dcbf	0,r4
	dcbf	0,r4
END_FTR_SECTION_IFSET(CPU_FTR_SPEC7450)

	/* TODO: use HW flush assist when available */

	/* 0x20000 iterations * 32 bytes per line = 4MB */
	lis	r4,0x0002
	mtctr	r4
	li	r4,0
1:
	lwzx	r0,r0,r4
	addi	r4,r4,32		/* Go to start of next cache line */
	bdnz	1b
	isync

	/* Now, flush the first 4MB of memory */
	lis	r4,0x0002
	mtctr	r4
	li	r4,0
	sync
1:
	dcbf	0,r4
	addi	r4,r4,32		/* Go to start of next cache line */
	bdnz	1b

2:
	/* Set up the L2CR configuration bits (and switch L2 off) */
	/* CPU errata: Make sure the mtspr below is already in the
	 * L1 icache
	 */
	b	20f
	.balign	L1_CACHE_BYTES
22:
	sync
	mtspr	SPRN_L2CR,r3
	sync
	b	23f
20:
	b	21f
21:	sync
	isync
	b	22b

23:
	/* Perform a global invalidation */
	oris	r3,r3,0x0020
	sync
	mtspr	SPRN_L2CR,r3
	sync
	isync				/* For errata */

BEGIN_FTR_SECTION
	/* On the 7450, we wait for the L2I bit to clear......
	*/
10:	mfspr	r3,SPRN_L2CR
	andis.	r4,r3,0x0020
	bne	10b
	b	11f
END_FTR_SECTION_IFSET(CPU_FTR_SPEC7450)

	/* Wait for the invalidation to complete */
3:	mfspr	r3,SPRN_L2CR
	rlwinm.	r4,r3,0,31,31
	bne	3b

11:	rlwinm	r3,r3,0,11,9		/* Turn off the L2I bit */
	sync
	mtspr	SPRN_L2CR,r3
	sync

	/* See if we need to enable the cache */
	cmplwi	r5,0
	beq	4f

	/* Enable the cache */
	oris	r3,r3,0x8000
	mtspr	SPRN_L2CR,r3
	sync

	/* Enable L2 HW prefetch on 744x/745x */
BEGIN_FTR_SECTION
	mfspr	r3,SPRN_MSSCR0
	ori	r3,r3,3
	sync
	mtspr	SPRN_MSSCR0,r3
	sync
	isync
END_FTR_SECTION_IFSET(CPU_FTR_SPEC7450)
4:

	/* Restore HID0[DPM] to whatever it was before.  Use the symbolic
	 * SPR name, matching the save/modify sequence above, rather than
	 * the raw SPR number 1008.
	 */
	sync
	mtspr	SPRN_HID0,r8
	sync

	/* Restore MSR (restores EE and DR bits to original state) */
	SYNC
	mtmsr	r7
	isync

	mtlr	r9
	blr
+ */ + + /* TODO: use HW flush assist */ + + lis r4,0x0008 + mtctr r4 + li r4,0 +1: + lwzx r0,r0,r4 + dcbf 0,r4 + addi r4,r4,32 /* Go to start of next cache line */ + bdnz 1b + +2: + /* Set up the L3CR configuration bits (and switch L3 off) */ + sync + mtspr SPRN_L3CR,r3 + sync + + oris r3,r3,L3CR_L3RES@h /* Set reserved bit 5 */ + mtspr SPRN_L3CR,r3 + sync + oris r3,r3,L3CR_L3CLKEN@h /* Set clken */ + mtspr SPRN_L3CR,r3 + sync + + /* Wait for stabilize */ + li r0,256 + mtctr r0 +1: bdnz 1b + + /* Perform a global invalidation */ + ori r3,r3,0x0400 + sync + mtspr SPRN_L3CR,r3 + sync + isync + + /* We wait for the L3I bit to clear...... */ +10: mfspr r3,SPRN_L3CR + andi. r4,r3,0x0400 + bne 10b + + /* Clear CLKEN */ + rlwinm r3,r3,0,5,3 /* Turn off the clken bit */ + mtspr SPRN_L3CR,r3 + sync + + /* Wait for stabilize */ + li r0,256 + mtctr r0 +1: bdnz 1b + + /* See if we need to enable the cache */ + cmplwi r5,0 + beq 4f + + /* Enable the cache */ + oris r3,r3,(L3CR_L3E | L3CR_L3CLKEN)@h + mtspr SPRN_L3CR,r3 + sync + + /* Wait for stabilize */ + li r0,256 + mtctr r0 +1: bdnz 1b + + /* Restore MSR (restores EE and DR bits to original state) */ +4: SYNC + mtmsr r7 + isync + blr + +_GLOBAL(_get_L3CR) + /* Return the L3CR contents */ + li r3,0 +BEGIN_FTR_SECTION + mfspr r3,SPRN_L3CR +END_FTR_SECTION_IFSET(CPU_FTR_L3CR) + blr + +/* --- End of PowerLogix code --- + */ + + +/* flush_disable_L1() - Flush and disable L1 cache + * + * clobbers r0, r3, ctr, cr0 + * Must be called with interrupts disabled and MMU enabled. 
/* __flush_disable_L1()	- Flush and disable L1 cache
 *
 * Touches 512kB starting at KERNELBASE to displace the current cache
 * contents, flushes those same lines with dcbf, then clears the two
 * HID0 cache-enable bits.
 *
 * clobbers r0, r3, ctr, cr0
 * Must be called with interrupts disabled and MMU enabled.
 */
_GLOBAL(__flush_disable_L1)
	/* Stop pending AltiVec streams and memory accesses */
BEGIN_FTR_SECTION
	DSSALL
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	sync

	/* Load counter to 0x4000 cache lines (512k) and
	 * load cache with data
	 */
	li	r3,0x4000	/* 512kB / 32B */
	mtctr	r3
	lis	r3,KERNELBASE@h
1:
	lwz	r0,0(r3)
	addi	r3,r3,0x0020	/* Go to start of next cache line */
	bdnz	1b
	isync
	sync

	/* Now flush those cache lines */
	li	r3,0x4000	/* 512kB / 32B */
	mtctr	r3
	lis	r3,KERNELBASE@h
1:
	dcbf	0,r3
	addi	r3,r3,0x0020	/* Go to start of next cache line */
	bdnz	1b
	sync

	/* We can now disable the L1 cache (HID0:DCE, HID0:ICE) */
	mfspr	r3,SPRN_HID0
	rlwinm	r3,r3,0,18,15	/* mask keeps everything except bits 16-17 */
	mtspr	SPRN_HID0,r3
	sync
	isync
 	blr
do { } while(0) +#endif + +#define MAX_LEGACY_SERIAL_PORTS 8 + +static struct plat_serial8250_port +legacy_serial_ports[MAX_LEGACY_SERIAL_PORTS+1]; +static struct legacy_serial_info { + struct device_node *np; + unsigned int speed; + unsigned int clock; + int irq_check_parent; + phys_addr_t taddr; +} legacy_serial_infos[MAX_LEGACY_SERIAL_PORTS]; + +static struct of_device_id legacy_serial_parents[] __initdata = { + {.type = "soc",}, + {.type = "tsi-bridge",}, + {.type = "opb", }, + {.compatible = "ibm,opb",}, + {.compatible = "simple-bus",}, + {.compatible = "wrs,epld-localbus",}, + {}, +}; + +static unsigned int legacy_serial_count; +static int legacy_serial_console = -1; + +static const upf_t legacy_port_flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | + UPF_SHARE_IRQ | UPF_FIXED_PORT; + +static unsigned int tsi_serial_in(struct uart_port *p, int offset) +{ + unsigned int tmp; + offset = offset << p->regshift; + if (offset == UART_IIR) { + tmp = readl(p->membase + (UART_IIR & ~3)); + return (tmp >> 16) & 0xff; /* UART_IIR % 4 == 2 */ + } else + return readb(p->membase + offset); +} + +static void tsi_serial_out(struct uart_port *p, int offset, int value) +{ + offset = offset << p->regshift; + if (!((offset == UART_IER) && (value & UART_IER_UUE))) + writeb(value, p->membase + offset); +} + +static int __init add_legacy_port(struct device_node *np, int want_index, + int iotype, phys_addr_t base, + phys_addr_t taddr, unsigned long irq, + upf_t flags, int irq_check_parent) +{ + const __be32 *clk, *spd, *rs; + u32 clock = BASE_BAUD * 16; + u32 shift = 0; + int index; + + /* get clock freq. 
if present */ + clk = of_get_property(np, "clock-frequency", NULL); + if (clk && *clk) + clock = be32_to_cpup(clk); + + /* get default speed if present */ + spd = of_get_property(np, "current-speed", NULL); + + /* get register shift if present */ + rs = of_get_property(np, "reg-shift", NULL); + if (rs && *rs) + shift = be32_to_cpup(rs); + + /* If we have a location index, then try to use it */ + if (want_index >= 0 && want_index < MAX_LEGACY_SERIAL_PORTS) + index = want_index; + else + index = legacy_serial_count; + + /* if our index is still out of range, that mean that + * array is full, we could scan for a free slot but that + * make little sense to bother, just skip the port + */ + if (index >= MAX_LEGACY_SERIAL_PORTS) + return -1; + if (index >= legacy_serial_count) + legacy_serial_count = index + 1; + + /* Check if there is a port who already claimed our slot */ + if (legacy_serial_infos[index].np != NULL) { + /* if we still have some room, move it, else override */ + if (legacy_serial_count < MAX_LEGACY_SERIAL_PORTS) { + printk(KERN_DEBUG "Moved legacy port %d -> %d\n", + index, legacy_serial_count); + legacy_serial_ports[legacy_serial_count] = + legacy_serial_ports[index]; + legacy_serial_infos[legacy_serial_count] = + legacy_serial_infos[index]; + legacy_serial_count++; + } else { + printk(KERN_DEBUG "Replacing legacy port %d\n", index); + } + } + + /* Now fill the entry */ + memset(&legacy_serial_ports[index], 0, + sizeof(struct plat_serial8250_port)); + if (iotype == UPIO_PORT) + legacy_serial_ports[index].iobase = base; + else + legacy_serial_ports[index].mapbase = base; + + legacy_serial_ports[index].iotype = iotype; + legacy_serial_ports[index].uartclk = clock; + legacy_serial_ports[index].irq = irq; + legacy_serial_ports[index].flags = flags; + legacy_serial_ports[index].regshift = shift; + legacy_serial_infos[index].taddr = taddr; + legacy_serial_infos[index].np = of_node_get(np); + legacy_serial_infos[index].clock = clock; + 
legacy_serial_infos[index].speed = spd ? be32_to_cpup(spd) : 0; + legacy_serial_infos[index].irq_check_parent = irq_check_parent; + + if (iotype == UPIO_TSI) { + legacy_serial_ports[index].serial_in = tsi_serial_in; + legacy_serial_ports[index].serial_out = tsi_serial_out; + } + + printk(KERN_DEBUG "Found legacy serial port %d for %s\n", + index, np->full_name); + printk(KERN_DEBUG " %s=%llx, taddr=%llx, irq=%lx, clk=%d, speed=%d\n", + (iotype == UPIO_PORT) ? "port" : "mem", + (unsigned long long)base, (unsigned long long)taddr, irq, + legacy_serial_ports[index].uartclk, + legacy_serial_infos[index].speed); + + return index; +} + +static int __init add_legacy_soc_port(struct device_node *np, + struct device_node *soc_dev) +{ + u64 addr; + const __be32 *addrp; + struct device_node *tsi = of_get_parent(np); + + /* We only support ports that have a clock frequency properly + * encoded in the device-tree. + */ + if (of_get_property(np, "clock-frequency", NULL) == NULL) + return -1; + + /* if reg-offset don't try to use it */ + if ((of_get_property(np, "reg-offset", NULL) != NULL)) + return -1; + + /* if rtas uses this device, don't try to use it as well */ + if (of_get_property(np, "used-by-rtas", NULL) != NULL) + return -1; + + /* Get the address */ + addrp = of_get_address(soc_dev, 0, NULL, NULL); + if (addrp == NULL) + return -1; + + addr = of_translate_address(soc_dev, addrp); + if (addr == OF_BAD_ADDR) + return -1; + + /* Add port, irq will be dealt with later. We passed a translated + * IO port value. 
It will be fixed up later along with the irq + */ + if (tsi && !strcmp(tsi->type, "tsi-bridge")) + return add_legacy_port(np, -1, UPIO_TSI, addr, addr, + NO_IRQ, legacy_port_flags, 0); + else + return add_legacy_port(np, -1, UPIO_MEM, addr, addr, + NO_IRQ, legacy_port_flags, 0); +} + +static int __init add_legacy_isa_port(struct device_node *np, + struct device_node *isa_brg) +{ + const __be32 *reg; + const char *typep; + int index = -1; + u64 taddr; + + DBG(" -> add_legacy_isa_port(%s)\n", np->full_name); + + /* Get the ISA port number */ + reg = of_get_property(np, "reg", NULL); + if (reg == NULL) + return -1; + + /* Verify it's an IO port, we don't support anything else */ + if (!(be32_to_cpu(reg[0]) & 0x00000001)) + return -1; + + /* Now look for an "ibm,aix-loc" property that gives us ordering + * if any... + */ + typep = of_get_property(np, "ibm,aix-loc", NULL); + + /* If we have a location index, then use it */ + if (typep && *typep == 'S') + index = simple_strtol(typep+1, NULL, 0) - 1; + + /* Translate ISA address. If it fails, we still register the port + * with no translated address so that it can be picked up as an IO + * port later by the serial driver + * + * Note: Don't even try on P8 lpc, we know it's not directly mapped + */ + if (!of_device_is_compatible(isa_brg, "ibm,power8-lpc")) { + taddr = of_translate_address(np, reg); + if (taddr == OF_BAD_ADDR) + taddr = 0; + } else + taddr = 0; + + /* Add port, irq will be dealt with later */ + return add_legacy_port(np, index, UPIO_PORT, be32_to_cpu(reg[1]), + taddr, NO_IRQ, legacy_port_flags, 0); + +} + +#ifdef CONFIG_PCI +static int __init add_legacy_pci_port(struct device_node *np, + struct device_node *pci_dev) +{ + u64 addr, base; + const __be32 *addrp; + unsigned int flags; + int iotype, index = -1, lindex = 0; + + DBG(" -> add_legacy_pci_port(%s)\n", np->full_name); + + /* We only support ports that have a clock frequency properly + * encoded in the device-tree (that is have an fcode). 
Anything + * else can't be used that early and will be normally probed by + * the generic 8250_pci driver later on. The reason is that 8250 + * compatible UARTs on PCI need all sort of quirks (port offsets + * etc...) that this code doesn't know about + */ + if (of_get_property(np, "clock-frequency", NULL) == NULL) + return -1; + + /* Get the PCI address. Assume BAR 0 */ + addrp = of_get_pci_address(pci_dev, 0, NULL, &flags); + if (addrp == NULL) + return -1; + + /* We only support BAR 0 for now */ + iotype = (flags & IORESOURCE_MEM) ? UPIO_MEM : UPIO_PORT; + addr = of_translate_address(pci_dev, addrp); + if (addr == OF_BAD_ADDR) + return -1; + + /* Set the IO base to the same as the translated address for MMIO, + * or to the domain local IO base for PIO (it will be fixed up later) + */ + if (iotype == UPIO_MEM) + base = addr; + else + base = of_read_number(&addrp[2], 1); + + /* Try to guess an index... If we have subdevices of the pci dev, + * we get to their "reg" property + */ + if (np != pci_dev) { + const __be32 *reg = of_get_property(np, "reg", NULL); + if (reg && (be32_to_cpup(reg) < 4)) + index = lindex = be32_to_cpup(reg); + } + + /* Local index means it's the Nth port in the PCI chip. Unfortunately + * the offset to add here is device specific. We know about those + * EXAR ports and we default to the most common case. If your UART + * doesn't work for these settings, you'll have to add your own special + * cases here + */ + if (of_device_is_compatible(pci_dev, "pci13a8,152") || + of_device_is_compatible(pci_dev, "pci13a8,154") || + of_device_is_compatible(pci_dev, "pci13a8,158")) { + addr += 0x200 * lindex; + base += 0x200 * lindex; + } else { + addr += 8 * lindex; + base += 8 * lindex; + } + + /* Add port, irq will be dealt with later. We passed a translated + * IO port value. 
It will be fixed up later along with the irq + */ + return add_legacy_port(np, index, iotype, base, addr, NO_IRQ, + legacy_port_flags, np != pci_dev); +} +#endif + +static void __init setup_legacy_serial_console(int console) +{ + struct legacy_serial_info *info = &legacy_serial_infos[console]; + struct plat_serial8250_port *port = &legacy_serial_ports[console]; + void __iomem *addr; + unsigned int stride; + + stride = 1 << port->regshift; + + /* Check if a translated MMIO address has been found */ + if (info->taddr) { + addr = ioremap(info->taddr, 0x1000); + if (addr == NULL) + return; + udbg_uart_init_mmio(addr, stride); + } else { + /* Check if it's PIO and we support untranslated PIO */ + if (port->iotype == UPIO_PORT && isa_io_special) + udbg_uart_init_pio(port->iobase, stride); + else + return; + } + + /* Try to query the current speed */ + if (info->speed == 0) + info->speed = udbg_probe_uart_speed(info->clock); + + /* Set it up */ + DBG("default console speed = %d\n", info->speed); + udbg_uart_setup(info->speed, info->clock); +} + +/* + * This is called very early, as part of setup_system() or eventually + * setup_arch(), basically before anything else in this file. This function + * will try to build a list of all the available 8250-compatible serial ports + * in the machine using the Open Firmware device-tree. It currently only deals + * with ISA and PCI busses but could be extended. 
It allows a very early boot + * console to be initialized, that list is also used later to provide 8250 with + * the machine non-PCI ports and to properly pick the default console port + */ +void __init find_legacy_serial_ports(void) +{ + struct device_node *np, *stdout = NULL; + const char *path; + int index; + + DBG(" -> find_legacy_serial_port()\n"); + + /* Now find out if one of these is out firmware console */ + path = of_get_property(of_chosen, "linux,stdout-path", NULL); + if (path != NULL) { + stdout = of_find_node_by_path(path); + if (stdout) + DBG("stdout is %s\n", stdout->full_name); + } else { + DBG(" no linux,stdout-path !\n"); + } + + /* Iterate over all the 16550 ports, looking for known parents */ + for_each_compatible_node(np, "serial", "ns16550") { + struct device_node *parent = of_get_parent(np); + if (!parent) + continue; + if (of_match_node(legacy_serial_parents, parent) != NULL) { + if (of_device_is_available(np)) { + index = add_legacy_soc_port(np, np); + if (index >= 0 && np == stdout) + legacy_serial_console = index; + } + } + of_node_put(parent); + } + + /* Next, fill our array with ISA ports */ + for_each_node_by_type(np, "serial") { + struct device_node *isa = of_get_parent(np); + if (isa && (!strcmp(isa->name, "isa") || + !strcmp(isa->name, "lpc"))) { + if (of_device_is_available(np)) { + index = add_legacy_isa_port(np, isa); + if (index >= 0 && np == stdout) + legacy_serial_console = index; + } + } + of_node_put(isa); + } + +#ifdef CONFIG_PCI + /* Next, try to locate PCI ports */ + for (np = NULL; (np = of_find_all_nodes(np));) { + struct device_node *pci, *parent = of_get_parent(np); + if (parent && !strcmp(parent->name, "isa")) { + of_node_put(parent); + continue; + } + if (strcmp(np->name, "serial") && strcmp(np->type, "serial")) { + of_node_put(parent); + continue; + } + /* Check for known pciclass, and also check whether we have + * a device with child nodes for ports or not + */ + if (of_device_is_compatible(np, "pciclass,0700") 
|| + of_device_is_compatible(np, "pciclass,070002")) + pci = np; + else if (of_device_is_compatible(parent, "pciclass,0700") || + of_device_is_compatible(parent, "pciclass,070002")) + pci = parent; + else { + of_node_put(parent); + continue; + } + index = add_legacy_pci_port(np, pci); + if (index >= 0 && np == stdout) + legacy_serial_console = index; + of_node_put(parent); + } +#endif + + DBG("legacy_serial_console = %d\n", legacy_serial_console); + if (legacy_serial_console >= 0) + setup_legacy_serial_console(legacy_serial_console); + DBG(" <- find_legacy_serial_port()\n"); +} + +static struct platform_device serial_device = { + .name = "serial8250", + .id = PLAT8250_DEV_PLATFORM, + .dev = { + .platform_data = legacy_serial_ports, + }, +}; + +static void __init fixup_port_irq(int index, + struct device_node *np, + struct plat_serial8250_port *port) +{ + unsigned int virq; + + DBG("fixup_port_irq(%d)\n", index); + + virq = irq_of_parse_and_map(np, 0); + if (virq == NO_IRQ && legacy_serial_infos[index].irq_check_parent) { + np = of_get_parent(np); + if (np == NULL) + return; + virq = irq_of_parse_and_map(np, 0); + of_node_put(np); + } + if (virq == NO_IRQ) + return; + + port->irq = virq; + +#ifdef CONFIG_SERIAL_8250_FSL + if (of_device_is_compatible(np, "fsl,ns16550")) + port->handle_irq = fsl8250_handle_irq; +#endif +} + +static void __init fixup_port_pio(int index, + struct device_node *np, + struct plat_serial8250_port *port) +{ +#ifdef CONFIG_PCI + struct pci_controller *hose; + + DBG("fixup_port_pio(%d)\n", index); + + hose = pci_find_hose_for_OF_device(np); + if (hose) { + unsigned long offset = (unsigned long)hose->io_base_virt - +#ifdef CONFIG_PPC64 + pci_io_base; +#else + isa_io_base; +#endif + DBG("port %d, IO %lx -> %lx\n", + index, port->iobase, port->iobase + offset); + port->iobase += offset; + } +#endif +} + +static void __init fixup_port_mmio(int index, + struct device_node *np, + struct plat_serial8250_port *port) +{ + DBG("fixup_port_mmio(%d)\n", 
index); + + port->membase = ioremap(port->mapbase, 0x100); +} + +/* + * This is called as an arch initcall, hopefully before the PCI bus is + * probed and/or the 8250 driver loaded since we need to register our + * platform devices before 8250 PCI ones are detected as some of them + * must properly "override" the platform ones. + * + * This function fixes up the interrupt value for platform ports as it + * couldn't be done earlier before interrupt maps have been parsed. It + * also "corrects" the IO address for PIO ports for the same reason, + * since earlier, the PHBs virtual IO space wasn't assigned yet. It then + * registers all those platform ports for use by the 8250 driver when it + * finally loads. + */ +static int __init serial_dev_init(void) +{ + int i; + + if (legacy_serial_count == 0) + return -ENODEV; + + /* + * Before we register the platform serial devices, we need + * to fixup their interrupts and their IO ports. + */ + DBG("Fixing serial ports interrupts and IO ports ...\n"); + + for (i = 0; i < legacy_serial_count; i++) { + struct plat_serial8250_port *port = &legacy_serial_ports[i]; + struct device_node *np = legacy_serial_infos[i].np; + + if (port->irq == NO_IRQ) + fixup_port_irq(i, np, port); + if (port->iotype == UPIO_PORT) + fixup_port_pio(i, np, port); + if ((port->iotype == UPIO_MEM) || (port->iotype == UPIO_TSI)) + fixup_port_mmio(i, np, port); + } + + DBG("Registering platform serial ports\n"); + + return platform_device_register(&serial_device); +} +device_initcall(serial_dev_init); + + +#ifdef CONFIG_SERIAL_8250_CONSOLE +/* + * This is called very early, as part of console_init() (typically just after + * time_init()). This function is respondible for trying to find a good + * default console on serial ports. 
It tries to match the open firmware + * default output with one of the available serial console drivers that have + * been probed earlier by find_legacy_serial_ports() + */ +static int __init check_legacy_serial_console(void) +{ + struct device_node *prom_stdout = NULL; + int i, speed = 0, offset = 0; + const char *name; + const __be32 *spd; + + DBG(" -> check_legacy_serial_console()\n"); + + /* The user has requested a console so this is already set up. */ + if (strstr(boot_command_line, "console=")) { + DBG(" console was specified !\n"); + return -EBUSY; + } + + if (!of_chosen) { + DBG(" of_chosen is NULL !\n"); + return -ENODEV; + } + + if (legacy_serial_console < 0) { + DBG(" legacy_serial_console not found !\n"); + return -ENODEV; + } + /* We are getting a weird phandle from OF ... */ + /* ... So use the full path instead */ + name = of_get_property(of_chosen, "linux,stdout-path", NULL); + if (name == NULL) { + DBG(" no linux,stdout-path !\n"); + return -ENODEV; + } + prom_stdout = of_find_node_by_path(name); + if (!prom_stdout) { + DBG(" can't find stdout package %s !\n", name); + return -ENODEV; + } + DBG("stdout is %s\n", prom_stdout->full_name); + + name = of_get_property(prom_stdout, "name", NULL); + if (!name) { + DBG(" stdout package has no name !\n"); + goto not_found; + } + spd = of_get_property(prom_stdout, "current-speed", NULL); + if (spd) + speed = be32_to_cpup(spd); + + if (strcmp(name, "serial") != 0) + goto not_found; + + /* Look for it in probed array */ + for (i = 0; i < legacy_serial_count; i++) { + if (prom_stdout != legacy_serial_infos[i].np) + continue; + offset = i; + speed = legacy_serial_infos[i].speed; + break; + } + if (i >= legacy_serial_count) + goto not_found; + + of_node_put(prom_stdout); + + DBG("Found serial console at ttyS%d\n", offset); + + if (speed) { + static char __initdata opt[16]; + sprintf(opt, "%d", speed); + return add_preferred_console("ttyS", offset, opt); + } else + return add_preferred_console("ttyS", offset, 
NULL); + + not_found: + DBG("No preferred console found !\n"); + of_node_put(prom_stdout); + return -ENODEV; +} +console_initcall(check_legacy_serial_console); + +#endif /* CONFIG_SERIAL_8250_CONSOLE */ diff --git a/arch/powerpc/kernel/lparmap.c b/arch/powerpc/kernel/lparmap.c deleted file mode 100644 index b81de286df5..00000000000 --- a/arch/powerpc/kernel/lparmap.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (C) 2005 Stephen Rothwell IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include <asm/mmu.h> -#include <asm/page.h> -#include <asm/iSeries/LparMap.h> - -const struct LparMap __attribute__((__section__(".text"))) xLparMap = { - .xNumberEsids = HvEsidsToMap, - .xNumberRanges = HvRangesToMap, - .xSegmentTableOffs = STAB0_PAGE, - - .xEsids = { - { .xKernelEsid = GET_ESID(KERNELBASE), - .xKernelVsid = KERNEL_VSID(KERNELBASE), }, - { .xKernelEsid = GET_ESID(VMALLOCBASE), - .xKernelVsid = KERNEL_VSID(VMALLOCBASE), }, - }, - - .xRanges = { - { .xPages = HvPagesToMap, - .xOffset = 0, - .xVPN = KERNEL_VSID(KERNELBASE) << (SID_SHIFT - PAGE_SHIFT), - }, - }, -}; diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c new file mode 100644 index 00000000000..015ae55c186 --- /dev/null +++ b/arch/powerpc/kernel/machine_kexec.c @@ -0,0 +1,281 @@ +/* + * Code to handle transition of Linux booting another kernel. + * + * Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com> + * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz + * Copyright (C) 2005 IBM Corporation. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
+ */ + +#include <linux/kexec.h> +#include <linux/reboot.h> +#include <linux/threads.h> +#include <linux/memblock.h> +#include <linux/of.h> +#include <linux/irq.h> +#include <linux/ftrace.h> + +#include <asm/machdep.h> +#include <asm/pgalloc.h> +#include <asm/prom.h> +#include <asm/sections.h> + +void machine_kexec_mask_interrupts(void) { + unsigned int i; + struct irq_desc *desc; + + for_each_irq_desc(i, desc) { + struct irq_chip *chip; + + chip = irq_desc_get_chip(desc); + if (!chip) + continue; + + if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data)) + chip->irq_eoi(&desc->irq_data); + + if (chip->irq_mask) + chip->irq_mask(&desc->irq_data); + + if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data)) + chip->irq_disable(&desc->irq_data); + } +} + +void machine_crash_shutdown(struct pt_regs *regs) +{ + default_machine_crash_shutdown(regs); +} + +/* + * Do what every setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. + */ +int machine_kexec_prepare(struct kimage *image) +{ + if (ppc_md.machine_kexec_prepare) + return ppc_md.machine_kexec_prepare(image); + else + return default_machine_kexec_prepare(image); +} + +void machine_kexec_cleanup(struct kimage *image) +{ +} + +void arch_crash_save_vmcoreinfo(void) +{ + +#ifdef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif +#ifndef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) + VMCOREINFO_SYMBOL(vmemmap_list); + VMCOREINFO_SYMBOL(mmu_vmemmap_psize); + VMCOREINFO_SYMBOL(mmu_psize_defs); + VMCOREINFO_STRUCT_SIZE(vmemmap_backing); + VMCOREINFO_OFFSET(vmemmap_backing, list); + VMCOREINFO_OFFSET(vmemmap_backing, phys); + VMCOREINFO_OFFSET(vmemmap_backing, virt_addr); + VMCOREINFO_STRUCT_SIZE(mmu_psize_def); + VMCOREINFO_OFFSET(mmu_psize_def, shift); +#endif +} + +/* + * Do not allocate memory (or fail in any 
way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +void machine_kexec(struct kimage *image) +{ + int save_ftrace_enabled; + + save_ftrace_enabled = __ftrace_enabled_save(); + + if (ppc_md.machine_kexec) + ppc_md.machine_kexec(image); + else + default_machine_kexec(image); + + __ftrace_enabled_restore(save_ftrace_enabled); + + /* Fall back to normal restart if we're still alive. */ + machine_restart(NULL); + for(;;); +} + +void __init reserve_crashkernel(void) +{ + unsigned long long crash_size, crash_base; + int ret; + + /* use common parsing */ + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), + &crash_size, &crash_base); + if (ret == 0 && crash_size > 0) { + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + } + + if (crashk_res.end == crashk_res.start) { + crashk_res.start = crashk_res.end = 0; + return; + } + + /* We might have got these values via the command line or the + * device tree, either way sanitise them now. */ + + crash_size = resource_size(&crashk_res); + +#ifndef CONFIG_NONSTATIC_KERNEL + if (crashk_res.start != KDUMP_KERNELBASE) + printk("Crash kernel location must be 0x%x\n", + KDUMP_KERNELBASE); + + crashk_res.start = KDUMP_KERNELBASE; +#else + if (!crashk_res.start) { +#ifdef CONFIG_PPC64 + /* + * On 64bit we split the RMO in half but cap it at half of + * a small SLB (128MB) since the crash kernel needs to place + * itself and some stacks to be in the first segment. 
+ */ + crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2)); +#else + crashk_res.start = KDUMP_KERNELBASE; +#endif + } + + crash_base = PAGE_ALIGN(crashk_res.start); + if (crash_base != crashk_res.start) { + printk("Crash kernel base must be aligned to 0x%lx\n", + PAGE_SIZE); + crashk_res.start = crash_base; + } + +#endif + crash_size = PAGE_ALIGN(crash_size); + crashk_res.end = crashk_res.start + crash_size - 1; + + /* The crash region must not overlap the current kernel */ + if (overlaps_crashkernel(__pa(_stext), _end - _stext)) { + printk(KERN_WARNING + "Crash kernel can not overlap current kernel\n"); + crashk_res.start = crashk_res.end = 0; + return; + } + + /* Crash kernel trumps memory limit */ + if (memory_limit && memory_limit <= crashk_res.end) { + memory_limit = crashk_res.end + 1; + printk("Adjusted memory limit for crashkernel, now 0x%llx\n", + memory_limit); + } + + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crashk_res.start >> 20), + (unsigned long)(memblock_phys_mem_size() >> 20)); + + memblock_reserve(crashk_res.start, crash_size); +} + +int overlaps_crashkernel(unsigned long start, unsigned long size) +{ + return (start + size) > crashk_res.start && start <= crashk_res.end; +} + +/* Values we need to export to the second kernel via the device tree. 
*/ +static phys_addr_t kernel_end; +static phys_addr_t crashk_base; +static phys_addr_t crashk_size; +static unsigned long long mem_limit; + +static struct property kernel_end_prop = { + .name = "linux,kernel-end", + .length = sizeof(phys_addr_t), + .value = &kernel_end, +}; + +static struct property crashk_base_prop = { + .name = "linux,crashkernel-base", + .length = sizeof(phys_addr_t), + .value = &crashk_base +}; + +static struct property crashk_size_prop = { + .name = "linux,crashkernel-size", + .length = sizeof(phys_addr_t), + .value = &crashk_size, +}; + +static struct property memory_limit_prop = { + .name = "linux,memory-limit", + .length = sizeof(unsigned long long), + .value = &mem_limit, +}; + +#define cpu_to_be_ulong __PASTE(cpu_to_be, BITS_PER_LONG) + +/* + * Publish the crash kernel region and the memory limit as big-endian + * properties of @node. Any existing "linux,crashkernel-base" and + * "linux,crashkernel-size" properties are removed first and are only + * re-added when a crash region is set (crashk_res.start != 0); + * "linux,memory-limit" is always updated. + */ +static void __init export_crashk_values(struct device_node *node) +{ + struct property *prop; + + /* There might be existing crash kernel properties, but we can't + * be sure what's in them, so remove them. */ + prop = of_find_property(node, "linux,crashkernel-base", NULL); + if (prop) + of_remove_property(node, prop); + + prop = of_find_property(node, "linux,crashkernel-size", NULL); + if (prop) + of_remove_property(node, prop); + + if (crashk_res.start != 0) { + crashk_base = cpu_to_be_ulong(crashk_res.start); + of_add_property(node, &crashk_base_prop); + crashk_size = cpu_to_be_ulong(resource_size(&crashk_res)); + of_add_property(node, &crashk_size_prop); + } + + /* + * memory_limit is required by the kexec-tools to limit the + * crash regions to the actual memory used.
+ */ + mem_limit = cpu_to_be_ulong(memory_limit); + of_update_property(node, &memory_limit_prop); +} + +static int __init kexec_setup(void) +{ + struct device_node *node; + struct property *prop; + + node = of_find_node_by_path("/chosen"); + if (!node) + return -ENOENT; + + /* remove any stale properties so ours can be found */ + prop = of_find_property(node, kernel_end_prop.name, NULL); + if (prop) + of_remove_property(node, prop); + + /* information needed by userspace when using default_machine_kexec */ + kernel_end = cpu_to_be_ulong(__pa(_end)); + of_add_property(node, &kernel_end_prop); + + export_crashk_values(node); + + of_node_put(node); + return 0; +} +late_initcall(kexec_setup); diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c new file mode 100644 index 00000000000..affe5dcce7f --- /dev/null +++ b/arch/powerpc/kernel/machine_kexec_32.c @@ -0,0 +1,69 @@ +/* + * PPC32 code to handle Linux booting another kernel. + * + * Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com> + * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz + * Copyright (C) 2005 IBM Corporation. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/kexec.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <asm/cacheflush.h> +#include <asm/hw_irq.h> +#include <asm/io.h> + +typedef void (*relocate_new_kernel_t)( + unsigned long indirection_page, + unsigned long reboot_code_buffer, + unsigned long start_address) __noreturn; + +/* + * This is a generic machine_kexec function suitable at least for + * non-OpenFirmware embedded platforms. + * It merely copies the image relocation code to the control page and + * jumps to it. + * A platform specific function may just call this one. 
+ */ +void default_machine_kexec(struct kimage *image) +{ + extern const unsigned char relocate_new_kernel[]; + extern const unsigned int relocate_new_kernel_size; + unsigned long page_list; + unsigned long reboot_code_buffer, reboot_code_buffer_phys; + relocate_new_kernel_t rnk; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* mask each interrupt so we are in a more sane state for the + * kexec kernel */ + machine_kexec_mask_interrupts(); + + page_list = image->head; + + /* we need both effective and real address here */ + reboot_code_buffer = + (unsigned long)page_address(image->control_code_page); + reboot_code_buffer_phys = virt_to_phys((void *)reboot_code_buffer); + + /* copy our kernel relocation code to the control code page */ + memcpy((void *)reboot_code_buffer, relocate_new_kernel, + relocate_new_kernel_size); + + flush_icache_range(reboot_code_buffer, + reboot_code_buffer + KEXEC_CONTROL_PAGE_SIZE); + printk(KERN_INFO "Bye!\n"); + + /* now call it */ + rnk = (relocate_new_kernel_t) reboot_code_buffer; + (*rnk)(page_list, reboot_code_buffer_phys, image->start); +} + +int default_machine_kexec_prepare(struct kimage *image) +{ + return 0; +} diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c new file mode 100644 index 00000000000..879b3aacac3 --- /dev/null +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -0,0 +1,415 @@ +/* + * PPC64 code to handle Linux booting another kernel. + * + * Copyright (C) 2004-2005, IBM Corp. + * + * Created by: Milton D Miller II + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
+ */ + + +#include <linux/kexec.h> +#include <linux/smp.h> +#include <linux/thread_info.h> +#include <linux/init_task.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/cpu.h> +#include <linux/hardirq.h> + +#include <asm/page.h> +#include <asm/current.h> +#include <asm/machdep.h> +#include <asm/cacheflush.h> +#include <asm/paca.h> +#include <asm/mmu.h> +#include <asm/sections.h> /* _end */ +#include <asm/prom.h> +#include <asm/smp.h> +#include <asm/hw_breakpoint.h> + +/* + * Check that the segments of a loaded kexec image can be copied into place. + * + * Returns -ENOENT when ppc_md.hpte_clear_all is not provided, -ETXTBSY when + * a segment starts below the end of the running kernel (__pa(_end)) or + * overlaps the hash table or a "pci" node's "linux,tce-base" / + * "linux,tce-size" range, and 0 otherwise. + */ +int default_machine_kexec_prepare(struct kimage *image) +{ + int i; + unsigned long begin, end; /* limits of segment */ + unsigned long low, high; /* limits of blocked memory range */ + struct device_node *node; + const unsigned long *basep; + const unsigned int *sizep; + + if (!ppc_md.hpte_clear_all) + return -ENOENT; + + /* + * Since we use the kernel fault handlers and paging code to + * handle the virtual mode, we must make sure no destination + * overlaps kernel static data or bss. + */ + for (i = 0; i < image->nr_segments; i++) + if (image->segment[i].mem < __pa(_end)) + return -ETXTBSY; + + /* + * For non-LPAR, we absolutely can not overwrite the mmu hash + * table, since we are still using the bolted entries in it to + * do the copy. Check that here. + * + * It is safe if the end is below the start of the blocked + * region (end <= low), or if the beginning is after the + * end of the blocked region (begin >= high). Use the + * boolean identity !(a || b) === (!a && !b).
+ */ + if (htab_address) { + low = __pa(htab_address); + high = low + htab_size_bytes; + + for (i = 0; i < image->nr_segments; i++) { + begin = image->segment[i].mem; + end = begin + image->segment[i].memsz; + + if ((begin < high) && (end > low)) + return -ETXTBSY; + } + } + + /* We also should not overwrite the tce tables */ + for_each_node_by_type(node, "pci") { + basep = of_get_property(node, "linux,tce-base", NULL); + sizep = of_get_property(node, "linux,tce-size", NULL); + if (basep == NULL || sizep == NULL) + continue; + + low = *basep; + high = low + (*sizep); + + for (i = 0; i < image->nr_segments; i++) { + begin = image->segment[i].mem; + end = begin + image->segment[i].memsz; + + if ((begin < high) && (end > low)) { + /* leaving the iterator early: drop its node reference */ + of_node_put(node); + return -ETXTBSY; + } + } + } + + return 0; +} + +#define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE) + +static void copy_segments(unsigned long ind) +{ + unsigned long entry; + unsigned long *ptr; + void *dest; + void *addr; + + /* + * We rely on kexec_load to create a lists that properly + * initializes these pointers before they are used. + * We will still crash if the list is wrong, but at least + * the compiler will be quiet. + */ + ptr = NULL; + dest = NULL; + + for (entry = ind; !(entry & IND_DONE); entry = *ptr++) { + addr = __va(entry & PAGE_MASK); + + switch (entry & IND_FLAGS) { + case IND_DESTINATION: + dest = addr; + break; + case IND_INDIRECTION: + ptr = addr; + break; + case IND_SOURCE: + copy_page(dest, addr); + dest += PAGE_SIZE; + } + } +} + +void kexec_copy_flush(struct kimage *image) +{ + long i, nr_segments = image->nr_segments; + struct kexec_segment ranges[KEXEC_SEGMENT_MAX]; + + /* save the ranges on the stack to efficiently flush the icache */ + memcpy(ranges, image->segment, sizeof(ranges)); + + /* + * After this call we may not use anything allocated in dynamic + * memory, including *image. + * + * Only globals and the stack are allowed.
+ */ + copy_segments(image->head); + + /* + * we need to clear the icache for all dest pages sometime, + * including ones that were in place on the original copy + */ + for (i = 0; i < nr_segments; i++) + flush_icache_range((unsigned long)__va(ranges[i].mem), + (unsigned long)__va(ranges[i].mem + ranges[i].memsz)); +} + +#ifdef CONFIG_SMP + +static int kexec_all_irq_disabled = 0; + +static void kexec_smp_down(void *arg) +{ + local_irq_disable(); + hard_irq_disable(); + + mb(); /* make sure our irqs are disabled before we say they are */ + get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF; + while(kexec_all_irq_disabled == 0) + cpu_relax(); + mb(); /* make sure all irqs are disabled before this */ + hw_breakpoint_disable(); + /* + * Now every CPU has IRQs off, we can clear out any pending + * IPIs and be sure that no more will come in after this. + */ + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 1); + + kexec_smp_wait(); + /* NOTREACHED */ +} + +static void kexec_prepare_cpus_wait(int wait_state) +{ + int my_cpu, i, notified=-1; + + hw_breakpoint_disable(); + my_cpu = get_cpu(); + /* Make sure each CPU has at least made it to the state we need. + * + * FIXME: There is a (slim) chance of a problem if not all of the CPUs + * are correctly onlined. If somehow we start a CPU on boot with RTAS + * start-cpu, but somehow that CPU doesn't write callin_cpu_map[] in + * time, the boot CPU will timeout. If it does eventually execute + * stuff, the secondary will start up (paca[].cpu_start was written) and + * get into a peculiar state. If the platform supports + * smp_ops->take_timebase(), the secondary CPU will probably be spinning + * in there. If not (i.e. pseries), the secondary will continue on and + * try to online itself/idle/etc. If it survives that, we need to find + * these possible-but-not-online-but-should-be CPUs and chaperone them + * into kexec_smp_wait(). 
+ */ + for_each_online_cpu(i) { + if (i == my_cpu) + continue; + + while (paca[i].kexec_state < wait_state) { + barrier(); + if (i != notified) { + printk(KERN_INFO "kexec: waiting for cpu %d " + "(physical %d) to enter %i state\n", + i, paca[i].hw_cpu_id, wait_state); + notified = i; + } + } + } + mb(); +} + +/* + * We need to make sure each present CPU is online. The next kernel will scan + * the device tree and assume primary threads are online and query secondary + * threads via RTAS to online them if required. If we don't online primary + * threads, they will be stuck. However, we also online secondary threads as we + * may be using 'cede offline'. In this case RTAS doesn't see the secondary + * threads as offline -- and again, these CPUs will be stuck. + * + * So, we online all CPUs that should be running, including secondary threads. + */ +static void wake_offline_cpus(void) +{ + int cpu = 0; + + for_each_present_cpu(cpu) { + if (!cpu_online(cpu)) { + printk(KERN_INFO "kexec: Waking offline cpu %d.\n", + cpu); + WARN_ON(cpu_up(cpu)); + } + } +} + +static void kexec_prepare_cpus(void) +{ + wake_offline_cpus(); + smp_call_function(kexec_smp_down, NULL, /* wait */0); + local_irq_disable(); + hard_irq_disable(); + + mb(); /* make sure IRQs are disabled before we say they are */ + get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF; + + kexec_prepare_cpus_wait(KEXEC_STATE_IRQS_OFF); + /* we are sure every CPU has IRQs off at this point */ + kexec_all_irq_disabled = 1; + + /* after we tell the others to go down */ + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 0); + + /* + * Before removing MMU mappings make sure all CPUs have entered real + * mode: + */ + kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE); + + put_cpu(); +} + +#else /* ! SMP */ + +static void kexec_prepare_cpus(void) +{ + /* + * move the secondarys to us so that we can copy + * the new kernel 0-0x100 safely + * + * do this if kexec in setup.c ? 
+ * + * We need to release the cpus if we are ever going from an + * UP to an SMP kernel. + */ + smp_release_cpus(); + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 0); + local_irq_disable(); + hard_irq_disable(); +} + +#endif /* SMP */ + +/* + * kexec thread structure and stack. + * + * We need to make sure that this is 16384-byte aligned due to the + * way process stacks are handled. It also must be statically allocated + * or allocated as part of the kimage, because everything else may be + * overwritten when we copy the kexec image. We piggyback on the + * "init_task" linker section here to statically allocate a stack. + * + * We could use a smaller stack if we don't care about anything using + * current, but that audit has not been performed. + */ +static union thread_union kexec_stack __init_task_data = + { }; + +/* + * For similar reasons to the stack above, the kexecing CPU needs to be on a + * static PACA; we switch to kexec_paca. + */ +struct paca_struct kexec_paca; + +/* Our assembly helper, in misc_64.S */ +extern void kexec_sequence(void *newstack, unsigned long start, + void *image, void *control, + void (*clear_all)(void)) __noreturn; + +/* too late to fail here */ +void default_machine_kexec(struct kimage *image) +{ + /* prepare control code if any */ + + /* + * If the kexec boot is the normal one, need to shutdown other cpus + * into our wait loop and quiesce interrupts. + * Otherwise, in the case of crashed mode (crashing_cpu >= 0), + * stopping other CPUs and collecting their pt_regs is done before + * using debugger IPI. + */ + + if (crashing_cpu == -1) + kexec_prepare_cpus(); + + pr_debug("kexec: Starting switchover sequence.\n"); + + /* switch to a staticly allocated stack. Based on irq stack code. + * We setup preempt_count to avoid using VMX in memcpy. + * XXX: the task struct will likely be invalid once we do the copy! 
+ */ + kexec_stack.thread_info.task = current_thread_info()->task; + kexec_stack.thread_info.flags = 0; + kexec_stack.thread_info.preempt_count = HARDIRQ_OFFSET; + kexec_stack.thread_info.cpu = current_thread_info()->cpu; + + /* We need a static PACA, too; copy this CPU's PACA over and switch to + * it. Also poison per_cpu_offset to catch anyone using non-static + * data. + */ + memcpy(&kexec_paca, get_paca(), sizeof(struct paca_struct)); + kexec_paca.data_offset = 0xedeaddeadeeeeeeeUL; + paca = (struct paca_struct *)RELOC_HIDE(&kexec_paca, 0) - + kexec_paca.paca_index; + setup_paca(&kexec_paca); + + /* XXX: If anyone does 'dynamic lppacas' this will also need to be + * switched to a static version! + */ + + /* Some things are best done in assembly. Finding globals with + * a toc is easier in C, so pass in what we can. + */ + kexec_sequence(&kexec_stack, image->start, image, + page_address(image->control_code_page), + ppc_md.hpte_clear_all); + /* NOTREACHED */ +} + +/* Values we need to export to the second kernel via the device tree. 
*/ +static unsigned long htab_base; +static unsigned long htab_size; + +static struct property htab_base_prop = { + .name = "linux,htab-base", + .length = sizeof(unsigned long), + .value = &htab_base, +}; + +static struct property htab_size_prop = { + .name = "linux,htab-size", + .length = sizeof(unsigned long), + .value = &htab_size, +}; + +static int __init export_htab_values(void) +{ + struct device_node *node; + struct property *prop; + + /* On machines with no htab htab_address is NULL */ + if (!htab_address) + return -ENODEV; + + node = of_find_node_by_path("/chosen"); + if (!node) + return -ENODEV; + + /* remove any stale propertys so ours can be found */ + prop = of_find_property(node, htab_base_prop.name, NULL); + if (prop) + of_remove_property(node, prop); + prop = of_find_property(node, htab_size_prop.name, NULL); + if (prop) + of_remove_property(node, prop); + + htab_base = cpu_to_be64(__pa(htab_address)); + of_add_property(node, &htab_base_prop); + htab_size = cpu_to_be64(htab_size_bytes); + of_add_property(node, &htab_size_prop); + + of_node_put(node); + return 0; +} +late_initcall(export_htab_values); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c new file mode 100644 index 00000000000..a7fd4cb78b7 --- /dev/null +++ b/arch/powerpc/kernel/mce.c @@ -0,0 +1,352 @@ +/* + * Machine check exception handling. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright 2013 IBM Corporation + * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> + */ + +#undef DEBUG +#define pr_fmt(fmt) "mce: " fmt + +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/percpu.h> +#include <linux/export.h> +#include <linux/irq_work.h> +#include <asm/mce.h> + +static DEFINE_PER_CPU(int, mce_nest_count); +static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); + +/* Queue for delayed MCE events. */ +static DEFINE_PER_CPU(int, mce_queue_count); +static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); + +static void machine_check_process_queued_event(struct irq_work *work); +struct irq_work mce_event_process_work = { + .func = machine_check_process_queued_event, +}; + +static void mce_set_error_info(struct machine_check_event *mce, + struct mce_error_info *mce_err) +{ + mce->error_type = mce_err->error_type; + switch (mce_err->error_type) { + case MCE_ERROR_TYPE_UE: + mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type; + break; + case MCE_ERROR_TYPE_SLB: + mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type; + break; + case MCE_ERROR_TYPE_ERAT: + mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type; + break; + case MCE_ERROR_TYPE_TLB: + mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type; + break; + case MCE_ERROR_TYPE_UNKNOWN: + default: + break; + } +} + +/* + * Decode and save high level MCE information into per cpu buffer which + * is an array of machine_check_event structure. 
+ */ +void save_mce_event(struct pt_regs *regs, long handled, + struct mce_error_info *mce_err, + uint64_t nip, uint64_t addr) +{ + int index = __get_cpu_var(mce_nest_count)++; + struct machine_check_event *mce; + + /* + * Return if we don't have enough space to log mce event. + * mce_nest_count may go beyond MAX_MC_EVT but that's ok, + * the check below will stop buffer overrun. + */ + if (index >= MAX_MC_EVT) + return; + + /* Only form the slot pointer once index is known to be in range. */ + mce = &__get_cpu_var(mce_event[index]); + + /* Populate generic machine check info */ + mce->version = MCE_V1; + mce->srr0 = nip; + mce->srr1 = regs->msr; + mce->gpr3 = regs->gpr[3]; + mce->in_use = 1; + + mce->initiator = MCE_INITIATOR_CPU; + if (handled) + mce->disposition = MCE_DISPOSITION_RECOVERED; + else + mce->disposition = MCE_DISPOSITION_NOT_RECOVERED; + mce->severity = MCE_SEV_ERROR_SYNC; + + /* + * Populate the mce error_type and type-specific error_type. + */ + mce_set_error_info(mce, mce_err); + + /* Without an address there is nothing more to record. */ + if (!addr) + return; + + if (mce->error_type == MCE_ERROR_TYPE_TLB) { + mce->u.tlb_error.effective_address_provided = true; + mce->u.tlb_error.effective_address = addr; + } else if (mce->error_type == MCE_ERROR_TYPE_SLB) { + mce->u.slb_error.effective_address_provided = true; + mce->u.slb_error.effective_address = addr; + } else if (mce->error_type == MCE_ERROR_TYPE_ERAT) { + mce->u.erat_error.effective_address_provided = true; + mce->u.erat_error.effective_address = addr; + } else if (mce->error_type == MCE_ERROR_TYPE_UE) { + mce->u.ue_error.effective_address_provided = true; + mce->u.ue_error.effective_address = addr; + } +} + +/* + * get_mce_event: + * mce Pointer to machine_check_event structure to be filled. + * release Flag to indicate whether to free the event slot or not. + * 0 <= do not release the mce event. Caller will invoke + * release_mce_event() once event has been consumed. + * 1 <= release the slot.
+ * + * return 1 = success + * 0 = failure + * + * get_mce_event() will be called by platform specific machine check + * handler routine and in KVM. + * When we call get_mce_event(), we are still in interrupt context and + * preemption will not be scheduled until ret_from_except() routine + * is called. + */ +int get_mce_event(struct machine_check_event *mce, bool release) +{ + int index = __get_cpu_var(mce_nest_count) - 1; + struct machine_check_event *mc_evt; + int ret = 0; + + /* Sanity check */ + if (index < 0) + return ret; + + /* Check if we have MCE info to process. */ + if (index < MAX_MC_EVT) { + mc_evt = &__get_cpu_var(mce_event[index]); + /* Copy the event structure and release the original */ + if (mce) + *mce = *mc_evt; + if (release) + mc_evt->in_use = 0; + ret = 1; + } + /* Decrement the count to free the slot. */ + if (release) + __get_cpu_var(mce_nest_count)--; + + return ret; +} + +void release_mce_event(void) +{ + get_mce_event(NULL, true); +} + +/* + * Queue up the MCE event which then can be handled later. + */ +void machine_check_queue_event(void) +{ + int index; + struct machine_check_event evt; + + if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) + return; + + index = __get_cpu_var(mce_queue_count)++; + /* If queue is full, just return for now. */ + if (index >= MAX_MC_EVT) { + __get_cpu_var(mce_queue_count)--; + return; + } + __get_cpu_var(mce_event_queue[index]) = evt; + + /* Queue irq work to process this event later. */ + irq_work_queue(&mce_event_process_work); +} + +/* + * process pending MCE event from the mce event queue. This function will be + * called during syscall exit. + */ +static void machine_check_process_queued_event(struct irq_work *work) +{ + int index; + + /* + * For now just print it to console. + * TODO: log this error event to FSP or nvram.
+ */ + while (__get_cpu_var(mce_queue_count) > 0) { + index = __get_cpu_var(mce_queue_count) - 1; + machine_check_print_event_info( + &__get_cpu_var(mce_event_queue[index])); + __get_cpu_var(mce_queue_count)--; + } +} + +/* + * Print a decoded machine check event: severity and disposition, the + * initiator, then the error type with its subtype and any address the + * event carries. An event whose version is not MCE_V1 is reported as + * unknown and nothing else is printed for it. + */ +void machine_check_print_event_info(struct machine_check_event *evt) +{ + const char *level, *sevstr, *subtype; + static const char *mc_ue_types[] = { + "Indeterminate", + "Instruction fetch", + "Page table walk ifetch", + "Load/Store", + "Page table walk Load/Store", + }; + static const char *mc_slb_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + static const char *mc_erat_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + static const char *mc_tlb_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + + /* Print things out */ + if (evt->version != MCE_V1) { + pr_err("Machine Check Exception, Unknown event version %d !\n", + evt->version); + return; + } + switch (evt->severity) { + case MCE_SEV_NO_ERROR: + level = KERN_INFO; + sevstr = "Harmless"; + break; + case MCE_SEV_WARNING: + level = KERN_WARNING; + sevstr = ""; + break; + case MCE_SEV_ERROR_SYNC: + level = KERN_ERR; + sevstr = "Severe"; + break; + case MCE_SEV_FATAL: + default: + level = KERN_ERR; + sevstr = "Fatal"; + break; + } + + printk("%s%s Machine check interrupt [%s]\n", level, sevstr, + evt->disposition == MCE_DISPOSITION_RECOVERED ? + "Recovered" : "Not recovered"); + printk("%s Initiator: %s\n", level, + evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown"); + switch (evt->error_type) { + case MCE_ERROR_TYPE_UE: + subtype = evt->u.ue_error.ue_error_type < + ARRAY_SIZE(mc_ue_types) ?
+ mc_ue_types[evt->u.ue_error.ue_error_type] + : "Unknown"; + printk("%s Error type: UE [%s]\n", level, subtype); + if (evt->u.ue_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt->u.ue_error.effective_address); + if (evt->u.ue_error.physical_address_provided) + printk("%s Physical address: %016llx\n", + level, evt->u.ue_error.physical_address); + break; + case MCE_ERROR_TYPE_SLB: + subtype = evt->u.slb_error.slb_error_type < + ARRAY_SIZE(mc_slb_types) ? + mc_slb_types[evt->u.slb_error.slb_error_type] + : "Unknown"; + printk("%s Error type: SLB [%s]\n", level, subtype); + if (evt->u.slb_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt->u.slb_error.effective_address); + break; + case MCE_ERROR_TYPE_ERAT: + subtype = evt->u.erat_error.erat_error_type < + ARRAY_SIZE(mc_erat_types) ? + mc_erat_types[evt->u.erat_error.erat_error_type] + : "Unknown"; + printk("%s Error type: ERAT [%s]\n", level, subtype); + if (evt->u.erat_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt->u.erat_error.effective_address); + break; + case MCE_ERROR_TYPE_TLB: + subtype = evt->u.tlb_error.tlb_error_type < + ARRAY_SIZE(mc_tlb_types) ?
+ mc_tlb_types[evt->u.tlb_error.tlb_error_type] + : "Unknown"; + printk("%s Error type: TLB [%s]\n", level, subtype); + if (evt->u.tlb_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt->u.tlb_error.effective_address); + break; + default: + case MCE_ERROR_TYPE_UNKNOWN: + printk("%s Error type: Unknown\n", level); + break; + } +} + +uint64_t get_mce_fault_addr(struct machine_check_event *evt) +{ + switch (evt->error_type) { + case MCE_ERROR_TYPE_UE: + if (evt->u.ue_error.effective_address_provided) + return evt->u.ue_error.effective_address; + break; + case MCE_ERROR_TYPE_SLB: + if (evt->u.slb_error.effective_address_provided) + return evt->u.slb_error.effective_address; + break; + case MCE_ERROR_TYPE_ERAT: + if (evt->u.erat_error.effective_address_provided) + return evt->u.erat_error.effective_address; + break; + case MCE_ERROR_TYPE_TLB: + if (evt->u.tlb_error.effective_address_provided) + return evt->u.tlb_error.effective_address; + break; + default: + case MCE_ERROR_TYPE_UNKNOWN: + break; + } + return 0; +} +EXPORT_SYMBOL(get_mce_fault_addr); diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c new file mode 100644 index 00000000000..aa9aff3d6ad --- /dev/null +++ b/arch/powerpc/kernel/mce_power.c @@ -0,0 +1,313 @@ +/* + * Machine check exception handling CPU-side for power7 and power8 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "mce_power: " fmt
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <asm/mmu.h>
+#include <asm/mce.h>
+#include <asm/machdep.h>
+
+/*
+ * flush_and_reload_slb - invalidate the SLB, then refill it from the shadow.
+ *
+ * Step 1: issue "slbmte 0,0; slbia" to invalidate the SLB entries.
+ * Step 2: with CONFIG_KVM_BOOK3S_HANDLER, stop here when the paca reports
+ *         kvm_hstate.in_guest, i.e. only flush and continue.
+ * Step 3: otherwise reload entries 0..n-1 from the shadow SLB save area,
+ *         where n is the shadow "persistent" count capped at SLB_MIN_SIZE.
+ *         Nothing is reloaded if get_slb_shadow() returns NULL.
+ *
+ * The shadow fields are converted with be32_to_cpu()/be64_to_cpu() before
+ * use.
+ */
+static void flush_and_reload_slb(void)
+{
+	struct slb_shadow *slb;
+	unsigned long i, n;
+
+	/* Invalidate all SLBs */
+	asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+
+#ifdef CONFIG_KVM_BOOK3S_HANDLER
+	/*
+	 * If machine check is hit when in guest or in transition, we will
+	 * only flush the SLBs and continue.
+	 */
+	if (get_paca()->kvm_hstate.in_guest)
+		return;
+#endif
+
+	/* For host kernel, reload the SLBs from shadow SLB buffer. */
+	slb = get_slb_shadow();
+	if (!slb)
+		return;
+
+	n = min_t(u32, be32_to_cpu(slb->persistent), SLB_MIN_SIZE);
+
+	/* Load up the SLB entries from shadow SLB */
+	for (i = 0; i < n; i++) {
+		unsigned long rb = be64_to_cpu(slb->save_area[i].esid);
+		unsigned long rs = be64_to_cpu(slb->save_area[i].vsid);
+
+		rb = (rb & ~0xFFFul) | i;	/* replace the low 12 bits with index i */
+		asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
+	}
+}
+/*
+ * mce_handle_derror - try to recover from a load/store machine check.
+ * @dsisr: DSISR value describing the error.
+ * @slb_error_bits: mask of the DSISR bits that denote SLB errors (the p7 and
+ *                  p8 wrappers pass different masks).
+ *
+ * Each error class that is dealt with has its bits cleared from the local
+ * copy of @dsisr. Returns 1 when no bit is left set in the low 32 bits,
+ * 0 when something unrecognised remains.
+ *
+ * NOTE(review): the TLB multihit bit is cleared even when no
+ * cur_cpu_spec->flush_tlb hook is available, so that case is reported as
+ * handled without a flush. mce_handle_common_ierror() only reports handled
+ * when the hook exists; confirm whether the difference is intended.
+ */
+static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
+{
+	long handled = 1;
+
+	/*
+	 * flush and reload SLBs for SLB errors and flush TLBs for TLB errors.
+	 * reset the error bits whenever we handle them so that at the end
+	 * we can check whether we handled all of them or not.
+	 * */
+	if (dsisr & slb_error_bits) {
+		flush_and_reload_slb();
+		/* reset error bits */
+		dsisr &= ~(slb_error_bits);
+	}
+	if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
+		if (cur_cpu_spec && cur_cpu_spec->flush_tlb)
+			cur_cpu_spec->flush_tlb(TLBIEL_INVAL_PAGE);
+		/* reset error bits */
+		dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB;
+	}
+	/* Any other errors we don't understand? */
+	if (dsisr & 0xffffffffUL)
+		handled = 0;
+
+	return handled;
+}
+/*
+ * mce_handle_derror_p7 - p7 load/store recovery: mce_handle_derror() with
+ * P7_DSISR_MC_SLB_ERRORS as the SLB error mask.
+ */
+static long mce_handle_derror_p7(uint64_t dsisr)
+{
+	return mce_handle_derror(dsisr, P7_DSISR_MC_SLB_ERRORS);
+}
+/*
+ * mce_handle_common_ierror - recovery for the instruction fetch error codes
+ * used by both the p7 and the p8 handler.
+ * @srr1: SRR1 value; the error code is extracted with P7_SRR1_MC_IFETCH().
+ *
+ * SLB parity and SLB multihit: flush and reload the SLB.
+ * TLB multihit: flush the TLB through cur_cpu_spec->flush_tlb, if present.
+ * Returns 1 when a recovery action was taken, 0 otherwise (including code 0
+ * and every code not listed here).
+ */
+static long mce_handle_common_ierror(uint64_t srr1)
+{
+	long handled = 0;
+
+	switch (P7_SRR1_MC_IFETCH(srr1)) {
+	case 0:
+		break;
+	case P7_SRR1_MC_IFETCH_SLB_PARITY:
+	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
+		/* flush and reload SLBs for SLB errors. */
+		flush_and_reload_slb();
+		handled = 1;
+		break;
+	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
+		if (cur_cpu_spec && cur_cpu_spec->flush_tlb) {
+			cur_cpu_spec->flush_tlb(TLBIEL_INVAL_PAGE);
+			handled = 1;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return handled;
+}
+/*
+ * mce_handle_ierror_p7 - p7 instruction fetch recovery.
+ *
+ * Runs the common handling, then additionally treats
+ * P7_SRR1_MC_IFETCH_SLB_BOTH by flushing and reloading the SLB.
+ * Returns 1 if handled, 0 otherwise.
+ */
+static long mce_handle_ierror_p7(uint64_t srr1)
+{
+	long handled = 0;
+
+	handled = mce_handle_common_ierror(srr1);
+
+	if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
+		flush_and_reload_slb();
+		handled = 1;
+	}
+	return handled;
+}
+/*
+ * mce_get_common_ierror - classify an instruction fetch error.
+ * @mce_err: filled with error_type and the matching type-specific sub-type.
+ * @srr1: SRR1 value; the error code is extracted with P7_SRR1_MC_IFETCH().
+ *
+ * Codes that are not listed leave *mce_err untouched; both callers in this
+ * file pass a zero-initialised structure.
+ */
+static void mce_get_common_ierror(struct mce_error_info *mce_err, uint64_t srr1)
+{
+	switch (P7_SRR1_MC_IFETCH(srr1)) {
+	case P7_SRR1_MC_IFETCH_SLB_PARITY:
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
+		break;
+	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
+		break;
+	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
+		mce_err->error_type = MCE_ERROR_TYPE_TLB;
+		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
+		break;
+	case P7_SRR1_MC_IFETCH_UE:
+	case P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL:
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH;
+		break;
+	case P7_SRR1_MC_IFETCH_UE_TLB_RELOAD:
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type =
+				MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
+		break;
+	}
+}
+/*
+ * mce_get_ierror_p7 - p7 instruction fetch classification: the common codes
+ * plus P7_SRR1_MC_IFETCH_SLB_BOTH, reported as an indeterminate SLB error.
+ */
+static void mce_get_ierror_p7(struct mce_error_info *mce_err, uint64_t srr1)
+{
+	mce_get_common_ierror(mce_err, srr1);
+	if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
+	}
+}
+/*
+ * mce_get_derror_p7 - classify a load/store error from DSISR.
+ *
+ * The bits are tested in the fixed order written below and only the first
+ * one that is set is reported, even if several are set. If none matches,
+ * *mce_err is left untouched.
+ */
+static void mce_get_derror_p7(struct mce_error_info *mce_err, uint64_t dsisr)
+{
+	if (dsisr & P7_DSISR_MC_UE) {
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
+	} else if (dsisr & P7_DSISR_MC_UE_TABLEWALK) {
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type =
+				MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
+	} else if (dsisr & P7_DSISR_MC_ERAT_MULTIHIT) {
+		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+	} else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
+	} else if (dsisr & P7_DSISR_MC_SLB_PARITY_MFSLB) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
+	} else if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
+		mce_err->error_type = MCE_ERROR_TYPE_TLB;
+		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
+	} else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT_PARITY) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
+	}
+}
+/*
+ * mce_handle_ue_error - ask the platform whether a UE can be recovered.
+ * @regs: register state, passed through to the platform hook.
+ *
+ * Returns 1 only when ppc_md.mce_check_early_recovery is set and returns
+ * non-zero for @regs; returns 0 otherwise.
+ */
+static long mce_handle_ue_error(struct pt_regs *regs)
+{
+	long handled = 0;
+
+	/*
+	 * On specific SCOM read via MMIO we may get a machine check
+	 * exception with SRR0 pointing inside opal. If that is the
+	 * case OPAL may have recovery address to re-read SCOM data in
+	 * different way and hence we can recover from this MC.
+	 */
+
+	if (ppc_md.mce_check_early_recovery) {
+		if (ppc_md.mce_check_early_recovery(regs))
+			handled = 1;
+	}
+	return handled;
+}
+/*
+ * __machine_check_early_realmode_p7 - p7 machine check handler entry point.
+ * @regs: register state at the exception; regs->msr is used as SRR1.
+ *
+ * Chooses the load/store path (DSISR, fault address regs->dar) or the
+ * instruction fetch path (SRR1, fault address regs->nip), attempts recovery
+ * and classifies the error. For UE errors the result of
+ * mce_handle_ue_error() replaces the earlier recovery result. The event is
+ * then recorded with save_mce_event().
+ *
+ * Returns the final handled value (1 = handled, 0 = not handled).
+ */
+long __machine_check_early_realmode_p7(struct pt_regs *regs)
+{
+	uint64_t srr1, nip, addr;
+	long handled = 1;
+	struct mce_error_info mce_error_info = { 0 };
+
+	srr1 = regs->msr;
+	nip = regs->nip;
+
+	/*
+	 * Handle memory errors depending whether this was a load/store or
+	 * ifetch exception. Also, populate the mce error_type and
+	 * type-specific error_type from either SRR1 or DSISR, depending
+	 * whether this was a load/store or ifetch exception
+	 */
+	if (P7_SRR1_MC_LOADSTORE(srr1)) {
+		handled = mce_handle_derror_p7(regs->dsisr);
+		mce_get_derror_p7(&mce_error_info, regs->dsisr);
+		addr = regs->dar;
+	} else {
+		handled = mce_handle_ierror_p7(srr1);
+		mce_get_ierror_p7(&mce_error_info, srr1);
+		addr = regs->nip;
+	}
+
+	/* Handle UE error. */
+	if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
+		handled = mce_handle_ue_error(regs);
+
+	save_mce_event(regs, handled, &mce_error_info, nip, addr);
+	return handled;
+}
+/*
+ * mce_get_ierror_p8 - p8 instruction fetch classification: the common codes
+ * plus P8_SRR1_MC_IFETCH_ERAT_MULTIHIT, reported as an ERAT multihit.
+ */
+static void mce_get_ierror_p8(struct mce_error_info *mce_err, uint64_t srr1)
+{
+	mce_get_common_ierror(mce_err, srr1);
+	if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
+		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+	}
+}
+/*
+ * mce_get_derror_p8 - p8 load/store classification.
+ *
+ * Starts from the p7 classification, then overrides it with an ERAT
+ * multihit whenever P8_DSISR_MC_ERAT_MULTIHIT_SEC is set, regardless of
+ * what the p7 pass reported.
+ */
+static void mce_get_derror_p8(struct mce_error_info *mce_err, uint64_t dsisr)
+{
+	mce_get_derror_p7(mce_err, dsisr);
+	if (dsisr & P8_DSISR_MC_ERAT_MULTIHIT_SEC) {
+		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+	}
+}
+/*
+ * mce_handle_ierror_p8 - p8 instruction fetch recovery.
+ *
+ * Runs the common handling, then additionally treats
+ * P8_SRR1_MC_IFETCH_ERAT_MULTIHIT by flushing and reloading the SLB and
+ * reporting the error as handled. Returns 1 if handled, 0 otherwise.
+ */
+static long mce_handle_ierror_p8(uint64_t srr1)
+{
+	long handled = 0;
+
+	handled = mce_handle_common_ierror(srr1);
+
+	if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
+		flush_and_reload_slb();
+		handled = 1;
+	}
+	return handled;
+}
+/*
+ * mce_handle_derror_p8 - p8 load/store recovery: mce_handle_derror() with
+ * P8_DSISR_MC_SLB_ERRORS as the SLB error mask.
+ */
+static long mce_handle_derror_p8(uint64_t dsisr)
+{
+	return mce_handle_derror(dsisr, P8_DSISR_MC_SLB_ERRORS);
+}
+/*
+ * __machine_check_early_realmode_p8 - p8 machine check handler entry point.
+ * @regs: register state at the exception; regs->msr is used as SRR1.
+ *
+ * Same sequence as the p7 entry point, using the p8 recovery and
+ * classification helpers: pick the load/store or instruction fetch path,
+ * attempt recovery, let mce_handle_ue_error() decide for UE errors, and
+ * record the event with save_mce_event().
+ *
+ * Returns the final handled value (1 = handled, 0 = not handled).
+ */
+long __machine_check_early_realmode_p8(struct pt_regs *regs)
+{
+	uint64_t srr1, nip, addr;
+	long handled = 1;
+	struct mce_error_info mce_error_info = { 0 };
+
+	srr1 = regs->msr;
+	nip = regs->nip;
+
+	if (P7_SRR1_MC_LOADSTORE(srr1)) {
+		handled = mce_handle_derror_p8(regs->dsisr);
+		mce_get_derror_p8(&mce_error_info, regs->dsisr);
+		addr = regs->dar;
+	} else {
+		handled = mce_handle_ierror_p8(srr1);
+		mce_get_ierror_p8(&mce_error_info, srr1);
+		addr = regs->nip;
+	}
+
+	/* Handle UE error. */
+	if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
+		handled = mce_handle_ue_error(regs);
+
+	save_mce_event(regs, handled, &mce_error_info, nip, addr);
+	return handled;
+}
diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
new file mode 100644
index 00000000000..7ce26d45777
--- /dev/null
+++ b/arch/powerpc/kernel/misc.S
@@ -0,0 +1,116 @@
+/*
+ * This file contains miscellaneous low-level functions.
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Largely rewritten by Cort Dougan (cort@cs.nmt.edu)
+ * and Paul Mackerras.
+ *
+ * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com)
+ *
+ * setjmp/longjmp code by Paul Mackerras.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/ppc_asm.h>
+#include <asm/unistd.h>
+#include <asm/asm-compat.h>
+#include <asm/asm-offsets.h>
+
+	.text
+
+/*
+ * Returns (address we are running at) - (address we were linked at)
+ * for use before the text and data are mapped to KERNELBASE.
+ *
+ * "bl 1f" leaves the run-time address of label 1 in LR. The word stored at
+ * label 2 (PPC_LONG 1b) supplies the link-time address of the same label;
+ * it is loaded relative to the run-time address, so no absolute address is
+ * needed. The difference is returned in r3. The caller's LR is kept in r0
+ * and restored before returning.
+ * NOTE(review): PPC_LL and PPC_LONG are defined outside this file;
+ * presumably the native-word load and data directive - confirm.
+ */
+
+_GLOBAL(reloc_offset)
+	mflr	r0
+	bl	1f
+1:	mflr	r3
+	PPC_LL	r4,(2f-1b)(r3)
+	subf	r3,r4,r3
+	mtlr	r0
+	blr
+
+	.align	3
+2:	PPC_LONG 1b
+
+/*
+ * add_reloc_offset(x) returns x + reloc_offset().
+ */
+_GLOBAL(add_reloc_offset)
+	mflr	r0			/* keep the caller's LR in r0 */
+	bl	1f			/* LR = run-time address of label 1 */
+1:	mflr	r5			/* r5 is used so the argument in r3 survives */
+	PPC_LL	r4,(2f-1b)(r5)		/* r4 = word stored at label 2 */
+	subf	r5,r4,r5		/* r5 = r5 - r4, the relocation offset */
+	add	r3,r3,r5		/* r3 = x + offset */
+	mtlr	r0
+	blr
+
+	.align	3
+2:	PPC_LONG 1b
+/*
+ * setjmp: r3 points to the save buffer.
+ *
+ * Stores 23 slots of SZL bytes each: slot 0 = LR, slot 1 = r1, slot 2 = r2,
+ * slot 3 = CR, slots 4..22 = r13..r31. Returns 0 in r3.
+ */
+_GLOBAL(setjmp)
+	mflr	r0
+	PPC_STL	r0,0(r3)
+	PPC_STL	r1,SZL(r3)
+	PPC_STL	r2,2*SZL(r3)
+	mfcr	r0
+	PPC_STL	r0,3*SZL(r3)
+	PPC_STL	r13,4*SZL(r3)
+	PPC_STL	r14,5*SZL(r3)
+	PPC_STL	r15,6*SZL(r3)
+	PPC_STL	r16,7*SZL(r3)
+	PPC_STL	r17,8*SZL(r3)
+	PPC_STL	r18,9*SZL(r3)
+	PPC_STL	r19,10*SZL(r3)
+	PPC_STL	r20,11*SZL(r3)
+	PPC_STL	r21,12*SZL(r3)
+	PPC_STL	r22,13*SZL(r3)
+	PPC_STL	r23,14*SZL(r3)
+	PPC_STL	r24,15*SZL(r3)
+	PPC_STL	r25,16*SZL(r3)
+	PPC_STL	r26,17*SZL(r3)
+	PPC_STL	r27,18*SZL(r3)
+	PPC_STL	r28,19*SZL(r3)
+	PPC_STL	r29,20*SZL(r3)
+	PPC_STL	r30,21*SZL(r3)
+	PPC_STL	r31,22*SZL(r3)
+	li	r3,0
+	blr
+/*
+ * longjmp: r3 points to a buffer filled by setjmp, r4 is the value to return.
+ *
+ * A return value of 0 is replaced by 1. Restores r13..r31, then the CR
+ * fields selected by mask 0x38 (cr2, cr3 and cr4), then LR, r1 and r2 from
+ * the buffer, and returns r4 in r3 at the saved LR. r3 is read as the
+ * buffer pointer until the last load, so it is overwritten only at the end.
+ */
+_GLOBAL(longjmp)
+	PPC_LCMPI r4,0
+	bne	1f
+	li	r4,1
+1:	PPC_LL	r13,4*SZL(r3)
+	PPC_LL	r14,5*SZL(r3)
+	PPC_LL	r15,6*SZL(r3)
+	PPC_LL	r16,7*SZL(r3)
+	PPC_LL	r17,8*SZL(r3)
+	PPC_LL	r18,9*SZL(r3)
+	PPC_LL	r19,10*SZL(r3)
+	PPC_LL	r20,11*SZL(r3)
+	PPC_LL	r21,12*SZL(r3)
+	PPC_LL	r22,13*SZL(r3)
+	PPC_LL	r23,14*SZL(r3)
+	PPC_LL	r24,15*SZL(r3)
+	PPC_LL	r25,16*SZL(r3)
+	PPC_LL	r26,17*SZL(r3)
+	PPC_LL	r27,18*SZL(r3)
+	PPC_LL	r28,19*SZL(r3)
+	PPC_LL	r29,20*SZL(r3)
+	PPC_LL	r30,21*SZL(r3)
+	PPC_LL	r31,22*SZL(r3)
+	PPC_LL	r0,3*SZL(r3)
+	mtcrf	0x38,r0
+	PPC_LL	r0,0(r3)
+	PPC_LL	r1,SZL(r3)
+	PPC_LL	r2,2*SZL(r3)
+	mtlr	r0
+	mr	r3,r4
+	blr
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 3bedb532aed..7c6bb4b17b4 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -5,6 +5,12 @@
  * Largely rewritten by Cort Dougan (cort@cs.nmt.edu)
  * and Paul Mackerras.
  *
+ * kexec bits:
+ * Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com>
+ * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz
+ * PPC44x port.
Copyright (C) 2011, IBM Corporation + * Author: Suzuki Poulose <suzuki@in.ibm.com> + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -12,7 +18,6 @@ * */ -#include <linux/config.h> #include <linux/sys.h> #include <asm/unistd.h> #include <asm/errno.h> @@ -24,15 +29,52 @@ #include <asm/ppc_asm.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> +#include <asm/processor.h> +#include <asm/kexec.h> +#include <asm/bug.h> +#include <asm/ptrace.h> .text - .align 5 -_GLOBAL(__delay) - cmpwi 0,r3,0 - mtctr r3 - beqlr -1: bdnz 1b +/* + * We store the saved ksp_limit in the unused part + * of the STACK_FRAME_OVERHEAD + */ +_GLOBAL(call_do_softirq) + mflr r0 + stw r0,4(r1) + lwz r10,THREAD+KSP_LIMIT(r2) + addi r11,r3,THREAD_INFO_GAP + stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3) + mr r1,r3 + stw r10,8(r1) + stw r11,THREAD+KSP_LIMIT(r2) + bl __do_softirq + lwz r10,8(r1) + lwz r1,0(r1) + lwz r0,4(r1) + stw r10,THREAD+KSP_LIMIT(r2) + mtlr r0 + blr + +/* + * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp); + */ +_GLOBAL(call_do_irq) + mflr r0 + stw r0,4(r1) + lwz r10,THREAD+KSP_LIMIT(r2) + addi r11,r4,THREAD_INFO_GAP + stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4) + mr r1,r4 + stw r10,8(r1) + stw r11,THREAD+KSP_LIMIT(r2) + bl __do_irq + lwz r10,8(r1) + lwz r1,0(r1) + lwz r0,4(r1) + stw r10,THREAD+KSP_LIMIT(r2) + mtlr r0 blr /* @@ -63,32 +105,6 @@ _GLOBAL(mulhdu) blr /* - * Returns (address we're running at) - (address we were linked at) - * for use before the text and data are mapped to KERNELBASE. - */ -_GLOBAL(reloc_offset) - mflr r0 - bl 1f -1: mflr r3 - LOADADDR(r4,1b) - subf r3,r4,r3 - mtlr r0 - blr - -/* - * add_reloc_offset(x) returns x + reloc_offset(). 
- */ -_GLOBAL(add_reloc_offset) - mflr r0 - bl 1f -1: mflr r5 - LOADADDR(r4,1b) - subf r5,r4,r5 - add r3,r3,r5 - mtlr r0 - blr - -/* * sub_reloc_offset(x) returns x - reloc_offset(). */ _GLOBAL(sub_reloc_offset) @@ -131,80 +147,6 @@ _GLOBAL(reloc_got2) blr /* - * identify_cpu, - * called with r3 = data offset and r4 = CPU number - * doesn't change r3 - */ -_GLOBAL(identify_cpu) - addis r8,r3,cpu_specs@ha - addi r8,r8,cpu_specs@l - mfpvr r7 -1: - lwz r5,CPU_SPEC_PVR_MASK(r8) - and r5,r5,r7 - lwz r6,CPU_SPEC_PVR_VALUE(r8) - cmplw 0,r6,r5 - beq 1f - addi r8,r8,CPU_SPEC_ENTRY_SIZE - b 1b -1: - addis r6,r3,cur_cpu_spec@ha - addi r6,r6,cur_cpu_spec@l - sub r8,r8,r3 - stw r8,0(r6) - blr - -/* - * do_cpu_ftr_fixups - goes through the list of CPU feature fixups - * and writes nop's over sections of code that don't apply for this cpu. - * r3 = data offset (not changed) - */ -_GLOBAL(do_cpu_ftr_fixups) - /* Get CPU 0 features */ - addis r6,r3,cur_cpu_spec@ha - addi r6,r6,cur_cpu_spec@l - lwz r4,0(r6) - add r4,r4,r3 - lwz r4,CPU_SPEC_FEATURES(r4) - - /* Get the fixup table */ - addis r6,r3,__start___ftr_fixup@ha - addi r6,r6,__start___ftr_fixup@l - addis r7,r3,__stop___ftr_fixup@ha - addi r7,r7,__stop___ftr_fixup@l - - /* Do the fixup */ -1: cmplw 0,r6,r7 - bgelr - addi r6,r6,16 - lwz r8,-16(r6) /* mask */ - and r8,r8,r4 - lwz r9,-12(r6) /* value */ - cmplw 0,r8,r9 - beq 1b - lwz r8,-8(r6) /* section begin */ - lwz r9,-4(r6) /* section end */ - subf. r9,r8,r9 - beq 1b - /* write nops over the section of code */ - /* todo: if large section, add a branch at the start of it */ - srwi r9,r9,2 - mtctr r9 - add r8,r8,r3 - lis r0,0x60000000@h /* nop */ -3: stw r0,0(r8) - andi. 
r10,r4,CPU_FTR_SPLIT_ID_CACHE@l - beq 2f - dcbst 0,r8 /* suboptimal, but simpler */ - sync - icbi 0,r8 -2: addi r8,r8,4 - bdnz 3b - sync /* additional sync needed on g4 */ - isync - b 1b - -/* * call_setup_cpu - call the setup_cpu function for this cpu * r3 = data offset, r24 = cpu number * @@ -218,7 +160,7 @@ _GLOBAL(call_setup_cpu) lwz r4,0(r4) add r4,r4,r3 lwz r5,CPU_SPEC_SETUP(r4) - cmpi 0,r5,0 + cmpwi 0,r5,0 add r5,r5,r3 beqlr mtctr r5 @@ -255,7 +197,7 @@ _GLOBAL(low_choose_750fx_pll) mtspr SPRN_HID1,r4 /* Store new HID1 image */ - rlwinm r6,r1,0,0,18 + CURRENT_THREAD_INFO(r6, r1) lwz r6,TI_CPU(r6) slwi r6,r6,2 addis r6,r6,nap_save_hid1@ha @@ -309,163 +251,47 @@ _GLOBAL(_nmask_and_or_msr) isync blr /* Done */ +#ifdef CONFIG_40x /* - * Flush MMU TLB + * Do an IO access in real mode */ -_GLOBAL(_tlbia) -#if defined(CONFIG_40x) - sync /* Flush to memory before changing mapping */ - tlbia - isync /* Flush shadow TLB */ -#elif defined(CONFIG_44x) - li r3,0 +_GLOBAL(real_readb) + mfmsr r7 + ori r0,r7,MSR_DR + xori r0,r0,MSR_DR sync - - /* Load high watermark */ - lis r4,tlb_44x_hwater@ha - lwz r5,tlb_44x_hwater@l(r4) - -1: tlbwe r3,r3,PPC44x_TLB_PAGEID - addi r3,r3,1 - cmpw 0,r3,r5 - ble 1b - - isync -#elif defined(CONFIG_FSL_BOOKE) - /* Invalidate all entries in TLB0 */ - li r3, 0x04 - tlbivax 0,3 - /* Invalidate all entries in TLB1 */ - li r3, 0x0c - tlbivax 0,3 - /* Invalidate all entries in TLB2 */ - li r3, 0x14 - tlbivax 0,3 - /* Invalidate all entries in TLB3 */ - li r3, 0x1c - tlbivax 0,3 - msync -#ifdef CONFIG_SMP - tlbsync -#endif /* CONFIG_SMP */ -#else /* !(CONFIG_40x || CONFIG_44x || CONFIG_FSL_BOOKE) */ -#if defined(CONFIG_SMP) - rlwinm r8,r1,0,0,18 - lwz r8,TI_CPU(r8) - oris r8,r8,10 - mfmsr r10 - SYNC - rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ - rlwinm r0,r0,0,28,26 /* clear DR */ mtmsr r0 - SYNC_601 - isync - lis r9,mmu_hash_lock@h - ori r9,r9,mmu_hash_lock@l - tophys(r9,r9) -10: lwarx r7,0,r9 - cmpwi 0,r7,0 - bne- 10b - stwcx. 
r8,0,r9 - bne- 10b - sync - tlbia sync - TLBSYNC - li r0,0 - stw r0,0(r9) /* clear mmu_hash_lock */ - mtmsr r10 - SYNC_601 isync -#else /* CONFIG_SMP */ + lbz r3,0(r3) sync - tlbia + mtmsr r7 sync -#endif /* CONFIG_SMP */ -#endif /* ! defined(CONFIG_40x) */ + isync blr -/* - * Flush MMU TLB for a particular address + /* + * Do an IO access in real mode */ -_GLOBAL(_tlbie) -#if defined(CONFIG_40x) - tlbsx. r3, 0, r3 - bne 10f - sync - /* There are only 64 TLB entries, so r3 < 64, which means bit 25 is clear. - * Since 25 is the V bit in the TLB_TAG, loading this value will invalidate - * the TLB entry. */ - tlbwe r3, r3, TLB_TAG - isync -10: -#elif defined(CONFIG_44x) - mfspr r4,SPRN_MMUCR - mfspr r5,SPRN_PID /* Get PID */ - rlwimi r4,r5,0,24,31 /* Set TID */ - mtspr SPRN_MMUCR,r4 - - tlbsx. r3, 0, r3 - bne 10f +_GLOBAL(real_writeb) + mfmsr r7 + ori r0,r7,MSR_DR + xori r0,r0,MSR_DR sync - /* There are only 64 TLB entries, so r3 < 64, - * which means bit 22, is clear. Since 22 is - * the V bit in the TLB_PAGEID, loading this - * value will invalidate the TLB entry. - */ - tlbwe r3, r3, PPC44x_TLB_PAGEID - isync -10: -#elif defined(CONFIG_FSL_BOOKE) - rlwinm r4, r3, 0, 0, 19 - ori r5, r4, 0x08 /* TLBSEL = 1 */ - ori r6, r4, 0x10 /* TLBSEL = 2 */ - ori r7, r4, 0x18 /* TLBSEL = 3 */ - tlbivax 0, r4 - tlbivax 0, r5 - tlbivax 0, r6 - tlbivax 0, r7 - msync -#if defined(CONFIG_SMP) - tlbsync -#endif /* CONFIG_SMP */ -#else /* !(CONFIG_40x || CONFIG_44x || CONFIG_FSL_BOOKE) */ -#if defined(CONFIG_SMP) - rlwinm r8,r1,0,0,18 - lwz r8,TI_CPU(r8) - oris r8,r8,11 - mfmsr r10 - SYNC - rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ - rlwinm r0,r0,0,28,26 /* clear DR */ mtmsr r0 - SYNC_601 - isync - lis r9,mmu_hash_lock@h - ori r9,r9,mmu_hash_lock@l - tophys(r9,r9) -10: lwarx r7,0,r9 - cmpwi 0,r7,0 - bne- 10b - stwcx. 
r8,0,r9 - bne- 10b - eieio - tlbie r3 sync - TLBSYNC - li r0,0 - stw r0,0(r9) /* clear mmu_hash_lock */ - mtmsr r10 - SYNC_601 isync -#else /* CONFIG_SMP */ - tlbie r3 + stb r3,0(r4) sync -#endif /* CONFIG_SMP */ -#endif /* ! CONFIG_40x */ + mtmsr r7 + sync + isync blr +#endif /* CONFIG_40x */ + + /* * Flush instruction cache. * This is a no-op on the 601. @@ -495,7 +321,7 @@ BEGIN_FTR_SECTION mtspr SPRN_L1CSR0,r3 isync blr -END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE) +END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE) mfspr r3,SPRN_L1CSR1 ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR mtspr SPRN_L1CSR1,r3 @@ -519,10 +345,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE) * * flush_icache_range(unsigned long start, unsigned long stop) */ -_GLOBAL(flush_icache_range) +_KPROBE(flush_icache_range) BEGIN_FTR_SECTION + PURGE_PREFETCHED_INS blr /* for 601, do nothing */ -END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE) +END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) li r5,L1_CACHE_BYTES-1 andc r3,r3,r5 subf r4,r3,r4 @@ -535,10 +362,17 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE) addi r3,r3,L1_CACHE_BYTES bdnz 1b sync /* wait for dcbst's to get to ram */ +#ifndef CONFIG_44x mtctr r4 2: icbi 0,r6 addi r6,r6,L1_CACHE_BYTES bdnz 2b +#else + /* Flash invalidate on 44x because we are passed kmapped addresses and + this doesn't work for userspace pages due to the virtually tagged + icache. Sigh. */ + iccci 0, r0 +#endif sync /* additional sync needed on g4 */ isync blr @@ -607,27 +441,6 @@ _GLOBAL(invalidate_dcache_range) sync /* wait for dcbi's to get to ram */ blr -#ifdef CONFIG_NOT_COHERENT_CACHE -/* - * 40x cores have 8K or 16K dcache and 32 byte line size. - * 44x has a 32K dcache and 32 byte line size. - * 8xx has 1, 2, 4, 8K variants. - * For now, cover the worst case of the 44x. - * Must be called with external interrupts disabled. 
- */ -#define CACHE_NWAYS 64 -#define CACHE_NLINES 16 - -_GLOBAL(flush_dcache_all) - li r4, (2 * CACHE_NWAYS * CACHE_NLINES) - mtctr r4 - lis r5, KERNELBASE@h -1: lwz r3, 0(r5) /* Load one word from every line */ - addi r5, r5, L1_CACHE_BYTES - bdnz 1b - blr -#endif /* CONFIG_NOT_COHERENT_CACHE */ - /* * Flush a particular page from the data cache to RAM. * Note: this is necessary because the instruction cache does *not* @@ -638,16 +451,29 @@ _GLOBAL(flush_dcache_all) */ _GLOBAL(__flush_dcache_icache) BEGIN_FTR_SECTION - blr /* for 601, do nothing */ -END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE) - rlwinm r3,r3,0,0,19 /* Get page base address */ - li r4,4096/L1_CACHE_BYTES /* Number of lines in a page */ + PURGE_PREFETCHED_INS + blr +END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) + rlwinm r3,r3,0,0,31-PAGE_SHIFT /* Get page base address */ + li r4,PAGE_SIZE/L1_CACHE_BYTES /* Number of lines in a page */ mtctr r4 mr r6,r3 0: dcbst 0,r3 /* Write line to ram */ addi r3,r3,L1_CACHE_BYTES bdnz 0b sync +#ifdef CONFIG_44x + /* We don't flush the icache on 44x. Those have a virtual icache + * and we don't have access to the virtual address here (it's + * not the page vaddr but where it's mapped in user space). The + * flushing of the icache on these is handled elsewhere, when + * a change in the address space occurs, before returning to + * user space + */ +BEGIN_MMU_FTR_SECTION + blr +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x) +#endif /* CONFIG_44x */ mtctr r4 1: icbi 0,r6 addi r6,r6,L1_CACHE_BYTES @@ -656,6 +482,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE) isync blr +#ifndef CONFIG_BOOKE /* * Flush a particular page from the data cache to RAM, identified * by its physical address. 
We turn off the MMU so we can just use @@ -666,14 +493,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE) */ _GLOBAL(__flush_dcache_icache_phys) BEGIN_FTR_SECTION + PURGE_PREFETCHED_INS blr /* for 601, do nothing */ -END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE) +END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) mfmsr r10 rlwinm r0,r10,0,28,26 /* clear DR */ mtmsr r0 isync - rlwinm r3,r3,0,0,19 /* Get page base address */ - li r4,4096/L1_CACHE_BYTES /* Number of lines in a page */ + rlwinm r3,r3,0,0,31-PAGE_SHIFT /* Get page base address */ + li r4,PAGE_SIZE/L1_CACHE_BYTES /* Number of lines in a page */ mtctr r4 mr r6,r3 0: dcbst 0,r3 /* Write line to ram */ @@ -688,6 +516,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE) mtmsr r10 /* restore DR */ isync blr +#endif /* CONFIG_BOOKE */ /* * Clear pages using the dcbz instruction, which doesn't cause any @@ -697,18 +526,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE) * void clear_pages(void *page, int order) ; */ _GLOBAL(clear_pages) - li r0,4096/L1_CACHE_BYTES + li r0,PAGE_SIZE/L1_CACHE_BYTES slw r0,r0,r4 mtctr r0 -#ifdef CONFIG_8xx - li r4, 0 -1: stw r4, 0(r3) - stw r4, 4(r3) - stw r4, 8(r3) - stw r4, 12(r3) -#else 1: dcbz 0,r3 -#endif addi r3,r3,L1_CACHE_BYTES bdnz 1b blr @@ -733,15 +554,6 @@ _GLOBAL(copy_page) addi r3,r3,-4 addi r4,r4,-4 -#ifdef CONFIG_8xx - /* don't use prefetch on 8xx */ - li r0,4096/L1_CACHE_BYTES - mtctr r0 -1: COPY_16_BYTES - bdnz 1b - blr - -#else /* not 8xx, we can prefetch */ li r5,4 #if MAX_COPY_PREFETCH > 1 @@ -755,7 +567,7 @@ _GLOBAL(copy_page) dcbt r5,r4 li r11,L1_CACHE_BYTES+4 #endif /* MAX_COPY_PREFETCH */ - li r0,4096/L1_CACHE_BYTES - MAX_COPY_PREFETCH + li r0,PAGE_SIZE/L1_CACHE_BYTES - MAX_COPY_PREFETCH crclr 4*cr0+eq 2: mtctr r0 @@ -782,7 +594,6 @@ _GLOBAL(copy_page) li r0,MAX_COPY_PREFETCH li r11,4 b 2b -#endif /* CONFIG_8xx */ /* * void atomic_clear_mask(atomic_t mask, atomic_t *addr) @@ -804,136 +615,6 @@ _GLOBAL(atomic_set_mask) blr /* - * I/O string operations - * - * 
insb(port, buf, len) - * outsb(port, buf, len) - * insw(port, buf, len) - * outsw(port, buf, len) - * insl(port, buf, len) - * outsl(port, buf, len) - * insw_ns(port, buf, len) - * outsw_ns(port, buf, len) - * insl_ns(port, buf, len) - * outsl_ns(port, buf, len) - * - * The *_ns versions don't do byte-swapping. - */ -_GLOBAL(_insb) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,1 - blelr- -00: lbz r5,0(r3) - eieio - stbu r5,1(r4) - bdnz 00b - blr - -_GLOBAL(_outsb) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,1 - blelr- -00: lbzu r5,1(r4) - stb r5,0(r3) - eieio - bdnz 00b - blr - -_GLOBAL(_insw) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,2 - blelr- -00: lhbrx r5,0,r3 - eieio - sthu r5,2(r4) - bdnz 00b - blr - -_GLOBAL(_outsw) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,2 - blelr- -00: lhzu r5,2(r4) - eieio - sthbrx r5,0,r3 - bdnz 00b - blr - -_GLOBAL(_insl) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,4 - blelr- -00: lwbrx r5,0,r3 - eieio - stwu r5,4(r4) - bdnz 00b - blr - -_GLOBAL(_outsl) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,4 - blelr- -00: lwzu r5,4(r4) - stwbrx r5,0,r3 - eieio - bdnz 00b - blr - -_GLOBAL(__ide_mm_insw) -_GLOBAL(_insw_ns) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,2 - blelr- -00: lhz r5,0(r3) - eieio - sthu r5,2(r4) - bdnz 00b - blr - -_GLOBAL(__ide_mm_outsw) -_GLOBAL(_outsw_ns) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,2 - blelr- -00: lhzu r5,2(r4) - sth r5,0(r3) - eieio - bdnz 00b - blr - -_GLOBAL(__ide_mm_insl) -_GLOBAL(_insl_ns) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,4 - blelr- -00: lwz r5,0(r3) - eieio - stwu r5,4(r4) - bdnz 00b - blr - -_GLOBAL(__ide_mm_outsl) -_GLOBAL(_outsl_ns) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,4 - blelr- -00: lwzu r5,4(r4) - stw r5,0(r3) - eieio - bdnz 00b - blr - -/* * Extended precision shifts. * * Updated to be valid for shift counts from 0 to 63 inclusive. @@ -982,56 +663,548 @@ _GLOBAL(__lshrdi3) or r4,r4,r7 # LSW |= t2 blr +/* + * 64-bit comparison: __cmpdi2(s64 a, s64 b) + * Returns 0 if a < b, 1 if a == b, 2 if a > b. 
+ */ +_GLOBAL(__cmpdi2) + cmpw r3,r5 + li r3,1 + bne 1f + cmplw r4,r6 + beqlr +1: li r3,0 + bltlr + li r3,2 + blr +/* + * 64-bit comparison: __ucmpdi2(u64 a, u64 b) + * Returns 0 if a < b, 1 if a == b, 2 if a > b. + */ +_GLOBAL(__ucmpdi2) + cmplw r3,r5 + li r3,1 + bne 1f + cmplw r4,r6 + beqlr +1: li r3,0 + bltlr + li r3,2 + blr + +_GLOBAL(__bswapdi2) + rotlwi r9,r4,8 + rotlwi r10,r3,8 + rlwimi r9,r4,24,0,7 + rlwimi r10,r3,24,0,7 + rlwimi r9,r4,24,16,23 + rlwimi r10,r3,24,16,23 + mr r3,r9 + mr r4,r10 + blr + _GLOBAL(abs) srawi r4,r3,31 xor r3,r3,r4 sub r3,r3,r4 blr -_GLOBAL(_get_SP) - mr r3,r1 /* Close enough */ - blr - +#ifdef CONFIG_SMP +_GLOBAL(start_secondary_resume) + /* Reset stack */ + CURRENT_THREAD_INFO(r1, r1) + addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD + li r3,0 + stw r3,0(r1) /* Zero the stack frame pointer */ + bl start_secondary + b . +#endif /* CONFIG_SMP */ + /* - * Create a kernel thread - * kernel_thread(fn, arg, flags) + * This routine is just here to keep GCC happy - sigh... */ -_GLOBAL(kernel_thread) - stwu r1,-16(r1) - stw r30,8(r1) - stw r31,12(r1) - mr r30,r3 /* function */ - mr r31,r4 /* argument */ - ori r3,r5,CLONE_VM /* flags */ - oris r3,r3,CLONE_UNTRACED>>16 - li r4,0 /* new sp (unused) */ - li r0,__NR_clone - sc - cmpwi 0,r3,0 /* parent or child? */ - bne 1f /* return if parent */ - li r0,0 /* make top-level stack frame */ - stwu r0,-16(r1) - mtlr r30 /* fn addr in lr */ - mr r3,r31 /* load arg and call fn */ - PPC440EP_ERR42 - blrl - li r0,__NR_exit /* exit if function returns */ - li r3,0 - sc -1: lwz r30,8(r1) - lwz r31,12(r1) - addi r1,r1,16 +_GLOBAL(__main) blr -_GLOBAL(execve) - li r0,__NR_execve - sc - bnslr - neg r3,r3 - blr +#ifdef CONFIG_KEXEC + /* + * Must be relocatable PIC code callable as a C function. 
+ */ + .globl relocate_new_kernel +relocate_new_kernel: + /* r3 = page_list */ + /* r4 = reboot_code_buffer */ + /* r5 = start_address */ + +#ifdef CONFIG_FSL_BOOKE + + mr r29, r3 + mr r30, r4 + mr r31, r5 + +#define ENTRY_MAPPING_KEXEC_SETUP +#include "fsl_booke_entry_mapping.S" +#undef ENTRY_MAPPING_KEXEC_SETUP + + mr r3, r29 + mr r4, r30 + mr r5, r31 + li r0, 0 +#elif defined(CONFIG_44x) + + /* Save our parameters */ + mr r29, r3 + mr r30, r4 + mr r31, r5 + +#ifdef CONFIG_PPC_47x + /* Check for 47x cores */ + mfspr r3,SPRN_PVR + srwi r3,r3,16 + cmplwi cr0,r3,PVR_476@h + beq setup_map_47x + cmplwi cr0,r3,PVR_476_ISS@h + beq setup_map_47x +#endif /* CONFIG_PPC_47x */ + /* - * This routine is just here to keep GCC happy - sigh... + * Code for setting up 1:1 mapping for PPC440x for KEXEC + * + * We cannot switch off the MMU on PPC44x. + * So we: + * 1) Invalidate all the mappings except the one we are running from. + * 2) Create a tmp mapping for our code in the other address space(TS) and + * jump to it. Invalidate the entry we started in. + * 3) Create a 1:1 mapping for 0-2GiB in chunks of 256M in original TS. + * 4) Jump to the 1:1 mapping in original TS. + * 5) Invalidate the tmp mapping. + * + * - Based on the kexec support code for FSL BookE + * */ -_GLOBAL(__main) - blr + + /* + * Load the PID with kernel PID (0). + * Also load our MSR_IS and TID to MMUCR for TLB search. + */ + li r3, 0 + mtspr SPRN_PID, r3 + mfmsr r4 + andi. r4,r4,MSR_IS@l + beq wmmucr + oris r3,r3,PPC44x_MMUCR_STS@h +wmmucr: + mtspr SPRN_MMUCR,r3 + sync + + /* + * Invalidate all the TLB entries except the current entry + * where we are running from + */ + bl 0f /* Find our address */ +0: mflr r5 /* Make it accessible */ + tlbsx r23,0,r5 /* Find entry we are in */ + li r4,0 /* Start at TLB entry 0 */ + li r3,0 /* Set PAGEID inval value */ +1: cmpw r23,r4 /* Is this our entry? 
*/ + beq skip /* If so, skip the inval */ + tlbwe r3,r4,PPC44x_TLB_PAGEID /* If not, inval the entry */ +skip: + addi r4,r4,1 /* Increment */ + cmpwi r4,64 /* Are we done? */ + bne 1b /* If not, repeat */ + isync + + /* Create a temp mapping and jump to it */ + andi. r6, r23, 1 /* Find the index to use */ + addi r24, r6, 1 /* r24 will contain 1 or 2 */ + + mfmsr r9 /* get the MSR */ + rlwinm r5, r9, 27, 31, 31 /* Extract the MSR[IS] */ + xori r7, r5, 1 /* Use the other address space */ + + /* Read the current mapping entries */ + tlbre r3, r23, PPC44x_TLB_PAGEID + tlbre r4, r23, PPC44x_TLB_XLAT + tlbre r5, r23, PPC44x_TLB_ATTRIB + + /* Save our current XLAT entry */ + mr r25, r4 + + /* Extract the TLB PageSize */ + li r10, 1 /* r10 will hold PageSize */ + rlwinm r11, r3, 0, 24, 27 /* bits 24-27 */ + + /* XXX: As of now we use 256M, 4K pages */ + cmpwi r11, PPC44x_TLB_256M + bne tlb_4k + rotlwi r10, r10, 28 /* r10 = 256M */ + b write_out +tlb_4k: + cmpwi r11, PPC44x_TLB_4K + bne default + rotlwi r10, r10, 12 /* r10 = 4K */ + b write_out +default: + rotlwi r10, r10, 10 /* r10 = 1K */ + +write_out: + /* + * Write out the tmp 1:1 mapping for this code in other address space + * Fixup EPN = RPN , TS=other address space + */ + insrwi r3, r7, 1, 23 /* Bit 23 is TS for PAGEID field */ + + /* Write out the tmp mapping entries */ + tlbwe r3, r24, PPC44x_TLB_PAGEID + tlbwe r4, r24, PPC44x_TLB_XLAT + tlbwe r5, r24, PPC44x_TLB_ATTRIB + + subi r11, r10, 1 /* PageOffset Mask = PageSize - 1 */ + not r10, r11 /* Mask for PageNum */ + + /* Switch to other address space in MSR */ + insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */ + + bl 1f +1: mflr r8 + addi r8, r8, (2f-1b) /* Find the target offset */ + + /* Jump to the tmp mapping */ + mtspr SPRN_SRR0, r8 + mtspr SPRN_SRR1, r9 + rfi + +2: + /* Invalidate the entry we were executing from */ + li r3, 0 + tlbwe r3, r23, PPC44x_TLB_PAGEID + + /* attribute fields. 
rwx for SUPERVISOR mode */ + li r5, 0 + ori r5, r5, (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G) + + /* Create 1:1 mapping in 256M pages */ + xori r7, r7, 1 /* Revert back to Original TS */ + + li r8, 0 /* PageNumber */ + li r6, 3 /* TLB Index, start at 3 */ + +next_tlb: + rotlwi r3, r8, 28 /* Create EPN (bits 0-3) */ + mr r4, r3 /* RPN = EPN */ + ori r3, r3, (PPC44x_TLB_VALID | PPC44x_TLB_256M) /* SIZE = 256M, Valid */ + insrwi r3, r7, 1, 23 /* Set TS from r7 */ + + tlbwe r3, r6, PPC44x_TLB_PAGEID /* PageID field : EPN, V, SIZE */ + tlbwe r4, r6, PPC44x_TLB_XLAT /* Address translation : RPN */ + tlbwe r5, r6, PPC44x_TLB_ATTRIB /* Attributes */ + + addi r8, r8, 1 /* Increment PN */ + addi r6, r6, 1 /* Increment TLB Index */ + cmpwi r8, 8 /* Are we done ? */ + bne next_tlb + isync + + /* Jump to the new mapping 1:1 */ + li r9,0 + insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */ + + bl 1f +1: mflr r8 + and r8, r8, r11 /* Get our offset within page */ + addi r8, r8, (2f-1b) + + and r5, r25, r10 /* Get our target PageNum */ + or r8, r8, r5 /* Target jump address */ + + mtspr SPRN_SRR0, r8 + mtspr SPRN_SRR1, r9 + rfi +2: + /* Invalidate the tmp entry we used */ + li r3, 0 + tlbwe r3, r24, PPC44x_TLB_PAGEID + sync + b ppc44x_map_done + +#ifdef CONFIG_PPC_47x + + /* 1:1 mapping for 47x */ + +setup_map_47x: + + /* + * Load the kernel pid (0) to PID and also to MMUCR[TID]. + * Also set the MSR IS->MMUCR STS + */ + li r3, 0 + mtspr SPRN_PID, r3 /* Set PID */ + mfmsr r4 /* Get MSR */ + andi. r4, r4, MSR_IS@l /* TS=1? */ + beq 1f /* If not, leave STS=0 */ + oris r3, r3, PPC47x_MMUCR_STS@h /* Set STS=1 */ +1: mtspr SPRN_MMUCR, r3 /* Put MMUCR */ + sync + + /* Find the entry we are running from */ + bl 2f +2: mflr r23 + tlbsx r23, 0, r23 + tlbre r24, r23, 0 /* TLB Word 0 */ + tlbre r25, r23, 1 /* TLB Word 1 */ + tlbre r26, r23, 2 /* TLB Word 2 */ + + + /* + * Invalidates all the tlb entries by writing to 256 RPNs(r4) + * of 4k page size in all 4 ways (0-3 in r3). 
+ * This would invalidate the entire UTLB including the one we are + * running from. However the shadow TLB entries would help us + * to continue the execution, until we flush them (rfi/isync). + */ + addis r3, 0, 0x8000 /* specify the way */ + addi r4, 0, 0 /* TLB Word0 = (EPN=0, VALID = 0) */ + addi r5, 0, 0 + b clear_utlb_entry + + /* Align the loop to speed things up. from head_44x.S */ + .align 6 + +clear_utlb_entry: + + tlbwe r4, r3, 0 + tlbwe r5, r3, 1 + tlbwe r5, r3, 2 + addis r3, r3, 0x2000 /* Increment the way */ + cmpwi r3, 0 + bne clear_utlb_entry + addis r3, 0, 0x8000 + addis r4, r4, 0x100 /* Increment the EPN */ + cmpwi r4, 0 + bne clear_utlb_entry + + /* Create the entries in the other address space */ + mfmsr r5 + rlwinm r7, r5, 27, 31, 31 /* Get the TS (Bit 26) from MSR */ + xori r7, r7, 1 /* r7 = !TS */ + + insrwi r24, r7, 1, 21 /* Change the TS in the saved TLB word 0 */ + + /* + * write out the TLB entries for the tmp mapping + * Use way '0' so that we could easily invalidate it later. + */ + lis r3, 0x8000 /* Way '0' */ + + tlbwe r24, r3, 0 + tlbwe r25, r3, 1 + tlbwe r26, r3, 2 + + /* Update the msr to the new TS */ + insrwi r5, r7, 1, 26 + + bl 1f +1: mflr r6 + addi r6, r6, (2f-1b) + + mtspr SPRN_SRR0, r6 + mtspr SPRN_SRR1, r5 + rfi + + /* + * Now we are in the tmp address space. + * Create a 1:1 mapping for 0-2GiB in the original TS. + */ +2: + li r3, 0 + li r4, 0 /* TLB Word 0 */ + li r5, 0 /* TLB Word 1 */ + li r6, 0 + ori r6, r6, PPC47x_TLB2_S_RWX /* TLB word 2 */ + + li r8, 0 /* PageIndex */ + + xori r7, r7, 1 /* revert back to original TS */ + +write_utlb: + rotlwi r5, r8, 28 /* RPN = PageIndex * 256M */ + /* ERPN = 0 as we don't use memory above 2G */ + + mr r4, r5 /* EPN = RPN */ + ori r4, r4, (PPC47x_TLB0_VALID | PPC47x_TLB0_256M) + insrwi r4, r7, 1, 21 /* Insert the TS to Word 0 */ + + tlbwe r4, r3, 0 /* Write out the entries */ + tlbwe r5, r3, 1 + tlbwe r6, r3, 2 + addi r8, r8, 1 + cmpwi r8, 8 /* Have we completed ? 
*/ + bne write_utlb + + /* make sure we complete the TLB write up */ + isync + + /* + * Prepare to jump to the 1:1 mapping. + * 1) Extract page size of the tmp mapping + * DSIZ = TLB_Word0[22:27] + * 2) Calculate the physical address of the address + * to jump to. + */ + rlwinm r10, r24, 0, 22, 27 + + cmpwi r10, PPC47x_TLB0_4K + bne 0f + li r10, 0x1000 /* r10 = 4k */ + bl 1f + +0: + /* Defaults to 256M */ + lis r10, 0x1000 + + bl 1f +1: mflr r4 + addi r4, r4, (2f-1b) /* virtual address of 2f */ + + subi r11, r10, 1 /* offsetmask = Pagesize - 1 */ + not r10, r11 /* Pagemask = ~(offsetmask) */ + + and r5, r25, r10 /* Physical page */ + and r6, r4, r11 /* offset within the current page */ + + or r5, r5, r6 /* Physical address for 2f */ + + /* Switch the TS in MSR to the original one */ + mfmsr r8 + insrwi r8, r7, 1, 26 + + mtspr SPRN_SRR1, r8 + mtspr SPRN_SRR0, r5 + rfi + +2: + /* Invalidate the tmp mapping */ + lis r3, 0x8000 /* Way '0' */ + + clrrwi r24, r24, 12 /* Clear the valid bit */ + tlbwe r24, r3, 0 + tlbwe r25, r3, 1 + tlbwe r26, r3, 2 + + /* Make sure we complete the TLB write and flush the shadow TLB */ + isync + +#endif + +ppc44x_map_done: + + + /* Restore the parameters */ + mr r3, r29 + mr r4, r30 + mr r5, r31 + + li r0, 0 +#else + li r0, 0 + + /* + * Set Machine Status Register to a known status, + * switch the MMU off and jump to 1: in a single step. + */ + + mr r8, r0 + ori r8, r8, MSR_RI|MSR_ME + mtspr SPRN_SRR1, r8 + addi r8, r4, 1f - relocate_new_kernel + mtspr SPRN_SRR0, r8 + sync + rfi + +1: +#endif + /* from this point address translation is turned off */ + /* and interrupts are disabled */ + + /* set a new stack at the bottom of our page... */ + /* (not really needed now) */ + addi r1, r4, KEXEC_CONTROL_PAGE_SIZE - 8 /* for LR Save+Back Chain */ + stw r0, 0(r1) + + /* Do the copies */ + li r6, 0 /* checksum */ + mr r0, r3 + b 1f + +0: /* top, read another word for the indirection page */ + lwzu r0, 4(r3) + +1: + /* is it a destination page? 
(r8) */ + rlwinm. r7, r0, 0, 31, 31 /* IND_DESTINATION (1<<0) */ + beq 2f + + rlwinm r8, r0, 0, 0, 19 /* clear kexec flags, page align */ + b 0b + +2: /* is it an indirection page? (r3) */ + rlwinm. r7, r0, 0, 30, 30 /* IND_INDIRECTION (1<<1) */ + beq 2f + + rlwinm r3, r0, 0, 0, 19 /* clear kexec flags, page align */ + subi r3, r3, 4 + b 0b + +2: /* are we done? */ + rlwinm. r7, r0, 0, 29, 29 /* IND_DONE (1<<2) */ + beq 2f + b 3f + +2: /* is it a source page? (r9) */ + rlwinm. r7, r0, 0, 28, 28 /* IND_SOURCE (1<<3) */ + beq 0b + + rlwinm r9, r0, 0, 0, 19 /* clear kexec flags, page align */ + + li r7, PAGE_SIZE / 4 + mtctr r7 + subi r9, r9, 4 + subi r8, r8, 4 +9: + lwzu r0, 4(r9) /* do the copy */ + xor r6, r6, r0 + stwu r0, 4(r8) + dcbst 0, r8 + sync + icbi 0, r8 + bdnz 9b + + addi r9, r9, 4 + addi r8, r8, 4 + b 0b + +3: + + /* To be certain of avoiding problems with self-modifying code + * execute a serializing instruction here. + */ + isync + sync + + mfspr r3, SPRN_PIR /* current core we are running on */ + mr r4, r5 /* load physical address of chunk called */ + + /* jump to the entry point, usually the setup routine */ + mtlr r5 + blrl + +1: b 1b + +relocate_new_kernel_end: + + .globl relocate_new_kernel_size +relocate_new_kernel_size: + .long relocate_new_kernel_end - relocate_new_kernel +#endif diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index b3e95ff0dba..4e314b90c75 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -1,14 +1,12 @@ /* - * arch/powerpc/kernel/misc64.S - * * This file contains miscellaneous low-level functions. * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) * * Largely rewritten by Cort Dougan (cort@cs.nmt.edu) * and Paul Mackerras. 
* Adapted for iSeries by Mike Corrigan (mikejc@us.ibm.com) - * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com) - * + * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com) + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -16,7 +14,6 @@ * */ -#include <linux/config.h> #include <linux/sys.h> #include <asm/unistd.h> #include <asm/errno.h> @@ -27,121 +24,32 @@ #include <asm/asm-offsets.h> #include <asm/cputable.h> #include <asm/thread_info.h> +#include <asm/kexec.h> +#include <asm/ptrace.h> .text -/* - * Returns (address we are running at) - (address we were linked at) - * for use before the text and data are mapped to KERNELBASE. - */ - -_GLOBAL(reloc_offset) - mflr r0 - bl 1f -1: mflr r3 - LOADADDR(r4,1b) - subf r3,r4,r3 - mtlr r0 - blr - -/* - * add_reloc_offset(x) returns x + reloc_offset(). - */ -_GLOBAL(add_reloc_offset) - mflr r0 - bl 1f -1: mflr r5 - LOADADDR(r4,1b) - subf r5,r4,r5 - add r3,r3,r5 - mtlr r0 - blr - -_GLOBAL(get_msr) - mfmsr r3 - blr - -_GLOBAL(get_dar) - mfdar r3 - blr - -_GLOBAL(get_srr0) - mfsrr0 r3 - blr - -_GLOBAL(get_srr1) - mfsrr1 r3 - blr - -_GLOBAL(get_sp) - mr r3,r1 - blr - -#ifdef CONFIG_IRQSTACKS _GLOBAL(call_do_softirq) mflr r0 std r0,16(r1) - stdu r1,THREAD_SIZE-112(r3) + stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3) mr r1,r3 - bl .__do_softirq + bl __do_softirq ld r1,0(r1) ld r0,16(r1) mtlr r0 blr -_GLOBAL(call_handle_IRQ_event) +_GLOBAL(call_do_irq) mflr r0 std r0,16(r1) - stdu r1,THREAD_SIZE-112(r6) - mr r1,r6 - bl .handle_IRQ_event + stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4) + mr r1,r4 + bl __do_irq ld r1,0(r1) ld r0,16(r1) mtlr r0 blr -#endif /* CONFIG_IRQSTACKS */ - - /* - * To be called by C code which needs to do some operations with MMU - * disabled. Note that interrupts have to be disabled by the caller - * prior to calling us. 
The code called _MUST_ be in the RMO of course - * and part of the linear mapping as we don't attempt to translate the - * stack pointer at all. The function is called with the stack switched - * to this CPU emergency stack - * - * prototype is void *call_with_mmu_off(void *func, void *data); - * - * the called function is expected to be of the form - * - * void *called(void *data); - */ -_GLOBAL(call_with_mmu_off) - mflr r0 /* get link, save it on stackframe */ - std r0,16(r1) - mr r1,r5 /* save old stack ptr */ - ld r1,PACAEMERGSP(r13) /* get emerg. stack */ - subi r1,r1,STACK_FRAME_OVERHEAD - std r0,16(r1) /* save link on emerg. stack */ - std r5,0(r1) /* save old stack ptr in backchain */ - ld r3,0(r3) /* get to real function ptr (assume same TOC) */ - bl 2f /* we need LR to return, continue at label 2 */ - - ld r0,16(r1) /* we return here from the call, get LR and */ - ld r1,0(r1) /* .. old stack ptr */ - mtspr SPRN_SRR0,r0 /* and get back to virtual mode with these */ - mfmsr r4 - ori r4,r4,MSR_IR|MSR_DR - mtspr SPRN_SRR1,r4 - rfid - -2: mtspr SPRN_SRR0,r3 /* coming from above, enter real mode */ - mr r3,r4 /* get parameter */ - mfmsr r0 - ori r0,r0,MSR_IR|MSR_DR - xori r0,r0,MSR_IR|MSR_DR - mtspr SPRN_SRR1,r0 - rfid - .section ".toc","aw" PPC64_CACHES: @@ -157,8 +65,11 @@ PPC64_CACHES: * flush all bytes from start through stop-1 inclusive */ -_KPROBE(__flush_icache_range) - +_KPROBE(flush_icache_range) +BEGIN_FTR_SECTION + PURGE_PREFETCHED_INS + blr +END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) /* * Flush the data cache to memory * @@ -301,6 +212,11 @@ _GLOBAL(__flush_dcache_icache) * Different systems have different cache line sizes */ +BEGIN_FTR_SECTION + PURGE_PREFETCHED_INS + blr +END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) + /* Flush the dcache */ ld r7,PPC64_CACHES@toc(r2) clrrdi r3,r3,PAGE_SHIFT /* Page align */ @@ -323,223 +239,54 @@ _GLOBAL(__flush_dcache_icache) bdnz 1b isync blr - -/* - * I/O string operations - * - * insb(port, buf, len) - * 
outsb(port, buf, len) - * insw(port, buf, len) - * outsw(port, buf, len) - * insl(port, buf, len) - * outsl(port, buf, len) - * insw_ns(port, buf, len) - * outsw_ns(port, buf, len) - * insl_ns(port, buf, len) - * outsl_ns(port, buf, len) - * - * The *_ns versions don't do byte-swapping. - */ -_GLOBAL(_insb) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,1 - blelr- -00: lbz r5,0(r3) - eieio - stbu r5,1(r4) - bdnz 00b - twi 0,r5,0 - isync - blr -_GLOBAL(_outsb) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,1 - blelr- -00: lbzu r5,1(r4) - stb r5,0(r3) - bdnz 00b - sync - blr - -_GLOBAL(_insw) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,2 - blelr- -00: lhbrx r5,0,r3 - eieio - sthu r5,2(r4) - bdnz 00b - twi 0,r5,0 - isync +_GLOBAL(__bswapdi2) + srdi r8,r3,32 + rlwinm r7,r3,8,0xffffffff + rlwimi r7,r3,24,0,7 + rlwinm r9,r8,8,0xffffffff + rlwimi r7,r3,24,16,23 + rlwimi r9,r8,24,0,7 + rlwimi r9,r8,24,16,23 + sldi r7,r7,32 + or r3,r7,r9 blr -_GLOBAL(_outsw) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,2 - blelr- -00: lhzu r5,2(r4) - sthbrx r5,0,r3 - bdnz 00b - sync - blr - -_GLOBAL(_insl) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,4 - blelr- -00: lwbrx r5,0,r3 - eieio - stwu r5,4(r4) - bdnz 00b - twi 0,r5,0 - isync - blr -_GLOBAL(_outsl) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,4 - blelr- -00: lwzu r5,4(r4) - stwbrx r5,0,r3 - bdnz 00b - sync - blr - -/* _GLOBAL(ide_insw) now in drivers/ide/ide-iops.c */ -_GLOBAL(_insw_ns) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,2 - blelr- -00: lhz r5,0(r3) - eieio - sthu r5,2(r4) - bdnz 00b - twi 0,r5,0 +#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX +_GLOBAL(rmci_on) + sync isync - blr - -/* _GLOBAL(ide_outsw) now in drivers/ide/ide-iops.c */ -_GLOBAL(_outsw_ns) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,2 - blelr- -00: lhzu r5,2(r4) - sth r5,0(r3) - bdnz 00b - sync - blr - -_GLOBAL(_insl_ns) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,4 - blelr- -00: lwz r5,0(r3) - eieio - stwu r5,4(r4) - bdnz 00b - twi 0,r5,0 + li r3,0x100 + rldicl r3,r3,32,0 + mfspr r5,SPRN_HID4 + or r5,r5,r3 + sync + 
mtspr SPRN_HID4,r5 + isync + slbia isync + sync blr -_GLOBAL(_outsl_ns) - cmpwi 0,r5,0 - mtctr r5 - subi r4,r4,4 - blelr- -00: lwzu r5,4(r4) - stw r5,0(r3) - bdnz 00b +_GLOBAL(rmci_off) sync - blr - -/* - * identify_cpu and calls setup_cpu - * In: r3 = base of the cpu_specs array - * r4 = address of cur_cpu_spec - * r5 = relocation offset - */ -_GLOBAL(identify_cpu) - mfpvr r7 -1: - lwz r8,CPU_SPEC_PVR_MASK(r3) - and r8,r8,r7 - lwz r9,CPU_SPEC_PVR_VALUE(r3) - cmplw 0,r9,r8 - beq 1f - addi r3,r3,CPU_SPEC_ENTRY_SIZE - b 1b -1: - sub r0,r3,r5 - std r0,0(r4) - ld r4,CPU_SPEC_SETUP(r3) - add r4,r4,r5 - ld r4,0(r4) - add r4,r4,r5 - mtctr r4 - /* Calling convention for cpu setup is r3=offset, r4=cur_cpu_spec */ - mr r4,r3 - mr r3,r5 - bctr - -/* - * do_cpu_ftr_fixups - goes through the list of CPU feature fixups - * and writes nop's over sections of code that don't apply for this cpu. - * r3 = data offset (not changed) - */ -_GLOBAL(do_cpu_ftr_fixups) - /* Get CPU 0 features */ - LOADADDR(r6,cur_cpu_spec) - sub r6,r6,r3 - ld r4,0(r6) - sub r4,r4,r3 - ld r4,CPU_SPEC_FEATURES(r4) - /* Get the fixup table */ - LOADADDR(r6,__start___ftr_fixup) - sub r6,r6,r3 - LOADADDR(r7,__stop___ftr_fixup) - sub r7,r7,r3 - /* Do the fixup */ -1: cmpld r6,r7 - bgelr - addi r6,r6,32 - ld r8,-32(r6) /* mask */ - and r8,r8,r4 - ld r9,-24(r6) /* value */ - cmpld r8,r9 - beq 1b - ld r8,-16(r6) /* section begin */ - ld r9,-8(r6) /* section end */ - subf. r9,r8,r9 - beq 1b - /* write nops over the section of code */ - /* todo: if large section, add a branch at the start of it */ - srwi r9,r9,2 - mtctr r9 - sub r8,r8,r3 - lis r0,0x60000000@h /* nop */ -3: stw r0,0(r8) - andi. 
r10,r4,CPU_FTR_SPLIT_ID_CACHE@l - beq 2f - dcbst 0,r8 /* suboptimal, but simpler */ - sync - icbi 0,r8 -2: addi r8,r8,4 - bdnz 3b - sync /* additional sync needed on g4 */ - isync - b 1b + isync + li r3,0x100 + rldicl r3,r3,32,0 + mfspr r5,SPRN_HID4 + andc r5,r5,r3 + sync + mtspr SPRN_HID4,r5 + isync + slbia + isync + sync + blr +#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */ #if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE) + /* * Do an IO access in real mode */ @@ -603,117 +350,119 @@ _GLOBAL(real_writeb) blr #endif /* defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE) */ -/* - * Create a kernel thread - * kernel_thread(fn, arg, flags) - */ -_GLOBAL(kernel_thread) - std r29,-24(r1) - std r30,-16(r1) - stdu r1,-STACK_FRAME_OVERHEAD(r1) - mr r29,r3 - mr r30,r4 - ori r3,r5,CLONE_VM /* flags */ - oris r3,r3,(CLONE_UNTRACED>>16) - li r4,0 /* new sp (unused) */ - li r0,__NR_clone - sc - cmpdi 0,r3,0 /* parent or child? */ - bne 1f /* return if parent */ - li r0,0 - stdu r0,-STACK_FRAME_OVERHEAD(r1) - ld r2,8(r29) - ld r29,0(r29) - mtlr r29 /* fn addr in lr */ - mr r3,r30 /* load arg and call fn */ - blrl - li r0,__NR_exit /* exit after child exits */ - li r3,0 - sc -1: addi r1,r1,STACK_FRAME_OVERHEAD - ld r29,-24(r1) - ld r30,-16(r1) +#ifdef CONFIG_PPC_PASEMI + +_GLOBAL(real_205_readb) + mfmsr r7 + ori r0,r7,MSR_DR + xori r0,r0,MSR_DR + sync + mtmsrd r0 + sync + isync + LBZCIX(R3,R0,R3) + isync + mtmsrd r7 + sync + isync blr -/* - * disable_kernel_fp() - * Disable the FPU. 
- */ -_GLOBAL(disable_kernel_fp) - mfmsr r3 - rldicl r0,r3,(63-MSR_FP_LG),1 - rldicl r3,r0,(MSR_FP_LG+1),0 - mtmsrd r3 /* disable use of fpu now */ +_GLOBAL(real_205_writeb) + mfmsr r7 + ori r0,r7,MSR_DR + xori r0,r0,MSR_DR + sync + mtmsrd r0 + sync + isync + STBCIX(R3,R0,R4) + isync + mtmsrd r7 + sync isync blr -#ifdef CONFIG_ALTIVEC +#endif /* CONFIG_PPC_PASEMI */ -#if 0 /* this has no callers for now */ + +#if defined(CONFIG_CPU_FREQ_PMAC64) || defined(CONFIG_CPU_FREQ_MAPLE) /* - * disable_kernel_altivec() - * Disable the VMX. + * SCOM access functions for 970 (FX only for now) + * + * unsigned long scom970_read(unsigned int address); + * void scom970_write(unsigned int address, unsigned long value); + * + * The address passed in is the 24 bits register address. This code + * is 970 specific and will not check the status bits, so you should + * know what you are doing. */ -_GLOBAL(disable_kernel_altivec) - mfmsr r3 - rldicl r0,r3,(63-MSR_VEC_LG),1 - rldicl r3,r0,(MSR_VEC_LG+1),0 - mtmsrd r3 /* disable use of VMX now */ +_GLOBAL(scom970_read) + /* interrupts off */ + mfmsr r4 + ori r0,r4,MSR_EE + xori r0,r0,MSR_EE + mtmsrd r0,1 + + /* rotate 24 bits SCOM address 8 bits left and mask out it's low 8 bits + * (including parity). On current CPUs they must be 0'd, + * and finally or in RW bit + */ + rlwinm r3,r3,8,0,15 + ori r3,r3,0x8000 + + /* do the actual scom read */ + sync + mtspr SPRN_SCOMC,r3 + isync + mfspr r3,SPRN_SCOMD isync + mfspr r0,SPRN_SCOMC + isync + + /* XXX: fixup result on some buggy 970's (ouch ! we lost a bit, bah + * that's the best we can do). Not implemented yet as we don't use + * the scom on any of the bogus CPUs yet, but may have to be done + * ultimately + */ + + /* restore interrupts */ + mtmsrd r4,1 blr -#endif /* 0 */ -/* - * giveup_altivec(tsk) - * Disable VMX for the task given as the argument, - * and save the vector registers in its thread_struct. - * Enables the VMX for use in the kernel on return. 
- */ -_GLOBAL(giveup_altivec) + +_GLOBAL(scom970_write) + /* interrupts off */ mfmsr r5 - oris r5,r5,MSR_VEC@h - mtmsrd r5 /* enable use of VMX now */ - isync - cmpdi 0,r3,0 - beqlr- /* if no previous owner, done */ - addi r3,r3,THREAD /* want THREAD of task */ - ld r5,PT_REGS(r3) - cmpdi 0,r5,0 - SAVE_32VRS(0,r4,r3) - mfvscr vr0 - li r4,THREAD_VSCR - stvx vr0,r4,r3 - beq 1f - ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) - lis r3,MSR_VEC@h - andc r4,r4,r3 /* disable FP for previous task */ - std r4,_MSR-STACK_FRAME_OVERHEAD(r5) -1: -#ifndef CONFIG_SMP - li r5,0 - ld r4,last_task_used_altivec@got(r2) - std r5,0(r4) -#endif /* CONFIG_SMP */ - blr + ori r0,r5,MSR_EE + xori r0,r0,MSR_EE + mtmsrd r0,1 -#endif /* CONFIG_ALTIVEC */ + /* rotate 24 bits SCOM address 8 bits left and mask out it's low 8 bits + * (including parity). On current CPUs they must be 0'd. + */ -_GLOBAL(__setup_cpu_power3) - blr + rlwinm r3,r3,8,0,15 + + sync + mtspr SPRN_SCOMD,r4 /* write data */ + isync + mtspr SPRN_SCOMC,r3 /* write command */ + isync + mfspr 3,SPRN_SCOMC + isync -_GLOBAL(execve) - li r0,__NR_execve - sc - bnslr - neg r3,r3 + /* restore interrupts */ + mtmsrd r5,1 blr +#endif /* CONFIG_CPU_FREQ_PMAC64 || CONFIG_CPU_FREQ_MAPLE */ /* kexec_wait(phys_cpu) * * wait for the flag to change, indicating this kernel is going away but * the slave code for the next one is at addresses 0 to 100. * - * This is used by all slaves. + * This is used by all slaves, even those that did not find a matching + * paca in the secondary startup code. * * Physical (hardware) cpu id should be in r3. 
*/ @@ -745,16 +494,19 @@ kexec_flag: * note: this is a terminal routine, it does not save lr * * get phys id from paca - * set paca id to -1 to say we got here * switch to real mode + * mark the paca as no longer used * join other cpus in kexec_wait(phys_id) */ _GLOBAL(kexec_smp_wait) lhz r3,PACAHWCPUID(r13) - li r4,-1 - sth r4,PACAHWCPUID(r13) /* let others know we left */ bl real_mode - b .kexec_wait + + li r4,KEXEC_STATE_REAL_MODE + stb r4,PACAKEXECSTATE(r13) + SYNC + + b kexec_wait /* * switch to real mode (turn mmu off) @@ -789,7 +541,7 @@ _GLOBAL(kexec_sequence) std r0,16(r1) /* switch stacks to newstack -- &kexec_stack.stack */ - stdu r1,THREAD_SIZE-112(r3) + stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3) mr r1,r3 li r0,0 @@ -806,7 +558,7 @@ _GLOBAL(kexec_sequence) std r26,-48(r1) std r25,-56(r1) - stdu r1,-112-64(r1) + stdu r1,-STACK_FRAME_OVERHEAD-64(r1) /* save args into preserved regs */ mr r31,r3 /* newstack (both) */ @@ -824,15 +576,32 @@ _GLOBAL(kexec_sequence) /* copy dest pages, flush whole dest image */ mr r3,r29 - bl .kexec_copy_flush /* (image) */ + bl kexec_copy_flush /* (image) */ /* turn off mmu */ bl real_mode + /* copy 0x100 bytes starting at start to 0 */ + li r3,0 + mr r4,r30 /* start, aka phys mem offset */ + li r5,0x100 + li r6,0 + bl copy_and_flush /* (dest, src, copy limit, start offset) */ +1: /* assume normal blr return */ + + /* release other cpus to the new kernel secondary start at 0x60 */ + mflr r5 + li r6,1 + stw r6,kexec_flag-1b(5) + /* clear out hardware hash page table and tlb */ - ld r5,0(r27) /* deref function descriptor */ - mtctr r5 - bctrl /* ppc_md.hash_clear_all(void); */ +#if !defined(_CALL_ELF) || _CALL_ELF != 2 + ld r12,0(r27) /* deref function descriptor */ +#else + mr r12,r27 +#endif + mtctr r12 + bctrl /* ppc_md.hpte_clear_all(void); */ /* * kexec image calling is: @@ -859,22 +628,37 @@ _GLOBAL(kexec_sequence) * are the boot cpu ????? * other device tree differences (prop sizes, va vs pa, etc)... 
*/ - - /* copy 0x100 bytes starting at start to 0 */ - li r3,0 - mr r4,r30 - li r5,0x100 - li r6,0 - bl .copy_and_flush /* (dest, src, copy limit, start offset) */ -1: /* assume normal blr return */ - - /* release other cpus to the new kernel secondary start at 0x60 */ - mflr r5 - li r6,1 - stw r6,kexec_flag-1b(5) mr r3,r25 # my phys cpu mr r4,r30 # start, aka phys mem offset mtlr 4 li r5,0 blr /* image->start(physid, image->start, 0); */ #endif /* CONFIG_KEXEC */ + +#ifdef CONFIG_MODULES +#if defined(_CALL_ELF) && _CALL_ELF == 2 + +#ifdef CONFIG_MODVERSIONS +.weak __crc_TOC. +.section "___kcrctab+TOC.","a" +.globl __kcrctab_TOC. +__kcrctab_TOC.: + .llong __crc_TOC. +#endif + +/* + * Export a fake .TOC. since both modpost and depmod will complain otherwise. + * Both modpost and depmod strip the leading . so we do the same here. + */ +.section "__ksymtab_strings","a" +__kstrtab_TOC.: + .asciz "TOC." + +.section "___ksymtab+TOC.","a" +/* This symbol name is important: it's used by modpost to find exported syms */ +.globl __ksymtab_TOC. +__ksymtab_TOC.: + .llong 0 /* .value */ + .llong __kstrtab_TOC. +#endif /* ELFv2 */ +#endif /* MODULES */ diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c new file mode 100644 index 00000000000..9547381b631 --- /dev/null +++ b/arch/powerpc/kernel/module.c @@ -0,0 +1,79 @@ +/* Kernel module help for powerpc. + Copyright (C) 2001, 2003 Rusty Russell IBM Corporation. + Copyright (C) 2008 Freescale Semiconductor, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include <linux/elf.h> +#include <linux/moduleloader.h> +#include <linux/err.h> +#include <linux/vmalloc.h> +#include <linux/bug.h> +#include <asm/module.h> +#include <asm/uaccess.h> +#include <asm/firmware.h> +#include <linux/sort.h> +#include <asm/setup.h> + +LIST_HEAD(module_bug_list); + +static const Elf_Shdr *find_section(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + const char *name) +{ + char *secstrings; + unsigned int i; + + secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + for (i = 1; i < hdr->e_shnum; i++) + if (strcmp(secstrings+sechdrs[i].sh_name, name) == 0) + return &sechdrs[i]; + return NULL; +} + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, struct module *me) +{ + const Elf_Shdr *sect; + + /* Apply feature fixups */ + sect = find_section(hdr, sechdrs, "__ftr_fixup"); + if (sect != NULL) + do_feature_fixups(cur_cpu_spec->cpu_features, + (void *)sect->sh_addr, + (void *)sect->sh_addr + sect->sh_size); + + sect = find_section(hdr, sechdrs, "__mmu_ftr_fixup"); + if (sect != NULL) + do_feature_fixups(cur_cpu_spec->mmu_features, + (void *)sect->sh_addr, + (void *)sect->sh_addr + sect->sh_size); + +#ifdef CONFIG_PPC64 + sect = find_section(hdr, sechdrs, "__fw_ftr_fixup"); + if (sect != NULL) + do_feature_fixups(powerpc_firmware_features, + (void *)sect->sh_addr, + (void *)sect->sh_addr + sect->sh_size); +#endif + + sect = find_section(hdr, sechdrs, "__lwsync_fixup"); + if (sect != NULL) + do_lwsync_fixups(cur_cpu_spec->cpu_features, + (void *)sect->sh_addr, + (void *)sect->sh_addr + sect->sh_size); + + return 0; +} diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c new file mode 100644 index 00000000000..6cff040bf45 --- /dev/null +++ b/arch/powerpc/kernel/module_32.c @@ -0,0 
+1,307 @@ +/* Kernel module help for PPC. + Copyright (C) 2001 Rusty Russell. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include <linux/module.h> +#include <linux/moduleloader.h> +#include <linux/elf.h> +#include <linux/vmalloc.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ftrace.h> +#include <linux/cache.h> +#include <linux/bug.h> +#include <linux/sort.h> +#include <asm/setup.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt , ...) 
+#endif + +/* Count how many different relocations (different symbol, different + addend) */ +static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num) +{ + unsigned int i, r_info, r_addend, _count_relocs; + + _count_relocs = 0; + r_info = 0; + r_addend = 0; + for (i = 0; i < num; i++) + /* Only count 24-bit relocs, others don't need stubs */ + if (ELF32_R_TYPE(rela[i].r_info) == R_PPC_REL24 && + (r_info != ELF32_R_SYM(rela[i].r_info) || + r_addend != rela[i].r_addend)) { + _count_relocs++; + r_info = ELF32_R_SYM(rela[i].r_info); + r_addend = rela[i].r_addend; + } + +#ifdef CONFIG_DYNAMIC_FTRACE + _count_relocs++; /* add one for ftrace_caller */ +#endif + return _count_relocs; +} + +static int relacmp(const void *_x, const void *_y) +{ + const Elf32_Rela *x, *y; + + y = (Elf32_Rela *)_x; + x = (Elf32_Rela *)_y; + + /* Compare the entire r_info (as opposed to ELF32_R_SYM(r_info) only) to + * make the comparison cheaper/faster. It won't affect the sorting or + * the counting algorithms' performance + */ + if (x->r_info < y->r_info) + return -1; + else if (x->r_info > y->r_info) + return 1; + else if (x->r_addend < y->r_addend) + return -1; + else if (x->r_addend > y->r_addend) + return 1; + else + return 0; +} + +static void relaswap(void *_x, void *_y, int size) +{ + uint32_t *x, *y, tmp; + int i; + + y = (uint32_t *)_x; + x = (uint32_t *)_y; + + for (i = 0; i < sizeof(Elf32_Rela) / sizeof(uint32_t); i++) { + tmp = x[i]; + x[i] = y[i]; + y[i] = tmp; + } +} + +/* Get the potential trampolines size required of the init and + non-init sections */ +static unsigned long get_plt_size(const Elf32_Ehdr *hdr, + const Elf32_Shdr *sechdrs, + const char *secstrings, + int is_init) +{ + unsigned long ret = 0; + unsigned i; + + /* Everything marked ALLOC (this includes the exported + symbols) */ + for (i = 1; i < hdr->e_shnum; i++) { + /* If it's called *.init*, and we're not init, we're + not interested */ + if ((strstr(secstrings + sechdrs[i].sh_name, ".init") 
!= 0) + != is_init) + continue; + + /* We don't want to look at debug sections. */ + if (strstr(secstrings + sechdrs[i].sh_name, ".debug") != 0) + continue; + + if (sechdrs[i].sh_type == SHT_RELA) { + DEBUGP("Found relocations in section %u\n", i); + DEBUGP("Ptr: %p. Number: %u\n", + (void *)hdr + sechdrs[i].sh_offset, + sechdrs[i].sh_size / sizeof(Elf32_Rela)); + + /* Sort the relocation information based on a symbol and + * addend key. This is a stable O(n*log n) complexity + * alogrithm but it will reduce the complexity of + * count_relocs() to linear complexity O(n) + */ + sort((void *)hdr + sechdrs[i].sh_offset, + sechdrs[i].sh_size / sizeof(Elf32_Rela), + sizeof(Elf32_Rela), relacmp, relaswap); + + ret += count_relocs((void *)hdr + + sechdrs[i].sh_offset, + sechdrs[i].sh_size + / sizeof(Elf32_Rela)) + * sizeof(struct ppc_plt_entry); + } + } + + return ret; +} + +int module_frob_arch_sections(Elf32_Ehdr *hdr, + Elf32_Shdr *sechdrs, + char *secstrings, + struct module *me) +{ + unsigned int i; + + /* Find .plt and .init.plt sections */ + for (i = 0; i < hdr->e_shnum; i++) { + if (strcmp(secstrings + sechdrs[i].sh_name, ".init.plt") == 0) + me->arch.init_plt_section = i; + else if (strcmp(secstrings + sechdrs[i].sh_name, ".plt") == 0) + me->arch.core_plt_section = i; + } + if (!me->arch.core_plt_section || !me->arch.init_plt_section) { + printk("Module doesn't contain .plt or .init.plt sections.\n"); + return -ENOEXEC; + } + + /* Override their sizes */ + sechdrs[me->arch.core_plt_section].sh_size + = get_plt_size(hdr, sechdrs, secstrings, 0); + sechdrs[me->arch.init_plt_section].sh_size + = get_plt_size(hdr, sechdrs, secstrings, 1); + return 0; +} + +static inline int entry_matches(struct ppc_plt_entry *entry, Elf32_Addr val) +{ + if (entry->jump[0] == 0x3d800000 + ((val + 0x8000) >> 16) + && entry->jump[1] == 0x398c0000 + (val & 0xffff)) + return 1; + return 0; +} + +/* Set up a trampoline in the PLT to bounce us to the distant function */ +static uint32_t 
do_plt_call(void *location, + Elf32_Addr val, + Elf32_Shdr *sechdrs, + struct module *mod) +{ + struct ppc_plt_entry *entry; + + DEBUGP("Doing plt for call to 0x%x at 0x%x\n", val, (unsigned int)location); + /* Init, or core PLT? */ + if (location >= mod->module_core + && location < mod->module_core + mod->core_size) + entry = (void *)sechdrs[mod->arch.core_plt_section].sh_addr; + else + entry = (void *)sechdrs[mod->arch.init_plt_section].sh_addr; + + /* Find this entry, or if that fails, the next avail. entry */ + while (entry->jump[0]) { + if (entry_matches(entry, val)) return (uint32_t)entry; + entry++; + } + + entry->jump[0] = 0x3d800000+((val+0x8000)>>16); /* lis r12,sym@ha */ + entry->jump[1] = 0x398c0000 + (val&0xffff); /* addi r12,r12,sym@l*/ + entry->jump[2] = 0x7d8903a6; /* mtctr r12 */ + entry->jump[3] = 0x4e800420; /* bctr */ + + DEBUGP("Initialized plt for 0x%x at %p\n", val, entry); + return (uint32_t)entry; +} + +int apply_relocate_add(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *module) +{ + unsigned int i; + Elf32_Rela *rela = (void *)sechdrs[relsec].sh_addr; + Elf32_Sym *sym; + uint32_t *location; + uint32_t value; + + DEBUGP("Applying ADD relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rela); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rela[i].r_offset; + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf32_Sym *)sechdrs[symindex].sh_addr + + ELF32_R_SYM(rela[i].r_info); + /* `Everything is relative'. 
*/ + value = sym->st_value + rela[i].r_addend; + + switch (ELF32_R_TYPE(rela[i].r_info)) { + case R_PPC_ADDR32: + /* Simply set it */ + *(uint32_t *)location = value; + break; + + case R_PPC_ADDR16_LO: + /* Low half of the symbol */ + *(uint16_t *)location = value; + break; + + case R_PPC_ADDR16_HI: + /* Higher half of the symbol */ + *(uint16_t *)location = (value >> 16); + break; + + case R_PPC_ADDR16_HA: + /* Sign-adjusted lower 16 bits: PPC ELF ABI says: + (((x >> 16) + ((x & 0x8000) ? 1 : 0))) & 0xFFFF. + This is the same, only sane. + */ + *(uint16_t *)location = (value + 0x8000) >> 16; + break; + + case R_PPC_REL24: + if ((int)(value - (uint32_t)location) < -0x02000000 + || (int)(value - (uint32_t)location) >= 0x02000000) + value = do_plt_call(location, value, + sechdrs, module); + + /* Only replace bits 2 through 26 */ + DEBUGP("REL24 value = %08X. location = %08X\n", + value, (uint32_t)location); + DEBUGP("Location before: %08X.\n", + *(uint32_t *)location); + *(uint32_t *)location + = (*(uint32_t *)location & ~0x03fffffc) + | ((value - (uint32_t)location) + & 0x03fffffc); + DEBUGP("Location after: %08X.\n", + *(uint32_t *)location); + DEBUGP("ie. jump to %08X+%08X = %08X\n", + *(uint32_t *)location & 0x03fffffc, + (uint32_t)location, + (*(uint32_t *)location & 0x03fffffc) + + (uint32_t)location); + break; + + case R_PPC_REL32: + /* 32-bit relative jump. */ + *(uint32_t *)location = value - (uint32_t)location; + break; + + default: + printk("%s: unknown ADD relocation: %u\n", + module->name, + ELF32_R_TYPE(rela[i].r_info)); + return -ENOEXEC; + } + } +#ifdef CONFIG_DYNAMIC_FTRACE + module->arch.tramp = + do_plt_call(module->module_core, + (unsigned long)ftrace_caller, + sechdrs, module); +#endif + return 0; +} diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c new file mode 100644 index 00000000000..d807ee626af --- /dev/null +++ b/arch/powerpc/kernel/module_64.c @@ -0,0 +1,673 @@ +/* Kernel module help for PPC64. 
+ Copyright (C) 2001, 2003 Rusty Russell IBM Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include <linux/module.h> +#include <linux/elf.h> +#include <linux/moduleloader.h> +#include <linux/err.h> +#include <linux/vmalloc.h> +#include <linux/ftrace.h> +#include <linux/bug.h> +#include <linux/uaccess.h> +#include <asm/module.h> +#include <asm/firmware.h> +#include <asm/code-patching.h> +#include <linux/sort.h> +#include <asm/setup.h> + +/* FIXME: We don't do .init separately. To do this, we'd need to have + a separate r2 value in the init and core section, and stub between + them, too. + + Using a magic allocator which places modules within 32MB solves + this, and makes other things simpler. Anton? + --RR. */ +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt , ...) +#endif + +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define R2_STACK_OFFSET 24 + +/* An address is simply the address of the function. */ +typedef unsigned long func_desc_t; + +static func_desc_t func_desc(unsigned long addr) +{ + return addr; +} +static unsigned long func_addr(unsigned long addr) +{ + return addr; +} +static unsigned long stub_func_addr(func_desc_t func) +{ + return func; +} + +/* PowerPC64 specific values for the Elf64_Sym st_other field. 
*/ +#define STO_PPC64_LOCAL_BIT 5 +#define STO_PPC64_LOCAL_MASK (7 << STO_PPC64_LOCAL_BIT) +#define PPC64_LOCAL_ENTRY_OFFSET(other) \ + (((1 << (((other) & STO_PPC64_LOCAL_MASK) >> STO_PPC64_LOCAL_BIT)) >> 2) << 2) + +static unsigned int local_entry_offset(const Elf64_Sym *sym) +{ + /* sym->st_other indicates offset to local entry point + * (otherwise it will assume r12 is the address of the start + * of function and try to derive r2 from it). */ + return PPC64_LOCAL_ENTRY_OFFSET(sym->st_other); +} +#else +#define R2_STACK_OFFSET 40 + +/* An address is address of the OPD entry, which contains address of fn. */ +typedef struct ppc64_opd_entry func_desc_t; + +static func_desc_t func_desc(unsigned long addr) +{ + return *(struct ppc64_opd_entry *)addr; +} +static unsigned long func_addr(unsigned long addr) +{ + return func_desc(addr).funcaddr; +} +static unsigned long stub_func_addr(func_desc_t func) +{ + return func.funcaddr; +} +static unsigned int local_entry_offset(const Elf64_Sym *sym) +{ + return 0; +} +#endif + +/* Like PPC32, we need little trampolines to do > 24-bit jumps (into + the kernel itself). But on PPC64, these need to be used for every + jump, actually, to reset r2 (TOC+0x8000). */ +struct ppc64_stub_entry +{ + /* 28 byte jump instruction sequence (7 instructions). We only + * need 6 instructions on ABIv2 but we always allocate 7 so + * so we don't have to modify the trampoline load instruction. */ + u32 jump[7]; + u32 unused; + /* Data for the above code */ + func_desc_t funcdata; +}; + +/* + * PPC64 uses 24 bit jumps, but we need to jump into other modules or + * the kernel which may be further. So we jump to a stub. + * + * For ELFv1 we need to use this to set up the new r2 value (aka TOC + * pointer). For ELFv2 it's the callee's responsibility to set up the + * new r2, but for both we need to save the old r2. 
+ *
+ * We could simply patch the new r2 value and function pointer into
+ * the stub, but it's significantly shorter to put these values at the
+ * end of the stub code, and patch the stub address (32-bits relative
+ * to the TOC ptr, r2) into the stub.
+ */
+
+static u32 ppc64_stub_insns[] = {
+	0x3d620000,			/* addis   r11,r2, <high> */
+	0x396b0000,			/* addi    r11,r11, <low> */
+	/* Save current r2 value in magic place on the stack. */
+	0xf8410000|R2_STACK_OFFSET,	/* std     r2,R2_STACK_OFFSET(r1) */
+	0xe98b0020,			/* ld      r12,32(r11) */
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
+	/* Set up new r2 from function descriptor */
+	0xe84b0028,			/* ld      r2,40(r11) */
+#endif
+	0x7d8903a6,			/* mtctr   r12 */
+	0x4e800420			/* bctr */
+};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+/*
+ * Per-instruction comparison masks for ppc64_stub_insns.  The low 16
+ * bits of the first two instructions are ignored because they hold
+ * the <high>/<low> immediates, which differ from stub to stub; every
+ * other instruction must match exactly.
+ */
+static u32 ppc64_stub_mask[] = {
+	0xffff0000,
+	0xffff0000,
+	0xffffffff,
+	0xffffffff,
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
+	0xffffffff,
+#endif
+	0xffffffff,
+	0xffffffff
+};
+
+/*
+ * Check whether the instructions at @p look like one of our module
+ * stubs, i.e. match ppc64_stub_insns under ppc64_stub_mask.
+ *
+ * Returns true on a match, false otherwise.  An address that cannot
+ * be read is not a trampoline, so a failed read yields false.
+ */
+bool is_module_trampoline(u32 *p)
+{
+	unsigned int i;
+	u32 insns[ARRAY_SIZE(ppc64_stub_insns)];
+
+	BUILD_BUG_ON(sizeof(ppc64_stub_insns) != sizeof(ppc64_stub_mask));
+
+	/*
+	 * This function returns bool: a negative errno here would be
+	 * converted to true and wrongly report a trampoline.
+	 */
+	if (probe_kernel_read(insns, p, sizeof(insns)))
+		return false;
+
+	for (i = 0; i < ARRAY_SIZE(ppc64_stub_insns); i++) {
+		u32 insna = insns[i];
+		u32 insnb = ppc64_stub_insns[i];
+		u32 mask = ppc64_stub_mask[i];
+
+		if ((insna & mask) != (insnb & mask))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Work out where the stub at @trampoline jumps to and store that
+ * address in *@target.  Returns 0 on success or -EFAULT if either
+ * the stub or its data word cannot be read.
+ */
+int module_trampoline_target(struct module *mod, u32 *trampoline,
+			     unsigned long *target)
+{
+	u32 buf[2];
+	u16 upper, lower;
+	long offset;
+	void *toc_entry;
+
+	if (probe_kernel_read(buf, trampoline, sizeof(buf)))
+		return -EFAULT;
+
+	/* Immediates of the leading addis/addi pair */
+	upper = buf[0] & 0xffff;
+	lower = buf[1] & 0xffff;
+
+	/* perform the addis/addi, both signed */
+	offset = ((short)upper << 16) + (short)lower;
+
+	/*
+	 * Now get the address this trampoline jumps to. This
+	 * is always 32 bytes into our trampoline stub.
+ */ + toc_entry = (void *)mod->arch.toc + offset + 32; + + if (probe_kernel_read(target, toc_entry, sizeof(*target))) + return -EFAULT; + + return 0; +} + +#endif + +/* Count how many different 24-bit relocations (different symbol, + different addend) */ +static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num) +{ + unsigned int i, r_info, r_addend, _count_relocs; + + /* FIXME: Only count external ones --RR */ + _count_relocs = 0; + r_info = 0; + r_addend = 0; + for (i = 0; i < num; i++) + /* Only count 24-bit relocs, others don't need stubs */ + if (ELF64_R_TYPE(rela[i].r_info) == R_PPC_REL24 && + (r_info != ELF64_R_SYM(rela[i].r_info) || + r_addend != rela[i].r_addend)) { + _count_relocs++; + r_info = ELF64_R_SYM(rela[i].r_info); + r_addend = rela[i].r_addend; + } + + return _count_relocs; +} + +static int relacmp(const void *_x, const void *_y) +{ + const Elf64_Rela *x, *y; + + y = (Elf64_Rela *)_x; + x = (Elf64_Rela *)_y; + + /* Compare the entire r_info (as opposed to ELF64_R_SYM(r_info) only) to + * make the comparison cheaper/faster. It won't affect the sorting or + * the counting algorithms' performance + */ + if (x->r_info < y->r_info) + return -1; + else if (x->r_info > y->r_info) + return 1; + else if (x->r_addend < y->r_addend) + return -1; + else if (x->r_addend > y->r_addend) + return 1; + else + return 0; +} + +static void relaswap(void *_x, void *_y, int size) +{ + uint64_t *x, *y, tmp; + int i; + + y = (uint64_t *)_x; + x = (uint64_t *)_y; + + for (i = 0; i < sizeof(Elf64_Rela) / sizeof(uint64_t); i++) { + tmp = x[i]; + x[i] = y[i]; + y[i] = tmp; + } +} + +/* Get size of potential trampolines required. */ +static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, + const Elf64_Shdr *sechdrs) +{ + /* One extra reloc so it's always 0-funcaddr terminated */ + unsigned long relocs = 1; + unsigned i; + + /* Every relocated section... 
*/ + for (i = 1; i < hdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_RELA) { + DEBUGP("Found relocations in section %u\n", i); + DEBUGP("Ptr: %p. Number: %lu\n", + (void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size / sizeof(Elf64_Rela)); + + /* Sort the relocation information based on a symbol and + * addend key. This is a stable O(n*log n) complexity + * alogrithm but it will reduce the complexity of + * count_relocs() to linear complexity O(n) + */ + sort((void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size / sizeof(Elf64_Rela), + sizeof(Elf64_Rela), relacmp, relaswap); + + relocs += count_relocs((void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size + / sizeof(Elf64_Rela)); + } + } + +#ifdef CONFIG_DYNAMIC_FTRACE + /* make the trampoline to the ftrace_caller */ + relocs++; +#endif + + DEBUGP("Looks like a total of %lu stubs, max\n", relocs); + return relocs * sizeof(struct ppc64_stub_entry); +} + +/* Still needed for ELFv2, for .TOC. */ +static void dedotify_versions(struct modversion_info *vers, + unsigned long size) +{ + struct modversion_info *end; + + for (end = (void *)vers + size; vers < end; vers++) + if (vers->name[0] == '.') { + memmove(vers->name, vers->name+1, strlen(vers->name)); +#ifdef ARCH_RELOCATES_KCRCTAB + /* The TOC symbol has no CRC computed. To avoid CRC + * check failing, we must force it to the expected + * value (see CRC check in module.c). + */ + if (!strcmp(vers->name, "TOC.")) + vers->crc = -(unsigned long)reloc_start; +#endif + } +} + +/* Undefined symbols which refer to .funcname, hack to funcname (or .TOC.) 
*/ +static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab) +{ + unsigned int i; + + for (i = 1; i < numsyms; i++) { + if (syms[i].st_shndx == SHN_UNDEF) { + char *name = strtab + syms[i].st_name; + if (name[0] == '.') + memmove(name, name+1, strlen(name)); + } + } +} + +static Elf64_Sym *find_dot_toc(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex) +{ + unsigned int i, numsyms; + Elf64_Sym *syms; + + syms = (Elf64_Sym *)sechdrs[symindex].sh_addr; + numsyms = sechdrs[symindex].sh_size / sizeof(Elf64_Sym); + + for (i = 1; i < numsyms; i++) { + if (syms[i].st_shndx == SHN_UNDEF + && strcmp(strtab + syms[i].st_name, "TOC.") == 0) + return &syms[i]; + } + return NULL; +} + +int module_frob_arch_sections(Elf64_Ehdr *hdr, + Elf64_Shdr *sechdrs, + char *secstrings, + struct module *me) +{ + unsigned int i; + + /* Find .toc and .stubs sections, symtab and strtab */ + for (i = 1; i < hdr->e_shnum; i++) { + char *p; + if (strcmp(secstrings + sechdrs[i].sh_name, ".stubs") == 0) + me->arch.stubs_section = i; + else if (strcmp(secstrings + sechdrs[i].sh_name, ".toc") == 0) + me->arch.toc_section = i; + else if (strcmp(secstrings+sechdrs[i].sh_name,"__versions")==0) + dedotify_versions((void *)hdr + sechdrs[i].sh_offset, + sechdrs[i].sh_size); + + /* We don't handle .init for the moment: rename to _init */ + while ((p = strstr(secstrings + sechdrs[i].sh_name, ".init"))) + p[0] = '_'; + + if (sechdrs[i].sh_type == SHT_SYMTAB) + dedotify((void *)hdr + sechdrs[i].sh_offset, + sechdrs[i].sh_size / sizeof(Elf64_Sym), + (void *)hdr + + sechdrs[sechdrs[i].sh_link].sh_offset); + } + + if (!me->arch.stubs_section) { + printk("%s: doesn't contain .stubs.\n", me->name); + return -ENOEXEC; + } + + /* If we don't have a .toc, just use .stubs. We need to set r2 + to some reasonable value in case the module calls out to + other functions via a stub, or if a function pointer escapes + the module by some means. 
*/
+	if (!me->arch.toc_section)
+		me->arch.toc_section = me->arch.stubs_section;
+
+	/* Override the stubs size */
+	sechdrs[me->arch.stubs_section].sh_size = get_stubs_size(hdr, sechdrs);
+	return 0;
+}
+
+/* r2 is the TOC pointer: it actually points 0x8000 into the TOC (this
+   gives the value maximum span in an instruction which uses a signed
+   offset) */
+static inline unsigned long my_r2(Elf64_Shdr *sechdrs, struct module *me)
+{
+	return sechdrs[me->arch.toc_section].sh_addr + 0x8000;
+}
+
+/* Both low and high 16 bits are added as SIGNED additions, so if low
+   16 bits has high bit set, high 16 bits must be adjusted.  These
+   macros do that (stolen from binutils). */
+#define PPC_LO(v) ((v) & 0xffff)
+#define PPC_HI(v) (((v) >> 16) & 0xffff)
+#define PPC_HA(v) PPC_HI ((v) + 0x8000)
+
+/* Patch stub to reference function and correct r2 value.
+ *
+ * Copies the stub template into @entry, patches the r2-relative
+ * offset of @entry into the first two instructions and stores the
+ * function data for @addr behind the code.
+ *
+ * Returns 1 on success, 0 if @entry is not within a signed 32-bit
+ * offset of the module's r2 value.
+ */
+static inline int create_stub(Elf64_Shdr *sechdrs,
+			      struct ppc64_stub_entry *entry,
+			      unsigned long addr,
+			      struct module *me)
+{
+	long reladdr;
+
+	memcpy(entry->jump, ppc64_stub_insns, sizeof(ppc64_stub_insns));
+
+	/* Stub uses address relative to r2. */
+	reladdr = (unsigned long)entry - my_r2(sechdrs, me);
+	if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) {
+		/* Report the r2 value itself, not the address of my_r2() */
+		printk("%s: Address %p of stub out of range of %p.\n",
+		       me->name, (void *)reladdr,
+		       (void *)my_r2(sechdrs, me));
+		return 0;
+	}
+	DEBUGP("Stub %p get data from reladdr %li\n", entry, reladdr);
+
+	/* High half is sign-adjusted because the low half is added signed */
+	entry->jump[0] |= PPC_HA(reladdr);
+	entry->jump[1] |= PPC_LO(reladdr);
+	entry->funcdata = func_desc(addr);
+	return 1;
+}
+
+/* Create stub to jump to function described in this OPD/ptr: we need the
+   stub to set up the TOC ptr (r2) for the function. */
+static unsigned long stub_for_addr(Elf64_Shdr *sechdrs,
+				   unsigned long addr,
+				   struct module *me)
+{
+	struct ppc64_stub_entry *stubs;
+	unsigned int i, num_stubs;
+
+	num_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*stubs);
+
+	/* Find this stub, or if that fails, the next avail.
entry */ + stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr; + for (i = 0; stub_func_addr(stubs[i].funcdata); i++) { + BUG_ON(i >= num_stubs); + + if (stub_func_addr(stubs[i].funcdata) == func_addr(addr)) + return (unsigned long)&stubs[i]; + } + + if (!create_stub(sechdrs, &stubs[i], addr, me)) + return 0; + + return (unsigned long)&stubs[i]; +} + +/* We expect a noop next: if it is, replace it with instruction to + restore r2. */ +static int restore_r2(u32 *instruction, struct module *me) +{ + if (*instruction != PPC_INST_NOP) { + printk("%s: Expect noop after relocate, got %08x\n", + me->name, *instruction); + return 0; + } + /* ld r2,R2_STACK_OFFSET(r1) */ + *instruction = 0xe8410000 | R2_STACK_OFFSET; + return 1; +} + +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf64_Rela *rela = (void *)sechdrs[relsec].sh_addr; + Elf64_Sym *sym; + unsigned long *location; + unsigned long value; + + DEBUGP("Applying ADD relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + + /* First time we're called, we can fix up .TOC. */ + if (!me->arch.toc_fixed) { + sym = find_dot_toc(sechdrs, strtab, symindex); + /* It's theoretically possible that a module doesn't want a + * .TOC. so don't fail it just for that. */ + if (sym) + sym->st_value = my_r2(sechdrs, me); + me->arch.toc_fixed = true; + } + + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rela); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rela[i].r_offset; + /* This is the symbol it is referring to */ + sym = (Elf64_Sym *)sechdrs[symindex].sh_addr + + ELF64_R_SYM(rela[i].r_info); + + DEBUGP("RELOC at %p: %li-type as %s (%lu) + %li\n", + location, (long)ELF64_R_TYPE(rela[i].r_info), + strtab + sym->st_name, (unsigned long)sym->st_value, + (long)rela[i].r_addend); + + /* `Everything is relative'. 
*/ + value = sym->st_value + rela[i].r_addend; + + switch (ELF64_R_TYPE(rela[i].r_info)) { + case R_PPC64_ADDR32: + /* Simply set it */ + *(u32 *)location = value; + break; + + case R_PPC64_ADDR64: + /* Simply set it */ + *(unsigned long *)location = value; + break; + + case R_PPC64_TOC: + *(unsigned long *)location = my_r2(sechdrs, me); + break; + + case R_PPC64_TOC16: + /* Subtract TOC pointer */ + value -= my_r2(sechdrs, me); + if (value + 0x8000 > 0xffff) { + printk("%s: bad TOC16 relocation (%lu)\n", + me->name, value); + return -ENOEXEC; + } + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xffff) + | (value & 0xffff); + break; + + case R_PPC64_TOC16_LO: + /* Subtract TOC pointer */ + value -= my_r2(sechdrs, me); + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xffff) + | (value & 0xffff); + break; + + case R_PPC64_TOC16_DS: + /* Subtract TOC pointer */ + value -= my_r2(sechdrs, me); + if ((value & 3) != 0 || value + 0x8000 > 0xffff) { + printk("%s: bad TOC16_DS relocation (%lu)\n", + me->name, value); + return -ENOEXEC; + } + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xfffc) + | (value & 0xfffc); + break; + + case R_PPC64_TOC16_LO_DS: + /* Subtract TOC pointer */ + value -= my_r2(sechdrs, me); + if ((value & 3) != 0) { + printk("%s: bad TOC16_LO_DS relocation (%lu)\n", + me->name, value); + return -ENOEXEC; + } + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xfffc) + | (value & 0xfffc); + break; + + case R_PPC64_TOC16_HA: + /* Subtract TOC pointer */ + value -= my_r2(sechdrs, me); + value = ((value + 0x8000) >> 16); + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xffff) + | (value & 0xffff); + break; + + case R_PPC_REL24: + /* FIXME: Handle weak symbols here --RR */ + if (sym->st_shndx == SHN_UNDEF) { + /* External: go via stub */ + value = stub_for_addr(sechdrs, value, me); + if (!value) + return -ENOENT; + if (!restore_r2((u32 *)location + 1, me)) + return -ENOEXEC; + } else + value += 
local_entry_offset(sym); + + /* Convert value to relative */ + value -= (unsigned long)location; + if (value + 0x2000000 > 0x3ffffff || (value & 3) != 0){ + printk("%s: REL24 %li out of range!\n", + me->name, (long int)value); + return -ENOEXEC; + } + + /* Only replace bits 2 through 26 */ + *(uint32_t *)location + = (*(uint32_t *)location & ~0x03fffffc) + | (value & 0x03fffffc); + break; + + case R_PPC64_REL64: + /* 64 bits relative (used by features fixups) */ + *location = value - (unsigned long)location; + break; + + case R_PPC64_TOCSAVE: + /* + * Marker reloc indicates we don't have to save r2. + * That would only save us one instruction, so ignore + * it. + */ + break; + + case R_PPC64_REL16_HA: + /* Subtract location pointer */ + value -= (unsigned long)location; + value = ((value + 0x8000) >> 16); + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xffff) + | (value & 0xffff); + break; + + case R_PPC64_REL16_LO: + /* Subtract location pointer */ + value -= (unsigned long)location; + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xffff) + | (value & 0xffff); + break; + + default: + printk("%s: Unknown ADD relocation: %lu\n", + me->name, + (unsigned long)ELF64_R_TYPE(rela[i].r_info)); + return -ENOEXEC; + } + } + +#ifdef CONFIG_DYNAMIC_FTRACE + me->arch.toc = my_r2(sechdrs, me); + me->arch.tramp = stub_for_addr(sechdrs, + (unsigned long)ftrace_caller, + me); +#endif + + return 0; +} diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c new file mode 100644 index 00000000000..8bbc12d20f5 --- /dev/null +++ b/arch/powerpc/kernel/msi.c @@ -0,0 +1,43 @@ +/* + * Copyright 2006-2007, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/kernel.h> +#include <linux/msi.h> +#include <linux/pci.h> + +#include <asm/machdep.h> + +int arch_msi_check_device(struct pci_dev* dev, int nvec, int type) +{ + if (!ppc_md.setup_msi_irqs || !ppc_md.teardown_msi_irqs) { + pr_debug("msi: Platform doesn't provide MSI callbacks.\n"); + return -ENOSYS; + } + + /* PowerPC doesn't support multiple MSI yet */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + + if (ppc_md.msi_check_device) { + pr_debug("msi: Using platform check routine.\n"); + return ppc_md.msi_check_device(dev, nvec, type); + } + + return 0; +} + +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + return ppc_md.setup_msi_irqs(dev, nvec, type); +} + +void arch_teardown_msi_irqs(struct pci_dev *dev) +{ + ppc_md.teardown_msi_irqs(dev); +} diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c new file mode 100644 index 00000000000..28b898e6818 --- /dev/null +++ b/arch/powerpc/kernel/nvram_64.c @@ -0,0 +1,577 @@ +/* + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * /dev/nvram driver for PPC64 + * + * This perhaps should live in drivers/char + * + * TODO: Split the /dev/nvram part (that one can use + * drivers/char/generic_nvram.c) from the arch & partition + * parsing code. 
+ */ + +#include <linux/module.h> + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/fcntl.h> +#include <linux/nvram.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <asm/uaccess.h> +#include <asm/nvram.h> +#include <asm/rtas.h> +#include <asm/prom.h> +#include <asm/machdep.h> + +#undef DEBUG_NVRAM + +#define NVRAM_HEADER_LEN sizeof(struct nvram_header) +#define NVRAM_BLOCK_LEN NVRAM_HEADER_LEN + +/* If change this size, then change the size of NVNAME_LEN */ +struct nvram_header { + unsigned char signature; + unsigned char checksum; + unsigned short length; + /* Terminating null required only for names < 12 chars. */ + char name[12]; +}; + +struct nvram_partition { + struct list_head partition; + struct nvram_header header; + unsigned int index; +}; + +static LIST_HEAD(nvram_partitions); + +static loff_t dev_nvram_llseek(struct file *file, loff_t offset, int origin) +{ + int size; + + if (ppc_md.nvram_size == NULL) + return -ENODEV; + size = ppc_md.nvram_size(); + + switch (origin) { + case 1: + offset += file->f_pos; + break; + case 2: + offset += size; + break; + } + if (offset < 0) + return -EINVAL; + file->f_pos = offset; + return file->f_pos; +} + + +static ssize_t dev_nvram_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + ssize_t ret; + char *tmp = NULL; + ssize_t size; + + if (!ppc_md.nvram_size) { + ret = -ENODEV; + goto out; + } + + size = ppc_md.nvram_size(); + if (size < 0) { + ret = size; + goto out; + } + + if (*ppos >= size) { + ret = 0; + goto out; + } + + count = min_t(size_t, count, size - *ppos); + count = min(count, PAGE_SIZE); + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto out; + } + + ret = ppc_md.nvram_read(tmp, count, ppos); + if (ret <= 0) + goto out; + + if (copy_to_user(buf, tmp, ret)) + ret = -EFAULT; + +out: + kfree(tmp); + return ret; + +} + +static ssize_t 
dev_nvram_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + ssize_t ret; + char *tmp = NULL; + ssize_t size; + + ret = -ENODEV; + if (!ppc_md.nvram_size) + goto out; + + ret = 0; + size = ppc_md.nvram_size(); + if (*ppos >= size || size < 0) + goto out; + + count = min_t(size_t, count, size - *ppos); + count = min(count, PAGE_SIZE); + + ret = -ENOMEM; + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) + goto out; + + ret = -EFAULT; + if (copy_from_user(tmp, buf, count)) + goto out; + + ret = ppc_md.nvram_write(tmp, count, ppos); + +out: + kfree(tmp); + return ret; + +} + +static long dev_nvram_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch(cmd) { +#ifdef CONFIG_PPC_PMAC + case OBSOLETE_PMAC_NVRAM_GET_OFFSET: + printk(KERN_WARNING "nvram: Using obsolete PMAC_NVRAM_GET_OFFSET ioctl\n"); + case IOC_NVRAM_GET_OFFSET: { + int part, offset; + + if (!machine_is(powermac)) + return -EINVAL; + if (copy_from_user(&part, (void __user*)arg, sizeof(part)) != 0) + return -EFAULT; + if (part < pmac_nvram_OF || part > pmac_nvram_NR) + return -EINVAL; + offset = pmac_get_partition(part); + if (offset < 0) + return offset; + if (copy_to_user((void __user*)arg, &offset, sizeof(offset)) != 0) + return -EFAULT; + return 0; + } +#endif /* CONFIG_PPC_PMAC */ + default: + return -EINVAL; + } +} + +const struct file_operations nvram_fops = { + .owner = THIS_MODULE, + .llseek = dev_nvram_llseek, + .read = dev_nvram_read, + .write = dev_nvram_write, + .unlocked_ioctl = dev_nvram_ioctl, +}; + +static struct miscdevice nvram_dev = { + NVRAM_MINOR, + "nvram", + &nvram_fops +}; + + +#ifdef DEBUG_NVRAM +static void __init nvram_print_partitions(char * label) +{ + struct nvram_partition * tmp_part; + + printk(KERN_WARNING "--------%s---------\n", label); + printk(KERN_WARNING "indx\t\tsig\tchks\tlen\tname\n"); + list_for_each_entry(tmp_part, &nvram_partitions, partition) { + printk(KERN_WARNING "%4d \t%02x\t%02x\t%d\t%12.12s\n", + 
tmp_part->index, tmp_part->header.signature, + tmp_part->header.checksum, tmp_part->header.length, + tmp_part->header.name); + } +} +#endif + + +static int __init nvram_write_header(struct nvram_partition * part) +{ + loff_t tmp_index; + int rc; + struct nvram_header phead; + + memcpy(&phead, &part->header, NVRAM_HEADER_LEN); + phead.length = cpu_to_be16(phead.length); + + tmp_index = part->index; + rc = ppc_md.nvram_write((char *)&phead, NVRAM_HEADER_LEN, &tmp_index); + + return rc; +} + + +static unsigned char __init nvram_checksum(struct nvram_header *p) +{ + unsigned int c_sum, c_sum2; + unsigned short *sp = (unsigned short *)p->name; /* assume 6 shorts */ + c_sum = p->signature + p->length + sp[0] + sp[1] + sp[2] + sp[3] + sp[4] + sp[5]; + + /* The sum may have spilled into the 3rd byte. Fold it back. */ + c_sum = ((c_sum & 0xffff) + (c_sum >> 16)) & 0xffff; + /* The sum cannot exceed 2 bytes. Fold it into a checksum */ + c_sum2 = (c_sum >> 8) + (c_sum << 8); + c_sum = ((c_sum + c_sum2) >> 8) & 0xff; + return c_sum; +} + +/* + * Per the criteria passed via nvram_remove_partition(), should this + * partition be removed? 1=remove, 0=keep + */ +static int nvram_can_remove_partition(struct nvram_partition *part, + const char *name, int sig, const char *exceptions[]) +{ + if (part->header.signature != sig) + return 0; + if (name) { + if (strncmp(name, part->header.name, 12)) + return 0; + } else if (exceptions) { + const char **except; + for (except = exceptions; *except; except++) { + if (!strncmp(*except, part->header.name, 12)) + return 0; + } + } + return 1; +} + +/** + * nvram_remove_partition - Remove one or more partitions in nvram + * @name: name of the partition to remove, or NULL for a + * signature only match + * @sig: signature of the partition(s) to remove + * @exceptions: When removing all partitions with a matching signature, + * leave these alone. 
+ */
+
+int __init nvram_remove_partition(const char *name, int sig,
+						const char *exceptions[])
+{
+	struct nvram_partition *part, *prev, *tmp;
+	int rc;
+
+	/* Pass 1: turn every matching partition into a free partition */
+	list_for_each_entry(part, &nvram_partitions, partition) {
+		if (!nvram_can_remove_partition(part, name, sig, exceptions))
+			continue;
+
+		/* Make partition a free partition */
+		part->header.signature = NVRAM_SIG_FREE;
+		strncpy(part->header.name, "wwwwwwwwwwww", 12);
+		part->header.checksum = nvram_checksum(&part->header);
+		rc = nvram_write_header(part);
+		if (rc <= 0) {
+			printk(KERN_ERR "nvram_remove_partition: nvram_write failed (%d)\n", rc);
+			return rc;
+		}
+	}
+
+	/* Merge contiguous ones */
+	prev = NULL;
+	list_for_each_entry_safe(part, tmp, &nvram_partitions, partition) {
+		if (part->header.signature != NVRAM_SIG_FREE) {
+			/* A used partition ends the current run of free ones */
+			prev = NULL;
+			continue;
+		}
+		if (prev) {
+			/*
+			 * Fold this partition into the preceding free one.
+			 * It is prev's header that changed, so prev's header
+			 * must be re-checksummed and written back; part is
+			 * simply dropped from the list.
+			 */
+			prev->header.length += part->header.length;
+			prev->header.checksum = nvram_checksum(&prev->header);
+			rc = nvram_write_header(prev);
+			if (rc <= 0) {
+				printk(KERN_ERR "nvram_remove_partition: nvram_write failed (%d)\n", rc);
+				return rc;
+			}
+			list_del(&part->partition);
+			kfree(part);
+		} else
+			prev = part;
+	}
+
+	return 0;
+}
+
+/**
+ * nvram_create_partition - Create a partition in nvram
+ * @name: name of the partition to create
+ * @sig: signature of the partition to create
+ * @req_size: size of data to allocate in bytes
+ * @min_size: minimum acceptable size (0 means req_size)
+ *
+ * Returns a negative error code or a positive nvram index
+ * of the beginning of the data area of the newly created
+ * partition. If you provided a min_size smaller than req_size
+ * you need to query for the actual size yourself after the
+ * call using nvram_get_partition_size().
+ */
+loff_t __init nvram_create_partition(const char *name, int sig,
+				     int req_size, int min_size)
+{
+	struct nvram_partition *part;
+	struct nvram_partition *new_part;
+	struct nvram_partition *free_part = NULL;
+	static char nv_init_vals[16];
+	loff_t tmp_index;
+	long size = 0;
+	int rc;
+
+	/* Convert sizes from bytes to blocks */
+	req_size = _ALIGN_UP(req_size, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN;
+	min_size = _ALIGN_UP(min_size, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN;
+
+	/* If no minimum size specified, make it the same as the
+	 * requested size
+	 */
+	if (min_size == 0)
+		min_size = req_size;
+	if (min_size > req_size)
+		return -EINVAL;
+
+	/* Now add one block to each for the header */
+	req_size += 1;
+	min_size += 1;
+
+	/* Find a free partition that will give us the maximum needed size
+	   If can't find one that will give us the minimum size needed */
+	list_for_each_entry(part, &nvram_partitions, partition) {
+		if (part->header.signature != NVRAM_SIG_FREE)
+			continue;
+
+		/* First free partition big enough for the full request wins */
+		if (part->header.length >= req_size) {
+			size = req_size;
+			free_part = part;
+			break;
+		}
+		/* Otherwise remember the largest one that meets the minimum */
+		if (part->header.length > size &&
+		    part->header.length >= min_size) {
+			size = part->header.length;
+			free_part = part;
+		}
+	}
+	if (!size)
+		return -ENOSPC;
+
+	/* Create our OS partition */
+	new_part = kmalloc(sizeof(*new_part), GFP_KERNEL);
+	if (!new_part) {
+		pr_err("nvram_create_os_partition: kmalloc failed\n");
+		return -ENOMEM;
+	}
+
+	new_part->index = free_part->index;
+	new_part->header.signature = sig;
+	new_part->header.length = size;
+	strncpy(new_part->header.name, name, 12);
+	new_part->header.checksum = nvram_checksum(&new_part->header);
+
+	rc = nvram_write_header(new_part);
+	if (rc <= 0) {
+		pr_err("nvram_create_os_partition: nvram_write_header "
+		       "failed (%d)\n", rc);
+		/* Not linked into nvram_partitions yet: free it here */
+		kfree(new_part);
+		return rc;
+	}
+	list_add_tail(&new_part->partition, &free_part->partition);
+
+	/* Adjust or remove the partition we stole the space from */
+	if (free_part->header.length > size) {
+		free_part->index +=
size * NVRAM_BLOCK_LEN; + free_part->header.length -= size; + free_part->header.checksum = nvram_checksum(&free_part->header); + rc = nvram_write_header(free_part); + if (rc <= 0) { + pr_err("nvram_create_os_partition: nvram_write_header " + "failed (%d)\n", rc); + return rc; + } + } else { + list_del(&free_part->partition); + kfree(free_part); + } + + /* Clear the new partition */ + for (tmp_index = new_part->index + NVRAM_HEADER_LEN; + tmp_index < ((size - 1) * NVRAM_BLOCK_LEN); + tmp_index += NVRAM_BLOCK_LEN) { + rc = ppc_md.nvram_write(nv_init_vals, NVRAM_BLOCK_LEN, &tmp_index); + if (rc <= 0) { + pr_err("nvram_create_partition: nvram_write failed (%d)\n", rc); + return rc; + } + } + + return new_part->index + NVRAM_HEADER_LEN; +} + +/** + * nvram_get_partition_size - Get the data size of an nvram partition + * @data_index: This is the offset of the start of the data of + * the partition. The same value that is returned by + * nvram_create_partition(). + */ +int nvram_get_partition_size(loff_t data_index) +{ + struct nvram_partition *part; + + list_for_each_entry(part, &nvram_partitions, partition) { + if (part->index + NVRAM_HEADER_LEN == data_index) + return (part->header.length - 1) * NVRAM_BLOCK_LEN; + } + return -1; +} + + +/** + * nvram_find_partition - Find an nvram partition by signature and name + * @name: Name of the partition or NULL for any name + * @sig: Signature to test against + * @out_size: if non-NULL, returns the size of the data part of the partition + */ +loff_t nvram_find_partition(const char *name, int sig, int *out_size) +{ + struct nvram_partition *p; + + list_for_each_entry(p, &nvram_partitions, partition) { + if (p->header.signature == sig && + (!name || !strncmp(p->header.name, name, 12))) { + if (out_size) + *out_size = (p->header.length - 1) * + NVRAM_BLOCK_LEN; + return p->index + NVRAM_HEADER_LEN; + } + } + return 0; +} + +int __init nvram_scan_partitions(void) +{ + loff_t cur_index = 0; + struct nvram_header phead; + struct 
nvram_partition * tmp_part; + unsigned char c_sum; + char * header; + int total_size; + int err; + + if (ppc_md.nvram_size == NULL || ppc_md.nvram_size() <= 0) + return -ENODEV; + total_size = ppc_md.nvram_size(); + + header = kmalloc(NVRAM_HEADER_LEN, GFP_KERNEL); + if (!header) { + printk(KERN_ERR "nvram_scan_partitions: Failed kmalloc\n"); + return -ENOMEM; + } + + while (cur_index < total_size) { + + err = ppc_md.nvram_read(header, NVRAM_HEADER_LEN, &cur_index); + if (err != NVRAM_HEADER_LEN) { + printk(KERN_ERR "nvram_scan_partitions: Error parsing " + "nvram partitions\n"); + goto out; + } + + cur_index -= NVRAM_HEADER_LEN; /* nvram_read will advance us */ + + memcpy(&phead, header, NVRAM_HEADER_LEN); + + phead.length = be16_to_cpu(phead.length); + + err = 0; + c_sum = nvram_checksum(&phead); + if (c_sum != phead.checksum) { + printk(KERN_WARNING "WARNING: nvram partition checksum" + " was %02x, should be %02x!\n", + phead.checksum, c_sum); + printk(KERN_WARNING "Terminating nvram partition scan\n"); + goto out; + } + if (!phead.length) { + printk(KERN_WARNING "WARNING: nvram corruption " + "detected: 0-length partition\n"); + goto out; + } + tmp_part = kmalloc(sizeof(struct nvram_partition), GFP_KERNEL); + err = -ENOMEM; + if (!tmp_part) { + printk(KERN_ERR "nvram_scan_partitions: kmalloc failed\n"); + goto out; + } + + memcpy(&tmp_part->header, &phead, NVRAM_HEADER_LEN); + tmp_part->index = cur_index; + list_add_tail(&tmp_part->partition, &nvram_partitions); + + cur_index += phead.length * NVRAM_BLOCK_LEN; + } + err = 0; + +#ifdef DEBUG_NVRAM + nvram_print_partitions("NVRAM Partitions"); +#endif + + out: + kfree(header); + return err; +} + +static int __init nvram_init(void) +{ + int rc; + + BUILD_BUG_ON(NVRAM_BLOCK_LEN != 16); + + if (ppc_md.nvram_size == NULL || ppc_md.nvram_size() <= 0) + return -ENODEV; + + rc = misc_register(&nvram_dev); + if (rc != 0) { + printk(KERN_ERR "nvram_init: failed to register device\n"); + return rc; + } + + return rc; +} + 
+void __exit nvram_cleanup(void) +{ + misc_deregister( &nvram_dev ); +} + +module_init(nvram_init); +module_exit(nvram_cleanup); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/kernel/of_device.c b/arch/powerpc/kernel/of_device.c deleted file mode 100644 index 7065e40e2f4..00000000000 --- a/arch/powerpc/kernel/of_device.c +++ /dev/null @@ -1,276 +0,0 @@ -#include <linux/config.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mod_devicetable.h> -#include <linux/slab.h> - -#include <asm/errno.h> -#include <asm/of_device.h> - -/** - * of_match_device - Tell if an of_device structure has a matching - * of_match structure - * @ids: array of of device match structures to search in - * @dev: the of device structure to match against - * - * Used by a driver to check whether an of_device present in the - * system is in its list of supported devices. - */ -const struct of_device_id *of_match_device(const struct of_device_id *matches, - const struct of_device *dev) -{ - if (!dev->node) - return NULL; - while (matches->name[0] || matches->type[0] || matches->compatible[0]) { - int match = 1; - if (matches->name[0]) - match &= dev->node->name - && !strcmp(matches->name, dev->node->name); - if (matches->type[0]) - match &= dev->node->type - && !strcmp(matches->type, dev->node->type); - if (matches->compatible[0]) - match &= device_is_compatible(dev->node, - matches->compatible); - if (match) - return matches; - matches++; - } - return NULL; -} - -static int of_platform_bus_match(struct device *dev, struct device_driver *drv) -{ - struct of_device * of_dev = to_of_device(dev); - struct of_platform_driver * of_drv = to_of_platform_driver(drv); - const struct of_device_id * matches = of_drv->match_table; - - if (!matches) - return 0; - - return of_match_device(matches, of_dev) != NULL; -} - -struct of_device *of_dev_get(struct of_device *dev) -{ - struct device *tmp; - - if (!dev) - return NULL; - tmp = 
get_device(&dev->dev); - if (tmp) - return to_of_device(tmp); - else - return NULL; -} - -void of_dev_put(struct of_device *dev) -{ - if (dev) - put_device(&dev->dev); -} - - -static int of_device_probe(struct device *dev) -{ - int error = -ENODEV; - struct of_platform_driver *drv; - struct of_device *of_dev; - const struct of_device_id *match; - - drv = to_of_platform_driver(dev->driver); - of_dev = to_of_device(dev); - - if (!drv->probe) - return error; - - of_dev_get(of_dev); - - match = of_match_device(drv->match_table, of_dev); - if (match) - error = drv->probe(of_dev, match); - if (error) - of_dev_put(of_dev); - - return error; -} - -static int of_device_remove(struct device *dev) -{ - struct of_device * of_dev = to_of_device(dev); - struct of_platform_driver * drv = to_of_platform_driver(dev->driver); - - if (dev->driver && drv->remove) - drv->remove(of_dev); - return 0; -} - -static int of_device_suspend(struct device *dev, pm_message_t state) -{ - struct of_device * of_dev = to_of_device(dev); - struct of_platform_driver * drv = to_of_platform_driver(dev->driver); - int error = 0; - - if (dev->driver && drv->suspend) - error = drv->suspend(of_dev, state); - return error; -} - -static int of_device_resume(struct device * dev) -{ - struct of_device * of_dev = to_of_device(dev); - struct of_platform_driver * drv = to_of_platform_driver(dev->driver); - int error = 0; - - if (dev->driver && drv->resume) - error = drv->resume(of_dev); - return error; -} - -struct bus_type of_platform_bus_type = { - .name = "of_platform", - .match = of_platform_bus_match, - .suspend = of_device_suspend, - .resume = of_device_resume, -}; - -static int __init of_bus_driver_init(void) -{ - return bus_register(&of_platform_bus_type); -} - -postcore_initcall(of_bus_driver_init); - -int of_register_driver(struct of_platform_driver *drv) -{ - int count = 0; - - /* initialize common driver fields */ - drv->driver.name = drv->name; - drv->driver.bus = &of_platform_bus_type; - 
drv->driver.probe = of_device_probe; - drv->driver.remove = of_device_remove; - - /* register with core */ - count = driver_register(&drv->driver); - return count ? count : 1; -} - -void of_unregister_driver(struct of_platform_driver *drv) -{ - driver_unregister(&drv->driver); -} - - -static ssize_t dev_show_devspec(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct of_device *ofdev; - - ofdev = to_of_device(dev); - return sprintf(buf, "%s", ofdev->node->full_name); -} - -static DEVICE_ATTR(devspec, S_IRUGO, dev_show_devspec, NULL); - -/** - * of_release_dev - free an of device structure when all users of it are finished. - * @dev: device that's been disconnected - * - * Will be called only by the device core when all users of this of device are - * done. - */ -void of_release_dev(struct device *dev) -{ - struct of_device *ofdev; - - ofdev = to_of_device(dev); - of_node_put(ofdev->node); - kfree(ofdev); -} - -int of_device_register(struct of_device *ofdev) -{ - int rc; - struct of_device **odprop; - - BUG_ON(ofdev->node == NULL); - - odprop = (struct of_device **)get_property(ofdev->node, "linux,device", NULL); - if (!odprop) { - struct property *new_prop; - - new_prop = kmalloc(sizeof(struct property) + sizeof(struct of_device *), - GFP_KERNEL); - if (new_prop == NULL) - return -ENOMEM; - new_prop->name = "linux,device"; - new_prop->length = sizeof(sizeof(struct of_device *)); - new_prop->value = (unsigned char *)&new_prop[1]; - odprop = (struct of_device **)new_prop->value; - *odprop = NULL; - prom_add_property(ofdev->node, new_prop); - } - *odprop = ofdev; - - rc = device_register(&ofdev->dev); - if (rc) - return rc; - - device_create_file(&ofdev->dev, &dev_attr_devspec); - - return 0; -} - -void of_device_unregister(struct of_device *ofdev) -{ - struct of_device **odprop; - - device_remove_file(&ofdev->dev, &dev_attr_devspec); - - odprop = (struct of_device **)get_property(ofdev->node, "linux,device", NULL); - if (odprop) - *odprop = 
NULL; - - device_unregister(&ofdev->dev); -} - -struct of_device* of_platform_device_create(struct device_node *np, - const char *bus_id, - struct device *parent) -{ - struct of_device *dev; - - dev = kmalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return NULL; - memset(dev, 0, sizeof(*dev)); - - dev->node = of_node_get(np); - dev->dma_mask = 0xffffffffUL; - dev->dev.dma_mask = &dev->dma_mask; - dev->dev.parent = parent; - dev->dev.bus = &of_platform_bus_type; - dev->dev.release = of_release_dev; - - strlcpy(dev->dev.bus_id, bus_id, BUS_ID_SIZE); - - if (of_device_register(dev) != 0) { - kfree(dev); - return NULL; - } - - return dev; -} - -EXPORT_SYMBOL(of_match_device); -EXPORT_SYMBOL(of_platform_bus_type); -EXPORT_SYMBOL(of_register_driver); -EXPORT_SYMBOL(of_unregister_driver); -EXPORT_SYMBOL(of_device_register); -EXPORT_SYMBOL(of_device_unregister); -EXPORT_SYMBOL(of_dev_get); -EXPORT_SYMBOL(of_dev_put); -EXPORT_SYMBOL(of_platform_device_create); -EXPORT_SYMBOL(of_release_dev); diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c new file mode 100644 index 00000000000..a7b74307672 --- /dev/null +++ b/arch/powerpc/kernel/of_platform.c @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corp. + * <benh@kernel.crashing.org> + * and Arnd Bergmann, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#undef DEBUG + +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/mod_devicetable.h> +#include <linux/pci.h> +#include <linux/of.h> +#include <linux/of_device.h> +#include <linux/of_platform.h> +#include <linux/atomic.h> + +#include <asm/errno.h> +#include <asm/topology.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> +#include <asm/eeh.h> + +#ifdef CONFIG_PPC_OF_PLATFORM_PCI + +/* The probing of PCI controllers from of_platform is currently + * 64 bits only, mostly due to gratuitous differences between + * the 32 and 64 bits PCI code on PowerPC and the 32 bits one + * lacking some bits needed here. + */ + +static int of_pci_phb_probe(struct platform_device *dev) +{ + struct pci_controller *phb; + + /* Check if we can do that ... */ + if (ppc_md.pci_setup_phb == NULL) + return -ENODEV; + + pr_info("Setting up PCI bus %s\n", dev->dev.of_node->full_name); + + /* Alloc and setup PHB data structure */ + phb = pcibios_alloc_controller(dev->dev.of_node); + if (!phb) + return -ENODEV; + + /* Setup parent in sysfs */ + phb->parent = &dev->dev; + + /* Setup the PHB using arch provided callback */ + if (ppc_md.pci_setup_phb(phb)) { + pcibios_free_controller(phb); + return -ENODEV; + } + + /* Process "ranges" property */ + pci_process_bridge_OF_ranges(phb, dev->dev.of_node, 0); + + /* Init pci_dn data structures */ + pci_devs_phb_init_dynamic(phb); + + /* Create EEH devices for the PHB */ + eeh_dev_phb_init_dynamic(phb); + + /* Register devices with EEH */ + if (dev->dev.of_node->child) + eeh_add_device_tree_early(dev->dev.of_node); + + /* Scan the bus */ + pcibios_scan_phb(phb); + if (phb->bus == NULL) + return -ENXIO; + + /* Claim resources. This might need some rework as well depending + * whether we are doing probe-only or not, like assigning unassigned + * resources etc... 
+ */ + pcibios_claim_one_bus(phb->bus); + + /* Finish EEH setup */ + eeh_add_device_tree_late(phb->bus); + + /* Add probed PCI devices to the device model */ + pci_bus_add_devices(phb->bus); + + /* sysfs files should only be added after devices are added */ + eeh_add_sysfs_files(phb->bus); + + return 0; +} + +static struct of_device_id of_pci_phb_ids[] = { + { .type = "pci", }, + { .type = "pcix", }, + { .type = "pcie", }, + { .type = "pciex", }, + { .type = "ht", }, + {} +}; + +static struct platform_driver of_pci_phb_driver = { + .probe = of_pci_phb_probe, + .driver = { + .name = "of-pci", + .owner = THIS_MODULE, + .of_match_table = of_pci_phb_ids, + }, +}; + +static __init int of_pci_phb_init(void) +{ + return platform_driver_register(&of_pci_phb_driver); +} + +device_initcall(of_pci_phb_init); + +#endif /* CONFIG_PPC_OF_PLATFORM_PCI */ diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c new file mode 100644 index 00000000000..d6e195e8cd4 --- /dev/null +++ b/arch/powerpc/kernel/paca.c @@ -0,0 +1,242 @@ +/* + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/smp.h> +#include <linux/export.h> +#include <linux/memblock.h> + +#include <asm/lppaca.h> +#include <asm/paca.h> +#include <asm/sections.h> +#include <asm/pgtable.h> +#include <asm/kexec.h> + +/* This symbol is provided by the linker - let it fill in the paca + * field correctly */ +extern unsigned long __toc_start; + +#ifdef CONFIG_PPC_BOOK3S + +/* + * The structure which the hypervisor knows about - this structure + * should not cross a page boundary. The vpa_init/register_vpa call + * is now known to fail if the lppaca structure crosses a page + * boundary. The lppaca is also used on POWER5 pSeries boxes. 
+ * The lppaca is 640 bytes long, and cannot readily + * change since the hypervisor knows its layout, so a 1kB alignment + * will suffice to ensure that it doesn't cross a page boundary. + */ +struct lppaca lppaca[] = { + [0 ... (NR_LPPACAS-1)] = { + .desc = cpu_to_be32(0xd397d781), /* "LpPa" */ + .size = cpu_to_be16(sizeof(struct lppaca)), + .fpregs_in_use = 1, + .slb_count = cpu_to_be16(64), + .vmxregs_in_use = 0, + .page_ins = 0, + }, +}; + +static struct lppaca *extra_lppacas; +static long __initdata lppaca_size; + +static void __init allocate_lppacas(int nr_cpus, unsigned long limit) +{ + if (nr_cpus <= NR_LPPACAS) + return; + + lppaca_size = PAGE_ALIGN(sizeof(struct lppaca) * + (nr_cpus - NR_LPPACAS)); + extra_lppacas = __va(memblock_alloc_base(lppaca_size, + PAGE_SIZE, limit)); +} + +static struct lppaca * __init new_lppaca(int cpu) +{ + struct lppaca *lp; + + if (cpu < NR_LPPACAS) + return &lppaca[cpu]; + + lp = extra_lppacas + (cpu - NR_LPPACAS); + *lp = lppaca[0]; + + return lp; +} + +static void __init free_lppacas(void) +{ + long new_size = 0, nr; + + if (!lppaca_size) + return; + nr = num_possible_cpus() - NR_LPPACAS; + if (nr > 0) + new_size = PAGE_ALIGN(nr * sizeof(struct lppaca)); + if (new_size >= lppaca_size) + return; + + memblock_free(__pa(extra_lppacas) + new_size, lppaca_size - new_size); + lppaca_size = new_size; +} + +#else + +static inline void allocate_lppacas(int nr_cpus, unsigned long limit) { } +static inline void free_lppacas(void) { } + +#endif /* CONFIG_PPC_BOOK3S */ + +#ifdef CONFIG_PPC_STD_MMU_64 + +/* + * 3 persistent SLBs are registered here. The buffer will be zero + * initially, hence will all be invaild until we actually write them. + * + * If you make the number of persistent SLB entries dynamic, please also + * update PR KVM to flush and restore them accordingly. 
+ */ +static struct slb_shadow *slb_shadow; + +static void __init allocate_slb_shadows(int nr_cpus, int limit) +{ + int size = PAGE_ALIGN(sizeof(struct slb_shadow) * nr_cpus); + slb_shadow = __va(memblock_alloc_base(size, PAGE_SIZE, limit)); + memset(slb_shadow, 0, size); +} + +static struct slb_shadow * __init init_slb_shadow(int cpu) +{ + struct slb_shadow *s = &slb_shadow[cpu]; + + s->persistent = cpu_to_be32(SLB_NUM_BOLTED); + s->buffer_length = cpu_to_be32(sizeof(*s)); + + return s; +} + +#else /* CONFIG_PPC_STD_MMU_64 */ + +static void __init allocate_slb_shadows(int nr_cpus, int limit) { } + +#endif /* CONFIG_PPC_STD_MMU_64 */ + +/* The Paca is an array with one entry per processor. Each contains an + * lppaca, which contains the information shared between the + * hypervisor and Linux. + * On systems with hardware multi-threading, there are two threads + * per processor. The Paca array must contain an entry for each thread. + * The VPD Areas will give a max logical processors = 2 * max physical + * processors. The processor VPD array needs one entry per physical + * processor (not thread). + */ +struct paca_struct *paca; +EXPORT_SYMBOL(paca); + +void __init initialise_paca(struct paca_struct *new_paca, int cpu) +{ + /* The TOC register (GPR2) points 32kB into the TOC, so that 64kB + * of the TOC can be addressed using a single machine instruction. 
+ */ + unsigned long kernel_toc = (unsigned long)(&__toc_start) + 0x8000UL; + +#ifdef CONFIG_PPC_BOOK3S + new_paca->lppaca_ptr = new_lppaca(cpu); +#else + new_paca->kernel_pgd = swapper_pg_dir; +#endif + new_paca->lock_token = 0x8000; + new_paca->paca_index = cpu; + new_paca->kernel_toc = kernel_toc; + new_paca->kernelbase = (unsigned long) _stext; + /* Only set MSR:IR/DR when MMU is initialized */ + new_paca->kernel_msr = MSR_KERNEL & ~(MSR_IR | MSR_DR); + new_paca->hw_cpu_id = 0xffff; + new_paca->kexec_state = KEXEC_STATE_NONE; + new_paca->__current = &init_task; + new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL; +#ifdef CONFIG_PPC_STD_MMU_64 + new_paca->slb_shadow_ptr = init_slb_shadow(cpu); +#endif /* CONFIG_PPC_STD_MMU_64 */ + +#ifdef CONFIG_PPC_BOOK3E + /* For now -- if we have threads this will be adjusted later */ + new_paca->tcd_ptr = &new_paca->tcd; +#endif +} + +/* Put the paca pointer into r13 and SPRG_PACA */ +void setup_paca(struct paca_struct *new_paca) +{ + /* Setup r13 */ + local_paca = new_paca; + +#ifdef CONFIG_PPC_BOOK3E + /* On Book3E, initialize the TLB miss exception frames */ + mtspr(SPRN_SPRG_TLB_EXFRAME, local_paca->extlb); +#else + /* In HV mode, we setup both HPACA and PACA to avoid problems + * if we do a GET_PACA() before the feature fixups have been + * applied + */ + if (cpu_has_feature(CPU_FTR_HVMODE)) + mtspr(SPRN_SPRG_HPACA, local_paca); +#endif + mtspr(SPRN_SPRG_PACA, local_paca); + +} + +static int __initdata paca_size; + +void __init allocate_pacas(void) +{ + int cpu, limit; + + /* + * We can't take SLB misses on the paca, and we want to access them + * in real mode, so allocate them within the RMA and also within + * the first segment. 
+ */ + limit = min(0x10000000ULL, ppc64_rma_size); + + paca_size = PAGE_ALIGN(sizeof(struct paca_struct) * nr_cpu_ids); + + paca = __va(memblock_alloc_base(paca_size, PAGE_SIZE, limit)); + memset(paca, 0, paca_size); + + printk(KERN_DEBUG "Allocated %u bytes for %d pacas at %p\n", + paca_size, nr_cpu_ids, paca); + + allocate_lppacas(nr_cpu_ids, limit); + + allocate_slb_shadows(nr_cpu_ids, limit); + + /* Can't use for_each_*_cpu, as they aren't functional yet */ + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + initialise_paca(&paca[cpu], cpu); +} + +void __init free_unused_pacas(void) +{ + int new_size; + + new_size = PAGE_ALIGN(sizeof(struct paca_struct) * nr_cpu_ids); + + if (new_size >= paca_size) + return; + + memblock_free(__pa(paca) + new_size, paca_size - new_size); + + printk(KERN_DEBUG "Freed %u bytes for unused pacas\n", + paca_size - new_size); + + paca_size = new_size; + + free_lppacas(); +} diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c new file mode 100644 index 00000000000..b49c72fd7f1 --- /dev/null +++ b/arch/powerpc/kernel/pci-common.c @@ -0,0 +1,1681 @@ +/* + * Contains common pci routines for ALL ppc platform + * (based on pci_32.c and pci_64.c) + * + * Port for PPC64 David Engebretsen, IBM Corp. + * Contains common pci routines for ppc64 platform, pSeries and iSeries brands. + * + * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM + * Rework, based on alpha PCI code. + * + * Common pmac/prep/chrp pci routines. -- Cort + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/of_address.h> +#include <linux/of_pci.h> +#include <linux/mm.h> +#include <linux/list.h> +#include <linux/syscalls.h> +#include <linux/irq.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> +#include <linux/vgaarb.h> + +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/byteorder.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> +#include <asm/eeh.h> + +static DEFINE_SPINLOCK(hose_spinlock); +LIST_HEAD(hose_list); + +/* XXX kill that some day ... */ +static int global_phb_number; /* Global phb counter */ + +/* ISA Memory physical address */ +resource_size_t isa_mem_base; + + +static struct dma_map_ops *pci_dma_ops = &dma_direct_ops; + +void set_pci_dma_ops(struct dma_map_ops *dma_ops) +{ + pci_dma_ops = dma_ops; +} + +struct dma_map_ops *get_pci_dma_ops(void) +{ + return pci_dma_ops; +} +EXPORT_SYMBOL(get_pci_dma_ops); + +struct pci_controller *pcibios_alloc_controller(struct device_node *dev) +{ + struct pci_controller *phb; + + phb = zalloc_maybe_bootmem(sizeof(struct pci_controller), GFP_KERNEL); + if (phb == NULL) + return NULL; + spin_lock(&hose_spinlock); + phb->global_number = global_phb_number++; + list_add_tail(&phb->list_node, &hose_list); + spin_unlock(&hose_spinlock); + phb->dn = dev; + phb->is_dynamic = mem_init_done; +#ifdef CONFIG_PPC64 + if (dev) { + int nid = of_node_to_nid(dev); + + if (nid < 0 || !node_online(nid)) + nid = -1; + + PHB_SET_NODE(phb, nid); + } +#endif + return phb; +} + +void pcibios_free_controller(struct pci_controller *phb) +{ + spin_lock(&hose_spinlock); + list_del(&phb->list_node); + spin_unlock(&hose_spinlock); + + if (phb->is_dynamic) + kfree(phb); +} + +/* + * The function is used to return the minimal alignment + * for memory or I/O windows of 
the associated P2P bridge. + * By default, 4KiB alignment for I/O windows and 1MiB for + * memory windows. + */ +resource_size_t pcibios_window_alignment(struct pci_bus *bus, + unsigned long type) +{ + if (ppc_md.pcibios_window_alignment) + return ppc_md.pcibios_window_alignment(bus, type); + + /* + * PCI core will figure out the default + * alignment: 4KiB for I/O and 1MiB for + * memory window. + */ + return 1; +} + +void pcibios_reset_secondary_bus(struct pci_dev *dev) +{ + u16 ctrl; + + if (ppc_md.pcibios_reset_secondary_bus) { + ppc_md.pcibios_reset_secondary_bus(dev); + return; + } + + pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl); + ctrl |= PCI_BRIDGE_CTL_BUS_RESET; + pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl); + msleep(2); + + ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; + pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl); + ssleep(1); +} + +static resource_size_t pcibios_io_size(const struct pci_controller *hose) +{ +#ifdef CONFIG_PPC64 + return hose->pci_io_size; +#else + return resource_size(&hose->io_resource); +#endif +} + +int pcibios_vaddr_is_ioport(void __iomem *address) +{ + int ret = 0; + struct pci_controller *hose; + resource_size_t size; + + spin_lock(&hose_spinlock); + list_for_each_entry(hose, &hose_list, list_node) { + size = pcibios_io_size(hose); + if (address >= hose->io_base_virt && + address < (hose->io_base_virt + size)) { + ret = 1; + break; + } + } + spin_unlock(&hose_spinlock); + return ret; +} + +unsigned long pci_address_to_pio(phys_addr_t address) +{ + struct pci_controller *hose; + resource_size_t size; + unsigned long ret = ~0; + + spin_lock(&hose_spinlock); + list_for_each_entry(hose, &hose_list, list_node) { + size = pcibios_io_size(hose); + if (address >= hose->io_base_phys && + address < (hose->io_base_phys + size)) { + unsigned long base = + (unsigned long)hose->io_base_virt - _IO_BASE; + ret = base + (address - hose->io_base_phys); + break; + } + } + spin_unlock(&hose_spinlock); + + return ret; +} 
+EXPORT_SYMBOL_GPL(pci_address_to_pio); + +/* + * Return the domain number for this bus. + */ +int pci_domain_nr(struct pci_bus *bus) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + + return hose->global_number; +} +EXPORT_SYMBOL(pci_domain_nr); + +/* This routine is meant to be used early during boot, when the + * PCI bus numbers have not yet been assigned, and you need to + * issue PCI config cycles to an OF device. + * It could also be used to "fix" RTAS config cycles if you want + * to set pci_assign_all_buses to 1 and still use RTAS for PCI + * config cycles. + */ +struct pci_controller* pci_find_hose_for_OF_device(struct device_node* node) +{ + while(node) { + struct pci_controller *hose, *tmp; + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + if (hose->dn == node) + return hose; + node = node->parent; + } + return NULL; +} + +/* + * Reads the interrupt pin to determine if interrupt is use by card. + * If the interrupt is used, then gets the interrupt line from the + * openfirmware and sets it in the pci_dev and pci_config line. + */ +static int pci_read_irq_line(struct pci_dev *pci_dev) +{ + struct of_phandle_args oirq; + unsigned int virq; + + pr_debug("PCI: Try to map irq for %s...\n", pci_name(pci_dev)); + +#ifdef DEBUG + memset(&oirq, 0xff, sizeof(oirq)); +#endif + /* Try to get a mapping from the device-tree */ + if (of_irq_parse_pci(pci_dev, &oirq)) { + u8 line, pin; + + /* If that fails, lets fallback to what is in the config + * space and map that through the default controller. We + * also set the type to level low since that's what PCI + * interrupts are. If your platform does differently, then + * either provide a proper interrupt tree or don't use this + * function. + */ + if (pci_read_config_byte(pci_dev, PCI_INTERRUPT_PIN, &pin)) + return -1; + if (pin == 0) + return -1; + if (pci_read_config_byte(pci_dev, PCI_INTERRUPT_LINE, &line) || + line == 0xff || line == 0) { + return -1; + } + pr_debug(" No map ! 
Using line %d (pin %d) from PCI config\n", + line, pin); + + virq = irq_create_mapping(NULL, line); + if (virq != NO_IRQ) + irq_set_irq_type(virq, IRQ_TYPE_LEVEL_LOW); + } else { + pr_debug(" Got one, spec %d cells (0x%08x 0x%08x...) on %s\n", + oirq.args_count, oirq.args[0], oirq.args[1], + of_node_full_name(oirq.np)); + + virq = irq_create_of_mapping(&oirq); + } + if(virq == NO_IRQ) { + pr_debug(" Failed to map !\n"); + return -1; + } + + pr_debug(" Mapped to linux irq %d\n", virq); + + pci_dev->irq = virq; + + return 0; +} + +/* + * Platform support for /proc/bus/pci/X/Y mmap()s, + * modelled on the sparc64 implementation by Dave Miller. + * -- paulus. + */ + +/* + * Adjust vm_pgoff of VMA such that it is the physical page offset + * corresponding to the 32-bit pci bus offset for DEV requested by the user. + * + * Basically, the user finds the base address for his device which he wishes + * to mmap. They read the 32-bit value from the config space base register, + * add whatever PAGE_SIZE multiple offset they wish, and feed this into the + * offset parameter of mmap on /proc/bus/pci/XXX for that device. + * + * Returns negative error code on failure, zero on success. + */ +static struct resource *__pci_mmap_make_offset(struct pci_dev *dev, + resource_size_t *offset, + enum pci_mmap_state mmap_state) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + unsigned long io_offset = 0; + int i, res_bit; + + if (hose == NULL) + return NULL; /* should never happen */ + + /* If memory, add on the PCI bridge address offset */ + if (mmap_state == pci_mmap_mem) { +#if 0 /* See comment in pci_resource_to_user() for why this is disabled */ + *offset += hose->pci_mem_offset; +#endif + res_bit = IORESOURCE_MEM; + } else { + io_offset = (unsigned long)hose->io_base_virt - _IO_BASE; + *offset += io_offset; + res_bit = IORESOURCE_IO; + } + + /* + * Check that the offset requested corresponds to one of the + * resources of the device. 
+ */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *rp = &dev->resource[i]; + int flags = rp->flags; + + /* treat ROM as memory (should be already) */ + if (i == PCI_ROM_RESOURCE) + flags |= IORESOURCE_MEM; + + /* Active and same type? */ + if ((flags & res_bit) == 0) + continue; + + /* In the range of this resource? */ + if (*offset < (rp->start & PAGE_MASK) || *offset > rp->end) + continue; + + /* found it! construct the final physical address */ + if (mmap_state == pci_mmap_io) + *offset += hose->io_base_phys - io_offset; + return rp; + } + + return NULL; +} + +/* + * Set vm_page_prot of VMA, as appropriate for this architecture, for a pci + * device mapping. + */ +static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp, + pgprot_t protection, + enum pci_mmap_state mmap_state, + int write_combine) +{ + + /* Write combine is always 0 on non-memory space mappings. On + * memory space, if the user didn't pass 1, we check for a + * "prefetchable" resource. 
This is a bit hackish, but we use + * this to workaround the inability of /sysfs to provide a write + * combine bit + */ + if (mmap_state != pci_mmap_mem) + write_combine = 0; + else if (write_combine == 0) { + if (rp->flags & IORESOURCE_PREFETCH) + write_combine = 1; + } + + /* XXX would be nice to have a way to ask for write-through */ + if (write_combine) + return pgprot_noncached_wc(protection); + else + return pgprot_noncached(protection); +} + +/* + * This one is used by /dev/mem and fbdev who have no clue about the + * PCI device, it tries to find the PCI device first and calls the + * above routine + */ +pgprot_t pci_phys_mem_access_prot(struct file *file, + unsigned long pfn, + unsigned long size, + pgprot_t prot) +{ + struct pci_dev *pdev = NULL; + struct resource *found = NULL; + resource_size_t offset = ((resource_size_t)pfn) << PAGE_SHIFT; + int i; + + if (page_is_ram(pfn)) + return prot; + + prot = pgprot_noncached(prot); + for_each_pci_dev(pdev) { + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *rp = &pdev->resource[i]; + int flags = rp->flags; + + /* Active and same type? */ + if ((flags & IORESOURCE_MEM) == 0) + continue; + /* In the range of this resource? */ + if (offset < (rp->start & PAGE_MASK) || + offset > rp->end) + continue; + found = rp; + break; + } + if (found) + break; + } + if (found) { + if (found->flags & IORESOURCE_PREFETCH) + prot = pgprot_noncached_wc(prot); + pci_dev_put(pdev); + } + + pr_debug("PCI: Non-PCI map for %llx, prot: %lx\n", + (unsigned long long)offset, pgprot_val(prot)); + + return prot; +} + + +/* + * Perform the actual remap of the pages for a PCI device mapping, as + * appropriate for this architecture. The region in the process to map + * is described by vm_start and vm_end members of VMA, the base physical + * address is found in vm_pgoff. + * The pci device structure is provided so that architectures may make mapping + * decisions on a per-device or per-bus basis. 
+ * + * Returns a negative error code on failure, zero on success. + */ +int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine) +{ + resource_size_t offset = + ((resource_size_t)vma->vm_pgoff) << PAGE_SHIFT; + struct resource *rp; + int ret; + + rp = __pci_mmap_make_offset(dev, &offset, mmap_state); + if (rp == NULL) + return -EINVAL; + + vma->vm_pgoff = offset >> PAGE_SHIFT; + vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp, + vma->vm_page_prot, + mmap_state, write_combine); + + ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + vma->vm_end - vma->vm_start, vma->vm_page_prot); + + return ret; +} + +/* This provides legacy IO read access on a bus */ +int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val, size_t size) +{ + unsigned long offset; + struct pci_controller *hose = pci_bus_to_host(bus); + struct resource *rp = &hose->io_resource; + void __iomem *addr; + + /* Check if port can be supported by that bus. We only check + * the ranges of the PHB though, not the bus itself as the rules + * for forwarding legacy cycles down bridges are not our problem + * here. So if the host bridge supports it, we do it. 
+ */ + offset = (unsigned long)hose->io_base_virt - _IO_BASE; + offset += port; + + if (!(rp->flags & IORESOURCE_IO)) + return -ENXIO; + if (offset < rp->start || (offset + size) > rp->end) + return -ENXIO; + addr = hose->io_base_virt + port; + + switch(size) { + case 1: + *((u8 *)val) = in_8(addr); + return 1; + case 2: + if (port & 1) + return -EINVAL; + *((u16 *)val) = in_le16(addr); + return 2; + case 4: + if (port & 3) + return -EINVAL; + *((u32 *)val) = in_le32(addr); + return 4; + } + return -EINVAL; +} + +/* This provides legacy IO write access on a bus */ +int pci_legacy_write(struct pci_bus *bus, loff_t port, u32 val, size_t size) +{ + unsigned long offset; + struct pci_controller *hose = pci_bus_to_host(bus); + struct resource *rp = &hose->io_resource; + void __iomem *addr; + + /* Check if port can be supported by that bus. We only check + * the ranges of the PHB though, not the bus itself as the rules + * for forwarding legacy cycles down bridges are not our problem + * here. So if the host bridge supports it, we do it. + */ + offset = (unsigned long)hose->io_base_virt - _IO_BASE; + offset += port; + + if (!(rp->flags & IORESOURCE_IO)) + return -ENXIO; + if (offset < rp->start || (offset + size) > rp->end) + return -ENXIO; + addr = hose->io_base_virt + port; + + /* WARNING: The generic code is idiotic. 
It gets passed a pointer + * to what can be a 1, 2 or 4 byte quantity and always reads that + * as a u32, which means that we have to correct the location of + * the data read within those 32 bits for size 1 and 2 + */ + switch(size) { + case 1: + out_8(addr, val >> 24); + return 1; + case 2: + if (port & 1) + return -EINVAL; + out_le16(addr, val >> 16); + return 2; + case 4: + if (port & 3) + return -EINVAL; + out_le32(addr, val); + return 4; + } + return -EINVAL; +} + +/* This provides legacy IO or memory mmap access on a bus */ +int pci_mmap_legacy_page_range(struct pci_bus *bus, + struct vm_area_struct *vma, + enum pci_mmap_state mmap_state) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + resource_size_t offset = + ((resource_size_t)vma->vm_pgoff) << PAGE_SHIFT; + resource_size_t size = vma->vm_end - vma->vm_start; + struct resource *rp; + + pr_debug("pci_mmap_legacy_page_range(%04x:%02x, %s @%llx..%llx)\n", + pci_domain_nr(bus), bus->number, + mmap_state == pci_mmap_mem ? "MEM" : "IO", + (unsigned long long)offset, + (unsigned long long)(offset + size - 1)); + + if (mmap_state == pci_mmap_mem) { + /* Hack alert ! 
+ * + * Because X is lame and can fail starting if it gets an error trying + * to mmap legacy_mem (instead of just moving on without legacy memory + * access) we fake it here by giving it anonymous memory, effectively + * behaving just like /dev/zero + */ + if ((offset + size) > hose->isa_mem_size) { + printk(KERN_DEBUG + "Process %s (pid:%d) mapped non-existing PCI legacy memory for 0%04x:%02x\n", + current->comm, current->pid, pci_domain_nr(bus), bus->number); + if (vma->vm_flags & VM_SHARED) + return shmem_zero_setup(vma); + return 0; + } + offset += hose->isa_mem_phys; + } else { + unsigned long io_offset = (unsigned long)hose->io_base_virt - _IO_BASE; + unsigned long roffset = offset + io_offset; + rp = &hose->io_resource; + if (!(rp->flags & IORESOURCE_IO)) + return -ENXIO; + if (roffset < rp->start || (roffset + size) > rp->end) + return -ENXIO; + offset += hose->io_base_phys; + } + pr_debug(" -> mapping phys %llx\n", (unsigned long long)offset); + + vma->vm_pgoff = offset >> PAGE_SHIFT; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +} + +void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, + resource_size_t *start, resource_size_t *end) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + resource_size_t offset = 0; + + if (hose == NULL) + return; + + if (rsrc->flags & IORESOURCE_IO) + offset = (unsigned long)hose->io_base_virt - _IO_BASE; + + /* We pass a fully fixed up address to userland for MMIO instead of + * a BAR value because X is lame and expects to be able to use that + * to pass to /dev/mem ! 
+ * + * That means that we'll have potentially 64 bits values where some + * userland apps only expect 32 (like X itself since it thinks only + * Sparc has 64 bits MMIO) but if we don't do that, we break it on + * 32 bits CHRPs :-( + * + * Hopefully, the sysfs insterface is immune to that gunk. Once X + * has been fixed (and the fix spread enough), we can re-enable the + * 2 lines below and pass down a BAR value to userland. In that case + * we'll also have to re-enable the matching code in + * __pci_mmap_make_offset(). + * + * BenH. + */ +#if 0 + else if (rsrc->flags & IORESOURCE_MEM) + offset = hose->pci_mem_offset; +#endif + + *start = rsrc->start - offset; + *end = rsrc->end - offset; +} + +/** + * pci_process_bridge_OF_ranges - Parse PCI bridge resources from device tree + * @hose: newly allocated pci_controller to be setup + * @dev: device node of the host bridge + * @primary: set if primary bus (32 bits only, soon to be deprecated) + * + * This function will parse the "ranges" property of a PCI host bridge device + * node and setup the resource mapping of a pci controller based on its + * content. + * + * Life would be boring if it wasn't for a few issues that we have to deal + * with here: + * + * - We can only cope with one IO space range and up to 3 Memory space + * ranges. However, some machines (thanks Apple !) tend to split their + * space into lots of small contiguous ranges. So we have to coalesce. + * + * - Some busses have IO space not starting at 0, which causes trouble with + * the way we do our IO resource renumbering. The code somewhat deals with + * it for 64 bits but I would expect problems on 32 bits. 
+ * + * - Some 32 bits platforms such as 4xx can have physical space larger than + * 32 bits so we need to use 64 bits values for the parsing + */ +void pci_process_bridge_OF_ranges(struct pci_controller *hose, + struct device_node *dev, int primary) +{ + int memno = 0; + struct resource *res; + struct of_pci_range range; + struct of_pci_range_parser parser; + + printk(KERN_INFO "PCI host bridge %s %s ranges:\n", + dev->full_name, primary ? "(primary)" : ""); + + /* Check for ranges property */ + if (of_pci_range_parser_init(&parser, dev)) + return; + + /* Parse it */ + for_each_of_pci_range(&parser, &range) { + /* If we failed translation or got a zero-sized region + * (some FW try to feed us with non sensical zero sized regions + * such as power3 which look like some kind of attempt at exposing + * the VGA memory hole) + */ + if (range.cpu_addr == OF_BAD_ADDR || range.size == 0) + continue; + + /* Act based on address space type */ + res = NULL; + switch (range.flags & IORESOURCE_TYPE_BITS) { + case IORESOURCE_IO: + printk(KERN_INFO + " IO 0x%016llx..0x%016llx -> 0x%016llx\n", + range.cpu_addr, range.cpu_addr + range.size - 1, + range.pci_addr); + + /* We support only one IO range */ + if (hose->pci_io_size) { + printk(KERN_INFO + " \\--> Skipped (too many) !\n"); + continue; + } +#ifdef CONFIG_PPC32 + /* On 32 bits, limit I/O space to 16MB */ + if (range.size > 0x01000000) + range.size = 0x01000000; + + /* 32 bits needs to map IOs here */ + hose->io_base_virt = ioremap(range.cpu_addr, + range.size); + + /* Expect trouble if pci_addr is not 0 */ + if (primary) + isa_io_base = + (unsigned long)hose->io_base_virt; +#endif /* CONFIG_PPC32 */ + /* pci_io_size and io_base_phys always represent IO + * space starting at 0 so we factor in pci_addr + */ + hose->pci_io_size = range.pci_addr + range.size; + hose->io_base_phys = range.cpu_addr - range.pci_addr; + + /* Build resource */ + res = &hose->io_resource; + range.cpu_addr = range.pci_addr; + break; + case 
IORESOURCE_MEM: + printk(KERN_INFO + " MEM 0x%016llx..0x%016llx -> 0x%016llx %s\n", + range.cpu_addr, range.cpu_addr + range.size - 1, + range.pci_addr, + (range.pci_space & 0x40000000) ? + "Prefetch" : ""); + + /* We support only 3 memory ranges */ + if (memno >= 3) { + printk(KERN_INFO + " \\--> Skipped (too many) !\n"); + continue; + } + /* Handles ISA memory hole space here */ + if (range.pci_addr == 0) { + if (primary || isa_mem_base == 0) + isa_mem_base = range.cpu_addr; + hose->isa_mem_phys = range.cpu_addr; + hose->isa_mem_size = range.size; + } + + /* Build resource */ + hose->mem_offset[memno] = range.cpu_addr - + range.pci_addr; + res = &hose->mem_resources[memno++]; + break; + } + if (res != NULL) { + of_pci_range_to_resource(&range, dev, res); + } + } +} + +/* Decide whether to display the domain number in /proc */ +int pci_proc_domain(struct pci_bus *bus) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + + if (!pci_has_flag(PCI_ENABLE_PROC_DOMAINS)) + return 0; + if (pci_has_flag(PCI_COMPAT_DOMAIN_0)) + return hose->global_number != 0; + return 1; +} + +int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) +{ + if (ppc_md.pcibios_root_bridge_prepare) + return ppc_md.pcibios_root_bridge_prepare(bridge); + + return 0; +} + +/* This header fixup will do the resource fixup for all devices as they are + * probed, but not for bridge ranges + */ +static void pcibios_fixup_resources(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + int i; + + if (!hose) { + printk(KERN_ERR "No host bridge for PCI dev %s !\n", + pci_name(dev)); + return; + } + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + struct resource *res = dev->resource + i; + struct pci_bus_region reg; + if (!res->flags) + continue; + + /* If we're going to re-assign everything, we mark all resources + * as unset (and 0-base them). 
In addition, we mark BARs starting + * at 0 as unset as well, except if PCI_PROBE_ONLY is also set + * since in that case, we don't want to re-assign anything + */ + pcibios_resource_to_bus(dev->bus, &reg, res); + if (pci_has_flag(PCI_REASSIGN_ALL_RSRC) || + (reg.start == 0 && !pci_has_flag(PCI_PROBE_ONLY))) { + /* Only print message if not re-assigning */ + if (!pci_has_flag(PCI_REASSIGN_ALL_RSRC)) + pr_debug("PCI:%s Resource %d %016llx-%016llx [%x] " + "is unassigned\n", + pci_name(dev), i, + (unsigned long long)res->start, + (unsigned long long)res->end, + (unsigned int)res->flags); + res->end -= res->start; + res->start = 0; + res->flags |= IORESOURCE_UNSET; + continue; + } + + pr_debug("PCI:%s Resource %d %016llx-%016llx [%x]\n", + pci_name(dev), i, + (unsigned long long)res->start,\ + (unsigned long long)res->end, + (unsigned int)res->flags); + } + + /* Call machine specific resource fixup */ + if (ppc_md.pcibios_fixup_resources) + ppc_md.pcibios_fixup_resources(dev); +} +DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_fixup_resources); + +/* This function tries to figure out if a bridge resource has been initialized + * by the firmware or not. It doesn't have to be absolutely bullet proof, but + * things go more smoothly when it gets it right. 
It should covers cases such + * as Apple "closed" bridge resources and bare-metal pSeries unassigned bridges + */ +static int pcibios_uninitialized_bridge_resource(struct pci_bus *bus, + struct resource *res) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pci_dev *dev = bus->self; + resource_size_t offset; + struct pci_bus_region region; + u16 command; + int i; + + /* We don't do anything if PCI_PROBE_ONLY is set */ + if (pci_has_flag(PCI_PROBE_ONLY)) + return 0; + + /* Job is a bit different between memory and IO */ + if (res->flags & IORESOURCE_MEM) { + pcibios_resource_to_bus(dev->bus, &region, res); + + /* If the BAR is non-0 then it's probably been initialized */ + if (region.start != 0) + return 0; + + /* The BAR is 0, let's check if memory decoding is enabled on + * the bridge. If not, we consider it unassigned + */ + pci_read_config_word(dev, PCI_COMMAND, &command); + if ((command & PCI_COMMAND_MEMORY) == 0) + return 1; + + /* Memory decoding is enabled and the BAR is 0. If any of the bridge + * resources covers that starting address (0 then it's good enough for + * us for memory space) + */ + for (i = 0; i < 3; i++) { + if ((hose->mem_resources[i].flags & IORESOURCE_MEM) && + hose->mem_resources[i].start == hose->mem_offset[i]) + return 0; + } + + /* Well, it starts at 0 and we know it will collide so we may as + * well consider it as unassigned. That covers the Apple case. + */ + return 1; + } else { + /* If the BAR is non-0, then we consider it assigned */ + offset = (unsigned long)hose->io_base_virt - _IO_BASE; + if (((res->start - offset) & 0xfffffffful) != 0) + return 0; + + /* Here, we are a bit different than memory as typically IO space + * starting at low addresses -is- valid. What we do instead if that + * we consider as unassigned anything that doesn't have IO enabled + * in the PCI command register, and that's it. 
+ */ + pci_read_config_word(dev, PCI_COMMAND, &command); + if (command & PCI_COMMAND_IO) + return 0; + + /* It's starting at 0 and IO is disabled in the bridge, consider + * it unassigned + */ + return 1; + } +} + +/* Fixup resources of a PCI<->PCI bridge */ +static void pcibios_fixup_bridge(struct pci_bus *bus) +{ + struct resource *res; + int i; + + struct pci_dev *dev = bus->self; + + pci_bus_for_each_resource(bus, res, i) { + if (!res || !res->flags) + continue; + if (i >= 3 && bus->self->transparent) + continue; + + /* If we're going to reassign everything, we can + * shrink the P2P resource to have size as being + * of 0 in order to save space. + */ + if (pci_has_flag(PCI_REASSIGN_ALL_RSRC)) { + res->flags |= IORESOURCE_UNSET; + res->start = 0; + res->end = -1; + continue; + } + + pr_debug("PCI:%s Bus rsrc %d %016llx-%016llx [%x]\n", + pci_name(dev), i, + (unsigned long long)res->start,\ + (unsigned long long)res->end, + (unsigned int)res->flags); + + /* Try to detect uninitialized P2P bridge resources, + * and clear them out so they get re-assigned later + */ + if (pcibios_uninitialized_bridge_resource(bus, res)) { + res->flags = 0; + pr_debug("PCI:%s (unassigned)\n", pci_name(dev)); + } + } +} + +void pcibios_setup_bus_self(struct pci_bus *bus) +{ + /* Fix up the bus resources for P2P bridges */ + if (bus->self != NULL) + pcibios_fixup_bridge(bus); + + /* Platform specific bus fixups. 
This is currently only used + * by fsl_pci and I'm hoping to get rid of it at some point + */ + if (ppc_md.pcibios_fixup_bus) + ppc_md.pcibios_fixup_bus(bus); + + /* Setup bus DMA mappings */ + if (ppc_md.pci_dma_bus_setup) + ppc_md.pci_dma_bus_setup(bus); +} + +static void pcibios_setup_device(struct pci_dev *dev) +{ + /* Fixup NUMA node as it may not be setup yet by the generic + * code and is needed by the DMA init + */ + set_dev_node(&dev->dev, pcibus_to_node(dev->bus)); + + /* Hook up default DMA ops */ + set_dma_ops(&dev->dev, pci_dma_ops); + set_dma_offset(&dev->dev, PCI_DRAM_OFFSET); + + /* Additional platform DMA/iommu setup */ + if (ppc_md.pci_dma_dev_setup) + ppc_md.pci_dma_dev_setup(dev); + + /* Read default IRQs and fixup if necessary */ + pci_read_irq_line(dev); + if (ppc_md.pci_irq_fixup) + ppc_md.pci_irq_fixup(dev); +} + +int pcibios_add_device(struct pci_dev *dev) +{ + /* + * We can only call pcibios_setup_device() after bus setup is complete, + * since some of the platform specific DMA setup code depends on it. + */ + if (dev->bus->is_added) + pcibios_setup_device(dev); + return 0; +} + +void pcibios_setup_bus_devices(struct pci_bus *bus) +{ + struct pci_dev *dev; + + pr_debug("PCI: Fixup bus devices %d (%s)\n", + bus->number, bus->self ? pci_name(bus->self) : "PHB"); + + list_for_each_entry(dev, &bus->devices, bus_list) { + /* Cardbus can call us to add new devices to a bus, so ignore + * those who are already fully discovered + */ + if (dev->is_added) + continue; + + pcibios_setup_device(dev); + } +} + +void pcibios_set_master(struct pci_dev *dev) +{ + /* No special bus mastering setup handling */ +} + +void pcibios_fixup_bus(struct pci_bus *bus) +{ + /* When called from the generic PCI probe, read PCI<->PCI bridge + * bases. This is -not- called when generating the PCI tree from + * the OF device-tree. 
+ */ + pci_read_bridge_bases(bus); + + /* Now fixup the bus bus */ + pcibios_setup_bus_self(bus); + + /* Now fixup devices on that bus */ + pcibios_setup_bus_devices(bus); +} +EXPORT_SYMBOL(pcibios_fixup_bus); + +void pci_fixup_cardbus(struct pci_bus *bus) +{ + /* Now fixup devices on that bus */ + pcibios_setup_bus_devices(bus); +} + + +static int skip_isa_ioresource_align(struct pci_dev *dev) +{ + if (pci_has_flag(PCI_CAN_SKIP_ISA_ALIGN) && + !(dev->bus->bridge_ctl & PCI_BRIDGE_CTL_ISA)) + return 1; + return 0; +} + +/* + * We need to avoid collisions with `mirrored' VGA ports + * and other strange ISA hardware, so we always want the + * addresses to be allocated in the 0x000-0x0ff region + * modulo 0x400. + * + * Why? Because some silly external IO cards only decode + * the low 10 bits of the IO address. The 0x00-0xff region + * is reserved for motherboard devices that decode all 16 + * bits, so it's ok to allocate at, say, 0x2800-0x28ff, + * but we want to try to avoid allocating at 0x2900-0x2bff + * which might have be mirrored at 0x0100-0x03ff.. + */ +resource_size_t pcibios_align_resource(void *data, const struct resource *res, + resource_size_t size, resource_size_t align) +{ + struct pci_dev *dev = data; + resource_size_t start = res->start; + + if (res->flags & IORESOURCE_IO) { + if (skip_isa_ioresource_align(dev)) + return start; + if (start & 0x300) + start = (start + 0x3ff) & ~0x3ff; + } + + return start; +} +EXPORT_SYMBOL(pcibios_align_resource); + +/* + * Reparent resource children of pr that conflict with res + * under res, and make res replace those children. 
+ */ +static int reparent_resources(struct resource *parent, + struct resource *res) +{ + struct resource *p, **pp; + struct resource **firstpp = NULL; + + for (pp = &parent->child; (p = *pp) != NULL; pp = &p->sibling) { + if (p->end < res->start) + continue; + if (res->end < p->start) + break; + if (p->start < res->start || p->end > res->end) + return -1; /* not completely contained */ + if (firstpp == NULL) + firstpp = pp; + } + if (firstpp == NULL) + return -1; /* didn't find any conflicting entries? */ + res->parent = parent; + res->child = *firstpp; + res->sibling = *pp; + *firstpp = res; + *pp = NULL; + for (p = res->child; p != NULL; p = p->sibling) { + p->parent = res; + pr_debug("PCI: Reparented %s [%llx..%llx] under %s\n", + p->name, + (unsigned long long)p->start, + (unsigned long long)p->end, res->name); + } + return 0; +} + +/* + * Handle resources of PCI devices. If the world were perfect, we could + * just allocate all the resource regions and do nothing more. It isn't. + * On the other hand, we cannot just re-allocate all devices, as it would + * require us to know lots of host bridge internals. So we attempt to + * keep as much of the original configuration as possible, but tweak it + * when it's found to be wrong. + * + * Known BIOS problems we have to work around: + * - I/O or memory regions not configured + * - regions configured, but not enabled in the command register + * - bogus I/O addresses above 64K used + * - expansion ROMs left enabled (this may sound harmless, but given + * the fact the PCI specs explicitly allow address decoders to be + * shared between expansion ROMs and other resource regions, it's + * at least dangerous) + * + * Our solution: + * (1) Allocate resources for all buses behind PCI-to-PCI bridges. + * This gives us fixed barriers on where we can allocate. + * (2) Allocate resources for all enabled devices. If there is + * a collision, just mark the resource as unallocated. 
Also + * disable expansion ROMs during this step. + * (3) Try to allocate resources for disabled devices. If the + * resources were assigned correctly, everything goes well, + * if they weren't, they won't disturb allocation of other + * resources. + * (4) Assign new addresses to resources which were either + * not configured at all or misconfigured. If explicitly + * requested by the user, configure expansion ROM address + * as well. + */ + +void pcibios_allocate_bus_resources(struct pci_bus *bus) +{ + struct pci_bus *b; + int i; + struct resource *res, *pr; + + pr_debug("PCI: Allocating bus resources for %04x:%02x...\n", + pci_domain_nr(bus), bus->number); + + pci_bus_for_each_resource(bus, res, i) { + if (!res || !res->flags || res->start > res->end || res->parent) + continue; + + /* If the resource was left unset at this point, we clear it */ + if (res->flags & IORESOURCE_UNSET) + goto clear_resource; + + if (bus->parent == NULL) + pr = (res->flags & IORESOURCE_IO) ? + &ioport_resource : &iomem_resource; + else { + pr = pci_find_parent_resource(bus->self, res); + if (pr == res) { + /* this happens when the generic PCI + * code (wrongly) decides that this + * bridge is transparent -- paulus + */ + continue; + } + } + + pr_debug("PCI: %s (bus %d) bridge rsrc %d: %016llx-%016llx " + "[0x%x], parent %p (%s)\n", + bus->self ? pci_name(bus->self) : "PHB", + bus->number, i, + (unsigned long long)res->start, + (unsigned long long)res->end, + (unsigned int)res->flags, + pr, (pr && pr->name) ? pr->name : "nil"); + + if (pr && !(pr->flags & IORESOURCE_UNSET)) { + if (request_resource(pr, res) == 0) + continue; + /* + * Must be a conflict with an existing entry. + * Move that entry (or entries) under the + * bridge resource and try again. 
+ */ + if (reparent_resources(pr, res) == 0) + continue; + } + pr_warning("PCI: Cannot allocate resource region " + "%d of PCI bridge %d, will remap\n", i, bus->number); + clear_resource: + /* The resource might be figured out when doing + * reassignment based on the resources required + * by the downstream PCI devices. Here we set + * the size of the resource to be 0 in order to + * save more space. + */ + res->start = 0; + res->end = -1; + res->flags = 0; + } + + list_for_each_entry(b, &bus->children, node) + pcibios_allocate_bus_resources(b); +} + +static inline void alloc_resource(struct pci_dev *dev, int idx) +{ + struct resource *pr, *r = &dev->resource[idx]; + + pr_debug("PCI: Allocating %s: Resource %d: %016llx..%016llx [%x]\n", + pci_name(dev), idx, + (unsigned long long)r->start, + (unsigned long long)r->end, + (unsigned int)r->flags); + + pr = pci_find_parent_resource(dev, r); + if (!pr || (pr->flags & IORESOURCE_UNSET) || + request_resource(pr, r) < 0) { + printk(KERN_WARNING "PCI: Cannot allocate resource region %d" + " of device %s, will remap\n", idx, pci_name(dev)); + if (pr) + pr_debug("PCI: parent is %p: %016llx-%016llx [%x]\n", + pr, + (unsigned long long)pr->start, + (unsigned long long)pr->end, + (unsigned int)pr->flags); + /* We'll assign a new address later */ + r->flags |= IORESOURCE_UNSET; + r->end -= r->start; + r->start = 0; + } +} + +static void __init pcibios_allocate_resources(int pass) +{ + struct pci_dev *dev = NULL; + int idx, disabled; + u16 command; + struct resource *r; + + for_each_pci_dev(dev) { + pci_read_config_word(dev, PCI_COMMAND, &command); + for (idx = 0; idx <= PCI_ROM_RESOURCE; idx++) { + r = &dev->resource[idx]; + if (r->parent) /* Already allocated */ + continue; + if (!r->flags || (r->flags & IORESOURCE_UNSET)) + continue; /* Not assigned at all */ + /* We only allocate ROMs on pass 1 just in case they + * have been screwed up by firmware + */ + if (idx == PCI_ROM_RESOURCE ) + disabled = 1; + if (r->flags & 
IORESOURCE_IO) + disabled = !(command & PCI_COMMAND_IO); + else + disabled = !(command & PCI_COMMAND_MEMORY); + if (pass == disabled) + alloc_resource(dev, idx); + } + if (pass) + continue; + r = &dev->resource[PCI_ROM_RESOURCE]; + if (r->flags) { + /* Turn the ROM off, leave the resource region, + * but keep it unregistered. + */ + u32 reg; + pci_read_config_dword(dev, dev->rom_base_reg, &reg); + if (reg & PCI_ROM_ADDRESS_ENABLE) { + pr_debug("PCI: Switching off ROM of %s\n", + pci_name(dev)); + r->flags &= ~IORESOURCE_ROM_ENABLE; + pci_write_config_dword(dev, dev->rom_base_reg, + reg & ~PCI_ROM_ADDRESS_ENABLE); + } + } + } +} + +static void __init pcibios_reserve_legacy_regions(struct pci_bus *bus) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + resource_size_t offset; + struct resource *res, *pres; + int i; + + pr_debug("Reserving legacy ranges for domain %04x\n", pci_domain_nr(bus)); + + /* Check for IO */ + if (!(hose->io_resource.flags & IORESOURCE_IO)) + goto no_io; + offset = (unsigned long)hose->io_base_virt - _IO_BASE; + res = kzalloc(sizeof(struct resource), GFP_KERNEL); + BUG_ON(res == NULL); + res->name = "Legacy IO"; + res->flags = IORESOURCE_IO; + res->start = offset; + res->end = (offset + 0xfff) & 0xfffffffful; + pr_debug("Candidate legacy IO: %pR\n", res); + if (request_resource(&hose->io_resource, res)) { + printk(KERN_DEBUG + "PCI %04x:%02x Cannot reserve Legacy IO %pR\n", + pci_domain_nr(bus), bus->number, res); + kfree(res); + } + + no_io: + /* Check for memory */ + for (i = 0; i < 3; i++) { + pres = &hose->mem_resources[i]; + offset = hose->mem_offset[i]; + if (!(pres->flags & IORESOURCE_MEM)) + continue; + pr_debug("hose mem res: %pR\n", pres); + if ((pres->start - offset) <= 0xa0000 && + (pres->end - offset) >= 0xbffff) + break; + } + if (i >= 3) + return; + res = kzalloc(sizeof(struct resource), GFP_KERNEL); + BUG_ON(res == NULL); + res->name = "Legacy VGA memory"; + res->flags = IORESOURCE_MEM; + res->start = 0xa0000 + offset; + 
res->end = 0xbffff + offset; + pr_debug("Candidate VGA memory: %pR\n", res); + if (request_resource(pres, res)) { + printk(KERN_DEBUG + "PCI %04x:%02x Cannot reserve VGA memory %pR\n", + pci_domain_nr(bus), bus->number, res); + kfree(res); + } +} + +void __init pcibios_resource_survey(void) +{ + struct pci_bus *b; + + /* Allocate and assign resources */ + list_for_each_entry(b, &pci_root_buses, node) + pcibios_allocate_bus_resources(b); + pcibios_allocate_resources(0); + pcibios_allocate_resources(1); + + /* Before we start assigning unassigned resource, we try to reserve + * the low IO area and the VGA memory area if they intersect the + * bus available resources to avoid allocating things on top of them + */ + if (!pci_has_flag(PCI_PROBE_ONLY)) { + list_for_each_entry(b, &pci_root_buses, node) + pcibios_reserve_legacy_regions(b); + } + + /* Now, if the platform didn't decide to blindly trust the firmware, + * we proceed to assigning things that were left unassigned + */ + if (!pci_has_flag(PCI_PROBE_ONLY)) { + pr_debug("PCI: Assigning unassigned resources...\n"); + pci_assign_unassigned_resources(); + } + + /* Call machine dependent fixup */ + if (ppc_md.pcibios_fixup) + ppc_md.pcibios_fixup(); +} + +/* This is used by the PCI hotplug driver to allocate resource + * of newly plugged busses. We can try to consolidate with the + * rest of the code later, for now, keep it as-is as our main + * resource allocation function doesn't deal with sub-trees yet. 
+ */ +void pcibios_claim_one_bus(struct pci_bus *bus) +{ + struct pci_dev *dev; + struct pci_bus *child_bus; + + list_for_each_entry(dev, &bus->devices, bus_list) { + int i; + + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + struct resource *r = &dev->resource[i]; + + if (r->parent || !r->start || !r->flags) + continue; + + pr_debug("PCI: Claiming %s: " + "Resource %d: %016llx..%016llx [%x]\n", + pci_name(dev), i, + (unsigned long long)r->start, + (unsigned long long)r->end, + (unsigned int)r->flags); + + pci_claim_resource(dev, i); + } + } + + list_for_each_entry(child_bus, &bus->children, node) + pcibios_claim_one_bus(child_bus); +} + + +/* pcibios_finish_adding_to_bus + * + * This is to be called by the hotplug code after devices have been + * added to a bus, this include calling it for a PHB that is just + * being added + */ +void pcibios_finish_adding_to_bus(struct pci_bus *bus) +{ + pr_debug("PCI: Finishing adding to hotplug bus %04x:%02x\n", + pci_domain_nr(bus), bus->number); + + /* Allocate bus and devices resources */ + pcibios_allocate_bus_resources(bus); + pcibios_claim_one_bus(bus); + if (!pci_has_flag(PCI_PROBE_ONLY)) + pci_assign_unassigned_bus_resources(bus); + + /* Fixup EEH */ + eeh_add_device_tree_late(bus); + + /* Add new devices to global lists. Register in proc, sysfs. 
*/ + pci_bus_add_devices(bus); + + /* sysfs files should only be added after devices are added */ + eeh_add_sysfs_files(bus); +} +EXPORT_SYMBOL_GPL(pcibios_finish_adding_to_bus); + +int pcibios_enable_device(struct pci_dev *dev, int mask) +{ + if (ppc_md.pcibios_enable_device_hook) + if (ppc_md.pcibios_enable_device_hook(dev)) + return -EINVAL; + + return pci_enable_resources(dev, mask); +} + +resource_size_t pcibios_io_space_offset(struct pci_controller *hose) +{ + return (unsigned long) hose->io_base_virt - _IO_BASE; +} + +static void pcibios_setup_phb_resources(struct pci_controller *hose, + struct list_head *resources) +{ + struct resource *res; + resource_size_t offset; + int i; + + /* Hookup PHB IO resource */ + res = &hose->io_resource; + + if (!res->flags) { + printk(KERN_WARNING "PCI: I/O resource not set for host" + " bridge %s (domain %d)\n", + hose->dn->full_name, hose->global_number); + } else { + offset = pcibios_io_space_offset(hose); + + pr_debug("PCI: PHB IO resource = %08llx-%08llx [%lx] off 0x%08llx\n", + (unsigned long long)res->start, + (unsigned long long)res->end, + (unsigned long)res->flags, + (unsigned long long)offset); + pci_add_resource_offset(resources, res, offset); + } + + /* Hookup PHB Memory resources */ + for (i = 0; i < 3; ++i) { + res = &hose->mem_resources[i]; + if (!res->flags) { + if (i == 0) + printk(KERN_ERR "PCI: Memory resource 0 not set for " + "host bridge %s (domain %d)\n", + hose->dn->full_name, hose->global_number); + continue; + } + offset = hose->mem_offset[i]; + + + pr_debug("PCI: PHB MEM resource %d = %08llx-%08llx [%lx] off 0x%08llx\n", i, + (unsigned long long)res->start, + (unsigned long long)res->end, + (unsigned long)res->flags, + (unsigned long long)offset); + + pci_add_resource_offset(resources, res, offset); + } +} + +/* + * Null PCI config access functions, for the case when we can't + * find a hose. 
+ */ +#define NULL_PCI_OP(rw, size, type) \ +static int \ +null_##rw##_config_##size(struct pci_dev *dev, int offset, type val) \ +{ \ + return PCIBIOS_DEVICE_NOT_FOUND; \ +} + +static int +null_read_config(struct pci_bus *bus, unsigned int devfn, int offset, + int len, u32 *val) +{ + return PCIBIOS_DEVICE_NOT_FOUND; +} + +static int +null_write_config(struct pci_bus *bus, unsigned int devfn, int offset, + int len, u32 val) +{ + return PCIBIOS_DEVICE_NOT_FOUND; +} + +static struct pci_ops null_pci_ops = +{ + .read = null_read_config, + .write = null_write_config, +}; + +/* + * These functions are used early on before PCI scanning is done + * and all of the pci_dev and pci_bus structures have been created. + */ +static struct pci_bus * +fake_pci_bus(struct pci_controller *hose, int busnr) +{ + static struct pci_bus bus; + + if (hose == NULL) { + printk(KERN_ERR "Can't find hose for PCI bus %d!\n", busnr); + } + bus.number = busnr; + bus.sysdata = hose; + bus.ops = hose? hose->ops: &null_pci_ops; + return &bus; +} + +#define EARLY_PCI_OP(rw, size, type) \ +int early_##rw##_config_##size(struct pci_controller *hose, int bus, \ + int devfn, int offset, type value) \ +{ \ + return pci_bus_##rw##_config_##size(fake_pci_bus(hose, bus), \ + devfn, offset, value); \ +} + +EARLY_PCI_OP(read, byte, u8 *) +EARLY_PCI_OP(read, word, u16 *) +EARLY_PCI_OP(read, dword, u32 *) +EARLY_PCI_OP(write, byte, u8) +EARLY_PCI_OP(write, word, u16) +EARLY_PCI_OP(write, dword, u32) + +extern int pci_bus_find_capability (struct pci_bus *bus, unsigned int devfn, int cap); +int early_find_capability(struct pci_controller *hose, int bus, int devfn, + int cap) +{ + return pci_bus_find_capability(fake_pci_bus(hose, bus), devfn, cap); +} + +struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) +{ + struct pci_controller *hose = bus->sysdata; + + return of_node_get(hose->dn); +} + +/** + * pci_scan_phb - Given a pci_controller, setup and scan the PCI bus + * @hose: Pointer to the PCI host 
controller instance structure + */ +void pcibios_scan_phb(struct pci_controller *hose) +{ + LIST_HEAD(resources); + struct pci_bus *bus; + struct device_node *node = hose->dn; + int mode; + + pr_debug("PCI: Scanning PHB %s\n", of_node_full_name(node)); + + /* Get some IO space for the new PHB */ + pcibios_setup_phb_io_space(hose); + + /* Wire up PHB bus resources */ + pcibios_setup_phb_resources(hose, &resources); + + hose->busn.start = hose->first_busno; + hose->busn.end = hose->last_busno; + hose->busn.flags = IORESOURCE_BUS; + pci_add_resource(&resources, &hose->busn); + + /* Create an empty bus for the toplevel */ + bus = pci_create_root_bus(hose->parent, hose->first_busno, + hose->ops, hose, &resources); + if (bus == NULL) { + pr_err("Failed to create bus for PCI domain %04x\n", + hose->global_number); + pci_free_resource_list(&resources); + return; + } + hose->bus = bus; + + /* Get probe mode and perform scan */ + mode = PCI_PROBE_NORMAL; + if (node && ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + pr_debug(" probe mode: %d\n", mode); + if (mode == PCI_PROBE_DEVTREE) + of_scan_bus(node, bus); + + if (mode == PCI_PROBE_NORMAL) { + pci_bus_update_busn_res_end(bus, 255); + hose->last_busno = pci_scan_child_bus(bus); + pci_bus_update_busn_res_end(bus, hose->last_busno); + } + + /* Platform gets a chance to do some global fixups before + * we proceed to resource allocation + */ + if (ppc_md.pcibios_fixup_phb) + ppc_md.pcibios_fixup_phb(hose); + + /* Configure PCI Express settings */ + if (bus && !pci_has_flag(PCI_PROBE_ONLY)) { + struct pci_bus *child; + list_for_each_entry(child, &bus->children, node) + pcie_bus_configure_settings(child); + } +} + +static void fixup_hide_host_resource_fsl(struct pci_dev *dev) +{ + int i, class = dev->class >> 8; + /* When configured as agent, programing interface = 1 */ + int prog_if = dev->class & 0xf; + + if ((class == PCI_CLASS_PROCESSOR_POWERPC || + class == PCI_CLASS_BRIDGE_OTHER) && + (dev->hdr_type == 
PCI_HEADER_TYPE_NORMAL) && + (prog_if == 0) && + (dev->bus->parent == NULL)) { + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + dev->resource[i].start = 0; + dev->resource[i].end = 0; + dev->resource[i].flags = 0; + } + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MOTOROLA, PCI_ANY_ID, fixup_hide_host_resource_fsl); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, PCI_ANY_ID, fixup_hide_host_resource_fsl); + +static void fixup_vga(struct pci_dev *pdev) +{ + u16 cmd; + + pci_read_config_word(pdev, PCI_COMMAND, &cmd); + if ((cmd & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) || !vga_default_device()) + vga_set_default_device(pdev); + +} +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID, + PCI_CLASS_DISPLAY_VGA, 8, fixup_vga); diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c new file mode 100644 index 00000000000..5b789177aa2 --- /dev/null +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -0,0 +1,109 @@ +/* + * Derived from "arch/powerpc/platforms/pseries/pci_dlpar.c" + * + * Copyright (C) 2003 Linda Xie <lxie@us.ibm.com> + * Copyright (C) 2005 International Business Machines + * + * Updates, 2005, John Rose <johnrose@austin.ibm.com> + * Updates, 2005, Linas Vepstas <linas@austin.ibm.com> + * Updates, 2013, Gavin Shan <shangw@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/pci.h> +#include <linux/export.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> +#include <asm/firmware.h> +#include <asm/eeh.h> + +/** + * pcibios_release_device - release PCI device + * @dev: PCI device + * + * The function is called before releasing the indicated PCI device. 
+ */ +void pcibios_release_device(struct pci_dev *dev) +{ + eeh_remove_device(dev); +} + +/** + * pcibios_remove_pci_devices - remove all devices under this bus + * @bus: the indicated PCI bus + * + * Remove all of the PCI devices under this bus both from the + * linux pci device tree, and from the powerpc EEH address cache. + */ +void pcibios_remove_pci_devices(struct pci_bus *bus) +{ + struct pci_dev *dev, *tmp; + struct pci_bus *child_bus; + + /* First go down child busses */ + list_for_each_entry(child_bus, &bus->children, node) + pcibios_remove_pci_devices(child_bus); + + pr_debug("PCI: Removing devices on bus %04x:%02x\n", + pci_domain_nr(bus), bus->number); + list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) { + pr_debug(" Removing %s...\n", pci_name(dev)); + pci_stop_and_remove_bus_device(dev); + } +} + +EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); + +/** + * pcibios_add_pci_devices - adds new pci devices to bus + * @bus: the indicated PCI bus + * + * This routine will find and fixup new pci devices under + * the indicated bus. This routine presumes that there + * might already be some devices under this bridge, so + * it carefully tries to add only new devices. (And that + * is how this routine differs from other, similar pcibios + * routines.) + */ +void pcibios_add_pci_devices(struct pci_bus * bus) +{ + int slotno, mode, pass, max; + struct pci_dev *dev; + struct device_node *dn = pci_bus_to_OF_node(bus); + + eeh_add_device_tree_early(dn); + + mode = PCI_PROBE_NORMAL; + if (ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + + if (mode == PCI_PROBE_DEVTREE) { + /* use ofdt-based probe */ + of_rescan_bus(dn, bus); + } else if (mode == PCI_PROBE_NORMAL) { + /* + * Use legacy probe. In the partial hotplug case, we + * probably have grandchildren devices unplugged. So + * we don't check the return value from pci_scan_slot() in + * order for fully rescan all the way down to pick them up. 
+ * They can have been removed during partial hotplug. + */ + slotno = PCI_SLOT(PCI_DN(dn->child)->devfn); + pci_scan_slot(bus, PCI_DEVFN(slotno, 0)); + pcibios_setup_bus_devices(bus); + max = bus->busn_res.start; + for (pass = 0; pass < 2; pass++) { + list_for_each_entry(dev, &bus->devices, bus_list) { + if (pci_is_bridge(dev)) + max = pci_scan_bridge(bus, dev, + max, pass); + } + } + } + pcibios_finish_adding_to_bus(bus); +} +EXPORT_SYMBOL_GPL(pcibios_add_pci_devices); diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c new file mode 100644 index 00000000000..432459c817f --- /dev/null +++ b/arch/powerpc/kernel/pci_32.c @@ -0,0 +1,310 @@ +/* + * Common pmac/prep/chrp pci routines. -- Cort + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/capability.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/bootmem.h> +#include <linux/irq.h> +#include <linux/list.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <linux/export.h> + +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/sections.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> +#include <asm/byteorder.h> +#include <asm/uaccess.h> +#include <asm/machdep.h> + +#undef DEBUG + +unsigned long isa_io_base = 0; +unsigned long pci_dram_offset = 0; +int pcibios_assign_bus_offset = 1; + +void pcibios_make_OF_bus_map(void); + +static void fixup_cpc710_pci64(struct pci_dev* dev); +static u8* pci_to_OF_bus_map; + +/* By default, we don't re-assign bus numbers. We do this only on + * some pmacs + */ +static int pci_assign_all_buses; + +static int pci_bus_count; + +/* This will remain NULL for now, until isa-bridge.c is made common + * to both 32-bit and 64-bit. 
+ */ +struct pci_dev *isa_bridge_pcidev; +EXPORT_SYMBOL_GPL(isa_bridge_pcidev); + +static void +fixup_cpc710_pci64(struct pci_dev* dev) +{ + /* Hide the PCI64 BARs from the kernel as their content doesn't + * fit well in the resource management + */ + dev->resource[0].start = dev->resource[0].end = 0; + dev->resource[0].flags = 0; + dev->resource[1].start = dev->resource[1].end = 0; + dev->resource[1].flags = 0; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CPC710_PCI64, fixup_cpc710_pci64); + +/* + * Functions below are used on OpenFirmware machines. + */ +static void +make_one_node_map(struct device_node* node, u8 pci_bus) +{ + const int *bus_range; + int len; + + if (pci_bus >= pci_bus_count) + return; + bus_range = of_get_property(node, "bus-range", &len); + if (bus_range == NULL || len < 2 * sizeof(int)) { + printk(KERN_WARNING "Can't get bus-range for %s, " + "assuming it starts at 0\n", node->full_name); + pci_to_OF_bus_map[pci_bus] = 0; + } else + pci_to_OF_bus_map[pci_bus] = bus_range[0]; + + for_each_child_of_node(node, node) { + struct pci_dev* dev; + const unsigned int *class_code, *reg; + + class_code = of_get_property(node, "class-code", NULL); + if (!class_code || ((*class_code >> 8) != PCI_CLASS_BRIDGE_PCI && + (*class_code >> 8) != PCI_CLASS_BRIDGE_CARDBUS)) + continue; + reg = of_get_property(node, "reg", NULL); + if (!reg) + continue; + dev = pci_get_bus_and_slot(pci_bus, ((reg[0] >> 8) & 0xff)); + if (!dev || !dev->subordinate) { + pci_dev_put(dev); + continue; + } + make_one_node_map(node, dev->subordinate->number); + pci_dev_put(dev); + } +} + +void +pcibios_make_OF_bus_map(void) +{ + int i; + struct pci_controller *hose, *tmp; + struct property *map_prop; + struct device_node *dn; + + pci_to_OF_bus_map = kmalloc(pci_bus_count, GFP_KERNEL); + if (!pci_to_OF_bus_map) { + printk(KERN_ERR "Can't allocate OF bus map !\n"); + return; + } + + /* We fill the bus map with invalid values, that helps + * debugging. 
+ */ + for (i=0; i<pci_bus_count; i++) + pci_to_OF_bus_map[i] = 0xff; + + /* For each hose, we begin searching bridges */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + struct device_node* node = hose->dn; + + if (!node) + continue; + make_one_node_map(node, hose->first_busno); + } + dn = of_find_node_by_path("/"); + map_prop = of_find_property(dn, "pci-OF-bus-map", NULL); + if (map_prop) { + BUG_ON(pci_bus_count > map_prop->length); + memcpy(map_prop->value, pci_to_OF_bus_map, pci_bus_count); + } + of_node_put(dn); +#ifdef DEBUG + printk("PCI->OF bus map:\n"); + for (i=0; i<pci_bus_count; i++) { + if (pci_to_OF_bus_map[i] == 0xff) + continue; + printk("%d -> %d\n", i, pci_to_OF_bus_map[i]); + } +#endif +} + + +/* + * Returns the PCI device matching a given OF node + */ +int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn) +{ + struct pci_dev *dev = NULL; + const __be32 *reg; + int size; + + /* Check if it might have a chance to be a PCI device */ + if (!pci_find_hose_for_OF_device(node)) + return -ENODEV; + + reg = of_get_property(node, "reg", &size); + if (!reg || size < 5 * sizeof(u32)) + return -ENODEV; + + *bus = (be32_to_cpup(&reg[0]) >> 16) & 0xff; + *devfn = (be32_to_cpup(&reg[0]) >> 8) & 0xff; + + /* Ok, here we need some tweak. If we have already renumbered + * all busses, we can't rely on the OF bus number any more. + * the pci_to_OF_bus_map is not enough as several PCI busses + * may match the same OF bus number.
+ */ + if (!pci_to_OF_bus_map) + return 0; + + for_each_pci_dev(dev) + if (pci_to_OF_bus_map[dev->bus->number] == *bus && + dev->devfn == *devfn) { + *bus = dev->bus->number; + pci_dev_put(dev); + return 0; + } + + return -ENODEV; +} +EXPORT_SYMBOL(pci_device_from_OF_node); + +/* We create the "pci-OF-bus-map" property now so it appears in the + * /proc device tree + */ +void __init +pci_create_OF_bus_map(void) +{ + struct property* of_prop; + struct device_node *dn; + + of_prop = (struct property*) alloc_bootmem(sizeof(struct property) + 256); + if (!of_prop) + return; + dn = of_find_node_by_path("/"); + if (dn) { + memset(of_prop, -1, sizeof(struct property) + 256); + of_prop->name = "pci-OF-bus-map"; + of_prop->length = 256; + of_prop->value = &of_prop[1]; + of_add_property(dn, of_prop); + of_node_put(dn); + } +} + +void pcibios_setup_phb_io_space(struct pci_controller *hose) +{ + unsigned long io_offset; + struct resource *res = &hose->io_resource; + + /* Fixup IO space offset */ + io_offset = pcibios_io_space_offset(hose); + res->start += io_offset; + res->end += io_offset; +} + +static int __init pcibios_init(void) +{ + struct pci_controller *hose, *tmp; + int next_busno = 0; + + printk(KERN_INFO "PCI: Probing PCI hardware\n"); + + if (pci_has_flag(PCI_REASSIGN_ALL_BUS)) + pci_assign_all_buses = 1; + + /* Scan all of the recorded PCI controllers. */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + if (pci_assign_all_buses) + hose->first_busno = next_busno; + hose->last_busno = 0xff; + pcibios_scan_phb(hose); + pci_bus_add_devices(hose->bus); + if (pci_assign_all_buses || next_busno <= hose->last_busno) + next_busno = hose->last_busno + pcibios_assign_bus_offset; + } + pci_bus_count = next_busno; + + /* OpenFirmware based machines need a map of OF bus + * numbers vs. kernel bus numbers since we may have to + * remap them. 
+ */ + if (pci_assign_all_buses) + pcibios_make_OF_bus_map(); + + /* Call common code to handle resource allocation */ + pcibios_resource_survey(); + + /* Call machine dependent post-init code */ + if (ppc_md.pcibios_after_init) + ppc_md.pcibios_after_init(); + + return 0; +} + +subsys_initcall(pcibios_init); + +static struct pci_controller* +pci_bus_to_hose(int bus) +{ + struct pci_controller *hose, *tmp; + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + if (bus >= hose->first_busno && bus <= hose->last_busno) + return hose; + return NULL; +} + +/* Provide information on locations of various I/O regions in physical + * memory. Do this on a per-card basis so that we choose the right + * root bridge. + * Note that the returned IO or memory base is a physical address + */ + +long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn) +{ + struct pci_controller* hose; + long result = -EOPNOTSUPP; + + hose = pci_bus_to_hose(bus); + if (!hose) + return -ENODEV; + + switch (which) { + case IOBASE_BRIDGE_NUMBER: + return (long)hose->first_busno; + case IOBASE_MEMORY: + return (long)hose->mem_offset[0]; + case IOBASE_IO: + return (long)hose->io_base_phys; + case IOBASE_ISA_IO: + return (long)isa_io_base; + case IOBASE_ISA_MEM: + return (long)isa_mem_base; + } + + return result; +} + + diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c new file mode 100644 index 00000000000..155013da27e --- /dev/null +++ b/arch/powerpc/kernel/pci_64.c @@ -0,0 +1,278 @@ +/* + * Port for PPC64 David Engebretsen, IBM Corp. + * Contains common pci routines for ppc64 platform, pSeries and iSeries brands. + * + * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM + * Rework, based on alpha PCI code. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/list.h> +#include <linux/syscalls.h> +#include <linux/irq.h> +#include <linux/vmalloc.h> + +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/byteorder.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> + +/* pci_io_base -- the base address from which io bars are offsets. + * This is the lowest I/O base address (so bar values are always positive), + * and it *must* be the start of ISA space if an ISA bus exists because + * ISA drivers use hard coded offsets. If no ISA bus exists nothing + * is mapped on the first 64K of IO space + */ +unsigned long pci_io_base = ISA_IO_BASE; +EXPORT_SYMBOL(pci_io_base); + +static int __init pcibios_init(void) +{ + struct pci_controller *hose, *tmp; + + printk(KERN_INFO "PCI: Probing PCI hardware\n"); + + /* For now, override phys_mem_access_prot. If we need it + * later, we may move that initialization to each ppc_md + */ + ppc_md.phys_mem_access_prot = pci_phys_mem_access_prot; + + /* On ppc64, we always enable PCI domains and we keep domain 0 + * backward compatible in /proc for video cards + */ + pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0); + + /* Scan all of the recorded PCI controllers.
*/ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + pcibios_scan_phb(hose); + pci_bus_add_devices(hose->bus); + } + + /* Call common code to handle resource allocation */ + pcibios_resource_survey(); + + printk(KERN_DEBUG "PCI: Probing PCI hardware done\n"); + + return 0; +} + +subsys_initcall(pcibios_init); + +int pcibios_unmap_io_space(struct pci_bus *bus) +{ + struct pci_controller *hose; + + WARN_ON(bus == NULL); + + /* If this is not a PHB, we only flush the hash table over + * the area mapped by this bridge. We don't play with the PTE + * mappings since we might have to deal with sub-page alignments + * so flushing the hash table is the only sane way to make sure + * that no hash entries are covering that removed bridge area + * while still allowing other busses overlapping those pages + * + * Note: If we ever support P2P hotplug on Book3E, we'll have + * to do an appropriate TLB flush here too + */ + if (bus->self) { +#ifdef CONFIG_PPC_STD_MMU_64 + struct resource *res = bus->resource[0]; +#endif + + pr_debug("IO unmapping for PCI-PCI bridge %s\n", + pci_name(bus->self)); + +#ifdef CONFIG_PPC_STD_MMU_64 + __flush_hash_table_range(&init_mm, res->start + _IO_BASE, + res->end + _IO_BASE + 1); +#endif + return 0; + } + + /* Get the host bridge */ + hose = pci_bus_to_host(bus); + + /* Check if we have IOs allocated */ + if (hose->io_base_alloc == NULL) + return 0; + + pr_debug("IO unmapping for PHB %s\n", hose->dn->full_name); + pr_debug(" alloc=0x%p\n", hose->io_base_alloc); + + /* This is a PHB, we fully unmap the IO area */ + vunmap(hose->io_base_alloc); + + return 0; +} +EXPORT_SYMBOL_GPL(pcibios_unmap_io_space); + +static int pcibios_map_phb_io_space(struct pci_controller *hose) +{ + struct vm_struct *area; + unsigned long phys_page; + unsigned long size_page; + unsigned long io_virt_offset; + + phys_page = _ALIGN_DOWN(hose->io_base_phys, PAGE_SIZE); + size_page = _ALIGN_UP(hose->pci_io_size, PAGE_SIZE); + + /* Make sure IO area address is
clear */ + hose->io_base_alloc = NULL; + + /* If there's no IO to map on that bus, get away too */ + if (hose->pci_io_size == 0 || hose->io_base_phys == 0) + return 0; + + /* Let's allocate some IO space for that guy. We don't pass + * VM_IOREMAP because we don't care about alignment tricks that + * the core does in that case. Maybe we should due to stupid card + * with incomplete address decoding but I'd rather not deal with + * those outside of the reserved 64K legacy region. + */ + area = __get_vm_area(size_page, 0, PHB_IO_BASE, PHB_IO_END); + if (area == NULL) + return -ENOMEM; + hose->io_base_alloc = area->addr; + hose->io_base_virt = (void __iomem *)(area->addr + + hose->io_base_phys - phys_page); + + pr_debug("IO mapping for PHB %s\n", hose->dn->full_name); + pr_debug(" phys=0x%016llx, virt=0x%p (alloc=0x%p)\n", + hose->io_base_phys, hose->io_base_virt, hose->io_base_alloc); + pr_debug(" size=0x%016llx (alloc=0x%016lx)\n", + hose->pci_io_size, size_page); + + /* Establish the mapping */ + if (__ioremap_at(phys_page, area->addr, size_page, + _PAGE_NO_CACHE | _PAGE_GUARDED) == NULL) + return -ENOMEM; + + /* Fixup hose IO resource */ + io_virt_offset = pcibios_io_space_offset(hose); + hose->io_resource.start += io_virt_offset; + hose->io_resource.end += io_virt_offset; + + pr_debug(" hose->io_resource=%pR\n", &hose->io_resource); + + return 0; +} + +int pcibios_map_io_space(struct pci_bus *bus) +{ + WARN_ON(bus == NULL); + + /* If this not a PHB, nothing to do, page tables still exist and + * thus HPTEs will be faulted in when needed + */ + if (bus->self) { + pr_debug("IO mapping for PCI-PCI bridge %s\n", + pci_name(bus->self)); + pr_debug(" virt=0x%016llx...0x%016llx\n", + bus->resource[0]->start + _IO_BASE, + bus->resource[0]->end + _IO_BASE); + return 0; + } + + return pcibios_map_phb_io_space(pci_bus_to_host(bus)); +} +EXPORT_SYMBOL_GPL(pcibios_map_io_space); + +void pcibios_setup_phb_io_space(struct pci_controller *hose) +{ + 
pcibios_map_phb_io_space(hose); +} + +#define IOBASE_BRIDGE_NUMBER 0 +#define IOBASE_MEMORY 1 +#define IOBASE_IO 2 +#define IOBASE_ISA_IO 3 +#define IOBASE_ISA_MEM 4 + +long sys_pciconfig_iobase(long which, unsigned long in_bus, + unsigned long in_devfn) +{ + struct pci_controller* hose; + struct pci_bus *tmp_bus, *bus = NULL; + struct device_node *hose_node; + + /* Argh ! Please forgive me for that hack, but that's the + * simplest way to get existing XFree to not lockup on some + * G5 machines... So when something asks for bus 0 io base + * (bus 0 is HT root), we return the AGP one instead. + */ + if (in_bus == 0 && of_machine_is_compatible("MacRISC4")) { + struct device_node *agp; + + agp = of_find_compatible_node(NULL, NULL, "u3-agp"); + if (agp) + in_bus = 0xf0; + of_node_put(agp); + } + + /* That syscall isn't quite compatible with PCI domains, but it's + * used on pre-domains setup. We return the first match + */ + + list_for_each_entry(tmp_bus, &pci_root_buses, node) { + if (in_bus >= tmp_bus->number && + in_bus <= tmp_bus->busn_res.end) { + bus = tmp_bus; + break; + } + } + if (bus == NULL || bus->dev.of_node == NULL) + return -ENODEV; + + hose_node = bus->dev.of_node; + hose = PCI_DN(hose_node)->phb; + + switch (which) { + case IOBASE_BRIDGE_NUMBER: + return (long)hose->first_busno; + case IOBASE_MEMORY: + return (long)hose->mem_offset[0]; + case IOBASE_IO: + return (long)hose->io_base_phys; + case IOBASE_ISA_IO: + return (long)isa_io_base; + case IOBASE_ISA_MEM: + return -EINVAL; + } + + return -EOPNOTSUPP; +} + +#ifdef CONFIG_NUMA +int pcibus_to_node(struct pci_bus *bus) +{ + struct pci_controller *phb = pci_bus_to_host(bus); + return phb->node; +} +EXPORT_SYMBOL(pcibus_to_node); +#endif + +static void quirk_radeon_32bit_msi(struct pci_dev *dev) +{ + struct pci_dn *pdn = pci_get_pdn(dev); + + if (pdn) + pdn->force_32bit_msi = true; +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x68f2, quirk_radeon_32bit_msi); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 
0xaa68, quirk_radeon_32bit_msi); diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c new file mode 100644 index 00000000000..1f61fab59d9 --- /dev/null +++ b/arch/powerpc/kernel/pci_dn.c @@ -0,0 +1,175 @@ +/* + * pci_dn.c + * + * Copyright (C) 2001 Todd Inglett, IBM Corporation + * + * PCI manipulation via device_nodes. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/gfp.h> + +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> +#include <asm/firmware.h> + +struct pci_dn *pci_get_pdn(struct pci_dev *pdev) +{ + struct device_node *dn = pci_device_to_OF_node(pdev); + if (!dn) + return NULL; + return PCI_DN(dn); +} + +/* + * Traverse_func that inits the PCI fields of the device node. + * NOTE: this *must* be done before read/write config to the device. 
+ */ +void *update_dn_pci_info(struct device_node *dn, void *data) +{ + struct pci_controller *phb = data; + const __be32 *type = of_get_property(dn, "ibm,pci-config-space-type", NULL); + const __be32 *regs; + struct pci_dn *pdn; + + pdn = zalloc_maybe_bootmem(sizeof(*pdn), GFP_KERNEL); + if (pdn == NULL) + return NULL; + dn->data = pdn; + pdn->node = dn; + pdn->phb = phb; +#ifdef CONFIG_PPC_POWERNV + pdn->pe_number = IODA_INVALID_PE; +#endif + regs = of_get_property(dn, "reg", NULL); + if (regs) { + u32 addr = of_read_number(regs, 1); + + /* First register entry is addr (00BBSS00) */ + pdn->busno = (addr >> 16) & 0xff; + pdn->devfn = (addr >> 8) & 0xff; + } + + pdn->pci_ext_config_space = (type && of_read_number(type, 1) == 1); + return NULL; +} + +/* + * Traverse a device tree stopping each PCI device in the tree. + * This is done depth first. As each node is processed, a "pre" + * function is called and the children are processed recursively. + * + * The "pre" func returns a value. If non-zero is returned from + * the "pre" func, the traversal stops and this value is returned. + * This return value is useful when using traverse as a method of + * finding a device. + * + * NOTE: we do not run the func for devices that do not appear to + * be PCI except for the start node which we assume (this is good + * because the start node is often a phb which may be missing PCI + * properties). + * We use the class-code as an indicator. If we run into + * one of these nodes we also assume its siblings are non-pci for + * performance. 
+ */ +void *traverse_pci_devices(struct device_node *start, traverse_func pre, + void *data) +{ + struct device_node *dn, *nextdn; + void *ret; + + /* We started with a phb, iterate all children */ + for (dn = start->child; dn; dn = nextdn) { + const __be32 *classp; + u32 class = 0; + + nextdn = NULL; + classp = of_get_property(dn, "class-code", NULL); + if (classp) + class = of_read_number(classp, 1); + + if (pre && ((ret = pre(dn, data)) != NULL)) + return ret; + + /* If we are a PCI bridge, go down */ + if (dn->child && ((class >> 8) == PCI_CLASS_BRIDGE_PCI || + (class >> 8) == PCI_CLASS_BRIDGE_CARDBUS)) + /* Depth first...do children */ + nextdn = dn->child; + else if (dn->sibling) + /* ok, try next sibling instead. */ + nextdn = dn->sibling; + if (!nextdn) { + /* Walk up to next valid sibling. */ + do { + dn = dn->parent; + if (dn == start) + return NULL; + } while (dn->sibling == NULL); + nextdn = dn->sibling; + } + } + return NULL; +} + +/** + * pci_devs_phb_init_dynamic - setup pci devices under this PHB + * @phb: pci-to-host bridge (top-level bridge connecting to cpu) + * + * This routine is called both during boot, (before the memory + * subsystem is set up, before kmalloc is valid) and during the + * dynamic lpar operation of adding a PHB to a running system. + */ +void pci_devs_phb_init_dynamic(struct pci_controller *phb) +{ + struct device_node *dn = phb->dn; + struct pci_dn *pdn; + + /* PHB nodes themselves must not match */ + update_dn_pci_info(dn, phb); + pdn = dn->data; + if (pdn) { + pdn->devfn = pdn->busno = -1; + pdn->phb = phb; + } + + /* Update dn->phb ptrs for new phb and children devices */ + traverse_pci_devices(dn, update_dn_pci_info, phb); +} + +/** + * pci_devs_phb_init - Initialize phbs and pci devs under them. + * + * This routine walks over all phb's (pci-host bridges) on the + * system, and sets up assorted pci-related structures + * (including pci info in the device node structs) for each + * pci device found underneath.
This routine runs once, + * early in the boot sequence. + */ +void __init pci_devs_phb_init(void) +{ + struct pci_controller *phb, *tmp; + + /* This must be done first so the device nodes have valid pci info! */ + list_for_each_entry_safe(phb, tmp, &hose_list, list_node) + pci_devs_phb_init_dynamic(phb); +} diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c new file mode 100644 index 00000000000..44562aa97f1 --- /dev/null +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -0,0 +1,404 @@ +/* + * Helper routines to scan the device tree for PCI devices and busses + * + * Migrated out of PowerPC architecture pci_64.c file by Grant Likely + * <grant.likely@secretlab.ca> so that these routines are available for + * 32 bit also. + * + * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM + * Rework, based on alpha PCI code. + * Copyright (c) 2009 Secret Lab Technologies Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/pci.h> +#include <linux/export.h> +#include <asm/pci-bridge.h> +#include <asm/prom.h> + +/** + * get_int_prop - Decode a u32 from a device tree property + */ +static u32 get_int_prop(struct device_node *np, const char *name, u32 def) +{ + const __be32 *prop; + int len; + + prop = of_get_property(np, name, &len); + if (prop && len >= 4) + return of_read_number(prop, 1); + return def; +} + +/** + * pci_parse_of_flags - Parse the flags cell of a device tree PCI address + * @addr0: value of 1st cell of a device tree PCI address. 
+ * @bridge: Set this flag if the address is from a bridge 'ranges' property + */ +unsigned int pci_parse_of_flags(u32 addr0, int bridge) +{ + unsigned int flags = 0; + + if (addr0 & 0x02000000) { + flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY; + flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64; + flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M; + if (addr0 & 0x40000000) + flags |= IORESOURCE_PREFETCH + | PCI_BASE_ADDRESS_MEM_PREFETCH; + /* Note: We don't know whether the ROM has been left enabled + * by the firmware or not. We mark it as disabled (ie, we do + * not set the IORESOURCE_ROM_ENABLE flag) for now rather than + * do a config space read, it will be force-enabled if needed + */ + if (!bridge && (addr0 & 0xff) == 0x30) + flags |= IORESOURCE_READONLY; + } else if (addr0 & 0x01000000) + flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO; + if (flags) + flags |= IORESOURCE_SIZEALIGN; + return flags; +} + +/** + * of_pci_parse_addrs - Parse PCI addresses assigned in the device tree node + * @node: device tree node for the PCI device + * @dev: pci_dev structure for the device + * + * This function parses the 'assigned-addresses' property of a PCI devices' + * device tree node and writes them into the associated pci_dev structure. 
+ */ +static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev) +{ + u64 base, size; + unsigned int flags; + struct pci_bus_region region; + struct resource *res; + const __be32 *addrs; + u32 i; + int proplen; + + addrs = of_get_property(node, "assigned-addresses", &proplen); + if (!addrs) + return; + pr_debug(" parse addresses (%d bytes) @ %p\n", proplen, addrs); + for (; proplen >= 20; proplen -= 20, addrs += 5) { + flags = pci_parse_of_flags(of_read_number(addrs, 1), 0); + if (!flags) + continue; + base = of_read_number(&addrs[1], 2); + size = of_read_number(&addrs[3], 2); + if (!size) + continue; + i = of_read_number(addrs, 1) & 0xff; + pr_debug(" base: %llx, size: %llx, i: %x\n", + (unsigned long long)base, + (unsigned long long)size, i); + + if (PCI_BASE_ADDRESS_0 <= i && i <= PCI_BASE_ADDRESS_5) { + res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2]; + } else if (i == dev->rom_base_reg) { + res = &dev->resource[PCI_ROM_RESOURCE]; + flags |= IORESOURCE_READONLY | IORESOURCE_CACHEABLE; + } else { + printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i); + continue; + } + res->flags = flags; + res->name = pci_name(dev); + region.start = base; + region.end = base + size - 1; + pcibios_bus_to_resource(dev->bus, res, &region); + } +} + +/** + * of_create_pci_dev - Given a device tree node on a pci bus, create a pci_dev + * @node: device tree node pointer + * @bus: bus the device is sitting on + * @devfn: PCI function number, extracted from device tree by caller.
+ */ +struct pci_dev *of_create_pci_dev(struct device_node *node, + struct pci_bus *bus, int devfn) +{ + struct pci_dev *dev; + const char *type; + struct pci_slot *slot; + + dev = pci_alloc_dev(bus); + if (!dev) + return NULL; + type = of_get_property(node, "device_type", NULL); + if (type == NULL) + type = ""; + + pr_debug(" create device, devfn: %x, type: %s\n", devfn, type); + + dev->dev.of_node = of_node_get(node); + dev->dev.parent = bus->bridge; + dev->dev.bus = &pci_bus_type; + dev->devfn = devfn; + dev->multifunction = 0; /* maybe a lie? */ + dev->needs_freset = 0; /* pcie fundamental reset required */ + set_pcie_port_type(dev); + + list_for_each_entry(slot, &dev->bus->slots, list) + if (PCI_SLOT(dev->devfn) == slot->number) + dev->slot = slot; + + dev->vendor = get_int_prop(node, "vendor-id", 0xffff); + dev->device = get_int_prop(node, "device-id", 0xffff); + dev->subsystem_vendor = get_int_prop(node, "subsystem-vendor-id", 0); + dev->subsystem_device = get_int_prop(node, "subsystem-id", 0); + + dev->cfg_size = pci_cfg_space_size(dev); + + dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(bus), + dev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn)); + dev->class = get_int_prop(node, "class-code", 0); + dev->revision = get_int_prop(node, "revision-id", 0); + + pr_debug(" class: 0x%x\n", dev->class); + pr_debug(" revision: 0x%x\n", dev->revision); + + dev->current_state = PCI_UNKNOWN; /* unknown power state */ + dev->error_state = pci_channel_io_normal; + dev->dma_mask = 0xffffffff; + + /* Early fixups, before probing the BARs */ + pci_fixup_device(pci_fixup_early, dev); + + if (!strcmp(type, "pci") || !strcmp(type, "pciex")) { + /* a PCI-PCI bridge */ + dev->hdr_type = PCI_HEADER_TYPE_BRIDGE; + dev->rom_base_reg = PCI_ROM_ADDRESS1; + set_pcie_hotplug_bridge(dev); + } else if (!strcmp(type, "cardbus")) { + dev->hdr_type = PCI_HEADER_TYPE_CARDBUS; + } else { + dev->hdr_type = PCI_HEADER_TYPE_NORMAL; + dev->rom_base_reg = PCI_ROM_ADDRESS; + /* Maybe 
do a default OF mapping here */ + dev->irq = NO_IRQ; + } + + of_pci_parse_addrs(node, dev); + + pr_debug(" adding to system ...\n"); + + pci_device_add(dev, bus); + + return dev; +} +EXPORT_SYMBOL(of_create_pci_dev); + +/** + * of_scan_pci_bridge - Set up a PCI bridge and scan for child nodes + * @dev: pci_dev structure for the bridge + * + * of_scan_bus() calls this routine for each PCI bridge that it finds, and + * this routine in turn calls of_scan_bus() recursively to scan for more child + * devices. + */ +void of_scan_pci_bridge(struct pci_dev *dev) +{ + struct device_node *node = dev->dev.of_node; + struct pci_bus *bus; + const __be32 *busrange, *ranges; + int len, i, mode; + struct pci_bus_region region; + struct resource *res; + unsigned int flags; + u64 size; + + pr_debug("of_scan_pci_bridge(%s)\n", node->full_name); + + /* parse bus-range property */ + busrange = of_get_property(node, "bus-range", &len); + if (busrange == NULL || len != 8) { + printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %s\n", + node->full_name); + return; + } + ranges = of_get_property(node, "ranges", &len); + if (ranges == NULL) { + printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %s\n", + node->full_name); + return; + } + + bus = pci_find_bus(pci_domain_nr(dev->bus), + of_read_number(busrange, 1)); + if (!bus) { + bus = pci_add_new_bus(dev->bus, dev, + of_read_number(busrange, 1)); + if (!bus) { + printk(KERN_ERR "Failed to create pci bus for %s\n", + node->full_name); + return; + } + } + + bus->primary = dev->bus->number; + pci_bus_insert_busn_res(bus, of_read_number(busrange, 1), + of_read_number(busrange+1, 1)); + bus->bridge_ctl = 0; + + /* parse ranges property */ + /* PCI #address-cells == 3 and #size-cells == 2 always */ + res = &dev->resource[PCI_BRIDGE_RESOURCES]; + for (i = 0; i < PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES; ++i) { + res->flags = 0; + bus->resource[i] = res; + ++res; + } + i = 1; + for (; len >= 32; len -= 32, ranges += 8) { + flags = 
pci_parse_of_flags(of_read_number(ranges, 1), 1); + size = of_read_number(&ranges[6], 2); + if (flags == 0 || size == 0) + continue; + if (flags & IORESOURCE_IO) { + res = bus->resource[0]; + if (res->flags) { + printk(KERN_ERR "PCI: ignoring extra I/O range" + " for bridge %s\n", node->full_name); + continue; + } + } else { + if (i >= PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES) { + printk(KERN_ERR "PCI: too many memory ranges" + " for bridge %s\n", node->full_name); + continue; + } + res = bus->resource[i]; + ++i; + } + res->flags = flags; + region.start = of_read_number(&ranges[1], 2); + region.end = region.start + size - 1; + pcibios_bus_to_resource(dev->bus, res, &region); + } + sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus), + bus->number); + pr_debug(" bus name: %s\n", bus->name); + + mode = PCI_PROBE_NORMAL; + if (ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + pr_debug(" probe mode: %d\n", mode); + + if (mode == PCI_PROBE_DEVTREE) + of_scan_bus(node, bus); + else if (mode == PCI_PROBE_NORMAL) + pci_scan_child_bus(bus); +} +EXPORT_SYMBOL(of_scan_pci_bridge); + +static struct pci_dev *of_scan_pci_dev(struct pci_bus *bus, + struct device_node *dn) +{ + struct pci_dev *dev = NULL; + const __be32 *reg; + int reglen, devfn; +#ifdef CONFIG_EEH + struct eeh_dev *edev = of_node_to_eeh_dev(dn); +#endif + + pr_debug(" * %s\n", dn->full_name); + if (!of_device_is_available(dn)) + return NULL; + + reg = of_get_property(dn, "reg", &reglen); + if (reg == NULL || reglen < 20) + return NULL; + devfn = (of_read_number(reg, 1) >> 8) & 0xff; + + /* Check if the PCI device is already there */ + dev = pci_get_slot(bus, devfn); + if (dev) { + pci_dev_put(dev); + return dev; + } + + /* Device removed permanently ? 
*/ +#ifdef CONFIG_EEH + if (edev && (edev->mode & EEH_DEV_REMOVED)) + return NULL; +#endif + + /* create a new pci_dev for this device */ + dev = of_create_pci_dev(dn, bus, devfn); + if (!dev) + return NULL; + + pr_debug(" dev header type: %x\n", dev->hdr_type); + return dev; +} + +/** + * __of_scan_bus - given a PCI bus node, setup bus and scan for child devices + * @node: device tree node for the PCI bus + * @bus: pci_bus structure for the PCI bus + * @rescan_existing: Flag indicating bus has already been set up + */ +static void __of_scan_bus(struct device_node *node, struct pci_bus *bus, + int rescan_existing) +{ + struct device_node *child; + struct pci_dev *dev; + + pr_debug("of_scan_bus(%s) bus no %d...\n", + node->full_name, bus->number); + + /* Scan direct children */ + for_each_child_of_node(node, child) { + dev = of_scan_pci_dev(bus, child); + if (!dev) + continue; + pr_debug(" dev header type: %x\n", dev->hdr_type); + } + + /* Apply all fixups necessary. We don't fixup the bus "self" + * for an existing bridge that is being rescanned + */ + if (!rescan_existing) + pcibios_setup_bus_self(bus); + pcibios_setup_bus_devices(bus); + + /* Now scan child busses */ + list_for_each_entry(dev, &bus->devices, bus_list) { + if (pci_is_bridge(dev)) { + of_scan_pci_bridge(dev); + } + } +} + +/** + * of_scan_bus - given a PCI bus node, setup bus and scan for child devices + * @node: device tree node for the PCI bus + * @bus: pci_bus structure for the PCI bus + */ +void of_scan_bus(struct device_node *node, struct pci_bus *bus) +{ + __of_scan_bus(node, bus, 0); +} +EXPORT_SYMBOL_GPL(of_scan_bus); + +/** + * of_rescan_bus - given a PCI bus node, scan for child devices + * @node: device tree node for the PCI bus + * @bus: pci_bus structure for the PCI bus + * + * Same as of_scan_bus, but for a pci_bus structure that has already been + * setup. 
+ */ +void of_rescan_bus(struct device_node *node, struct pci_bus *bus) +{ + __of_scan_bus(node, bus, 1); +} +EXPORT_SYMBOL_GPL(of_rescan_bus); + diff --git a/arch/powerpc/kernel/pmc.c b/arch/powerpc/kernel/pmc.c index 2d333cc8408..58eaa3ddf7b 100644 --- a/arch/powerpc/kernel/pmc.c +++ b/arch/powerpc/kernel/pmc.c @@ -12,43 +12,33 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/config.h> #include <linux/errno.h> +#include <linux/bug.h> #include <linux/spinlock.h> -#include <linux/module.h> +#include <linux/export.h> #include <asm/processor.h> +#include <asm/cputable.h> #include <asm/pmc.h> -#if defined(CONFIG_FSL_BOOKE) && !defined(CONFIG_E200) -static void dummy_perf(struct pt_regs *regs) -{ - unsigned int pmgc0 = mfpmr(PMRN_PMGC0); - - pmgc0 &= ~PMGC0_PMIE; - mtpmr(PMRN_PMGC0, pmgc0); -} -#elif defined(CONFIG_PPC64) || defined(CONFIG_6xx) - #ifndef MMCR0_PMAO #define MMCR0_PMAO 0 #endif -/* Ensure exceptions are disabled */ static void dummy_perf(struct pt_regs *regs) { - unsigned int mmcr0 = mfspr(SPRN_MMCR0); - - mmcr0 &= ~(MMCR0_PMXE|MMCR0_PMAO); - mtspr(SPRN_MMCR0, mmcr0); -} +#if defined(CONFIG_FSL_EMB_PERFMON) + mtpmr(PMRN_PMGC0, mfpmr(PMRN_PMGC0) & ~PMGC0_PMIE); +#elif defined(CONFIG_PPC64) || defined(CONFIG_6xx) + if (cur_cpu_spec->pmc_type == PPC_PMC_IBM) + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~(MMCR0_PMXE|MMCR0_PMAO)); #else -static void dummy_perf(struct pt_regs *regs) -{ -} + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMXE); #endif +} + -static DEFINE_SPINLOCK(pmc_owner_lock); +static DEFINE_RAW_SPINLOCK(pmc_owner_lock); static void *pmc_owner_caller; /* mostly for debugging */ perf_irq_t perf_irq = dummy_perf; @@ -56,7 +46,7 @@ int reserve_pmc_hardware(perf_irq_t new_perf_irq) { int err = 0; - spin_lock(&pmc_owner_lock); + raw_spin_lock(&pmc_owner_lock); if (pmc_owner_caller) { printk(KERN_WARNING "reserve_pmc_hardware: " @@ -67,24 +57,24 @@ int reserve_pmc_hardware(perf_irq_t new_perf_irq) } pmc_owner_caller = 
__builtin_return_address(0); - perf_irq = new_perf_irq ? : dummy_perf; + perf_irq = new_perf_irq ? new_perf_irq : dummy_perf; out: - spin_unlock(&pmc_owner_lock); + raw_spin_unlock(&pmc_owner_lock); return err; } EXPORT_SYMBOL_GPL(reserve_pmc_hardware); void release_pmc_hardware(void) { - spin_lock(&pmc_owner_lock); + raw_spin_lock(&pmc_owner_lock); WARN_ON(! pmc_owner_caller); pmc_owner_caller = NULL; perf_irq = dummy_perf; - spin_unlock(&pmc_owner_lock); + raw_spin_unlock(&pmc_owner_lock); } EXPORT_SYMBOL_GPL(release_pmc_hardware); diff --git a/arch/powerpc/kernel/ppc32.h b/arch/powerpc/kernel/ppc32.h new file mode 100644 index 00000000000..a27c914d580 --- /dev/null +++ b/arch/powerpc/kernel/ppc32.h @@ -0,0 +1,64 @@ +#ifndef _PPC64_PPC32_H +#define _PPC64_PPC32_H + +#include <linux/compat.h> +#include <asm/siginfo.h> +#include <asm/signal.h> + +/* + * Data types and macros for providing 32b PowerPC support. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* These are here to support 32-bit syscalls on a 64-bit kernel. */ + +struct pt_regs32 { + unsigned int gpr[32]; + unsigned int nip; + unsigned int msr; + unsigned int orig_gpr3; /* Used for restarting system calls */ + unsigned int ctr; + unsigned int link; + unsigned int xer; + unsigned int ccr; + unsigned int mq; /* 601 only (not used at present) */ + unsigned int trap; /* Reason for being here */ + unsigned int dar; /* Fault registers */ + unsigned int dsisr; + unsigned int result; /* Result of a system call */ +}; + +struct sigcontext32 { + unsigned int _unused[4]; + int signal; + compat_uptr_t handler; + unsigned int oldmask; + compat_uptr_t regs; /* 4 byte pointer to the pt_regs32 structure. 
*/ +}; + +struct mcontext32 { + elf_gregset_t32 mc_gregs; + elf_fpregset_t mc_fregs; + unsigned int mc_pad[2]; + elf_vrregset_t32 mc_vregs __attribute__((__aligned__(16))); + elf_vsrreghalf_t32 mc_vsregs __attribute__((__aligned__(16))); +}; + +struct ucontext32 { + unsigned int uc_flags; + unsigned int uc_link; + compat_stack_t uc_stack; + int uc_pad[7]; + compat_uptr_t uc_regs; /* points to uc_mcontext field */ + compat_sigset_t uc_sigmask; /* mask last for extensibility */ + /* glibc has 1024-bit signal masks, ours are 64-bit */ + int uc_maskext[30]; + int uc_pad2[3]; + struct mcontext32 uc_mcontext; +}; + +#endif /* _PPC64_PPC32_H */ diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 8bc540337ba..48d17d6fca5 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -1,28 +1,24 @@ -#include <linux/config.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/threads.h> #include <linux/smp.h> #include <linux/sched.h> #include <linux/elfcore.h> #include <linux/string.h> #include <linux/interrupt.h> -#include <linux/tty.h> +#include <linux/screen_info.h> #include <linux/vt_kern.h> #include <linux/nvram.h> -#include <linux/console.h> #include <linux/irq.h> #include <linux/pci.h> #include <linux/delay.h> -#include <linux/ide.h> #include <linux/bitops.h> #include <asm/page.h> -#include <asm/semaphore.h> #include <asm/processor.h> +#include <asm/cacheflush.h> #include <asm/uaccess.h> #include <asm/io.h> -#include <asm/ide.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/checksum.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -30,7 +26,6 @@ #include <linux/cuda.h> #include <linux/pmu.h> #include <asm/prom.h> -#include <asm/system.h> #include <asm/pci-bridge.h> #include <asm/irq.h> #include <asm/pmac_feature.h> @@ -44,10 +39,11 @@ #include <asm/cputable.h> #include <asm/btext.h> #include <asm/div64.h> - -#ifdef CONFIG_8xx -#include <asm/commproc.h> -#endif 
+#include <asm/signal.h> +#include <asm/dcr.h> +#include <asm/ftrace.h> +#include <asm/switch_to.h> +#include <asm/epapr_hcalls.h> #ifdef CONFIG_PPC32 extern void transfer_to_handler(void); @@ -56,17 +52,13 @@ extern void machine_check_exception(struct pt_regs *regs); extern void alignment_exception(struct pt_regs *regs); extern void program_check_exception(struct pt_regs *regs); extern void single_step_exception(struct pt_regs *regs); -extern int do_signal(sigset_t *, struct pt_regs *); -extern int pmac_newworld; extern int sys_sigreturn(struct pt_regs *regs); EXPORT_SYMBOL(clear_pages); EXPORT_SYMBOL(ISA_DMA_THRESHOLD); EXPORT_SYMBOL(DMA_MODE_READ); EXPORT_SYMBOL(DMA_MODE_WRITE); -EXPORT_SYMBOL(__div64_32); -EXPORT_SYMBOL(do_signal); EXPORT_SYMBOL(transfer_to_handler); EXPORT_SYMBOL(do_IRQ); EXPORT_SYMBOL(machine_check_exception); @@ -76,111 +68,61 @@ EXPORT_SYMBOL(single_step_exception); EXPORT_SYMBOL(sys_sigreturn); #endif -#if defined(CONFIG_PPC_PREP) -EXPORT_SYMBOL(_prep_type); -EXPORT_SYMBOL(ucSystemType); +#ifdef CONFIG_FUNCTION_TRACER +EXPORT_SYMBOL(_mcount); #endif -#if !defined(__INLINE_BITOPS) -EXPORT_SYMBOL(set_bit); -EXPORT_SYMBOL(clear_bit); -EXPORT_SYMBOL(change_bit); -EXPORT_SYMBOL(test_and_set_bit); -EXPORT_SYMBOL(test_and_clear_bit); -EXPORT_SYMBOL(test_and_change_bit); -#endif /* __INLINE_BITOPS */ - EXPORT_SYMBOL(strcpy); EXPORT_SYMBOL(strncpy); EXPORT_SYMBOL(strcat); -EXPORT_SYMBOL(strncat); -EXPORT_SYMBOL(strchr); -EXPORT_SYMBOL(strrchr); -EXPORT_SYMBOL(strpbrk); -EXPORT_SYMBOL(strstr); EXPORT_SYMBOL(strlen); -EXPORT_SYMBOL(strnlen); EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strncmp); -EXPORT_SYMBOL(strcasecmp); +#ifndef CONFIG_GENERIC_CSUM EXPORT_SYMBOL(csum_partial); EXPORT_SYMBOL(csum_partial_copy_generic); EXPORT_SYMBOL(ip_fast_csum); EXPORT_SYMBOL(csum_tcpudp_magic); +#endif EXPORT_SYMBOL(__copy_tofrom_user); EXPORT_SYMBOL(__clear_user); -EXPORT_SYMBOL(__strncpy_from_user); -EXPORT_SYMBOL(__strnlen_user); - -EXPORT_SYMBOL(_insb); 
-EXPORT_SYMBOL(_outsb); -EXPORT_SYMBOL(_insw); -EXPORT_SYMBOL(_outsw); -EXPORT_SYMBOL(_insl); -EXPORT_SYMBOL(_outsl); -EXPORT_SYMBOL(_insw_ns); -EXPORT_SYMBOL(_outsw_ns); -EXPORT_SYMBOL(_insl_ns); -EXPORT_SYMBOL(_outsl_ns); -EXPORT_SYMBOL(ioremap); -#ifdef CONFIG_44x -EXPORT_SYMBOL(ioremap64); -#endif -EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(iounmap); -#ifdef CONFIG_PPC32 -EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ -#endif - -#if defined(CONFIG_PPC32) && (defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_IDE_MODULE)) -EXPORT_SYMBOL(ppc_ide_md); -#endif +EXPORT_SYMBOL(copy_page); #if defined(CONFIG_PCI) && defined(CONFIG_PPC32) EXPORT_SYMBOL(isa_io_base); EXPORT_SYMBOL(isa_mem_base); EXPORT_SYMBOL(pci_dram_offset); -EXPORT_SYMBOL(pci_alloc_consistent); -EXPORT_SYMBOL(pci_free_consistent); -EXPORT_SYMBOL(pci_bus_io_base); -EXPORT_SYMBOL(pci_bus_io_base_phys); -EXPORT_SYMBOL(pci_bus_mem_base_phys); -EXPORT_SYMBOL(pci_bus_to_hose); -EXPORT_SYMBOL(pci_resource_to_bus); -EXPORT_SYMBOL(pci_phys_to_bus); -EXPORT_SYMBOL(pci_bus_to_phys); #endif /* CONFIG_PCI */ -#ifdef CONFIG_NOT_COHERENT_CACHE -EXPORT_SYMBOL(flush_dcache_all); -#endif - EXPORT_SYMBOL(start_thread); -EXPORT_SYMBOL(kernel_thread); +#ifdef CONFIG_PPC_FPU EXPORT_SYMBOL(giveup_fpu); +EXPORT_SYMBOL(load_fp_state); +EXPORT_SYMBOL(store_fp_state); +#endif #ifdef CONFIG_ALTIVEC EXPORT_SYMBOL(giveup_altivec); +EXPORT_SYMBOL(load_vr_state); +EXPORT_SYMBOL(store_vr_state); #endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_VSX +EXPORT_SYMBOL(giveup_vsx); +EXPORT_SYMBOL_GPL(__giveup_vsx); +#endif /* CONFIG_VSX */ #ifdef CONFIG_SPE EXPORT_SYMBOL(giveup_spe); #endif /* CONFIG_SPE */ -#ifdef CONFIG_PPC64 -EXPORT_SYMBOL(__flush_icache_range); -#else +#ifndef CONFIG_PPC64 EXPORT_SYMBOL(flush_instruction_cache); -EXPORT_SYMBOL(flush_icache_range); -EXPORT_SYMBOL(flush_tlb_kernel_range); -EXPORT_SYMBOL(flush_tlb_page); -EXPORT_SYMBOL(_tlbie); #endif EXPORT_SYMBOL(flush_dcache_range); +EXPORT_SYMBOL(flush_icache_range); 
#ifdef CONFIG_SMP -EXPORT_SYMBOL(smp_call_function); #ifdef CONFIG_PPC32 EXPORT_SYMBOL(smp_hw_index); #endif @@ -197,15 +139,6 @@ EXPORT_SYMBOL(adb_try_handler_change); EXPORT_SYMBOL(cuda_request); EXPORT_SYMBOL(cuda_poll); #endif /* CONFIG_ADB_CUDA */ -#if defined(CONFIG_PPC_MULTIPLATFORM) && defined(CONFIG_PPC32) -EXPORT_SYMBOL(_machine); -#endif -#ifdef CONFIG_PPC_PMAC -EXPORT_SYMBOL(sys_ctrler); -#endif -#ifdef CONFIG_VT -EXPORT_SYMBOL(kd_mksound); -#endif EXPORT_SYMBOL(to_tm); #ifdef CONFIG_PPC32 @@ -215,12 +148,16 @@ long long __lshrdi3(long long, int); EXPORT_SYMBOL(__ashrdi3); EXPORT_SYMBOL(__ashldi3); EXPORT_SYMBOL(__lshrdi3); +int __ucmpdi2(unsigned long long, unsigned long long); +EXPORT_SYMBOL(__ucmpdi2); +int __cmpdi2(long long, long long); +EXPORT_SYMBOL(__cmpdi2); #endif - +long long __bswapdi2(long long); +EXPORT_SYMBOL(__bswapdi2); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memmove); -EXPORT_SYMBOL(memscan); EXPORT_SYMBOL(memcmp); EXPORT_SYMBOL(memchr); @@ -229,37 +166,20 @@ EXPORT_SYMBOL(screen_info); #endif #ifdef CONFIG_PPC32 -EXPORT_SYMBOL(__delay); EXPORT_SYMBOL(timer_interrupt); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(tb_ticks_per_jiffy); -EXPORT_SYMBOL(console_drivers); EXPORT_SYMBOL(cacheable_memcpy); -#endif - -EXPORT_SYMBOL(__up); -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_interruptible); - -#ifdef CONFIG_8xx -EXPORT_SYMBOL(cpm_install_handler); -EXPORT_SYMBOL(cpm_free_handler); -#endif /* CONFIG_8xx */ -#if defined(CONFIG_8xx) || defined(CONFIG_40x) || defined(CONFIG_85xx) ||\ - defined(CONFIG_83xx) -EXPORT_SYMBOL(__res); +EXPORT_SYMBOL(cacheable_memzero); #endif #ifdef CONFIG_PPC32 -EXPORT_SYMBOL(next_mmu_context); -EXPORT_SYMBOL(set_context); +EXPORT_SYMBOL(switch_mmu_context); #endif #ifdef CONFIG_PPC_STD_MMU_32 extern long mol_trampoline; EXPORT_SYMBOL(mol_trampoline); /* For MOL */ EXPORT_SYMBOL(flush_hash_pages); /* For MOL */ -EXPORT_SYMBOL_GPL(__handle_mm_fault); /* For MOL */ #ifdef CONFIG_SMP extern int 
mmu_hash_lock; EXPORT_SYMBOL(mmu_hash_lock); /* For MOL */ @@ -267,7 +187,23 @@ EXPORT_SYMBOL(mmu_hash_lock); /* For MOL */ extern long *intercept_table; EXPORT_SYMBOL(intercept_table); #endif /* CONFIG_PPC_STD_MMU_32 */ -#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) +#ifdef CONFIG_PPC_DCR_NATIVE EXPORT_SYMBOL(__mtdcr); EXPORT_SYMBOL(__mfdcr); #endif +EXPORT_SYMBOL(empty_zero_page); + +#ifdef CONFIG_PPC64 +EXPORT_SYMBOL(__arch_hweight8); +EXPORT_SYMBOL(__arch_hweight16); +EXPORT_SYMBOL(__arch_hweight32); +EXPORT_SYMBOL(__arch_hweight64); +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 +EXPORT_SYMBOL_GPL(mmu_psize_defs); +#endif + +#ifdef CONFIG_EPAPR_PARAVIRT +EXPORT_SYMBOL(epapr_hypercall_start); +#endif diff --git a/arch/powerpc/kernel/ppc_save_regs.S b/arch/powerpc/kernel/ppc_save_regs.S new file mode 100644 index 00000000000..1b1787d5289 --- /dev/null +++ b/arch/powerpc/kernel/ppc_save_regs.S @@ -0,0 +1,75 @@ +/* + * Copyright (C) 1996 Paul Mackerras. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * NOTE: assert(sizeof(buf) > 23 * sizeof(long)) + */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/ptrace.h> + +/* + * Grab the register values as they are now. + * This won't do a particularly good job because we really + * want our caller's caller's registers, and our caller has + * already executed its prologue. + * ToDo: We could reach back into the caller's save area to do + * a better job of representing the caller's state (note that + * that will be different for 32-bit and 64-bit, because of the + * different ABIs, though). 
+ */ +_GLOBAL(ppc_save_regs) + PPC_STL r0,0*SZL(r3) + PPC_STL r2,2*SZL(r3) + PPC_STL r3,3*SZL(r3) + PPC_STL r4,4*SZL(r3) + PPC_STL r5,5*SZL(r3) + PPC_STL r6,6*SZL(r3) + PPC_STL r7,7*SZL(r3) + PPC_STL r8,8*SZL(r3) + PPC_STL r9,9*SZL(r3) + PPC_STL r10,10*SZL(r3) + PPC_STL r11,11*SZL(r3) + PPC_STL r12,12*SZL(r3) + PPC_STL r13,13*SZL(r3) + PPC_STL r14,14*SZL(r3) + PPC_STL r15,15*SZL(r3) + PPC_STL r16,16*SZL(r3) + PPC_STL r17,17*SZL(r3) + PPC_STL r18,18*SZL(r3) + PPC_STL r19,19*SZL(r3) + PPC_STL r20,20*SZL(r3) + PPC_STL r21,21*SZL(r3) + PPC_STL r22,22*SZL(r3) + PPC_STL r23,23*SZL(r3) + PPC_STL r24,24*SZL(r3) + PPC_STL r25,25*SZL(r3) + PPC_STL r26,26*SZL(r3) + PPC_STL r27,27*SZL(r3) + PPC_STL r28,28*SZL(r3) + PPC_STL r29,29*SZL(r3) + PPC_STL r30,30*SZL(r3) + PPC_STL r31,31*SZL(r3) + /* go up one stack frame for SP */ + PPC_LL r4,0(r1) + PPC_STL r4,1*SZL(r3) + /* get caller's LR */ + PPC_LL r0,LRSAVE(r4) + PPC_STL r0,_NIP-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_LINK-STACK_FRAME_OVERHEAD(r3) + mfmsr r0 + PPC_STL r0,_MSR-STACK_FRAME_OVERHEAD(r3) + mfctr r0 + PPC_STL r0,_CTR-STACK_FRAME_OVERHEAD(r3) + mfxer r0 + PPC_STL r0,_XER-STACK_FRAME_OVERHEAD(r3) + mfcr r0 + PPC_STL r0,_CCR-STACK_FRAME_OVERHEAD(r3) + li r0,0 + PPC_STL r0,_TRAP-STACK_FRAME_OVERHEAD(r3) + blr diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c new file mode 100644 index 00000000000..c30612aad68 --- /dev/null +++ b/arch/powerpc/kernel/proc_powerpc.c @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/kernel.h> + +#include <asm/machdep.h> +#include <asm/vdso_datapage.h> +#include <asm/rtas.h> +#include <asm/uaccess.h> +#include <asm/prom.h> + +#ifdef CONFIG_PPC64 + +static loff_t page_map_seek(struct file *file, loff_t off, int whence) +{ + return fixed_size_llseek(file, off, whence, PAGE_SIZE); +} + +static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes, + loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, + PDE_DATA(file_inode(file)), PAGE_SIZE); +} + +static int page_map_mmap( struct file *file, struct vm_area_struct *vma ) +{ + if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) + return -EINVAL; + + remap_pfn_range(vma, vma->vm_start, + __pa(PDE_DATA(file_inode(file))) >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot); + return 0; +} + +static const struct file_operations page_map_fops = { + .llseek = page_map_seek, + .read = page_map_read, + .mmap = page_map_mmap +}; + + +static int __init proc_ppc64_init(void) +{ + struct proc_dir_entry *pde; + + pde = proc_create_data("powerpc/systemcfg", S_IFREG|S_IRUGO, NULL, + &page_map_fops, vdso_data); + if (!pde) + return 1; + proc_set_size(pde, PAGE_SIZE); + + return 0; +} +__initcall(proc_ppc64_init); + +#endif /* CONFIG_PPC64 */ + +/* + * Create the ppc64 and ppc64/rtas directories early. This allows us to + * assume that they have been previously created in drivers. 
+ */ +static int __init proc_ppc64_create(void) +{ + struct proc_dir_entry *root; + + root = proc_mkdir("powerpc", NULL); + if (!root) + return 1; + +#ifdef CONFIG_PPC64 + if (!proc_symlink("ppc64", NULL, "powerpc")) + pr_err("Failed to create link /proc/ppc64 -> /proc/powerpc\n"); +#endif + + if (!of_find_node_by_path("/rtas")) + return 0; + + if (!proc_mkdir("rtas", root)) + return 1; + + if (!proc_symlink("rtas", NULL, "powerpc/rtas")) + return 1; + + return 0; +} +core_initcall(proc_ppc64_create); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 8f85dabe4df..be99774d3f4 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1,6 +1,4 @@ /* - * arch/ppc/kernel/process.c - * * Derived from "arch/i386/kernel/process.c" * Copyright (C) 1995 Linus Torvalds * @@ -16,40 +14,55 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/config.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/stddef.h> #include <linux/unistd.h> #include <linux/ptrace.h> #include <linux/slab.h> #include <linux/user.h> #include <linux/elf.h> -#include <linux/init.h> #include <linux/prctl.h> #include <linux/init_task.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/kallsyms.h> #include <linux/mqueue.h> #include <linux/hardirq.h> #include <linux/utsname.h> -#include <linux/kprobes.h> +#include <linux/ftrace.h> +#include <linux/kernel_stat.h> +#include <linux/personality.h> +#include <linux/random.h> +#include <linux/hw_breakpoint.h> #include <asm/pgtable.h> #include <asm/uaccess.h> -#include <asm/system.h> #include <asm/io.h> #include <asm/processor.h> #include <asm/mmu.h> #include <asm/prom.h> +#include <asm/machdep.h> +#include <asm/time.h> +#include <asm/runlatch.h> +#include <asm/syscalls.h> +#include <asm/switch_to.h> +#include <asm/tm.h> +#include <asm/debug.h> 
#ifdef CONFIG_PPC64 #include <asm/firmware.h> -#include <asm/plpar_wrappers.h> -#include <asm/time.h> +#endif +#include <asm/code-patching.h> +#include <linux/kprobes.h> +#include <linux/kdebug.h> + +/* Transactional Memory debug */ +#ifdef TM_DEBUG_SW +#define TM_DEBUG(x...) printk(KERN_INFO x) +#else +#define TM_DEBUG(x...) do { } while(0) #endif extern unsigned long _get_SP(void); @@ -57,9 +70,53 @@ extern unsigned long _get_SP(void); #ifndef CONFIG_SMP struct task_struct *last_task_used_math = NULL; struct task_struct *last_task_used_altivec = NULL; +struct task_struct *last_task_used_vsx = NULL; struct task_struct *last_task_used_spe = NULL; #endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +void giveup_fpu_maybe_transactional(struct task_struct *tsk) +{ + /* + * If we are saving the current thread's registers, and the + * thread is in a transactional state, set the TIF_RESTORE_TM + * bit so that we know to restore the registers before + * returning to userspace. + */ + if (tsk == current && tsk->thread.regs && + MSR_TM_ACTIVE(tsk->thread.regs->msr) && + !test_thread_flag(TIF_RESTORE_TM)) { + tsk->thread.tm_orig_msr = tsk->thread.regs->msr; + set_thread_flag(TIF_RESTORE_TM); + } + + giveup_fpu(tsk); +} + +void giveup_altivec_maybe_transactional(struct task_struct *tsk) +{ + /* + * If we are saving the current thread's registers, and the + * thread is in a transactional state, set the TIF_RESTORE_TM + * bit so that we know to restore the registers before + * returning to userspace. 
+ */ + if (tsk == current && tsk->thread.regs && + MSR_TM_ACTIVE(tsk->thread.regs->msr) && + !test_thread_flag(TIF_RESTORE_TM)) { + tsk->thread.tm_orig_msr = tsk->thread.regs->msr; + set_thread_flag(TIF_RESTORE_TM); + } + + giveup_altivec(tsk); +} + +#else +#define giveup_fpu_maybe_transactional(tsk) giveup_fpu(tsk) +#define giveup_altivec_maybe_transactional(tsk) giveup_altivec(tsk) +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + +#ifdef CONFIG_PPC_FPU /* * Make sure the floating-point register state in the * the thread_struct is up to date for task tsk. @@ -87,11 +144,13 @@ void flush_fp_to_thread(struct task_struct *tsk) */ BUG_ON(tsk != current); #endif - giveup_fpu(current); + giveup_fpu_maybe_transactional(tsk); } preempt_enable(); } } +EXPORT_SYMBOL_GPL(flush_fp_to_thread); +#endif /* CONFIG_PPC_FPU */ void enable_kernel_fp(void) { @@ -99,26 +158,15 @@ void enable_kernel_fp(void) #ifdef CONFIG_SMP if (current->thread.regs && (current->thread.regs->msr & MSR_FP)) - giveup_fpu(current); + giveup_fpu_maybe_transactional(current); else giveup_fpu(NULL); /* just enables FP for kernel */ #else - giveup_fpu(last_task_used_math); + giveup_fpu_maybe_transactional(last_task_used_math); #endif /* CONFIG_SMP */ } EXPORT_SYMBOL(enable_kernel_fp); -int dump_task_fpu(struct task_struct *tsk, elf_fpregset_t *fpregs) -{ - if (!tsk->thread.regs) - return 0; - flush_fp_to_thread(current); - - memcpy(fpregs, &tsk->thread.fpr[0], sizeof(*fpregs)); - - return 1; -} - #ifdef CONFIG_ALTIVEC void enable_kernel_altivec(void) { @@ -126,11 +174,11 @@ void enable_kernel_altivec(void) #ifdef CONFIG_SMP if (current->thread.regs && (current->thread.regs->msr & MSR_VEC)) - giveup_altivec(current); + giveup_altivec_maybe_transactional(current); else - giveup_altivec(NULL); /* just enable AltiVec for kernel - force */ + giveup_altivec_notask(); #else - giveup_altivec(last_task_used_altivec); + giveup_altivec_maybe_transactional(last_task_used_altivec); #endif /* CONFIG_SMP */ } 
EXPORT_SYMBOL(enable_kernel_altivec); @@ -147,19 +195,55 @@ void flush_altivec_to_thread(struct task_struct *tsk) #ifdef CONFIG_SMP BUG_ON(tsk != current); #endif - giveup_altivec(current); + giveup_altivec_maybe_transactional(tsk); } preempt_enable(); } } +EXPORT_SYMBOL_GPL(flush_altivec_to_thread); +#endif /* CONFIG_ALTIVEC */ -int dump_task_altivec(struct pt_regs *regs, elf_vrregset_t *vrregs) +#ifdef CONFIG_VSX +#if 0 +/* not currently used, but some crazy RAID module might want to later */ +void enable_kernel_vsx(void) { - flush_altivec_to_thread(current); - memcpy(vrregs, ¤t->thread.vr[0], sizeof(*vrregs)); - return 1; + WARN_ON(preemptible()); + +#ifdef CONFIG_SMP + if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) + giveup_vsx(current); + else + giveup_vsx(NULL); /* just enable vsx for kernel - force */ +#else + giveup_vsx(last_task_used_vsx); +#endif /* CONFIG_SMP */ } -#endif /* CONFIG_ALTIVEC */ +EXPORT_SYMBOL(enable_kernel_vsx); +#endif + +void giveup_vsx(struct task_struct *tsk) +{ + giveup_fpu_maybe_transactional(tsk); + giveup_altivec_maybe_transactional(tsk); + __giveup_vsx(tsk); +} + +void flush_vsx_to_thread(struct task_struct *tsk) +{ + if (tsk->thread.regs) { + preempt_disable(); + if (tsk->thread.regs->msr & MSR_VSX) { +#ifdef CONFIG_SMP + BUG_ON(tsk != current); +#endif + giveup_vsx(tsk); + } + preempt_enable(); + } +} +EXPORT_SYMBOL_GPL(flush_vsx_to_thread); +#endif /* CONFIG_VSX */ #ifdef CONFIG_SPE @@ -186,55 +270,502 @@ void flush_spe_to_thread(struct task_struct *tsk) #ifdef CONFIG_SMP BUG_ON(tsk != current); #endif - giveup_spe(current); + tsk->thread.spefscr = mfspr(SPRN_SPEFSCR); + giveup_spe(tsk); } preempt_enable(); } } +#endif /* CONFIG_SPE */ + +#ifndef CONFIG_SMP +/* + * If we are doing lazy switching of CPU state (FP, altivec or SPE), + * and the current task has some state, discard it. 
+ */ +void discard_lazy_cpu_state(void) +{ + preempt_disable(); + if (last_task_used_math == current) + last_task_used_math = NULL; +#ifdef CONFIG_ALTIVEC + if (last_task_used_altivec == current) + last_task_used_altivec = NULL; +#endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_VSX + if (last_task_used_vsx == current) + last_task_used_vsx = NULL; +#endif /* CONFIG_VSX */ +#ifdef CONFIG_SPE + if (last_task_used_spe == current) + last_task_used_spe = NULL; +#endif + preempt_enable(); +} +#endif /* CONFIG_SMP */ -int dump_spe(struct pt_regs *regs, elf_vrregset_t *evrregs) +#ifdef CONFIG_PPC_ADV_DEBUG_REGS +void do_send_trap(struct pt_regs *regs, unsigned long address, + unsigned long error_code, int signal_code, int breakpt) { - flush_spe_to_thread(current); - /* We copy u32 evr[32] + u64 acc + u32 spefscr -> 35 */ - memcpy(evrregs, ¤t->thread.evr[0], sizeof(u32) * 35); - return 1; + siginfo_t info; + + current->thread.trap_nr = signal_code; + if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code, + 11, SIGSEGV) == NOTIFY_STOP) + return; + + /* Deliver the signal to userspace */ + info.si_signo = SIGTRAP; + info.si_errno = breakpt; /* breakpoint or watchpoint id */ + info.si_code = signal_code; + info.si_addr = (void __user *)address; + force_sig_info(SIGTRAP, &info, current); } -#endif /* CONFIG_SPE */ +#else /* !CONFIG_PPC_ADV_DEBUG_REGS */ +void do_break (struct pt_regs *regs, unsigned long address, + unsigned long error_code) +{ + siginfo_t info; + + current->thread.trap_nr = TRAP_HWBKPT; + if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code, + 11, SIGSEGV) == NOTIFY_STOP) + return; + + if (debugger_break_match(regs)) + return; + + /* Clear the breakpoint */ + hw_breakpoint_disable(); + + /* Deliver the signal to userspace */ + info.si_signo = SIGTRAP; + info.si_errno = 0; + info.si_code = TRAP_HWBKPT; + info.si_addr = (void __user *)address; + force_sig_info(SIGTRAP, &info, current); +} +#endif /* CONFIG_PPC_ADV_DEBUG_REGS */ -static void 
set_dabr_spr(unsigned long val) +static DEFINE_PER_CPU(struct arch_hw_breakpoint, current_brk); + +#ifdef CONFIG_PPC_ADV_DEBUG_REGS +/* + * Set the debug registers back to their default "safe" values. + */ +static void set_debug_reg_defaults(struct thread_struct *thread) { - mtspr(SPRN_DABR, val); + thread->debug.iac1 = thread->debug.iac2 = 0; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + thread->debug.iac3 = thread->debug.iac4 = 0; +#endif + thread->debug.dac1 = thread->debug.dac2 = 0; +#if CONFIG_PPC_ADV_DEBUG_DVCS > 0 + thread->debug.dvc1 = thread->debug.dvc2 = 0; +#endif + thread->debug.dbcr0 = 0; +#ifdef CONFIG_BOOKE + /* + * Force User/Supervisor bits to b11 (user-only MSR[PR]=1) + */ + thread->debug.dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US | + DBCR1_IAC3US | DBCR1_IAC4US; + /* + * Force Data Address Compare User/Supervisor bits to be User-only + * (0b11 MSR[PR]=1) and set all other bits in DBCR2 register to be 0. + */ + thread->debug.dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US; +#else + thread->debug.dbcr1 = 0; +#endif } -int set_dabr(unsigned long dabr) +static void prime_debug_regs(struct debug_reg *debug) { - int ret = 0; + /* + * We could have inherited MSR_DE from userspace, since + * it doesn't get cleared on exception entry. Make sure + * MSR_DE is clear before we enable any debug events. 
+ */ + mtmsr(mfmsr() & ~MSR_DE); -#ifdef CONFIG_PPC64 - if (firmware_has_feature(FW_FEATURE_XDABR)) { - /* We want to catch accesses from kernel and userspace */ - unsigned long flags = H_DABRX_KERNEL|H_DABRX_USER; - ret = plpar_set_xdabr(dabr, flags); - } else if (firmware_has_feature(FW_FEATURE_DABR)) { - ret = plpar_set_dabr(dabr); - } else + mtspr(SPRN_IAC1, debug->iac1); + mtspr(SPRN_IAC2, debug->iac2); +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + mtspr(SPRN_IAC3, debug->iac3); + mtspr(SPRN_IAC4, debug->iac4); +#endif + mtspr(SPRN_DAC1, debug->dac1); + mtspr(SPRN_DAC2, debug->dac2); +#if CONFIG_PPC_ADV_DEBUG_DVCS > 0 + mtspr(SPRN_DVC1, debug->dvc1); + mtspr(SPRN_DVC2, debug->dvc2); +#endif + mtspr(SPRN_DBCR0, debug->dbcr0); + mtspr(SPRN_DBCR1, debug->dbcr1); +#ifdef CONFIG_BOOKE + mtspr(SPRN_DBCR2, debug->dbcr2); #endif - set_dabr_spr(dabr); +} +/* + * Unless neither the old or new thread are making use of the + * debug registers, set the debug registers from the values + * stored in the new thread. 
+ */ +void switch_booke_debug_regs(struct debug_reg *new_debug) +{ + if ((current->thread.debug.dbcr0 & DBCR0_IDM) + || (new_debug->dbcr0 & DBCR0_IDM)) + prime_debug_regs(new_debug); +} +EXPORT_SYMBOL_GPL(switch_booke_debug_regs); +#else /* !CONFIG_PPC_ADV_DEBUG_REGS */ +#ifndef CONFIG_HAVE_HW_BREAKPOINT +static void set_debug_reg_defaults(struct thread_struct *thread) +{ + thread->hw_brk.address = 0; + thread->hw_brk.type = 0; + set_breakpoint(&thread->hw_brk); +} +#endif /* !CONFIG_HAVE_HW_BREAKPOINT */ +#endif /* CONFIG_PPC_ADV_DEBUG_REGS */ - return ret; +#ifdef CONFIG_PPC_ADV_DEBUG_REGS +static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) +{ + mtspr(SPRN_DAC1, dabr); +#ifdef CONFIG_PPC_47x + isync(); +#endif + return 0; +} +#elif defined(CONFIG_PPC_BOOK3S) +static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) +{ + mtspr(SPRN_DABR, dabr); + if (cpu_has_feature(CPU_FTR_DABRX)) + mtspr(SPRN_DABRX, dabrx); + return 0; +} +#else +static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) +{ + return -EINVAL; +} +#endif + +static inline int set_dabr(struct arch_hw_breakpoint *brk) +{ + unsigned long dabr, dabrx; + + dabr = brk->address | (brk->type & HW_BRK_TYPE_DABR); + dabrx = ((brk->type >> 3) & 0x7); + + if (ppc_md.set_dabr) + return ppc_md.set_dabr(dabr, dabrx); + + return __set_dabr(dabr, dabrx); +} + +static inline int set_dawr(struct arch_hw_breakpoint *brk) +{ + unsigned long dawr, dawrx, mrd; + + dawr = brk->address; + + dawrx = (brk->type & (HW_BRK_TYPE_READ | HW_BRK_TYPE_WRITE)) \ + << (63 - 58); //* read/write bits */ + dawrx |= ((brk->type & (HW_BRK_TYPE_TRANSLATE)) >> 2) \ + << (63 - 59); //* translate */ + dawrx |= (brk->type & (HW_BRK_TYPE_PRIV_ALL)) \ + >> 3; //* PRIM bits */ + /* dawr length is stored in field MDR bits 48:53. Matches range in + doublewords (64 bits) baised by -1 eg. 0b000000=1DW and + 0b111111=64DW. + brk->len is in bytes. + This aligns up to double word size, shifts and does the bias. 
+ */ + mrd = ((brk->len + 7) >> 3) - 1; + dawrx |= (mrd & 0x3f) << (63 - 53); + + if (ppc_md.set_dawr) + return ppc_md.set_dawr(dawr, dawrx); + mtspr(SPRN_DAWR, dawr); + mtspr(SPRN_DAWRX, dawrx); + return 0; +} + +void __set_breakpoint(struct arch_hw_breakpoint *brk) +{ + __get_cpu_var(current_brk) = *brk; + + if (cpu_has_feature(CPU_FTR_DAWR)) + set_dawr(brk); + else + set_dabr(brk); +} + +void set_breakpoint(struct arch_hw_breakpoint *brk) +{ + preempt_disable(); + __set_breakpoint(brk); + preempt_enable(); } #ifdef CONFIG_PPC64 DEFINE_PER_CPU(struct cpu_usage, cpu_usage_array); -static DEFINE_PER_CPU(unsigned long, current_dabr); #endif +static inline bool hw_brk_match(struct arch_hw_breakpoint *a, + struct arch_hw_breakpoint *b) +{ + if (a->address != b->address) + return false; + if (a->type != b->type) + return false; + if (a->len != b->len) + return false; + return true; +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static void tm_reclaim_thread(struct thread_struct *thr, + struct thread_info *ti, uint8_t cause) +{ + unsigned long msr_diff = 0; + + /* + * If FP/VSX registers have been already saved to the + * thread_struct, move them to the transact_fp array. + * We clear the TIF_RESTORE_TM bit since after the reclaim + * the thread will no longer be transactional. + */ + if (test_ti_thread_flag(ti, TIF_RESTORE_TM)) { + msr_diff = thr->tm_orig_msr & ~thr->regs->msr; + if (msr_diff & MSR_FP) + memcpy(&thr->transact_fp, &thr->fp_state, + sizeof(struct thread_fp_state)); + if (msr_diff & MSR_VEC) + memcpy(&thr->transact_vr, &thr->vr_state, + sizeof(struct thread_vr_state)); + clear_ti_thread_flag(ti, TIF_RESTORE_TM); + msr_diff &= MSR_FP | MSR_VEC | MSR_VSX | MSR_FE0 | MSR_FE1; + } + + tm_reclaim(thr, thr->regs->msr, cause); + + /* Having done the reclaim, we now have the checkpointed + * FP/VSX values in the registers. 
These might be valid + * even if we have previously called enable_kernel_fp() or + * flush_fp_to_thread(), so update thr->regs->msr to + * indicate their current validity. + */ + thr->regs->msr |= msr_diff; +} + +void tm_reclaim_current(uint8_t cause) +{ + tm_enable(); + tm_reclaim_thread(¤t->thread, current_thread_info(), cause); +} + +static inline void tm_reclaim_task(struct task_struct *tsk) +{ + /* We have to work out if we're switching from/to a task that's in the + * middle of a transaction. + * + * In switching we need to maintain a 2nd register state as + * oldtask->thread.ckpt_regs. We tm_reclaim(oldproc); this saves the + * checkpointed (tbegin) state in ckpt_regs and saves the transactional + * (current) FPRs into oldtask->thread.transact_fpr[]. + * + * We also context switch (save) TFHAR/TEXASR/TFIAR in here. + */ + struct thread_struct *thr = &tsk->thread; + + if (!thr->regs) + return; + + if (!MSR_TM_ACTIVE(thr->regs->msr)) + goto out_and_saveregs; + + /* Stash the original thread MSR, as giveup_fpu et al will + * modify it. We hold onto it to see whether the task used + * FP & vector regs. If the TIF_RESTORE_TM flag is set, + * tm_orig_msr is already set. + */ + if (!test_ti_thread_flag(task_thread_info(tsk), TIF_RESTORE_TM)) + thr->tm_orig_msr = thr->regs->msr; + + TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, " + "ccr=%lx, msr=%lx, trap=%lx)\n", + tsk->pid, thr->regs->nip, + thr->regs->ccr, thr->regs->msr, + thr->regs->trap); + + tm_reclaim_thread(thr, task_thread_info(tsk), TM_CAUSE_RESCHED); + + TM_DEBUG("--- tm_reclaim on pid %d complete\n", + tsk->pid); + +out_and_saveregs: + /* Always save the regs here, even if a transaction's not active. + * This context-switches a thread's TM info SPRs. We do it here to + * be consistent with the restore path (in recheckpoint) which + * cannot happen later in _switch(). 
+ */ + tm_save_sprs(thr); +} + +extern void __tm_recheckpoint(struct thread_struct *thread, + unsigned long orig_msr); + +void tm_recheckpoint(struct thread_struct *thread, + unsigned long orig_msr) +{ + unsigned long flags; + + /* We really can't be interrupted here as the TEXASR registers can't + * change and later in the trecheckpoint code, we have a userspace R1. + * So let's hard disable over this region. + */ + local_irq_save(flags); + hard_irq_disable(); + + /* The TM SPRs are restored here, so that TEXASR.FS can be set + * before the trecheckpoint and no explosion occurs. + */ + tm_restore_sprs(thread); + + __tm_recheckpoint(thread, orig_msr); + + local_irq_restore(flags); +} + +static inline void tm_recheckpoint_new_task(struct task_struct *new) +{ + unsigned long msr; + + if (!cpu_has_feature(CPU_FTR_TM)) + return; + + /* Recheckpoint the registers of the thread we're about to switch to. + * + * If the task was using FP, we non-lazily reload both the original and + * the speculative FP register states. This is because the kernel + * doesn't see if/when a TM rollback occurs, so if we take an FP + * unavoidable later, we are unable to determine which set of FP regs + * need to be restored. + */ + if (!new->thread.regs) + return; + + if (!MSR_TM_ACTIVE(new->thread.regs->msr)){ + tm_restore_sprs(&new->thread); + return; + } + msr = new->thread.tm_orig_msr; + /* Recheckpoint to restore original checkpointed register state. 
*/ + TM_DEBUG("*** tm_recheckpoint of pid %d " + "(new->msr 0x%lx, new->origmsr 0x%lx)\n", + new->pid, new->thread.regs->msr, msr); + + /* This loads the checkpointed FP/VEC state, if used */ + tm_recheckpoint(&new->thread, msr); + + /* This loads the speculative FP/VEC state, if used */ + if (msr & MSR_FP) { + do_load_up_transact_fpu(&new->thread); + new->thread.regs->msr |= + (MSR_FP | new->thread.fpexc_mode); + } +#ifdef CONFIG_ALTIVEC + if (msr & MSR_VEC) { + do_load_up_transact_altivec(&new->thread); + new->thread.regs->msr |= MSR_VEC; + } +#endif + /* We may as well turn on VSX too since all the state is restored now */ + if (msr & MSR_VSX) + new->thread.regs->msr |= MSR_VSX; + + TM_DEBUG("*** tm_recheckpoint of pid %d complete " + "(kernel msr 0x%lx)\n", + new->pid, mfmsr()); +} + +static inline void __switch_to_tm(struct task_struct *prev) +{ + if (cpu_has_feature(CPU_FTR_TM)) { + tm_enable(); + tm_reclaim_task(prev); + } +} + +/* + * This is called if we are on the way out to userspace and the + * TIF_RESTORE_TM flag is set. It checks if we need to reload + * FP and/or vector state and does so if necessary. + * If userspace is inside a transaction (whether active or + * suspended) and FP/VMX/VSX instructions have ever been enabled + * inside that transaction, then we have to keep them enabled + * and keep the FP/VMX/VSX state loaded while ever the transaction + * continues. The reason is that if we didn't, and subsequently + * got a FP/VMX/VSX unavailable interrupt inside a transaction, + * we don't know whether it's the same transaction, and thus we + * don't know which of the checkpointed state and the transactional + * state to use. 
+ */ +void restore_tm_state(struct pt_regs *regs) +{ + unsigned long msr_diff; + + clear_thread_flag(TIF_RESTORE_TM); + if (!MSR_TM_ACTIVE(regs->msr)) + return; + + msr_diff = current->thread.tm_orig_msr & ~regs->msr; + msr_diff &= MSR_FP | MSR_VEC | MSR_VSX; + if (msr_diff & MSR_FP) { + fp_enable(); + load_fp_state(¤t->thread.fp_state); + regs->msr |= current->thread.fpexc_mode; + } + if (msr_diff & MSR_VEC) { + vec_enable(); + load_vr_state(¤t->thread.vr_state); + } + regs->msr |= msr_diff; +} + +#else +#define tm_recheckpoint_new_task(new) +#define __switch_to_tm(prev) +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *new) { struct thread_struct *new_thread, *old_thread; - unsigned long flags; struct task_struct *last; +#ifdef CONFIG_PPC_BOOK3S_64 + struct ppc64_tlb_batch *batch; +#endif + + WARN_ON(!irqs_disabled()); + + /* Back up the TAR and DSCR across context switches. + * Note that the TAR is not available for use in the kernel. (To + * provide this, the TAR should be backed up/restored on exception + * entry/exit instead, and be in pt_regs. FIXME, this should be in + * pt_regs anyway (for debug).) + * Save the TAR and DSCR here before we do treclaim/trecheckpoint as + * these will change them. 
+ */ + save_early_sprs(&prev->thread); + + __switch_to_tm(prev); #ifdef CONFIG_SMP /* avoid complexity of lazy save/restore of fpu @@ -263,6 +794,11 @@ struct task_struct *__switch_to(struct task_struct *prev, if (prev->thread.regs && (prev->thread.regs->msr & MSR_VEC)) giveup_altivec(prev); #endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_VSX + if (prev->thread.regs && (prev->thread.regs->msr & MSR_VSX)) + /* VMX and FPU registers are already save here */ + __giveup_vsx(prev); +#endif /* CONFIG_VSX */ #ifdef CONFIG_SPE /* * If the previous thread used spe in the last quantum @@ -283,6 +819,10 @@ struct task_struct *__switch_to(struct task_struct *prev, if (new->thread.regs && last_task_used_altivec == new) new->thread.regs->msr |= MSR_VEC; #endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_VSX + if (new->thread.regs && last_task_used_vsx == new) + new->thread.regs->msr |= MSR_VSX; +#endif /* CONFIG_VSX */ #ifdef CONFIG_SPE /* Avoid the trap. On smp this this never happens since * we don't set last_task_used_spe @@ -293,15 +833,20 @@ struct task_struct *__switch_to(struct task_struct *prev, #endif /* CONFIG_SMP */ -#ifdef CONFIG_PPC64 /* for now */ - if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr)) { - set_dabr(new->thread.dabr); - __get_cpu_var(current_dabr) = new->thread.dabr; - } - - flush_tlb_pending(); +#ifdef CONFIG_PPC_ADV_DEBUG_REGS + switch_booke_debug_regs(&new->thread.debug); +#else +/* + * For PPC_BOOK3S_64, we use the hw-breakpoint interfaces that would + * schedule DABR + */ +#ifndef CONFIG_HAVE_HW_BREAKPOINT + if (unlikely(!hw_brk_match(&__get_cpu_var(current_brk), &new->thread.hw_brk))) + __set_breakpoint(&new->thread.hw_brk); +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ #endif + new_thread = &new->thread; old_thread = ¤t->thread; @@ -317,25 +862,42 @@ struct task_struct *__switch_to(struct task_struct *prev, old_thread->accum_tb += (current_tb - start_tb); new_thread->start_tb = current_tb; } -#endif +#endif /* CONFIG_PPC64 */ + +#ifdef 
CONFIG_PPC_BOOK3S_64 + batch = &__get_cpu_var(ppc64_tlb_batch); + if (batch->active) { + current_thread_info()->local_flags |= _TLF_LAZY_MMU; + if (batch->index) + __flush_tlb_pending(batch); + batch->active = 0; + } +#endif /* CONFIG_PPC_BOOK3S_64 */ + + /* + * We can't take a PMU exception inside _switch() since there is a + * window where the kernel stack SLB and the kernel stack are out + * of sync. Hard disable here. + */ + hard_irq_disable(); + + tm_recheckpoint_new_task(new); - local_irq_save(flags); last = _switch(old_thread, new_thread); - local_irq_restore(flags); +#ifdef CONFIG_PPC_BOOK3S_64 + if (current_thread_info()->local_flags & _TLF_LAZY_MMU) { + current_thread_info()->local_flags &= ~_TLF_LAZY_MMU; + batch = &__get_cpu_var(ppc64_tlb_batch); + batch->active = 1; + } +#endif /* CONFIG_PPC_BOOK3S_64 */ return last; } static int instructions_to_print = 16; -#ifdef CONFIG_PPC64 -#define BAD_PC(pc) ((REGION_ID(pc) != KERNEL_REGION_ID) && \ - (REGION_ID(pc) != VMALLOC_REGION_ID)) -#else -#define BAD_PC(pc) ((pc) < KERNELBASE) -#endif - static void show_instructions(struct pt_regs *regs) { int i; @@ -350,13 +912,26 @@ static void show_instructions(struct pt_regs *regs) if (!(i % 8)) printk("\n"); - if (BAD_PC(pc) || __get_user(instr, (unsigned int *)pc)) { - printk("XXXXXXXX "); +#if !defined(CONFIG_BOOKE) + /* If executing with the IMMU off, adjust pc rather + * than print XXXXXXXX. + */ + if (!(regs->msr & MSR_IR)) + pc = (unsigned long)phys_to_virt(pc); +#endif + + /* We use __get_user here *only* to avoid an OOPS on a + * bad address because the pc *should* only be a + * kernel address. 
+ */ + if (!__kernel_text_address(pc) || + __get_user(instr, (unsigned int __user *)pc)) { + printk(KERN_CONT "XXXXXXXX "); } else { if (regs->nip == pc) - printk("<%08x> ", instr); + printk(KERN_CONT "<%08x> ", instr); else - printk("%08x ", instr); + printk(KERN_CONT "%08x ", instr); } pc += sizeof(int); @@ -369,12 +944,32 @@ static struct regbit { unsigned long bit; const char *name; } msr_bits[] = { +#if defined(CONFIG_PPC64) && !defined(CONFIG_BOOKE) + {MSR_SF, "SF"}, + {MSR_HV, "HV"}, +#endif + {MSR_VEC, "VEC"}, + {MSR_VSX, "VSX"}, +#ifdef CONFIG_BOOKE + {MSR_CE, "CE"}, +#endif {MSR_EE, "EE"}, {MSR_PR, "PR"}, {MSR_FP, "FP"}, {MSR_ME, "ME"}, +#ifdef CONFIG_BOOKE + {MSR_DE, "DE"}, +#else + {MSR_SE, "SE"}, + {MSR_BE, "BE"}, +#endif {MSR_IR, "IR"}, {MSR_DR, "DR"}, + {MSR_PMM, "PMM"}, +#ifndef CONFIG_BOOKE + {MSR_RI, "RI"}, + {MSR_LE, "LE"}, +#endif {0, NULL} }; @@ -392,11 +987,11 @@ static void printbits(unsigned long val, struct regbit *bits) } #ifdef CONFIG_PPC64 -#define REG "%016lX" +#define REG "%016lx" #define REGS_PER_LINE 4 #define LAST_VOLATILE 13 #else -#define REG "%08lX" +#define REG "%08lx" #define REGS_PER_LINE 8 #define LAST_VOLATILE 12 #endif @@ -405,26 +1000,35 @@ void show_regs(struct pt_regs * regs) { int i, trap; + show_regs_print_info(KERN_DEFAULT); + printk("NIP: "REG" LR: "REG" CTR: "REG"\n", regs->nip, regs->link, regs->ctr); printk("REGS: %p TRAP: %04lx %s (%s)\n", - regs, regs->trap, print_tainted(), system_utsname.release); + regs, regs->trap, print_tainted(), init_utsname()->release); printk("MSR: "REG" ", regs->msr); printbits(regs->msr, msr_bits); - printk(" CR: %08lX XER: %08lX\n", regs->ccr, regs->xer); + printk(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer); trap = TRAP(regs); - if (trap == 0x300 || trap == 0x600) - printk("DAR: "REG", DSISR: "REG"\n", regs->dar, regs->dsisr); - printk("TASK = %p[%d] '%s' THREAD: %p", - current, current->pid, current->comm, current->thread_info); - -#ifdef CONFIG_SMP - printk(" CPU: %d", 
smp_processor_id()); -#endif /* CONFIG_SMP */ + if ((regs->trap != 0xc00) && cpu_has_feature(CPU_FTR_CFAR)) + printk("CFAR: "REG" ", regs->orig_gpr3); + if (trap == 0x200 || trap == 0x300 || trap == 0x600) +#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) + printk("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr); +#else + printk("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); +#endif +#ifdef CONFIG_PPC64 + printk("SOFTE: %ld ", regs->softe); +#endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (MSR_TM_ACTIVE(regs->msr)) + printk("\nPACATMSCRATCH: %016llx ", get_paca()->tm_scratch); +#endif for (i = 0; i < 32; i++) { if ((i % REGS_PER_LINE) == 0) - printk("\n" KERN_INFO "GPR%02d: ", i); + printk("\nGPR%02d: ", i); printk(REG " ", regs->gpr[i]); if (i == LAST_VOLATILE && !FULL_REGS(regs)) break; @@ -435,10 +1039,8 @@ void show_regs(struct pt_regs * regs) * Lookup NIP late so we have the best change of getting the * above info out without failing */ - printk("NIP ["REG"] ", regs->nip); - print_symbol("%s\n", regs->nip); - printk("LR ["REG"] ", regs->link); - print_symbol("%s\n", regs->link); + printk("NIP ["REG"] %pS\n", regs->nip, (void *)regs->nip); + printk("LR ["REG"] %pS\n", regs->link, (void *)regs->link); #endif show_stack(current, (unsigned long *) regs->gpr[1]); if (!user_mode(regs)) @@ -447,51 +1049,18 @@ void show_regs(struct pt_regs * regs) void exit_thread(void) { - kprobe_flush_task(current); - -#ifndef CONFIG_SMP - if (last_task_used_math == current) - last_task_used_math = NULL; -#ifdef CONFIG_ALTIVEC - if (last_task_used_altivec == current) - last_task_used_altivec = NULL; -#endif /* CONFIG_ALTIVEC */ -#ifdef CONFIG_SPE - if (last_task_used_spe == current) - last_task_used_spe = NULL; -#endif -#endif /* CONFIG_SMP */ + discard_lazy_cpu_state(); } void flush_thread(void) { -#ifdef CONFIG_PPC64 - struct thread_info *t = current_thread_info(); - - if (t->flags & _TIF_ABI_PENDING) - t->flags ^= (_TIF_ABI_PENDING | _TIF_32BIT); -#endif - 
kprobe_flush_task(current); - -#ifndef CONFIG_SMP - if (last_task_used_math == current) - last_task_used_math = NULL; -#ifdef CONFIG_ALTIVEC - if (last_task_used_altivec == current) - last_task_used_altivec = NULL; -#endif /* CONFIG_ALTIVEC */ -#ifdef CONFIG_SPE - if (last_task_used_spe == current) - last_task_used_spe = NULL; -#endif -#endif /* CONFIG_SMP */ + discard_lazy_cpu_state(); -#ifdef CONFIG_PPC64 /* for now */ - if (current->thread.dabr) { - current->thread.dabr = 0; - set_dabr(0); - } -#endif +#ifdef CONFIG_HAVE_HW_BREAKPOINT + flush_ptrace_hw_breakpoint(current); +#else /* CONFIG_HAVE_HW_BREAKPOINT */ + set_debug_reg_defaults(¤t->thread); +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ } void @@ -500,54 +1069,83 @@ release_thread(struct task_struct *t) } /* - * This gets called before we allocate a new thread and copy - * the current task into it. + * this gets called so that we can store coprocessor state into memory and + * copy the current task into the new thread. */ -void prepare_to_copy(struct task_struct *tsk) +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - flush_fp_to_thread(current); - flush_altivec_to_thread(current); - flush_spe_to_thread(current); + flush_fp_to_thread(src); + flush_altivec_to_thread(src); + flush_vsx_to_thread(src); + flush_spe_to_thread(src); + /* + * Flush TM state out so we can copy it. __switch_to_tm() does this + * flush but it removes the checkpointed state from the current CPU and + * transitions the CPU out of TM mode. Hence we need to call + * tm_recheckpoint_new_task() (on the same task) to restore the + * checkpointed state back and the TM mode. + */ + __switch_to_tm(src); + tm_recheckpoint_new_task(src); + + *dst = *src; + + clear_task_ebb(dst); + + return 0; } /* * Copy a thread.. 
*/ -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, - unsigned long unused, struct task_struct *p, - struct pt_regs *regs) +extern unsigned long dscr_default; /* defined in arch/powerpc/kernel/sysfs.c */ + +int copy_thread(unsigned long clone_flags, unsigned long usp, + unsigned long arg, struct task_struct *p) { struct pt_regs *childregs, *kregs; extern void ret_from_fork(void); - unsigned long sp = (unsigned long)p->thread_info + THREAD_SIZE; + extern void ret_from_kernel_thread(void); + void (*f)(void); + unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE; - CHECK_FULL_REGS(regs); /* Copy registers */ sp -= sizeof(struct pt_regs); childregs = (struct pt_regs *) sp; - *childregs = *regs; - if ((childregs->msr & MSR_PR) == 0) { - /* for kernel thread, set `current' and stackptr in new task */ + if (unlikely(p->flags & PF_KTHREAD)) { + struct thread_info *ti = (void *)task_stack_page(p); + memset(childregs, 0, sizeof(struct pt_regs)); childregs->gpr[1] = sp + sizeof(struct pt_regs); -#ifdef CONFIG_PPC32 - childregs->gpr[2] = (unsigned long) p; -#else - clear_ti_thread_flag(p->thread_info, TIF_32BIT); + /* function */ + if (usp) + childregs->gpr[14] = ppc_function_entry((void *)usp); +#ifdef CONFIG_PPC64 + clear_tsk_thread_flag(p, TIF_32BIT); + childregs->softe = 1; #endif + childregs->gpr[15] = arg; p->thread.regs = NULL; /* no user register state */ + ti->flags |= _TIF_RESTOREALL; + f = ret_from_kernel_thread; } else { - childregs->gpr[1] = usp; + struct pt_regs *regs = current_pt_regs(); + CHECK_FULL_REGS(regs); + *childregs = *regs; + if (usp) + childregs->gpr[1] = usp; p->thread.regs = childregs; + childregs->gpr[3] = 0; /* Result from fork() */ if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_PPC64 - if (!test_thread_flag(TIF_32BIT)) + if (!is_32bit_task()) childregs->gpr[13] = childregs->gpr[6]; else #endif childregs->gpr[2] = childregs->gpr[6]; } + + f = ret_from_fork; } - childregs->gpr[3] = 0; /* Result from fork() */ 
sp -= STACK_FRAME_OVERHEAD; /* @@ -558,35 +1156,48 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, * do some house keeping and then return from the fork or clone * system call, using the stack frame created above. */ + ((unsigned long *)sp)[0] = 0; sp -= sizeof(struct pt_regs); kregs = (struct pt_regs *) sp; sp -= STACK_FRAME_OVERHEAD; p->thread.ksp = sp; +#ifdef CONFIG_PPC32 + p->thread.ksp_limit = (unsigned long)task_stack_page(p) + + _ALIGN_UP(sizeof(struct thread_info), 16); +#endif +#ifdef CONFIG_HAVE_HW_BREAKPOINT + p->thread.ptrace_bps[0] = NULL; +#endif -#ifdef CONFIG_PPC64 - if (cpu_has_feature(CPU_FTR_SLB)) { - unsigned long sp_vsid = get_kernel_vsid(sp); + p->thread.fp_save_area = NULL; +#ifdef CONFIG_ALTIVEC + p->thread.vr_save_area = NULL; +#endif - sp_vsid <<= SLB_VSID_SHIFT; - sp_vsid |= SLB_VSID_KERNEL; - if (cpu_has_feature(CPU_FTR_16M_PAGE)) - sp_vsid |= SLB_VSID_L; +#ifdef CONFIG_PPC_STD_MMU_64 + if (mmu_has_feature(MMU_FTR_SLB)) { + unsigned long sp_vsid; + unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) + sp_vsid = get_kernel_vsid(sp, MMU_SEGSIZE_1T) + << SLB_VSID_SHIFT_1T; + else + sp_vsid = get_kernel_vsid(sp, MMU_SEGSIZE_256M) + << SLB_VSID_SHIFT; + sp_vsid |= SLB_VSID_KERNEL | llp; p->thread.ksp_vsid = sp_vsid; } - - /* - * The PPC64 ABI makes use of a TOC to contain function - * pointers. The function (ret_from_except) is actually a pointer - * to the TOC entry. The first entry is a pointer to the actual - * function. 
- */ - kregs->nip = *((unsigned long *)ret_from_fork); -#else - kregs->nip = (unsigned long)ret_from_fork; - p->thread.last_syscall = -1; +#endif /* CONFIG_PPC_STD_MMU_64 */ +#ifdef CONFIG_PPC64 + if (cpu_has_feature(CPU_FTR_DSCR)) { + p->thread.dscr_inherit = current->thread.dscr_inherit; + p->thread.dscr = current->thread.dscr; + } + if (cpu_has_feature(CPU_FTR_HAS_PPR)) + p->thread.ppr = INIT_PPR; #endif - + kregs->nip = ppc_function_entry(f); return 0; } @@ -599,17 +1210,13 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */ #endif - set_fs(USER_DS); - /* * If we exec out of a kernel thread then thread.regs will not be * set. Do it now. */ if (!current->thread.regs) { - unsigned long childregs = (unsigned long)current->thread_info + - THREAD_SIZE; - childregs -= sizeof(struct pt_regs); - current->thread.regs = (struct pt_regs *)childregs; + struct pt_regs *regs = task_stack_page(current) + THREAD_SIZE; + current->thread.regs = regs - 1; } memset(regs->gpr, 0, sizeof(regs->gpr)); @@ -619,31 +1226,58 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) regs->ccr = 0; regs->gpr[1] = sp; + /* + * We have just cleared all the nonvolatile GPRs, so make + * FULL_REGS(regs) return true. This is necessary to allow + * ptrace to examine the thread immediately after exec. + */ + regs->trap &= ~1UL; + #ifdef CONFIG_PPC32 regs->mq = 0; regs->nip = start; regs->msr = MSR_USER; #else - if (!test_thread_flag(TIF_32BIT)) { - unsigned long entry, toc; + if (!is_32bit_task()) { + unsigned long entry; - /* start is a relocated pointer to the function descriptor for - * the elf _start routine. The first entry in the function - * descriptor is the entry address of _start and the second - * entry is the TOC value we need to use. 
- */ - __get_user(entry, (unsigned long __user *)start); - __get_user(toc, (unsigned long __user *)start+1); + if (is_elf2_task()) { + /* Look ma, no function descriptors! */ + entry = start; - /* Check whether the e_entry function descriptor entries - * need to be relocated before we can use them. - */ - if (load_addr != 0) { - entry += load_addr; - toc += load_addr; + /* + * Ulrich says: + * The latest iteration of the ABI requires that when + * calling a function (at its global entry point), + * the caller must ensure r12 holds the entry point + * address (so that the function can quickly + * establish addressability). + */ + regs->gpr[12] = start; + /* Make sure that's restored on entry to userspace. */ + set_thread_flag(TIF_RESTOREALL); + } else { + unsigned long toc; + + /* start is a relocated pointer to the function + * descriptor for the elf _start routine. The first + * entry in the function descriptor is the entry + * address of _start and the second entry is the TOC + * value we need to use. + */ + __get_user(entry, (unsigned long __user *)start); + __get_user(toc, (unsigned long __user *)start+1); + + /* Check whether the e_entry function descriptor entries + * need to be relocated before we can use them. 
+ */ + if (load_addr != 0) { + entry += load_addr; + toc += load_addr; + } + regs->gpr[2] = toc; } regs->nip = entry; - regs->gpr[2] = toc; regs->msr = MSR_USER64; } else { regs->nip = start; @@ -651,25 +1285,16 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) regs->msr = MSR_USER32; } #endif - -#ifndef CONFIG_SMP - if (last_task_used_math == current) - last_task_used_math = NULL; -#ifdef CONFIG_ALTIVEC - if (last_task_used_altivec == current) - last_task_used_altivec = NULL; -#endif -#ifdef CONFIG_SPE - if (last_task_used_spe == current) - last_task_used_spe = NULL; + discard_lazy_cpu_state(); +#ifdef CONFIG_VSX + current->thread.used_vsr = 0; #endif -#endif /* CONFIG_SMP */ - memset(current->thread.fpr, 0, sizeof(current->thread.fpr)); - current->thread.fpscr.val = 0; + memset(&current->thread.fp_state, 0, sizeof(current->thread.fp_state)); + current->thread.fp_save_area = NULL; #ifdef CONFIG_ALTIVEC - memset(current->thread.vr, 0, sizeof(current->thread.vr)); - memset(&current->thread.vscr, 0, sizeof(current->thread.vscr)); - current->thread.vscr.u[3] = 0x00010000; /* Java mode disabled */ + memset(&current->thread.vr_state, 0, sizeof(current->thread.vr_state)); + current->thread.vr_state.vscr.u[3] = 0x00010000; /* Java mode disabled */ + current->thread.vr_save_area = NULL; current->thread.vrsave = 0; current->thread.used_vr = 0; #endif /* CONFIG_ALTIVEC */ @@ -679,6 +1304,13 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) current->thread.spefscr = 0; current->thread.used_spe = 0; #endif /* CONFIG_SPE */ +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (cpu_has_feature(CPU_FTR_TM)) + regs->msr |= MSR_TM; + current->thread.tm_tfhar = 0; + current->thread.tm_texasr = 0; + current->thread.tm_tfiar = 0; +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ } #define PR_FP_ALL_EXCEPT (PR_FP_EXC_DIV | PR_FP_EXC_OVF | PR_FP_EXC_UND \ @@ -694,9 +1326,26 @@ int set_fpexc_mode(struct task_struct *tsk, unsigned int val) * mode (asyn, 
precise, disabled) for 'Classic' FP. */ if (val & PR_FP_EXC_SW_ENABLE) { #ifdef CONFIG_SPE - tsk->thread.fpexc_mode = val & - (PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT); - return 0; + if (cpu_has_feature(CPU_FTR_SPE)) { + /* + * When the sticky exception bits are set + * directly by userspace, it must call prctl + * with PR_GET_FPEXC (with PR_FP_EXC_SW_ENABLE + * in the existing prctl settings) or + * PR_SET_FPEXC (with PR_FP_EXC_SW_ENABLE in + * the bits being set). <fenv.h> functions + * saving and restoring the whole + * floating-point environment need to do so + * anyway to restore the prctl settings from + * the saved environment. + */ + tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR); + tsk->thread.fpexc_mode = val & + (PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT); + return 0; + } else { + return -EINVAL; + } #else return -EINVAL; #endif @@ -722,7 +1371,23 @@ int get_fpexc_mode(struct task_struct *tsk, unsigned long adr) if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE) #ifdef CONFIG_SPE - val = tsk->thread.fpexc_mode; + if (cpu_has_feature(CPU_FTR_SPE)) { + /* + * When the sticky exception bits are set + * directly by userspace, it must call prctl + * with PR_GET_FPEXC (with PR_FP_EXC_SW_ENABLE + * in the existing prctl settings) or + * PR_SET_FPEXC (with PR_FP_EXC_SW_ENABLE in + * the bits being set). <fenv.h> functions + * saving and restoring the whole + * floating-point environment need to do so + * anyway to restore the prctl settings from + * the saved environment. 
+ */ + tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR); + val = tsk->thread.fpexc_mode; + } else + return -EINVAL; #else return -EINVAL; #endif @@ -731,105 +1396,98 @@ int get_fpexc_mode(struct task_struct *tsk, unsigned long adr) return put_user(val, (unsigned int __user *) adr); } -#define TRUNC_PTR(x) ((typeof(x))(((unsigned long)(x)) & 0xffffffff)) +int set_endian(struct task_struct *tsk, unsigned int val) +{ + struct pt_regs *regs = tsk->thread.regs; + + if ((val == PR_ENDIAN_LITTLE && !cpu_has_feature(CPU_FTR_REAL_LE)) || + (val == PR_ENDIAN_PPC_LITTLE && !cpu_has_feature(CPU_FTR_PPC_LE))) + return -EINVAL; + + if (regs == NULL) + return -EINVAL; -int sys_clone(unsigned long clone_flags, unsigned long usp, - int __user *parent_tidp, void __user *child_threadptr, - int __user *child_tidp, int p6, - struct pt_regs *regs) + if (val == PR_ENDIAN_BIG) + regs->msr &= ~MSR_LE; + else if (val == PR_ENDIAN_LITTLE || val == PR_ENDIAN_PPC_LITTLE) + regs->msr |= MSR_LE; + else + return -EINVAL; + + return 0; +} + +int get_endian(struct task_struct *tsk, unsigned long adr) { - CHECK_FULL_REGS(regs); - if (usp == 0) - usp = regs->gpr[1]; /* stack pointer for child */ -#ifdef CONFIG_PPC64 - if (test_thread_flag(TIF_32BIT)) { - parent_tidp = TRUNC_PTR(parent_tidp); - child_tidp = TRUNC_PTR(child_tidp); - } -#endif - return do_fork(clone_flags, usp, regs, 0, parent_tidp, child_tidp); + struct pt_regs *regs = tsk->thread.regs; + unsigned int val; + + if (!cpu_has_feature(CPU_FTR_PPC_LE) && + !cpu_has_feature(CPU_FTR_REAL_LE)) + return -EINVAL; + + if (regs == NULL) + return -EINVAL; + + if (regs->msr & MSR_LE) { + if (cpu_has_feature(CPU_FTR_REAL_LE)) + val = PR_ENDIAN_LITTLE; + else + val = PR_ENDIAN_PPC_LITTLE; + } else + val = PR_ENDIAN_BIG; + + return put_user(val, (unsigned int __user *)adr); } -int sys_fork(unsigned long p1, unsigned long p2, unsigned long p3, - unsigned long p4, unsigned long p5, unsigned long p6, - struct pt_regs *regs) +int set_unalign_ctl(struct 
task_struct *tsk, unsigned int val) { - CHECK_FULL_REGS(regs); - return do_fork(SIGCHLD, regs->gpr[1], regs, 0, NULL, NULL); + tsk->thread.align_ctl = val; + return 0; } -int sys_vfork(unsigned long p1, unsigned long p2, unsigned long p3, - unsigned long p4, unsigned long p5, unsigned long p6, - struct pt_regs *regs) +int get_unalign_ctl(struct task_struct *tsk, unsigned long adr) { - CHECK_FULL_REGS(regs); - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->gpr[1], - regs, 0, NULL, NULL); + return put_user(tsk->thread.align_ctl, (unsigned int __user *)adr); } -int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2, - unsigned long a3, unsigned long a4, unsigned long a5, - struct pt_regs *regs) +static inline int valid_irq_stack(unsigned long sp, struct task_struct *p, + unsigned long nbytes) { - int error; - char *filename; + unsigned long stack_page; + unsigned long cpu = task_cpu(p); - filename = getname((char __user *) a0); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - goto out; - flush_fp_to_thread(current); - flush_altivec_to_thread(current); - flush_spe_to_thread(current); - error = do_execve(filename, (char __user * __user *) a1, - (char __user * __user *) a2, regs); - if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); + /* + * Avoid crashing if the stack has overflowed and corrupted + * task_cpu(p), which is in the thread_info struct. 
+ */ + if (cpu < NR_CPUS && cpu_possible(cpu)) { + stack_page = (unsigned long) hardirq_ctx[cpu]; + if (sp >= stack_page + sizeof(struct thread_struct) + && sp <= stack_page + THREAD_SIZE - nbytes) + return 1; + + stack_page = (unsigned long) softirq_ctx[cpu]; + if (sp >= stack_page + sizeof(struct thread_struct) + && sp <= stack_page + THREAD_SIZE - nbytes) + return 1; } - putname(filename); -out: - return error; + return 0; } -static int validate_sp(unsigned long sp, struct task_struct *p, +int validate_sp(unsigned long sp, struct task_struct *p, unsigned long nbytes) { - unsigned long stack_page = (unsigned long)p->thread_info; + unsigned long stack_page = (unsigned long)task_stack_page(p); if (sp >= stack_page + sizeof(struct thread_struct) && sp <= stack_page + THREAD_SIZE - nbytes) return 1; -#ifdef CONFIG_IRQSTACKS - stack_page = (unsigned long) hardirq_ctx[task_cpu(p)]; - if (sp >= stack_page + sizeof(struct thread_struct) - && sp <= stack_page + THREAD_SIZE - nbytes) - return 1; - - stack_page = (unsigned long) softirq_ctx[task_cpu(p)]; - if (sp >= stack_page + sizeof(struct thread_struct) - && sp <= stack_page + THREAD_SIZE - nbytes) - return 1; -#endif - - return 0; + return valid_irq_stack(sp, p, nbytes); } -#ifdef CONFIG_PPC64 -#define MIN_STACK_FRAME 112 /* same as STACK_FRAME_OVERHEAD, in fact */ -#define FRAME_LR_SAVE 2 -#define INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD + 288) -#define REGS_MARKER 0x7265677368657265ul -#define FRAME_MARKER 12 -#else -#define MIN_STACK_FRAME 16 -#define FRAME_LR_SAVE 1 -#define INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) -#define REGS_MARKER 0x72656773ul -#define FRAME_MARKER 2 -#endif +EXPORT_SYMBOL(validate_sp); unsigned long get_wchan(struct task_struct *p) { @@ -840,30 +1498,41 @@ unsigned long get_wchan(struct task_struct *p) return 0; sp = p->thread.ksp; - if (!validate_sp(sp, p, MIN_STACK_FRAME)) + if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD)) return 0; do { sp = 
*(unsigned long *)sp; - if (!validate_sp(sp, p, MIN_STACK_FRAME)) + if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD)) return 0; if (count > 0) { - ip = ((unsigned long *)sp)[FRAME_LR_SAVE]; + ip = ((unsigned long *)sp)[STACK_FRAME_LR_SAVE]; if (!in_sched_functions(ip)) return ip; } } while (count++ < 16); return 0; } -EXPORT_SYMBOL(get_wchan); -static int kstack_depth_to_print = 64; +static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; void show_stack(struct task_struct *tsk, unsigned long *stack) { unsigned long sp, ip, lr, newsp; int count = 0; int firstframe = 1; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + int curr_frame = current->curr_ret_stack; + extern void return_to_handler(void); + unsigned long rth = (unsigned long)return_to_handler; + unsigned long mrth = -1; +#ifdef CONFIG_PPC64 + extern void mod_return_to_handler(void); + rth = *(unsigned long *)rth; + mrth = (unsigned long)mod_return_to_handler; + mrth = *(unsigned long *)mrth; +#endif +#endif sp = (unsigned long) stack; if (tsk == NULL) @@ -878,15 +1547,21 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) lr = 0; printk("Call Trace:\n"); do { - if (!validate_sp(sp, tsk, MIN_STACK_FRAME)) + if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD)) return; stack = (unsigned long *) sp; newsp = stack[0]; - ip = stack[FRAME_LR_SAVE]; + ip = stack[STACK_FRAME_LR_SAVE]; if (!firstframe || ip != lr) { - printk("["REG"] ["REG"] ", sp, ip); - print_symbol("%s", ip); + printk("["REG"] ["REG"] %pS", sp, ip, (void *)ip); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if ((ip == rth || ip == mrth) && curr_frame >= 0) { + printk(" (%pS)", + (void *)current->ret_stack[curr_frame].ret); + curr_frame--; + } +#endif if (firstframe) printk(" (unreliable)"); printk("\n"); @@ -897,14 +1572,13 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) * See if this is an exception frame. * We look for the "regshere" marker in the current frame. 
*/ - if (validate_sp(sp, tsk, INT_FRAME_SIZE) - && stack[FRAME_MARKER] == REGS_MARKER) { + if (validate_sp(sp, tsk, STACK_INT_FRAME_SIZE) + && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { struct pt_regs *regs = (struct pt_regs *) (sp + STACK_FRAME_OVERHEAD); - printk("--- Exception: %lx", regs->trap); - print_symbol(" at %s\n", regs->nip); lr = regs->link; - print_symbol(" LR = %s\n", lr); + printk("--- Exception: %lx at %pS\n LR = %pS\n", + regs->trap, (void *)regs->nip, (void *)lr); firstframe = 1; } @@ -912,8 +1586,85 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) } while (count++ < kstack_depth_to_print); } -void dump_stack(void) +#ifdef CONFIG_PPC64 +/* Called with hard IRQs off */ +void notrace __ppc64_runlatch_on(void) { - show_stack(current, NULL); + struct thread_info *ti = current_thread_info(); + unsigned long ctrl; + + ctrl = mfspr(SPRN_CTRLF); + ctrl |= CTRL_RUNLATCH; + mtspr(SPRN_CTRLT, ctrl); + + ti->local_flags |= _TLF_RUNLATCH; +} + +/* Called with hard IRQs off */ +void notrace __ppc64_runlatch_off(void) +{ + struct thread_info *ti = current_thread_info(); + unsigned long ctrl; + + ti->local_flags &= ~_TLF_RUNLATCH; + + ctrl = mfspr(SPRN_CTRLF); + ctrl &= ~CTRL_RUNLATCH; + mtspr(SPRN_CTRLT, ctrl); +} +#endif /* CONFIG_PPC64 */ + +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() & ~PAGE_MASK; + return sp & ~0xf; +} + +static inline unsigned long brk_rnd(void) +{ + unsigned long rnd = 0; + + /* 8MB for 32bit, 1GB for 64bit */ + if (is_32bit_task()) + rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT))); + else + rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT))); + + return rnd << PAGE_SHIFT; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long base = mm->brk; + unsigned long ret; + +#ifdef CONFIG_PPC_STD_MMU_64 + /* + * If we are using 1TB segments and we are allowed to randomise + * the 
heap, we can put it above 1TB so it is backed by a 1TB + * segment. Otherwise the heap will be in the bottom 1TB + * which always uses 256MB segments and this may result in a + * performance penalty. + */ + if (!is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T)) + base = max_t(unsigned long, mm->brk, 1UL << SID_SHIFT_1T); +#endif + + ret = PAGE_ALIGN(base + brk_rnd()); + + if (ret < mm->brk) + return mm->brk; + + return ret; +} + +unsigned long randomize_et_dyn(unsigned long base) +{ + unsigned long ret = PAGE_ALIGN(base + brk_rnd()); + + if (ret < base) + return base; + + return ret; } -EXPORT_SYMBOL(dump_stack); diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 2eccd0e159e..b694b073097 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -16,7 +16,6 @@ #undef DEBUG #include <stdarg.h> -#include <linux/config.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/init.h> @@ -28,29 +27,37 @@ #include <linux/delay.h> #include <linux/initrd.h> #include <linux/bitops.h> -#include <linux/module.h> +#include <linux/export.h> +#include <linux/kexec.h> +#include <linux/irq.h> +#include <linux/memblock.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/libfdt.h> #include <asm/prom.h> #include <asm/rtas.h> -#include <asm/lmb.h> #include <asm/page.h> #include <asm/processor.h> #include <asm/irq.h> #include <asm/io.h> +#include <asm/kdump.h> #include <asm/smp.h> -#include <asm/system.h> #include <asm/mmu.h> +#include <asm/paca.h> #include <asm/pgtable.h> #include <asm/pci.h> #include <asm/iommu.h> #include <asm/btext.h> #include <asm/sections.h> #include <asm/machdep.h> -#include <asm/pSeries_reconfig.h> #include <asm/pci-bridge.h> -#ifdef CONFIG_PPC64 -#include <asm/systemcfg.h> -#endif +#include <asm/kexec.h> +#include <asm/opal.h> +#include <asm/fadump.h> +#include <asm/debug.h> + +#include <mm/mmu_decl.h> #ifdef DEBUG #define DBG(fmt...) 
printk(KERN_ERR fmt) @@ -58,1859 +65,781 @@ #define DBG(fmt...) #endif -struct pci_reg_property { - struct pci_address addr; - u32 size_hi; - u32 size_lo; -}; - -struct isa_reg_property { - u32 space; - u32 address; - u32 size; -}; - - -typedef int interpret_func(struct device_node *, unsigned long *, - int, int, int); - -extern struct rtas_t rtas; -extern struct lmb lmb; -extern unsigned long klimit; - -static int __initdata dt_root_addr_cells; -static int __initdata dt_root_size_cells; - #ifdef CONFIG_PPC64 -static int __initdata iommu_is_off; +int __initdata iommu_is_off; int __initdata iommu_force_on; unsigned long tce_alloc_start, tce_alloc_end; +u64 ppc64_rma_size; #endif +static phys_addr_t first_memblock_size; +static int __initdata boot_cpu_count; -typedef u32 cell_t; - -#if 0 -static struct boot_param_header *initial_boot_params __initdata; -#else -struct boot_param_header *initial_boot_params; -#endif - -static struct device_node *allnodes = NULL; - -/* use when traversing tree through the allnext, child, sibling, - * or parent members of struct device_node. - */ -static DEFINE_RWLOCK(devtree_lock); - -/* export that to outside world */ -struct device_node *of_chosen; - -struct device_node *dflt_interrupt_controller; -int num_interrupt_controllers; - -/* - * Wrapper for allocating memory for various data that needs to be - * attached to device nodes as they are processed at boot or when - * added to the device tree later (e.g. DLPAR). At boot there is - * already a region reserved so we just increment *mem_start by size; - * otherwise we call kmalloc. - */ -static void * prom_alloc(unsigned long size, unsigned long *mem_start) +static int __init early_parse_mem(char *p) { - unsigned long tmp; - - if (!mem_start) - return kmalloc(size, GFP_KERNEL); + if (!p) + return 1; - tmp = *mem_start; - *mem_start += size; - return (void *)tmp; -} - -/* - * Find the device_node with a given phandle. 
- */ -static struct device_node * find_phandle(phandle ph) -{ - struct device_node *np; + memory_limit = PAGE_ALIGN(memparse(p, &p)); + DBG("memory limit = 0x%llx\n", memory_limit); - for (np = allnodes; np != 0; np = np->allnext) - if (np->linux_phandle == ph) - return np; - return NULL; + return 0; } +early_param("mem", early_parse_mem); /* - * Find the interrupt parent of a node. + * overlaps_initrd - check for overlap with page aligned extension of + * initrd. */ -static struct device_node * __devinit intr_parent(struct device_node *p) +static inline int overlaps_initrd(unsigned long start, unsigned long size) { - phandle *parp; - - parp = (phandle *) get_property(p, "interrupt-parent", NULL); - if (parp == NULL) - return p->parent; - p = find_phandle(*parp); - if (p != NULL) - return p; - /* - * On a powermac booted with BootX, we don't get to know the - * phandles for any nodes, so find_phandle will return NULL. - * Fortunately these machines only have one interrupt controller - * so there isn't in fact any ambiguity. -- paulus - */ - if (num_interrupt_controllers == 1) - p = dflt_interrupt_controller; - return p; -} +#ifdef CONFIG_BLK_DEV_INITRD + if (!initrd_start) + return 0; -/* - * Find out the size of each entry of the interrupts property - * for a node. 
- */ -int __devinit prom_n_intr_cells(struct device_node *np) -{ - struct device_node *p; - unsigned int *icp; - - for (p = np; (p = intr_parent(p)) != NULL; ) { - icp = (unsigned int *) - get_property(p, "#interrupt-cells", NULL); - if (icp != NULL) - return *icp; - if (get_property(p, "interrupt-controller", NULL) != NULL - || get_property(p, "interrupt-map", NULL) != NULL) { - printk("oops, node %s doesn't have #interrupt-cells\n", - p->full_name); - return 1; - } - } -#ifdef DEBUG_IRQ - printk("prom_n_intr_cells failed for %s\n", np->full_name); + return (start + size) > _ALIGN_DOWN(initrd_start, PAGE_SIZE) && + start <= _ALIGN_UP(initrd_end, PAGE_SIZE); +#else + return 0; #endif - return 1; } -/* - * Map an interrupt from a device up to the platform interrupt - * descriptor. +/** + * move_device_tree - move tree to an unused area, if needed. + * + * The device tree may be allocated beyond our memory limit, or inside the + * crash kernel region for kdump, or within the page aligned range of initrd. + * If so, move it out of the way. 
*/ -static int __devinit map_interrupt(unsigned int **irq, struct device_node **ictrler, - struct device_node *np, unsigned int *ints, - int nintrc) +static void __init move_device_tree(void) { - struct device_node *p, *ipar; - unsigned int *imap, *imask, *ip; - int i, imaplen, match; - int newintrc = 0, newaddrc = 0; - unsigned int *reg; - int naddrc; - - reg = (unsigned int *) get_property(np, "reg", NULL); - naddrc = prom_n_addr_cells(np); - p = intr_parent(np); - while (p != NULL) { - if (get_property(p, "interrupt-controller", NULL) != NULL) - /* this node is an interrupt controller, stop here */ - break; - imap = (unsigned int *) - get_property(p, "interrupt-map", &imaplen); - if (imap == NULL) { - p = intr_parent(p); - continue; - } - imask = (unsigned int *) - get_property(p, "interrupt-map-mask", NULL); - if (imask == NULL) { - printk("oops, %s has interrupt-map but no mask\n", - p->full_name); - return 0; - } - imaplen /= sizeof(unsigned int); - match = 0; - ipar = NULL; - while (imaplen > 0 && !match) { - /* check the child-interrupt field */ - match = 1; - for (i = 0; i < naddrc && match; ++i) - match = ((reg[i] ^ imap[i]) & imask[i]) == 0; - for (; i < naddrc + nintrc && match; ++i) - match = ((ints[i-naddrc] ^ imap[i]) & imask[i]) == 0; - imap += naddrc + nintrc; - imaplen -= naddrc + nintrc; - /* grab the interrupt parent */ - ipar = find_phandle((phandle) *imap++); - --imaplen; - if (ipar == NULL && num_interrupt_controllers == 1) - /* cope with BootX not giving us phandles */ - ipar = dflt_interrupt_controller; - if (ipar == NULL) { - printk("oops, no int parent %x in map of %s\n", - imap[-1], p->full_name); - return 0; - } - /* find the parent's # addr and intr cells */ - ip = (unsigned int *) - get_property(ipar, "#interrupt-cells", NULL); - if (ip == NULL) { - printk("oops, no #interrupt-cells on %s\n", - ipar->full_name); - return 0; - } - newintrc = *ip; - ip = (unsigned int *) - get_property(ipar, "#address-cells", NULL); - newaddrc = (ip == 
NULL)? 0: *ip; - imap += newaddrc + newintrc; - imaplen -= newaddrc + newintrc; - } - if (imaplen < 0) { - printk("oops, error decoding int-map on %s, len=%d\n", - p->full_name, imaplen); - return 0; - } - if (!match) { -#ifdef DEBUG_IRQ - printk("oops, no match in %s int-map for %s\n", - p->full_name, np->full_name); -#endif - return 0; - } - p = ipar; - naddrc = newaddrc; - nintrc = newintrc; - ints = imap - nintrc; - reg = ints - naddrc; - } - if (p == NULL) { -#ifdef DEBUG_IRQ - printk("hmmm, int tree for %s doesn't have ctrler\n", - np->full_name); -#endif - return 0; - } - *irq = ints; - *ictrler = p; - return nintrc; -} + unsigned long start, size; + void *p; -static unsigned char map_isa_senses[4] = { - IRQ_SENSE_LEVEL | IRQ_POLARITY_NEGATIVE, - IRQ_SENSE_LEVEL | IRQ_POLARITY_POSITIVE, - IRQ_SENSE_EDGE | IRQ_POLARITY_NEGATIVE, - IRQ_SENSE_EDGE | IRQ_POLARITY_POSITIVE -}; + DBG("-> move_device_tree\n"); -static unsigned char map_mpic_senses[4] = { - IRQ_SENSE_EDGE | IRQ_POLARITY_POSITIVE, - IRQ_SENSE_LEVEL | IRQ_POLARITY_NEGATIVE, - /* 2 seems to be used for the 8259 cascade... */ - IRQ_SENSE_LEVEL | IRQ_POLARITY_POSITIVE, - IRQ_SENSE_EDGE | IRQ_POLARITY_NEGATIVE, -}; + start = __pa(initial_boot_params); + size = fdt_totalsize(initial_boot_params); -static int __devinit finish_node_interrupts(struct device_node *np, - unsigned long *mem_start, - int measure_only) -{ - unsigned int *ints; - int intlen, intrcells, intrcount; - int i, j, n, sense; - unsigned int *irq, virq; - struct device_node *ic; - - if (num_interrupt_controllers == 0) { - /* - * Old machines just have a list of interrupt numbers - * and no interrupt-controller nodes. 
- */ - ints = (unsigned int *) get_property(np, "AAPL,interrupts", - &intlen); - /* XXX old interpret_pci_props looked in parent too */ - /* XXX old interpret_macio_props looked for interrupts - before AAPL,interrupts */ - if (ints == NULL) - ints = (unsigned int *) get_property(np, "interrupts", - &intlen); - if (ints == NULL) - return 0; - - np->n_intrs = intlen / sizeof(unsigned int); - np->intrs = prom_alloc(np->n_intrs * sizeof(np->intrs[0]), - mem_start); - if (!np->intrs) - return -ENOMEM; - if (measure_only) - return 0; - - for (i = 0; i < np->n_intrs; ++i) { - np->intrs[i].line = *ints++; - np->intrs[i].sense = IRQ_SENSE_LEVEL - | IRQ_POLARITY_NEGATIVE; - } - return 0; + if ((memory_limit && (start + size) > PHYSICAL_START + memory_limit) || + overlaps_crashkernel(start, size) || + overlaps_initrd(start, size)) { + p = __va(memblock_alloc(size, PAGE_SIZE)); + memcpy(p, initial_boot_params, size); + initial_boot_params = p; + DBG("Moved device tree to 0x%p\n", p); } - ints = (unsigned int *) get_property(np, "interrupts", &intlen); - if (ints == NULL) - return 0; - intrcells = prom_n_intr_cells(np); - intlen /= intrcells * sizeof(unsigned int); + DBG("<- move_device_tree\n"); +} - np->intrs = prom_alloc(intlen * sizeof(*(np->intrs)), mem_start); - if (!np->intrs) - return -ENOMEM; +/* + * ibm,pa-features is a per-cpu property that contains a string of + * attribute descriptors, each of which has a 2 byte header plus up + * to 254 bytes worth of processor attribute bits. First header + * byte specifies the number of bytes following the header. + * Second header byte is an "attribute-specifier" type, of which + * zero is the only currently-defined value. + * Implementation: Pass in the byte and bit offset for the feature + * that we are interested in. The function will return -1 if the + * pa-features property is missing, or a 1/0 to indicate if the feature + * is supported/not supported. 
Note that the bit numbers are + * big-endian to match the definition in PAPR. + */ +static struct ibm_pa_feature { + unsigned long cpu_features; /* CPU_FTR_xxx bit */ + unsigned long mmu_features; /* MMU_FTR_xxx bit */ + unsigned int cpu_user_ftrs; /* PPC_FEATURE_xxx bit */ + unsigned char pabyte; /* byte number in ibm,pa-features */ + unsigned char pabit; /* bit number (big-endian) */ + unsigned char invert; /* if 1, pa bit set => clear feature */ +} ibm_pa_features[] __initdata = { + {0, 0, PPC_FEATURE_HAS_MMU, 0, 0, 0}, + {0, 0, PPC_FEATURE_HAS_FPU, 0, 1, 0}, + {0, MMU_FTR_SLB, 0, 0, 2, 0}, + {CPU_FTR_CTRL, 0, 0, 0, 3, 0}, + {CPU_FTR_NOEXECUTE, 0, 0, 0, 6, 0}, + {CPU_FTR_NODSISRALIGN, 0, 0, 1, 1, 1}, + {0, MMU_FTR_CI_LARGE_PAGE, 0, 1, 2, 0}, + {CPU_FTR_REAL_LE, PPC_FEATURE_TRUE_LE, 5, 0, 0}, +}; - if (measure_only) - return 0; +static void __init scan_features(unsigned long node, const unsigned char *ftrs, + unsigned long tablelen, + struct ibm_pa_feature *fp, + unsigned long ft_size) +{ + unsigned long i, len, bit; + + /* find descriptor with type == 0 */ + for (;;) { + if (tablelen < 3) + return; + len = 2 + ftrs[0]; + if (tablelen < len) + return; /* descriptor 0 not found */ + if (ftrs[1] == 0) + break; + tablelen -= len; + ftrs += len; + } - intrcount = 0; - for (i = 0; i < intlen; ++i, ints += intrcells) { - n = map_interrupt(&irq, &ic, np, ints, intrcells); - if (n <= 0) + /* loop over bits we know about */ + for (i = 0; i < ft_size; ++i, ++fp) { + if (fp->pabyte >= ftrs[0]) continue; - - /* don't map IRQ numbers under a cascaded 8259 controller */ - if (ic && device_is_compatible(ic, "chrp,iic")) { - np->intrs[intrcount].line = irq[0]; - sense = (n > 1)? 
(irq[1] & 3): 3; - np->intrs[intrcount].sense = map_isa_senses[sense]; + bit = (ftrs[2 + fp->pabyte] >> (7 - fp->pabit)) & 1; + if (bit ^ fp->invert) { + cur_cpu_spec->cpu_features |= fp->cpu_features; + cur_cpu_spec->cpu_user_features |= fp->cpu_user_ftrs; + cur_cpu_spec->mmu_features |= fp->mmu_features; } else { - virq = virt_irq_create_mapping(irq[0]); -#ifdef CONFIG_PPC64 - if (virq == NO_IRQ) { - printk(KERN_CRIT "Could not allocate interrupt" - " number for %s\n", np->full_name); - continue; - } -#endif - np->intrs[intrcount].line = irq_offset_up(virq); - sense = (n > 1)? (irq[1] & 3): 1; - np->intrs[intrcount].sense = map_mpic_senses[sense]; + cur_cpu_spec->cpu_features &= ~fp->cpu_features; + cur_cpu_spec->cpu_user_features &= ~fp->cpu_user_ftrs; + cur_cpu_spec->mmu_features &= ~fp->mmu_features; } - -#ifdef CONFIG_PPC64 - /* We offset irq numbers for the u3 MPIC by 128 in PowerMac */ - if (systemcfg->platform == PLATFORM_POWERMAC && ic && ic->parent) { - char *name = get_property(ic->parent, "name", NULL); - if (name && !strcmp(name, "u3")) - np->intrs[intrcount].line += 128; - else if (!(name && !strcmp(name, "mac-io"))) - /* ignore other cascaded controllers, such as - the k2-sata-root */ - break; - } -#endif - if (n > 2) { - printk("hmmm, got %d intr cells for %s:", n, - np->full_name); - for (j = 0; j < n; ++j) - printk(" %d", irq[j]); - printk("\n"); - } - ++intrcount; } - np->n_intrs = intrcount; - - return 0; } -static int __devinit interpret_pci_props(struct device_node *np, - unsigned long *mem_start, - int naddrc, int nsizec, - int measure_only) +static void __init check_cpu_pa_features(unsigned long node) { - struct address_range *adr; - struct pci_reg_property *pci_addrs; - int i, l, n_addrs; + const unsigned char *pa_ftrs; + int tablelen; - pci_addrs = (struct pci_reg_property *) - get_property(np, "assigned-addresses", &l); - if (!pci_addrs) - return 0; - - n_addrs = l / sizeof(*pci_addrs); - - adr = prom_alloc(n_addrs * sizeof(*adr), 
mem_start); - if (!adr) - return -ENOMEM; - - if (measure_only) - return 0; - - np->addrs = adr; - np->n_addrs = n_addrs; - - for (i = 0; i < n_addrs; i++) { - adr[i].space = pci_addrs[i].addr.a_hi; - adr[i].address = pci_addrs[i].addr.a_lo | - ((u64)pci_addrs[i].addr.a_mid << 32); - adr[i].size = pci_addrs[i].size_lo; - } - - return 0; -} - -static int __init interpret_dbdma_props(struct device_node *np, - unsigned long *mem_start, - int naddrc, int nsizec, - int measure_only) -{ - struct reg_property32 *rp; - struct address_range *adr; - unsigned long base_address; - int i, l; - struct device_node *db; - - base_address = 0; - if (!measure_only) { - for (db = np->parent; db != NULL; db = db->parent) { - if (!strcmp(db->type, "dbdma") && db->n_addrs != 0) { - base_address = db->addrs[0].address; - break; - } - } - } - - rp = (struct reg_property32 *) get_property(np, "reg", &l); - if (rp != 0 && l >= sizeof(struct reg_property32)) { - i = 0; - adr = (struct address_range *) (*mem_start); - while ((l -= sizeof(struct reg_property32)) >= 0) { - if (!measure_only) { - adr[i].space = 2; - adr[i].address = rp[i].address + base_address; - adr[i].size = rp[i].size; - } - ++i; - } - np->addrs = adr; - np->n_addrs = i; - (*mem_start) += i * sizeof(struct address_range); - } + pa_ftrs = of_get_flat_dt_prop(node, "ibm,pa-features", &tablelen); + if (pa_ftrs == NULL) + return; - return 0; + scan_features(node, pa_ftrs, tablelen, + ibm_pa_features, ARRAY_SIZE(ibm_pa_features)); } -static int __init interpret_macio_props(struct device_node *np, - unsigned long *mem_start, - int naddrc, int nsizec, - int measure_only) +#ifdef CONFIG_PPC_STD_MMU_64 +static void __init check_cpu_slb_size(unsigned long node) { - struct reg_property32 *rp; - struct address_range *adr; - unsigned long base_address; - int i, l; - struct device_node *db; - - base_address = 0; - if (!measure_only) { - for (db = np->parent; db != NULL; db = db->parent) { - if (!strcmp(db->type, "mac-io") && db->n_addrs != 
0) { - base_address = db->addrs[0].address; - break; - } - } - } - - rp = (struct reg_property32 *) get_property(np, "reg", &l); - if (rp != 0 && l >= sizeof(struct reg_property32)) { - i = 0; - adr = (struct address_range *) (*mem_start); - while ((l -= sizeof(struct reg_property32)) >= 0) { - if (!measure_only) { - adr[i].space = 2; - adr[i].address = rp[i].address + base_address; - adr[i].size = rp[i].size; - } - ++i; - } - np->addrs = adr; - np->n_addrs = i; - (*mem_start) += i * sizeof(struct address_range); - } + const __be32 *slb_size_ptr; - return 0; -} - -static int __init interpret_isa_props(struct device_node *np, - unsigned long *mem_start, - int naddrc, int nsizec, - int measure_only) -{ - struct isa_reg_property *rp; - struct address_range *adr; - int i, l; - - rp = (struct isa_reg_property *) get_property(np, "reg", &l); - if (rp != 0 && l >= sizeof(struct isa_reg_property)) { - i = 0; - adr = (struct address_range *) (*mem_start); - while ((l -= sizeof(struct isa_reg_property)) >= 0) { - if (!measure_only) { - adr[i].space = rp[i].space; - adr[i].address = rp[i].address; - adr[i].size = rp[i].size; - } - ++i; - } - np->addrs = adr; - np->n_addrs = i; - (*mem_start) += i * sizeof(struct address_range); + slb_size_ptr = of_get_flat_dt_prop(node, "slb-size", NULL); + if (slb_size_ptr != NULL) { + mmu_slb_size = be32_to_cpup(slb_size_ptr); + return; } - - return 0; -} - -static int __init interpret_root_props(struct device_node *np, - unsigned long *mem_start, - int naddrc, int nsizec, - int measure_only) -{ - struct address_range *adr; - int i, l; - unsigned int *rp; - int rpsize = (naddrc + nsizec) * sizeof(unsigned int); - - rp = (unsigned int *) get_property(np, "reg", &l); - if (rp != 0 && l >= rpsize) { - i = 0; - adr = (struct address_range *) (*mem_start); - while ((l -= rpsize) >= 0) { - if (!measure_only) { - adr[i].space = 0; - adr[i].address = rp[naddrc - 1]; - adr[i].size = rp[naddrc + nsizec - 1]; - } - ++i; - rp += naddrc + nsizec; - } - 
np->addrs = adr; - np->n_addrs = i; - (*mem_start) += i * sizeof(struct address_range); + slb_size_ptr = of_get_flat_dt_prop(node, "ibm,slb-size", NULL); + if (slb_size_ptr != NULL) { + mmu_slb_size = be32_to_cpup(slb_size_ptr); } - - return 0; -} - -static int __devinit finish_node(struct device_node *np, - unsigned long *mem_start, - interpret_func *ifunc, - int naddrc, int nsizec, - int measure_only) -{ - struct device_node *child; - int *ip, rc = 0; - - /* get the device addresses and interrupts */ - if (ifunc != NULL) - rc = ifunc(np, mem_start, naddrc, nsizec, measure_only); - if (rc) - goto out; - - rc = finish_node_interrupts(np, mem_start, measure_only); - if (rc) - goto out; - - /* Look for #address-cells and #size-cells properties. */ - ip = (int *) get_property(np, "#address-cells", NULL); - if (ip != NULL) - naddrc = *ip; - ip = (int *) get_property(np, "#size-cells", NULL); - if (ip != NULL) - nsizec = *ip; - - if (!strcmp(np->name, "device-tree") || np->parent == NULL) - ifunc = interpret_root_props; - else if (np->type == 0) - ifunc = NULL; - else if (!strcmp(np->type, "pci") || !strcmp(np->type, "vci")) - ifunc = interpret_pci_props; - else if (!strcmp(np->type, "dbdma")) - ifunc = interpret_dbdma_props; - else if (!strcmp(np->type, "mac-io") || ifunc == interpret_macio_props) - ifunc = interpret_macio_props; - else if (!strcmp(np->type, "isa")) - ifunc = interpret_isa_props; - else if (!strcmp(np->name, "uni-n") || !strcmp(np->name, "u3")) - ifunc = interpret_root_props; - else if (!((ifunc == interpret_dbdma_props - || ifunc == interpret_macio_props) - && (!strcmp(np->type, "escc") - || !strcmp(np->type, "media-bay")))) - ifunc = NULL; - - for (child = np->child; child != NULL; child = child->sibling) { - rc = finish_node(child, mem_start, ifunc, - naddrc, nsizec, measure_only); - if (rc) - goto out; - } -out: - return rc; } +#else +#define check_cpu_slb_size(node) do { } while(0) +#endif -static void __init scan_interrupt_controllers(void) -{ - 
struct device_node *np; - int n = 0; - char *name, *ic; - int iclen; - - for (np = allnodes; np != NULL; np = np->allnext) { - ic = get_property(np, "interrupt-controller", &iclen); - name = get_property(np, "name", NULL); - /* checking iclen makes sure we don't get a false - match on /chosen.interrupt_controller */ - if ((name != NULL - && strcmp(name, "interrupt-controller") == 0) - || (ic != NULL && iclen == 0 - && strcmp(name, "AppleKiwi"))) { - if (n == 0) - dflt_interrupt_controller = np; - ++n; - } - } - num_interrupt_controllers = n; -} +static struct feature_property { + const char *name; + u32 min_value; + unsigned long cpu_feature; + unsigned long cpu_user_ftr; +} feature_properties[] __initdata = { +#ifdef CONFIG_ALTIVEC + {"altivec", 0, CPU_FTR_ALTIVEC, PPC_FEATURE_HAS_ALTIVEC}, + {"ibm,vmx", 1, CPU_FTR_ALTIVEC, PPC_FEATURE_HAS_ALTIVEC}, +#endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_VSX + /* Yes, this _really_ is ibm,vmx == 2 to enable VSX */ + {"ibm,vmx", 2, CPU_FTR_VSX, PPC_FEATURE_HAS_VSX}, +#endif /* CONFIG_VSX */ +#ifdef CONFIG_PPC64 + {"ibm,dfp", 1, 0, PPC_FEATURE_HAS_DFP}, + {"ibm,purr", 1, CPU_FTR_PURR, 0}, + {"ibm,spurr", 1, CPU_FTR_SPURR, 0}, +#endif /* CONFIG_PPC64 */ +}; -/** - * finish_device_tree is called once things are running normally - * (i.e. with text and data mapped to the address they were linked at). - * It traverses the device tree and fills in some of the additional, - * fields in each node like {n_}addrs and {n_}intrs, the virt interrupt - * mapping is also initialized at this point. 
- */ -void __init finish_device_tree(void) +#if defined(CONFIG_44x) && defined(CONFIG_PPC_FPU) +static inline void identical_pvr_fixup(unsigned long node) { - unsigned long start, end, size = 0; - - DBG(" -> finish_device_tree\n"); - -#ifdef CONFIG_PPC64 - /* Initialize virtual IRQ map */ - virt_irq_init(); -#endif - scan_interrupt_controllers(); + unsigned int pvr; + const char *model = of_get_flat_dt_prop(node, "model", NULL); /* - * Finish device-tree (pre-parsing some properties etc...) - * We do this in 2 passes. One with "measure_only" set, which - * will only measure the amount of memory needed, then we can - * allocate that memory, and call finish_node again. However, - * we must be careful as most routines will fail nowadays when - * prom_alloc() returns 0, so we must make sure our first pass - * doesn't start at 0. We pre-initialize size to 16 for that - * reason and then remove those additional 16 bytes + * Since 440GR(x)/440EP(x) processors have the same pvr, + * we check the node path and set bit 28 in the cur_cpu_spec + * pvr for EP(x) processor version. This bit is always 0 in + * the "real" pvr. Then we call identify_cpu again with + * the new logical pvr to enable FPU support. 
*/ - size = 16; - finish_node(allnodes, &size, NULL, 0, 0, 1); - size -= 16; - end = start = (unsigned long) __va(lmb_alloc(size, 128)); - finish_node(allnodes, &end, NULL, 0, 0, 0); - BUG_ON(end != start + size); - - DBG(" <- finish_device_tree\n"); -} - -static inline char *find_flat_dt_string(u32 offset) -{ - return ((char *)initial_boot_params) + - initial_boot_params->off_dt_strings + offset; -} - -/** - * This function is used to scan the flattened device-tree, it is - * used to extract the memory informations at boot before we can - * unflatten the tree - */ -static int __init scan_flat_dt(int (*it)(unsigned long node, - const char *uname, int depth, - void *data), - void *data) -{ - unsigned long p = ((unsigned long)initial_boot_params) + - initial_boot_params->off_dt_struct; - int rc = 0; - int depth = -1; - - do { - u32 tag = *((u32 *)p); - char *pathp; - - p += 4; - if (tag == OF_DT_END_NODE) { - depth --; - continue; - } - if (tag == OF_DT_NOP) - continue; - if (tag == OF_DT_END) - break; - if (tag == OF_DT_PROP) { - u32 sz = *((u32 *)p); - p += 8; - if (initial_boot_params->version < 0x10) - p = _ALIGN(p, sz >= 8 ? 
8 : 4); - p += sz; - p = _ALIGN(p, 4); - continue; - } - if (tag != OF_DT_BEGIN_NODE) { - printk(KERN_WARNING "Invalid tag %x scanning flattened" - " device tree !\n", tag); - return -EINVAL; - } - depth++; - pathp = (char *)p; - p = _ALIGN(p + strlen(pathp) + 1, 4); - if ((*pathp) == '/') { - char *lp, *np; - for (lp = NULL, np = pathp; *np; np++) - if ((*np) == '/') - lp = np+1; - if (lp != NULL) - pathp = lp; - } - rc = it(p, pathp, depth, data); - if (rc != 0) - break; - } while(1); - - return rc; -} - -/** - * This function can be used within scan_flattened_dt callback to get - * access to properties - */ -static void* __init get_flat_dt_prop(unsigned long node, const char *name, - unsigned long *size) -{ - unsigned long p = node; - - do { - u32 tag = *((u32 *)p); - u32 sz, noff; - const char *nstr; - - p += 4; - if (tag == OF_DT_NOP) - continue; - if (tag != OF_DT_PROP) - return NULL; - - sz = *((u32 *)p); - noff = *((u32 *)(p + 4)); - p += 8; - if (initial_boot_params->version < 0x10) - p = _ALIGN(p, sz >= 8 ? 
8 : 4); - - nstr = find_flat_dt_string(noff); - if (nstr == NULL) { - printk(KERN_WARNING "Can't find property index" - " name !\n"); - return NULL; - } - if (strcmp(name, nstr) == 0) { - if (size) - *size = sz; - return (void *)p; - } - p += sz; - p = _ALIGN(p, 4); - } while(1); -} - -static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size, - unsigned long align) -{ - void *res; - - *mem = _ALIGN(*mem, align); - res = (void *)*mem; - *mem += size; - - return res; + if (model && strstr(model, "440EP")) { + pvr = cur_cpu_spec->pvr_value | 0x8; + identify_cpu(0, pvr); + DBG("Using logical pvr %x for %s\n", pvr, model); + } } +#else +#define identical_pvr_fixup(node) do { } while(0) +#endif -static unsigned long __init unflatten_dt_node(unsigned long mem, - unsigned long *p, - struct device_node *dad, - struct device_node ***allnextpp, - unsigned long fpsize) +static void __init check_cpu_feature_properties(unsigned long node) { - struct device_node *np; - struct property *pp, **prev_pp = NULL; - char *pathp; - u32 tag; - unsigned int l, allocl; - int has_name = 0; - int new_format = 0; - - tag = *((u32 *)(*p)); - if (tag != OF_DT_BEGIN_NODE) { - printk("Weird tag at start of node: %x\n", tag); - return mem; - } - *p += 4; - pathp = (char *)*p; - l = allocl = strlen(pathp) + 1; - *p = _ALIGN(*p + l, 4); - - /* version 0x10 has a more compact unit name here instead of the full - * path. we accumulate the full path size using "fpsize", we'll rebuild - * it later. We detect this because the first character of the name is - * not '/'. - */ - if ((*pathp) != '/') { - new_format = 1; - if (fpsize == 0) { - /* root node: special case. fpsize accounts for path - * plus terminating zero. 
root node only has '/', so - * fpsize should be 2, but we want to avoid the first - * level nodes to have two '/' so we use fpsize 1 here - */ - fpsize = 1; - allocl = 2; - } else { - /* account for '/' and path size minus terminal 0 - * already in 'l' - */ - fpsize += l; - allocl = fpsize; - } - } - - - np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl, - __alignof__(struct device_node)); - if (allnextpp) { - memset(np, 0, sizeof(*np)); - np->full_name = ((char*)np) + sizeof(struct device_node); - if (new_format) { - char *p = np->full_name; - /* rebuild full path for new format */ - if (dad && dad->parent) { - strcpy(p, dad->full_name); -#ifdef DEBUG - if ((strlen(p) + l + 1) != allocl) { - DBG("%s: p: %d, l: %d, a: %d\n", - pathp, strlen(p), l, allocl); - } -#endif - p += strlen(p); - } - *(p++) = '/'; - memcpy(p, pathp, l); - } else - memcpy(np->full_name, pathp, l); - prev_pp = &np->properties; - **allnextpp = np; - *allnextpp = &np->allnext; - if (dad != NULL) { - np->parent = dad; - /* we temporarily use the next field as `last_child'*/ - if (dad->next == 0) - dad->child = np; - else - dad->next->sibling = np; - dad->next = np; - } - kref_init(&np->kref); - } - while(1) { - u32 sz, noff; - char *pname; + unsigned long i; + struct feature_property *fp = feature_properties; + const __be32 *prop; - tag = *((u32 *)(*p)); - if (tag == OF_DT_NOP) { - *p += 4; - continue; - } - if (tag != OF_DT_PROP) - break; - *p += 4; - sz = *((u32 *)(*p)); - noff = *((u32 *)((*p) + 4)); - *p += 8; - if (initial_boot_params->version < 0x10) - *p = _ALIGN(*p, sz >= 8 ? 
8 : 4); - - pname = find_flat_dt_string(noff); - if (pname == NULL) { - printk("Can't find property name in list !\n"); - break; - } - if (strcmp(pname, "name") == 0) - has_name = 1; - l = strlen(pname) + 1; - pp = unflatten_dt_alloc(&mem, sizeof(struct property), - __alignof__(struct property)); - if (allnextpp) { - if (strcmp(pname, "linux,phandle") == 0) { - np->node = *((u32 *)*p); - if (np->linux_phandle == 0) - np->linux_phandle = np->node; - } - if (strcmp(pname, "ibm,phandle") == 0) - np->linux_phandle = *((u32 *)*p); - pp->name = pname; - pp->length = sz; - pp->value = (void *)*p; - *prev_pp = pp; - prev_pp = &pp->next; - } - *p = _ALIGN((*p) + sz, 4); - } - /* with version 0x10 we may not have the name property, recreate - * it here from the unit name if absent - */ - if (!has_name) { - char *p = pathp, *ps = pathp, *pa = NULL; - int sz; - - while (*p) { - if ((*p) == '@') - pa = p; - if ((*p) == '/') - ps = p + 1; - p++; + for (i = 0; i < ARRAY_SIZE(feature_properties); ++i, ++fp) { + prop = of_get_flat_dt_prop(node, fp->name, NULL); + if (prop && be32_to_cpup(prop) >= fp->min_value) { + cur_cpu_spec->cpu_features |= fp->cpu_feature; + cur_cpu_spec->cpu_user_features |= fp->cpu_user_ftr; } - if (pa < ps) - pa = p; - sz = (pa - ps) + 1; - pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz, - __alignof__(struct property)); - if (allnextpp) { - pp->name = "name"; - pp->length = sz; - pp->value = (unsigned char *)(pp + 1); - *prev_pp = pp; - prev_pp = &pp->next; - memcpy(pp->value, ps, sz - 1); - ((char *)pp->value)[sz - 1] = 0; - DBG("fixed up name for %s -> %s\n", pathp, pp->value); - } - } - if (allnextpp) { - *prev_pp = NULL; - np->name = get_property(np, "name", NULL); - np->type = get_property(np, "device_type", NULL); - - if (!np->name) - np->name = "<NULL>"; - if (!np->type) - np->type = "<NULL>"; - } - while (tag == OF_DT_BEGIN_NODE) { - mem = unflatten_dt_node(mem, p, np, allnextpp, fpsize); - tag = *((u32 *)(*p)); } - if (tag != 
OF_DT_END_NODE) { - printk("Weird tag at end of node: %x\n", tag); - return mem; - } - *p += 4; - return mem; } - -/** - * unflattens the device-tree passed by the firmware, creating the - * tree of struct device_node. It also fills the "name" and "type" - * pointers of the nodes so the normal device-tree walking functions - * can be used (this used to be done by finish_device_tree) - */ -void __init unflatten_device_tree(void) -{ - unsigned long start, mem, size; - struct device_node **allnextp = &allnodes; - char *p = NULL; - int l = 0; - - DBG(" -> unflatten_device_tree()\n"); - - /* First pass, scan for size */ - start = ((unsigned long)initial_boot_params) + - initial_boot_params->off_dt_struct; - size = unflatten_dt_node(0, &start, NULL, NULL, 0); - size = (size | 3) + 1; - - DBG(" size is %lx, allocating...\n", size); - - /* Allocate memory for the expanded device tree */ - mem = lmb_alloc(size + 4, __alignof__(struct device_node)); - if (!mem) { - DBG("Couldn't allocate memory with lmb_alloc()!\n"); - panic("Couldn't allocate memory with lmb_alloc()!\n"); - } - mem = (unsigned long) __va(mem); - - ((u32 *)mem)[size / 4] = 0xdeadbeef; - - DBG(" unflattening %lx...\n", mem); - - /* Second pass, do actual unflattening */ - start = ((unsigned long)initial_boot_params) + - initial_boot_params->off_dt_struct; - unflatten_dt_node(mem, &start, NULL, &allnextp, 0); - if (*((u32 *)start) != OF_DT_END) - printk(KERN_WARNING "Weird tag at end of tree: %08x\n", *((u32 *)start)); - if (((u32 *)mem)[size / 4] != 0xdeadbeef) - printk(KERN_WARNING "End of tree marker overwritten: %08x\n", - ((u32 *)mem)[size / 4] ); - *allnextp = NULL; - - /* Get pointer to OF "/chosen" node for use everywhere */ - of_chosen = of_find_node_by_path("/chosen"); - if (of_chosen == NULL) - of_chosen = of_find_node_by_path("/chosen@0"); - - /* Retreive command line */ - if (of_chosen != NULL) { - p = (char *)get_property(of_chosen, "bootargs", &l); - if (p != NULL && l > 0) - strlcpy(cmd_line, 
p, min(l, COMMAND_LINE_SIZE)); - } -#ifdef CONFIG_CMDLINE - if (l == 0 || (l == 1 && (*p) == 0)) - strlcpy(cmd_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE); -#endif /* CONFIG_CMDLINE */ - - DBG("Command line is: %s\n", cmd_line); - - DBG(" <- unflatten_device_tree()\n"); -} - - static int __init early_init_dt_scan_cpus(unsigned long node, - const char *uname, int depth, void *data) + const char *uname, int depth, + void *data) { - char *type = get_flat_dt_prop(node, "device_type", NULL); - u32 *prop; - unsigned long size = 0; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + const __be32 *intserv; + int i, nthreads; + int len; + int found = -1; + int found_thread = 0; /* We are scanning "cpu" nodes only */ if (type == NULL || strcmp(type, "cpu") != 0) return 0; -#ifdef CONFIG_PPC_PSERIES - /* On LPAR, look for the first ibm,pft-size property for the hash table size - */ - if (systemcfg->platform == PLATFORM_PSERIES_LPAR && ppc64_pft_size == 0) { - u32 *pft_size; - pft_size = get_flat_dt_prop(node, "ibm,pft-size", NULL); - if (pft_size != NULL) { - /* pft_size[0] is the NUMA CEC cookie */ - ppc64_pft_size = pft_size[1]; - } + /* Get physical cpuid */ + intserv = of_get_flat_dt_prop(node, "ibm,ppc-interrupt-server#s", &len); + if (intserv) { + nthreads = len / sizeof(int); + } else { + intserv = of_get_flat_dt_prop(node, "reg", NULL); + nthreads = 1; } -#endif - boot_cpuid = 0; - boot_cpuid_phys = 0; - if (initial_boot_params && initial_boot_params->version >= 2) { - /* version 2 of the kexec param format adds the phys cpuid - * of booted proc. + /* + * Now see if any of these threads match our boot cpu. + * NOTE: This must match the parsing done in smp_setup_cpu_maps. + */ + for (i = 0; i < nthreads; i++) { + /* + * version 2 of the kexec param format adds the phys cpuid of + * booted proc. 
*/ - boot_cpuid_phys = initial_boot_params->boot_cpuid_phys; - } else { - /* Check if it's the boot-cpu, set it's hw index now */ - if (get_flat_dt_prop(node, "linux,boot-cpu", NULL) != NULL) { - prop = get_flat_dt_prop(node, "reg", NULL); - if (prop != NULL) - boot_cpuid_phys = *prop; + if (fdt_version(initial_boot_params) >= 2) { + if (be32_to_cpu(intserv[i]) == + fdt_boot_cpuid_phys(initial_boot_params)) { + found = boot_cpu_count; + found_thread = i; + } + } else { + /* + * Check if it's the boot-cpu, set it's hw index now, + * unfortunately this format did not support booting + * off secondary threads. + */ + if (of_get_flat_dt_prop(node, + "linux,boot-cpu", NULL) != NULL) + found = boot_cpu_count; } +#ifdef CONFIG_SMP + /* logical cpu id is always 0 on UP kernels */ + boot_cpu_count++; +#endif } - set_hard_smp_processor_id(0, boot_cpuid_phys); -#ifdef CONFIG_ALTIVEC - /* Check if we have a VMX and eventually update CPU features */ - prop = (u32 *)get_flat_dt_prop(node, "ibm,vmx", &size); - if (prop && (*prop) > 0) { - cur_cpu_spec->cpu_features |= CPU_FTR_ALTIVEC; - cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_ALTIVEC; - } + /* Not the boot CPU */ + if (found < 0) + return 0; - /* Same goes for Apple's "altivec" property */ - prop = (u32 *)get_flat_dt_prop(node, "altivec", NULL); - if (prop) { - cur_cpu_spec->cpu_features |= CPU_FTR_ALTIVEC; - cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_ALTIVEC; - } -#endif /* CONFIG_ALTIVEC */ + DBG("boot cpu: logical %d physical %d\n", found, + be32_to_cpu(intserv[found_thread])); + boot_cpuid = found; + set_hard_smp_processor_id(found, be32_to_cpu(intserv[found_thread])); -#ifdef CONFIG_PPC_PSERIES /* - * Check for an SMT capable CPU and set the CPU feature. 
We do - * this by looking at the size of the ibm,ppc-interrupt-server#s - * property + * PAPR defines "logical" PVR values for cpus that + * meet various levels of the architecture: + * 0x0f000001 Architecture version 2.04 + * 0x0f000002 Architecture version 2.05 + * If the cpu-version property in the cpu node contains + * such a value, we call identify_cpu again with the + * logical PVR value in order to use the cpu feature + * bits appropriate for the architecture level. + * + * A POWER6 partition in "POWER6 architected" mode + * uses the 0x0f000002 PVR value; in POWER5+ mode + * it uses 0x0f000001. */ - prop = (u32 *)get_flat_dt_prop(node, "ibm,ppc-interrupt-server#s", - &size); - cur_cpu_spec->cpu_features &= ~CPU_FTR_SMT; - if (prop && ((size / sizeof(u32)) > 1)) + prop = of_get_flat_dt_prop(node, "cpu-version", NULL); + if (prop && (be32_to_cpup(prop) & 0xff000000) == 0x0f000000) + identify_cpu(0, be32_to_cpup(prop)); + + identical_pvr_fixup(node); + + check_cpu_feature_properties(node); + check_cpu_pa_features(node); + check_cpu_slb_size(node); + +#ifdef CONFIG_PPC64 + if (nthreads > 1) cur_cpu_spec->cpu_features |= CPU_FTR_SMT; + else + cur_cpu_spec->cpu_features &= ~CPU_FTR_SMT; #endif - return 0; } -static int __init early_init_dt_scan_chosen(unsigned long node, - const char *uname, int depth, void *data) +int __init early_init_dt_scan_chosen_ppc(unsigned long node, const char *uname, + int depth, void *data) { - u32 *prop; - unsigned long *lprop; + const unsigned long *lprop; /* All these set by kernel, so no need to convert endian */ - DBG("search \"chosen\", depth: %d, uname: %s\n", depth, uname); - - if (depth != 1 || - (strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0)) + /* Use common scan routine to determine if this is the chosen node */ + if (early_init_dt_scan_chosen(node, uname, depth, data) == 0) return 0; - /* get platform type */ - prop = (u32 *)get_flat_dt_prop(node, "linux,platform", NULL); - if (prop == NULL) - return 0; 
-#ifdef CONFIG_PPC64 - systemcfg->platform = *prop; -#else -#ifdef CONFIG_PPC_MULTIPLATFORM - _machine = *prop; -#endif -#endif - #ifdef CONFIG_PPC64 /* check if iommu is forced on or off */ - if (get_flat_dt_prop(node, "linux,iommu-off", NULL) != NULL) + if (of_get_flat_dt_prop(node, "linux,iommu-off", NULL) != NULL) iommu_is_off = 1; - if (get_flat_dt_prop(node, "linux,iommu-force-on", NULL) != NULL) + if (of_get_flat_dt_prop(node, "linux,iommu-force-on", NULL) != NULL) iommu_force_on = 1; #endif - lprop = get_flat_dt_prop(node, "linux,memory-limit", NULL); - if (lprop) - memory_limit = *lprop; + /* mem=x on the command line is the preferred mechanism */ + lprop = of_get_flat_dt_prop(node, "linux,memory-limit", NULL); + if (lprop) + memory_limit = *lprop; #ifdef CONFIG_PPC64 - lprop = get_flat_dt_prop(node, "linux,tce-alloc-start", NULL); - if (lprop) - tce_alloc_start = *lprop; - lprop = get_flat_dt_prop(node, "linux,tce-alloc-end", NULL); - if (lprop) - tce_alloc_end = *lprop; + lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-start", NULL); + if (lprop) + tce_alloc_start = *lprop; + lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-end", NULL); + if (lprop) + tce_alloc_end = *lprop; #endif -#ifdef CONFIG_PPC_RTAS - /* To help early debugging via the front panel, we retreive a minimal - * set of RTAS infos now if available - */ - { - u64 *basep, *entryp; - - basep = get_flat_dt_prop(node, "linux,rtas-base", NULL); - entryp = get_flat_dt_prop(node, "linux,rtas-entry", NULL); - prop = get_flat_dt_prop(node, "linux,rtas-size", NULL); - if (basep && entryp && prop) { - rtas.base = *basep; - rtas.entry = *entryp; - rtas.size = *prop; - } - } -#endif /* CONFIG_PPC_RTAS */ +#ifdef CONFIG_KEXEC + lprop = of_get_flat_dt_prop(node, "linux,crashkernel-base", NULL); + if (lprop) + crashk_res.start = *lprop; - /* break now */ - return 1; -} - -static int __init early_init_dt_scan_root(unsigned long node, - const char *uname, int depth, void *data) -{ - u32 *prop; - - if 
(depth != 0) - return 0; - - prop = get_flat_dt_prop(node, "#size-cells", NULL); - dt_root_size_cells = (prop == NULL) ? 1 : *prop; - DBG("dt_root_size_cells = %x\n", dt_root_size_cells); + lprop = of_get_flat_dt_prop(node, "linux,crashkernel-size", NULL); + if (lprop) + crashk_res.end = crashk_res.start + *lprop - 1; +#endif - prop = get_flat_dt_prop(node, "#address-cells", NULL); - dt_root_addr_cells = (prop == NULL) ? 2 : *prop; - DBG("dt_root_addr_cells = %x\n", dt_root_addr_cells); - /* break now */ return 1; } -static unsigned long __init dt_mem_next_cell(int s, cell_t **cellp) -{ - cell_t *p = *cellp; - unsigned long r; - - /* Ignore more than 2 cells */ - while (s > sizeof(unsigned long) / 4) { - p++; - s--; - } - r = *p++; -#ifdef CONFIG_PPC64 - if (s > 1) { - r <<= 32; - r |= *(p++); - s--; - } -#endif - - *cellp = p; - return r; -} - - -static int __init early_init_dt_scan_memory(unsigned long node, - const char *uname, int depth, void *data) +#ifdef CONFIG_PPC_PSERIES +/* + * Interpret the ibm,dynamic-memory property in the + * /ibm,dynamic-reconfiguration-memory node. + * This contains a list of memory blocks along with NUMA affinity + * information. 
+ */ +static int __init early_init_dt_scan_drconf_memory(unsigned long node) { - char *type = get_flat_dt_prop(node, "device_type", NULL); - cell_t *reg, *endp; - unsigned long l; + const __be32 *dm, *ls, *usm; + int l; + unsigned long n, flags; + u64 base, size, memblock_size; + unsigned int is_kexec_kdump = 0, rngs; - /* We are scanning "memory" nodes only */ - if (type == NULL || strcmp(type, "memory") != 0) + ls = of_get_flat_dt_prop(node, "ibm,lmb-size", &l); + if (ls == NULL || l < dt_root_size_cells * sizeof(__be32)) return 0; + memblock_size = dt_mem_next_cell(dt_root_size_cells, &ls); - reg = (cell_t *)get_flat_dt_prop(node, "reg", &l); - if (reg == NULL) + dm = of_get_flat_dt_prop(node, "ibm,dynamic-memory", &l); + if (dm == NULL || l < sizeof(__be32)) return 0; - endp = reg + (l / sizeof(cell_t)); - - DBG("memory scan node %s ..., reg size %ld, data: %x %x %x %x, ...\n", - uname, l, reg[0], reg[1], reg[2], reg[3]); - - while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) { - unsigned long base, size; - - base = dt_mem_next_cell(dt_root_addr_cells, ®); - size = dt_mem_next_cell(dt_root_size_cells, ®); + n = of_read_number(dm++, 1); /* number of entries */ + if (l < (n * (dt_root_addr_cells + 4) + 1) * sizeof(__be32)) + return 0; - if (size == 0) + /* check if this is a kexec/kdump kernel. */ + usm = of_get_flat_dt_prop(node, "linux,drconf-usable-memory", + &l); + if (usm != NULL) + is_kexec_kdump = 1; + + for (; n != 0; --n) { + base = dt_mem_next_cell(dt_root_addr_cells, &dm); + flags = of_read_number(&dm[3], 1); + /* skip DRC index, pad, assoc. 
list index, flags */ + dm += 4; + /* skip this block if the reserved bit is set in flags (0x80) + or if the block is not assigned to this partition (0x8) */ + if ((flags & 0x80) || !(flags & 0x8)) continue; - DBG(" - %lx , %lx\n", base, size); -#ifdef CONFIG_PPC64 - if (iommu_is_off) { - if (base >= 0x80000000ul) + size = memblock_size; + rngs = 1; + if (is_kexec_kdump) { + /* + * For each memblock in ibm,dynamic-memory, a corresponding + * entry in linux,drconf-usable-memory property contains + * a counter 'p' followed by 'p' (base, size) duple. + * Now read the counter from + * linux,drconf-usable-memory property + */ + rngs = dt_mem_next_cell(dt_root_size_cells, &usm); + if (!rngs) /* there are no (base, size) duple */ continue; - if ((base + size) > 0x80000000ul) - size = 0x80000000ul - base; } -#endif - lmb_add(base, size); + do { + if (is_kexec_kdump) { + base = dt_mem_next_cell(dt_root_addr_cells, + &usm); + size = dt_mem_next_cell(dt_root_size_cells, + &usm); + } + if (iommu_is_off) { + if (base >= 0x80000000ul) + continue; + if ((base + size) > 0x80000000ul) + size = 0x80000000ul - base; + } + memblock_add(base, size); + } while (--rngs); } + memblock_dump_all(); return 0; } +#else +#define early_init_dt_scan_drconf_memory(node) 0 +#endif /* CONFIG_PPC_PSERIES */ -static void __init early_reserve_mem(void) +static int __init early_init_dt_scan_memory_ppc(unsigned long node, + const char *uname, + int depth, void *data) { - unsigned long base, size; - unsigned long *reserve_map; - - reserve_map = (unsigned long *)(((unsigned long)initial_boot_params) + - initial_boot_params->off_mem_rsvmap); - while (1) { - base = *(reserve_map++); - size = *(reserve_map++); - if (size == 0) - break; - DBG("reserving: %lx -> %lx\n", base, size); - lmb_reserve(base, size); - } + if (depth == 1 && + strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) + return early_init_dt_scan_drconf_memory(node); + + return early_init_dt_scan_memory(node, uname, depth, data); +} -#if 
0 - DBG("memory reserved, lmbs :\n"); - lmb_dump_all(); +/* + * For a relocatable kernel, we need to get the memstart_addr first, + * then use it to calculate the virtual kernel start address. This has + * to happen at a very early stage (before machine_init). In this case, + * we just want to get the memstart_address and would not like to mess the + * memblock at this stage. So introduce a variable to skip the memblock_add() + * for this reason. + */ +#ifdef CONFIG_RELOCATABLE +static int add_mem_to_memblock = 1; +#else +#define add_mem_to_memblock 1 #endif -} -void __init early_init_devtree(void *params) +void __init early_init_dt_add_memory_arch(u64 base, u64 size) { - DBG(" -> early_init_devtree()\n"); - - /* Setup flat device-tree pointer */ - initial_boot_params = params; - - /* Retrieve various informations from the /chosen node of the - * device-tree, including the platform type, initrd location and - * size, TCE reserve, and more ... - */ - scan_flat_dt(early_init_dt_scan_chosen, NULL); - - /* Scan memory nodes and rebuild LMBs */ - lmb_init(); - scan_flat_dt(early_init_dt_scan_root, NULL); - scan_flat_dt(early_init_dt_scan_memory, NULL); - lmb_enforce_memory_limit(memory_limit); - lmb_analyze(); #ifdef CONFIG_PPC64 - systemcfg->physicalMemorySize = lmb_phys_mem_size(); + if (iommu_is_off) { + if (base >= 0x80000000ul) + return; + if ((base + size) > 0x80000000ul) + size = 0x80000000ul - base; + } #endif - lmb_reserve(0, __pa(klimit)); - - DBG("Phys. mem: %lx\n", lmb_phys_mem_size()); - - /* Reserve LMB regions used by kernel, initrd, dt, etc... */ - early_reserve_mem(); - - DBG("Scanning CPUs ...\n"); - - /* Retreive hash table size from flattened tree plus other - * CPU related informations (altivec support, boot CPU ID, ...) 
+ /* Keep track of the beginning of memory -and- the size of + * the very first block in the device-tree as it represents + * the RMA on ppc64 server */ - scan_flat_dt(early_init_dt_scan_cpus, NULL); + if (base < memstart_addr) { + memstart_addr = base; + first_memblock_size = size; + } - DBG(" <- early_init_devtree()\n"); + /* Add the chunk to the MEMBLOCK list */ + if (add_mem_to_memblock) + memblock_add(base, size); } -#undef printk - -int -prom_n_addr_cells(struct device_node* np) +static void __init early_reserve_mem_dt(void) { - int* ip; - do { - if (np->parent) - np = np->parent; - ip = (int *) get_property(np, "#address-cells", NULL); - if (ip != NULL) - return *ip; - } while (np->parent); - /* No #address-cells property for the root node, default to 1 */ - return 1; -} + unsigned long i, dt_root; + int len; + const __be32 *prop; -int -prom_n_size_cells(struct device_node* np) -{ - int* ip; - do { - if (np->parent) - np = np->parent; - ip = (int *) get_property(np, "#size-cells", NULL); - if (ip != NULL) - return *ip; - } while (np->parent); - /* No #size-cells property for the root node, default to 1 */ - return 1; -} + early_init_fdt_scan_reserved_mem(); -/** - * Work out the sense (active-low level / active-high edge) - * of each interrupt from the device tree. - */ -void __init prom_get_irq_senses(unsigned char *senses, int off, int max) -{ - struct device_node *np; - int i, j; + dt_root = of_get_flat_dt_root(); - /* default to level-triggered */ - memset(senses, IRQ_SENSE_LEVEL | IRQ_POLARITY_NEGATIVE, max - off); + prop = of_get_flat_dt_prop(dt_root, "reserved-ranges", &len); - for (np = allnodes; np != 0; np = np->allnext) { - for (j = 0; j < np->n_intrs; j++) { - i = np->intrs[j].line; - if (i >= off && i < max) - senses[i-off] = np->intrs[j].sense; - } - } -} + if (!prop) + return; -/** - * Construct and return a list of the device_nodes with a given name. 
- */ -struct device_node *find_devices(const char *name) -{ - struct device_node *head, **prevp, *np; + DBG("Found new-style reserved-ranges\n"); - prevp = &head; - for (np = allnodes; np != 0; np = np->allnext) { - if (np->name != 0 && strcasecmp(np->name, name) == 0) { - *prevp = np; - prevp = &np->next; - } - } - *prevp = NULL; - return head; -} -EXPORT_SYMBOL(find_devices); + /* Each reserved range is an (address,size) pair, 2 cells each, + * totalling 4 cells per range. */ + for (i = 0; i < len / (sizeof(*prop) * 4); i++) { + u64 base, size; -/** - * Construct and return a list of the device_nodes with a given type. - */ -struct device_node *find_type_devices(const char *type) -{ - struct device_node *head, **prevp, *np; + base = of_read_number(prop + (i * 4) + 0, 2); + size = of_read_number(prop + (i * 4) + 2, 2); - prevp = &head; - for (np = allnodes; np != 0; np = np->allnext) { - if (np->type != 0 && strcasecmp(np->type, type) == 0) { - *prevp = np; - prevp = &np->next; + if (size) { + DBG("reserving: %llx -> %llx\n", base, size); + memblock_reserve(base, size); } } - *prevp = NULL; - return head; } -EXPORT_SYMBOL(find_type_devices); -/** - * Returns all nodes linked together - */ -struct device_node *find_all_nodes(void) +static void __init early_reserve_mem(void) { - struct device_node *head, **prevp, *np; + __be64 *reserve_map; - prevp = &head; - for (np = allnodes; np != 0; np = np->allnext) { - *prevp = np; - prevp = &np->next; - } - *prevp = NULL; - return head; -} -EXPORT_SYMBOL(find_all_nodes); + reserve_map = (__be64 *)(((unsigned long)initial_boot_params) + + fdt_off_mem_rsvmap(initial_boot_params)); -/** Checks if the given "compat" string matches one of the strings in - * the device's "compatible" property - */ -int device_is_compatible(struct device_node *device, const char *compat) -{ - const char* cp; - int cplen, l; + /* Look for the new "reserved-regions" property in the DT */ + early_reserve_mem_dt(); - cp = (char *) get_property(device, 
"compatible", &cplen); - if (cp == NULL) - return 0; - while (cplen > 0) { - if (strncasecmp(cp, compat, strlen(compat)) == 0) - return 1; - l = strlen(cp) + 1; - cp += l; - cplen -= l; +#ifdef CONFIG_BLK_DEV_INITRD + /* Then reserve the initrd, if any */ + if (initrd_start && (initrd_end > initrd_start)) { + memblock_reserve(_ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE), + _ALIGN_UP(initrd_end, PAGE_SIZE) - + _ALIGN_DOWN(initrd_start, PAGE_SIZE)); } +#endif /* CONFIG_BLK_DEV_INITRD */ - return 0; -} -EXPORT_SYMBOL(device_is_compatible); +#ifdef CONFIG_PPC32 + /* + * Handle the case where we might be booting from an old kexec + * image that setup the mem_rsvmap as pairs of 32-bit values + */ + if (be64_to_cpup(reserve_map) > 0xffffffffull) { + u32 base_32, size_32; + __be32 *reserve_map_32 = (__be32 *)reserve_map; + DBG("Found old 32-bit reserve map\n"); -/** - * Indicates whether the root node has a given value in its - * compatible property. - */ -int machine_is_compatible(const char *compat) -{ - struct device_node *root; - int rc = 0; - - root = of_find_node_by_path("/"); - if (root) { - rc = device_is_compatible(root, compat); - of_node_put(root); + while (1) { + base_32 = be32_to_cpup(reserve_map_32++); + size_32 = be32_to_cpup(reserve_map_32++); + if (size_32 == 0) + break; + DBG("reserving: %x -> %x\n", base_32, size_32); + memblock_reserve(base_32, size_32); + } + return; } - return rc; +#endif } -EXPORT_SYMBOL(machine_is_compatible); -/** - * Construct and return a list of the device_nodes with a given type - * and compatible property. 
- */ -struct device_node *find_compatible_devices(const char *type, - const char *compat) +void __init early_init_devtree(void *params) { - struct device_node *head, **prevp, *np; + phys_addr_t limit; - prevp = &head; - for (np = allnodes; np != 0; np = np->allnext) { - if (type != NULL - && !(np->type != 0 && strcasecmp(np->type, type) == 0)) - continue; - if (device_is_compatible(np, compat)) { - *prevp = np; - prevp = &np->next; - } - } - *prevp = NULL; - return head; -} -EXPORT_SYMBOL(find_compatible_devices); + DBG(" -> early_init_devtree(%p)\n", params); -/** - * Find the device_node with a given full_name. - */ -struct device_node *find_path_device(const char *path) -{ - struct device_node *np; + /* Setup flat device-tree pointer */ + initial_boot_params = params; - for (np = allnodes; np != 0; np = np->allnext) - if (np->full_name != 0 && strcasecmp(np->full_name, path) == 0) - return np; - return NULL; -} -EXPORT_SYMBOL(find_path_device); +#ifdef CONFIG_PPC_RTAS + /* Some machines might need RTAS info for debugging, grab it now. */ + of_scan_flat_dt(early_init_dt_scan_rtas, NULL); +#endif -/******* - * - * New implementation of the OF "find" APIs, return a refcounted - * object, call of_node_put() when done. The device tree and list - * are protected by a rw_lock. - * - * Note that property management will need some locking as well, - * this isn't dealt with yet. - * - *******/ +#ifdef CONFIG_PPC_POWERNV + /* Some machines might need OPAL info for debugging, grab it now. */ + of_scan_flat_dt(early_init_dt_scan_opal, NULL); +#endif -/** - * of_find_node_by_name - Find a node by its "name" property - * @from: The node to start searching from or NULL, the node - * you pass will not be searched, only the next one - * will; typically, you pass what the previous call - * returned. of_node_put() will be called on it - * @name: The name string to match against - * - * Returns a node pointer with refcount incremented, use - * of_node_put() on it when done. 
- */ -struct device_node *of_find_node_by_name(struct device_node *from, - const char *name) -{ - struct device_node *np; +#ifdef CONFIG_FA_DUMP + /* scan tree to see if dump is active during last boot */ + of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL); +#endif - read_lock(&devtree_lock); - np = from ? from->allnext : allnodes; - for (; np != 0; np = np->allnext) - if (np->name != 0 && strcasecmp(np->name, name) == 0 - && of_node_get(np)) - break; - if (from) - of_node_put(from); - read_unlock(&devtree_lock); - return np; -} -EXPORT_SYMBOL(of_find_node_by_name); + /* Retrieve various informations from the /chosen node of the + * device-tree, including the platform type, initrd location and + * size, TCE reserve, and more ... + */ + of_scan_flat_dt(early_init_dt_scan_chosen_ppc, cmd_line); + + /* Scan memory nodes and rebuild MEMBLOCKs */ + of_scan_flat_dt(early_init_dt_scan_root, NULL); + of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL); + + /* Save command line for /proc/cmdline and then parse parameters */ + strlcpy(boot_command_line, cmd_line, COMMAND_LINE_SIZE); + parse_early_param(); + + /* make sure we've parsed cmdline for mem= before this */ + if (memory_limit) + first_memblock_size = min_t(u64, first_memblock_size, memory_limit); + setup_initial_memory_limit(memstart_addr, first_memblock_size); + /* Reserve MEMBLOCK regions used by kernel, initrd, dt, etc... */ + memblock_reserve(PHYSICAL_START, __pa(klimit) - PHYSICAL_START); + /* If relocatable, reserve first 32k for interrupt vectors etc. */ + if (PHYSICAL_START > MEMORY_START) + memblock_reserve(MEMORY_START, 0x8000); + reserve_kdump_trampoline(); +#ifdef CONFIG_FA_DUMP + /* + * If we fail to reserve memory for firmware-assisted dump then + * fallback to kexec based kdump. 
+ */ + if (fadump_reserve_mem() == 0) +#endif + reserve_crashkernel(); + early_reserve_mem(); -/** - * of_find_node_by_type - Find a node by its "device_type" property - * @from: The node to start searching from or NULL, the node - * you pass will not be searched, only the next one - * will; typically, you pass what the previous call - * returned. of_node_put() will be called on it - * @name: The type string to match against - * - * Returns a node pointer with refcount incremented, use - * of_node_put() on it when done. - */ -struct device_node *of_find_node_by_type(struct device_node *from, - const char *type) -{ - struct device_node *np; + /* + * Ensure that total memory size is page-aligned, because otherwise + * mark_bootmem() gets upset. + */ + limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE); + memblock_enforce_memory_limit(limit); - read_lock(&devtree_lock); - np = from ? from->allnext : allnodes; - for (; np != 0; np = np->allnext) - if (np->type != 0 && strcasecmp(np->type, type) == 0 - && of_node_get(np)) - break; - if (from) - of_node_put(from); - read_unlock(&devtree_lock); - return np; -} -EXPORT_SYMBOL(of_find_node_by_type); + memblock_allow_resize(); + memblock_dump_all(); -/** - * of_find_compatible_node - Find a node based on type and one of the - * tokens in its "compatible" property - * @from: The node to start searching from or NULL, the node - * you pass will not be searched, only the next one - * will; typically, you pass what the previous call - * returned. of_node_put() will be called on it - * @type: The type string to match "device_type" or NULL to ignore - * @compatible: The string to match to one of the tokens in the device - * "compatible" list. - * - * Returns a node pointer with refcount incremented, use - * of_node_put() on it when done. - */ -struct device_node *of_find_compatible_node(struct device_node *from, - const char *type, const char *compatible) -{ - struct device_node *np; + DBG("Phys. 
mem: %llx\n", memblock_phys_mem_size()); - read_lock(&devtree_lock); - np = from ? from->allnext : allnodes; - for (; np != 0; np = np->allnext) { - if (type != NULL - && !(np->type != 0 && strcasecmp(np->type, type) == 0)) - continue; - if (device_is_compatible(np, compatible) && of_node_get(np)) - break; - } - if (from) - of_node_put(from); - read_unlock(&devtree_lock); - return np; -} -EXPORT_SYMBOL(of_find_compatible_node); + /* We may need to relocate the flat tree, do it now. + * FIXME .. and the initrd too? */ + move_device_tree(); -/** - * of_find_node_by_path - Find a node matching a full OF path - * @path: The full path to match - * - * Returns a node pointer with refcount incremented, use - * of_node_put() on it when done. - */ -struct device_node *of_find_node_by_path(const char *path) -{ - struct device_node *np = allnodes; + allocate_pacas(); - read_lock(&devtree_lock); - for (; np != 0; np = np->allnext) { - if (np->full_name != 0 && strcasecmp(np->full_name, path) == 0 - && of_node_get(np)) - break; - } - read_unlock(&devtree_lock); - return np; -} -EXPORT_SYMBOL(of_find_node_by_path); + DBG("Scanning CPUs ...\n"); -/** - * of_find_node_by_phandle - Find a node given a phandle - * @handle: phandle of the node to find - * - * Returns a node pointer with refcount incremented, use - * of_node_put() on it when done. - */ -struct device_node *of_find_node_by_phandle(phandle handle) -{ - struct device_node *np; + /* Retrieve CPU related informations from the flat tree + * (altivec support, boot CPU ID, ...) 
+ */ + of_scan_flat_dt(early_init_dt_scan_cpus, NULL); + if (boot_cpuid < 0) { + printk("Failed to indentify boot CPU !\n"); + BUG(); + } - read_lock(&devtree_lock); - for (np = allnodes; np != 0; np = np->allnext) - if (np->linux_phandle == handle) - break; - if (np) - of_node_get(np); - read_unlock(&devtree_lock); - return np; -} -EXPORT_SYMBOL(of_find_node_by_phandle); +#if defined(CONFIG_SMP) && defined(CONFIG_PPC64) + /* We'll later wait for secondaries to check in; there are + * NCPUS-1 non-boot CPUs :-) + */ + spinning_secondaries = boot_cpu_count - 1; +#endif -/** - * of_find_all_nodes - Get next node in global list - * @prev: Previous node or NULL to start iteration - * of_node_put() will be called on it - * - * Returns a node pointer with refcount incremented, use - * of_node_put() on it when done. - */ -struct device_node *of_find_all_nodes(struct device_node *prev) -{ - struct device_node *np; +#ifdef CONFIG_PPC_POWERNV + /* Scan and build the list of machine check recoverable ranges */ + of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL); +#endif - read_lock(&devtree_lock); - np = prev ? prev->allnext : allnodes; - for (; np != 0; np = np->allnext) - if (of_node_get(np)) - break; - if (prev) - of_node_put(prev); - read_unlock(&devtree_lock); - return np; + DBG(" <- early_init_devtree()\n"); } -EXPORT_SYMBOL(of_find_all_nodes); -/** - * of_get_parent - Get a node's parent if any - * @node: Node to get parent - * - * Returns a node pointer with refcount incremented, use - * of_node_put() on it when done. +#ifdef CONFIG_RELOCATABLE +/* + * This function run before early_init_devtree, so we have to init + * initial_boot_params. 
*/ -struct device_node *of_get_parent(const struct device_node *node) +void __init early_get_first_memblock_info(void *params, phys_addr_t *size) { - struct device_node *np; + /* Setup flat device-tree pointer */ + initial_boot_params = params; - if (!node) - return NULL; + /* + * Scan the memory nodes and set add_mem_to_memblock to 0 to avoid + * mess the memblock. + */ + add_mem_to_memblock = 0; + of_scan_flat_dt(early_init_dt_scan_root, NULL); + of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL); + add_mem_to_memblock = 1; - read_lock(&devtree_lock); - np = of_node_get(node->parent); - read_unlock(&devtree_lock); - return np; + if (size) + *size = first_memblock_size; } -EXPORT_SYMBOL(of_get_parent); +#endif -/** - * of_get_next_child - Iterate a node childs - * @node: parent node - * @prev: previous child of the parent node, or NULL to get first +/******* * - * Returns a node pointer with refcount incremented, use - * of_node_put() on it when done. - */ -struct device_node *of_get_next_child(const struct device_node *node, - struct device_node *prev) -{ - struct device_node *next; - - read_lock(&devtree_lock); - next = prev ? prev->sibling : node->child; - for (; next != 0; next = next->sibling) - if (of_node_get(next)) - break; - if (prev) - of_node_put(prev); - read_unlock(&devtree_lock); - return next; -} -EXPORT_SYMBOL(of_get_next_child); - -/** - * of_node_get - Increment refcount of a node - * @node: Node to inc refcount, NULL is supported to - * simplify writing of callers + * New implementation of the OF "find" APIs, return a refcounted + * object, call of_node_put() when done. The device tree and list + * are protected by a rw_lock. * - * Returns node. 
- */ -struct device_node *of_node_get(struct device_node *node) -{ - if (node) - kref_get(&node->kref); - return node; -} -EXPORT_SYMBOL(of_node_get); - -static inline struct device_node * kref_to_device_node(struct kref *kref) -{ - return container_of(kref, struct device_node, kref); -} + * Note that property management will need some locking as well, + * this isn't dealt with yet. + * + *******/ /** - * of_node_release - release a dynamically allocated node - * @kref: kref element of the node to be released + * of_get_ibm_chip_id - Returns the IBM "chip-id" of a device + * @np: device node of the device * - * In of_node_put() this function is passed to kref_put() - * as the destructor. + * This looks for a property "ibm,chip-id" in the node or any + * of its parents and returns its content, or -1 if it cannot + * be found. */ -static void of_node_release(struct kref *kref) +int of_get_ibm_chip_id(struct device_node *np) { - struct device_node *node = kref_to_device_node(kref); - struct property *prop = node->properties; + of_node_get(np); + while(np) { + struct device_node *old = np; + const __be32 *prop; - if (!OF_IS_DYNAMIC(node)) - return; - while (prop) { - struct property *next = prop->next; - kfree(prop->name); - kfree(prop->value); - kfree(prop); - prop = next; + prop = of_get_property(np, "ibm,chip-id", NULL); + if (prop) { + of_node_put(np); + return be32_to_cpup(prop); + } + np = of_get_parent(np); + of_node_put(old); } - kfree(node->intrs); - kfree(node->addrs); - kfree(node->full_name); - kfree(node->data); - kfree(node); + return -1; } /** - * of_node_put - Decrement refcount of a node - * @node: Node to dec refcount, NULL is supported to - * simplify writing of callers + * cpu_to_chip_id - Return the cpus chip-id + * @cpu: The logical cpu number. * + * Return the value of the ibm,chip-id property corresponding to the given + * logical cpu number. If the chip-id can not be found, returns -1. 
*/ -void of_node_put(struct device_node *node) -{ - if (node) - kref_put(&node->kref, of_node_release); -} -EXPORT_SYMBOL(of_node_put); - -/* - * Plug a device node into the tree and global list. - */ -void of_attach_node(struct device_node *np) -{ - write_lock(&devtree_lock); - np->sibling = np->parent->child; - np->allnext = allnodes; - np->parent->child = np; - allnodes = np; - write_unlock(&devtree_lock); -} - -/* - * "Unplug" a node from the device tree. The caller must hold - * a reference to the node. The memory associated with the node - * is not freed until its refcount goes to zero. - */ -void of_detach_node(const struct device_node *np) +int cpu_to_chip_id(int cpu) { - struct device_node *parent; - - write_lock(&devtree_lock); - - parent = np->parent; - - if (allnodes == np) - allnodes = np->allnext; - else { - struct device_node *prev; - for (prev = allnodes; - prev->allnext != np; - prev = prev->allnext) - ; - prev->allnext = np->allnext; - } + struct device_node *np; - if (parent->child == np) - parent->child = np->sibling; - else { - struct device_node *prevsib; - for (prevsib = np->parent->child; - prevsib->sibling != np; - prevsib = prevsib->sibling) - ; - prevsib->sibling = np->sibling; - } + np = of_get_cpu_node(cpu, NULL); + if (!np) + return -1; - write_unlock(&devtree_lock); + of_node_put(np); + return of_get_ibm_chip_id(np); } +EXPORT_SYMBOL(cpu_to_chip_id); #ifdef CONFIG_PPC_PSERIES /* * Fix up the uninitialized fields in a new device node: - * name, type, n_addrs, addrs, n_intrs, intrs, and pci-specific fields - * - * A lot of boot-time code is duplicated here, because functions such - * as finish_node_interrupts, interpret_pci_props, etc. cannot use the - * slab allocator. - * - * This should probably be split up into smaller chunks. 
+ * name, type and pci-specific fields */ -static int of_finish_dynamic_node(struct device_node *node, - unsigned long *unused1, int unused2, - int unused3, int unused4) +static int of_finish_dynamic_node(struct device_node *node) { struct device_node *parent = of_get_parent(node); int err = 0; - phandle *ibm_phandle; + const phandle *ibm_phandle; + + node->name = of_get_property(node, "name", NULL); + node->type = of_get_property(node, "device_type", NULL); - node->name = get_property(node, "name", NULL); - node->type = get_property(node, "device_type", NULL); + if (!node->name) + node->name = "<NULL>"; + if (!node->type) + node->type = "<NULL>"; if (!parent) { err = -ENODEV; @@ -1920,12 +849,12 @@ static int of_finish_dynamic_node(struct device_node *node, /* We don't support that function on PowerMac, at least * not yet */ - if (systemcfg->platform == PLATFORM_POWERMAC) + if (machine_is(powermac)) return -ENODEV; - /* fix up new node's linux_phandle field */ - if ((ibm_phandle = (unsigned int *)get_property(node, "ibm,phandle", NULL))) - node->linux_phandle = *ibm_phandle; + /* fix up new node's phandle field */ + if ((ibm_phandle = of_get_property(node, "ibm,phandle", NULL))) + node->phandle = *ibm_phandle; out: of_node_put(parent); @@ -1938,18 +867,16 @@ static int prom_reconfig_notifier(struct notifier_block *nb, int err; switch (action) { - case PSERIES_RECONFIG_ADD: - err = finish_node(node, NULL, of_finish_dynamic_node, 0, 0, 0); - if (err < 0) { + case OF_RECONFIG_ATTACH_NODE: + err = of_finish_dynamic_node(node); + if (err < 0) printk(KERN_ERR "finish_node returned %d\n", err); - err = NOTIFY_BAD; - } break; default: - err = NOTIFY_DONE; + err = 0; break; } - return err; + return notifier_from_errno(err); } static struct notifier_block prom_reconfig_nb = { @@ -1959,212 +886,12 @@ static struct notifier_block prom_reconfig_nb = { static int __init prom_reconfig_setup(void) { - return pSeries_reconfig_notifier_register(&prom_reconfig_nb); + return 
of_reconfig_notifier_register(&prom_reconfig_nb); } __initcall(prom_reconfig_setup); #endif -/* - * Find a property with a given name for a given node - * and return the value. - */ -unsigned char *get_property(struct device_node *np, const char *name, - int *lenp) -{ - struct property *pp; - - for (pp = np->properties; pp != 0; pp = pp->next) - if (strcmp(pp->name, name) == 0) { - if (lenp != 0) - *lenp = pp->length; - return pp->value; - } - return NULL; -} -EXPORT_SYMBOL(get_property); - -/* - * Add a property to a node - */ -void prom_add_property(struct device_node* np, struct property* prop) -{ - struct property **next = &np->properties; - - prop->next = NULL; - while (*next) - next = &(*next)->next; - *next = prop; -} - -/* I quickly hacked that one, check against spec ! */ -static inline unsigned long -bus_space_to_resource_flags(unsigned int bus_space) -{ - u8 space = (bus_space >> 24) & 0xf; - if (space == 0) - space = 0x02; - if (space == 0x02) - return IORESOURCE_MEM; - else if (space == 0x01) - return IORESOURCE_IO; - else { - printk(KERN_WARNING "prom.c: bus_space_to_resource_flags(), space: %x\n", - bus_space); - return 0; - } -} - -#ifdef CONFIG_PCI -static struct resource *find_parent_pci_resource(struct pci_dev* pdev, - struct address_range *range) -{ - unsigned long mask; - int i; - - /* Check this one */ - mask = bus_space_to_resource_flags(range->space); - for (i=0; i<DEVICE_COUNT_RESOURCE; i++) { - if ((pdev->resource[i].flags & mask) == mask && - pdev->resource[i].start <= range->address && - pdev->resource[i].end > range->address) { - if ((range->address + range->size - 1) > pdev->resource[i].end) { - /* Add better message */ - printk(KERN_WARNING "PCI/OF resource overlap !\n"); - return NULL; - } - break; - } - } - if (i == DEVICE_COUNT_RESOURCE) - return NULL; - return &pdev->resource[i]; -} - -/* - * Request an OF device resource. Currently handles child of PCI devices, - * or other nodes attached to the root node. 
Ultimately, put some - * link to resources in the OF node. - */ -struct resource *request_OF_resource(struct device_node* node, int index, - const char* name_postfix) +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) { - struct pci_dev* pcidev; - u8 pci_bus, pci_devfn; - unsigned long iomask; - struct device_node* nd; - struct resource* parent; - struct resource *res = NULL; - int nlen, plen; - - if (index >= node->n_addrs) - goto fail; - - /* Sanity check on bus space */ - iomask = bus_space_to_resource_flags(node->addrs[index].space); - if (iomask & IORESOURCE_MEM) - parent = &iomem_resource; - else if (iomask & IORESOURCE_IO) - parent = &ioport_resource; - else - goto fail; - - /* Find a PCI parent if any */ - nd = node; - pcidev = NULL; - while (nd) { - if (!pci_device_from_OF_node(nd, &pci_bus, &pci_devfn)) - pcidev = pci_find_slot(pci_bus, pci_devfn); - if (pcidev) break; - nd = nd->parent; - } - if (pcidev) - parent = find_parent_pci_resource(pcidev, &node->addrs[index]); - if (!parent) { - printk(KERN_WARNING "request_OF_resource(%s), parent not found\n", - node->name); - goto fail; - } - - res = __request_region(parent, node->addrs[index].address, - node->addrs[index].size, NULL); - if (!res) - goto fail; - nlen = strlen(node->name); - plen = name_postfix ? 
strlen(name_postfix) : 0; - res->name = (const char *)kmalloc(nlen+plen+1, GFP_KERNEL); - if (res->name) { - strcpy((char *)res->name, node->name); - if (plen) - strcpy((char *)res->name+nlen, name_postfix); - } - return res; -fail: - return NULL; -} -EXPORT_SYMBOL(request_OF_resource); - -int release_OF_resource(struct device_node *node, int index) -{ - struct pci_dev* pcidev; - u8 pci_bus, pci_devfn; - unsigned long iomask, start, end; - struct device_node* nd; - struct resource* parent; - struct resource *res = NULL; - - if (index >= node->n_addrs) - return -EINVAL; - - /* Sanity check on bus space */ - iomask = bus_space_to_resource_flags(node->addrs[index].space); - if (iomask & IORESOURCE_MEM) - parent = &iomem_resource; - else if (iomask & IORESOURCE_IO) - parent = &ioport_resource; - else - return -EINVAL; - - /* Find a PCI parent if any */ - nd = node; - pcidev = NULL; - while(nd) { - if (!pci_device_from_OF_node(nd, &pci_bus, &pci_devfn)) - pcidev = pci_find_slot(pci_bus, pci_devfn); - if (pcidev) break; - nd = nd->parent; - } - if (pcidev) - parent = find_parent_pci_resource(pcidev, &node->addrs[index]); - if (!parent) { - printk(KERN_WARNING "release_OF_resource(%s), parent not found\n", - node->name); - return -ENODEV; - } - - /* Find us in the parent and its childs */ - res = parent->child; - start = node->addrs[index].address; - end = start + node->addrs[index].size - 1; - while (res) { - if (res->start == start && res->end == end && - (res->flags & IORESOURCE_BUSY)) - break; - if (res->start <= start && res->end >= end) - res = res->child; - else - res = res->sibling; - } - if (!res) - return -ENODEV; - - if (res->name) { - kfree(res->name); - res->name = NULL; - } - release_resource(res); - kfree(res); - - return 0; + return (int)phys_id == get_hard_smp_processor_id(cpu); } -EXPORT_SYMBOL(release_OF_resource); -#endif /* CONFIG_PCI */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 9750b3cd8ec..1a85d8f9673 
100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -16,7 +16,6 @@ #undef DEBUG_PROM #include <stdarg.h> -#include <linux/config.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/init.h> @@ -36,7 +35,6 @@ #include <asm/irq.h> #include <asm/io.h> #include <asm/smp.h> -#include <asm/system.h> #include <asm/mmu.h> #include <asm/pgtable.h> #include <asm/pci.h> @@ -44,19 +42,9 @@ #include <asm/btext.h> #include <asm/sections.h> #include <asm/machdep.h> +#include <asm/opal.h> -#ifdef CONFIG_LOGO_LINUX_CLUT224 #include <linux/linux_logo.h> -extern const struct linux_logo logo_linux_clut224; -#endif - -/* - * Properties whose value is longer than this get excluded from our - * copy of the device tree. This value does need to be big enough to - * ensure that we don't lose things like the interrupt-map property - * on a PCI-PCI bridge. - */ -#define MAX_PROPERTY_LENGTH (1UL * 1024 * 1024) /* * Eventually bump that one up @@ -78,8 +66,8 @@ extern const struct linux_logo logo_linux_clut224; * is running at whatever address it has been loaded at. * On ppc32 we compile with -mrelocatable, which means that references * to extern and static variables get relocated automatically. - * On ppc64 we have to relocate the references explicitly with - * RELOC. (Note that strings count as static variables.) + * ppc64 objects are always relocatable, we just need to relocate the + * TOC. * * Because OF may have mapped I/O devices into the area starting at * KERNELBASE, particularly on CHRP machines, we can't safely call @@ -91,17 +79,21 @@ extern const struct linux_logo logo_linux_clut224; * On ppc64, 64 bit values are truncated to 32 bits (and * fortunately don't get interpreted as two arguments). 
*/ +#define ADDR(x) (u32)(unsigned long)(x) + #ifdef CONFIG_PPC64 -#define RELOC(x) (*PTRRELOC(&(x))) -#define ADDR(x) (u32) add_reloc_offset((unsigned long)(x)) +#define OF_WORKAROUNDS 0 #else -#define RELOC(x) (x) -#define ADDR(x) (u32) (x) +#define OF_WORKAROUNDS of_workarounds +int of_workarounds; #endif +#define OF_WA_CLAIM 1 /* do phys/virt claim separately, then map */ +#define OF_WA_LONGTRAIL 2 /* work around longtrail bugs */ + #define PROM_BUG() do { \ prom_printf("kernel BUG at %s line 0x%x!\n", \ - RELOC(__FILE__), __LINE__); \ + __FILE__, __LINE__); \ __asm__ __volatile__(".long " BUG_ILLEGAL_INSTR); \ } while (0) @@ -111,37 +103,35 @@ extern const struct linux_logo logo_linux_clut224; #define prom_debug(x...) #endif -#ifdef CONFIG_PPC32 -#define PLATFORM_POWERMAC _MACH_Pmac -#define PLATFORM_CHRP _MACH_chrp -#endif - typedef u32 prom_arg_t; struct prom_args { - u32 service; - u32 nargs; - u32 nret; - prom_arg_t args[10]; + __be32 service; + __be32 nargs; + __be32 nret; + __be32 args[10]; }; struct prom_t { ihandle root; - ihandle chosen; + phandle chosen; int cpu; ihandle stdout; ihandle mmumap; + ihandle memory; }; struct mem_map_entry { - unsigned long base; - unsigned long size; + __be64 base; + __be64 size; }; -typedef u32 cell_t; +typedef __be32 cell_t; -extern void __start(unsigned long r3, unsigned long r4, unsigned long r5); +extern void __start(unsigned long r3, unsigned long r4, unsigned long r5, + unsigned long r6, unsigned long r7, unsigned long r8, + unsigned long r9); #ifdef CONFIG_PPC64 extern int enter_prom(struct prom_args *args, unsigned long entry); @@ -172,12 +162,23 @@ static unsigned long __initdata dt_string_start, dt_string_end; static unsigned long __initdata prom_initrd_start, prom_initrd_end; #ifdef CONFIG_PPC64 -static int __initdata iommu_force_on; -static int __initdata ppc64_iommu_off; +static int __initdata prom_iommu_force_on; +static int __initdata prom_iommu_off; static unsigned long __initdata prom_tce_alloc_start; 
static unsigned long __initdata prom_tce_alloc_end; #endif +/* Platforms codes are now obsolete in the kernel. Now only used within this + * file and ultimately gone too. Feel free to change them if you need, they + * are not shared with anything outside of this file anymore + */ +#define PLATFORM_PSERIES 0x0100 +#define PLATFORM_PSERIES_LPAR 0x0101 +#define PLATFORM_LPAR 0x0001 +#define PLATFORM_POWERMAC 0x0400 +#define PLATFORM_GENERIC 0x0500 +#define PLATFORM_OPAL 0x0600 + static int __initdata of_platform; static char __initdata prom_cmd_line[COMMAND_LINE_SIZE]; @@ -195,16 +196,8 @@ static int __initdata mem_reserve_cnt; static cell_t __initdata regbuf[1024]; +static bool rtas_has_query_cpu_stopped; -#define MAX_CPU_THREADS 2 - -/* TO GO */ -#ifdef CONFIG_HMT -struct { - unsigned int pir; - unsigned int threadid; -} hmt_thread_data[NR_CPUS]; -#endif /* CONFIG_HMT */ /* * Error results ... some OF calls will return "-1" on error, some @@ -228,22 +221,22 @@ static int __init call_prom(const char *service, int nargs, int nret, ...) struct prom_args args; va_list list; - args.service = ADDR(service); - args.nargs = nargs; - args.nret = nret; + args.service = cpu_to_be32(ADDR(service)); + args.nargs = cpu_to_be32(nargs); + args.nret = cpu_to_be32(nret); va_start(list, nret); for (i = 0; i < nargs; i++) - args.args[i] = va_arg(list, prom_arg_t); + args.args[i] = cpu_to_be32(va_arg(list, prom_arg_t)); va_end(list); for (i = 0; i < nret; i++) args.args[nargs+i] = 0; - if (enter_prom(&args, RELOC(prom_entry)) < 0) + if (enter_prom(&args, prom_entry) < 0) return PROM_ERROR; - return (nret > 0) ? args.args[nargs] : 0; + return (nret > 0) ? 
be32_to_cpu(args.args[nargs]) : 0; } static int __init call_prom_ret(const char *service, int nargs, int nret, @@ -253,46 +246,45 @@ static int __init call_prom_ret(const char *service, int nargs, int nret, struct prom_args args; va_list list; - args.service = ADDR(service); - args.nargs = nargs; - args.nret = nret; + args.service = cpu_to_be32(ADDR(service)); + args.nargs = cpu_to_be32(nargs); + args.nret = cpu_to_be32(nret); va_start(list, rets); for (i = 0; i < nargs; i++) - args.args[i] = va_arg(list, prom_arg_t); + args.args[i] = cpu_to_be32(va_arg(list, prom_arg_t)); va_end(list); for (i = 0; i < nret; i++) - rets[nargs+i] = 0; + args.args[nargs+i] = 0; - if (enter_prom(&args, RELOC(prom_entry)) < 0) + if (enter_prom(&args, prom_entry) < 0) return PROM_ERROR; if (rets != NULL) for (i = 1; i < nret; ++i) - rets[i-1] = args.args[nargs+i]; + rets[i-1] = be32_to_cpu(args.args[nargs+i]); - return (nret > 0) ? args.args[nargs] : 0; + return (nret > 0) ? be32_to_cpu(args.args[nargs]) : 0; } static void __init prom_print(const char *msg) { const char *p, *q; - struct prom_t *_prom = &RELOC(prom); - if (_prom->stdout == 0) + if (prom.stdout == 0) return; for (p = msg; *p != 0; p = q) { for (q = p; *q != 0 && *q != '\n'; ++q) ; if (q > p) - call_prom("write", 3, 1, _prom->stdout, p, q - p); + call_prom("write", 3, 1, prom.stdout, p, q - p); if (*q == 0) break; ++q; - call_prom("write", 3, 1, _prom->stdout, ADDR("\r\n"), 2); + call_prom("write", 3, 1, prom.stdout, ADDR("\r\n"), 2); } } @@ -301,7 +293,6 @@ static void __init prom_print_hex(unsigned long val) { int i, nibbles = sizeof(val)*2; char buf[sizeof(val)*2+1]; - struct prom_t *_prom = &RELOC(prom); for (i = nibbles-1; i >= 0; i--) { buf[i] = (val & 0xf) + '0'; @@ -310,31 +301,45 @@ static void __init prom_print_hex(unsigned long val) val >>= 4; } buf[nibbles] = '\0'; - call_prom("write", 3, 1, _prom->stdout, buf, nibbles); + call_prom("write", 3, 1, prom.stdout, buf, nibbles); } +/* max number of decimal digits 
in an unsigned long */ +#define UL_DIGITS 21 +static void __init prom_print_dec(unsigned long val) +{ + int i, size; + char buf[UL_DIGITS+1]; + + for (i = UL_DIGITS-1; i >= 0; i--) { + buf[i] = (val % 10) + '0'; + val = val/10; + if (val == 0) + break; + } + /* shift stuff down */ + size = UL_DIGITS - i; + call_prom("write", 3, 1, prom.stdout, buf+i, size); +} static void __init prom_printf(const char *format, ...) { const char *p, *q, *s; va_list args; unsigned long v; - struct prom_t *_prom = &RELOC(prom); + long vs; va_start(args, format); -#ifdef CONFIG_PPC64 - format = PTRRELOC(format); -#endif for (p = format; *p != 0; p = q) { for (q = p; *q != 0 && *q != '\n' && *q != '%'; ++q) ; if (q > p) - call_prom("write", 3, 1, _prom->stdout, p, q - p); + call_prom("write", 3, 1, prom.stdout, p, q - p); if (*q == 0) break; if (*q == '\n') { ++q; - call_prom("write", 3, 1, _prom->stdout, + call_prom("write", 3, 1, prom.stdout, ADDR("\r\n"), 2); continue; } @@ -352,6 +357,37 @@ static void __init prom_printf(const char *format, ...) v = va_arg(args, unsigned long); prom_print_hex(v); break; + case 'd': + ++q; + vs = va_arg(args, int); + if (vs < 0) { + prom_print("-"); + vs = -vs; + } + prom_print_dec(vs); + break; + case 'l': + ++q; + if (*q == 0) + break; + else if (*q == 'x') { + ++q; + v = va_arg(args, unsigned long); + prom_print_hex(v); + } else if (*q == 'u') { /* '%lu' */ + ++q; + v = va_arg(args, unsigned long); + prom_print_dec(v); + } else if (*q == 'd') { /* %ld */ + ++q; + vs = va_arg(args, long); + if (vs < 0) { + prom_print("-"); + vs = -vs; + } + prom_print_dec(vs); + } + break; } } } @@ -360,25 +396,46 @@ static void __init prom_printf(const char *format, ...) 
static unsigned int __init prom_claim(unsigned long virt, unsigned long size, unsigned long align) { - int ret; - struct prom_t *_prom = &RELOC(prom); - ret = call_prom("claim", 3, 1, (prom_arg_t)virt, (prom_arg_t)size, - (prom_arg_t)align); - if (ret != -1 && _prom->mmumap != 0) - /* old pmacs need us to map as well */ + if (align == 0 && (OF_WORKAROUNDS & OF_WA_CLAIM)) { + /* + * Old OF requires we claim physical and virtual separately + * and then map explicitly (assuming virtual mode) + */ + int ret; + prom_arg_t result; + + ret = call_prom_ret("call-method", 5, 2, &result, + ADDR("claim"), prom.memory, + align, size, virt); + if (ret != 0 || result == -1) + return -1; + ret = call_prom_ret("call-method", 5, 2, &result, + ADDR("claim"), prom.mmumap, + align, size, virt); + if (ret != 0) { + call_prom("call-method", 4, 1, ADDR("release"), + prom.memory, size, virt); + return -1; + } + /* the 0x12 is M (coherence) + PP == read/write */ call_prom("call-method", 6, 1, - ADDR("map"), _prom->mmumap, 0, size, virt, virt); - return ret; + ADDR("map"), prom.mmumap, 0x12, size, virt, virt); + return virt; + } + return call_prom("claim", 3, 1, (prom_arg_t)virt, (prom_arg_t)size, + (prom_arg_t)align); } static void __init __attribute__((noreturn)) prom_panic(const char *reason) { -#ifdef CONFIG_PPC64 - reason = PTRRELOC(reason); -#endif prom_print(reason); - /* ToDo: should put up an SRC here on p/iSeries */ + /* Do not call exit because it clears the screen on pmac + * it also causes some sort of double-fault on early pmacs */ + if (of_platform == PLATFORM_POWERMAC) + asm("trap\n"); + + /* ToDo: should put up an SRC here on pSeries */ call_prom("exit", 0, 0); for (;;) /* should never get here */ @@ -403,26 +460,67 @@ static int __init prom_next_node(phandle *nodep) } } -static int __init prom_getprop(phandle node, const char *pname, +static int inline prom_getprop(phandle node, const char *pname, void *value, size_t valuelen) { return call_prom("getprop", 4, 1, node, 
ADDR(pname), (u32)(unsigned long) value, (u32) valuelen); } -static int __init prom_getproplen(phandle node, const char *pname) +static int inline prom_getproplen(phandle node, const char *pname) { return call_prom("getproplen", 2, 1, node, ADDR(pname)); } -static int __init prom_setprop(phandle node, const char *pname, - void *value, size_t valuelen) +static void add_string(char **str, const char *q) { - return call_prom("setprop", 4, 1, node, ADDR(pname), - (u32)(unsigned long) value, (u32) valuelen); + char *p = *str; + + while (*q) + *p++ = *q++; + *p++ = ' '; + *str = p; +} + +static char *tohex(unsigned int x) +{ + static char digits[] = "0123456789abcdef"; + static char result[9]; + int i; + + result[8] = 0; + i = 8; + do { + --i; + result[i] = digits[x & 0xf]; + x >>= 4; + } while (x != 0 && i > 0); + return &result[i]; +} + +static int __init prom_setprop(phandle node, const char *nodename, + const char *pname, void *value, size_t valuelen) +{ + char cmd[256], *p; + + if (!(OF_WORKAROUNDS & OF_WA_LONGTRAIL)) + return call_prom("setprop", 4, 1, node, ADDR(pname), + (u32)(unsigned long) value, (u32) valuelen); + + /* gah... setprop doesn't work on longtrail, have to use interpret */ + p = cmd; + add_string(&p, "dev"); + add_string(&p, nodename); + add_string(&p, tohex((u32)(unsigned long) value)); + add_string(&p, tohex(valuelen)); + add_string(&p, tohex(ADDR(pname))); + add_string(&p, tohex(strlen(pname))); + add_string(&p, "property"); + *p = 0; + return call_prom("interpret", 1, 1, (u32)(unsigned long) cmd); } -/* We can't use the standard versions because of RELOC headaches. */ +/* We can't use the standard versions because of relocation headaches. */ #define isxdigit(c) (('0' <= (c) && (c) <= '9') \ || ('a' <= (c) && (c) <= 'f') \ || ('A' <= (c) && (c) <= 'F')) @@ -431,7 +529,7 @@ static int __init prom_setprop(phandle node, const char *pname, #define islower(c) ('a' <= (c) && (c) <= 'z') #define toupper(c) (islower(c) ? 
((c) - 'a' + 'A') : (c)) -unsigned long prom_strtoul(const char *cp, const char **endp) +static unsigned long prom_strtoul(const char *cp, const char **endp) { unsigned long result = 0, base = 10, value; @@ -456,7 +554,7 @@ unsigned long prom_strtoul(const char *cp, const char **endp) return result; } -unsigned long prom_memparse(const char *ptr, const char **retptr) +static unsigned long prom_memparse(const char *ptr, const char **retptr) { unsigned long ret = prom_strtoul(ptr, retptr); int shift = 0; @@ -489,52 +587,147 @@ unsigned long prom_memparse(const char *ptr, const char **retptr) */ static void __init early_cmdline_parse(void) { - struct prom_t *_prom = &RELOC(prom); - char *opt, *p; + const char *opt; + + char *p; int l = 0; - RELOC(prom_cmd_line[0]) = 0; - p = RELOC(prom_cmd_line); - if ((long)_prom->chosen > 0) - l = prom_getprop(_prom->chosen, "bootargs", p, COMMAND_LINE_SIZE-1); + prom_cmd_line[0] = 0; + p = prom_cmd_line; + if ((long)prom.chosen > 0) + l = prom_getprop(prom.chosen, "bootargs", p, COMMAND_LINE_SIZE-1); #ifdef CONFIG_CMDLINE - if (l == 0) /* dbl check */ - strlcpy(RELOC(prom_cmd_line), - RELOC(CONFIG_CMDLINE), sizeof(prom_cmd_line)); + if (l <= 0 || p[0] == '\0') /* dbl check */ + strlcpy(prom_cmd_line, + CONFIG_CMDLINE, sizeof(prom_cmd_line)); #endif /* CONFIG_CMDLINE */ - prom_printf("command line: %s\n", RELOC(prom_cmd_line)); + prom_printf("command line: %s\n", prom_cmd_line); #ifdef CONFIG_PPC64 - opt = strstr(RELOC(prom_cmd_line), RELOC("iommu=")); + opt = strstr(prom_cmd_line, "iommu="); if (opt) { prom_printf("iommu opt is: %s\n", opt); opt += 6; while (*opt && *opt == ' ') opt++; - if (!strncmp(opt, RELOC("off"), 3)) - RELOC(ppc64_iommu_off) = 1; - else if (!strncmp(opt, RELOC("force"), 5)) - RELOC(iommu_force_on) = 1; + if (!strncmp(opt, "off", 3)) + prom_iommu_off = 1; + else if (!strncmp(opt, "force", 5)) + prom_iommu_force_on = 1; } #endif - - opt = strstr(RELOC(prom_cmd_line), RELOC("mem=")); + opt = 
strstr(prom_cmd_line, "mem="); if (opt) { opt += 4; - RELOC(prom_memory_limit) = prom_memparse(opt, (const char **)&opt); + prom_memory_limit = prom_memparse(opt, (const char **)&opt); #ifdef CONFIG_PPC64 /* Align to 16 MB == size of ppc64 large page */ - RELOC(prom_memory_limit) = ALIGN(RELOC(prom_memory_limit), 0x1000000); + prom_memory_limit = ALIGN(prom_memory_limit, 0x1000000); #endif } } -#ifdef CONFIG_PPC_PSERIES +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* - * To tell the firmware what our capabilities are, we have to pass - * it a fake 32-bit ELF header containing a couple of PT_NOTE sections - * that contain structures that contain the actual values. + * The architecture vector has an array of PVR mask/value pairs, + * followed by # option vectors - 1, followed by the option vectors. + * + * See prom.h for the definition of the bits specified in the + * architecture vector. + * + * Because the description vector contains a mix of byte and word + * values, we declare it as an unsigned char array, and use this + * macro to put word values in. 
*/ +#define W(x) ((x) >> 24) & 0xff, ((x) >> 16) & 0xff, \ + ((x) >> 8) & 0xff, (x) & 0xff + +unsigned char ibm_architecture_vec[] = { + W(0xfffe0000), W(0x003a0000), /* POWER5/POWER5+ */ + W(0xffff0000), W(0x003e0000), /* POWER6 */ + W(0xffff0000), W(0x003f0000), /* POWER7 */ + W(0xffff0000), W(0x004b0000), /* POWER8E */ + W(0xffff0000), W(0x004d0000), /* POWER8 */ + W(0xffffffff), W(0x0f000004), /* all 2.07-compliant */ + W(0xffffffff), W(0x0f000003), /* all 2.06-compliant */ + W(0xffffffff), W(0x0f000002), /* all 2.05-compliant */ + W(0xfffffffe), W(0x0f000001), /* all 2.04-compliant and earlier */ + 6 - 1, /* 6 option vectors */ + + /* option vector 1: processor architectures supported */ + 3 - 2, /* length */ + 0, /* don't ignore, don't halt */ + OV1_PPC_2_00 | OV1_PPC_2_01 | OV1_PPC_2_02 | OV1_PPC_2_03 | + OV1_PPC_2_04 | OV1_PPC_2_05 | OV1_PPC_2_06 | OV1_PPC_2_07, + + /* option vector 2: Open Firmware options supported */ + 34 - 2, /* length */ + OV2_REAL_MODE, + 0, 0, + W(0xffffffff), /* real_base */ + W(0xffffffff), /* real_size */ + W(0xffffffff), /* virt_base */ + W(0xffffffff), /* virt_size */ + W(0xffffffff), /* load_base */ + W(256), /* 256MB min RMA */ + W(0xffffffff), /* full client load */ + 0, /* min RMA percentage of total RAM */ + 48, /* max log_2(hash table size) */ + + /* option vector 3: processor options supported */ + 3 - 2, /* length */ + 0, /* don't ignore, don't halt */ + OV3_FP | OV3_VMX | OV3_DFP, + + /* option vector 4: IBM PAPR implementation */ + 3 - 2, /* length */ + 0, /* don't halt */ + OV4_MIN_ENT_CAP, /* minimum VP entitled capacity */ + + /* option vector 5: PAPR/OF options */ + 19 - 2, /* length */ + 0, /* don't ignore, don't halt */ + OV5_FEAT(OV5_LPAR) | OV5_FEAT(OV5_SPLPAR) | OV5_FEAT(OV5_LARGE_PAGES) | + OV5_FEAT(OV5_DRCONF_MEMORY) | OV5_FEAT(OV5_DONATE_DEDICATE_CPU) | +#ifdef CONFIG_PCI_MSI + /* PCIe/MSI support. 
Without MSI full PCIe is not supported */ + OV5_FEAT(OV5_MSI), +#else + 0, +#endif + 0, +#ifdef CONFIG_PPC_SMLPAR + OV5_FEAT(OV5_CMO) | OV5_FEAT(OV5_XCMO), +#else + 0, +#endif + OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN), + 0, + 0, + 0, + /* WARNING: The offset of the "number of cores" field below + * must match by the macro below. Update the definition if + * the structure layout changes. + */ +#define IBM_ARCH_VEC_NRCORES_OFFSET 125 + W(NR_CPUS), /* number of cores supported */ + 0, + 0, + 0, + 0, + OV5_FEAT(OV5_PFO_HW_RNG) | OV5_FEAT(OV5_PFO_HW_ENCR) | + OV5_FEAT(OV5_PFO_HW_842), + OV5_FEAT(OV5_SUB_PROCESSORS), + /* option vector 6: IBM PAPR hints */ + 4 - 2, /* length */ + 0, + 0, + OV6_LINUX, + +}; + +/* Old method - ELF header with PT_NOTE sections only works on BE */ +#ifdef __BIG_ENDIAN__ static struct fake_elf { Elf32_Ehdr elfhdr; Elf32_Phdr phdr[2]; @@ -620,21 +813,124 @@ static struct fake_elf { } } }; +#endif /* __BIG_ENDIAN__ */ + +static int __init prom_count_smt_threads(void) +{ + phandle node; + char type[64]; + unsigned int plen; + + /* Pick up th first CPU node we can find */ + for (node = 0; prom_next_node(&node); ) { + type[0] = 0; + prom_getprop(node, "device_type", type, sizeof(type)); + + if (strcmp(type, "cpu")) + continue; + /* + * There is an entry for each smt thread, each entry being + * 4 bytes long. All cpus should have the same number of + * smt threads, so return after finding the first. 
+ */ + plen = prom_getproplen(node, "ibm,ppc-interrupt-server#s"); + if (plen == PROM_ERROR) + break; + plen >>= 2; + prom_debug("Found %lu smt threads per core\n", (unsigned long)plen); + + /* Sanity check */ + if (plen < 1 || plen > 64) { + prom_printf("Threads per core %lu out of bounds, assuming 1\n", + (unsigned long)plen); + return 1; + } + return plen; + } + prom_debug("No threads found, assuming 1 per core\n"); + + return 1; + +} + static void __init prom_send_capabilities(void) { - ihandle elfloader; + ihandle root; + prom_arg_t ret; + u32 cores; + unsigned char *ptcores; - elfloader = call_prom("open", 1, 1, ADDR("/packages/elf-loader")); - if (elfloader == 0) { - prom_printf("couldn't open /packages/elf-loader\n"); - return; + root = call_prom("open", 1, 1, ADDR("/")); + if (root != 0) { + /* We need to tell the FW about the number of cores we support. + * + * To do that, we count the number of threads on the first core + * (we assume this is the same for all cores) and use it to + * divide NR_CPUS. + */ + + /* The core value may start at an odd address. If such a word + * access is made at a cache line boundary, this leads to an + * exception which may not be handled at this time. + * Forcing a per byte access to avoid exception. + */ + ptcores = &ibm_architecture_vec[IBM_ARCH_VEC_NRCORES_OFFSET]; + cores = 0; + cores |= ptcores[0] << 24; + cores |= ptcores[1] << 16; + cores |= ptcores[2] << 8; + cores |= ptcores[3]; + if (cores != NR_CPUS) { + prom_printf("WARNING ! 
" + "ibm_architecture_vec structure inconsistent: %lu!\n", + cores); + } else { + cores = DIV_ROUND_UP(NR_CPUS, prom_count_smt_threads()); + prom_printf("Max number of cores passed to firmware: %lu (NR_CPUS = %lu)\n", + cores, NR_CPUS); + ptcores[0] = (cores >> 24) & 0xff; + ptcores[1] = (cores >> 16) & 0xff; + ptcores[2] = (cores >> 8) & 0xff; + ptcores[3] = cores & 0xff; + } + + /* try calling the ibm,client-architecture-support method */ + prom_printf("Calling ibm,client-architecture-support..."); + if (call_prom_ret("call-method", 3, 2, &ret, + ADDR("ibm,client-architecture-support"), + root, + ADDR(ibm_architecture_vec)) == 0) { + /* the call exists... */ + if (ret) + prom_printf("\nWARNING: ibm,client-architecture" + "-support call FAILED!\n"); + call_prom("close", 1, 0, root); + prom_printf(" done\n"); + return; + } + call_prom("close", 1, 0, root); + prom_printf(" not implemented\n"); + } + +#ifdef __BIG_ENDIAN__ + { + ihandle elfloader; + + /* no ibm,client-architecture-support call, try the old way */ + elfloader = call_prom("open", 1, 1, + ADDR("/packages/elf-loader")); + if (elfloader == 0) { + prom_printf("couldn't open /packages/elf-loader\n"); + return; + } + call_prom("call-method", 3, 1, ADDR("process-elf-header"), + elfloader, ADDR(&fake_elf)); + call_prom("close", 1, 0, elfloader); } - call_prom("call-method", 3, 1, ADDR("process-elf-header"), - elfloader, ADDR(&fake_elf)); - call_prom("close", 1, 0, elfloader); +#endif /* __BIG_ENDIAN__ */ } -#endif +#endif /* #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ /* * Memory allocation strategy... 
our layout is normally: @@ -671,21 +967,21 @@ static void __init prom_send_capabilities(void) */ static unsigned long __init alloc_up(unsigned long size, unsigned long align) { - unsigned long base = RELOC(alloc_bottom); + unsigned long base = alloc_bottom; unsigned long addr = 0; if (align) base = _ALIGN_UP(base, align); prom_debug("alloc_up(%x, %x)\n", size, align); - if (RELOC(ram_top) == 0) + if (ram_top == 0) prom_panic("alloc_up() called with mem not initialized\n"); if (align) - base = _ALIGN_UP(RELOC(alloc_bottom), align); + base = _ALIGN_UP(alloc_bottom, align); else - base = RELOC(alloc_bottom); + base = alloc_bottom; - for(; (base + size) <= RELOC(alloc_top); + for(; (base + size) <= alloc_top; base = _ALIGN_UP(base + 0x100000, align)) { prom_debug(" trying: 0x%x\n\r", base); addr = (unsigned long)prom_claim(base, size, 0); @@ -697,14 +993,14 @@ static unsigned long __init alloc_up(unsigned long size, unsigned long align) } if (addr == 0) return 0; - RELOC(alloc_bottom) = addr; + alloc_bottom = addr + size; prom_debug(" -> %x\n", addr); - prom_debug(" alloc_bottom : %x\n", RELOC(alloc_bottom)); - prom_debug(" alloc_top : %x\n", RELOC(alloc_top)); - prom_debug(" alloc_top_hi : %x\n", RELOC(alloc_top_high)); - prom_debug(" rmo_top : %x\n", RELOC(rmo_top)); - prom_debug(" ram_top : %x\n", RELOC(ram_top)); + prom_debug(" alloc_bottom : %x\n", alloc_bottom); + prom_debug(" alloc_top : %x\n", alloc_top); + prom_debug(" alloc_top_hi : %x\n", alloc_top_high); + prom_debug(" rmo_top : %x\n", rmo_top); + prom_debug(" ram_top : %x\n", ram_top); return addr; } @@ -720,32 +1016,32 @@ static unsigned long __init alloc_down(unsigned long size, unsigned long align, unsigned long base, addr = 0; prom_debug("alloc_down(%x, %x, %s)\n", size, align, - highmem ? RELOC("(high)") : RELOC("(low)")); - if (RELOC(ram_top) == 0) + highmem ? 
"(high)" : "(low)"); + if (ram_top == 0) prom_panic("alloc_down() called with mem not initialized\n"); if (highmem) { /* Carve out storage for the TCE table. */ - addr = _ALIGN_DOWN(RELOC(alloc_top_high) - size, align); - if (addr <= RELOC(alloc_bottom)) + addr = _ALIGN_DOWN(alloc_top_high - size, align); + if (addr <= alloc_bottom) return 0; /* Will we bump into the RMO ? If yes, check out that we * didn't overlap existing allocations there, if we did, * we are dead, we must be the first in town ! */ - if (addr < RELOC(rmo_top)) { + if (addr < rmo_top) { /* Good, we are first */ - if (RELOC(alloc_top) == RELOC(rmo_top)) - RELOC(alloc_top) = RELOC(rmo_top) = addr; + if (alloc_top == rmo_top) + alloc_top = rmo_top = addr; else return 0; } - RELOC(alloc_top_high) = addr; + alloc_top_high = addr; goto bail; } - base = _ALIGN_DOWN(RELOC(alloc_top) - size, align); - for (; base > RELOC(alloc_bottom); + base = _ALIGN_DOWN(alloc_top - size, align); + for (; base > alloc_bottom; base = _ALIGN_DOWN(base - 0x100000, align)) { prom_debug(" trying: 0x%x\n\r", base); addr = (unsigned long)prom_claim(base, size, 0); @@ -755,15 +1051,15 @@ static unsigned long __init alloc_down(unsigned long size, unsigned long align, } if (addr == 0) return 0; - RELOC(alloc_top) = addr; + alloc_top = addr; bail: prom_debug(" -> %x\n", addr); - prom_debug(" alloc_bottom : %x\n", RELOC(alloc_bottom)); - prom_debug(" alloc_top : %x\n", RELOC(alloc_top)); - prom_debug(" alloc_top_hi : %x\n", RELOC(alloc_top_high)); - prom_debug(" rmo_top : %x\n", RELOC(rmo_top)); - prom_debug(" ram_top : %x\n", RELOC(ram_top)); + prom_debug(" alloc_bottom : %x\n", alloc_bottom); + prom_debug(" alloc_top : %x\n", alloc_top); + prom_debug(" alloc_top_hi : %x\n", alloc_top_high); + prom_debug(" rmo_top : %x\n", rmo_top); + prom_debug(" ram_top : %x\n", ram_top); return addr; } @@ -781,11 +1077,11 @@ static unsigned long __init prom_next_cell(int s, cell_t **cellp) p++; s--; } - r = *p++; + r = be32_to_cpu(*p++); #ifdef 
CONFIG_PPC64 if (s > 1) { r <<= 32; - r |= *(p++); + r |= be32_to_cpu(*(p++)); } #endif *cellp = p; @@ -800,10 +1096,10 @@ static unsigned long __init prom_next_cell(int s, cell_t **cellp) * If problems seem to show up, it would be a good start to track * them down. */ -static void reserve_mem(unsigned long base, unsigned long size) +static void __init reserve_mem(u64 base, u64 size) { - unsigned long top = base + size; - unsigned long cnt = RELOC(mem_reserve_cnt); + u64 top = base + size; + unsigned long cnt = mem_reserve_cnt; if (size == 0) return; @@ -818,13 +1114,13 @@ static void reserve_mem(unsigned long base, unsigned long size) if (cnt >= (MEM_RESERVE_MAP_SIZE - 1)) prom_panic("Memory reserve map exhausted !\n"); - RELOC(mem_reserve_map)[cnt].base = base; - RELOC(mem_reserve_map)[cnt].size = size; - RELOC(mem_reserve_cnt) = cnt + 1; + mem_reserve_map[cnt].base = cpu_to_be64(base); + mem_reserve_map[cnt].size = cpu_to_be64(size); + mem_reserve_cnt = cnt + 1; } /* - * Initialize memory allocation mecanism, parse "memory" nodes and + * Initialize memory allocation mechanism, parse "memory" nodes and * obtain that way the top of memory and RMO to setup out local allocator */ static void __init prom_init_mem(void) @@ -833,7 +1129,7 @@ static void __init prom_init_mem(void) char *path, type[64]; unsigned int plen; cell_t *p, *endp; - struct prom_t *_prom = &RELOC(prom); + __be32 val; u32 rac, rsc; /* @@ -841,15 +1137,17 @@ static void __init prom_init_mem(void) * 1) top of RMO (first node) * 2) top of memory */ - rac = 2; - prom_getprop(_prom->root, "#address-cells", &rac, sizeof(rac)); - rsc = 1; - prom_getprop(_prom->root, "#size-cells", &rsc, sizeof(rsc)); - prom_debug("root_addr_cells: %x\n", (unsigned long) rac); - prom_debug("root_size_cells: %x\n", (unsigned long) rsc); + val = cpu_to_be32(2); + prom_getprop(prom.root, "#address-cells", &val, sizeof(val)); + rac = be32_to_cpu(val); + val = cpu_to_be32(1); + prom_getprop(prom.root, "#size-cells", &val, 
sizeof(rsc)); + rsc = be32_to_cpu(val); + prom_debug("root_addr_cells: %x\n", rac); + prom_debug("root_size_cells: %x\n", rsc); prom_debug("scanning memory:\n"); - path = RELOC(prom_scratch); + path = prom_scratch; for (node = 0; prom_next_node(&node); ) { type[0] = 0; @@ -862,15 +1160,15 @@ static void __init prom_init_mem(void) */ prom_getprop(node, "name", type, sizeof(type)); } - if (strcmp(type, RELOC("memory"))) + if (strcmp(type, "memory")) continue; - plen = prom_getprop(node, "reg", RELOC(regbuf), sizeof(regbuf)); + plen = prom_getprop(node, "reg", regbuf, sizeof(regbuf)); if (plen > sizeof(regbuf)) { prom_printf("memory node too large for buffer !\n"); plen = sizeof(regbuf); } - p = RELOC(regbuf); + p = regbuf; endp = p + (plen / sizeof(cell_t)); #ifdef DEBUG_PROM @@ -888,22 +1186,14 @@ static void __init prom_init_mem(void) if (size == 0) continue; prom_debug(" %x %x\n", base, size); - if (base == 0) - RELOC(rmo_top) = size; - if ((base + size) > RELOC(ram_top)) - RELOC(ram_top) = base + size; + if (base == 0 && (of_platform & PLATFORM_LPAR)) + rmo_top = size; + if ((base + size) > ram_top) + ram_top = base + size; } } - RELOC(alloc_bottom) = PAGE_ALIGN((unsigned long)&RELOC(_end) + 0x4000); - - /* Check if we have an initrd after the kernel, if we do move our bottom - * point to after it - */ - if (RELOC(prom_initrd_start)) { - if (RELOC(prom_initrd_end) > RELOC(alloc_bottom)) - RELOC(alloc_bottom) = PAGE_ALIGN(RELOC(prom_initrd_end)); - } + alloc_bottom = PAGE_ALIGN((unsigned long)&_end + 0x4000); /* * If prom_memory_limit is set we reduce the upper limits *except* for @@ -911,20 +1201,20 @@ static void __init prom_init_mem(void) * TCE's up there. 
*/ - RELOC(alloc_top_high) = RELOC(ram_top); + alloc_top_high = ram_top; - if (RELOC(prom_memory_limit)) { - if (RELOC(prom_memory_limit) <= RELOC(alloc_bottom)) { + if (prom_memory_limit) { + if (prom_memory_limit <= alloc_bottom) { prom_printf("Ignoring mem=%x <= alloc_bottom.\n", - RELOC(prom_memory_limit)); - RELOC(prom_memory_limit) = 0; - } else if (RELOC(prom_memory_limit) >= RELOC(ram_top)) { + prom_memory_limit); + prom_memory_limit = 0; + } else if (prom_memory_limit >= ram_top) { prom_printf("Ignoring mem=%x >= ram_top.\n", - RELOC(prom_memory_limit)); - RELOC(prom_memory_limit) = 0; + prom_memory_limit); + prom_memory_limit = 0; } else { - RELOC(ram_top) = RELOC(prom_memory_limit); - RELOC(rmo_top) = min(RELOC(rmo_top), RELOC(prom_memory_limit)); + ram_top = prom_memory_limit; + rmo_top = min(rmo_top, prom_memory_limit); } } @@ -936,20 +1226,122 @@ static void __init prom_init_mem(void) * Since 768MB is plenty of room, and we need to cap to something * reasonable on 32-bit, cap at 768MB on all machines. */ - if (!RELOC(rmo_top)) - RELOC(rmo_top) = RELOC(ram_top); - RELOC(rmo_top) = min(0x30000000ul, RELOC(rmo_top)); - RELOC(alloc_top) = RELOC(rmo_top); + if (!rmo_top) + rmo_top = ram_top; + rmo_top = min(0x30000000ul, rmo_top); + alloc_top = rmo_top; + alloc_top_high = ram_top; + + /* + * Check if we have an initrd after the kernel but still inside + * the RMO. If we do move our bottom point to after it. 
+ */ + if (prom_initrd_start && + prom_initrd_start < rmo_top && + prom_initrd_end > alloc_bottom) + alloc_bottom = PAGE_ALIGN(prom_initrd_end); prom_printf("memory layout at init:\n"); - prom_printf(" memory_limit : %x (16 MB aligned)\n", RELOC(prom_memory_limit)); - prom_printf(" alloc_bottom : %x\n", RELOC(alloc_bottom)); - prom_printf(" alloc_top : %x\n", RELOC(alloc_top)); - prom_printf(" alloc_top_hi : %x\n", RELOC(alloc_top_high)); - prom_printf(" rmo_top : %x\n", RELOC(rmo_top)); - prom_printf(" ram_top : %x\n", RELOC(ram_top)); + prom_printf(" memory_limit : %x (16 MB aligned)\n", prom_memory_limit); + prom_printf(" alloc_bottom : %x\n", alloc_bottom); + prom_printf(" alloc_top : %x\n", alloc_top); + prom_printf(" alloc_top_hi : %x\n", alloc_top_high); + prom_printf(" rmo_top : %x\n", rmo_top); + prom_printf(" ram_top : %x\n", ram_top); +} + +static void __init prom_close_stdin(void) +{ + __be32 val; + ihandle stdin; + + if (prom_getprop(prom.chosen, "stdin", &val, sizeof(val)) > 0) { + stdin = be32_to_cpu(val); + call_prom("close", 1, 0, stdin); + } +} + +#ifdef CONFIG_PPC_POWERNV + +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL +static u64 __initdata prom_opal_base; +static u64 __initdata prom_opal_entry; +#endif + +/* + * Allocate room for and instantiate OPAL + */ +static void __init prom_instantiate_opal(void) +{ + phandle opal_node; + ihandle opal_inst; + u64 base, entry; + u64 size = 0, align = 0x10000; + __be64 val64; + u32 rets[2]; + + prom_debug("prom_instantiate_opal: start...\n"); + + opal_node = call_prom("finddevice", 1, 1, ADDR("/ibm,opal")); + prom_debug("opal_node: %x\n", opal_node); + if (!PHANDLE_VALID(opal_node)) + return; + + val64 = 0; + prom_getprop(opal_node, "opal-runtime-size", &val64, sizeof(val64)); + size = be64_to_cpu(val64); + if (size == 0) + return; + val64 = 0; + prom_getprop(opal_node, "opal-runtime-alignment", &val64,sizeof(val64)); + align = be64_to_cpu(val64); + + base = alloc_down(size, align, 0); + if (base == 0) { + 
prom_printf("OPAL allocation failed !\n"); + return; + } + + opal_inst = call_prom("open", 1, 1, ADDR("/ibm,opal")); + if (!IHANDLE_VALID(opal_inst)) { + prom_printf("opening opal package failed (%x)\n", opal_inst); + return; + } + + prom_printf("instantiating opal at 0x%x...", base); + + if (call_prom_ret("call-method", 4, 3, rets, + ADDR("load-opal-runtime"), + opal_inst, + base >> 32, base & 0xffffffff) != 0 + || (rets[0] == 0 && rets[1] == 0)) { + prom_printf(" failed\n"); + return; + } + entry = (((u64)rets[0]) << 32) | rets[1]; + + prom_printf(" done\n"); + + reserve_mem(base, size); + + prom_debug("opal base = 0x%x\n", base); + prom_debug("opal align = 0x%x\n", align); + prom_debug("opal entry = 0x%x\n", entry); + prom_debug("opal size = 0x%x\n", (long)size); + + prom_setprop(opal_node, "/ibm,opal", "opal-base-address", + &base, sizeof(base)); + prom_setprop(opal_node, "/ibm,opal", "opal-entry-address", + &entry, sizeof(entry)); + +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL + prom_opal_base = base; + prom_opal_entry = entry; +#endif + prom_debug("prom_instantiate_opal: end...\n"); } +#endif /* CONFIG_PPC_POWERNV */ /* * Allocate room for and instantiate RTAS @@ -959,6 +1351,7 @@ static void __init prom_instantiate_rtas(void) phandle rtas_node; ihandle rtas_inst; u32 base, entry = 0; + __be32 val; u32 size = 0; prom_debug("prom_instantiate_rtas: start...\n"); @@ -968,27 +1361,27 @@ static void __init prom_instantiate_rtas(void) if (!PHANDLE_VALID(rtas_node)) return; - prom_getprop(rtas_node, "rtas-size", &size, sizeof(size)); + val = 0; + prom_getprop(rtas_node, "rtas-size", &val, sizeof(size)); + size = be32_to_cpu(val); if (size == 0) return; base = alloc_down(size, PAGE_SIZE, 0); - if (base == 0) { - prom_printf("RTAS allocation failed !\n"); - return; - } + if (base == 0) + prom_panic("Could not allocate memory for RTAS\n"); rtas_inst = call_prom("open", 1, 1, ADDR("/rtas")); if (!IHANDLE_VALID(rtas_inst)) { - prom_printf("opening rtas package failed"); + 
prom_printf("opening rtas package failed (%x)\n", rtas_inst); return; } - prom_printf("instantiating rtas at 0x%x ...", base); + prom_printf("instantiating rtas at 0x%x...", base); if (call_prom_ret("call-method", 3, 2, &entry, ADDR("instantiate-rtas"), - rtas_inst, base) == PROM_ERROR + rtas_inst, base) != 0 || entry == 0) { prom_printf(" failed\n"); return; @@ -997,8 +1390,17 @@ static void __init prom_instantiate_rtas(void) reserve_mem(base, size); - prom_setprop(rtas_node, "linux,rtas-base", &base, sizeof(base)); - prom_setprop(rtas_node, "linux,rtas-entry", &entry, sizeof(entry)); + val = cpu_to_be32(base); + prom_setprop(rtas_node, "/rtas", "linux,rtas-base", + &val, sizeof(val)); + val = cpu_to_be32(entry); + prom_setprop(rtas_node, "/rtas", "linux,rtas-entry", + &val, sizeof(val)); + + /* Check if it supports "query-cpu-stopped-state" */ + if (prom_getprop(rtas_node, "query-cpu-stopped-state", + &val, sizeof(val)) != PROM_ERROR) + rtas_has_query_cpu_stopped = true; prom_debug("rtas base = 0x%x\n", base); prom_debug("rtas entry = 0x%x\n", entry); @@ -1009,27 +1411,85 @@ static void __init prom_instantiate_rtas(void) #ifdef CONFIG_PPC64 /* + * Allocate room for and instantiate Stored Measurement Log (SML) + */ +static void __init prom_instantiate_sml(void) +{ + phandle ibmvtpm_node; + ihandle ibmvtpm_inst; + u32 entry = 0, size = 0; + u64 base; + + prom_debug("prom_instantiate_sml: start...\n"); + + ibmvtpm_node = call_prom("finddevice", 1, 1, ADDR("/ibm,vtpm")); + prom_debug("ibmvtpm_node: %x\n", ibmvtpm_node); + if (!PHANDLE_VALID(ibmvtpm_node)) + return; + + ibmvtpm_inst = call_prom("open", 1, 1, ADDR("/ibm,vtpm")); + if (!IHANDLE_VALID(ibmvtpm_inst)) { + prom_printf("opening vtpm package failed (%x)\n", ibmvtpm_inst); + return; + } + + if (call_prom_ret("call-method", 2, 2, &size, + ADDR("sml-get-handover-size"), + ibmvtpm_inst) != 0 || size == 0) { + prom_printf("SML get handover size failed\n"); + return; + } + + base = alloc_down(size, PAGE_SIZE, 0); + 
if (base == 0) + prom_panic("Could not allocate memory for sml\n"); + + prom_printf("instantiating sml at 0x%x...", base); + + if (call_prom_ret("call-method", 4, 2, &entry, + ADDR("sml-handover"), + ibmvtpm_inst, size, base) != 0 || entry == 0) { + prom_printf("SML handover failed\n"); + return; + } + prom_printf(" done\n"); + + reserve_mem(base, size); + + prom_setprop(ibmvtpm_node, "/ibm,vtpm", "linux,sml-base", + &base, sizeof(base)); + prom_setprop(ibmvtpm_node, "/ibm,vtpm", "linux,sml-size", + &size, sizeof(size)); + + prom_debug("sml base = 0x%x\n", base); + prom_debug("sml size = 0x%x\n", (long)size); + + prom_debug("prom_instantiate_sml: end...\n"); +} + +/* * Allocate room for and initialize TCE tables */ +#ifdef __BIG_ENDIAN__ static void __init prom_initialize_tce_table(void) { phandle node; ihandle phb_node; char compatible[64], type[64], model[64]; - char *path = RELOC(prom_scratch); + char *path = prom_scratch; u64 base, align; u32 minalign, minsize; u64 tce_entry, *tce_entryp; u64 local_alloc_top, local_alloc_bottom; u64 i; - if (RELOC(ppc64_iommu_off)) + if (prom_iommu_off) return; prom_debug("starting prom_initialize_tce_table\n"); /* Cache current top of allocs so we reserve a single block */ - local_alloc_top = RELOC(alloc_top_high); + local_alloc_top = alloc_top_high; local_alloc_bottom = local_alloc_top; /* Search all nodes looking for PHBs. */ @@ -1042,19 +1502,19 @@ static void __init prom_initialize_tce_table(void) prom_getprop(node, "device_type", type, sizeof(type)); prom_getprop(node, "model", model, sizeof(model)); - if ((type[0] == 0) || (strstr(type, RELOC("pci")) == NULL)) + if ((type[0] == 0) || (strstr(type, "pci") == NULL)) continue; - /* Keep the old logic in tack to avoid regression. */ + /* Keep the old logic intact to avoid regression. 
*/ if (compatible[0] != 0) { - if ((strstr(compatible, RELOC("python")) == NULL) && - (strstr(compatible, RELOC("Speedwagon")) == NULL) && - (strstr(compatible, RELOC("Winnipeg")) == NULL)) + if ((strstr(compatible, "python") == NULL) && + (strstr(compatible, "Speedwagon") == NULL) && + (strstr(compatible, "Winnipeg") == NULL)) continue; } else if (model[0] != 0) { - if ((strstr(model, RELOC("ython")) == NULL) && - (strstr(model, RELOC("peedwagon")) == NULL) && - (strstr(model, RELOC("innipeg")) == NULL)) + if ((strstr(model, "ython") == NULL) && + (strstr(model, "peedwagon") == NULL) && + (strstr(model, "innipeg") == NULL)) continue; } @@ -1076,7 +1536,7 @@ static void __init prom_initialize_tce_table(void) * else will impact performance, so we always allocate 8MB. * Anton */ - if (__is_processor(PV_POWER4) || __is_processor(PV_POWER4p)) + if (pvr_version_is(PVR_POWER4) || pvr_version_is(PVR_POWER4p)) minsize = 8UL << 20; else minsize = 4UL << 20; @@ -1089,18 +1549,18 @@ static void __init prom_initialize_tce_table(void) if (base < local_alloc_bottom) local_alloc_bottom = base; - /* Save away the TCE table attributes for later use. */ - prom_setprop(node, "linux,tce-base", &base, sizeof(base)); - prom_setprop(node, "linux,tce-size", &minsize, sizeof(minsize)); - /* It seems OF doesn't null-terminate the path :-( */ - memset(path, 0, sizeof(path)); + memset(path, 0, PROM_SCRATCH_SIZE); /* Call OF to setup the TCE hardware */ if (call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1) == PROM_ERROR) { prom_printf("package-to-path failed\n"); } + /* Save away the TCE table attributes for later use. 
*/ + prom_setprop(node, path, "linux,tce-base", &base, sizeof(base)); + prom_setprop(node, path, "linux,tce-size", &minsize, sizeof(minsize)); + prom_debug("TCE table: %s\n", path); prom_debug("\tnode = 0x%x\n", node); prom_debug("\tbase = 0x%x\n", base); @@ -1109,7 +1569,7 @@ static void __init prom_initialize_tce_table(void) /* Initialize the table to have a one-to-one mapping * over the allocated size. */ - tce_entryp = (unsigned long *)base; + tce_entryp = (u64 *)base; for (i = 0; i < (minsize >> 3) ;tce_entryp++, i++) { tce_entry = (i << PAGE_SHIFT); tce_entry |= 0x3; @@ -1131,21 +1591,16 @@ static void __init prom_initialize_tce_table(void) reserve_mem(local_alloc_bottom, local_alloc_top - local_alloc_bottom); - if (RELOC(prom_memory_limit)) { - /* - * We align the start to a 16MB boundary so we can map - * the TCE area using large pages if possible. - * The end should be the top of RAM so no need to align it. - */ - RELOC(prom_tce_alloc_start) = _ALIGN_DOWN(local_alloc_bottom, - 0x1000000); - RELOC(prom_tce_alloc_end) = local_alloc_top; - } + /* These are only really needed if there is a memory limit in + * effect, but we don't know so export them always. */ + prom_tce_alloc_start = local_alloc_bottom; + prom_tce_alloc_end = local_alloc_top; /* Flag the first invalid entry */ prom_debug("ending prom_initialize_tce_table\n"); } -#endif +#endif /* __BIG_ENDIAN__ */ +#endif /* CONFIG_PPC64 */ /* * With CHRP SMP we need to use the OF to start the other processors. 
@@ -1165,10 +1620,6 @@ static void __init prom_initialize_tce_table(void) * * -- Cort */ -extern void __secondary_hold(void); -extern unsigned long __secondary_hold_spinloop; -extern unsigned long __secondary_hold_acknowledge; - /* * We want to reference the copy of __secondary_hold_* in the * 0 - 0x100 address range @@ -1178,25 +1629,25 @@ extern unsigned long __secondary_hold_acknowledge; static void __init prom_hold_cpus(void) { unsigned long i; - unsigned int reg; phandle node; char type[64]; - int cpuid = 0; - unsigned int interrupt_server[MAX_CPU_THREADS]; - unsigned int cpu_threads, hw_cpu_num; - int propsize; - struct prom_t *_prom = &RELOC(prom); unsigned long *spinloop = (void *) LOW_ADDR(__secondary_hold_spinloop); unsigned long *acknowledge = (void *) LOW_ADDR(__secondary_hold_acknowledge); -#ifdef CONFIG_PPC64 - /* __secondary_hold is actually a descriptor, not the text address */ - unsigned long secondary_hold - = __pa(*PTRRELOC((unsigned long *)__secondary_hold)); -#else unsigned long secondary_hold = LOW_ADDR(__secondary_hold); -#endif + + /* + * On pseries, if RTAS supports "query-cpu-stopped-state", + * we skip this stage, the CPUs will be started by the + * kernel using RTAS. + */ + if ((of_platform == PLATFORM_PSERIES || + of_platform == PLATFORM_PSERIES_LPAR) && + rtas_has_query_cpu_stopped) { + prom_printf("prom_hold_cpus: skipped\n"); + return; + } prom_debug("prom_hold_cpus: start...\n"); prom_debug(" 1) spinloop = 0x%x\n", (unsigned long)spinloop); @@ -1213,27 +1664,26 @@ static void __init prom_hold_cpus(void) */ *spinloop = 0; -#ifdef CONFIG_HMT - for (i = 0; i < NR_CPUS; i++) - RELOC(hmt_thread_data)[i].pir = 0xdeadbeef; -#endif /* look for cpus */ for (node = 0; prom_next_node(&node); ) { + unsigned int cpu_no; + __be32 reg; + type[0] = 0; prom_getprop(node, "device_type", type, sizeof(type)); - if (strcmp(type, RELOC("cpu")) != 0) + if (strcmp(type, "cpu") != 0) continue; /* Skip non-configured cpus. 
*/ if (prom_getprop(node, "status", type, sizeof(type)) > 0) - if (strcmp(type, RELOC("okay")) != 0) + if (strcmp(type, "okay") != 0) continue; - reg = -1; + reg = cpu_to_be32(-1); /* make sparse happy */ prom_getprop(node, "reg", &reg, sizeof(reg)); + cpu_no = be32_to_cpu(reg); - prom_debug("\ncpuid = 0x%x\n", cpuid); - prom_debug("cpu hw idx = 0x%x\n", reg); + prom_debug("cpu hw idx = %lu\n", cpu_no); /* Init the acknowledge var which will be reset by * the secondary cpu when it awakens from its OF @@ -1241,78 +1691,26 @@ static void __init prom_hold_cpus(void) */ *acknowledge = (unsigned long)-1; - propsize = prom_getprop(node, "ibm,ppc-interrupt-server#s", - &interrupt_server, - sizeof(interrupt_server)); - if (propsize < 0) { - /* no property. old hardware has no SMT */ - cpu_threads = 1; - interrupt_server[0] = reg; /* fake it with phys id */ - } else { - /* We have a threaded processor */ - cpu_threads = propsize / sizeof(u32); - if (cpu_threads > MAX_CPU_THREADS) { - prom_printf("SMT: too many threads!\n" - "SMT: found %x, max is %x\n", - cpu_threads, MAX_CPU_THREADS); - cpu_threads = 1; /* ToDo: panic? */ - } - } - - hw_cpu_num = interrupt_server[0]; - if (hw_cpu_num != _prom->cpu) { - /* Primary Thread of non-boot cpu */ - prom_printf("%x : starting cpu hw idx %x... ", cpuid, reg); + if (cpu_no != prom.cpu) { + /* Primary Thread of non-boot cpu or any thread */ + prom_printf("starting cpu hw idx %lu... ", cpu_no); call_prom("start-cpu", 3, 0, node, - secondary_hold, reg); + secondary_hold, cpu_no); for (i = 0; (i < 100000000) && (*acknowledge == ((unsigned long)-1)); i++ ) mb(); - if (*acknowledge == reg) + if (*acknowledge == cpu_no) prom_printf("done\n"); else prom_printf("failed: %x\n", *acknowledge); } #ifdef CONFIG_SMP else - prom_printf("%x : boot cpu %x\n", cpuid, reg); + prom_printf("boot cpu hw idx %lu\n", cpu_no); #endif /* CONFIG_SMP */ - - /* Reserve cpu #s for secondary threads. They start later. 
*/ - cpuid += cpu_threads; - } -#ifdef CONFIG_HMT - /* Only enable HMT on processors that provide support. */ - if (__is_processor(PV_PULSAR) || - __is_processor(PV_ICESTAR) || - __is_processor(PV_SSTAR)) { - prom_printf(" starting secondary threads\n"); - - for (i = 0; i < NR_CPUS; i += 2) { - if (!cpu_online(i)) - continue; - - if (i == 0) { - unsigned long pir = mfspr(SPRN_PIR); - if (__is_processor(PV_PULSAR)) { - RELOC(hmt_thread_data)[i].pir = - pir & 0x1f; - } else { - RELOC(hmt_thread_data)[i].pir = - pir & 0x3ff; - } - } - } - } else { - prom_printf("Processor is not HMT capable\n"); } -#endif - - if (cpuid > NR_CPUS) - prom_printf("WARNING: maximum CPUs (" __stringify(NR_CPUS) - ") exceeded: ignoring extras\n"); prom_debug("prom_hold_cpus: end...\n"); } @@ -1320,32 +1718,30 @@ static void __init prom_hold_cpus(void) static void __init prom_init_client_services(unsigned long pp) { - struct prom_t *_prom = &RELOC(prom); - /* Get a handle to the prom entry point before anything else */ - RELOC(prom_entry) = pp; + prom_entry = pp; /* get a handle for the stdout device */ - _prom->chosen = call_prom("finddevice", 1, 1, ADDR("/chosen")); - if (!PHANDLE_VALID(_prom->chosen)) + prom.chosen = call_prom("finddevice", 1, 1, ADDR("/chosen")); + if (!PHANDLE_VALID(prom.chosen)) prom_panic("cannot find chosen"); /* msg won't be printed :( */ /* get device tree root */ - _prom->root = call_prom("finddevice", 1, 1, ADDR("/")); - if (!PHANDLE_VALID(_prom->root)) + prom.root = call_prom("finddevice", 1, 1, ADDR("/")); + if (!PHANDLE_VALID(prom.root)) prom_panic("cannot find device tree root"); /* msg won't be printed :( */ - _prom->mmumap = 0; + prom.mmumap = 0; } #ifdef CONFIG_PPC32 /* * For really old powermacs, we need to map things we claim. * For that, we need the ihandle of the mmu. + * Also, on the longtrail, we need to work around other bugs. 
*/ static void __init prom_find_mmu(void) { - struct prom_t *_prom = &RELOC(prom); phandle oprom; char version[64]; @@ -1355,12 +1751,20 @@ static void __init prom_find_mmu(void) if (prom_getprop(oprom, "model", version, sizeof(version)) <= 0) return; version[sizeof(version) - 1] = 0; - prom_printf("OF version is '%s'\n", version); /* XXX might need to add other versions here */ - if (strcmp(version, "Open Firmware, 1.0.5") != 0) + if (strcmp(version, "Open Firmware, 1.0.5") == 0) + of_workarounds = OF_WA_CLAIM; + else if (strncmp(version, "FirmWorks,3.", 12) == 0) { + of_workarounds = OF_WA_CLAIM | OF_WA_LONGTRAIL; + call_prom("interpret", 1, 1, "dev /memory 0 to allow-reclaim"); + } else return; - prom_getprop(_prom->chosen, "mmu", &_prom->mmumap, - sizeof(_prom->mmumap)); + prom.memory = call_prom("open", 1, 1, ADDR("/memory")); + prom_getprop(prom.chosen, "mmu", &prom.mmumap, + sizeof(prom.mmumap)); + prom.mmumap = be32_to_cpu(prom.mmumap); + if (!IHANDLE_VALID(prom.memory) || !IHANDLE_VALID(prom.mmumap)) + of_workarounds &= ~OF_WA_CLAIM; /* hmmm */ } #else #define prom_find_mmu() @@ -1368,49 +1772,49 @@ static void __init prom_find_mmu(void) static void __init prom_init_stdout(void) { - struct prom_t *_prom = &RELOC(prom); - char *path = RELOC(of_stdout_device); + char *path = of_stdout_device; char type[16]; - u32 val; + phandle stdout_node; + __be32 val; - if (prom_getprop(_prom->chosen, "stdout", &val, sizeof(val)) <= 0) + if (prom_getprop(prom.chosen, "stdout", &val, sizeof(val)) <= 0) prom_panic("cannot find stdout"); - _prom->stdout = val; + prom.stdout = be32_to_cpu(val); /* Get the full OF pathname of the stdout device */ memset(path, 0, 256); - call_prom("instance-to-path", 3, 1, _prom->stdout, path, 255); - val = call_prom("instance-to-package", 1, 1, _prom->stdout); - prom_setprop(_prom->chosen, "linux,stdout-package", &val, sizeof(val)); - prom_printf("OF stdout device is: %s\n", RELOC(of_stdout_device)); - prom_setprop(_prom->chosen, 
"linux,stdout-path", - RELOC(of_stdout_device), strlen(RELOC(of_stdout_device))+1); - - /* If it's a display, note it */ - memset(type, 0, sizeof(type)); - prom_getprop(val, "device_type", type, sizeof(type)); - if (strcmp(type, RELOC("display")) == 0) - prom_setprop(val, "linux,boot-display", NULL, 0); -} - -static void __init prom_close_stdin(void) -{ - struct prom_t *_prom = &RELOC(prom); - ihandle val; - - if (prom_getprop(_prom->chosen, "stdin", &val, sizeof(val)) > 0) - call_prom("close", 1, 0, val); + call_prom("instance-to-path", 3, 1, prom.stdout, path, 255); + prom_printf("OF stdout device is: %s\n", of_stdout_device); + prom_setprop(prom.chosen, "/chosen", "linux,stdout-path", + path, strlen(path) + 1); + + /* instance-to-package fails on PA-Semi */ + stdout_node = call_prom("instance-to-package", 1, 1, prom.stdout); + if (stdout_node != PROM_ERROR) { + val = cpu_to_be32(stdout_node); + prom_setprop(prom.chosen, "/chosen", "linux,stdout-package", + &val, sizeof(val)); + + /* If it's a display, note it */ + memset(type, 0, sizeof(type)); + prom_getprop(stdout_node, "device_type", type, sizeof(type)); + if (strcmp(type, "display") == 0) + prom_setprop(stdout_node, path, "linux,boot-display", NULL, 0); + } } static int __init prom_find_machine_type(void) { - struct prom_t *_prom = &RELOC(prom); char compat[256]; int len, i = 0; +#ifdef CONFIG_PPC64 phandle rtas; + int x; +#endif - len = prom_getprop(_prom->root, "compatible", + /* Look for a PowerMac or a Cell */ + len = prom_getprop(prom.root, "compatible", compat, sizeof(compat)-1); if (len > 0) { compat[len] = 0; @@ -1419,29 +1823,51 @@ static int __init prom_find_machine_type(void) int sl = strlen(p); if (sl == 0) break; - if (strstr(p, RELOC("Power Macintosh")) || - strstr(p, RELOC("MacRISC"))) + if (strstr(p, "Power Macintosh") || + strstr(p, "MacRISC")) return PLATFORM_POWERMAC; #ifdef CONFIG_PPC64 - if (strstr(p, RELOC("Momentum,Maple"))) - return PLATFORM_MAPLE; -#endif + /* We must make sure we 
don't detect the IBM Cell + * blades as pSeries due to some firmware issues, + * so we do it here. + */ + if (strstr(p, "IBM,CBEA") || + strstr(p, "IBM,CPBW-1.0")) + return PLATFORM_GENERIC; +#endif /* CONFIG_PPC64 */ i += sl + 1; } } #ifdef CONFIG_PPC64 + /* Try to detect OPAL */ + if (PHANDLE_VALID(call_prom("finddevice", 1, 1, ADDR("/ibm,opal")))) + return PLATFORM_OPAL; + + /* Try to figure out if it's an IBM pSeries or any other + * PAPR compliant platform. We assume it is if : + * - /device_type is "chrp" (please, do NOT use that for future + * non-IBM designs ! + * - it has /rtas + */ + len = prom_getprop(prom.root, "device_type", + compat, sizeof(compat)-1); + if (len <= 0) + return PLATFORM_GENERIC; + if (strcmp(compat, "chrp")) + return PLATFORM_GENERIC; + /* Default to pSeries. We need to know if we are running LPAR */ rtas = call_prom("finddevice", 1, 1, ADDR("/rtas")); - if (PHANDLE_VALID(rtas)) { - int x = prom_getproplen(rtas, "ibm,hypertas-functions"); - if (x != PROM_ERROR) { - prom_printf("Hypertas detected, assuming LPAR !\n"); - return PLATFORM_PSERIES_LPAR; - } + if (!PHANDLE_VALID(rtas)) + return PLATFORM_GENERIC; + x = prom_getproplen(rtas, "ibm,hypertas-functions"); + if (x != PROM_ERROR) { + prom_debug("Hypertas detected, assuming LPAR !\n"); + return PLATFORM_PSERIES_LPAR; } return PLATFORM_PSERIES; #else - return PLATFORM_CHRP; + return PLATFORM_GENERIC; #endif } @@ -1485,15 +1911,15 @@ static void __init prom_check_displays(void) }; const unsigned char *clut; - prom_printf("Looking for displays\n"); + prom_debug("Looking for displays\n"); for (node = 0; prom_next_node(&node); ) { memset(type, 0, sizeof(type)); prom_getprop(node, "device_type", type, sizeof(type)); - if (strcmp(type, RELOC("display")) != 0) + if (strcmp(type, "display") != 0) continue; /* It seems OF doesn't null-terminate the path :-( */ - path = RELOC(prom_scratch); + path = prom_scratch; memset(path, 0, PROM_SCRATCH_SIZE); /* @@ -1503,7 +1929,7 @@ static void __init 
prom_check_displays(void) if (call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-10) == PROM_ERROR) continue; - prom_printf("found display : %s, opening ... ", path); + prom_printf("found display : %s, opening... ", path); ih = call_prom("open", 1, 1, path); if (ih == 0) { @@ -1513,23 +1939,39 @@ static void __init prom_check_displays(void) /* Success */ prom_printf("done\n"); - prom_setprop(node, "linux,opened", NULL, 0); + prom_setprop(node, path, "linux,opened", NULL, 0); /* Setup a usable color table when the appropriate * method is available. Should update this to set-colors */ - clut = RELOC(default_colors); - for (i = 0; i < 32; i++, clut += 3) + clut = default_colors; + for (i = 0; i < 16; i++, clut += 3) if (prom_set_color(ih, i, clut[0], clut[1], clut[2]) != 0) break; #ifdef CONFIG_LOGO_LINUX_CLUT224 - clut = PTRRELOC(RELOC(logo_linux_clut224.clut)); - for (i = 0; i < RELOC(logo_linux_clut224.clutsize); i++, clut += 3) + clut = PTRRELOC(logo_linux_clut224.clut); + for (i = 0; i < logo_linux_clut224.clutsize; i++, clut += 3) if (prom_set_color(ih, i + 32, clut[0], clut[1], clut[2]) != 0) break; #endif /* CONFIG_LOGO_LINUX_CLUT224 */ + +#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX + if (prom_getprop(node, "linux,boot-display", NULL, 0) != + PROM_ERROR) { + u32 width, height, pitch, addr; + + prom_printf("Setting btext !\n"); + prom_getprop(node, "width", &width, 4); + prom_getprop(node, "height", &height, 4); + prom_getprop(node, "linebytes", &pitch, 4); + prom_getprop(node, "address", &addr, 4); + prom_printf("W=%d H=%d LB=%d addr=0x%x\n", + width, height, pitch, addr); + btext_setup_display(width, height, 8, pitch, addr); + } +#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */ } } @@ -1545,16 +1987,18 @@ static void __init *make_room(unsigned long *mem_start, unsigned long *mem_end, unsigned long room, chunk; prom_debug("Chunk exhausted, claiming more at %x...\n", - RELOC(alloc_bottom)); - room = RELOC(alloc_top) - RELOC(alloc_bottom); + alloc_bottom); + room 
= alloc_top - alloc_bottom; if (room > DEVTREE_CHUNK_SIZE) room = DEVTREE_CHUNK_SIZE; if (room < PAGE_SIZE) - prom_panic("No memory for flatten_device_tree (no room)"); + prom_panic("No memory for flatten_device_tree " + "(no room)\n"); chunk = alloc_up(room, 0); if (chunk == 0) - prom_panic("No memory for flatten_device_tree (claim failed)"); - *mem_end = RELOC(alloc_top); + prom_panic("No memory for flatten_device_tree " + "(claim failed)\n"); + *mem_end = chunk + room; } ret = (void *)*mem_start; @@ -1563,16 +2007,18 @@ static void __init *make_room(unsigned long *mem_start, unsigned long *mem_end, return ret; } -#define dt_push_token(token, mem_start, mem_end) \ - do { *((u32 *)make_room(mem_start, mem_end, 4, 4)) = token; } while(0) +#define dt_push_token(token, mem_start, mem_end) do { \ + void *room = make_room(mem_start, mem_end, 4, 4); \ + *(__be32 *)room = cpu_to_be32(token); \ + } while(0) static unsigned long __init dt_find_string(char *str) { char *s, *os; - s = os = (char *)RELOC(dt_string_start); + s = os = (char *)dt_string_start; s += 4; - while (s < (char *)RELOC(dt_string_end)) { + while (s < (char *)dt_string_end) { if (strcmp(s, str) == 0) return s - os; s += strlen(s) + 1; @@ -1594,10 +2040,10 @@ static void __init scan_dt_build_strings(phandle node, unsigned long soff; phandle child; - sstart = (char *)RELOC(dt_string_start); + sstart = (char *)dt_string_start; /* get and store all property names */ - prev_name = RELOC(""); + prev_name = ""; for (;;) { /* 64 is max len of name including nul. 
*/ namep = make_room(mem_start, mem_end, MAX_PROPERTY_NAME, 1); @@ -1608,9 +2054,9 @@ static void __init scan_dt_build_strings(phandle node, } /* skip "name" */ - if (strcmp(namep, RELOC("name")) == 0) { + if (strcmp(namep, "name") == 0) { *mem_start = (unsigned long)namep; - prev_name = RELOC("name"); + prev_name = "name"; continue; } /* get/create string entry */ @@ -1621,7 +2067,7 @@ static void __init scan_dt_build_strings(phandle node, } else { /* Trim off some if we can */ *mem_start = (unsigned long)namep + strlen(namep) + 1; - RELOC(dt_string_end) = *mem_start; + dt_string_end = *mem_start; } prev_name = namep; } @@ -1642,7 +2088,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, unsigned long soff; unsigned char *valp; static char pname[MAX_PROPERTY_NAME]; - int l, room; + int l, room, has_phandle = 0; dt_push_token(OF_DT_BEGIN_NODE, mem_start, mem_end); @@ -1676,46 +2122,39 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, } /* get it again for debugging */ - path = RELOC(prom_scratch); + path = prom_scratch; memset(path, 0, PROM_SCRATCH_SIZE); call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1); /* get and store all properties */ - prev_name = RELOC(""); - sstart = (char *)RELOC(dt_string_start); + prev_name = ""; + sstart = (char *)dt_string_start; for (;;) { if (call_prom("nextprop", 3, 1, node, prev_name, - RELOC(pname)) != 1) + pname) != 1) break; /* skip "name" */ - if (strcmp(RELOC(pname), RELOC("name")) == 0) { - prev_name = RELOC("name"); + if (strcmp(pname, "name") == 0) { + prev_name = "name"; continue; } /* find string offset */ - soff = dt_find_string(RELOC(pname)); + soff = dt_find_string(pname); if (soff == 0) { prom_printf("WARNING: Can't find string index for" - " <%s>, node %s\n", RELOC(pname), path); + " <%s>, node %s\n", pname, path); break; } prev_name = sstart + soff; /* get length */ - l = call_prom("getproplen", 2, 1, node, RELOC(pname)); + l = 
call_prom("getproplen", 2, 1, node, pname); /* sanity checks */ if (l == PROM_ERROR) continue; - if (l > MAX_PROPERTY_LENGTH) { - prom_printf("WARNING: ignoring large property "); - /* It seems OF doesn't null-terminate the path :-( */ - prom_printf("[%s] ", path); - prom_printf("%s length 0x%x\n", RELOC(pname), l); - continue; - } /* push property head */ dt_push_token(OF_DT_PROP, mem_start, mem_end); @@ -1724,21 +2163,28 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, /* push property content */ valp = make_room(mem_start, mem_end, l, 4); - call_prom("getprop", 4, 1, node, RELOC(pname), valp, l); + call_prom("getprop", 4, 1, node, pname, valp, l); *mem_start = _ALIGN(*mem_start, 4); + + if (!strcmp(pname, "phandle")) + has_phandle = 1; } - /* Add a "linux,phandle" property. */ - soff = dt_find_string(RELOC("linux,phandle")); - if (soff == 0) - prom_printf("WARNING: Can't find string index for" - " <linux-phandle> node %s\n", path); - else { - dt_push_token(OF_DT_PROP, mem_start, mem_end); - dt_push_token(4, mem_start, mem_end); - dt_push_token(soff, mem_start, mem_end); - valp = make_room(mem_start, mem_end, 4, 4); - *(u32 *)valp = node; + /* Add a "linux,phandle" property if no "phandle" property already + * existed (can happen with OPAL) + */ + if (!has_phandle) { + soff = dt_find_string("linux,phandle"); + if (soff == 0) + prom_printf("WARNING: Can't find string index for" + " <linux-phandle> node %s\n", path); + else { + dt_push_token(OF_DT_PROP, mem_start, mem_end); + dt_push_token(4, mem_start, mem_end); + dt_push_token(soff, mem_start, mem_end); + valp = make_room(mem_start, mem_end, 4, 4); + *(__be32 *)valp = cpu_to_be32(node); + } } /* do all our children */ @@ -1756,24 +2202,23 @@ static void __init flatten_device_tree(void) phandle root; unsigned long mem_start, mem_end, room; struct boot_param_header *hdr; - struct prom_t *_prom = &RELOC(prom); char *namep; u64 *rsvmap; /* * Check how much room we have between alloc 
top & bottom (+/- a - * few pages), crop to 4Mb, as this is our "chuck" size + * few pages), crop to 1MB, as this is our "chunk" size */ - room = RELOC(alloc_top) - RELOC(alloc_bottom) - 0x4000; + room = alloc_top - alloc_bottom - 0x4000; if (room > DEVTREE_CHUNK_SIZE) room = DEVTREE_CHUNK_SIZE; - prom_debug("starting device tree allocs at %x\n", RELOC(alloc_bottom)); + prom_debug("starting device tree allocs at %x\n", alloc_bottom); /* Now try to claim that */ mem_start = (unsigned long)alloc_up(room, PAGE_SIZE); if (mem_start == 0) prom_panic("Can't allocate initial device-tree chunk\n"); - mem_end = RELOC(alloc_top); + mem_end = mem_start + room; /* Get root of tree */ root = call_prom("peer", 1, 1, (phandle)0); @@ -1784,74 +2229,210 @@ static void __init flatten_device_tree(void) mem_start = _ALIGN(mem_start, 4); hdr = make_room(&mem_start, &mem_end, sizeof(struct boot_param_header), 4); - RELOC(dt_header_start) = (unsigned long)hdr; + dt_header_start = (unsigned long)hdr; rsvmap = make_room(&mem_start, &mem_end, sizeof(mem_reserve_map), 8); /* Start of strings */ mem_start = PAGE_ALIGN(mem_start); - RELOC(dt_string_start) = mem_start; + dt_string_start = mem_start; mem_start += 4; /* hole */ /* Add "linux,phandle" in there, we'll need it */ namep = make_room(&mem_start, &mem_end, 16, 1); - strcpy(namep, RELOC("linux,phandle")); + strcpy(namep, "linux,phandle"); mem_start = (unsigned long)namep + strlen(namep) + 1; /* Build string array */ prom_printf("Building dt strings...\n"); scan_dt_build_strings(root, &mem_start, &mem_end); - RELOC(dt_string_end) = mem_start; + dt_string_end = mem_start; /* Build structure */ mem_start = PAGE_ALIGN(mem_start); - RELOC(dt_struct_start) = mem_start; + dt_struct_start = mem_start; prom_printf("Building dt structure...\n"); scan_dt_build_struct(root, &mem_start, &mem_end); dt_push_token(OF_DT_END, &mem_start, &mem_end); - RELOC(dt_struct_end) = PAGE_ALIGN(mem_start); + dt_struct_end = PAGE_ALIGN(mem_start); /* Finish header 
*/ - hdr->boot_cpuid_phys = _prom->cpu; - hdr->magic = OF_DT_HEADER; - hdr->totalsize = RELOC(dt_struct_end) - RELOC(dt_header_start); - hdr->off_dt_struct = RELOC(dt_struct_start) - RELOC(dt_header_start); - hdr->off_dt_strings = RELOC(dt_string_start) - RELOC(dt_header_start); - hdr->dt_strings_size = RELOC(dt_string_end) - RELOC(dt_string_start); - hdr->off_mem_rsvmap = ((unsigned long)rsvmap) - RELOC(dt_header_start); - hdr->version = OF_DT_VERSION; + hdr->boot_cpuid_phys = cpu_to_be32(prom.cpu); + hdr->magic = cpu_to_be32(OF_DT_HEADER); + hdr->totalsize = cpu_to_be32(dt_struct_end - dt_header_start); + hdr->off_dt_struct = cpu_to_be32(dt_struct_start - dt_header_start); + hdr->off_dt_strings = cpu_to_be32(dt_string_start - dt_header_start); + hdr->dt_strings_size = cpu_to_be32(dt_string_end - dt_string_start); + hdr->off_mem_rsvmap = cpu_to_be32(((unsigned long)rsvmap) - dt_header_start); + hdr->version = cpu_to_be32(OF_DT_VERSION); /* Version 16 is not backward compatible */ - hdr->last_comp_version = 0x10; + hdr->last_comp_version = cpu_to_be32(0x10); - /* Reserve the whole thing and copy the reserve map in, we - * also bump mem_reserve_cnt to cause further reservations to - * fail since it's too late. - */ - reserve_mem(RELOC(dt_header_start), hdr->totalsize); - memcpy(rsvmap, RELOC(mem_reserve_map), sizeof(mem_reserve_map)); + /* Copy the reserve map in */ + memcpy(rsvmap, mem_reserve_map, sizeof(mem_reserve_map)); #ifdef DEBUG_PROM { int i; prom_printf("reserved memory map:\n"); - for (i = 0; i < RELOC(mem_reserve_cnt); i++) + for (i = 0; i < mem_reserve_cnt; i++) prom_printf(" %x - %x\n", - RELOC(mem_reserve_map)[i].base, - RELOC(mem_reserve_map)[i].size); + be64_to_cpu(mem_reserve_map[i].base), + be64_to_cpu(mem_reserve_map[i].size)); } #endif - RELOC(mem_reserve_cnt) = MEM_RESERVE_MAP_SIZE; + /* Bump mem_reserve_cnt to cause further reservations to fail + * since it's too late. 
+ */ + mem_reserve_cnt = MEM_RESERVE_MAP_SIZE; prom_printf("Device tree strings 0x%x -> 0x%x\n", - RELOC(dt_string_start), RELOC(dt_string_end)); + dt_string_start, dt_string_end); prom_printf("Device tree struct 0x%x -> 0x%x\n", - RELOC(dt_struct_start), RELOC(dt_struct_end)); + dt_struct_start, dt_struct_end); +} + +#ifdef CONFIG_PPC_MAPLE +/* PIBS Version 1.05.0000 04/26/2005 has an incorrect /ht/isa/ranges property. + * The values are bad, and it doesn't even have the right number of cells. */ +static void __init fixup_device_tree_maple(void) +{ + phandle isa; + u32 rloc = 0x01002000; /* IO space; PCI device = 4 */ + u32 isa_ranges[6]; + char *name; + + name = "/ht@0/isa@4"; + isa = call_prom("finddevice", 1, 1, ADDR(name)); + if (!PHANDLE_VALID(isa)) { + name = "/ht@0/isa@6"; + isa = call_prom("finddevice", 1, 1, ADDR(name)); + rloc = 0x01003000; /* IO space; PCI device = 6 */ + } + if (!PHANDLE_VALID(isa)) + return; + + if (prom_getproplen(isa, "ranges") != 12) + return; + if (prom_getprop(isa, "ranges", isa_ranges, sizeof(isa_ranges)) + == PROM_ERROR) + return; + + if (isa_ranges[0] != 0x1 || + isa_ranges[1] != 0xf4000000 || + isa_ranges[2] != 0x00010000) + return; + + prom_printf("Fixing up bogus ISA range on Maple/Apache...\n"); + isa_ranges[0] = 0x1; + isa_ranges[1] = 0x0; + isa_ranges[2] = rloc; + isa_ranges[3] = 0x0; + isa_ranges[4] = 0x0; + isa_ranges[5] = 0x00010000; + prom_setprop(isa, name, "ranges", + isa_ranges, sizeof(isa_ranges)); } +#define CPC925_MC_START 0xf8000000 +#define CPC925_MC_LENGTH 0x1000000 +/* The values for memory-controller don't have right number of cells */ +static void __init fixup_device_tree_maple_memory_controller(void) +{ + phandle mc; + u32 mc_reg[4]; + char *name = "/hostbridge@f8000000"; + u32 ac, sc; + + mc = call_prom("finddevice", 1, 1, ADDR(name)); + if (!PHANDLE_VALID(mc)) + return; -static void __init fixup_device_tree(void) + if (prom_getproplen(mc, "reg") != 8) + return; + + prom_getprop(prom.root, 
"#address-cells", &ac, sizeof(ac)); + prom_getprop(prom.root, "#size-cells", &sc, sizeof(sc)); + if ((ac != 2) || (sc != 2)) + return; + + if (prom_getprop(mc, "reg", mc_reg, sizeof(mc_reg)) == PROM_ERROR) + return; + + if (mc_reg[0] != CPC925_MC_START || mc_reg[1] != CPC925_MC_LENGTH) + return; + + prom_printf("Fixing up bogus hostbridge on Maple...\n"); + + mc_reg[0] = 0x0; + mc_reg[1] = CPC925_MC_START; + mc_reg[2] = 0x0; + mc_reg[3] = CPC925_MC_LENGTH; + prom_setprop(mc, name, "reg", mc_reg, sizeof(mc_reg)); +} +#else +#define fixup_device_tree_maple() +#define fixup_device_tree_maple_memory_controller() +#endif + +#ifdef CONFIG_PPC_CHRP +/* + * Pegasos and BriQ lacks the "ranges" property in the isa node + * Pegasos needs decimal IRQ 14/15, not hexadecimal + * Pegasos has the IDE configured in legacy mode, but advertised as native + */ +static void __init fixup_device_tree_chrp(void) { + phandle ph; + u32 prop[6]; + u32 rloc = 0x01006000; /* IO space; PCI device = 12 */ + char *name; + int rc; + + name = "/pci@80000000/isa@c"; + ph = call_prom("finddevice", 1, 1, ADDR(name)); + if (!PHANDLE_VALID(ph)) { + name = "/pci@ff500000/isa@6"; + ph = call_prom("finddevice", 1, 1, ADDR(name)); + rloc = 0x01003000; /* IO space; PCI device = 6 */ + } + if (PHANDLE_VALID(ph)) { + rc = prom_getproplen(ph, "ranges"); + if (rc == 0 || rc == PROM_ERROR) { + prom_printf("Fixing up missing ISA range on Pegasos...\n"); + + prop[0] = 0x1; + prop[1] = 0x0; + prop[2] = rloc; + prop[3] = 0x0; + prop[4] = 0x0; + prop[5] = 0x00010000; + prom_setprop(ph, name, "ranges", prop, sizeof(prop)); + } + } + + name = "/pci@80000000/ide@C,1"; + ph = call_prom("finddevice", 1, 1, ADDR(name)); + if (PHANDLE_VALID(ph)) { + prom_printf("Fixing up IDE interrupt on Pegasos...\n"); + prop[0] = 14; + prop[1] = 0x0; + prom_setprop(ph, name, "interrupts", prop, 2*sizeof(u32)); + prom_printf("Fixing up IDE class-code on Pegasos...\n"); + rc = prom_getprop(ph, "class-code", prop, sizeof(u32)); + if (rc == 
sizeof(u32)) { + prop[0] &= ~0x5; + prom_setprop(ph, name, "class-code", prop, sizeof(u32)); + } + } +} +#else +#define fixup_device_tree_chrp() +#endif + #if defined(CONFIG_PPC64) && defined(CONFIG_PPC_PMAC) +static void __init fixup_device_tree_pmac(void) +{ phandle u3, i2c, mpic; u32 u3_rev; u32 interrupts[2]; @@ -1872,7 +2453,7 @@ static void __init fixup_device_tree(void) if (prom_getprop(u3, "device-rev", &u3_rev, sizeof(u3_rev)) == PROM_ERROR) return; - if (u3_rev != 0x35 && u3_rev != 0x37) + if (u3_rev < 0x35 || u3_rev > 0x39) return; /* does it need fixup ? */ if (prom_getproplen(i2c, "interrupts") > 0) @@ -1883,59 +2464,253 @@ static void __init fixup_device_tree(void) /* interrupt on this revision of u3 is number 0 and level */ interrupts[0] = 0; interrupts[1] = 1; - prom_setprop(i2c, "interrupts", &interrupts, sizeof(interrupts)); + prom_setprop(i2c, "/u3@0,f8000000/i2c@f8001000", "interrupts", + &interrupts, sizeof(interrupts)); parent = (u32)mpic; - prom_setprop(i2c, "interrupt-parent", &parent, sizeof(parent)); + prom_setprop(i2c, "/u3@0,f8000000/i2c@f8001000", "interrupt-parent", + &parent, sizeof(parent)); +} +#else +#define fixup_device_tree_pmac() #endif + +#ifdef CONFIG_PPC_EFIKA +/* + * The MPC5200 FEC driver requires an phy-handle property to tell it how + * to talk to the phy. If the phy-handle property is missing, then this + * function is called to add the appropriate nodes and link it to the + * ethernet node. + */ +static void __init fixup_device_tree_efika_add_phy(void) +{ + u32 node; + char prop[64]; + int rv; + + /* Check if /builtin/ethernet exists - bail if it doesn't */ + node = call_prom("finddevice", 1, 1, ADDR("/builtin/ethernet")); + if (!PHANDLE_VALID(node)) + return; + + /* Check if the phy-handle property exists - bail if it does */ + rv = prom_getprop(node, "phy-handle", prop, sizeof(prop)); + if (!rv) + return; + + /* + * At this point the ethernet device doesn't have a phy described. 
+ * Now we need to add the missing phy node and linkage + */ + + /* Check for an MDIO bus node - if missing then create one */ + node = call_prom("finddevice", 1, 1, ADDR("/builtin/mdio")); + if (!PHANDLE_VALID(node)) { + prom_printf("Adding Ethernet MDIO node\n"); + call_prom("interpret", 1, 1, + " s\" /builtin\" find-device" + " new-device" + " 1 encode-int s\" #address-cells\" property" + " 0 encode-int s\" #size-cells\" property" + " s\" mdio\" device-name" + " s\" fsl,mpc5200b-mdio\" encode-string" + " s\" compatible\" property" + " 0xf0003000 0x400 reg" + " 0x2 encode-int" + " 0x5 encode-int encode+" + " 0x3 encode-int encode+" + " s\" interrupts\" property" + " finish-device"); + }; + + /* Check for a PHY device node - if missing then create one and + * give it's phandle to the ethernet node */ + node = call_prom("finddevice", 1, 1, + ADDR("/builtin/mdio/ethernet-phy")); + if (!PHANDLE_VALID(node)) { + prom_printf("Adding Ethernet PHY node\n"); + call_prom("interpret", 1, 1, + " s\" /builtin/mdio\" find-device" + " new-device" + " s\" ethernet-phy\" device-name" + " 0x10 encode-int s\" reg\" property" + " my-self" + " ihandle>phandle" + " finish-device" + " s\" /builtin/ethernet\" find-device" + " encode-int" + " s\" phy-handle\" property" + " device-end"); + } } +static void __init fixup_device_tree_efika(void) +{ + int sound_irq[3] = { 2, 2, 0 }; + int bcomm_irq[3*16] = { 3,0,0, 3,1,0, 3,2,0, 3,3,0, + 3,4,0, 3,5,0, 3,6,0, 3,7,0, + 3,8,0, 3,9,0, 3,10,0, 3,11,0, + 3,12,0, 3,13,0, 3,14,0, 3,15,0 }; + u32 node; + char prop[64]; + int rv, len; + + /* Check if we're really running on a EFIKA */ + node = call_prom("finddevice", 1, 1, ADDR("/")); + if (!PHANDLE_VALID(node)) + return; + + rv = prom_getprop(node, "model", prop, sizeof(prop)); + if (rv == PROM_ERROR) + return; + if (strcmp(prop, "EFIKA5K2")) + return; + + prom_printf("Applying EFIKA device tree fixups\n"); + + /* Claiming to be 'chrp' is death */ + node = call_prom("finddevice", 1, 1, ADDR("/")); + 
rv = prom_getprop(node, "device_type", prop, sizeof(prop)); + if (rv != PROM_ERROR && (strcmp(prop, "chrp") == 0)) + prom_setprop(node, "/", "device_type", "efika", sizeof("efika")); + + /* CODEGEN,description is exposed in /proc/cpuinfo so + fix that too */ + rv = prom_getprop(node, "CODEGEN,description", prop, sizeof(prop)); + if (rv != PROM_ERROR && (strstr(prop, "CHRP"))) + prom_setprop(node, "/", "CODEGEN,description", + "Efika 5200B PowerPC System", + sizeof("Efika 5200B PowerPC System")); + + /* Fixup bestcomm interrupts property */ + node = call_prom("finddevice", 1, 1, ADDR("/builtin/bestcomm")); + if (PHANDLE_VALID(node)) { + len = prom_getproplen(node, "interrupts"); + if (len == 12) { + prom_printf("Fixing bestcomm interrupts property\n"); + prom_setprop(node, "/builtin/bestcom", "interrupts", + bcomm_irq, sizeof(bcomm_irq)); + } + } + + /* Fixup sound interrupts property */ + node = call_prom("finddevice", 1, 1, ADDR("/builtin/sound")); + if (PHANDLE_VALID(node)) { + rv = prom_getprop(node, "interrupts", prop, sizeof(prop)); + if (rv == PROM_ERROR) { + prom_printf("Adding sound interrupts property\n"); + prom_setprop(node, "/builtin/sound", "interrupts", + sound_irq, sizeof(sound_irq)); + } + } + + /* Make sure ethernet phy-handle property exists */ + fixup_device_tree_efika_add_phy(); +} +#else +#define fixup_device_tree_efika() +#endif + +static void __init fixup_device_tree(void) +{ + fixup_device_tree_maple(); + fixup_device_tree_maple_memory_controller(); + fixup_device_tree_chrp(); + fixup_device_tree_pmac(); + fixup_device_tree_efika(); +} static void __init prom_find_boot_cpu(void) { - struct prom_t *_prom = &RELOC(prom); - u32 getprop_rval; + __be32 rval; ihandle prom_cpu; phandle cpu_pkg; - _prom->cpu = 0; - if (prom_getprop(_prom->chosen, "cpu", &prom_cpu, sizeof(prom_cpu)) <= 0) + rval = 0; + if (prom_getprop(prom.chosen, "cpu", &rval, sizeof(rval)) <= 0) return; + prom_cpu = be32_to_cpu(rval); cpu_pkg = call_prom("instance-to-package", 1, 
1, prom_cpu); - prom_getprop(cpu_pkg, "reg", &getprop_rval, sizeof(getprop_rval)); - _prom->cpu = getprop_rval; + prom_getprop(cpu_pkg, "reg", &rval, sizeof(rval)); + prom.cpu = be32_to_cpu(rval); - prom_debug("Booting CPU hw index = 0x%x\n", _prom->cpu); + prom_debug("Booting CPU hw index = %lu\n", prom.cpu); } static void __init prom_check_initrd(unsigned long r3, unsigned long r4) { #ifdef CONFIG_BLK_DEV_INITRD - struct prom_t *_prom = &RELOC(prom); - if (r3 && r4 && r4 != 0xdeadbeef) { - unsigned long val; + __be64 val; - RELOC(prom_initrd_start) = (r3 >= KERNELBASE) ? __pa(r3) : r3; - RELOC(prom_initrd_end) = RELOC(prom_initrd_start) + r4; + prom_initrd_start = is_kernel_addr(r3) ? __pa(r3) : r3; + prom_initrd_end = prom_initrd_start + r4; - val = RELOC(prom_initrd_start); - prom_setprop(_prom->chosen, "linux,initrd-start", &val, - sizeof(val)); - val = RELOC(prom_initrd_end); - prom_setprop(_prom->chosen, "linux,initrd-end", &val, - sizeof(val)); + val = cpu_to_be64(prom_initrd_start); + prom_setprop(prom.chosen, "/chosen", "linux,initrd-start", + &val, sizeof(val)); + val = cpu_to_be64(prom_initrd_end); + prom_setprop(prom.chosen, "/chosen", "linux,initrd-end", + &val, sizeof(val)); - reserve_mem(RELOC(prom_initrd_start), - RELOC(prom_initrd_end) - RELOC(prom_initrd_start)); + reserve_mem(prom_initrd_start, + prom_initrd_end - prom_initrd_start); - prom_debug("initrd_start=0x%x\n", RELOC(prom_initrd_start)); - prom_debug("initrd_end=0x%x\n", RELOC(prom_initrd_end)); + prom_debug("initrd_start=0x%x\n", prom_initrd_start); + prom_debug("initrd_end=0x%x\n", prom_initrd_end); } #endif /* CONFIG_BLK_DEV_INITRD */ } +#ifdef CONFIG_PPC64 +#ifdef CONFIG_RELOCATABLE +static void reloc_toc(void) +{ +} + +static void unreloc_toc(void) +{ +} +#else +static void __reloc_toc(unsigned long offset, unsigned long nr_entries) +{ + unsigned long i; + unsigned long *toc_entry; + + /* Get the start of the TOC by using r2 directly. 
*/ + asm volatile("addi %0,2,-0x8000" : "=b" (toc_entry)); + + for (i = 0; i < nr_entries; i++) { + *toc_entry = *toc_entry + offset; + toc_entry++; + } +} + +static void reloc_toc(void) +{ + unsigned long offset = reloc_offset(); + unsigned long nr_entries = + (__prom_init_toc_end - __prom_init_toc_start) / sizeof(long); + + __reloc_toc(offset, nr_entries); + + mb(); +} + +static void unreloc_toc(void) +{ + unsigned long offset = reloc_offset(); + unsigned long nr_entries = + (__prom_init_toc_end - __prom_init_toc_start) / sizeof(long); + + mb(); + + __reloc_toc(-offset, nr_entries); +} +#endif +#endif + /* * We enter here early on, when the Open Firmware prom is still * handling exceptions and the MMU hash table for us. @@ -1943,23 +2718,22 @@ static void __init prom_check_initrd(unsigned long r3, unsigned long r4) unsigned long __init prom_init(unsigned long r3, unsigned long r4, unsigned long pp, - unsigned long r6, unsigned long r7) + unsigned long r6, unsigned long r7, + unsigned long kbase) { - struct prom_t *_prom; unsigned long hdr; - u32 getprop_rval; - unsigned long offset = reloc_offset(); #ifdef CONFIG_PPC32 + unsigned long offset = reloc_offset(); reloc_got2(offset); +#else + reloc_toc(); #endif - _prom = &RELOC(prom); - /* * First zero the BSS */ - memset(&RELOC(__bss_start), 0, __bss_stop - __bss_start); + memset(&__bss_start, 0, __bss_stop - __bss_start); /* * Init interface to Open Firmware, get some node references, @@ -1968,42 +2742,50 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, prom_init_client_services(pp); /* - * Init prom stdout device - */ - prom_init_stdout(); - - /* * See if this OF is old enough that we need to do explicit maps + * and other workarounds */ prom_find_mmu(); /* - * Check for an initrd + * Init prom stdout device */ - prom_check_initrd(r3, r4); + prom_init_stdout(); + + prom_printf("Preparing to boot %s", linux_banner); /* * Get default machine type. 
At this point, we do not differentiate * between pSeries SMP and pSeries LPAR */ - RELOC(of_platform) = prom_find_machine_type(); - getprop_rval = RELOC(of_platform); - prom_setprop(_prom->chosen, "linux,platform", - &getprop_rval, sizeof(getprop_rval)); + of_platform = prom_find_machine_type(); + prom_printf("Detected machine type: %x\n", of_platform); -#ifdef CONFIG_PPC_PSERIES +#ifndef CONFIG_NONSTATIC_KERNEL + /* Bail if this is a kdump kernel. */ + if (PHYSICAL_START > 0) + prom_panic("Error: You can't boot a kdump kernel from OF!\n"); +#endif + + /* + * Check for an initrd + */ + prom_check_initrd(r3, r4); + +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * On pSeries, inform the firmware about our capabilities */ - if (RELOC(of_platform) & PLATFORM_PSERIES) + if (of_platform == PLATFORM_PSERIES || + of_platform == PLATFORM_PSERIES_LPAR) prom_send_capabilities(); #endif /* - * On pSeries and BPA, copy the CPU hold code + * Copy the CPU hold code */ - if (RELOC(of_platform) != PLATFORM_POWERMAC) - copy_and_flush(0, KERNELBASE + offset, 0x100, 0); + if (of_platform != PLATFORM_POWERMAC) + copy_and_flush(0, kbase, 0x100, 0); /* * Do early parsing of command line @@ -2025,46 +2807,68 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, */ prom_check_displays(); -#ifdef CONFIG_PPC64 +#if defined(CONFIG_PPC64) && defined(__BIG_ENDIAN__) /* * Initialize IOMMU (TCE tables) on pSeries. Do that before anything else * that uses the allocator, we need to make sure we get the top of memory * available for us here... */ - if (RELOC(of_platform) == PLATFORM_PSERIES) + if (of_platform == PLATFORM_PSERIES) prom_initialize_tce_table(); #endif /* - * On non-powermacs, try to instantiate RTAS and puts all CPUs - * in spin-loops. PowerMacs don't have a working RTAS and use - * a different way to spin CPUs + * On non-powermacs, try to instantiate RTAS. PowerMacs don't + * have a usable RTAS implementation. 
*/ - if (RELOC(of_platform) != PLATFORM_POWERMAC) { + if (of_platform != PLATFORM_POWERMAC && + of_platform != PLATFORM_OPAL) prom_instantiate_rtas(); + +#ifdef CONFIG_PPC_POWERNV + if (of_platform == PLATFORM_OPAL) + prom_instantiate_opal(); +#endif /* CONFIG_PPC_POWERNV */ + +#ifdef CONFIG_PPC64 + /* instantiate sml */ + prom_instantiate_sml(); +#endif + + /* + * On non-powermacs, put all CPUs in spin-loops. + * + * PowerMacs use a different mechanism to spin CPUs + * + * (This must be done after instantiating RTAS) + */ + if (of_platform != PLATFORM_POWERMAC && + of_platform != PLATFORM_OPAL) prom_hold_cpus(); - } /* * Fill in some infos for use by the kernel later on */ - if (RELOC(prom_memory_limit)) - prom_setprop(_prom->chosen, "linux,memory-limit", - &RELOC(prom_memory_limit), - sizeof(prom_memory_limit)); + if (prom_memory_limit) { + __be64 val = cpu_to_be64(prom_memory_limit); + prom_setprop(prom.chosen, "/chosen", "linux,memory-limit", + &val, sizeof(val)); + } #ifdef CONFIG_PPC64 - if (RELOC(ppc64_iommu_off)) - prom_setprop(_prom->chosen, "linux,iommu-off", NULL, 0); + if (prom_iommu_off) + prom_setprop(prom.chosen, "/chosen", "linux,iommu-off", + NULL, 0); - if (RELOC(iommu_force_on)) - prom_setprop(_prom->chosen, "linux,iommu-force-on", NULL, 0); + if (prom_iommu_force_on) + prom_setprop(prom.chosen, "/chosen", "linux,iommu-force-on", + NULL, 0); - if (RELOC(prom_tce_alloc_start)) { - prom_setprop(_prom->chosen, "linux,tce-alloc-start", - &RELOC(prom_tce_alloc_start), + if (prom_tce_alloc_start) { + prom_setprop(prom.chosen, "/chosen", "linux,tce-alloc-start", + &prom_tce_alloc_start, sizeof(prom_tce_alloc_start)); - prom_setprop(_prom->chosen, "linux,tce-alloc-end", - &RELOC(prom_tce_alloc_end), + prom_setprop(prom.chosen, "/chosen", "linux,tce-alloc-end", + &prom_tce_alloc_end, sizeof(prom_tce_alloc_end)); } #endif @@ -2077,17 +2881,24 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, /* * Now finally create the flattened 
device-tree */ - prom_printf("copying OF device tree ...\n"); + prom_printf("copying OF device tree...\n"); flatten_device_tree(); - /* in case stdin is USB and still active on IBM machines... */ - prom_close_stdin(); + /* + * in case stdin is USB and still active on IBM machines... + * Unfortunately quiesce crashes on some powermacs if we have + * closed stdin already (in particular the powerbook 101). It + * appears that the OPAL version of OFW doesn't like it either. + */ + if (of_platform != PLATFORM_POWERMAC && + of_platform != PLATFORM_OPAL) + prom_close_stdin(); /* * Call OF "quiesce" method to shut down pending DMA's from * devices etc... */ - prom_printf("Calling quiesce ...\n"); + prom_printf("Calling quiesce...\n"); call_prom("quiesce", 0, 0); /* @@ -2095,15 +2906,27 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * tree and NULL as r5, thus triggering the new entry point which * is common to us and kexec */ - hdr = RELOC(dt_header_start); - prom_printf("returning from prom_init\n"); - prom_debug("->dt_header_start=0x%x\n", hdr); + hdr = dt_header_start; + + /* Don't print anything after quiesce under OPAL, it crashes OFW */ + if (of_platform != PLATFORM_OPAL) { + prom_printf("returning from prom_init\n"); + prom_debug("->dt_header_start=0x%x\n", hdr); + } #ifdef CONFIG_PPC32 reloc_got2(-offset); +#else + unreloc_toc(); #endif - __start(hdr, KERNELBASE + offset, 0); +#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL + /* OPAL early debug gets the OPAL base & entry in r8 and r9 */ + __start(hdr, kbase, 0, 0, 0, + prom_opal_base, prom_opal_entry); +#else + __start(hdr, kbase, 0, 0, 0, 0, 0); +#endif return 0; } diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh new file mode 100644 index 00000000000..fe8e54b9ef7 --- /dev/null +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -0,0 +1,79 @@ +#!/bin/sh +# +# Copyright © 2008 IBM Corporation +# +# This program is free software; you can redistribute it and/or +# 
modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version +# 2 of the License, or (at your option) any later version. + +# This script checks prom_init.o to see what external symbols it +# is using, if it finds symbols not in the whitelist it returns +# an error. The point of this is to discourage people from +# intentionally or accidentally adding new code to prom_init.c +# which has side effects on other parts of the kernel. + +# If you really need to reference something from prom_init.o add +# it to the list below: + +WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush +_end enter_prom memcpy memset reloc_offset __secondary_hold +__secondary_hold_acknowledge __secondary_hold_spinloop __start +strcmp strcpy strlcpy strlen strncmp strstr logo_linux_clut224 +reloc_got2 kernstart_addr memstart_addr linux_banner _stext +__prom_init_toc_start __prom_init_toc_end btext_setup_display TOC." + +NM="$1" +OBJ="$2" + +ERROR=0 + +for UNDEF in $($NM -u $OBJ | awk '{print $2}') +do + # On 64-bit nm gives us the function descriptors, which have + # a leading . on the name, so strip it off here. 
+ UNDEF="${UNDEF#.}" + + if [ $KBUILD_VERBOSE ]; then + if [ $KBUILD_VERBOSE -ne 0 ]; then + echo "Checking prom_init.o symbol '$UNDEF'" + fi + fi + + OK=0 + for WHITE in $WHITELIST + do + if [ "$UNDEF" = "$WHITE" ]; then + OK=1 + break + fi + done + + # ignore register save/restore functions + if [ "${UNDEF:0:9}" = "_restgpr_" ]; then + OK=1 + fi + if [ "${UNDEF:0:10}" = "_restgpr0_" ]; then + OK=1 + fi + if [ "${UNDEF:0:11}" = "_rest32gpr_" ]; then + OK=1 + fi + if [ "${UNDEF:0:9}" = "_savegpr_" ]; then + OK=1 + fi + if [ "${UNDEF:0:10}" = "_savegpr0_" ]; then + OK=1 + fi + if [ "${UNDEF:0:11}" = "_save32gpr_" ]; then + OK=1 + fi + + if [ $OK -eq 0 ]; then + ERROR=1 + echo "Error: External symbol '$UNDEF' referenced" \ + "from prom_init.c" >&2 + fi +done + +exit $ERROR diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c new file mode 100644 index 00000000000..6295e646f78 --- /dev/null +++ b/arch/powerpc/kernel/prom_parse.c @@ -0,0 +1,33 @@ +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/ioport.h> +#include <linux/etherdevice.h> +#include <linux/of_address.h> +#include <asm/prom.h> + +void of_parse_dma_window(struct device_node *dn, const __be32 *dma_window, + unsigned long *busno, unsigned long *phys, + unsigned long *size) +{ + u32 cells; + const __be32 *prop; + + /* busno is always one cell */ + *busno = of_read_number(dma_window, 1); + dma_window++; + + prop = of_get_property(dn, "ibm,#dma-address-cells", NULL); + if (!prop) + prop = of_get_property(dn, "#address-cells", NULL); + + cells = prop ? of_read_number(prop, 1) : of_n_addr_cells(dn); + *phys = of_read_number(dma_window, cells); + + dma_window += cells; + + prop = of_get_property(dn, "ibm,#dma-size-cells", NULL); + cells = prop ? 
of_read_number(prop, 1) : of_n_size_cells(dn); + *size = of_read_number(dma_window, cells); +} diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 568ea335d61..2e3d2bf536c 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -15,132 +15,537 @@ * this archive for more details. */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/errno.h> #include <linux/ptrace.h> +#include <linux/regset.h> +#include <linux/tracehook.h> +#include <linux/elf.h> #include <linux/user.h> #include <linux/security.h> #include <linux/signal.h> #include <linux/seccomp.h> #include <linux/audit.h> -#ifdef CONFIG_PPC32 -#include <linux/module.h> -#endif +#include <trace/syscall.h> +#include <linux/hw_breakpoint.h> +#include <linux/perf_event.h> +#include <linux/context_tracking.h> #include <asm/uaccess.h> #include <asm/page.h> #include <asm/pgtable.h> -#include <asm/system.h> +#include <asm/switch_to.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + +/* + * The parameter save area on the stack is used to store arguments being passed + * to callee function and is located at fixed offset from stack pointer. 
+ */ +#ifdef CONFIG_PPC32 +#define PARAMETER_SAVE_AREA_OFFSET 24 /* bytes */ +#else /* CONFIG_PPC32 */ +#define PARAMETER_SAVE_AREA_OFFSET 48 /* bytes */ +#endif + +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define STR(s) #s /* convert to string */ +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} +#define GPR_OFFSET_NAME(num) \ + {.name = STR(gpr##num), .offset = offsetof(struct pt_regs, gpr[num])} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { + GPR_OFFSET_NAME(0), + GPR_OFFSET_NAME(1), + GPR_OFFSET_NAME(2), + GPR_OFFSET_NAME(3), + GPR_OFFSET_NAME(4), + GPR_OFFSET_NAME(5), + GPR_OFFSET_NAME(6), + GPR_OFFSET_NAME(7), + GPR_OFFSET_NAME(8), + GPR_OFFSET_NAME(9), + GPR_OFFSET_NAME(10), + GPR_OFFSET_NAME(11), + GPR_OFFSET_NAME(12), + GPR_OFFSET_NAME(13), + GPR_OFFSET_NAME(14), + GPR_OFFSET_NAME(15), + GPR_OFFSET_NAME(16), + GPR_OFFSET_NAME(17), + GPR_OFFSET_NAME(18), + GPR_OFFSET_NAME(19), + GPR_OFFSET_NAME(20), + GPR_OFFSET_NAME(21), + GPR_OFFSET_NAME(22), + GPR_OFFSET_NAME(23), + GPR_OFFSET_NAME(24), + GPR_OFFSET_NAME(25), + GPR_OFFSET_NAME(26), + GPR_OFFSET_NAME(27), + GPR_OFFSET_NAME(28), + GPR_OFFSET_NAME(29), + GPR_OFFSET_NAME(30), + GPR_OFFSET_NAME(31), + REG_OFFSET_NAME(nip), + REG_OFFSET_NAME(msr), + REG_OFFSET_NAME(ctr), + REG_OFFSET_NAME(link), + REG_OFFSET_NAME(xer), + REG_OFFSET_NAME(ccr), #ifdef CONFIG_PPC64 -#include <asm/ptrace-common.h> + REG_OFFSET_NAME(softe), +#else + REG_OFFSET_NAME(mq), #endif + REG_OFFSET_NAME(trap), + REG_OFFSET_NAME(dar), + REG_OFFSET_NAME(dsisr), + REG_OFFSET_END, +}; + +/** + * regs_query_register_offset() - query register offset from its name + * @name: the name of a register + * + * regs_query_register_offset() returns the offset of a register in struct + * pt_regs from its name. 
If the name is invalid, this returns -EINVAL; + */ +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +/** + * regs_query_register_name() - query register name from its offset + * @offset: the offset of a register in struct pt_regs. + * + * regs_query_register_name() returns the name of a register from its + * offset in struct pt_regs. If the @offset is invalid, this returns NULL; + */ +const char *regs_query_register_name(unsigned int offset) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (roff->offset == offset) + return roff->name; + return NULL; +} + +/* + * does not yet catch signals sent when the child dies. + * in exit.c or in signal.c. + */ -#ifdef CONFIG_PPC32 /* * Set of msr bits that gdb can change on behalf of a process. */ -#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) +#ifdef CONFIG_PPC_ADV_DEBUG_REGS #define MSR_DEBUGCHANGE 0 #else #define MSR_DEBUGCHANGE (MSR_SE | MSR_BE) #endif -#endif /* CONFIG_PPC32 */ /* - * does not yet catch signals sent when the child dies. - * in exit.c or in signal.c. 
+ * Max register writeable via put_reg */ - #ifdef CONFIG_PPC32 +#define PT_MAX_PUT_REG PT_MQ +#else +#define PT_MAX_PUT_REG PT_CCR +#endif + +static unsigned long get_user_msr(struct task_struct *task) +{ + return task->thread.regs->msr | task->thread.fpexc_mode; +} + +static int set_user_msr(struct task_struct *task, unsigned long msr) +{ + task->thread.regs->msr &= ~MSR_DEBUGCHANGE; + task->thread.regs->msr |= msr & MSR_DEBUGCHANGE; + return 0; +} + +#ifdef CONFIG_PPC64 +static int get_user_dscr(struct task_struct *task, unsigned long *data) +{ + *data = task->thread.dscr; + return 0; +} + +static int set_user_dscr(struct task_struct *task, unsigned long dscr) +{ + task->thread.dscr = dscr; + task->thread.dscr_inherit = 1; + return 0; +} +#else +static int get_user_dscr(struct task_struct *task, unsigned long *data) +{ + return -EIO; +} + +static int set_user_dscr(struct task_struct *task, unsigned long dscr) +{ + return -EIO; +} +#endif + +/* + * We prevent mucking around with the reserved area of trap + * which are used internally by the kernel. + */ +static int set_user_trap(struct task_struct *task, unsigned long trap) +{ + task->thread.regs->trap = trap & 0xfff0; + return 0; +} + /* * Get contents of register REGNO in task TASK. */ -static inline unsigned long get_reg(struct task_struct *task, int regno) +int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) { - if (regno < sizeof(struct pt_regs) / sizeof(unsigned long) - && task->thread.regs != NULL) - return ((unsigned long *)task->thread.regs)[regno]; - return (0); + if ((task->thread.regs == NULL) || !data) + return -EIO; + + if (regno == PT_MSR) { + *data = get_user_msr(task); + return 0; + } + + if (regno == PT_DSCR) + return get_user_dscr(task, data); + + if (regno < (sizeof(struct pt_regs) / sizeof(unsigned long))) { + *data = ((unsigned long *)task->thread.regs)[regno]; + return 0; + } + + return -EIO; } /* * Write contents of register REGNO in task TASK. 
*/ -static inline int put_reg(struct task_struct *task, int regno, - unsigned long data) +int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data) { - if (regno <= PT_MQ && task->thread.regs != NULL) { - if (regno == PT_MSR) - data = (data & MSR_DEBUGCHANGE) - | (task->thread.regs->msr & ~MSR_DEBUGCHANGE); + if (task->thread.regs == NULL) + return -EIO; + + if (regno == PT_MSR) + return set_user_msr(task, data); + if (regno == PT_TRAP) + return set_user_trap(task, data); + if (regno == PT_DSCR) + return set_user_dscr(task, data); + + if (regno <= PT_MAX_PUT_REG) { ((unsigned long *)task->thread.regs)[regno] = data; return 0; } return -EIO; } +static int gpr_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int i, ret; + + if (target->thread.regs == NULL) + return -EIO; + + if (!FULL_REGS(target->thread.regs)) { + /* We have a partial register set. Fill 14-31 with bogus values */ + for (i = 14; i < 32; i++) + target->thread.regs->gpr[i] = NV_REG_POISON; + } + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + target->thread.regs, + 0, offsetof(struct pt_regs, msr)); + if (!ret) { + unsigned long msr = get_user_msr(target); + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &msr, + offsetof(struct pt_regs, msr), + offsetof(struct pt_regs, msr) + + sizeof(msr)); + } + + BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != + offsetof(struct pt_regs, msr) + sizeof(long)); + + if (!ret) + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.regs->orig_gpr3, + offsetof(struct pt_regs, orig_gpr3), + sizeof(struct pt_regs)); + if (!ret) + ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, + sizeof(struct pt_regs), -1); + + return ret; +} + +static int gpr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + unsigned long 
reg; + int ret; + + if (target->thread.regs == NULL) + return -EIO; + + CHECK_FULL_REGS(target->thread.regs); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + target->thread.regs, + 0, PT_MSR * sizeof(reg)); + + if (!ret && count > 0) { + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &reg, + PT_MSR * sizeof(reg), + (PT_MSR + 1) * sizeof(reg)); + if (!ret) + ret = set_user_msr(target, reg); + } + + BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != + offsetof(struct pt_regs, msr) + sizeof(long)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.regs->orig_gpr3, + PT_ORIG_R3 * sizeof(reg), + (PT_MAX_PUT_REG + 1) * sizeof(reg)); + + if (PT_MAX_PUT_REG + 1 < PT_TRAP && !ret) + ret = user_regset_copyin_ignore( + &pos, &count, &kbuf, &ubuf, + (PT_MAX_PUT_REG + 1) * sizeof(reg), + PT_TRAP * sizeof(reg)); + + if (!ret && count > 0) { + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &reg, + PT_TRAP * sizeof(reg), + (PT_TRAP + 1) * sizeof(reg)); + if (!ret) + ret = set_user_trap(target, reg); + } + + if (!ret) + ret = user_regset_copyin_ignore( + &pos, &count, &kbuf, &ubuf, + (PT_TRAP + 1) * sizeof(reg), -1); + + return ret; +} + +static int fpr_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ +#ifdef CONFIG_VSX + u64 buf[33]; + int i; +#endif + flush_fp_to_thread(target); + +#ifdef CONFIG_VSX + /* copy to local buffer then write that out */ + for (i = 0; i < 32 ; i++) + buf[i] = target->thread.TS_FPR(i); + buf[32] = target->thread.fp_state.fpscr; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1); + +#else + BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != + offsetof(struct thread_fp_state, fpr[32][0])); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.fp_state, 0, -1); +#endif +} + +static int fpr_set(struct task_struct *target, const struct user_regset *regset, + unsigned 
int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ +#ifdef CONFIG_VSX + u64 buf[33]; + int i; +#endif + flush_fp_to_thread(target); + +#ifdef CONFIG_VSX + /* copy to local buffer then write that out */ + i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1); + if (i) + return i; + for (i = 0; i < 32 ; i++) + target->thread.TS_FPR(i) = buf[i]; + target->thread.fp_state.fpscr = buf[32]; + return 0; +#else + BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != + offsetof(struct thread_fp_state, fpr[32][0])); + + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.fp_state, 0, -1); +#endif +} + #ifdef CONFIG_ALTIVEC /* - * Get contents of AltiVec register state in task TASK + * Get/set all the altivec registers vr0..vr31, vscr, vrsave, in one go. + * The transfer totals 34 quadword. Quadwords 0-31 contain the + * corresponding vector registers. Quadword 32 contains the vscr as the + * last word (offset 12) within that quadword. Quadword 33 contains the + * vrsave as the first word (offset 0) within the quadword. + * + * This definition of the VMX state is compatible with the current PPC32 + * ptrace interface. This allows signal handling and ptrace to use the + * same structures. This also simplifies the implementation of a bi-arch + * (combined (32- and 64-bit) gdb. */ -static inline int get_vrregs(unsigned long __user *data, struct task_struct *task) -{ - int i, j; - if (!access_ok(VERIFY_WRITE, data, 133 * sizeof(unsigned long))) - return -EFAULT; +static int vr_active(struct task_struct *target, + const struct user_regset *regset) +{ + flush_altivec_to_thread(target); + return target->thread.used_vr ? regset->n : 0; +} - /* copy AltiVec registers VR[0] .. 
VR[31] */ - for (i = 0; i < 32; i++) - for (j = 0; j < 4; j++, data++) - if (__put_user(task->thread.vr[i].u[j], data)) - return -EFAULT; +static int vr_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + flush_altivec_to_thread(target); + + BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) != + offsetof(struct thread_vr_state, vr[32])); + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.vr_state, 0, + 33 * sizeof(vector128)); + if (!ret) { + /* + * Copy out only the low-order word of vrsave. + */ + union { + elf_vrreg_t reg; + u32 word; + } vrsave; + memset(&vrsave, 0, sizeof(vrsave)); + vrsave.word = target->thread.vrsave; + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave, + 33 * sizeof(vector128), -1); + } - /* copy VSCR */ - for (i = 0; i < 4; i++, data++) - if (__put_user(task->thread.vscr.u[i], data)) - return -EFAULT; + return ret; +} - /* copy VRSAVE */ - if (__put_user(task->thread.vrsave, data)) - return -EFAULT; +static int vr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + flush_altivec_to_thread(target); + + BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) != + offsetof(struct thread_vr_state, vr[32])); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.vr_state, 0, + 33 * sizeof(vector128)); + if (!ret && count > 0) { + /* + * We use only the first word of vrsave. 
+ */ + union { + elf_vrreg_t reg; + u32 word; + } vrsave; + memset(&vrsave, 0, sizeof(vrsave)); + vrsave.word = target->thread.vrsave; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave, + 33 * sizeof(vector128), -1); + if (!ret) + target->thread.vrsave = vrsave.word; + } - return 0; + return ret; } +#endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_VSX /* - * Write contents of AltiVec register state into task TASK. + * Currently to set and get all the vsx state, you need to call + * the fp and VMX calls as well. This only get/sets the lower 32 + * 128bit VSX registers. */ -static inline int set_vrregs(struct task_struct *task, unsigned long __user *data) + +static int vsr_active(struct task_struct *target, + const struct user_regset *regset) { - int i, j; + flush_vsx_to_thread(target); + return target->thread.used_vsr ? regset->n : 0; +} - if (!access_ok(VERIFY_READ, data, 133 * sizeof(unsigned long))) - return -EFAULT; +static int vsr_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + u64 buf[32]; + int ret, i; - /* copy AltiVec registers VR[0] .. 
VR[31] */ - for (i = 0; i < 32; i++) - for (j = 0; j < 4; j++, data++) - if (__get_user(task->thread.vr[i].u[j], data)) - return -EFAULT; + flush_vsx_to_thread(target); - /* copy VSCR */ - for (i = 0; i < 4; i++, data++) - if (__get_user(task->thread.vscr.u[i], data)) - return -EFAULT; + for (i = 0; i < 32 ; i++) + buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + buf, 0, 32 * sizeof(double)); - /* copy VRSAVE */ - if (__get_user(task->thread.vrsave, data)) - return -EFAULT; + return ret; +} - return 0; +static int vsr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + u64 buf[32]; + int ret,i; + + flush_vsx_to_thread(target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + buf, 0, 32 * sizeof(double)); + for (i = 0; i < 32 ; i++) + target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + + + return ret; } -#endif +#endif /* CONFIG_VSX */ #ifdef CONFIG_SPE @@ -154,88 +559,502 @@ static inline int set_vrregs(struct task_struct *task, unsigned long __user *dat * } */ +static int evr_active(struct task_struct *target, + const struct user_regset *regset) +{ + flush_spe_to_thread(target); + return target->thread.used_spe ? 
regset->n : 0; +} + +static int evr_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + flush_spe_to_thread(target); + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.evr, + 0, sizeof(target->thread.evr)); + + BUILD_BUG_ON(offsetof(struct thread_struct, acc) + sizeof(u64) != + offsetof(struct thread_struct, spefscr)); + + if (!ret) + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.acc, + sizeof(target->thread.evr), -1); + + return ret; +} + +static int evr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + flush_spe_to_thread(target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.evr, + 0, sizeof(target->thread.evr)); + + BUILD_BUG_ON(offsetof(struct thread_struct, acc) + sizeof(u64) != + offsetof(struct thread_struct, spefscr)); + + if (!ret) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.acc, + sizeof(target->thread.evr), -1); + + return ret; +} +#endif /* CONFIG_SPE */ + + /* - * Get contents of SPE register state in task TASK. + * These are our native regset flavors. 
*/ -static inline int get_evrregs(unsigned long *data, struct task_struct *task) +enum powerpc_regset { + REGSET_GPR, + REGSET_FPR, +#ifdef CONFIG_ALTIVEC + REGSET_VMX, +#endif +#ifdef CONFIG_VSX + REGSET_VSX, +#endif +#ifdef CONFIG_SPE + REGSET_SPE, +#endif +}; + +static const struct user_regset native_regsets[] = { + [REGSET_GPR] = { + .core_note_type = NT_PRSTATUS, .n = ELF_NGREG, + .size = sizeof(long), .align = sizeof(long), + .get = gpr_get, .set = gpr_set + }, + [REGSET_FPR] = { + .core_note_type = NT_PRFPREG, .n = ELF_NFPREG, + .size = sizeof(double), .align = sizeof(double), + .get = fpr_get, .set = fpr_set + }, +#ifdef CONFIG_ALTIVEC + [REGSET_VMX] = { + .core_note_type = NT_PPC_VMX, .n = 34, + .size = sizeof(vector128), .align = sizeof(vector128), + .active = vr_active, .get = vr_get, .set = vr_set + }, +#endif +#ifdef CONFIG_VSX + [REGSET_VSX] = { + .core_note_type = NT_PPC_VSX, .n = 32, + .size = sizeof(double), .align = sizeof(double), + .active = vsr_active, .get = vsr_get, .set = vsr_set + }, +#endif +#ifdef CONFIG_SPE + [REGSET_SPE] = { + .core_note_type = NT_PPC_SPE, .n = 35, + .size = sizeof(u32), .align = sizeof(u32), + .active = evr_active, .get = evr_get, .set = evr_set + }, +#endif +}; + +static const struct user_regset_view user_ppc_native_view = { + .name = UTS_MACHINE, .e_machine = ELF_ARCH, .ei_osabi = ELF_OSABI, + .regsets = native_regsets, .n = ARRAY_SIZE(native_regsets) +}; + +#ifdef CONFIG_PPC64 +#include <linux/compat.h> + +static int gpr32_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) { + const unsigned long *regs = &target->thread.regs->gpr[0]; + compat_ulong_t *k = kbuf; + compat_ulong_t __user *u = ubuf; + compat_ulong_t reg; int i; - if (!access_ok(VERIFY_WRITE, data, 35 * sizeof(unsigned long))) - return -EFAULT; + if (target->thread.regs == NULL) + return -EIO; - /* copy SPEFSCR */ - if (__put_user(task->thread.spefscr, &data[34])) - 
return -EFAULT; + if (!FULL_REGS(target->thread.regs)) { + /* We have a partial register set. Fill 14-31 with bogus values */ + for (i = 14; i < 32; i++) + target->thread.regs->gpr[i] = NV_REG_POISON; + } + + pos /= sizeof(reg); + count /= sizeof(reg); - /* copy SPE registers EVR[0] .. EVR[31] */ - for (i = 0; i < 32; i++, data++) - if (__put_user(task->thread.evr[i], data)) + if (kbuf) + for (; count > 0 && pos < PT_MSR; --count) + *k++ = regs[pos++]; + else + for (; count > 0 && pos < PT_MSR; --count) + if (__put_user((compat_ulong_t) regs[pos++], u++)) + return -EFAULT; + + if (count > 0 && pos == PT_MSR) { + reg = get_user_msr(target); + if (kbuf) + *k++ = reg; + else if (__put_user(reg, u++)) return -EFAULT; + ++pos; + --count; + } - /* copy ACC */ - if (__put_user64(task->thread.acc, (unsigned long long *)data)) - return -EFAULT; + if (kbuf) + for (; count > 0 && pos < PT_REGS_COUNT; --count) + *k++ = regs[pos++]; + else + for (; count > 0 && pos < PT_REGS_COUNT; --count) + if (__put_user((compat_ulong_t) regs[pos++], u++)) + return -EFAULT; - return 0; + kbuf = k; + ubuf = u; + pos *= sizeof(reg); + count *= sizeof(reg); + return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf, + PT_REGS_COUNT * sizeof(reg), -1); } -/* - * Write contents of SPE register state into task TASK. 
- */ -static inline int set_evrregs(struct task_struct *task, unsigned long *data) +static int gpr32_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) { - int i; + unsigned long *regs = &target->thread.regs->gpr[0]; + const compat_ulong_t *k = kbuf; + const compat_ulong_t __user *u = ubuf; + compat_ulong_t reg; - if (!access_ok(VERIFY_READ, data, 35 * sizeof(unsigned long))) - return -EFAULT; + if (target->thread.regs == NULL) + return -EIO; - /* copy SPEFSCR */ - if (__get_user(task->thread.spefscr, &data[34])) - return -EFAULT; + CHECK_FULL_REGS(target->thread.regs); - /* copy SPE registers EVR[0] .. EVR[31] */ - for (i = 0; i < 32; i++, data++) - if (__get_user(task->thread.evr[i], data)) + pos /= sizeof(reg); + count /= sizeof(reg); + + if (kbuf) + for (; count > 0 && pos < PT_MSR; --count) + regs[pos++] = *k++; + else + for (; count > 0 && pos < PT_MSR; --count) { + if (__get_user(reg, u++)) + return -EFAULT; + regs[pos++] = reg; + } + + + if (count > 0 && pos == PT_MSR) { + if (kbuf) + reg = *k++; + else if (__get_user(reg, u++)) return -EFAULT; - /* copy ACC */ - if (__get_user64(task->thread.acc, (unsigned long long*)data)) - return -EFAULT; + set_user_msr(target, reg); + ++pos; + --count; + } - return 0; + if (kbuf) { + for (; count > 0 && pos <= PT_MAX_PUT_REG; --count) + regs[pos++] = *k++; + for (; count > 0 && pos < PT_TRAP; --count, ++pos) + ++k; + } else { + for (; count > 0 && pos <= PT_MAX_PUT_REG; --count) { + if (__get_user(reg, u++)) + return -EFAULT; + regs[pos++] = reg; + } + for (; count > 0 && pos < PT_TRAP; --count, ++pos) + if (__get_user(reg, u++)) + return -EFAULT; + } + + if (count > 0 && pos == PT_TRAP) { + if (kbuf) + reg = *k++; + else if (__get_user(reg, u++)) + return -EFAULT; + set_user_trap(target, reg); + ++pos; + --count; + } + + kbuf = k; + ubuf = u; + pos *= sizeof(reg); + count *= sizeof(reg); + return 
user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, + (PT_TRAP + 1) * sizeof(reg), -1); } -#endif /* CONFIG_SPE */ -static inline void -set_single_step(struct task_struct *task) +/* + * These are the regset flavors matching the CONFIG_PPC32 native set. + */ +static const struct user_regset compat_regsets[] = { + [REGSET_GPR] = { + .core_note_type = NT_PRSTATUS, .n = ELF_NGREG, + .size = sizeof(compat_long_t), .align = sizeof(compat_long_t), + .get = gpr32_get, .set = gpr32_set + }, + [REGSET_FPR] = { + .core_note_type = NT_PRFPREG, .n = ELF_NFPREG, + .size = sizeof(double), .align = sizeof(double), + .get = fpr_get, .set = fpr_set + }, +#ifdef CONFIG_ALTIVEC + [REGSET_VMX] = { + .core_note_type = NT_PPC_VMX, .n = 34, + .size = sizeof(vector128), .align = sizeof(vector128), + .active = vr_active, .get = vr_get, .set = vr_set + }, +#endif +#ifdef CONFIG_SPE + [REGSET_SPE] = { + .core_note_type = NT_PPC_SPE, .n = 35, + .size = sizeof(u32), .align = sizeof(u32), + .active = evr_active, .get = evr_get, .set = evr_set + }, +#endif +}; + +static const struct user_regset_view user_ppc_compat_view = { + .name = "ppc", .e_machine = EM_PPC, .ei_osabi = ELF_OSABI, + .regsets = compat_regsets, .n = ARRAY_SIZE(compat_regsets) +}; +#endif /* CONFIG_PPC64 */ + +const struct user_regset_view *task_user_regset_view(struct task_struct *task) +{ +#ifdef CONFIG_PPC64 + if (test_tsk_thread_flag(task, TIF_32BIT)) + return &user_ppc_compat_view; +#endif + return &user_ppc_native_view; +} + + +void user_enable_single_step(struct task_struct *task) { struct pt_regs *regs = task->thread.regs; if (regs != NULL) { -#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) - task->thread.dbcr0 = DBCR0_IDM | DBCR0_IC; +#ifdef CONFIG_PPC_ADV_DEBUG_REGS + task->thread.debug.dbcr0 &= ~DBCR0_BT; + task->thread.debug.dbcr0 |= DBCR0_IDM | DBCR0_IC; regs->msr |= MSR_DE; #else + regs->msr &= ~MSR_BE; regs->msr |= MSR_SE; #endif } + set_tsk_thread_flag(task, TIF_SINGLESTEP); } -static inline void 
-clear_single_step(struct task_struct *task) +void user_enable_block_step(struct task_struct *task) { struct pt_regs *regs = task->thread.regs; if (regs != NULL) { -#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) - task->thread.dbcr0 = 0; - regs->msr &= ~MSR_DE; +#ifdef CONFIG_PPC_ADV_DEBUG_REGS + task->thread.debug.dbcr0 &= ~DBCR0_IC; + task->thread.debug.dbcr0 = DBCR0_IDM | DBCR0_BT; + regs->msr |= MSR_DE; #else regs->msr &= ~MSR_SE; + regs->msr |= MSR_BE; +#endif + } + set_tsk_thread_flag(task, TIF_SINGLESTEP); +} + +void user_disable_single_step(struct task_struct *task) +{ + struct pt_regs *regs = task->thread.regs; + + if (regs != NULL) { +#ifdef CONFIG_PPC_ADV_DEBUG_REGS + /* + * The logic to disable single stepping should be as + * simple as turning off the Instruction Complete flag. + * And, after doing so, if all debug flags are off, turn + * off DBCR0(IDM) and MSR(DE) .... Torez + */ + task->thread.debug.dbcr0 &= ~(DBCR0_IC|DBCR0_BT); + /* + * Test to see if any of the DBCR_ACTIVE_EVENTS bits are set. + */ + if (!DBCR_ACTIVE_EVENTS(task->thread.debug.dbcr0, + task->thread.debug.dbcr1)) { + /* + * All debug events were off..... + */ + task->thread.debug.dbcr0 &= ~DBCR0_IDM; + regs->msr &= ~MSR_DE; + } +#else + regs->msr &= ~(MSR_SE | MSR_BE); #endif } + clear_tsk_thread_flag(task, TIF_SINGLESTEP); +} + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +void ptrace_triggered(struct perf_event *bp, + struct perf_sample_data *data, struct pt_regs *regs) +{ + struct perf_event_attr attr; + + /* + * Disable the breakpoint request here since ptrace has defined a + * one-shot behaviour for breakpoint exceptions in PPC64. + * The SIGTRAP signal is generated automatically for us in do_dabr(). 
+ * We don't have to do anything about that here + */ + attr = bp->attr; + attr.disabled = true; + modify_user_hw_breakpoint(bp, &attr); +} +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ + +int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, + unsigned long data) +{ +#ifdef CONFIG_HAVE_HW_BREAKPOINT + int ret; + struct thread_struct *thread = &(task->thread); + struct perf_event *bp; + struct perf_event_attr attr; +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ +#ifndef CONFIG_PPC_ADV_DEBUG_REGS + struct arch_hw_breakpoint hw_brk; +#endif + + /* For ppc64 we support one DABR and no IABR's at the moment (ppc64). + * For embedded processors we support one DAC and no IAC's at the + * moment. + */ + if (addr > 0) + return -EINVAL; + + /* The bottom 3 bits in dabr are flags */ + if ((data & ~0x7UL) >= TASK_SIZE) + return -EIO; + +#ifndef CONFIG_PPC_ADV_DEBUG_REGS + /* For processors using DABR (i.e. 970), the bottom 3 bits are flags. + * It was assumed, on previous implementations, that 3 bits were + * passed together with the data address, fitting the design of the + * DABR register, as follows: + * + * bit 0: Read flag + * bit 1: Write flag + * bit 2: Breakpoint translation + * + * Thus, we use them here as so. 
+ */ + + /* Ensure breakpoint translation bit is set */ + if (data && !(data & HW_BRK_TYPE_TRANSLATE)) + return -EIO; + hw_brk.address = data & (~HW_BRK_TYPE_DABR); + hw_brk.type = (data & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL; + hw_brk.len = 8; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + bp = thread->ptrace_bps[0]; + if ((!data) || !(hw_brk.type & HW_BRK_TYPE_RDWR)) { + if (bp) { + unregister_hw_breakpoint(bp); + thread->ptrace_bps[0] = NULL; + } + return 0; + } + if (bp) { + attr = bp->attr; + attr.bp_addr = hw_brk.address; + arch_bp_generic_fields(hw_brk.type, &attr.bp_type); + + /* Enable breakpoint */ + attr.disabled = false; + + ret = modify_user_hw_breakpoint(bp, &attr); + if (ret) { + return ret; + } + thread->ptrace_bps[0] = bp; + thread->hw_brk = hw_brk; + return 0; + } + + /* Create a new breakpoint request if one doesn't exist already */ + hw_breakpoint_init(&attr); + attr.bp_addr = hw_brk.address; + arch_bp_generic_fields(hw_brk.type, + &attr.bp_type); + + thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr, + ptrace_triggered, NULL, task); + if (IS_ERR(bp)) { + thread->ptrace_bps[0] = NULL; + return PTR_ERR(bp); + } + +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ + task->thread.hw_brk = hw_brk; +#else /* CONFIG_PPC_ADV_DEBUG_REGS */ + /* As described above, it was assumed 3 bits were passed with the data + * address, but we will assume only the mode bits will be passed + * as to not cause alignment restrictions for DAC-based processors. 
+ */ + + /* DAC's hold the whole address without any mode flags */ + task->thread.debug.dac1 = data & ~0x3UL; + + if (task->thread.debug.dac1 == 0) { + dbcr_dac(task) &= ~(DBCR_DAC1R | DBCR_DAC1W); + if (!DBCR_ACTIVE_EVENTS(task->thread.debug.dbcr0, + task->thread.debug.dbcr1)) { + task->thread.regs->msr &= ~MSR_DE; + task->thread.debug.dbcr0 &= ~DBCR0_IDM; + } + return 0; + } + + /* Read or Write bits must be set */ + + if (!(data & 0x3UL)) + return -EINVAL; + + /* Set the Internal Debugging flag (IDM bit 1) for the DBCR0 + register */ + task->thread.debug.dbcr0 |= DBCR0_IDM; + + /* Check for write and read flags and set DBCR0 + accordingly */ + dbcr_dac(task) &= ~(DBCR_DAC1R|DBCR_DAC1W); + if (data & 0x1UL) + dbcr_dac(task) |= DBCR_DAC1R; + if (data & 0x2UL) + dbcr_dac(task) |= DBCR_DAC1W; + task->thread.regs->msr |= MSR_DE; +#endif /* CONFIG_PPC_ADV_DEBUG_REGS */ + return 0; } -#endif /* CONFIG_PPC32 */ /* * Called by kernel/ptrace.c when detaching.. @@ -245,64 +1064,471 @@ clear_single_step(struct task_struct *task) void ptrace_disable(struct task_struct *child) { /* make sure the single step bit is not set. */ - clear_single_step(child); + user_disable_single_step(child); } -long sys_ptrace(long request, long pid, long addr, long data) +#ifdef CONFIG_PPC_ADV_DEBUG_REGS +static long set_instruction_bp(struct task_struct *child, + struct ppc_hw_breakpoint *bp_info) { - struct task_struct *child; - int ret = -EPERM; - - lock_kernel(); - if (request == PTRACE_TRACEME) { - /* are we already being traced? */ - if (current->ptrace & PT_PTRACED) - goto out; - ret = security_ptrace(current->parent, current); - if (ret) - goto out; - /* set the ptrace bit in the process flags. 
*/ - current->ptrace |= PT_PTRACED; - ret = 0; - goto out; - } - ret = -ESRCH; - read_lock(&tasklist_lock); - child = find_task_by_pid(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); - if (!child) - goto out; - - ret = -EPERM; - if (pid == 1) /* you may not mess with init */ - goto out_tsk; - - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); - goto out_tsk; - } - - ret = ptrace_check_attach(child, request == PTRACE_KILL); - if (ret < 0) - goto out_tsk; + int slot; + int slot1_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC1) != 0); + int slot2_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC2) != 0); + int slot3_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC3) != 0); + int slot4_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC4) != 0); + + if (dbcr_iac_range(child) & DBCR_IAC12MODE) + slot2_in_use = 1; + if (dbcr_iac_range(child) & DBCR_IAC34MODE) + slot4_in_use = 1; + + if (bp_info->addr >= TASK_SIZE) + return -EIO; + + if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) { + + /* Make sure range is valid. */ + if (bp_info->addr2 >= TASK_SIZE) + return -EIO; + + /* We need a pair of IAC regsisters */ + if ((!slot1_in_use) && (!slot2_in_use)) { + slot = 1; + child->thread.debug.iac1 = bp_info->addr; + child->thread.debug.iac2 = bp_info->addr2; + child->thread.debug.dbcr0 |= DBCR0_IAC1; + if (bp_info->addr_mode == + PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE) + dbcr_iac_range(child) |= DBCR_IAC12X; + else + dbcr_iac_range(child) |= DBCR_IAC12I; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + } else if ((!slot3_in_use) && (!slot4_in_use)) { + slot = 3; + child->thread.debug.iac3 = bp_info->addr; + child->thread.debug.iac4 = bp_info->addr2; + child->thread.debug.dbcr0 |= DBCR0_IAC3; + if (bp_info->addr_mode == + PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE) + dbcr_iac_range(child) |= DBCR_IAC34X; + else + dbcr_iac_range(child) |= DBCR_IAC34I; +#endif + } else + return -ENOSPC; + } else { + /* We only need one. 
If possible leave a pair free in + * case a range is needed later + */ + if (!slot1_in_use) { + /* + * Don't use iac1 if iac1-iac2 are free and either + * iac3 or iac4 (but not both) are free + */ + if (slot2_in_use || (slot3_in_use == slot4_in_use)) { + slot = 1; + child->thread.debug.iac1 = bp_info->addr; + child->thread.debug.dbcr0 |= DBCR0_IAC1; + goto out; + } + } + if (!slot2_in_use) { + slot = 2; + child->thread.debug.iac2 = bp_info->addr; + child->thread.debug.dbcr0 |= DBCR0_IAC2; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + } else if (!slot3_in_use) { + slot = 3; + child->thread.debug.iac3 = bp_info->addr; + child->thread.debug.dbcr0 |= DBCR0_IAC3; + } else if (!slot4_in_use) { + slot = 4; + child->thread.debug.iac4 = bp_info->addr; + child->thread.debug.dbcr0 |= DBCR0_IAC4; +#endif + } else + return -ENOSPC; + } +out: + child->thread.debug.dbcr0 |= DBCR0_IDM; + child->thread.regs->msr |= MSR_DE; - switch (request) { - /* when I and D space are separate, these will need to be fixed. */ - case PTRACE_PEEKTEXT: /* read word at location addr. 
*/ - case PTRACE_PEEKDATA: { - unsigned long tmp; - int copied; + return slot; +} - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); - ret = -EIO; - if (copied != sizeof(tmp)) - break; - ret = put_user(tmp,(unsigned long __user *) data); +static int del_instruction_bp(struct task_struct *child, int slot) +{ + switch (slot) { + case 1: + if ((child->thread.debug.dbcr0 & DBCR0_IAC1) == 0) + return -ENOENT; + + if (dbcr_iac_range(child) & DBCR_IAC12MODE) { + /* address range - clear slots 1 & 2 */ + child->thread.debug.iac2 = 0; + dbcr_iac_range(child) &= ~DBCR_IAC12MODE; + } + child->thread.debug.iac1 = 0; + child->thread.debug.dbcr0 &= ~DBCR0_IAC1; + break; + case 2: + if ((child->thread.debug.dbcr0 & DBCR0_IAC2) == 0) + return -ENOENT; + + if (dbcr_iac_range(child) & DBCR_IAC12MODE) + /* used in a range */ + return -EINVAL; + child->thread.debug.iac2 = 0; + child->thread.debug.dbcr0 &= ~DBCR0_IAC2; + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case 3: + if ((child->thread.debug.dbcr0 & DBCR0_IAC3) == 0) + return -ENOENT; + + if (dbcr_iac_range(child) & DBCR_IAC34MODE) { + /* address range - clear slots 3 & 4 */ + child->thread.debug.iac4 = 0; + dbcr_iac_range(child) &= ~DBCR_IAC34MODE; + } + child->thread.debug.iac3 = 0; + child->thread.debug.dbcr0 &= ~DBCR0_IAC3; + break; + case 4: + if ((child->thread.debug.dbcr0 & DBCR0_IAC4) == 0) + return -ENOENT; + + if (dbcr_iac_range(child) & DBCR_IAC34MODE) + /* Used in a range */ + return -EINVAL; + child->thread.debug.iac4 = 0; + child->thread.debug.dbcr0 &= ~DBCR0_IAC4; break; +#endif + default: + return -EINVAL; + } + return 0; +} + +static int set_dac(struct task_struct *child, struct ppc_hw_breakpoint *bp_info) +{ + int byte_enable = + (bp_info->condition_mode >> PPC_BREAKPOINT_CONDITION_BE_SHIFT) + & 0xf; + int condition_mode = + bp_info->condition_mode & PPC_BREAKPOINT_CONDITION_MODE; + int slot; + + if (byte_enable && (condition_mode == 0)) + return -EINVAL; + + if (bp_info->addr >= TASK_SIZE) + 
return -EIO; + + if ((dbcr_dac(child) & (DBCR_DAC1R | DBCR_DAC1W)) == 0) { + slot = 1; + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) + dbcr_dac(child) |= DBCR_DAC1R; + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) + dbcr_dac(child) |= DBCR_DAC1W; + child->thread.debug.dac1 = (unsigned long)bp_info->addr; +#if CONFIG_PPC_ADV_DEBUG_DVCS > 0 + if (byte_enable) { + child->thread.debug.dvc1 = + (unsigned long)bp_info->condition_value; + child->thread.debug.dbcr2 |= + ((byte_enable << DBCR2_DVC1BE_SHIFT) | + (condition_mode << DBCR2_DVC1M_SHIFT)); + } +#endif +#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE + } else if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) { + /* Both dac1 and dac2 are part of a range */ + return -ENOSPC; +#endif + } else if ((dbcr_dac(child) & (DBCR_DAC2R | DBCR_DAC2W)) == 0) { + slot = 2; + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) + dbcr_dac(child) |= DBCR_DAC2R; + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) + dbcr_dac(child) |= DBCR_DAC2W; + child->thread.debug.dac2 = (unsigned long)bp_info->addr; +#if CONFIG_PPC_ADV_DEBUG_DVCS > 0 + if (byte_enable) { + child->thread.debug.dvc2 = + (unsigned long)bp_info->condition_value; + child->thread.debug.dbcr2 |= + ((byte_enable << DBCR2_DVC2BE_SHIFT) | + (condition_mode << DBCR2_DVC2M_SHIFT)); + } +#endif + } else + return -ENOSPC; + child->thread.debug.dbcr0 |= DBCR0_IDM; + child->thread.regs->msr |= MSR_DE; + + return slot + 4; +} + +static int del_dac(struct task_struct *child, int slot) +{ + if (slot == 1) { + if ((dbcr_dac(child) & (DBCR_DAC1R | DBCR_DAC1W)) == 0) + return -ENOENT; + + child->thread.debug.dac1 = 0; + dbcr_dac(child) &= ~(DBCR_DAC1R | DBCR_DAC1W); +#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE + if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) { + child->thread.debug.dac2 = 0; + child->thread.debug.dbcr2 &= ~DBCR2_DAC12MODE; + } + child->thread.debug.dbcr2 &= ~(DBCR2_DVC1M | DBCR2_DVC1BE); +#endif +#if CONFIG_PPC_ADV_DEBUG_DVCS > 0 + 
child->thread.debug.dvc1 = 0; +#endif + } else if (slot == 2) { + if ((dbcr_dac(child) & (DBCR_DAC2R | DBCR_DAC2W)) == 0) + return -ENOENT; + +#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE + if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) + /* Part of a range */ + return -EINVAL; + child->thread.debug.dbcr2 &= ~(DBCR2_DVC2M | DBCR2_DVC2BE); +#endif +#if CONFIG_PPC_ADV_DEBUG_DVCS > 0 + child->thread.debug.dvc2 = 0; +#endif + child->thread.debug.dac2 = 0; + dbcr_dac(child) &= ~(DBCR_DAC2R | DBCR_DAC2W); + } else + return -EINVAL; + + return 0; +} +#endif /* CONFIG_PPC_ADV_DEBUG_REGS */ + +#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE +static int set_dac_range(struct task_struct *child, + struct ppc_hw_breakpoint *bp_info) +{ + int mode = bp_info->addr_mode & PPC_BREAKPOINT_MODE_MASK; + + /* We don't allow range watchpoints to be used with DVC */ + if (bp_info->condition_mode) + return -EINVAL; + + /* + * Best effort to verify the address range. The user/supervisor bits + * prevent trapping in kernel space, but let's fail on an obvious bad + * range. The simple test on the mask is not fool-proof, and any + * exclusive range will spill over into kernel space. + */ + if (bp_info->addr >= TASK_SIZE) + return -EIO; + if (mode == PPC_BREAKPOINT_MODE_MASK) { + /* + * dac2 is a bitmask. 
Don't allow a mask that makes a + * kernel space address from a valid dac1 value + */ + if (~((unsigned long)bp_info->addr2) >= TASK_SIZE) + return -EIO; + } else { + /* + * For range breakpoints, addr2 must also be a valid address + */ + if (bp_info->addr2 >= TASK_SIZE) + return -EIO; + } + + if (child->thread.debug.dbcr0 & + (DBCR0_DAC1R | DBCR0_DAC1W | DBCR0_DAC2R | DBCR0_DAC2W)) + return -ENOSPC; + + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) + child->thread.debug.dbcr0 |= (DBCR0_DAC1R | DBCR0_IDM); + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) + child->thread.debug.dbcr0 |= (DBCR0_DAC1W | DBCR0_IDM); + child->thread.debug.dac1 = bp_info->addr; + child->thread.debug.dac2 = bp_info->addr2; + if (mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) + child->thread.debug.dbcr2 |= DBCR2_DAC12M; + else if (mode == PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE) + child->thread.debug.dbcr2 |= DBCR2_DAC12MX; + else /* PPC_BREAKPOINT_MODE_MASK */ + child->thread.debug.dbcr2 |= DBCR2_DAC12MM; + child->thread.regs->msr |= MSR_DE; + + return 5; +} +#endif /* CONFIG_PPC_ADV_DEBUG_DAC_RANGE */ + +static long ppc_set_hwdebug(struct task_struct *child, + struct ppc_hw_breakpoint *bp_info) +{ +#ifdef CONFIG_HAVE_HW_BREAKPOINT + int len = 0; + struct thread_struct *thread = &(child->thread); + struct perf_event *bp; + struct perf_event_attr attr; +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ +#ifndef CONFIG_PPC_ADV_DEBUG_REGS + struct arch_hw_breakpoint brk; +#endif + + if (bp_info->version != 1) + return -ENOTSUPP; +#ifdef CONFIG_PPC_ADV_DEBUG_REGS + /* + * Check for invalid flags and combinations + */ + if ((bp_info->trigger_type == 0) || + (bp_info->trigger_type & ~(PPC_BREAKPOINT_TRIGGER_EXECUTE | + PPC_BREAKPOINT_TRIGGER_RW)) || + (bp_info->addr_mode & ~PPC_BREAKPOINT_MODE_MASK) || + (bp_info->condition_mode & + ~(PPC_BREAKPOINT_CONDITION_MODE | + PPC_BREAKPOINT_CONDITION_BE_ALL))) + return -EINVAL; +#if CONFIG_PPC_ADV_DEBUG_DVCS == 0 + if (bp_info->condition_mode != 
PPC_BREAKPOINT_CONDITION_NONE) + return -EINVAL; +#endif + + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_EXECUTE) { + if ((bp_info->trigger_type != PPC_BREAKPOINT_TRIGGER_EXECUTE) || + (bp_info->condition_mode != PPC_BREAKPOINT_CONDITION_NONE)) + return -EINVAL; + return set_instruction_bp(child, bp_info); + } + if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT) + return set_dac(child, bp_info); + +#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE + return set_dac_range(child, bp_info); +#else + return -EINVAL; +#endif +#else /* !CONFIG_PPC_ADV_DEBUG_DVCS */ + /* + * We only support one data breakpoint + */ + if ((bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_RW) == 0 || + (bp_info->trigger_type & ~PPC_BREAKPOINT_TRIGGER_RW) != 0 || + bp_info->condition_mode != PPC_BREAKPOINT_CONDITION_NONE) + return -EINVAL; + + if ((unsigned long)bp_info->addr >= TASK_SIZE) + return -EIO; + + brk.address = bp_info->addr & ~7UL; + brk.type = HW_BRK_TYPE_TRANSLATE; + brk.len = 8; + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) + brk.type |= HW_BRK_TYPE_READ; + if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) + brk.type |= HW_BRK_TYPE_WRITE; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + /* + * Check if the request is for 'range' breakpoints. We can + * support it if range < 8 bytes. 
+ */ + if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) + len = bp_info->addr2 - bp_info->addr; + else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT) + len = 1; + else + return -EINVAL; + bp = thread->ptrace_bps[0]; + if (bp) + return -ENOSPC; + + /* Create a new breakpoint request if one doesn't exist already */ + hw_breakpoint_init(&attr); + attr.bp_addr = (unsigned long)bp_info->addr & ~HW_BREAKPOINT_ALIGN; + attr.bp_len = len; + arch_bp_generic_fields(brk.type, &attr.bp_type); + + thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr, + ptrace_triggered, NULL, child); + if (IS_ERR(bp)) { + thread->ptrace_bps[0] = NULL; + return PTR_ERR(bp); + } + + return 1; +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ + + if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) + return -EINVAL; + + if (child->thread.hw_brk.address) + return -ENOSPC; + + child->thread.hw_brk = brk; + + return 1; +#endif /* !CONFIG_PPC_ADV_DEBUG_DVCS */ +} + +static long ppc_del_hwdebug(struct task_struct *child, long data) +{ +#ifdef CONFIG_HAVE_HW_BREAKPOINT + int ret = 0; + struct thread_struct *thread = &(child->thread); + struct perf_event *bp; +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ +#ifdef CONFIG_PPC_ADV_DEBUG_REGS + int rc; + + if (data <= 4) + rc = del_instruction_bp(child, (int)data); + else + rc = del_dac(child, (int)data - 4); + + if (!rc) { + if (!DBCR_ACTIVE_EVENTS(child->thread.debug.dbcr0, + child->thread.debug.dbcr1)) { + child->thread.debug.dbcr0 &= ~DBCR0_IDM; + child->thread.regs->msr &= ~MSR_DE; + } } + return rc; +#else + if (data != 1) + return -EINVAL; + +#ifdef CONFIG_HAVE_HW_BREAKPOINT + bp = thread->ptrace_bps[0]; + if (bp) { + unregister_hw_breakpoint(bp); + thread->ptrace_bps[0] = NULL; + } else + ret = -ENOENT; + return ret; +#else /* CONFIG_HAVE_HW_BREAKPOINT */ + if (child->thread.hw_brk.address == 0) + return -ENOENT; + child->thread.hw_brk.address = 0; + child->thread.hw_brk.type = 0; +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ + + return 0; 
+#endif +} + +long arch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int ret = -EPERM; + void __user *datavp = (void __user *) data; + unsigned long __user *datalp = datavp; + + switch (request) { /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { unsigned long index, tmp; @@ -310,38 +1536,34 @@ long sys_ptrace(long request, long pid, long addr, long data) ret = -EIO; /* convert to index and check */ #ifdef CONFIG_PPC32 - index = (unsigned long) addr >> 2; + index = addr >> 2; if ((addr & 3) || (index > PT_FPSCR) || (child->thread.regs == NULL)) #else - index = (unsigned long) addr >> 3; + index = addr >> 3; if ((addr & 7) || (index > PT_FPSCR)) #endif break; -#ifdef CONFIG_PPC32 CHECK_FULL_REGS(child->thread.regs); -#endif if (index < PT_FPR0) { - tmp = get_reg(child, (int) index); + ret = ptrace_get_reg(child, (int) index, &tmp); + if (ret) + break; } else { + unsigned int fpidx = index - PT_FPR0; + flush_fp_to_thread(child); - tmp = ((unsigned long *)child->thread.fpr)[index - PT_FPR0]; + if (fpidx < (PT_FPSCR - PT_FPR0)) + memcpy(&tmp, &child->thread.TS_FPR(fpidx), + sizeof(long)); + else + tmp = child->thread.fp_state.fpscr; } - ret = put_user(tmp,(unsigned long __user *) data); + ret = put_user(tmp, datalp); break; } - /* If I and D space are separate, this will have to be fixed. */ - case PTRACE_POKETEXT: /* write the word at location addr. 
*/ - case PTRACE_POKEDATA: - ret = 0; - if (access_process_vm(child, addr, &data, sizeof(data), 1) - == sizeof(data)) - break; - ret = -EIO; - break; - /* write the word at location addr in the USER area */ case PTRACE_POKEUSR: { unsigned long index; @@ -349,265 +1571,251 @@ long sys_ptrace(long request, long pid, long addr, long data) ret = -EIO; /* convert to index and check */ #ifdef CONFIG_PPC32 - index = (unsigned long) addr >> 2; + index = addr >> 2; if ((addr & 3) || (index > PT_FPSCR) || (child->thread.regs == NULL)) #else - index = (unsigned long) addr >> 3; + index = addr >> 3; if ((addr & 7) || (index > PT_FPSCR)) #endif break; -#ifdef CONFIG_PPC32 CHECK_FULL_REGS(child->thread.regs); -#endif - if (index == PT_ORIG_R3) - break; if (index < PT_FPR0) { - ret = put_reg(child, index, data); + ret = ptrace_put_reg(child, index, data); } else { + unsigned int fpidx = index - PT_FPR0; + flush_fp_to_thread(child); - ((unsigned long *)child->thread.fpr)[index - PT_FPR0] = data; + if (fpidx < (PT_FPSCR - PT_FPR0)) + memcpy(&child->thread.TS_FPR(fpidx), &data, + sizeof(long)); + else + child->thread.fp_state.fpscr = data; ret = 0; } break; } - case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: { /* restart after signal. */ - ret = -EIO; - if (!valid_signal(data)) - break; - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - else - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - child->exit_code = data; - /* make sure the single step bit is not set. 
*/ - clear_single_step(child); - wake_up_process(child); - ret = 0; + case PPC_PTRACE_GETHWDBGINFO: { + struct ppc_debug_info dbginfo; + + dbginfo.version = 1; +#ifdef CONFIG_PPC_ADV_DEBUG_REGS + dbginfo.num_instruction_bps = CONFIG_PPC_ADV_DEBUG_IACS; + dbginfo.num_data_bps = CONFIG_PPC_ADV_DEBUG_DACS; + dbginfo.num_condition_regs = CONFIG_PPC_ADV_DEBUG_DVCS; + dbginfo.data_bp_alignment = 4; + dbginfo.sizeof_condition = 4; + dbginfo.features = PPC_DEBUG_FEATURE_INSN_BP_RANGE | + PPC_DEBUG_FEATURE_INSN_BP_MASK; +#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE + dbginfo.features |= + PPC_DEBUG_FEATURE_DATA_BP_RANGE | + PPC_DEBUG_FEATURE_DATA_BP_MASK; +#endif +#else /* !CONFIG_PPC_ADV_DEBUG_REGS */ + dbginfo.num_instruction_bps = 0; + dbginfo.num_data_bps = 1; + dbginfo.num_condition_regs = 0; +#ifdef CONFIG_PPC64 + dbginfo.data_bp_alignment = 8; +#else + dbginfo.data_bp_alignment = 4; +#endif + dbginfo.sizeof_condition = 0; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + dbginfo.features = PPC_DEBUG_FEATURE_DATA_BP_RANGE; + if (cpu_has_feature(CPU_FTR_DAWR)) + dbginfo.features |= PPC_DEBUG_FEATURE_DATA_BP_DAWR; +#else + dbginfo.features = 0; +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ +#endif /* CONFIG_PPC_ADV_DEBUG_REGS */ + + if (!access_ok(VERIFY_WRITE, datavp, + sizeof(struct ppc_debug_info))) + return -EFAULT; + ret = __copy_to_user(datavp, &dbginfo, + sizeof(struct ppc_debug_info)) ? + -EFAULT : 0; break; } -/* - * make the child exit. Best I can do is send it a sigkill. - * perhaps it should be put in the status that it wants to - * exit. - */ - case PTRACE_KILL: { - ret = 0; - if (child->exit_state == EXIT_ZOMBIE) /* already dead */ - break; - child->exit_code = SIGKILL; - /* make sure the single step bit is not set. 
*/ - clear_single_step(child); - wake_up_process(child); + case PPC_PTRACE_SETHWDEBUG: { + struct ppc_hw_breakpoint bp_info; + + if (!access_ok(VERIFY_READ, datavp, + sizeof(struct ppc_hw_breakpoint))) + return -EFAULT; + ret = __copy_from_user(&bp_info, datavp, + sizeof(struct ppc_hw_breakpoint)) ? + -EFAULT : 0; + if (!ret) + ret = ppc_set_hwdebug(child, &bp_info); break; } - case PTRACE_SINGLESTEP: { /* set the trap flag. */ - ret = -EIO; - if (!valid_signal(data)) - break; - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - set_single_step(child); - child->exit_code = data; - /* give it a chance to run. */ - wake_up_process(child); - ret = 0; + case PPC_PTRACE_DELHWDEBUG: { + ret = ppc_del_hwdebug(child, data); break; } -#ifdef CONFIG_PPC64 case PTRACE_GET_DEBUGREG: { +#ifndef CONFIG_PPC_ADV_DEBUG_REGS + unsigned long dabr_fake; +#endif ret = -EINVAL; /* We only support one DABR and no IABRS at the moment */ if (addr > 0) break; - ret = put_user(child->thread.dabr, - (unsigned long __user *)data); +#ifdef CONFIG_PPC_ADV_DEBUG_REGS + ret = put_user(child->thread.debug.dac1, datalp); +#else + dabr_fake = ((child->thread.hw_brk.address & (~HW_BRK_TYPE_DABR)) | + (child->thread.hw_brk.type & HW_BRK_TYPE_DABR)); + ret = put_user(dabr_fake, datalp); +#endif break; } case PTRACE_SET_DEBUGREG: ret = ptrace_set_debugreg(child, addr, data); break; -#endif - - case PTRACE_DETACH: - ret = ptrace_detach(child, data); - break; #ifdef CONFIG_PPC64 - case PPC_PTRACE_GETREGS: { /* Get GPRs 0 - 31. */ - int i; - unsigned long *reg = &((unsigned long *)child->thread.regs)[0]; - unsigned long __user *tmp = (unsigned long __user *)addr; - - for (i = 0; i < 32; i++) { - ret = put_user(*reg, tmp); - if (ret) - break; - reg++; - tmp++; - } - break; - } - - case PPC_PTRACE_SETREGS: { /* Set GPRs 0 - 31. 
*/ - int i; - unsigned long *reg = &((unsigned long *)child->thread.regs)[0]; - unsigned long __user *tmp = (unsigned long __user *)addr; - - for (i = 0; i < 32; i++) { - ret = get_user(*reg, tmp); - if (ret) - break; - reg++; - tmp++; - } - break; - } - - case PPC_PTRACE_GETFPREGS: { /* Get FPRs 0 - 31. */ - int i; - unsigned long *reg = &((unsigned long *)child->thread.fpr)[0]; - unsigned long __user *tmp = (unsigned long __user *)addr; - - flush_fp_to_thread(child); - - for (i = 0; i < 32; i++) { - ret = put_user(*reg, tmp); - if (ret) - break; - reg++; - tmp++; - } - break; - } - - case PPC_PTRACE_SETFPREGS: { /* Get FPRs 0 - 31. */ - int i; - unsigned long *reg = &((unsigned long *)child->thread.fpr)[0]; - unsigned long __user *tmp = (unsigned long __user *)addr; - - flush_fp_to_thread(child); + case PTRACE_GETREGS64: +#endif + case PTRACE_GETREGS: /* Get all pt_regs from the child. */ + return copy_regset_to_user(child, &user_ppc_native_view, + REGSET_GPR, + 0, sizeof(struct pt_regs), + datavp); - for (i = 0; i < 32; i++) { - ret = get_user(*reg, tmp); - if (ret) - break; - reg++; - tmp++; - } - break; - } -#endif /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC64 + case PTRACE_SETREGS64: +#endif + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, &user_ppc_native_view, + REGSET_GPR, + 0, sizeof(struct pt_regs), + datavp); + + case PTRACE_GETFPREGS: /* Get the child FPU state (FPR0...31 + FPSCR) */ + return copy_regset_to_user(child, &user_ppc_native_view, + REGSET_FPR, + 0, sizeof(elf_fpregset_t), + datavp); + + case PTRACE_SETFPREGS: /* Set the child FPU state (FPR0...31 + FPSCR) */ + return copy_regset_from_user(child, &user_ppc_native_view, + REGSET_FPR, + 0, sizeof(elf_fpregset_t), + datavp); #ifdef CONFIG_ALTIVEC case PTRACE_GETVRREGS: - /* Get the child altivec register state. 
*/ - flush_altivec_to_thread(child); - ret = get_vrregs((unsigned long __user *)data, child); - break; + return copy_regset_to_user(child, &user_ppc_native_view, + REGSET_VMX, + 0, (33 * sizeof(vector128) + + sizeof(u32)), + datavp); case PTRACE_SETVRREGS: - /* Set the child altivec register state. */ - flush_altivec_to_thread(child); - ret = set_vrregs(child, (unsigned long __user *)data); - break; + return copy_regset_from_user(child, &user_ppc_native_view, + REGSET_VMX, + 0, (33 * sizeof(vector128) + + sizeof(u32)), + datavp); +#endif +#ifdef CONFIG_VSX + case PTRACE_GETVSRREGS: + return copy_regset_to_user(child, &user_ppc_native_view, + REGSET_VSX, + 0, 32 * sizeof(double), + datavp); + + case PTRACE_SETVSRREGS: + return copy_regset_from_user(child, &user_ppc_native_view, + REGSET_VSX, + 0, 32 * sizeof(double), + datavp); #endif #ifdef CONFIG_SPE case PTRACE_GETEVRREGS: /* Get the child spe register state. */ - if (child->thread.regs->msr & MSR_SPE) - giveup_spe(child); - ret = get_evrregs((unsigned long __user *)data, child); - break; + return copy_regset_to_user(child, &user_ppc_native_view, + REGSET_SPE, 0, 35 * sizeof(u32), + datavp); case PTRACE_SETEVRREGS: /* Set the child spe register state. */ - /* this is to clear the MSR_SPE bit to force a reload - * of register state from memory */ - if (child->thread.regs->msr & MSR_SPE) - giveup_spe(child); - ret = set_evrregs(child, (unsigned long __user *)data); - break; + return copy_regset_from_user(child, &user_ppc_native_view, + REGSET_SPE, 0, 35 * sizeof(u32), + datavp); #endif default: ret = ptrace_request(child, request, addr, data); break; } -out_tsk: - put_task_struct(child); -out: - unlock_kernel(); return ret; } -static void do_syscall_trace(void) +/* + * We must return the syscall number to actually look up in the table. + * This can be -1L to skip running any syscall at all. 
+ */ +long do_syscall_trace_enter(struct pt_regs *regs) { - /* the 0x80 provides a way for the tracing parent to distinguish - between a syscall stop and SIGTRAP delivery */ - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) - ? 0x80 : 0)); + long ret = 0; - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } -} + user_exit(); -void do_syscall_trace_enter(struct pt_regs *regs) -{ -#ifdef CONFIG_PPC64 - secure_computing(regs->gpr[0]); -#endif + secure_computing_strict(regs->gpr[0]); - if (test_thread_flag(TIF_SYSCALL_TRACE) - && (current->ptrace & PT_PTRACED)) - do_syscall_trace(); + if (test_thread_flag(TIF_SYSCALL_TRACE) && + tracehook_report_syscall_entry(regs)) + /* + * Tracing decided this syscall should not happen. + * We'll return a bogus call number to get an ENOSYS + * error, but leave the original number in regs->gpr[0]. 
+ */ + ret = -1L; - if (unlikely(current->audit_context)) - audit_syscall_entry(current, -#ifdef CONFIG_PPC32 - AUDIT_ARCH_PPC, -#else - test_thread_flag(TIF_32BIT)?AUDIT_ARCH_PPC:AUDIT_ARCH_PPC64, -#endif + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->gpr[0]); + +#ifdef CONFIG_PPC64 + if (!is_32bit_task()) + audit_syscall_entry(AUDIT_ARCH_PPC64, regs->gpr[0], regs->gpr[3], regs->gpr[4], regs->gpr[5], regs->gpr[6]); + else +#endif + audit_syscall_entry(AUDIT_ARCH_PPC, + regs->gpr[0], + regs->gpr[3] & 0xffffffff, + regs->gpr[4] & 0xffffffff, + regs->gpr[5] & 0xffffffff, + regs->gpr[6] & 0xffffffff); + + return ret ?: regs->gpr[0]; } void do_syscall_trace_leave(struct pt_regs *regs) { -#ifdef CONFIG_PPC32 - secure_computing(regs->gpr[0]); -#endif + int step; - if (unlikely(current->audit_context)) - audit_syscall_exit(current, - (regs->ccr&0x1000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, - regs->result); + audit_syscall_exit(regs); - if ((test_thread_flag(TIF_SYSCALL_TRACE) -#ifdef CONFIG_PPC64 - || test_thread_flag(TIF_SINGLESTEP) -#endif - ) - && (current->ptrace & PT_PTRACED)) - do_syscall_trace(); -} + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_exit(regs, regs->result); -#ifdef CONFIG_PPC32 -EXPORT_SYMBOL(do_syscall_trace_enter); -EXPORT_SYMBOL(do_syscall_trace_leave); -#endif + step = test_thread_flag(TIF_SINGLESTEP); + if (step || test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, step); + + user_enter(); +} diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c index 91eb952e029..f52b7db327c 100644 --- a/arch/powerpc/kernel/ptrace32.c +++ b/arch/powerpc/kernel/ptrace32.c @@ -17,85 +17,41 @@ * this archive for more details. 
*/ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/errno.h> #include <linux/ptrace.h> +#include <linux/regset.h> #include <linux/user.h> #include <linux/security.h> #include <linux/signal.h> +#include <linux/compat.h> #include <asm/uaccess.h> #include <asm/page.h> #include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/ptrace-common.h> +#include <asm/switch_to.h> /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. */ -long compat_sys_ptrace(int request, int pid, unsigned long addr, - unsigned long data) -{ - struct task_struct *child; - int ret = -EPERM; - - lock_kernel(); - if (request == PTRACE_TRACEME) { - /* are we already being traced? */ - if (current->ptrace & PT_PTRACED) - goto out; - ret = security_ptrace(current->parent, current); - if (ret) - goto out; - /* set the ptrace bit in the process flags. */ - current->ptrace |= PT_PTRACED; - ret = 0; - goto out; - } - ret = -ESRCH; - read_lock(&tasklist_lock); - child = find_task_by_pid(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); - if (!child) - goto out; - - ret = -EPERM; - if (pid == 1) /* you may not mess with init */ - goto out_tsk; - - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); - goto out_tsk; - } +/* Macros to work out the correct index for the FPR in the thread struct; each expansion is fully parenthesized so it stays correct inside a larger expression */ +#define FPRNUMBER(i) (((i) - PT_FPR0) >> 1) +#define FPRHALF(i) (((i) - PT_FPR0) & 1) +#define FPRINDEX(i) (TS_FPRWIDTH * FPRNUMBER(i) * 2 + FPRHALF(i)) - ret = ptrace_check_attach(child, request == PTRACE_KILL); - if (ret < 0) - goto out_tsk; +long compat_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t caddr, compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + int ret; switch (request) { - /* when I and D space are separate, these will need to be fixed.
*/ - case PTRACE_PEEKTEXT: /* read word at location addr. */ - case PTRACE_PEEKDATA: { - unsigned int tmp; - int copied; - - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); - ret = -EIO; - if (copied != sizeof(tmp)) - break; - ret = put_user(tmp, (u32 __user *)data); - break; - } - /* * Read 4 bytes of the other process' storage * data is a pointer specifying where the user wants the @@ -136,8 +92,11 @@ long compat_sys_ptrace(int request, int pid, unsigned long addr, if ((addr & 3) || (index > PT_FPSCR32)) break; + CHECK_FULL_REGS(child->thread.regs); if (index < PT_FPR0) { - tmp = get_reg(child, index); + ret = ptrace_get_reg(child, index, &tmp); + if (ret) + break; } else { flush_fp_to_thread(child); /* @@ -145,7 +104,8 @@ long compat_sys_ptrace(int request, int pid, unsigned long addr, * to be an array of unsigned int (32 bits) - the * index passed in is based on this assumption. */ - tmp = ((unsigned int *)child->thread.fpr)[index - PT_FPR0]; + tmp = ((unsigned int *)child->thread.fp_state.fpr) + [FPRINDEX(index)]; } ret = put_user((unsigned int)tmp, (u32 __user *)data); break; @@ -176,34 +136,29 @@ long compat_sys_ptrace(int request, int pid, unsigned long addr, else part = 0; /* want the 1st half of the register (left-most). 
*/ - /* Validate the input - check to see if address is on the wrong boundary or beyond the end of the user area */ + /* Validate the input - check to see if address is on the wrong boundary + * or beyond the end of the user area + */ if ((addr & 3) || numReg > PT_FPSCR) break; + CHECK_FULL_REGS(child->thread.regs); if (numReg >= PT_FPR0) { flush_fp_to_thread(child); - tmp = ((unsigned long int *)child->thread.fpr)[numReg - PT_FPR0]; + /* get 64 bit FPR */ + tmp = child->thread.fp_state.fpr[numReg - PT_FPR0][0]; } else { /* register within PT_REGS struct */ - tmp = get_reg(child, numReg); + unsigned long tmp2; + ret = ptrace_get_reg(child, numReg, &tmp2); + if (ret) + break; + tmp = tmp2; } reg32bits = ((u32*)&tmp)[part]; ret = put_user(reg32bits, (u32 __user *)data); break; } - /* If I and D space are separate, this will have to be fixed. */ - case PTRACE_POKETEXT: /* write the word at location addr. */ - case PTRACE_POKEDATA: { - unsigned int tmp; - tmp = data; - ret = 0; - if (access_process_vm(child, addr, &tmp, sizeof(tmp), 1) - == sizeof(tmp)) - break; - ret = -EIO; - break; - } - /* * Write 4 bytes into the other process' storage * data is the 4 bytes that the user wants written @@ -240,10 +195,9 @@ long compat_sys_ptrace(int request, int pid, unsigned long addr, if ((addr & 3) || (index > PT_FPSCR32)) break; - if (index == PT_ORIG_R3) - break; + CHECK_FULL_REGS(child->thread.regs); if (index < PT_FPR0) { - ret = put_reg(child, index, data); + ret = ptrace_put_reg(child, index, data); } else { flush_fp_to_thread(child); /* @@ -251,7 +205,8 @@ long compat_sys_ptrace(int request, int pid, unsigned long addr, * to be an array of unsigned int (32 bits) - the * index passed in is based on this assumption. 
*/ - ((unsigned int *)child->thread.fpr)[index - PT_FPR0] = data; + ((unsigned int *)child->thread.fp_state.fpr) + [FPRINDEX(index)] = data; ret = 0; } break; @@ -272,179 +227,91 @@ long compat_sys_ptrace(int request, int pid, unsigned long addr, /* Determine which register the user wants */ index = (u64)addr >> 2; numReg = index / 2; + /* * Validate the input - check to see if address is on the * wrong boundary or beyond the end of the user area */ if ((addr & 3) || (numReg > PT_FPSCR)) break; - /* Insure it is a register we let them change */ - if ((numReg == PT_ORIG_R3) - || ((numReg > PT_CCR) && (numReg < PT_FPR0))) - break; - if (numReg >= PT_FPR0) { + CHECK_FULL_REGS(child->thread.regs); + if (numReg < PT_FPR0) { + unsigned long freg; + ret = ptrace_get_reg(child, numReg, &freg); + if (ret) + break; + if (index % 2) + freg = (freg & ~0xfffffffful) | (data & 0xfffffffful); + else + freg = (freg & 0xfffffffful) | (data << 32); + ret = ptrace_put_reg(child, numReg, freg); + } else { + u64 *tmp; flush_fp_to_thread(child); + /* get 64 bit FPR ... */ + tmp = &child->thread.fp_state.fpr[numReg - PT_FPR0][0]; + /* ... write the 32 bit part we want */ + ((u32 *)tmp)[index % 2] = data; + ret = 0; } - if (numReg == PT_MSR) - data = (data & MSR_DEBUGCHANGE) - | (child->thread.regs->msr & ~MSR_DEBUGCHANGE); - ((u32*)child->thread.regs)[index] = data; - ret = 0; - break; - } - - case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: { /* restart after signal. */ - ret = -EIO; - if (!valid_signal(data)) - break; - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - else - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - child->exit_code = data; - /* make sure the single step bit is not set. */ - clear_single_step(child); - wake_up_process(child); - ret = 0; - break; - } - - /* - * make the child exit. Best I can do is send it a sigkill. 
- * perhaps it should be put in the status that it wants to - * exit. - */ - case PTRACE_KILL: { - ret = 0; - if (child->exit_state == EXIT_ZOMBIE) /* already dead */ - break; - child->exit_code = SIGKILL; - /* make sure the single step bit is not set. */ - clear_single_step(child); - wake_up_process(child); - break; - } - - case PTRACE_SINGLESTEP: { /* set the trap flag. */ - ret = -EIO; - if (!valid_signal(data)) - break; - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - set_single_step(child); - child->exit_code = data; - /* give it a chance to run. */ - wake_up_process(child); - ret = 0; break; } case PTRACE_GET_DEBUGREG: { +#ifndef CONFIG_PPC_ADV_DEBUG_REGS + unsigned long dabr_fake; +#endif ret = -EINVAL; /* We only support one DABR and no IABRS at the moment */ if (addr > 0) break; - ret = put_user(child->thread.dabr, (u32 __user *)data); - break; - } - - case PTRACE_SET_DEBUGREG: - ret = ptrace_set_debugreg(child, addr, data); - break; - - case PTRACE_DETACH: - ret = ptrace_detach(child, data); - break; - - case PPC_PTRACE_GETREGS: { /* Get GPRs 0 - 31. */ - int i; - unsigned long *reg = &((unsigned long *)child->thread.regs)[0]; - unsigned int __user *tmp = (unsigned int __user *)addr; - - for (i = 0; i < 32; i++) { - ret = put_user(*reg, tmp); - if (ret) - break; - reg++; - tmp++; - } - break; - } - - case PPC_PTRACE_SETREGS: { /* Set GPRs 0 - 31. */ - int i; - unsigned long *reg = &((unsigned long *)child->thread.regs)[0]; - unsigned int __user *tmp = (unsigned int __user *)addr; - - for (i = 0; i < 32; i++) { - ret = get_user(*reg, tmp); - if (ret) - break; - reg++; - tmp++; - } - break; - } - - case PPC_PTRACE_GETFPREGS: { /* Get FPRs 0 - 31. 
*/ - int i; - unsigned long *reg = &((unsigned long *)child->thread.fpr)[0]; - unsigned int __user *tmp = (unsigned int __user *)addr; - - flush_fp_to_thread(child); - - for (i = 0; i < 32; i++) { - ret = put_user(*reg, tmp); - if (ret) - break; - reg++; - tmp++; - } +#ifdef CONFIG_PPC_ADV_DEBUG_REGS + ret = put_user(child->thread.debug.dac1, (u32 __user *)data); +#else + dabr_fake = ( + (child->thread.hw_brk.address & (~HW_BRK_TYPE_DABR)) | + (child->thread.hw_brk.type & HW_BRK_TYPE_DABR)); + ret = put_user(dabr_fake, (u32 __user *)data); +#endif break; } - case PPC_PTRACE_SETFPREGS: { /* Get FPRs 0 - 31. */ - int i; - unsigned long *reg = &((unsigned long *)child->thread.fpr)[0]; - unsigned int __user *tmp = (unsigned int __user *)addr; + case PTRACE_GETREGS: /* Get all pt_regs from the child. */ + return copy_regset_to_user( + child, task_user_regset_view(current), 0, + 0, PT_REGS_COUNT * sizeof(compat_long_t), + compat_ptr(data)); - flush_fp_to_thread(child); + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user( + child, task_user_regset_view(current), 0, + 0, PT_REGS_COUNT * sizeof(compat_long_t), + compat_ptr(data)); - for (i = 0; i < 32; i++) { - ret = get_user(*reg, tmp); - if (ret) - break; - reg++; - tmp++; - } - break; - } - - case PTRACE_GETEVENTMSG: - ret = put_user(child->ptrace_message, (unsigned int __user *) data); - break; - -#ifdef CONFIG_ALTIVEC + case PTRACE_GETFPREGS: + case PTRACE_SETFPREGS: case PTRACE_GETVRREGS: - /* Get the child altivec register state. */ - flush_altivec_to_thread(child); - ret = get_vrregs((unsigned long __user *)data, child); - break; - case PTRACE_SETVRREGS: - /* Set the child altivec register state. 
*/ - flush_altivec_to_thread(child); - ret = set_vrregs(child, (unsigned long __user *)data); + case PTRACE_GETVSRREGS: + case PTRACE_SETVSRREGS: + case PTRACE_GETREGS64: + case PTRACE_SETREGS64: + case PTRACE_KILL: + case PTRACE_SINGLESTEP: + case PTRACE_DETACH: + case PTRACE_SET_DEBUGREG: + case PTRACE_SYSCALL: + case PTRACE_CONT: + case PPC_PTRACE_GETHWDBGINFO: + case PPC_PTRACE_SETHWDEBUG: + case PPC_PTRACE_DELHWDEBUG: + ret = arch_ptrace(child, request, addr, data); break; -#endif default: - ret = ptrace_request(child, request, addr, data); + ret = compat_ptrace_request(child, request, addr, data); break; } -out_tsk: - put_task_struct(child); -out: - unlock_kernel(); + return ret; } diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S new file mode 100644 index 00000000000..f366fedb087 --- /dev/null +++ b/arch/powerpc/kernel/reloc_32.S @@ -0,0 +1,209 @@ +/* + * Code to process dynamic relocations for PPC32. + * + * Copyrights (C) IBM Corporation, 2011. + * Author: Suzuki Poulose <suzuki@in.ibm.com> + * + * - Based on ppc64 code - reloc_64.S + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <asm/ppc_asm.h> + +/* Dynamic section table entry tags */ +DT_RELA = 7 /* Tag for Elf32_Rela section */ +DT_RELASZ = 8 /* Size of the Rela relocs */ +DT_RELAENT = 9 /* Size of one Rela reloc entry */ + +STN_UNDEF = 0 /* Undefined symbol index */ +STB_LOCAL = 0 /* Local binding for the symbol */ + +R_PPC_ADDR16_LO = 4 /* Lower half of (S+A) */ +R_PPC_ADDR16_HI = 5 /* Upper half of (S+A) */ +R_PPC_ADDR16_HA = 6 /* High Adjusted (S+A) */ +R_PPC_RELATIVE = 22 + +/* + * r3 = desired final address + */ + +_GLOBAL(relocate) + + mflr r0 /* Save our LR */ + bl 0f /* Find our current runtime address */ +0: mflr r12 /* Make it accessible */ + mtlr r0 + + lwz r11, (p_dyn - 0b)(r12) + add r11, r11, r12 /* runtime address of .dynamic section */ + lwz r9, (p_rela - 0b)(r12) + add r9, r9, r12 /* runtime address of .rela.dyn section */ + lwz r10, (p_st - 0b)(r12) + add r10, r10, r12 /* runtime address of _stext section */ + lwz r13, (p_sym - 0b)(r12) + add r13, r13, r12 /* runtime address of .dynsym section */ + + /* + * Scan the dynamic section for RELA, RELASZ entries + */ + li r6, 0 + li r7, 0 + li r8, 0 +1: lwz r5, 0(r11) /* ELF_Dyn.d_tag */ + cmpwi r5, 0 /* End of ELF_Dyn[] */ + beq eodyn + cmpwi r5, DT_RELA + bne relasz + lwz r7, 4(r11) /* r7 = rela.link */ + b skip +relasz: + cmpwi r5, DT_RELASZ + bne relaent + lwz r8, 4(r11) /* r8 = Total Rela relocs size */ + b skip +relaent: + cmpwi r5, DT_RELAENT + bne skip + lwz r6, 4(r11) /* r6 = Size of one Rela reloc */ +skip: + addi r11, r11, 8 + b 1b +eodyn: /* End of Dyn Table scan */ + + /* Check if we have found all the entries */ + cmpwi r7, 0 + beq done + cmpwi r8, 0 + beq done + cmpwi r6, 0 + beq done + + + /* + * Work out the current offset from the link time address of .rela + * section. 
+ * cur_offset[r7] = rela.run[r9] - rela.link[r7] + * _stext.link[r12] = _stext.run[r10] - cur_offset[r7] + * final_offset[r3] = _stext.final[r3] - _stext.link[r12] + */ + subf r7, r7, r9 /* cur_offset */ + subf r12, r7, r10 + subf r3, r12, r3 /* final_offset */ + + subf r8, r6, r8 /* relasz -= relaent */ + /* + * Scan through the .rela table and process each entry + * r9 - points to the current .rela table entry + * r13 - points to the symbol table + */ + + /* + * Check if we have a relocation based on symbol + * r5 will hold the value of the symbol. + */ +applyrela: + lwz r4, 4(r9) /* r4 = rela.r_info */ + srwi r5, r4, 8 /* ELF32_R_SYM(r_info) */ + cmpwi r5, STN_UNDEF /* sym == STN_UNDEF ? */ + beq get_type /* value = 0 */ + /* Find the value of the symbol at index(r5) */ + slwi r5, r5, 4 /* r5 = r5 * sizeof(Elf32_Sym) */ + add r12, r13, r5 /* r12 = &__dyn_sym[Index] */ + + /* + * GNU ld has a bug, where dynamic relocs based on + * STB_LOCAL symbols, the value should be assumed + * to be zero. - Alan Modra + */ + /* XXX: Do we need to check if we are using GNU ld ?
*/ + lbz r5, 12(r12) /* r5 = dyn_sym[Index].st_info */ + extrwi r5, r5, 4, 24 /* r5 = ELF32_ST_BIND(r5) */ + cmpwi r5, STB_LOCAL /* st_value = 0, ld bug */ + beq get_type /* We have r5 = 0 */ + lwz r5, 4(r12) /* r5 = __dyn_sym[Index].st_value */ + +get_type: + /* Load the relocation type to r4 */ + extrwi r4, r4, 8, 24 /* r4 = ELF32_R_TYPE(r_info) = ((char*)r4)[3] */ + + /* R_PPC_RELATIVE */ + cmpwi r4, R_PPC_RELATIVE + bne hi16 + lwz r4, 0(r9) /* r_offset */ + lwz r0, 8(r9) /* r_addend */ + add r0, r0, r3 /* final addend */ + stwx r0, r4, r7 /* memory[r4+r7] = (u32)r0 */ + b nxtrela /* continue */ + + /* R_PPC_ADDR16_HI */ +hi16: + cmpwi r4, R_PPC_ADDR16_HI + bne ha16 + lwz r4, 0(r9) /* r_offset */ + lwz r0, 8(r9) /* r_addend */ + add r0, r0, r3 + add r0, r0, r5 /* r0 = (S+A+Offset) */ + extrwi r0, r0, 16, 0 /* r0 = (r0 >> 16) */ + b store_half + + /* R_PPC_ADDR16_HA */ +ha16: + cmpwi r4, R_PPC_ADDR16_HA + bne lo16 + lwz r4, 0(r9) /* r_offset */ + lwz r0, 8(r9) /* r_addend */ + add r0, r0, r3 + add r0, r0, r5 /* r0 = (S+A+Offset) */ + extrwi r5, r0, 1, 16 /* Extract bit 16 */ + extrwi r0, r0, 16, 0 /* r0 = (r0 >> 16) */ + add r0, r0, r5 /* Add it to r0 */ + b store_half + + /* R_PPC_ADDR16_LO */ +lo16: + cmpwi r4, R_PPC_ADDR16_LO + bne unknown_type + lwz r4, 0(r9) /* r_offset */ + lwz r0, 8(r9) /* r_addend */ + add r0, r0, r3 + add r0, r0, r5 /* r0 = (S+A+Offset) */ + extrwi r0, r0, 16, 16 /* r0 &= 0xffff */ + /* Fall through to */ + + /* Store half word */ +store_half: + sthx r0, r4, r7 /* memory[r4+r7] = (u16)r0 */ + +nxtrela: + /* + * We have to flush the modified instructions to the + * main storage from the d-cache. And also, invalidate the + * cached instructions in i-cache which have been modified. + * + * We delay the sync / isync operation till the end, since + * we won't be executing the modified instructions until + * we return from here.
+ */ + dcbst r4,r7 + sync /* Ensure the data is flushed before icbi */ + icbi r4,r7 +unknown_type: + cmpwi r8, 0 /* relasz = 0 ? */ + ble done + add r9, r9, r6 /* move to next entry in the .rela table */ + subf r8, r6, r8 /* relasz -= relaent */ + b applyrela + +done: + sync /* Wait for the flush to finish */ + isync /* Discard prefetched instructions */ + blr + +p_dyn: .long __dynamic_start - 0b +p_rela: .long __rela_dyn_start - 0b +p_sym: .long __dynamic_symtab - 0b +p_st: .long _stext - 0b diff --git a/arch/powerpc/kernel/reloc_64.S b/arch/powerpc/kernel/reloc_64.S new file mode 100644 index 00000000000..d88736fbece --- /dev/null +++ b/arch/powerpc/kernel/reloc_64.S @@ -0,0 +1,88 @@ +/* + * Code to process dynamic relocations in the kernel. + * + * Copyright 2008 Paul Mackerras, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/ppc_asm.h> + +RELA = 7 +RELACOUNT = 0x6ffffff9 +R_PPC64_RELATIVE = 22 + +/* + * r3 = desired final address of kernel + */ +_GLOBAL(relocate) + mflr r0 + bcl 20,31,$+4 +0: mflr r12 /* r12 has runtime addr of label 0 */ + mtlr r0 + ld r11,(p_dyn - 0b)(r12) + add r11,r11,r12 /* r11 has runtime addr of .dynamic section */ + ld r9,(p_rela - 0b)(r12) + add r9,r9,r12 /* r9 has runtime addr of .rela.dyn section */ + ld r10,(p_st - 0b)(r12) + add r10,r10,r12 /* r10 has runtime addr of _stext */ + + /* + * Scan the dynamic section for the RELA and RELACOUNT entries. 
+ */ + li r7,0 + li r8,0 +1: ld r6,0(r11) /* get tag */ + cmpdi r6,0 + beq 4f /* end of list */ + cmpdi r6,RELA + bne 2f + ld r7,8(r11) /* get RELA pointer in r7 */ + b 3f +2: addis r6,r6,(-RELACOUNT)@ha + cmpdi r6,RELACOUNT@l + bne 3f + ld r8,8(r11) /* get RELACOUNT value in r8 */ +3: addi r11,r11,16 + b 1b +4: cmpdi r7,0 /* check we have both RELA and RELACOUNT */ + cmpdi cr1,r8,0 + beq 6f + beq cr1,6f + + /* + * Work out linktime address of _stext and hence the + * relocation offset to be applied. + * cur_offset [r7] = rela.run [r9] - rela.link [r7] + * _stext.link [r10] = _stext.run [r10] - cur_offset [r7] + * final_offset [r3] = _stext.final [r3] - _stext.link [r10] + */ + subf r7,r7,r9 /* cur_offset */ + subf r10,r7,r10 + subf r3,r10,r3 /* final_offset */ + + /* + * Run through the list of relocations and process the + * R_PPC64_RELATIVE ones. + */ + mtctr r8 +5: ld r0,8(r9) /* ELF64_R_TYPE(reloc->r_info) */ + cmpdi r0,R_PPC64_RELATIVE + bne 6f + ld r6,0(r9) /* reloc->r_offset */ + ld r0,16(r9) /* reloc->r_addend */ + add r0,r0,r3 + stdx r0,r7,r6 + addi r9,r9,24 + bdnz 5b + +6: blr + +.balign 8 +p_dyn: .llong __dynamic_start - 0b +p_rela: .llong __rela_dyn_start - 0b +p_st: .llong _stext - 0b + diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c new file mode 100644 index 00000000000..8777fb02349 --- /dev/null +++ b/arch/powerpc/kernel/rtas-proc.c @@ -0,0 +1,790 @@ +/* + * Copyright (C) 2000 Tilmann Bitterberg + * (tilmann@bitterberg.de) + * + * RTAS (Runtime Abstraction Services) stuff + * Intention is to provide a clean user interface + * to use the RTAS. + * + * TODO: + * Split off a header file and maybe move it to a different + * location. Write Documentation on what the /proc/rtas/ entries + * actually do.
+ */ + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/ctype.h> +#include <linux/time.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/seq_file.h> +#include <linux/bitops.h> +#include <linux/rtc.h> + +#include <asm/uaccess.h> +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/machdep.h> /* for ppc_md */ +#include <asm/time.h> + +/* Token for Sensors */ +#define KEY_SWITCH 0x0001 +#define ENCLOSURE_SWITCH 0x0002 +#define THERMAL_SENSOR 0x0003 +#define LID_STATUS 0x0004 +#define POWER_SOURCE 0x0005 +#define BATTERY_VOLTAGE 0x0006 +#define BATTERY_REMAINING 0x0007 +#define BATTERY_PERCENTAGE 0x0008 +#define EPOW_SENSOR 0x0009 +#define BATTERY_CYCLESTATE 0x000a +#define BATTERY_CHARGING 0x000b + +/* IBM specific sensors */ +#define IBM_SURVEILLANCE 0x2328 /* 9000 */ +#define IBM_FANRPM 0x2329 /* 9001 */ +#define IBM_VOLTAGE 0x232a /* 9002 */ +#define IBM_DRCONNECTOR 0x232b /* 9003 */ +#define IBM_POWERSUPPLY 0x232c /* 9004 */ + +/* Status return values */ +#define SENSOR_CRITICAL_HIGH 13 +#define SENSOR_WARNING_HIGH 12 +#define SENSOR_NORMAL 11 +#define SENSOR_WARNING_LOW 10 +#define SENSOR_CRITICAL_LOW 9 +#define SENSOR_SUCCESS 0 +#define SENSOR_HW_ERROR -1 +#define SENSOR_BUSY -2 +#define SENSOR_NOT_EXIST -3 +#define SENSOR_DR_ENTITY -9000 + +/* Location Codes */ +#define LOC_SCSI_DEV_ADDR 'A' +#define LOC_SCSI_DEV_LOC 'B' +#define LOC_CPU 'C' +#define LOC_DISKETTE 'D' +#define LOC_ETHERNET 'E' +#define LOC_FAN 'F' +#define LOC_GRAPHICS 'G' +/* reserved / not used 'H' */ +#define LOC_IO_ADAPTER 'I' +/* reserved / not used 'J' */ +#define LOC_KEYBOARD 'K' +#define LOC_LCD 'L' +#define LOC_MEMORY 'M' +#define LOC_NV_MEMORY 'N' +#define LOC_MOUSE 'O' +#define LOC_PLANAR 'P' +#define LOC_OTHER_IO 'Q' +#define LOC_PARALLEL 'R' +#define LOC_SERIAL 'S' +#define LOC_DEAD_RING 'T' +#define LOC_RACKMOUNTED 'U' /* for _u_nit 
is rack mounted */ +#define LOC_VOLTAGE 'V' +#define LOC_SWITCH_ADAPTER 'W' +#define LOC_OTHER 'X' +#define LOC_FIRMWARE 'Y' +#define LOC_SCSI 'Z' + +/* Tokens for indicators */ +#define TONE_FREQUENCY 0x0001 /* 0 - 1000 (HZ)*/ +#define TONE_VOLUME 0x0002 /* 0 - 100 (%) */ +#define SYSTEM_POWER_STATE 0x0003 +#define WARNING_LIGHT 0x0004 +#define DISK_ACTIVITY_LIGHT 0x0005 +#define HEX_DISPLAY_UNIT 0x0006 +#define BATTERY_WARNING_TIME 0x0007 +#define CONDITION_CYCLE_REQUEST 0x0008 +#define SURVEILLANCE_INDICATOR 0x2328 /* 9000 */ +#define DR_ACTION 0x2329 /* 9001 */ +#define DR_INDICATOR 0x232a /* 9002 */ +/* 9003 - 9004: Vendor specific */ +/* 9006 - 9999: Vendor specific */ + +/* other */ +#define MAX_SENSORS 17 /* I only know of 17 sensors */ +#define MAX_LINELENGTH 256 +#define SENSOR_PREFIX "ibm,sensor-" +#define cel_to_fahr(x) ((x*9/5)+32) + + +/* Globals */ +static struct rtas_sensors sensors; +static struct device_node *rtas_node = NULL; +static unsigned long power_on_time = 0; /* Save the time the user set */ +static char progress_led[MAX_LINELENGTH]; + +static unsigned long rtas_tone_frequency = 1000; +static unsigned long rtas_tone_volume = 0; + +/* ****************STRUCTS******************************************* */ +struct individual_sensor { + unsigned int token; + unsigned int quant; +}; + +struct rtas_sensors { + struct individual_sensor sensor[MAX_SENSORS]; + unsigned int quant; +}; + +/* ****************************************************************** */ +/* Declarations */ +static int ppc_rtas_sensors_show(struct seq_file *m, void *v); +static int ppc_rtas_clock_show(struct seq_file *m, void *v); +static ssize_t ppc_rtas_clock_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos); +static int ppc_rtas_progress_show(struct seq_file *m, void *v); +static ssize_t ppc_rtas_progress_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos); +static int ppc_rtas_poweron_show(struct seq_file *m, void *v); 
+static ssize_t ppc_rtas_poweron_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos); + +static ssize_t ppc_rtas_tone_freq_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos); +static int ppc_rtas_tone_freq_show(struct seq_file *m, void *v); +static ssize_t ppc_rtas_tone_volume_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos); +static int ppc_rtas_tone_volume_show(struct seq_file *m, void *v); +static int ppc_rtas_rmo_buf_show(struct seq_file *m, void *v); + +static int sensors_open(struct inode *inode, struct file *file) +{ + return single_open(file, ppc_rtas_sensors_show, NULL); +} + +static const struct file_operations ppc_rtas_sensors_operations = { + .open = sensors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int poweron_open(struct inode *inode, struct file *file) +{ + return single_open(file, ppc_rtas_poweron_show, NULL); +} + +static const struct file_operations ppc_rtas_poweron_operations = { + .open = poweron_open, + .read = seq_read, + .llseek = seq_lseek, + .write = ppc_rtas_poweron_write, + .release = single_release, +}; + +static int progress_open(struct inode *inode, struct file *file) +{ + return single_open(file, ppc_rtas_progress_show, NULL); +} + +static const struct file_operations ppc_rtas_progress_operations = { + .open = progress_open, + .read = seq_read, + .llseek = seq_lseek, + .write = ppc_rtas_progress_write, + .release = single_release, +}; + +static int clock_open(struct inode *inode, struct file *file) +{ + return single_open(file, ppc_rtas_clock_show, NULL); +} + +static const struct file_operations ppc_rtas_clock_operations = { + .open = clock_open, + .read = seq_read, + .llseek = seq_lseek, + .write = ppc_rtas_clock_write, + .release = single_release, +}; + +static int tone_freq_open(struct inode *inode, struct file *file) +{ + return single_open(file, ppc_rtas_tone_freq_show, NULL); +} + +static const 
struct file_operations ppc_rtas_tone_freq_operations = { + .open = tone_freq_open, + .read = seq_read, + .llseek = seq_lseek, + .write = ppc_rtas_tone_freq_write, + .release = single_release, +}; + +static int tone_volume_open(struct inode *inode, struct file *file) +{ + return single_open(file, ppc_rtas_tone_volume_show, NULL); +} + +static const struct file_operations ppc_rtas_tone_volume_operations = { + .open = tone_volume_open, + .read = seq_read, + .llseek = seq_lseek, + .write = ppc_rtas_tone_volume_write, + .release = single_release, +}; + +static int rmo_buf_open(struct inode *inode, struct file *file) +{ + return single_open(file, ppc_rtas_rmo_buf_show, NULL); +} + +static const struct file_operations ppc_rtas_rmo_buf_ops = { + .open = rmo_buf_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ppc_rtas_find_all_sensors(void); +static void ppc_rtas_process_sensor(struct seq_file *m, + struct individual_sensor *s, int state, int error, const char *loc); +static char *ppc_rtas_process_error(int error); +static void get_location_code(struct seq_file *m, + struct individual_sensor *s, const char *loc); +static void check_location_string(struct seq_file *m, const char *c); +static void check_location(struct seq_file *m, const char *c); + +static int __init proc_rtas_init(void) +{ + if (!machine_is(pseries)) + return -ENODEV; + + rtas_node = of_find_node_by_name(NULL, "rtas"); + if (rtas_node == NULL) + return -ENODEV; + + proc_create("powerpc/rtas/progress", S_IRUGO|S_IWUSR, NULL, + &ppc_rtas_progress_operations); + proc_create("powerpc/rtas/clock", S_IRUGO|S_IWUSR, NULL, + &ppc_rtas_clock_operations); + proc_create("powerpc/rtas/poweron", S_IWUSR|S_IRUGO, NULL, + &ppc_rtas_poweron_operations); + proc_create("powerpc/rtas/sensors", S_IRUGO, NULL, + &ppc_rtas_sensors_operations); + proc_create("powerpc/rtas/frequency", S_IWUSR|S_IRUGO, NULL, + &ppc_rtas_tone_freq_operations); + proc_create("powerpc/rtas/volume", 
S_IWUSR|S_IRUGO, NULL, + &ppc_rtas_tone_volume_operations); + proc_create("powerpc/rtas/rmo_buffer", S_IRUSR, NULL, + &ppc_rtas_rmo_buf_ops); + return 0; +} + +__initcall(proc_rtas_init); + +static int parse_number(const char __user *p, size_t count, unsigned long *val) +{ + char buf[40]; + char *end; + + if (count > 39) + return -EINVAL; + + if (copy_from_user(buf, p, count)) + return -EFAULT; + + buf[count] = 0; + + *val = simple_strtoul(buf, &end, 10); + if (*end && *end != '\n') + return -EINVAL; + + return 0; +} + +/* ****************************************************************** */ +/* POWER-ON-TIME */ +/* ****************************************************************** */ +static ssize_t ppc_rtas_poweron_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct rtc_time tm; + unsigned long nowtime; + int error = parse_number(buf, count, &nowtime); + if (error) + return error; + + power_on_time = nowtime; /* save the time */ + + to_tm(nowtime, &tm); + + error = rtas_call(rtas_token("set-time-for-power-on"), 7, 1, NULL, + tm.tm_year, tm.tm_mon, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, 0 /* nano */); + if (error) + printk(KERN_WARNING "error: setting poweron time returned: %s\n", + ppc_rtas_process_error(error)); + return count; +} +/* ****************************************************************** */ +static int ppc_rtas_poweron_show(struct seq_file *m, void *v) +{ + if (power_on_time == 0) + seq_printf(m, "Power on time not set\n"); + else + seq_printf(m, "%lu\n",power_on_time); + return 0; +} + +/* ****************************************************************** */ +/* PROGRESS */ +/* ****************************************************************** */ +static ssize_t ppc_rtas_progress_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + unsigned long hex; + + if (count >= MAX_LINELENGTH) + count = MAX_LINELENGTH -1; + if (copy_from_user(progress_led, buf, count)) { /* save 
the string */ + return -EFAULT; + } + progress_led[count] = 0; + + /* Lets see if the user passed hexdigits */ + hex = simple_strtoul(progress_led, NULL, 10); + + rtas_progress ((char *)progress_led, hex); + return count; + + /* clear the line */ + /* rtas_progress(" ", 0xffff);*/ +} +/* ****************************************************************** */ +static int ppc_rtas_progress_show(struct seq_file *m, void *v) +{ + if (progress_led[0]) + seq_printf(m, "%s\n", progress_led); + return 0; +} + +/* ****************************************************************** */ +/* CLOCK */ +/* ****************************************************************** */ +static ssize_t ppc_rtas_clock_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct rtc_time tm; + unsigned long nowtime; + int error = parse_number(buf, count, &nowtime); + if (error) + return error; + + to_tm(nowtime, &tm); + error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL, + tm.tm_year, tm.tm_mon, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, 0); + if (error) + printk(KERN_WARNING "error: setting the clock returned: %s\n", + ppc_rtas_process_error(error)); + return count; +} +/* ****************************************************************** */ +static int ppc_rtas_clock_show(struct seq_file *m, void *v) +{ + int ret[8]; + int error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + + if (error) { + printk(KERN_WARNING "error: reading the clock returned: %s\n", + ppc_rtas_process_error(error)); + seq_printf(m, "0"); + } else { + unsigned int year, mon, day, hour, min, sec; + year = ret[0]; mon = ret[1]; day = ret[2]; + hour = ret[3]; min = ret[4]; sec = ret[5]; + seq_printf(m, "%lu\n", + mktime(year, mon, day, hour, min, sec)); + } + return 0; +} + +/* ****************************************************************** */ +/* SENSOR STUFF */ +/* ****************************************************************** */ +static int 
ppc_rtas_sensors_show(struct seq_file *m, void *v) +{ + int i,j; + int state, error; + int get_sensor_state = rtas_token("get-sensor-state"); + + seq_printf(m, "RTAS (RunTime Abstraction Services) Sensor Information\n"); + seq_printf(m, "Sensor\t\tValue\t\tCondition\tLocation\n"); + seq_printf(m, "********************************************************\n"); + + if (ppc_rtas_find_all_sensors() != 0) { + seq_printf(m, "\nNo sensors are available\n"); + return 0; + } + + for (i=0; i<sensors.quant; i++) { + struct individual_sensor *p = &sensors.sensor[i]; + char rstr[64]; + const char *loc; + int llen, offs; + + sprintf (rstr, SENSOR_PREFIX"%04d", p->token); + loc = of_get_property(rtas_node, rstr, &llen); + + /* A sensor may have multiple instances */ + for (j = 0, offs = 0; j <= p->quant; j++) { + error = rtas_call(get_sensor_state, 2, 2, &state, + p->token, j); + + ppc_rtas_process_sensor(m, p, state, error, loc); + seq_putc(m, '\n'); + if (loc) { + offs += strlen(loc) + 1; + loc += strlen(loc) + 1; + if (offs >= llen) + loc = NULL; + } + } + } + return 0; +} + +/* ****************************************************************** */ + +static int ppc_rtas_find_all_sensors(void) +{ + const unsigned int *utmp; + int len, i; + + utmp = of_get_property(rtas_node, "rtas-sensors", &len); + if (utmp == NULL) { + printk (KERN_ERR "error: could not get rtas-sensors\n"); + return 1; + } + + sensors.quant = len / 8; /* int + int */ + + for (i=0; i<sensors.quant; i++) { + sensors.sensor[i].token = *utmp++; + sensors.sensor[i].quant = *utmp++; + } + return 0; +} + +/* ****************************************************************** */ +/* + * Builds a string of what rtas returned + */ +static char *ppc_rtas_process_error(int error) +{ + switch (error) { + case SENSOR_CRITICAL_HIGH: + return "(critical high)"; + case SENSOR_WARNING_HIGH: + return "(warning high)"; + case SENSOR_NORMAL: + return "(normal)"; + case SENSOR_WARNING_LOW: + return "(warning low)"; + case 
SENSOR_CRITICAL_LOW: + return "(critical low)"; + case SENSOR_SUCCESS: + return "(read ok)"; + case SENSOR_HW_ERROR: + return "(hardware error)"; + case SENSOR_BUSY: + return "(busy)"; + case SENSOR_NOT_EXIST: + return "(non existent)"; + case SENSOR_DR_ENTITY: + return "(dr entity removed)"; + default: + return "(UNKNOWN)"; + } +} + +/* ****************************************************************** */ +/* + * Builds a string out of what the sensor said + */ + +static void ppc_rtas_process_sensor(struct seq_file *m, + struct individual_sensor *s, int state, int error, const char *loc) +{ + /* Defined return vales */ + const char * key_switch[] = { "Off\t", "Normal\t", "Secure\t", + "Maintenance" }; + const char * enclosure_switch[] = { "Closed", "Open" }; + const char * lid_status[] = { " ", "Open", "Closed" }; + const char * power_source[] = { "AC\t", "Battery", + "AC & Battery" }; + const char * battery_remaining[] = { "Very Low", "Low", "Mid", "High" }; + const char * epow_sensor[] = { + "EPOW Reset", "Cooling warning", "Power warning", + "System shutdown", "System halt", "EPOW main enclosure", + "EPOW power off" }; + const char * battery_cyclestate[] = { "None", "In progress", + "Requested" }; + const char * battery_charging[] = { "Charging", "Discharching", + "No current flow" }; + const char * ibm_drconnector[] = { "Empty", "Present", "Unusable", + "Exchange" }; + + int have_strings = 0; + int num_states = 0; + int temperature = 0; + int unknown = 0; + + /* What kind of sensor do we have here? 
*/ + + switch (s->token) { + case KEY_SWITCH: + seq_printf(m, "Key switch:\t"); + num_states = sizeof(key_switch) / sizeof(char *); + if (state < num_states) { + seq_printf(m, "%s\t", key_switch[state]); + have_strings = 1; + } + break; + case ENCLOSURE_SWITCH: + seq_printf(m, "Enclosure switch:\t"); + num_states = sizeof(enclosure_switch) / sizeof(char *); + if (state < num_states) { + seq_printf(m, "%s\t", + enclosure_switch[state]); + have_strings = 1; + } + break; + case THERMAL_SENSOR: + seq_printf(m, "Temp. (C/F):\t"); + temperature = 1; + break; + case LID_STATUS: + seq_printf(m, "Lid status:\t"); + num_states = sizeof(lid_status) / sizeof(char *); + if (state < num_states) { + seq_printf(m, "%s\t", lid_status[state]); + have_strings = 1; + } + break; + case POWER_SOURCE: + seq_printf(m, "Power source:\t"); + num_states = sizeof(power_source) / sizeof(char *); + if (state < num_states) { + seq_printf(m, "%s\t", + power_source[state]); + have_strings = 1; + } + break; + case BATTERY_VOLTAGE: + seq_printf(m, "Battery voltage:\t"); + break; + case BATTERY_REMAINING: + seq_printf(m, "Battery remaining:\t"); + num_states = sizeof(battery_remaining) / sizeof(char *); + if (state < num_states) + { + seq_printf(m, "%s\t", + battery_remaining[state]); + have_strings = 1; + } + break; + case BATTERY_PERCENTAGE: + seq_printf(m, "Battery percentage:\t"); + break; + case EPOW_SENSOR: + seq_printf(m, "EPOW Sensor:\t"); + num_states = sizeof(epow_sensor) / sizeof(char *); + if (state < num_states) { + seq_printf(m, "%s\t", epow_sensor[state]); + have_strings = 1; + } + break; + case BATTERY_CYCLESTATE: + seq_printf(m, "Battery cyclestate:\t"); + num_states = sizeof(battery_cyclestate) / + sizeof(char *); + if (state < num_states) { + seq_printf(m, "%s\t", + battery_cyclestate[state]); + have_strings = 1; + } + break; + case BATTERY_CHARGING: + seq_printf(m, "Battery Charging:\t"); + num_states = sizeof(battery_charging) / sizeof(char *); + if (state < num_states) { + 
seq_printf(m, "%s\t", + battery_charging[state]); + have_strings = 1; + } + break; + case IBM_SURVEILLANCE: + seq_printf(m, "Surveillance:\t"); + break; + case IBM_FANRPM: + seq_printf(m, "Fan (rpm):\t"); + break; + case IBM_VOLTAGE: + seq_printf(m, "Voltage (mv):\t"); + break; + case IBM_DRCONNECTOR: + seq_printf(m, "DR connector:\t"); + num_states = sizeof(ibm_drconnector) / sizeof(char *); + if (state < num_states) { + seq_printf(m, "%s\t", + ibm_drconnector[state]); + have_strings = 1; + } + break; + case IBM_POWERSUPPLY: + seq_printf(m, "Powersupply:\t"); + break; + default: + seq_printf(m, "Unknown sensor (type %d), ignoring it\n", + s->token); + unknown = 1; + have_strings = 1; + break; + } + if (have_strings == 0) { + if (temperature) { + seq_printf(m, "%4d /%4d\t", state, cel_to_fahr(state)); + } else + seq_printf(m, "%10d\t", state); + } + if (unknown == 0) { + seq_printf(m, "%s\t", ppc_rtas_process_error(error)); + get_location_code(m, s, loc); + } +} + +/* ****************************************************************** */ + +static void check_location(struct seq_file *m, const char *c) +{ + switch (c[0]) { + case LOC_PLANAR: + seq_printf(m, "Planar #%c", c[1]); + break; + case LOC_CPU: + seq_printf(m, "CPU #%c", c[1]); + break; + case LOC_FAN: + seq_printf(m, "Fan #%c", c[1]); + break; + case LOC_RACKMOUNTED: + seq_printf(m, "Rack #%c", c[1]); + break; + case LOC_VOLTAGE: + seq_printf(m, "Voltage #%c", c[1]); + break; + case LOC_LCD: + seq_printf(m, "LCD #%c", c[1]); + break; + case '.': + seq_printf(m, "- %c", c[1]); + break; + default: + seq_printf(m, "Unknown location"); + break; + } +} + + +/* ****************************************************************** */ +/* + * Format: + * ${LETTER}${NUMBER}[[-/]${LETTER}${NUMBER} [ ... ] ] + * the '.' 
may be an abbrevation + */ +static void check_location_string(struct seq_file *m, const char *c) +{ + while (*c) { + if (isalpha(*c) || *c == '.') + check_location(m, c); + else if (*c == '/' || *c == '-') + seq_printf(m, " at "); + c++; + } +} + + +/* ****************************************************************** */ + +static void get_location_code(struct seq_file *m, struct individual_sensor *s, + const char *loc) +{ + if (!loc || !*loc) { + seq_printf(m, "---");/* does not have a location */ + } else { + check_location_string(m, loc); + } + seq_putc(m, ' '); +} +/* ****************************************************************** */ +/* INDICATORS - Tone Frequency */ +/* ****************************************************************** */ +static ssize_t ppc_rtas_tone_freq_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + unsigned long freq; + int error = parse_number(buf, count, &freq); + if (error) + return error; + + rtas_tone_frequency = freq; /* save it for later */ + error = rtas_call(rtas_token("set-indicator"), 3, 1, NULL, + TONE_FREQUENCY, 0, freq); + if (error) + printk(KERN_WARNING "error: setting tone frequency returned: %s\n", + ppc_rtas_process_error(error)); + return count; +} +/* ****************************************************************** */ +static int ppc_rtas_tone_freq_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%lu\n", rtas_tone_frequency); + return 0; +} +/* ****************************************************************** */ +/* INDICATORS - Tone Volume */ +/* ****************************************************************** */ +static ssize_t ppc_rtas_tone_volume_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + unsigned long volume; + int error = parse_number(buf, count, &volume); + if (error) + return error; + + if (volume > 100) + volume = 100; + + rtas_tone_volume = volume; /* save it for later */ + error = 
rtas_call(rtas_token("set-indicator"), 3, 1, NULL, + TONE_VOLUME, 0, volume); + if (error) + printk(KERN_WARNING "error: setting tone volume returned: %s\n", + ppc_rtas_process_error(error)); + return count; +} +/* ****************************************************************** */ +static int ppc_rtas_tone_volume_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%lu\n", rtas_tone_volume); + return 0; +} + +#define RMO_READ_BUF_MAX 30 + +/* RTAS Userspace access */ +static int ppc_rtas_rmo_buf_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%016lx %x\n", rtas_rmo_buf, RTAS_RMOBUF_MAX); + return 0; +} diff --git a/arch/powerpc/kernel/rtas-rtc.c b/arch/powerpc/kernel/rtas-rtc.c new file mode 100644 index 00000000000..c57c19358a2 --- /dev/null +++ b/arch/powerpc/kernel/rtas-rtc.c @@ -0,0 +1,112 @@ +#include <linux/kernel.h> +#include <linux/time.h> +#include <linux/timer.h> +#include <linux/init.h> +#include <linux/rtc.h> +#include <linux/delay.h> +#include <linux/ratelimit.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/time.h> + + +#define MAX_RTC_WAIT 5000 /* 5 sec */ +#define RTAS_CLOCK_BUSY (-2) +unsigned long __init rtas_get_boot_time(void) +{ + int ret[8]; + int error; + unsigned int wait_time; + u64 max_wait_tb; + + max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; + do { + error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + + wait_time = rtas_busy_delay_time(error); + if (wait_time) { + /* This is boot time so we spin. */ + udelay(wait_time*1000); + } + } while (wait_time && (get_tb() < max_wait_tb)); + + if (error != 0) { + printk_ratelimited(KERN_WARNING + "error: reading the clock failed (%d)\n", + error); + return 0; + } + + return mktime(ret[0], ret[1], ret[2], ret[3], ret[4], ret[5]); +} + +/* NOTE: get_rtc_time will get an error if executed in interrupt context + * and if a delay is needed to read the clock. In this case we just + * silently return without updating rtc_tm. 
+ */ +void rtas_get_rtc_time(struct rtc_time *rtc_tm) +{ + int ret[8]; + int error; + unsigned int wait_time; + u64 max_wait_tb; + + max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; + do { + error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + + wait_time = rtas_busy_delay_time(error); + if (wait_time) { + if (in_interrupt()) { + memset(rtc_tm, 0, sizeof(struct rtc_time)); + printk_ratelimited(KERN_WARNING + "error: reading clock " + "would delay interrupt\n"); + return; /* delay not allowed */ + } + msleep(wait_time); + } + } while (wait_time && (get_tb() < max_wait_tb)); + + if (error != 0) { + printk_ratelimited(KERN_WARNING + "error: reading the clock failed (%d)\n", + error); + return; + } + + rtc_tm->tm_sec = ret[5]; + rtc_tm->tm_min = ret[4]; + rtc_tm->tm_hour = ret[3]; + rtc_tm->tm_mday = ret[2]; + rtc_tm->tm_mon = ret[1] - 1; + rtc_tm->tm_year = ret[0] - 1900; +} + +int rtas_set_rtc_time(struct rtc_time *tm) +{ + int error, wait_time; + u64 max_wait_tb; + + max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; + do { + error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL, + tm->tm_year + 1900, tm->tm_mon + 1, + tm->tm_mday, tm->tm_hour, tm->tm_min, + tm->tm_sec, 0); + + wait_time = rtas_busy_delay_time(error); + if (wait_time) { + if (in_interrupt()) + return 1; /* probably decrementer */ + msleep(wait_time); + } + } while (wait_time && (get_tb() < max_wait_tb)); + + if (error != 0) + printk_ratelimited(KERN_WARNING + "error: setting the clock failed (%d)\n", + error); + + return 0; +} diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 4d22eeeeb91..8b4c857c142 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -15,59 +15,103 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/spinlock.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/init.h> +#include <linux/capability.h> +#include <linux/delay.h> +#include <linux/cpu.h> 
+#include <linux/smp.h> +#include <linux/completion.h> +#include <linux/cpumask.h> +#include <linux/memblock.h> +#include <linux/slab.h> +#include <linux/reboot.h> #include <asm/prom.h> #include <asm/rtas.h> -#include <asm/semaphore.h> +#include <asm/hvcall.h> #include <asm/machdep.h> +#include <asm/firmware.h> #include <asm/page.h> #include <asm/param.h> -#include <asm/system.h> #include <asm/delay.h> #include <asm/uaccess.h> -#include <asm/lmb.h> -#ifdef CONFIG_PPC64 -#include <asm/systemcfg.h> -#endif +#include <asm/udbg.h> +#include <asm/syscalls.h> +#include <asm/smp.h> +#include <linux/atomic.h> +#include <asm/time.h> +#include <asm/mmu.h> +#include <asm/topology.h> struct rtas_t rtas = { - .lock = SPIN_LOCK_UNLOCKED + .lock = __ARCH_SPIN_LOCK_UNLOCKED }; - EXPORT_SYMBOL(rtas); DEFINE_SPINLOCK(rtas_data_buf_lock); +EXPORT_SYMBOL(rtas_data_buf_lock); + char rtas_data_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned; +EXPORT_SYMBOL(rtas_data_buf); + unsigned long rtas_rmo_buf; /* + * If non-NULL, this gets called when the kernel terminates. + * This is done like this so rtas_flash can be a module. + */ +void (*rtas_flash_term_hook)(int); +EXPORT_SYMBOL(rtas_flash_term_hook); + +/* RTAS use home made raw locking instead of spin_lock_irqsave + * because those can be called from within really nasty contexts + * such as having the timebase stopped which would lockup with + * normal locks and spinlock debugging enabled + */ +static unsigned long lock_rtas(void) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + arch_spin_lock_flags(&rtas.lock, flags); + return flags; +} + +static void unlock_rtas(unsigned long flags) +{ + arch_spin_unlock(&rtas.lock); + local_irq_restore(flags); + preempt_enable(); +} + +/* * call_rtas_display_status and call_rtas_display_status_delay * are designed only for very early low-level debugging, which * is why the token is hard-coded to 10. 
*/ -void call_rtas_display_status(unsigned char c) +static void call_rtas_display_status(unsigned char c) { struct rtas_args *args = &rtas.args; unsigned long s; if (!rtas.base) return; - spin_lock_irqsave(&rtas.lock, s); + s = lock_rtas(); - args->token = 10; - args->nargs = 1; - args->nret = 1; - args->rets = (rtas_arg_t *)&(args->args[1]); - args->args[0] = (int)c; + args->token = cpu_to_be32(10); + args->nargs = cpu_to_be32(1); + args->nret = cpu_to_be32(1); + args->rets = &(args->args[1]); + args->args[0] = cpu_to_be32(c); enter_rtas(__pa(args)); - spin_unlock_irqrestore(&rtas.lock, s); + unlock_rtas(s); } -void call_rtas_display_status_delay(unsigned char c) +static void call_rtas_display_status_delay(char c) { static int pending_newline = 0; /* did last write end with unprinted newline? */ static int width = 16; @@ -76,7 +120,7 @@ void call_rtas_display_status_delay(unsigned char c) while (width-- > 0) call_rtas_display_status(' '); width = 16; - udelay(500000); + mdelay(500); pending_newline = 1; } else { if (pending_newline) { @@ -91,13 +135,80 @@ void call_rtas_display_status_delay(unsigned char c) } } +void __init udbg_init_rtas_panel(void) +{ + udbg_putc = call_rtas_display_status_delay; +} + +#ifdef CONFIG_UDBG_RTAS_CONSOLE + +/* If you think you're dying before early_init_dt_scan_rtas() does its + * work, you can hard code the token values for your firmware here and + * hardcode rtas.base/entry etc. 
+ */ +static unsigned int rtas_putchar_token = RTAS_UNKNOWN_SERVICE; +static unsigned int rtas_getchar_token = RTAS_UNKNOWN_SERVICE; + +static void udbg_rtascon_putc(char c) +{ + int tries; + + if (!rtas.base) + return; + + /* Add CRs before LFs */ + if (c == '\n') + udbg_rtascon_putc('\r'); + + /* if there is more than one character to be displayed, wait a bit */ + for (tries = 0; tries < 16; tries++) { + if (rtas_call(rtas_putchar_token, 1, 1, NULL, c) == 0) + break; + udelay(1000); + } +} + +static int udbg_rtascon_getc_poll(void) +{ + int c; + + if (!rtas.base) + return -1; + + if (rtas_call(rtas_getchar_token, 0, 2, &c)) + return -1; + + return c; +} + +static int udbg_rtascon_getc(void) +{ + int c; + + while ((c = udbg_rtascon_getc_poll()) == -1) + ; + + return c; +} + + +void __init udbg_init_rtas_console(void) +{ + udbg_putc = udbg_rtascon_putc; + udbg_getc = udbg_rtascon_getc; + udbg_getc_poll = udbg_rtascon_getc_poll; +} +#endif /* CONFIG_UDBG_RTAS_CONSOLE */ + void rtas_progress(char *s, unsigned short hex) { struct device_node *root; - int width, *p; + int width; + const __be32 *p; char *os; static int display_character, set_indicator; - static int display_width, display_lines, *row_width, form_feed; + static int display_width, display_lines, form_feed; + static const int *row_width; static DEFINE_SPINLOCK(progress_lock); static int current_line; static int pending_newline = 0; /* did last write end with unprinted newline? 
*/ @@ -107,18 +218,19 @@ void rtas_progress(char *s, unsigned short hex) if (display_width == 0) { display_width = 0x10; - if ((root = find_path_device("/rtas"))) { - if ((p = (unsigned int *)get_property(root, + if ((root = of_find_node_by_path("/rtas"))) { + if ((p = of_get_property(root, "ibm,display-line-length", NULL))) - display_width = *p; - if ((p = (unsigned int *)get_property(root, + display_width = be32_to_cpu(*p); + if ((p = of_get_property(root, "ibm,form-feed", NULL))) - form_feed = *p; - if ((p = (unsigned int *)get_property(root, + form_feed = be32_to_cpu(*p); + if ((p = of_get_property(root, "ibm,display-number-of-lines", NULL))) - display_lines = *p; - row_width = (unsigned int *)get_property(root, + display_lines = be32_to_cpu(*p); + row_width = of_get_property(root, "ibm,display-truncation-length", NULL); + of_node_put(root); } display_character = rtas_token("display-character"); set_indicator = rtas_token("set-indicator"); @@ -206,15 +318,23 @@ void rtas_progress(char *s, unsigned short hex) spin_unlock(&progress_lock); } +EXPORT_SYMBOL(rtas_progress); /* needed by rtas_flash module */ int rtas_token(const char *service) { - int *tokp; + const __be32 *tokp; if (rtas.dev == NULL) return RTAS_UNKNOWN_SERVICE; - tokp = (int *) get_property(rtas.dev, service, NULL); - return tokp ? *tokp : RTAS_UNKNOWN_SERVICE; + tokp = of_get_property(rtas.dev, service, NULL); + return tokp ? 
be32_to_cpu(*tokp) : RTAS_UNKNOWN_SERVICE; +} +EXPORT_SYMBOL(rtas_token); + +int rtas_service_present(const char *service) +{ + return rtas_token(service) != RTAS_UNKNOWN_SERVICE; } +EXPORT_SYMBOL(rtas_service_present); #ifdef CONFIG_RTAS_ERROR_LOGGING /* @@ -240,8 +360,8 @@ int rtas_get_error_log_max(void) EXPORT_SYMBOL(rtas_get_error_log_max); -char rtas_err_buf[RTAS_ERROR_LOG_MAX]; -int rtas_last_error_token; +static char rtas_err_buf[RTAS_ERROR_LOG_MAX]; +static int rtas_last_error_token; /** Return a copy of the detailed error text associated with the * most recent failed call to rtas. Because the error text @@ -260,11 +380,11 @@ static char *__fetch_rtas_last_error(char *altbuf) bufsz = rtas_get_error_log_max(); - err_args.token = rtas_last_error_token; - err_args.nargs = 2; - err_args.nret = 1; - err_args.args[0] = (rtas_arg_t)__pa(rtas_err_buf); - err_args.args[1] = bufsz; + err_args.token = cpu_to_be32(rtas_last_error_token); + err_args.nargs = cpu_to_be32(2); + err_args.nret = cpu_to_be32(1); + err_args.args[0] = cpu_to_be32(__pa(rtas_err_buf)); + err_args.args[1] = cpu_to_be32(bufsz); err_args.args[2] = 0; save_args = rtas.args; @@ -307,20 +427,19 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...) char *buff_copy = NULL; int ret; - if (token == RTAS_UNKNOWN_SERVICE) + if (!rtas.entry || token == RTAS_UNKNOWN_SERVICE) return -1; - /* Gotta do something different here, use global lock for now... 
*/ - spin_lock_irqsave(&rtas.lock, s); + s = lock_rtas(); rtas_args = &rtas.args; - rtas_args->token = token; - rtas_args->nargs = nargs; - rtas_args->nret = nret; - rtas_args->rets = (rtas_arg_t *)&(rtas_args->args[nargs]); + rtas_args->token = cpu_to_be32(token); + rtas_args->nargs = cpu_to_be32(nargs); + rtas_args->nret = cpu_to_be32(nret); + rtas_args->rets = &(rtas_args->args[nargs]); va_start(list, outputs); for (i = 0; i < nargs; ++i) - rtas_args->args[i] = va_arg(list, rtas_arg_t); + rtas_args->args[i] = cpu_to_be32(va_arg(list, __u32)); va_end(list); for (i = 0; i < nret; ++i) @@ -330,16 +449,15 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...) /* A -1 return code indicates that the last command couldn't be completed due to a hardware error. */ - if (rtas_args->rets[0] == -1) + if (be32_to_cpu(rtas_args->rets[0]) == -1) buff_copy = __fetch_rtas_last_error(NULL); if (nret > 1 && outputs != NULL) for (i = 0; i < nret-1; ++i) - outputs[i] = rtas_args->rets[i+1]; - ret = (nret > 0)? rtas_args->rets[0]: 0; + outputs[i] = be32_to_cpu(rtas_args->rets[i+1]); + ret = (nret > 0)? be32_to_cpu(rtas_args->rets[0]): 0; - /* Gotta do something different here, use global lock for now... */ - spin_unlock_irqrestore(&rtas.lock, s); + unlock_rtas(s); if (buff_copy) { log_error(buff_copy, ERR_TYPE_RTAS_LOG, 0); @@ -348,28 +466,43 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...) } return ret; } +EXPORT_SYMBOL(rtas_call); -/* Given an RTAS status code of 990n compute the hinted delay of 10^n - * (last digit) milliseconds. For now we bound at n=5 (100 sec). +/* For RTAS_BUSY (-2), delay for 1 millisecond. For an extended busy status + * code of 990n, perform the hinted delay of 10^n (last digit) milliseconds. 
*/ -unsigned int rtas_extended_busy_delay_time(int status) +unsigned int rtas_busy_delay_time(int status) { - int order = status - 9900; - unsigned long ms; + int order; + unsigned int ms = 0; + + if (status == RTAS_BUSY) { + ms = 1; + } else if (status >= 9900 && status <= 9905) { + order = status - 9900; + for (ms = 1; order > 0; order--) + ms *= 10; + } + + return ms; +} +EXPORT_SYMBOL(rtas_busy_delay_time); - if (order < 0) - order = 0; /* RTC depends on this for -2 clock busy */ - else if (order > 5) - order = 5; /* bound */ +/* For an RTAS busy status code, perform the hinted delay. */ +unsigned int rtas_busy_delay(int status) +{ + unsigned int ms; - /* Use microseconds for reasonable accuracy */ - for (ms = 1; order > 0; order--) - ms *= 10; + might_sleep(); + ms = rtas_busy_delay_time(status); + if (ms && need_resched()) + msleep(ms); - return ms; + return ms; } +EXPORT_SYMBOL(rtas_busy_delay); -int rtas_error_rc(int rtas_rc) +static int rtas_error_rc(int rtas_rc) { int rc; @@ -391,7 +524,7 @@ int rtas_error_rc(int rtas_rc) break; default: printk(KERN_ERR "%s: unexpected RTAS error %d\n", - __FUNCTION__, rtas_rc); + __func__, rtas_rc); rc = -ERANGE; break; } @@ -413,85 +546,113 @@ int rtas_get_power_level(int powerdomain, int *level) return rtas_error_rc(rc); return rc; } +EXPORT_SYMBOL(rtas_get_power_level); int rtas_set_power_level(int powerdomain, int level, int *setlevel) { int token = rtas_token("set-power-level"); - unsigned int wait_time; int rc; if (token == RTAS_UNKNOWN_SERVICE) return -ENOENT; - while (1) { + do { rc = rtas_call(token, 2, 2, setlevel, powerdomain, level); - if (rc == RTAS_BUSY) - udelay(1); - else if (rtas_is_extended_busy(rc)) { - wait_time = rtas_extended_busy_delay_time(rc); - udelay(wait_time * 1000); - } else - break; - } + } while (rtas_busy_delay(rc)); if (rc < 0) return rtas_error_rc(rc); return rc; } +EXPORT_SYMBOL(rtas_set_power_level); int rtas_get_sensor(int sensor, int index, int *state) { int token = 
rtas_token("get-sensor-state"); - unsigned int wait_time; int rc; if (token == RTAS_UNKNOWN_SERVICE) return -ENOENT; - while (1) { + do { rc = rtas_call(token, 2, 2, state, sensor, index); - if (rc == RTAS_BUSY) - udelay(1); - else if (rtas_is_extended_busy(rc)) { - wait_time = rtas_extended_busy_delay_time(rc); - udelay(wait_time * 1000); - } else - break; - } + } while (rtas_busy_delay(rc)); if (rc < 0) return rtas_error_rc(rc); return rc; } +EXPORT_SYMBOL(rtas_get_sensor); + +bool rtas_indicator_present(int token, int *maxindex) +{ + int proplen, count, i; + const struct indicator_elem { + __be32 token; + __be32 maxindex; + } *indicators; + + indicators = of_get_property(rtas.dev, "rtas-indicators", &proplen); + if (!indicators) + return false; + + count = proplen / sizeof(struct indicator_elem); + + for (i = 0; i < count; i++) { + if (__be32_to_cpu(indicators[i].token) != token) + continue; + if (maxindex) + *maxindex = __be32_to_cpu(indicators[i].maxindex); + return true; + } + + return false; +} +EXPORT_SYMBOL(rtas_indicator_present); int rtas_set_indicator(int indicator, int index, int new_value) { int token = rtas_token("set-indicator"); - unsigned int wait_time; int rc; if (token == RTAS_UNKNOWN_SERVICE) return -ENOENT; - while (1) { + do { rc = rtas_call(token, 3, 1, NULL, indicator, index, new_value); - if (rc == RTAS_BUSY) - udelay(1); - else if (rtas_is_extended_busy(rc)) { - wait_time = rtas_extended_busy_delay_time(rc); - udelay(wait_time * 1000); - } - else - break; - } + } while (rtas_busy_delay(rc)); + + if (rc < 0) + return rtas_error_rc(rc); + return rc; +} +EXPORT_SYMBOL(rtas_set_indicator); + +/* + * Ignoring RTAS extended delay + */ +int rtas_set_indicator_fast(int indicator, int index, int new_value) +{ + int rc; + int token = rtas_token("set-indicator"); + + if (token == RTAS_UNKNOWN_SERVICE) + return -ENOENT; + + rc = rtas_call(token, 3, 1, NULL, indicator, index, new_value); + + WARN_ON(rc == -2 || (rc >= 9900 && rc <= 9905)); if (rc < 0) 
return rtas_error_rc(rc); + return rc; } void rtas_restart(char *cmd) { + if (rtas_flash_term_hook) + rtas_flash_term_hook(SYS_RESTART); printk("RTAS system-reboot returned %d\n", rtas_call(rtas_token("system-reboot"), 0, 1, NULL)); for (;;); @@ -499,6 +660,8 @@ void rtas_restart(char *cmd) void rtas_power_off(void) { + if (rtas_flash_term_hook) + rtas_flash_term_hook(SYS_POWER_OFF); /* allow power on only with power button press */ printk("RTAS power-off returned %d\n", rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1)); @@ -507,7 +670,12 @@ void rtas_power_off(void) void rtas_halt(void) { - rtas_power_off(); + if (rtas_flash_term_hook) + rtas_flash_term_hook(SYS_HALT); + /* allow power on only with power button press */ + printk("RTAS power-off returned %d\n", + rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1)); + for (;;); } /* Must be in the RMO region, so we place it here */ @@ -517,7 +685,14 @@ void rtas_os_term(char *str) { int status; - if (RTAS_UNKNOWN_SERVICE == rtas_token("ibm,os-term")) + /* + * Firmware with the ibm,extended-os-term property is guaranteed + * to always return from an ibm,os-term call. Earlier versions without + * this property may terminate the partition which we want to avoid + * since it interferes with panic_timeout. 
+ */ + if (RTAS_UNKNOWN_SERVICE == rtas_token("ibm,os-term") || + RTAS_UNKNOWN_SERVICE == rtas_token("ibm,extended-os-term")) return; snprintf(rtas_os_term_buf, 2048, "OS panic: %s", str); @@ -525,22 +700,330 @@ void rtas_os_term(char *str) do { status = rtas_call(rtas_token("ibm,os-term"), 1, 1, NULL, __pa(rtas_os_term_buf)); + } while (rtas_busy_delay(status)); + + if (status != 0) + printk(KERN_EMERG "ibm,os-term call failed %d\n", status); +} + +static int ibm_suspend_me_token = RTAS_UNKNOWN_SERVICE; +#ifdef CONFIG_PPC_PSERIES +static int __rtas_suspend_last_cpu(struct rtas_suspend_me_data *data, int wake_when_done) +{ + u16 slb_size = mmu_slb_size; + int rc = H_MULTI_THREADS_ACTIVE; + int cpu; + + slb_set_size(SLB_MIN_SIZE); + printk(KERN_DEBUG "calling ibm,suspend-me on cpu %i\n", smp_processor_id()); - if (status == RTAS_BUSY) - udelay(1); - else if (status != 0) - printk(KERN_EMERG "ibm,os-term call failed %d\n", - status); - } while (status == RTAS_BUSY); + while (rc == H_MULTI_THREADS_ACTIVE && !atomic_read(&data->done) && + !atomic_read(&data->error)) + rc = rtas_call(data->token, 0, 1, NULL); + + if (rc || atomic_read(&data->error)) { + printk(KERN_DEBUG "ibm,suspend-me returned %d\n", rc); + slb_set_size(slb_size); + } + + if (atomic_read(&data->error)) + rc = atomic_read(&data->error); + + atomic_set(&data->error, rc); + pSeries_coalesce_init(); + + if (wake_when_done) { + atomic_set(&data->done, 1); + + for_each_online_cpu(cpu) + plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu)); + } + + if (atomic_dec_return(&data->working) == 0) + complete(data->complete); + + return rc; +} + +int rtas_suspend_last_cpu(struct rtas_suspend_me_data *data) +{ + atomic_inc(&data->working); + return __rtas_suspend_last_cpu(data, 0); } +static int __rtas_suspend_cpu(struct rtas_suspend_me_data *data, int wake_when_done) +{ + long rc = H_SUCCESS; + unsigned long msr_save; + int cpu; + + atomic_inc(&data->working); + + /* really need to ensure MSR.EE is off for 
H_JOIN */ + msr_save = mfmsr(); + mtmsr(msr_save & ~(MSR_EE)); + while (rc == H_SUCCESS && !atomic_read(&data->done) && !atomic_read(&data->error)) + rc = plpar_hcall_norets(H_JOIN); + + mtmsr(msr_save); + + if (rc == H_SUCCESS) { + /* This cpu was prodded and the suspend is complete. */ + goto out; + } else if (rc == H_CONTINUE) { + /* All other cpus are in H_JOIN, this cpu does + * the suspend. + */ + return __rtas_suspend_last_cpu(data, wake_when_done); + } else { + printk(KERN_ERR "H_JOIN on cpu %i failed with rc = %ld\n", + smp_processor_id(), rc); + atomic_set(&data->error, rc); + } + + if (wake_when_done) { + atomic_set(&data->done, 1); + + /* This cpu did the suspend or got an error; in either case, + * we need to prod all other other cpus out of join state. + * Extra prods are harmless. + */ + for_each_online_cpu(cpu) + plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu)); + } +out: + if (atomic_dec_return(&data->working) == 0) + complete(data->complete); + return rc; +} + +int rtas_suspend_cpu(struct rtas_suspend_me_data *data) +{ + return __rtas_suspend_cpu(data, 0); +} + +static void rtas_percpu_suspend_me(void *info) +{ + __rtas_suspend_cpu((struct rtas_suspend_me_data *)info, 1); +} + +enum rtas_cpu_state { + DOWN, + UP, +}; + +#ifndef CONFIG_SMP +static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, + cpumask_var_t cpus) +{ + if (!cpumask_empty(cpus)) { + cpumask_clear(cpus); + return -EINVAL; + } else + return 0; +} +#else +/* On return cpumask will be altered to indicate CPUs changed. + * CPUs with states changed will be set in the mask, + * CPUs with status unchanged will be unset in the mask. 
*/ +static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, + cpumask_var_t cpus) +{ + int cpu; + int cpuret = 0; + int ret = 0; + + if (cpumask_empty(cpus)) + return 0; + + for_each_cpu(cpu, cpus) { + switch (state) { + case DOWN: + cpuret = cpu_down(cpu); + break; + case UP: + cpuret = cpu_up(cpu); + break; + } + if (cpuret) { + pr_debug("%s: cpu_%s for cpu#%d returned %d.\n", + __func__, + ((state == UP) ? "up" : "down"), + cpu, cpuret); + if (!ret) + ret = cpuret; + if (state == UP) { + /* clear bits for unchanged cpus, return */ + cpumask_shift_right(cpus, cpus, cpu); + cpumask_shift_left(cpus, cpus, cpu); + break; + } else { + /* clear bit for unchanged cpu, continue */ + cpumask_clear_cpu(cpu, cpus); + } + } + } + + return ret; +} +#endif + +int rtas_online_cpus_mask(cpumask_var_t cpus) +{ + int ret; + + ret = rtas_cpu_state_change_mask(UP, cpus); + + if (ret) { + cpumask_var_t tmp_mask; + + if (!alloc_cpumask_var(&tmp_mask, GFP_TEMPORARY)) + return ret; + + /* Use tmp_mask to preserve cpus mask from first failure */ + cpumask_copy(tmp_mask, cpus); + rtas_offline_cpus_mask(tmp_mask); + free_cpumask_var(tmp_mask); + } + + return ret; +} +EXPORT_SYMBOL(rtas_online_cpus_mask); + +int rtas_offline_cpus_mask(cpumask_var_t cpus) +{ + return rtas_cpu_state_change_mask(DOWN, cpus); +} +EXPORT_SYMBOL(rtas_offline_cpus_mask); + +int rtas_ibm_suspend_me(struct rtas_args *args) +{ + long state; + long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + struct rtas_suspend_me_data data; + DECLARE_COMPLETION_ONSTACK(done); + cpumask_var_t offline_mask; + int cpuret; + + if (!rtas_service_present("ibm,suspend-me")) + return -ENOSYS; + + /* Make sure the state is valid */ + rc = plpar_hcall(H_VASI_STATE, retbuf, + ((u64)args->args[0] << 32) | args->args[1]); + + state = retbuf[0]; + + if (rc) { + printk(KERN_ERR "rtas_ibm_suspend_me: vasi_state returned %ld\n",rc); + return rc; + } else if (state == H_VASI_ENABLED) { + args->args[args->nargs] = RTAS_NOT_SUSPENDABLE; 
+ return 0; + } else if (state != H_VASI_SUSPENDING) { + printk(KERN_ERR "rtas_ibm_suspend_me: vasi_state returned state %ld\n", + state); + args->args[args->nargs] = -1; + return 0; + } + + if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY)) + return -ENOMEM; + + atomic_set(&data.working, 0); + atomic_set(&data.done, 0); + atomic_set(&data.error, 0); + data.token = rtas_token("ibm,suspend-me"); + data.complete = &done; + + /* All present CPUs must be online */ + cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask); + cpuret = rtas_online_cpus_mask(offline_mask); + if (cpuret) { + pr_err("%s: Could not bring present CPUs online.\n", __func__); + atomic_set(&data.error, cpuret); + goto out; + } + + stop_topology_update(); + + /* Call function on all CPUs. One of us will make the + * rtas call + */ + if (on_each_cpu(rtas_percpu_suspend_me, &data, 0)) + atomic_set(&data.error, -EINVAL); + + wait_for_completion(&done); + + if (atomic_read(&data.error) != 0) + printk(KERN_ERR "Error doing global join\n"); + + start_topology_update(); + + /* Take down CPUs not online prior to suspend */ + cpuret = rtas_offline_cpus_mask(offline_mask); + if (cpuret) + pr_warn("%s: Could not restore CPUs to offline state.\n", + __func__); + +out: + free_cpumask_var(offline_mask); + return atomic_read(&data.error); +} +#else /* CONFIG_PPC_PSERIES */ +int rtas_ibm_suspend_me(struct rtas_args *args) +{ + return -ENOSYS; +} +#endif + +/** + * Find a specific pseries error log in an RTAS extended event log. + * @log: RTAS error/event log + * @section_id: two character section identifier + * + * Returns a pointer to the specified errorlog or NULL if not found. 
+ */ +struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, + uint16_t section_id) +{ + struct rtas_ext_event_log_v6 *ext_log = + (struct rtas_ext_event_log_v6 *)log->buffer; + struct pseries_errorlog *sect; + unsigned char *p, *log_end; + uint32_t ext_log_length = rtas_error_extended_log_length(log); + uint8_t log_format = rtas_ext_event_log_format(ext_log); + uint32_t company_id = rtas_ext_event_company_id(ext_log); + + /* Check that we understand the format */ + if (ext_log_length < sizeof(struct rtas_ext_event_log_v6) || + log_format != RTAS_V6EXT_LOG_FORMAT_EVENT_LOG || + company_id != RTAS_V6EXT_COMPANY_ID_IBM) + return NULL; + + log_end = log->buffer + ext_log_length; + p = ext_log->vendor_log; + + while (p < log_end) { + sect = (struct pseries_errorlog *)p; + if (pseries_errorlog_id(sect) == section_id) + return sect; + p += pseries_errorlog_length(sect); + } + + return NULL; +} + +/* We assume to be passed big endian arguments */ asmlinkage int ppc_rtas(struct rtas_args __user *uargs) { struct rtas_args args; unsigned long flags; char *buff_copy, *errbuf = NULL; - int nargs; + int nargs, nret, token; + int rc; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -548,10 +1031,13 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs) if (copy_from_user(&args, uargs, 3 * sizeof(u32)) != 0) return -EFAULT; - nargs = args.nargs; + nargs = be32_to_cpu(args.nargs); + nret = be32_to_cpu(args.nret); + token = be32_to_cpu(args.token); + if (nargs > ARRAY_SIZE(args.args) - || args.nret > ARRAY_SIZE(args.args) - || nargs + args.nret > ARRAY_SIZE(args.args)) + || nret > ARRAY_SIZE(args.args) + || nargs + nret > ARRAY_SIZE(args.args)) return -EINVAL; /* Copy in args. 
*/ @@ -559,22 +1045,34 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs) nargs * sizeof(rtas_arg_t)) != 0) return -EFAULT; + if (token == RTAS_UNKNOWN_SERVICE) + return -EINVAL; + + args.rets = &args.args[nargs]; + memset(args.rets, 0, nret * sizeof(rtas_arg_t)); + + /* Need to handle ibm,suspend_me call specially */ + if (token == ibm_suspend_me_token) { + rc = rtas_ibm_suspend_me(&args); + if (rc) + return rc; + goto copy_return; + } + buff_copy = get_errorlog_buffer(); - spin_lock_irqsave(&rtas.lock, flags); + flags = lock_rtas(); rtas.args = args; enter_rtas(__pa(&rtas.args)); args = rtas.args; - args.rets = &args.args[nargs]; - /* A -1 return code indicates that the last command couldn't be completed due to a hardware error. */ - if (args.rets[0] == -1) + if (be32_to_cpu(args.rets[0]) == -1) errbuf = __fetch_rtas_last_error(buff_copy); - spin_unlock_irqrestore(&rtas.lock, flags); + unlock_rtas(flags); if (buff_copy) { if (errbuf) @@ -582,44 +1080,18 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs) kfree(buff_copy); } + copy_return: /* Copy out args. 
*/ if (copy_to_user(uargs->args + nargs, args.args + nargs, - args.nret * sizeof(rtas_arg_t)) != 0) + nret * sizeof(rtas_arg_t)) != 0) return -EFAULT; return 0; } -#ifdef CONFIG_SMP -/* This version can't take the spinlock, because it never returns */ - -struct rtas_args rtas_stop_self_args = { - /* The token is initialized for real in setup_system() */ - .token = RTAS_UNKNOWN_SERVICE, - .nargs = 0, - .nret = 1, - .rets = &rtas_stop_self_args.args[0], -}; - -void rtas_stop_self(void) -{ - struct rtas_args *rtas_args = &rtas_stop_self_args; - - local_irq_disable(); - - BUG_ON(rtas_args->token == RTAS_UNKNOWN_SERVICE); - - printk("cpu %u (hwid %u) Ready to die...\n", - smp_processor_id(), hard_smp_processor_id()); - enter_rtas(__pa(rtas_args)); - - panic("Alas, I survived.\n"); -} -#endif - /* - * Call early during boot, before mem init or bootmem, to retreive the RTAS + * Call early during boot, before mem init or bootmem, to retrieve the RTAS * informations from the device-tree and allocate the RMO buffer for userland * accesses. 
*/ @@ -632,19 +1104,19 @@ void __init rtas_initialize(void) */ rtas.dev = of_find_node_by_name(NULL, "rtas"); if (rtas.dev) { - u32 *basep, *entryp; - u32 *sizep; + const __be32 *basep, *entryp, *sizep; - basep = (u32 *)get_property(rtas.dev, "linux,rtas-base", NULL); - sizep = (u32 *)get_property(rtas.dev, "rtas-size", NULL); + basep = of_get_property(rtas.dev, "linux,rtas-base", NULL); + sizep = of_get_property(rtas.dev, "rtas-size", NULL); if (basep != NULL && sizep != NULL) { - rtas.base = *basep; - rtas.size = *sizep; - entryp = (u32 *)get_property(rtas.dev, "linux,rtas-entry", NULL); + rtas.base = __be32_to_cpu(*basep); + rtas.size = __be32_to_cpu(*sizep); + entryp = of_get_property(rtas.dev, + "linux,rtas-entry", NULL); if (entryp == NULL) /* Ugh */ rtas.entry = rtas.base; else - rtas.entry = *entryp; + rtas.entry = __be32_to_cpu(*entryp); } else rtas.dev = NULL; } @@ -655,26 +1127,81 @@ void __init rtas_initialize(void) * the stop-self token if any */ #ifdef CONFIG_PPC64 - if (systemcfg->platform == PLATFORM_PSERIES_LPAR) - rtas_region = min(lmb.rmo_size, RTAS_INSTANTIATE_MAX); + if (machine_is(pseries) && firmware_has_feature(FW_FEATURE_LPAR)) { + rtas_region = min(ppc64_rma_size, RTAS_INSTANTIATE_MAX); + ibm_suspend_me_token = rtas_token("ibm,suspend-me"); + } #endif - rtas_rmo_buf = lmb_alloc_base(RTAS_RMOBUF_MAX, PAGE_SIZE, rtas_region); + rtas_rmo_buf = memblock_alloc_base(RTAS_RMOBUF_MAX, PAGE_SIZE, rtas_region); -#ifdef CONFIG_HOTPLUG_CPU - rtas_stop_self_args.token = rtas_token("stop-self"); -#endif /* CONFIG_HOTPLUG_CPU */ #ifdef CONFIG_RTAS_ERROR_LOGGING rtas_last_error_token = rtas_token("rtas-last-error"); #endif } +int __init early_init_dt_scan_rtas(unsigned long node, + const char *uname, int depth, void *data) +{ + const u32 *basep, *entryp, *sizep; -EXPORT_SYMBOL(rtas_token); -EXPORT_SYMBOL(rtas_call); -EXPORT_SYMBOL(rtas_data_buf); -EXPORT_SYMBOL(rtas_data_buf_lock); -EXPORT_SYMBOL(rtas_extended_busy_delay_time); 
-EXPORT_SYMBOL(rtas_get_sensor); -EXPORT_SYMBOL(rtas_get_power_level); -EXPORT_SYMBOL(rtas_set_power_level); -EXPORT_SYMBOL(rtas_set_indicator); + if (depth != 1 || strcmp(uname, "rtas") != 0) + return 0; + + basep = of_get_flat_dt_prop(node, "linux,rtas-base", NULL); + entryp = of_get_flat_dt_prop(node, "linux,rtas-entry", NULL); + sizep = of_get_flat_dt_prop(node, "rtas-size", NULL); + + if (basep && entryp && sizep) { + rtas.base = *basep; + rtas.entry = *entryp; + rtas.size = *sizep; + } + +#ifdef CONFIG_UDBG_RTAS_CONSOLE + basep = of_get_flat_dt_prop(node, "put-term-char", NULL); + if (basep) + rtas_putchar_token = *basep; + + basep = of_get_flat_dt_prop(node, "get-term-char", NULL); + if (basep) + rtas_getchar_token = *basep; + + if (rtas_putchar_token != RTAS_UNKNOWN_SERVICE && + rtas_getchar_token != RTAS_UNKNOWN_SERVICE) + udbg_init_rtas_console(); + +#endif + + /* break now */ + return 1; +} + +static arch_spinlock_t timebase_lock; +static u64 timebase = 0; + +void rtas_give_timebase(void) +{ + unsigned long flags; + + local_irq_save(flags); + hard_irq_disable(); + arch_spin_lock(&timebase_lock); + rtas_call(rtas_token("freeze-time-base"), 0, 1, NULL); + timebase = get_tb(); + arch_spin_unlock(&timebase_lock); + + while (timebase) + barrier(); + rtas_call(rtas_token("thaw-time-base"), 0, 1, NULL); + local_irq_restore(flags); +} + +void rtas_take_timebase(void) +{ + while (!timebase) + barrier(); + arch_spin_lock(&timebase_lock); + set_tb(timebase >> 32, timebase & 0xffffffff); + timebase = 0; + arch_spin_unlock(&timebase_lock); +} diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c new file mode 100644 index 00000000000..db2b482af65 --- /dev/null +++ b/arch/powerpc/kernel/rtas_flash.c @@ -0,0 +1,781 @@ +/* + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either 
version + * 2 of the License, or (at your option) any later version. + * + * /proc/powerpc/rtas/firmware_flash interface + * + * This file implements a firmware_flash interface to pump a firmware + * image into the kernel. At reboot time rtas_restart() will see the + * firmware image and flash it as it reboots (see rtas.c). + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/proc_fs.h> +#include <linux/reboot.h> +#include <asm/delay.h> +#include <asm/uaccess.h> +#include <asm/rtas.h> + +#define MODULE_VERS "1.0" +#define MODULE_NAME "rtas_flash" + +#define FIRMWARE_FLASH_NAME "firmware_flash" +#define FIRMWARE_UPDATE_NAME "firmware_update" +#define MANAGE_FLASH_NAME "manage_flash" +#define VALIDATE_FLASH_NAME "validate_flash" + +/* General RTAS Status Codes */ +#define RTAS_RC_SUCCESS 0 +#define RTAS_RC_HW_ERR -1 +#define RTAS_RC_BUSY -2 + +/* Flash image status values */ +#define FLASH_AUTH -9002 /* RTAS Not Service Authority Partition */ +#define FLASH_NO_OP -1099 /* No operation initiated by user */ +#define FLASH_IMG_SHORT -1005 /* Flash image shorter than expected */ +#define FLASH_IMG_BAD_LEN -1004 /* Bad length value in flash list block */ +#define FLASH_IMG_NULL_DATA -1003 /* Bad data value in flash list block */ +#define FLASH_IMG_READY 0 /* Firmware img ready for flash on reboot */ + +/* Manage image status values */ +#define MANAGE_AUTH -9002 /* RTAS Not Service Authority Partition */ +#define MANAGE_ACTIVE_ERR -9001 /* RTAS Cannot Overwrite Active Img */ +#define MANAGE_NO_OP -1099 /* No operation initiated by user */ +#define MANAGE_PARAM_ERR -3 /* RTAS Parameter Error */ +#define MANAGE_HW_ERR -1 /* RTAS Hardware Error */ + +/* Validate image status values */ +#define VALIDATE_AUTH -9002 /* RTAS Not Service Authority Partition */ +#define VALIDATE_NO_OP -1099 /* No operation initiated by the user */ +#define VALIDATE_INCOMPLETE -1002 /* User copied < VALIDATE_BUF_SIZE */ +#define VALIDATE_READY -1001 /* 
Firmware image ready for validation */ +#define VALIDATE_PARAM_ERR -3 /* RTAS Parameter Error */ +#define VALIDATE_HW_ERR -1 /* RTAS Hardware Error */ + +/* ibm,validate-flash-image update result tokens */ +#define VALIDATE_TMP_UPDATE 0 /* T side will be updated */ +#define VALIDATE_FLASH_AUTH 1 /* Partition does not have authority */ +#define VALIDATE_INVALID_IMG 2 /* Candidate image is not valid */ +#define VALIDATE_CUR_UNKNOWN 3 /* Current fixpack level is unknown */ +/* + * Current T side will be committed to P side before being replace with new + * image, and the new image is downlevel from current image + */ +#define VALIDATE_TMP_COMMIT_DL 4 +/* + * Current T side will be committed to P side before being replaced with new + * image + */ +#define VALIDATE_TMP_COMMIT 5 +/* + * T side will be updated with a downlevel image + */ +#define VALIDATE_TMP_UPDATE_DL 6 +/* + * The candidate image's release date is later than the system's firmware + * service entitlement date - service warranty period has expired + */ +#define VALIDATE_OUT_OF_WRNTY 7 + +/* ibm,manage-flash-image operation tokens */ +#define RTAS_REJECT_TMP_IMG 0 +#define RTAS_COMMIT_TMP_IMG 1 + +/* Array sizes */ +#define VALIDATE_BUF_SIZE 4096 +#define VALIDATE_MSG_LEN 256 +#define RTAS_MSG_MAXLEN 64 + +/* Quirk - RTAS requires 4k list length and block size */ +#define RTAS_BLKLIST_LENGTH 4096 +#define RTAS_BLK_SIZE 4096 + +struct flash_block { + char *data; + unsigned long length; +}; + +/* This struct is very similar but not identical to + * that needed by the rtas flash update. + * All we need to do for rtas is rewrite num_blocks + * into a version/length and translate the pointers + * to absolute. 
+ */ +#define FLASH_BLOCKS_PER_NODE ((RTAS_BLKLIST_LENGTH - 16) / sizeof(struct flash_block)) +struct flash_block_list { + unsigned long num_blocks; + struct flash_block_list *next; + struct flash_block blocks[FLASH_BLOCKS_PER_NODE]; +}; + +static struct flash_block_list *rtas_firmware_flash_list; + +/* Use slab cache to guarantee 4k alignment */ +static struct kmem_cache *flash_block_cache = NULL; + +#define FLASH_BLOCK_LIST_VERSION (1UL) + +/* + * Local copy of the flash block list. + * + * The rtas_firmware_flash_list varable will be + * set once the data is fully read. + * + * For convenience as we build the list we use virtual addrs, + * we do not fill in the version number, and the length field + * is treated as the number of entries currently in the block + * (i.e. not a byte count). This is all fixed when calling + * the flash routine. + */ + +/* Status int must be first member of struct */ +struct rtas_update_flash_t +{ + int status; /* Flash update status */ + struct flash_block_list *flist; /* Local copy of flash block list */ +}; + +/* Status int must be first member of struct */ +struct rtas_manage_flash_t +{ + int status; /* Returned status */ +}; + +/* Status int must be first member of struct */ +struct rtas_validate_flash_t +{ + int status; /* Returned status */ + char *buf; /* Candidate image buffer */ + unsigned int buf_size; /* Size of image buf */ + unsigned int update_results; /* Update results token */ +}; + +static struct rtas_update_flash_t rtas_update_flash_data; +static struct rtas_manage_flash_t rtas_manage_flash_data; +static struct rtas_validate_flash_t rtas_validate_flash_data; +static DEFINE_MUTEX(rtas_update_flash_mutex); +static DEFINE_MUTEX(rtas_manage_flash_mutex); +static DEFINE_MUTEX(rtas_validate_flash_mutex); + +/* Do simple sanity checks on the flash image. 
*/ +static int flash_list_valid(struct flash_block_list *flist) +{ + struct flash_block_list *f; + int i; + unsigned long block_size, image_size; + + /* Paranoid self test here. We also collect the image size. */ + image_size = 0; + for (f = flist; f; f = f->next) { + for (i = 0; i < f->num_blocks; i++) { + if (f->blocks[i].data == NULL) { + return FLASH_IMG_NULL_DATA; + } + block_size = f->blocks[i].length; + if (block_size <= 0 || block_size > RTAS_BLK_SIZE) { + return FLASH_IMG_BAD_LEN; + } + image_size += block_size; + } + } + + if (image_size < (256 << 10)) { + if (image_size < 2) + return FLASH_NO_OP; + } + + printk(KERN_INFO "FLASH: flash image with %ld bytes stored for hardware flash on reboot\n", image_size); + + return FLASH_IMG_READY; +} + +static void free_flash_list(struct flash_block_list *f) +{ + struct flash_block_list *next; + int i; + + while (f) { + for (i = 0; i < f->num_blocks; i++) + kmem_cache_free(flash_block_cache, f->blocks[i].data); + next = f->next; + kmem_cache_free(flash_block_cache, f); + f = next; + } +} + +static int rtas_flash_release(struct inode *inode, struct file *file) +{ + struct rtas_update_flash_t *const uf = &rtas_update_flash_data; + + mutex_lock(&rtas_update_flash_mutex); + + if (uf->flist) { + /* File was opened in write mode for a new flash attempt */ + /* Clear saved list */ + if (rtas_firmware_flash_list) { + free_flash_list(rtas_firmware_flash_list); + rtas_firmware_flash_list = NULL; + } + + if (uf->status != FLASH_AUTH) + uf->status = flash_list_valid(uf->flist); + + if (uf->status == FLASH_IMG_READY) + rtas_firmware_flash_list = uf->flist; + else + free_flash_list(uf->flist); + + uf->flist = NULL; + } + + mutex_unlock(&rtas_update_flash_mutex); + return 0; +} + +static size_t get_flash_status_msg(int status, char *buf) +{ + const char *msg; + size_t len; + + switch (status) { + case FLASH_AUTH: + msg = "error: this partition does not have service authority\n"; + break; + case FLASH_NO_OP: + msg = "info: no 
firmware image for flash\n"; + break; + case FLASH_IMG_SHORT: + msg = "error: flash image short\n"; + break; + case FLASH_IMG_BAD_LEN: + msg = "error: internal error bad length\n"; + break; + case FLASH_IMG_NULL_DATA: + msg = "error: internal error null data\n"; + break; + case FLASH_IMG_READY: + msg = "ready: firmware image ready for flash on reboot\n"; + break; + default: + return sprintf(buf, "error: unexpected status value %d\n", + status); + } + + len = strlen(msg); + memcpy(buf, msg, len + 1); + return len; +} + +/* Reading the proc file will show status (not the firmware contents) */ +static ssize_t rtas_flash_read_msg(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct rtas_update_flash_t *const uf = &rtas_update_flash_data; + char msg[RTAS_MSG_MAXLEN]; + size_t len; + int status; + + mutex_lock(&rtas_update_flash_mutex); + status = uf->status; + mutex_unlock(&rtas_update_flash_mutex); + + /* Read as text message */ + len = get_flash_status_msg(status, msg); + return simple_read_from_buffer(buf, count, ppos, msg, len); +} + +static ssize_t rtas_flash_read_num(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct rtas_update_flash_t *const uf = &rtas_update_flash_data; + char msg[RTAS_MSG_MAXLEN]; + int status; + + mutex_lock(&rtas_update_flash_mutex); + status = uf->status; + mutex_unlock(&rtas_update_flash_mutex); + + /* Read as number */ + sprintf(msg, "%d\n", status); + return simple_read_from_buffer(buf, count, ppos, msg, strlen(msg)); +} + +/* We could be much more efficient here. But to keep this function + * simple we allocate a page to the block list no matter how small the + * count is. If the system is low on memory it will be just as well + * that we fail.... 
+ */ +static ssize_t rtas_flash_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct rtas_update_flash_t *const uf = &rtas_update_flash_data; + char *p; + int next_free, rc; + struct flash_block_list *fl; + + mutex_lock(&rtas_update_flash_mutex); + + if (uf->status == FLASH_AUTH || count == 0) + goto out; /* discard data */ + + /* In the case that the image is not ready for flashing, the memory + * allocated for the block list will be freed upon the release of the + * proc file + */ + if (uf->flist == NULL) { + uf->flist = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL); + if (!uf->flist) + goto nomem; + } + + fl = uf->flist; + while (fl->next) + fl = fl->next; /* seek to last block_list for append */ + next_free = fl->num_blocks; + if (next_free == FLASH_BLOCKS_PER_NODE) { + /* Need to allocate another block_list */ + fl->next = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL); + if (!fl->next) + goto nomem; + fl = fl->next; + next_free = 0; + } + + if (count > RTAS_BLK_SIZE) + count = RTAS_BLK_SIZE; + p = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL); + if (!p) + goto nomem; + + if(copy_from_user(p, buffer, count)) { + kmem_cache_free(flash_block_cache, p); + rc = -EFAULT; + goto error; + } + fl->blocks[next_free].data = p; + fl->blocks[next_free].length = count; + fl->num_blocks++; +out: + mutex_unlock(&rtas_update_flash_mutex); + return count; + +nomem: + rc = -ENOMEM; +error: + mutex_unlock(&rtas_update_flash_mutex); + return rc; +} + +/* + * Flash management routines. 
+ */ +static void manage_flash(struct rtas_manage_flash_t *args_buf, unsigned int op) +{ + s32 rc; + + do { + rc = rtas_call(rtas_token("ibm,manage-flash-image"), 1, 1, + NULL, op); + } while (rtas_busy_delay(rc)); + + args_buf->status = rc; +} + +static ssize_t manage_flash_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct rtas_manage_flash_t *const args_buf = &rtas_manage_flash_data; + char msg[RTAS_MSG_MAXLEN]; + int msglen, status; + + mutex_lock(&rtas_manage_flash_mutex); + status = args_buf->status; + mutex_unlock(&rtas_manage_flash_mutex); + + msglen = sprintf(msg, "%d\n", status); + return simple_read_from_buffer(buf, count, ppos, msg, msglen); +} + +static ssize_t manage_flash_write(struct file *file, const char __user *buf, + size_t count, loff_t *off) +{ + struct rtas_manage_flash_t *const args_buf = &rtas_manage_flash_data; + static const char reject_str[] = "0"; + static const char commit_str[] = "1"; + char stkbuf[10]; + int op, rc; + + mutex_lock(&rtas_manage_flash_mutex); + + if ((args_buf->status == MANAGE_AUTH) || (count == 0)) + goto out; + + op = -1; + if (buf) { + if (count > 9) count = 9; + rc = -EFAULT; + if (copy_from_user (stkbuf, buf, count)) + goto error; + if (strncmp(stkbuf, reject_str, strlen(reject_str)) == 0) + op = RTAS_REJECT_TMP_IMG; + else if (strncmp(stkbuf, commit_str, strlen(commit_str)) == 0) + op = RTAS_COMMIT_TMP_IMG; + } + + if (op == -1) { /* buf is empty, or contains invalid string */ + rc = -EINVAL; + goto error; + } + + manage_flash(args_buf, op); +out: + mutex_unlock(&rtas_manage_flash_mutex); + return count; + +error: + mutex_unlock(&rtas_manage_flash_mutex); + return rc; +} + +/* + * Validation routines. 
+ */ +static void validate_flash(struct rtas_validate_flash_t *args_buf) +{ + int token = rtas_token("ibm,validate-flash-image"); + int update_results; + s32 rc; + + rc = 0; + do { + spin_lock(&rtas_data_buf_lock); + memcpy(rtas_data_buf, args_buf->buf, VALIDATE_BUF_SIZE); + rc = rtas_call(token, 2, 2, &update_results, + (u32) __pa(rtas_data_buf), args_buf->buf_size); + memcpy(args_buf->buf, rtas_data_buf, VALIDATE_BUF_SIZE); + spin_unlock(&rtas_data_buf_lock); + } while (rtas_busy_delay(rc)); + + args_buf->status = rc; + args_buf->update_results = update_results; +} + +static int get_validate_flash_msg(struct rtas_validate_flash_t *args_buf, + char *msg, int msglen) +{ + int n; + + if (args_buf->status >= VALIDATE_TMP_UPDATE) { + n = sprintf(msg, "%d\n", args_buf->update_results); + if ((args_buf->update_results >= VALIDATE_CUR_UNKNOWN) || + (args_buf->update_results == VALIDATE_TMP_UPDATE)) + n += snprintf(msg + n, msglen - n, "%s\n", + args_buf->buf); + } else { + n = sprintf(msg, "%d\n", args_buf->status); + } + return n; +} + +static ssize_t validate_flash_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct rtas_validate_flash_t *const args_buf = + &rtas_validate_flash_data; + char msg[VALIDATE_MSG_LEN]; + int msglen; + + mutex_lock(&rtas_validate_flash_mutex); + msglen = get_validate_flash_msg(args_buf, msg, VALIDATE_MSG_LEN); + mutex_unlock(&rtas_validate_flash_mutex); + + return simple_read_from_buffer(buf, count, ppos, msg, msglen); +} + +static ssize_t validate_flash_write(struct file *file, const char __user *buf, + size_t count, loff_t *off) +{ + struct rtas_validate_flash_t *const args_buf = + &rtas_validate_flash_data; + int rc; + + mutex_lock(&rtas_validate_flash_mutex); + + /* We are only interested in the first 4K of the + * candidate image */ + if ((*off >= VALIDATE_BUF_SIZE) || + (args_buf->status == VALIDATE_AUTH)) { + *off += count; + mutex_unlock(&rtas_validate_flash_mutex); + return count; + } + + if (*off + 
count >= VALIDATE_BUF_SIZE) { + count = VALIDATE_BUF_SIZE - *off; + args_buf->status = VALIDATE_READY; + } else { + args_buf->status = VALIDATE_INCOMPLETE; + } + + if (!access_ok(VERIFY_READ, buf, count)) { + rc = -EFAULT; + goto done; + } + if (copy_from_user(args_buf->buf + *off, buf, count)) { + rc = -EFAULT; + goto done; + } + + *off += count; + rc = count; +done: + mutex_unlock(&rtas_validate_flash_mutex); + return rc; +} + +static int validate_flash_release(struct inode *inode, struct file *file) +{ + struct rtas_validate_flash_t *const args_buf = + &rtas_validate_flash_data; + + mutex_lock(&rtas_validate_flash_mutex); + + if (args_buf->status == VALIDATE_READY) { + args_buf->buf_size = VALIDATE_BUF_SIZE; + validate_flash(args_buf); + } + + mutex_unlock(&rtas_validate_flash_mutex); + return 0; +} + +/* + * On-reboot flash update applicator. + */ +static void rtas_flash_firmware(int reboot_type) +{ + unsigned long image_size; + struct flash_block_list *f, *next, *flist; + unsigned long rtas_block_list; + int i, status, update_token; + + if (rtas_firmware_flash_list == NULL) + return; /* nothing to do */ + + if (reboot_type != SYS_RESTART) { + printk(KERN_ALERT "FLASH: firmware flash requires a reboot\n"); + printk(KERN_ALERT "FLASH: the firmware image will NOT be flashed\n"); + return; + } + + update_token = rtas_token("ibm,update-flash-64-and-reboot"); + if (update_token == RTAS_UNKNOWN_SERVICE) { + printk(KERN_ALERT "FLASH: ibm,update-flash-64-and-reboot " + "is not available -- not a service partition?\n"); + printk(KERN_ALERT "FLASH: firmware will not be flashed\n"); + return; + } + + /* + * Just before starting the firmware flash, cancel the event scan work + * to avoid any soft lockup issues. + */ + rtas_cancel_event_scan(); + + /* + * NOTE: the "first" block must be under 4GB, so we create + * an entry with no data blocks in the reserved buffer in + * the kernel data segment. 
+ */ + spin_lock(&rtas_data_buf_lock); + flist = (struct flash_block_list *)&rtas_data_buf[0]; + flist->num_blocks = 0; + flist->next = rtas_firmware_flash_list; + rtas_block_list = __pa(flist); + if (rtas_block_list >= 4UL*1024*1024*1024) { + printk(KERN_ALERT "FLASH: kernel bug...flash list header addr above 4GB\n"); + spin_unlock(&rtas_data_buf_lock); + return; + } + + printk(KERN_ALERT "FLASH: preparing saved firmware image for flash\n"); + /* Update the block_list in place. */ + rtas_firmware_flash_list = NULL; /* too hard to backout on error */ + image_size = 0; + for (f = flist; f; f = next) { + /* Translate data addrs to absolute */ + for (i = 0; i < f->num_blocks; i++) { + f->blocks[i].data = (char *)cpu_to_be64(__pa(f->blocks[i].data)); + image_size += f->blocks[i].length; + f->blocks[i].length = cpu_to_be64(f->blocks[i].length); + } + next = f->next; + /* Don't translate NULL pointer for last entry */ + if (f->next) + f->next = (struct flash_block_list *)cpu_to_be64(__pa(f->next)); + else + f->next = NULL; + /* make num_blocks into the version/length field */ + f->num_blocks = (FLASH_BLOCK_LIST_VERSION << 56) | ((f->num_blocks+1)*16); + f->num_blocks = cpu_to_be64(f->num_blocks); + } + + printk(KERN_ALERT "FLASH: flash image is %ld bytes\n", image_size); + printk(KERN_ALERT "FLASH: performing flash and reboot\n"); + rtas_progress("Flashing \n", 0x0); + rtas_progress("Please Wait... ", 0x0); + printk(KERN_ALERT "FLASH: this will take several minutes. Do not power off!\n"); + status = rtas_call(update_token, 1, 1, NULL, rtas_block_list); + switch (status) { /* should only get "bad" status */ + case 0: + printk(KERN_ALERT "FLASH: success\n"); + break; + case -1: + printk(KERN_ALERT "FLASH: hardware error. Firmware may not be not flashed\n"); + break; + case -3: + printk(KERN_ALERT "FLASH: image is corrupt or not correct for this platform. Firmware not flashed\n"); + break; + case -4: + printk(KERN_ALERT "FLASH: flash failed when partially complete. 
System may not reboot\n"); + break; + default: + printk(KERN_ALERT "FLASH: unknown flash return code %d\n", status); + break; + } + spin_unlock(&rtas_data_buf_lock); +} + +/* + * Manifest of proc files to create + */ +struct rtas_flash_file { + const char *filename; + const char *rtas_call_name; + int *status; + const struct file_operations fops; +}; + +static const struct rtas_flash_file rtas_flash_files[] = { + { + .filename = "powerpc/rtas/" FIRMWARE_FLASH_NAME, + .rtas_call_name = "ibm,update-flash-64-and-reboot", + .status = &rtas_update_flash_data.status, + .fops.read = rtas_flash_read_msg, + .fops.write = rtas_flash_write, + .fops.release = rtas_flash_release, + .fops.llseek = default_llseek, + }, + { + .filename = "powerpc/rtas/" FIRMWARE_UPDATE_NAME, + .rtas_call_name = "ibm,update-flash-64-and-reboot", + .status = &rtas_update_flash_data.status, + .fops.read = rtas_flash_read_num, + .fops.write = rtas_flash_write, + .fops.release = rtas_flash_release, + .fops.llseek = default_llseek, + }, + { + .filename = "powerpc/rtas/" VALIDATE_FLASH_NAME, + .rtas_call_name = "ibm,validate-flash-image", + .status = &rtas_validate_flash_data.status, + .fops.read = validate_flash_read, + .fops.write = validate_flash_write, + .fops.release = validate_flash_release, + .fops.llseek = default_llseek, + }, + { + .filename = "powerpc/rtas/" MANAGE_FLASH_NAME, + .rtas_call_name = "ibm,manage-flash-image", + .status = &rtas_manage_flash_data.status, + .fops.read = manage_flash_read, + .fops.write = manage_flash_write, + .fops.llseek = default_llseek, + } +}; + +static int __init rtas_flash_init(void) +{ + int i; + + if (rtas_token("ibm,update-flash-64-and-reboot") == + RTAS_UNKNOWN_SERVICE) { + pr_info("rtas_flash: no firmware flash support\n"); + return -EINVAL; + } + + rtas_validate_flash_data.buf = kzalloc(VALIDATE_BUF_SIZE, GFP_KERNEL); + if (!rtas_validate_flash_data.buf) + return -ENOMEM; + + flash_block_cache = kmem_cache_create("rtas_flash_cache", + RTAS_BLK_SIZE, 
RTAS_BLK_SIZE, 0, + NULL); + if (!flash_block_cache) { + printk(KERN_ERR "%s: failed to create block cache\n", + __func__); + goto enomem_buf; + } + + for (i = 0; i < ARRAY_SIZE(rtas_flash_files); i++) { + const struct rtas_flash_file *f = &rtas_flash_files[i]; + int token; + + if (!proc_create(f->filename, S_IRUSR | S_IWUSR, NULL, &f->fops)) + goto enomem; + + /* + * This code assumes that the status int is the first member of the + * struct + */ + token = rtas_token(f->rtas_call_name); + if (token == RTAS_UNKNOWN_SERVICE) + *f->status = FLASH_AUTH; + else + *f->status = FLASH_NO_OP; + } + + rtas_flash_term_hook = rtas_flash_firmware; + return 0; + +enomem: + while (--i >= 0) { + const struct rtas_flash_file *f = &rtas_flash_files[i]; + remove_proc_entry(f->filename, NULL); + } + + kmem_cache_destroy(flash_block_cache); +enomem_buf: + kfree(rtas_validate_flash_data.buf); + return -ENOMEM; +} + +static void __exit rtas_flash_cleanup(void) +{ + int i; + + rtas_flash_term_hook = NULL; + + if (rtas_firmware_flash_list) { + free_flash_list(rtas_firmware_flash_list); + rtas_firmware_flash_list = NULL; + } + + for (i = 0; i < ARRAY_SIZE(rtas_flash_files); i++) { + const struct rtas_flash_file *f = &rtas_flash_files[i]; + remove_proc_entry(f->filename, NULL); + } + + kmem_cache_destroy(flash_block_cache); + kfree(rtas_validate_flash_data.buf); +} + +module_init(rtas_flash_init); +module_exit(rtas_flash_cleanup); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c new file mode 100644 index 00000000000..c168337aef9 --- /dev/null +++ b/arch/powerpc/kernel/rtas_pci.c @@ -0,0 +1,335 @@ +/* + * Copyright (C) 2001 Dave Engebretsen, IBM Corporation + * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM + * + * RTAS specific routines for PCI. 
+ * + * Based on code from pci.c, chrp_pci.c and pSeries_pci.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/threads.h> +#include <linux/pci.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> + +#include <asm/io.h> +#include <asm/pgtable.h> +#include <asm/irq.h> +#include <asm/prom.h> +#include <asm/machdep.h> +#include <asm/pci-bridge.h> +#include <asm/iommu.h> +#include <asm/rtas.h> +#include <asm/mpic.h> +#include <asm/ppc-pci.h> +#include <asm/eeh.h> + +/* RTAS tokens */ +static int read_pci_config; +static int write_pci_config; +static int ibm_read_pci_config; +static int ibm_write_pci_config; + +static inline int config_access_valid(struct pci_dn *dn, int where) +{ + if (where < 256) + return 1; + if (where < 4096 && dn->pci_ext_config_space) + return 1; + + return 0; +} + +int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val) +{ + int returnval = -1; + unsigned long buid, addr; + int ret; + + if (!pdn) + return PCIBIOS_DEVICE_NOT_FOUND; + if (!config_access_valid(pdn, where)) + return PCIBIOS_BAD_REGISTER_NUMBER; + + addr = rtas_config_addr(pdn->busno, pdn->devfn, where); + buid = pdn->phb->buid; + if (buid) { + ret = rtas_call(ibm_read_pci_config, 4, 2, &returnval, + addr, BUID_HI(buid), 
BUID_LO(buid), size); + } else { + ret = rtas_call(read_pci_config, 2, 2, &returnval, addr, size); + } + *val = returnval; + + if (ret) + return PCIBIOS_DEVICE_NOT_FOUND; + + return PCIBIOS_SUCCESSFUL; +} + +static int rtas_pci_read_config(struct pci_bus *bus, + unsigned int devfn, + int where, int size, u32 *val) +{ + struct device_node *busdn, *dn; + struct pci_dn *pdn; + bool found = false; +#ifdef CONFIG_EEH + struct eeh_dev *edev; +#endif + int ret; + + /* Search only direct children of the bus */ + *val = 0xFFFFFFFF; + busdn = pci_bus_to_OF_node(bus); + for (dn = busdn->child; dn; dn = dn->sibling) { + pdn = PCI_DN(dn); + if (pdn && pdn->devfn == devfn + && of_device_is_available(dn)) { + found = true; + break; + } + } + + if (!found) + return PCIBIOS_DEVICE_NOT_FOUND; +#ifdef CONFIG_EEH + edev = of_node_to_eeh_dev(dn); + if (edev && edev->pe && edev->pe->state & EEH_PE_RESET) + return PCIBIOS_DEVICE_NOT_FOUND; +#endif + + ret = rtas_read_config(pdn, where, size, val); + if (*val == EEH_IO_ERROR_VALUE(size) && + eeh_dev_check_failure(of_node_to_eeh_dev(dn))) + return PCIBIOS_DEVICE_NOT_FOUND; + + return ret; +} + +int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val) +{ + unsigned long buid, addr; + int ret; + + if (!pdn) + return PCIBIOS_DEVICE_NOT_FOUND; + if (!config_access_valid(pdn, where)) + return PCIBIOS_BAD_REGISTER_NUMBER; + + addr = rtas_config_addr(pdn->busno, pdn->devfn, where); + buid = pdn->phb->buid; + if (buid) { + ret = rtas_call(ibm_write_pci_config, 5, 1, NULL, addr, + BUID_HI(buid), BUID_LO(buid), size, (ulong) val); + } else { + ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, (ulong)val); + } + + if (ret) + return PCIBIOS_DEVICE_NOT_FOUND; + + return PCIBIOS_SUCCESSFUL; +} + +static int rtas_pci_write_config(struct pci_bus *bus, + unsigned int devfn, + int where, int size, u32 val) +{ + struct device_node *busdn, *dn; + struct pci_dn *pdn; + bool found = false; +#ifdef CONFIG_EEH + struct eeh_dev *edev; 
+#endif + int ret; + + /* Search only direct children of the bus */ + busdn = pci_bus_to_OF_node(bus); + for (dn = busdn->child; dn; dn = dn->sibling) { + pdn = PCI_DN(dn); + if (pdn && pdn->devfn == devfn + && of_device_is_available(dn)) { + found = true; + break; + } + } + + if (!found) + return PCIBIOS_DEVICE_NOT_FOUND; +#ifdef CONFIG_EEH + edev = of_node_to_eeh_dev(dn); + if (edev && edev->pe && (edev->pe->state & EEH_PE_RESET)) + return PCIBIOS_DEVICE_NOT_FOUND; +#endif + ret = rtas_write_config(pdn, where, size, val); + + return ret; +} + +static struct pci_ops rtas_pci_ops = { + .read = rtas_pci_read_config, + .write = rtas_pci_write_config, +}; + +static int is_python(struct device_node *dev) +{ + const char *model = of_get_property(dev, "model", NULL); + + if (model && strstr(model, "Python")) + return 1; + + return 0; +} + +static void python_countermeasures(struct device_node *dev) +{ + struct resource registers; + void __iomem *chip_regs; + volatile u32 val; + + if (of_address_to_resource(dev, 0, &registers)) { + printk(KERN_ERR "Can't get address for Python workarounds !\n"); + return; + } + + /* Python's register file is 1 MB in size. 
*/ + chip_regs = ioremap(registers.start & ~(0xfffffUL), 0x100000); + + /* + * Firmware doesn't always clear this bit which is critical + * for good performance - Anton + */ + +#define PRG_CL_RESET_VALID 0x00010000 + + val = in_be32(chip_regs + 0xf6030); + if (val & PRG_CL_RESET_VALID) { + printk(KERN_INFO "Python workaround: "); + val &= ~PRG_CL_RESET_VALID; + out_be32(chip_regs + 0xf6030, val); + /* + * We must read it back for changes to + * take effect + */ + val = in_be32(chip_regs + 0xf6030); + printk("reg0: %x\n", val); + } + + iounmap(chip_regs); +} + +void __init init_pci_config_tokens(void) +{ + read_pci_config = rtas_token("read-pci-config"); + write_pci_config = rtas_token("write-pci-config"); + ibm_read_pci_config = rtas_token("ibm,read-pci-config"); + ibm_write_pci_config = rtas_token("ibm,write-pci-config"); +} + +unsigned long get_phb_buid(struct device_node *phb) +{ + struct resource r; + + if (ibm_read_pci_config == -1) + return 0; + if (of_address_to_resource(phb, 0, &r)) + return 0; + return r.start; +} + +static int phb_set_bus_ranges(struct device_node *dev, + struct pci_controller *phb) +{ + const __be32 *bus_range; + unsigned int len; + + bus_range = of_get_property(dev, "bus-range", &len); + if (bus_range == NULL || len < 2 * sizeof(int)) { + return 1; + } + + phb->first_busno = be32_to_cpu(bus_range[0]); + phb->last_busno = be32_to_cpu(bus_range[1]); + + return 0; +} + +int rtas_setup_phb(struct pci_controller *phb) +{ + struct device_node *dev = phb->dn; + + if (is_python(dev)) + python_countermeasures(dev); + + if (phb_set_bus_ranges(dev, phb)) + return 1; + + phb->ops = &rtas_pci_ops; + phb->buid = get_phb_buid(dev); + + return 0; +} + +void __init find_and_init_phbs(void) +{ + struct device_node *node; + struct pci_controller *phb; + struct device_node *root = of_find_node_by_path("/"); + + for_each_child_of_node(root, node) { + if (node->type == NULL || (strcmp(node->type, "pci") != 0 && + strcmp(node->type, "pciex") != 0)) + 
continue; + + phb = pcibios_alloc_controller(node); + if (!phb) + continue; + rtas_setup_phb(phb); + pci_process_bridge_OF_ranges(phb, node, 0); + isa_bridge_find_early(phb); + } + + of_node_put(root); + pci_devs_phb_init(); + + /* + * PCI_PROBE_ONLY and PCI_REASSIGN_ALL_BUS can be set via properties + * in chosen. + */ + if (of_chosen) { + const int *prop; + + prop = of_get_property(of_chosen, + "linux,pci-probe-only", NULL); + if (prop) { + if (*prop) + pci_add_flags(PCI_PROBE_ONLY); + else + pci_clear_flags(PCI_PROBE_ONLY); + } + +#ifdef CONFIG_PPC32 /* Will be made generic soon */ + prop = of_get_property(of_chosen, + "linux,pci-assign-all-buses", NULL); + if (prop && *prop) + pci_add_flags(PCI_REASSIGN_ALL_BUS); +#endif /* CONFIG_PPC32 */ + } +} diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c new file mode 100644 index 00000000000..e736387fee6 --- /dev/null +++ b/arch/powerpc/kernel/rtasd.c @@ -0,0 +1,602 @@ +/* + * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Communication to userspace based on kernel/printk.c + */ + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/poll.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/vmalloc.h> +#include <linux/spinlock.h> +#include <linux/cpu.h> +#include <linux/workqueue.h> +#include <linux/slab.h> + +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/rtas.h> +#include <asm/prom.h> +#include <asm/nvram.h> +#include <linux/atomic.h> +#include <asm/machdep.h> +#include <asm/topology.h> + + +static DEFINE_SPINLOCK(rtasd_log_lock); + +static DECLARE_WAIT_QUEUE_HEAD(rtas_log_wait); + +static char *rtas_log_buf; +static unsigned long rtas_log_start; +static unsigned long rtas_log_size; + +static int surveillance_timeout = -1; + +static unsigned int rtas_error_log_max; +static unsigned int rtas_error_log_buffer_max; + +/* RTAS service tokens */ +static unsigned int event_scan; +static unsigned int rtas_event_scan_rate; + +static int full_rtas_msgs = 0; + +/* Stop logging to nvram after first fatal error */ +static int logging_enabled; /* Until we initialize everything, + * make sure we don't try logging + * anything */ +static int error_log_cnt; + +/* + * Since we use 32 bit RTAS, the physical address of this must be below + * 4G or else bad things happen. Allocate this in the kernel data and + * make it big enough. 
+ */ +static unsigned char logdata[RTAS_ERROR_LOG_MAX]; + +static char *rtas_type[] = { + "Unknown", "Retry", "TCE Error", "Internal Device Failure", + "Timeout", "Data Parity", "Address Parity", "Cache Parity", + "Address Invalid", "ECC Uncorrected", "ECC Corrupted", +}; + +static char *rtas_event_type(int type) +{ + if ((type > 0) && (type < 11)) + return rtas_type[type]; + + switch (type) { + case RTAS_TYPE_EPOW: + return "EPOW"; + case RTAS_TYPE_PLATFORM: + return "Platform Error"; + case RTAS_TYPE_IO: + return "I/O Event"; + case RTAS_TYPE_INFO: + return "Platform Information Event"; + case RTAS_TYPE_DEALLOC: + return "Resource Deallocation Event"; + case RTAS_TYPE_DUMP: + return "Dump Notification Event"; + case RTAS_TYPE_PRRN: + return "Platform Resource Reassignment Event"; + } + + return rtas_type[0]; +} + +/* To see this info, grep RTAS /var/log/messages and each entry + * will be collected together with obvious begin/end. + * There will be a unique identifier on the begin and end lines. + * This will persist across reboots. + * + * format of error logs returned from RTAS: + * bytes (size) : contents + * -------------------------------------------------------- + * 0-7 (8) : rtas_error_log + * 8-47 (40) : extended info + * 48-51 (4) : vendor id + * 52-1023 (vendor specific) : location code and debug data + */ +static void printk_log_rtas(char *buf, int len) +{ + + int i,j,n = 0; + int perline = 16; + char buffer[64]; + char * str = "RTAS event"; + + if (full_rtas_msgs) { + printk(RTAS_DEBUG "%d -------- %s begin --------\n", + error_log_cnt, str); + + /* + * Print perline bytes on each line, each line will start + * with RTAS and a changing number, so syslogd will + * print lines that are otherwise the same. Separate every + * 4 bytes with a space. 
+ */ + for (i = 0; i < len; i++) { + j = i % perline; + if (j == 0) { + memset(buffer, 0, sizeof(buffer)); + n = sprintf(buffer, "RTAS %d:", i/perline); + } + + if ((i % 4) == 0) + n += sprintf(buffer+n, " "); + + n += sprintf(buffer+n, "%02x", (unsigned char)buf[i]); + + if (j == (perline-1)) + printk(KERN_DEBUG "%s\n", buffer); + } + if ((i % perline) != 0) + printk(KERN_DEBUG "%s\n", buffer); + + printk(RTAS_DEBUG "%d -------- %s end ----------\n", + error_log_cnt, str); + } else { + struct rtas_error_log *errlog = (struct rtas_error_log *)buf; + + printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n", + error_log_cnt, rtas_event_type(rtas_error_type(errlog)), + rtas_error_severity(errlog)); + } +} + +static int log_rtas_len(char * buf) +{ + int len; + struct rtas_error_log *err; + uint32_t extended_log_length; + + /* rtas fixed header */ + len = 8; + err = (struct rtas_error_log *)buf; + extended_log_length = rtas_error_extended_log_length(err); + if (rtas_error_extended(err) && extended_log_length) { + + /* extended header */ + len += extended_log_length; + } + + if (rtas_error_log_max == 0) + rtas_error_log_max = rtas_get_error_log_max(); + + if (len > rtas_error_log_max) + len = rtas_error_log_max; + + return len; +} + +/* + * First write to nvram, if fatal error, that is the only + * place we log the info. The error will be picked up + * on the next reboot by rtasd. If not fatal, run the + * method for the type of error. Currently, only RTAS + * errors have methods implemented, but in the future + * there might be a need to store data in nvram before a + * call to panic(). + * + * XXX We write to nvram periodically, to indicate error has + * been written and sync'd, but there is a possibility + * that if we don't shutdown correctly, a duplicate error + * record will be created on next reboot. 
+ */ +void pSeries_log_error(char *buf, unsigned int err_type, int fatal) +{ + unsigned long offset; + unsigned long s; + int len = 0; + + pr_debug("rtasd: logging event\n"); + if (buf == NULL) + return; + + spin_lock_irqsave(&rtasd_log_lock, s); + + /* get length and increase count */ + switch (err_type & ERR_TYPE_MASK) { + case ERR_TYPE_RTAS_LOG: + len = log_rtas_len(buf); + if (!(err_type & ERR_FLAG_BOOT)) + error_log_cnt++; + break; + case ERR_TYPE_KERNEL_PANIC: + default: + WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ + spin_unlock_irqrestore(&rtasd_log_lock, s); + return; + } + +#ifdef CONFIG_PPC64 + /* Write error to NVRAM */ + if (logging_enabled && !(err_type & ERR_FLAG_BOOT)) + nvram_write_error_log(buf, len, err_type, error_log_cnt); +#endif /* CONFIG_PPC64 */ + + /* + * rtas errors can occur during boot, and we do want to capture + * those somewhere, even if nvram isn't ready (why not?), and even + * if rtasd isn't ready. Put them into the boot log, at least. + */ + if ((err_type & ERR_TYPE_MASK) == ERR_TYPE_RTAS_LOG) + printk_log_rtas(buf, len); + + /* Check to see if we need to or have stopped logging */ + if (fatal || !logging_enabled) { + logging_enabled = 0; + WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ + spin_unlock_irqrestore(&rtasd_log_lock, s); + return; + } + + /* call type specific method for error */ + switch (err_type & ERR_TYPE_MASK) { + case ERR_TYPE_RTAS_LOG: + offset = rtas_error_log_buffer_max * + ((rtas_log_start+rtas_log_size) & LOG_NUMBER_MASK); + + /* First copy over sequence number */ + memcpy(&rtas_log_buf[offset], (void *) &error_log_cnt, sizeof(int)); + + /* Second copy over error log data */ + offset += sizeof(int); + memcpy(&rtas_log_buf[offset], buf, len); + + if (rtas_log_size < LOG_NUMBER) + rtas_log_size += 1; + else + rtas_log_start += 1; + + WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ + spin_unlock_irqrestore(&rtasd_log_lock, s); + wake_up_interruptible(&rtas_log_wait); + break; + case 
ERR_TYPE_KERNEL_PANIC: + default: + WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ + spin_unlock_irqrestore(&rtasd_log_lock, s); + return; + } +} + +#ifdef CONFIG_PPC_PSERIES +static s32 prrn_update_scope; + +static void prrn_work_fn(struct work_struct *work) +{ + /* + * For PRRN, we must pass the negative of the scope value in + * the RTAS event. + */ + pseries_devicetree_update(-prrn_update_scope); +} + +static DECLARE_WORK(prrn_work, prrn_work_fn); + +void prrn_schedule_update(u32 scope) +{ + flush_work(&prrn_work); + prrn_update_scope = scope; + schedule_work(&prrn_work); +} + +static void handle_rtas_event(const struct rtas_error_log *log) +{ + if (rtas_error_type(log) != RTAS_TYPE_PRRN || !prrn_is_enabled()) + return; + + /* For PRRN Events the extended log length is used to denote + * the scope for calling rtas update-nodes. + */ + prrn_schedule_update(rtas_error_extended_log_length(log)); +} + +#else + +static void handle_rtas_event(const struct rtas_error_log *log) +{ + return; +} + +#endif + +static int rtas_log_open(struct inode * inode, struct file * file) +{ + return 0; +} + +static int rtas_log_release(struct inode * inode, struct file * file) +{ + return 0; +} + +/* This will check if all events are logged, if they are then, we + * know that we can safely clear the events in NVRAM. + * Next we'll sit and wait for something else to log. 
+ */ +static ssize_t rtas_log_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + int error; + char *tmp; + unsigned long s; + unsigned long offset; + + if (!buf || count < rtas_error_log_buffer_max) + return -EINVAL; + + count = rtas_error_log_buffer_max; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + spin_lock_irqsave(&rtasd_log_lock, s); + + /* if it's 0, then we know we got the last one (the one in NVRAM) */ + while (rtas_log_size == 0) { + if (file->f_flags & O_NONBLOCK) { + spin_unlock_irqrestore(&rtasd_log_lock, s); + error = -EAGAIN; + goto out; + } + + if (!logging_enabled) { + spin_unlock_irqrestore(&rtasd_log_lock, s); + error = -ENODATA; + goto out; + } +#ifdef CONFIG_PPC64 + nvram_clear_error_log(); +#endif /* CONFIG_PPC64 */ + + spin_unlock_irqrestore(&rtasd_log_lock, s); + error = wait_event_interruptible(rtas_log_wait, rtas_log_size); + if (error) + goto out; + spin_lock_irqsave(&rtasd_log_lock, s); + } + + offset = rtas_error_log_buffer_max * (rtas_log_start & LOG_NUMBER_MASK); + memcpy(tmp, &rtas_log_buf[offset], count); + + rtas_log_start += 1; + rtas_log_size -= 1; + spin_unlock_irqrestore(&rtasd_log_lock, s); + + error = copy_to_user(buf, tmp, count) ? 
-EFAULT : count; +out: + kfree(tmp); + return error; +} + +static unsigned int rtas_log_poll(struct file *file, poll_table * wait) +{ + poll_wait(file, &rtas_log_wait, wait); + if (rtas_log_size) + return POLLIN | POLLRDNORM; + return 0; +} + +static const struct file_operations proc_rtas_log_operations = { + .read = rtas_log_read, + .poll = rtas_log_poll, + .open = rtas_log_open, + .release = rtas_log_release, + .llseek = noop_llseek, +}; + +static int enable_surveillance(int timeout) +{ + int error; + + error = rtas_set_indicator(SURVEILLANCE_TOKEN, 0, timeout); + + if (error == 0) + return 0; + + if (error == -EINVAL) { + printk(KERN_DEBUG "rtasd: surveillance not supported\n"); + return 0; + } + + printk(KERN_ERR "rtasd: could not update surveillance\n"); + return -1; +} + +static void do_event_scan(void) +{ + int error; + do { + memset(logdata, 0, rtas_error_log_max); + error = rtas_call(event_scan, 4, 1, NULL, + RTAS_EVENT_SCAN_ALL_EVENTS, 0, + __pa(logdata), rtas_error_log_max); + if (error == -1) { + printk(KERN_ERR "event-scan failed\n"); + break; + } + + if (error == 0) { + pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0); + handle_rtas_event((struct rtas_error_log *)logdata); + } + + } while(error == 0); +} + +static void rtas_event_scan(struct work_struct *w); +DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan); + +/* + * Delay should be at least one second since some machines have problems if + * we call event-scan too quickly. + */ +static unsigned long event_scan_delay = 1*HZ; +static int first_pass = 1; + +static void rtas_event_scan(struct work_struct *w) +{ + unsigned int cpu; + + do_event_scan(); + + get_online_cpus(); + + /* raw_ OK because just using CPU as starting point. 
*/ + cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (cpu >= nr_cpu_ids) { + cpu = cpumask_first(cpu_online_mask); + + if (first_pass) { + first_pass = 0; + event_scan_delay = 30*HZ/rtas_event_scan_rate; + + if (surveillance_timeout != -1) { + pr_debug("rtasd: enabling surveillance\n"); + enable_surveillance(surveillance_timeout); + pr_debug("rtasd: surveillance enabled\n"); + } + } + } + + schedule_delayed_work_on(cpu, &event_scan_work, + __round_jiffies_relative(event_scan_delay, cpu)); + + put_online_cpus(); +} + +#ifdef CONFIG_PPC64 +static void retreive_nvram_error_log(void) +{ + unsigned int err_type ; + int rc ; + + /* See if we have any error stored in NVRAM */ + memset(logdata, 0, rtas_error_log_max); + rc = nvram_read_error_log(logdata, rtas_error_log_max, + &err_type, &error_log_cnt); + /* We can use rtas_log_buf now */ + logging_enabled = 1; + if (!rc) { + if (err_type != ERR_FLAG_ALREADY_LOGGED) { + pSeries_log_error(logdata, err_type | ERR_FLAG_BOOT, 0); + } + } +} +#else /* CONFIG_PPC64 */ +static void retreive_nvram_error_log(void) +{ +} +#endif /* CONFIG_PPC64 */ + +static void start_event_scan(void) +{ + printk(KERN_DEBUG "RTAS daemon started\n"); + pr_debug("rtasd: will sleep for %d milliseconds\n", + (30000 / rtas_event_scan_rate)); + + /* Retrieve errors from nvram if any */ + retreive_nvram_error_log(); + + schedule_delayed_work_on(cpumask_first(cpu_online_mask), + &event_scan_work, event_scan_delay); +} + +/* Cancel the rtas event scan work */ +void rtas_cancel_event_scan(void) +{ + cancel_delayed_work_sync(&event_scan_work); +} +EXPORT_SYMBOL_GPL(rtas_cancel_event_scan); + +static int __init rtas_init(void) +{ + struct proc_dir_entry *entry; + + if (!machine_is(pseries) && !machine_is(chrp)) + return 0; + + /* No RTAS */ + event_scan = rtas_token("event-scan"); + if (event_scan == RTAS_UNKNOWN_SERVICE) { + printk(KERN_INFO "rtasd: No event-scan on system\n"); + return -ENODEV; + } + + rtas_event_scan_rate = 
rtas_token("rtas-event-scan-rate"); + if (rtas_event_scan_rate == RTAS_UNKNOWN_SERVICE) { + printk(KERN_ERR "rtasd: no rtas-event-scan-rate on system\n"); + return -ENODEV; + } + + if (!rtas_event_scan_rate) { + /* Broken firmware: take a rate of zero to mean don't scan */ + printk(KERN_DEBUG "rtasd: scan rate is 0, not scanning\n"); + return 0; + } + + /* Make room for the sequence number */ + rtas_error_log_max = rtas_get_error_log_max(); + rtas_error_log_buffer_max = rtas_error_log_max + sizeof(int); + + rtas_log_buf = vmalloc(rtas_error_log_buffer_max*LOG_NUMBER); + if (!rtas_log_buf) { + printk(KERN_ERR "rtasd: no memory\n"); + return -ENOMEM; + } + + entry = proc_create("powerpc/rtas/error_log", S_IRUSR, NULL, + &proc_rtas_log_operations); + if (!entry) + printk(KERN_ERR "Failed to create error_log proc entry\n"); + + start_event_scan(); + + return 0; +} +__initcall(rtas_init); + +static int __init surveillance_setup(char *str) +{ + int i; + + /* We only do surveillance on pseries */ + if (!machine_is(pseries)) + return 0; + + if (get_option(&str,&i)) { + if (i >= 0 && i <= 255) + surveillance_timeout = i; + } + + return 1; +} +__setup("surveillance=", surveillance_setup); + +static int __init rtasmsgs_setup(char *str) +{ + if (strcmp(str, "on") == 0) + full_rtas_msgs = 1; + else if (strcmp(str, "off") == 0) + full_rtas_msgs = 0; + + return 1; +} +__setup("rtasmsgs=", rtasmsgs_setup); diff --git a/arch/powerpc/kernel/semaphore.c b/arch/powerpc/kernel/semaphore.c deleted file mode 100644 index 2f8c3c95139..00000000000 --- a/arch/powerpc/kernel/semaphore.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * PowerPC-specific semaphore code. - * - * Copyright (C) 1999 Cort Dougan <cort@cs.nmt.edu> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - * April 2001 - Reworked by Paul Mackerras <paulus@samba.org> - * to eliminate the SMP races in the old version between the updates - * of `count' and `waking'. Now we use negative `count' values to - * indicate that some process(es) are waiting for the semaphore. - */ - -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/module.h> - -#include <asm/atomic.h> -#include <asm/semaphore.h> -#include <asm/errno.h> - -/* - * Atomically update sem->count. - * This does the equivalent of the following: - * - * old_count = sem->count; - * tmp = MAX(old_count, 0) + incr; - * sem->count = tmp; - * return old_count; - */ -static inline int __sem_update_count(struct semaphore *sem, int incr) -{ - int old_count, tmp; - - __asm__ __volatile__("\n" -"1: lwarx %0,0,%3\n" -" srawi %1,%0,31\n" -" andc %1,%0,%1\n" -" add %1,%1,%4\n" - PPC405_ERR77(0,%3) -" stwcx. %1,0,%3\n" -" bne 1b" - : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) - : "r" (&sem->count), "r" (incr), "m" (sem->count) - : "cc"); - - return old_count; -} - -void __up(struct semaphore *sem) -{ - /* - * Note that we incremented count in up() before we came here, - * but that was ineffective since the result was <= 0, and - * any negative value of count is equivalent to 0. - * This ends up setting count to 1, unless count is now > 0 - * (i.e. because some other cpu has called up() in the meantime), - * in which case we just increment count. - */ - __sem_update_count(sem, 1); - wake_up(&sem->wait); -} -EXPORT_SYMBOL(__up); - -/* - * Note that when we come in to __down or __down_interruptible, - * we have already decremented count, but that decrement was - * ineffective since the result was < 0, and any negative value - * of count is equivalent to 0. - * Thus it is only when we decrement count from some value > 0 - * that we have actually got the semaphore. 
- */ -void __sched __down(struct semaphore *sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); - add_wait_queue_exclusive(&sem->wait, &wait); - - /* - * Try to get the semaphore. If the count is > 0, then we've - * got the semaphore; we decrement count and exit the loop. - * If the count is 0 or negative, we set it to -1, indicating - * that we are asleep, and then sleep. - */ - while (__sem_update_count(sem, -1) <= 0) { - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } - remove_wait_queue(&sem->wait, &wait); - __set_task_state(tsk, TASK_RUNNING); - - /* - * If there are any more sleepers, wake one of them up so - * that it can either get the semaphore, or set count to -1 - * indicating that there are still processes sleeping. - */ - wake_up(&sem->wait); -} -EXPORT_SYMBOL(__down); - -int __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - __set_task_state(tsk, TASK_INTERRUPTIBLE); - add_wait_queue_exclusive(&sem->wait, &wait); - - while (__sem_update_count(sem, -1) <= 0) { - if (signal_pending(current)) { - /* - * A signal is pending - give up trying. - * Set sem->count to 0 if it is negative, - * since we are no longer sleeping. - */ - __sem_update_count(sem, 0); - retval = -EINTR; - break; - } - schedule(); - set_task_state(tsk, TASK_INTERRUPTIBLE); - } - remove_wait_queue(&sem->wait, &wait); - __set_task_state(tsk, TASK_RUNNING); - - wake_up(&sem->wait); - return retval; -} -EXPORT_SYMBOL(__down_interruptible); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 1292460fcde..e5b022c55cc 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -9,8 +9,10 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
*/ -#include <linux/config.h> -#include <linux/module.h> + +#undef DEBUG + +#include <linux/export.h> #include <linux/string.h> #include <linux/sched.h> #include <linux/init.h> @@ -18,21 +20,26 @@ #include <linux/reboot.h> #include <linux/delay.h> #include <linux/initrd.h> -#include <linux/ide.h> +#include <linux/platform_device.h> #include <linux/seq_file.h> #include <linux/ioport.h> #include <linux/console.h> -#include <linux/utsname.h> -#include <linux/tty.h> +#include <linux/screen_info.h> #include <linux/root_dev.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/unistd.h> #include <linux/serial.h> #include <linux/serial_8250.h> +#include <linux/debugfs.h> +#include <linux/percpu.h> +#include <linux/memblock.h> +#include <linux/of_platform.h> #include <asm/io.h> +#include <asm/paca.h> #include <asm/prom.h> #include <asm/processor.h> +#include <asm/vdso_datapage.h> #include <asm/pgtable.h> #include <asm/smp.h> #include <asm/elf.h> @@ -40,26 +47,42 @@ #include <asm/time.h> #include <asm/cputable.h> #include <asm/sections.h> +#include <asm/firmware.h> #include <asm/btext.h> #include <asm/nvram.h> #include <asm/setup.h> -#include <asm/system.h> #include <asm/rtas.h> #include <asm/iommu.h> #include <asm/serial.h> #include <asm/cache.h> #include <asm/page.h> #include <asm/mmu.h> -#include <asm/lmb.h> - -#undef DEBUG +#include <asm/xmon.h> +#include <asm/cputhreads.h> +#include <mm/mmu_decl.h> +#include <asm/fadump.h> #ifdef DEBUG +#include <asm/udbg.h> #define DBG(fmt...) udbg_printf(fmt) #else #define DBG(fmt...) #endif +/* The main machine-dep calls structure + */ +struct machdep_calls ppc_md; +EXPORT_SYMBOL(ppc_md); +struct machdep_calls *machine_id; +EXPORT_SYMBOL(machine_id); + +int boot_cpuid = -1; +EXPORT_SYMBOL_GPL(boot_cpuid); + +unsigned long klimit = (unsigned long) _end; + +char cmd_line[COMMAND_LINE_SIZE]; + /* * This still seems to be needed... 
-- paulus */ @@ -72,6 +95,12 @@ struct screen_info screen_info = { .orig_video_points = 16 }; +/* Variables required to store legacy IO irq routing */ +int of_i8042_kbd_irq; +EXPORT_SYMBOL_GPL(of_i8042_kbd_irq); +int of_i8042_aux_irq; +EXPORT_SYMBOL_GPL(of_i8042_aux_irq); + #ifdef __DO_IRQ_CANON /* XXX should go elsewhere eventually */ int ppc_do_canonicalize_irqs; @@ -81,14 +110,23 @@ EXPORT_SYMBOL(ppc_do_canonicalize_irqs); /* also used by kexec */ void machine_shutdown(void) { - if (ppc_md.nvram_sync) - ppc_md.nvram_sync(); +#ifdef CONFIG_FA_DUMP + /* + * if fadump is active, cleanup the fadump registration before we + * shutdown. + */ + fadump_cleanup(); +#endif + + if (ppc_md.machine_shutdown) + ppc_md.machine_shutdown(); } void machine_restart(char *cmd) { machine_shutdown(); - ppc_md.restart(cmd); + if (ppc_md.restart) + ppc_md.restart(cmd); #ifdef CONFIG_SMP smp_send_stop(); #endif @@ -100,7 +138,8 @@ void machine_restart(char *cmd) void machine_power_off(void) { machine_shutdown(); - ppc_md.power_off(); + if (ppc_md.power_off) + ppc_md.power_off(); #ifdef CONFIG_SMP smp_send_stop(); #endif @@ -117,7 +156,8 @@ EXPORT_SYMBOL_GPL(pm_power_off); void machine_halt(void) { machine_shutdown(); - ppc_md.halt(); + if (ppc_md.halt) + ppc_md.halt(); #ifdef CONFIG_SMP smp_send_stop(); #endif @@ -133,34 +173,49 @@ extern u32 cpu_temp_both(unsigned long cpu); #endif /* CONFIG_TAU */ #ifdef CONFIG_SMP -DEFINE_PER_CPU(unsigned int, pvr); +DEFINE_PER_CPU(unsigned int, cpu_pvr); +#endif + +static void show_cpuinfo_summary(struct seq_file *m) +{ + struct device_node *root; + const char *model = NULL; +#if defined(CONFIG_SMP) && defined(CONFIG_PPC32) + unsigned long bogosum = 0; + int i; + for_each_online_cpu(i) + bogosum += loops_per_jiffy; + seq_printf(m, "total bogomips\t: %lu.%02lu\n", + bogosum/(500000/HZ), bogosum/(5000/HZ) % 100); +#endif /* CONFIG_SMP && CONFIG_PPC32 */ + seq_printf(m, "timebase\t: %lu\n", ppc_tb_freq); + if (ppc_md.name) + seq_printf(m, "platform\t: 
%s\n", ppc_md.name); + root = of_find_node_by_path("/"); + if (root) + model = of_get_property(root, "model", NULL); + if (model) + seq_printf(m, "model\t\t: %s\n", model); + of_node_put(root); + + if (ppc_md.show_cpuinfo != NULL) + ppc_md.show_cpuinfo(m); + +#ifdef CONFIG_PPC32 + /* Display the amount of memory */ + seq_printf(m, "Memory\t\t: %d MB\n", + (unsigned int)(total_memory / (1024 * 1024))); #endif +} static int show_cpuinfo(struct seq_file *m, void *v) { unsigned long cpu_id = (unsigned long)v - 1; unsigned int pvr; + unsigned long proc_freq; unsigned short maj; unsigned short min; - if (cpu_id == NR_CPUS) { -#if defined(CONFIG_SMP) && defined(CONFIG_PPC32) - unsigned long bogosum = 0; - int i; - for (i = 0; i < NR_CPUS; ++i) - if (cpu_online(i)) - bogosum += loops_per_jiffy; - seq_printf(m, "total bogomips\t: %lu.%02lu\n", - bogosum/(500000/HZ), bogosum/(5000/HZ) % 100); -#endif /* CONFIG_SMP && CONFIG_PPC32 */ - seq_printf(m, "timebase\t: %lu\n", ppc_tb_freq); - - if (ppc_md.show_cpuinfo != NULL) - ppc_md.show_cpuinfo(m); - - return 0; - } - /* We only show online cpus: disable preempt (overzealous, I * knew) to prevent cpu going down. 
*/ preempt_disable(); @@ -170,11 +225,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) } #ifdef CONFIG_SMP -#ifdef CONFIG_PPC64 /* XXX for now */ - pvr = per_cpu(pvr, cpu_id); -#else - pvr = cpu_data[cpu_id].pvr; -#endif + pvr = per_cpu(cpu_pvr, cpu_id); #else pvr = mfspr(SPRN_PVR); #endif @@ -201,11 +252,11 @@ static int show_cpuinfo(struct seq_file *m, void *v) #ifdef CONFIG_TAU_AVERAGE /* more straightforward, but potentially misleading */ seq_printf(m, "temperature \t: %u C (uncalibrated)\n", - cpu_temp(i)); + cpu_temp(cpu_id)); #else /* show the actual temp sensor range */ u32 temp; - temp = cpu_temp_both(i); + temp = cpu_temp_both(cpu_id); seq_printf(m, "temperature \t: %u-%u C (uncalibrated)\n", temp & 0xff, temp >> 16); #endif @@ -213,12 +264,19 @@ static int show_cpuinfo(struct seq_file *m, void *v) #endif /* CONFIG_TAU */ /* - * Assume here that all clock rates are the same in a - * smp system. -- Cort + * Platforms that have variable clock rates, should implement + * the method ppc_md.get_proc_freq() that reports the clock + * rate of a given cpu. The rest can use ppc_proc_freq to + * report the clock rate that is same across all cpus. 
*/ - if (ppc_proc_freq) + if (ppc_md.get_proc_freq) + proc_freq = ppc_md.get_proc_freq(cpu_id); + else + proc_freq = ppc_proc_freq; + + if (proc_freq) seq_printf(m, "clock\t\t: %lu.%06luMHz\n", - ppc_proc_freq / 1000000, ppc_proc_freq % 1000000); + proc_freq / 1000000, proc_freq % 1000000); if (ppc_md.show_percpuinfo != NULL) ppc_md.show_percpuinfo(m, cpu_id); @@ -226,8 +284,21 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* If we are a Freescale core do a simple check so * we dont have to keep adding cases in the future */ if (PVR_VER(pvr) & 0x8000) { - maj = PVR_MAJ(pvr); - min = PVR_MIN(pvr); + switch (PVR_VER(pvr)) { + case 0x8000: /* 7441/7450/7451, Voyager */ + case 0x8001: /* 7445/7455, Apollo 6 */ + case 0x8002: /* 7447/7457, Apollo 7 */ + case 0x8003: /* 7447A, Apollo 7 PM */ + case 0x8004: /* 7448, Apollo 8 */ + case 0x800c: /* 7410, Nitro */ + maj = ((pvr >> 8) & 0xF); + min = PVR_MIN(pvr); + break; + default: /* e500/book-e */ + maj = PVR_MAJ(pvr); + min = PVR_MIN(pvr); + break; + } } else { switch (PVR_VER(pvr)) { case 0x0020: /* 403 family */ @@ -259,19 +330,28 @@ static int show_cpuinfo(struct seq_file *m, void *v) #endif preempt_enable(); + + /* If this is the last cpu, print the summary */ + if (cpumask_next(cpu_id, cpu_online_mask) >= nr_cpu_ids) + show_cpuinfo_summary(m); + return 0; } static void *c_start(struct seq_file *m, loff_t *pos) { - unsigned long i = *pos; - - return i <= NR_CPUS ? 
(void *)(i + 1) : NULL; + if (*pos == 0) /* just in case, cpu 0 is not the first */ + *pos = cpumask_first(cpu_online_mask); + else + *pos = cpumask_next(*pos - 1, cpu_online_mask); + if ((*pos) < nr_cpu_ids) + return (void *)(unsigned long)(*pos + 1); + return NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { - ++*pos; + (*pos)++; return c_start(m, pos); } @@ -279,132 +359,391 @@ static void c_stop(struct seq_file *m, void *v) { } -struct seq_operations cpuinfo_op = { +const struct seq_operations cpuinfo_op = { .start =c_start, .next = c_next, .stop = c_stop, .show = show_cpuinfo, }; -#ifdef CONFIG_PPC_MULTIPLATFORM -static int __init set_preferred_console(void) +void __init check_for_initrd(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD + DBG(" -> check_for_initrd() initrd_start=0x%lx initrd_end=0x%lx\n", + initrd_start, initrd_end); + + /* If we were passed an initrd, set the ROOT_DEV properly if the values + * look sensible. If not, clear initrd reference. + */ + if (is_kernel_addr(initrd_start) && is_kernel_addr(initrd_end) && + initrd_end > initrd_start) + ROOT_DEV = Root_RAM0; + else + initrd_start = initrd_end = 0; + + if (initrd_start) + printk("Found initrd at 0x%lx:0x%lx\n", initrd_start, initrd_end); + + DBG(" <- check_for_initrd()\n"); +#endif /* CONFIG_BLK_DEV_INITRD */ +} + +#ifdef CONFIG_SMP + +int threads_per_core, threads_per_subcore, threads_shift; +cpumask_t threads_core_mask; +EXPORT_SYMBOL_GPL(threads_per_core); +EXPORT_SYMBOL_GPL(threads_per_subcore); +EXPORT_SYMBOL_GPL(threads_shift); +EXPORT_SYMBOL_GPL(threads_core_mask); + +static void __init cpu_init_thread_core_maps(int tpc) { - struct device_node *prom_stdout = NULL; - char *name; - u32 *spd; - int offset = 0; + int i; + + threads_per_core = tpc; + threads_per_subcore = tpc; + cpumask_clear(&threads_core_mask); + + /* This implementation only supports power of 2 number of threads + * for simplicity and performance + */ + threads_shift = ilog2(tpc); + BUG_ON(tpc != (1 << 
threads_shift)); + + for (i = 0; i < tpc; i++) + cpumask_set_cpu(i, &threads_core_mask); + + printk(KERN_INFO "CPU maps initialized for %d thread%s per core\n", + tpc, tpc > 1 ? "s" : ""); + printk(KERN_DEBUG " (thread shift is %d)\n", threads_shift); +} + + +/** + * setup_cpu_maps - initialize the following cpu maps: + * cpu_possible_mask + * cpu_present_mask + * + * Having the possible map set up early allows us to restrict allocations + * of things like irqstacks to nr_cpu_ids rather than NR_CPUS. + * + * We do not initialize the online map here; cpus set their own bits in + * cpu_online_mask as they come up. + * + * This function is valid only for Open Firmware systems. finish_device_tree + * must be called before using this. + * + * While we're here, we may as well set the "physical" cpu ids in the paca. + * + * NOTE: This must match the parsing done in early_init_dt_scan_cpus. + */ +void __init smp_setup_cpu_maps(void) +{ + struct device_node *dn = NULL; + int cpu = 0; + int nthreads = 1; + + DBG("smp_setup_cpu_maps()\n"); + + while ((dn = of_find_node_by_type(dn, "cpu")) && cpu < nr_cpu_ids) { + const __be32 *intserv; + __be32 cpu_be; + int j, len; + + DBG(" * %s...\n", dn->full_name); + + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", + &len); + if (intserv) { + nthreads = len / sizeof(int); + DBG(" ibm,ppc-interrupt-server#s -> %d threads\n", + nthreads); + } else { + DBG(" no ibm,ppc-interrupt-server#s -> 1 thread\n"); + intserv = of_get_property(dn, "reg", NULL); + if (!intserv) { + cpu_be = cpu_to_be32(cpu); + intserv = &cpu_be; /* assume logical == phys */ + } + } - DBG(" -> set_preferred_console()\n"); + for (j = 0; j < nthreads && cpu < nr_cpu_ids; j++) { + bool avail; - /* The user has requested a console so this is already set up. 
*/ - if (strstr(saved_command_line, "console=")) { - DBG(" console was specified !\n"); - return -EBUSY; + DBG(" thread %d -> cpu %d (hard id %d)\n", + j, cpu, be32_to_cpu(intserv[j])); + + avail = of_device_is_available(dn); + if (!avail) + avail = !of_property_match_string(dn, + "enable-method", "spin-table"); + + set_cpu_present(cpu, avail); + set_hard_smp_processor_id(cpu, be32_to_cpu(intserv[j])); + set_cpu_possible(cpu, true); + cpu++; + } } - if (!of_chosen) { - DBG(" of_chosen is NULL !\n"); - return -ENODEV; + /* If no SMT supported, nthreads is forced to 1 */ + if (!cpu_has_feature(CPU_FTR_SMT)) { + DBG(" SMT disabled ! nthreads forced to 1\n"); + nthreads = 1; } - /* We are getting a weird phandle from OF ... */ - /* ... So use the full path instead */ - name = (char *)get_property(of_chosen, "linux,stdout-path", NULL); - if (name == NULL) { - DBG(" no linux,stdout-path !\n"); - return -ENODEV; + +#ifdef CONFIG_PPC64 + /* + * On pSeries LPAR, we need to know how many cpus + * could possibly be added to this partition. 
+ */ + if (machine_is(pseries) && firmware_has_feature(FW_FEATURE_LPAR) && + (dn = of_find_node_by_path("/rtas"))) { + int num_addr_cell, num_size_cell, maxcpus; + const __be32 *ireg; + + num_addr_cell = of_n_addr_cells(dn); + num_size_cell = of_n_size_cells(dn); + + ireg = of_get_property(dn, "ibm,lrdr-capacity", NULL); + + if (!ireg) + goto out; + + maxcpus = be32_to_cpup(ireg + num_addr_cell + num_size_cell); + + /* Double maxcpus for processors which have SMT capability */ + if (cpu_has_feature(CPU_FTR_SMT)) + maxcpus *= nthreads; + + if (maxcpus > nr_cpu_ids) { + printk(KERN_WARNING + "Partition configured for %d cpus, " + "operating system maximum is %d.\n", + maxcpus, nr_cpu_ids); + maxcpus = nr_cpu_ids; + } else + printk(KERN_INFO "Partition configured for %d cpus.\n", + maxcpus); + + for (cpu = 0; cpu < maxcpus; cpu++) + set_cpu_possible(cpu, true); + out: + of_node_put(dn); } - prom_stdout = of_find_node_by_path(name); - if (!prom_stdout) { - DBG(" can't find stdout package %s !\n", name); + vdso_data->processorCount = num_present_cpus(); +#endif /* CONFIG_PPC64 */ + + /* Initialize CPU <=> thread mapping/ + * + * WARNING: We assume that the number of threads is the same for + * every CPU in the system. 
If that is not the case, then some code + * here will have to be reworked + */ + cpu_init_thread_core_maps(nthreads); + + /* Now that possible cpus are set, set nr_cpu_ids for later use */ + setup_nr_cpu_ids(); + + free_unused_pacas(); +} +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_PCSPKR_PLATFORM +static __init int add_pcspkr(void) +{ + struct device_node *np; + struct platform_device *pd; + int ret; + + np = of_find_compatible_node(NULL, NULL, "pnpPNP,100"); + of_node_put(np); + if (!np) return -ENODEV; - } - DBG("stdout is %s\n", prom_stdout->full_name); - name = (char *)get_property(prom_stdout, "name", NULL); - if (!name) { - DBG(" stdout package has no name !\n"); - goto not_found; + pd = platform_device_alloc("pcspkr", -1); + if (!pd) + return -ENOMEM; + + ret = platform_device_add(pd); + if (ret) + platform_device_put(pd); + + return ret; +} +device_initcall(add_pcspkr); +#endif /* CONFIG_PCSPKR_PLATFORM */ + +void probe_machine(void) +{ + extern struct machdep_calls __machine_desc_start; + extern struct machdep_calls __machine_desc_end; + + /* + * Iterate all ppc_md structures until we find the proper + * one for the current machine type + */ + DBG("Probing machine type ...\n"); + + for (machine_id = &__machine_desc_start; + machine_id < &__machine_desc_end; + machine_id++) { + DBG(" %s ...", machine_id->name); + memcpy(&ppc_md, machine_id, sizeof(struct machdep_calls)); + if (ppc_md.probe()) { + DBG(" match !\n"); + break; + } + DBG("\n"); } - spd = (u32 *)get_property(prom_stdout, "current-speed", NULL); - - if (0) - ; -#ifdef CONFIG_SERIAL_8250_CONSOLE - else if (strcmp(name, "serial") == 0) { - int i; - u32 *reg = (u32 *)get_property(prom_stdout, "reg", &i); - if (i > 8) { - switch (reg[1]) { - case 0x3f8: - offset = 0; - break; - case 0x2f8: - offset = 1; - break; - case 0x898: - offset = 2; - break; - case 0x890: - offset = 3; - break; - default: - /* We dont recognise the serial port */ - goto not_found; - } + /* What can we do if we didn't find ? 
*/ + if (machine_id >= &__machine_desc_end) { + DBG("No suitable machine found !\n"); + for (;;); + } + + printk(KERN_INFO "Using %s machine description\n", ppc_md.name); +} + +/* Match a class of boards, not a specific device configuration. */ +int check_legacy_ioport(unsigned long base_port) +{ + struct device_node *parent, *np = NULL; + int ret = -ENODEV; + + switch(base_port) { + case I8042_DATA_REG: + if (!(np = of_find_compatible_node(NULL, NULL, "pnpPNP,303"))) + np = of_find_compatible_node(NULL, NULL, "pnpPNP,f03"); + if (np) { + parent = of_get_parent(np); + + of_i8042_kbd_irq = irq_of_parse_and_map(parent, 0); + if (!of_i8042_kbd_irq) + of_i8042_kbd_irq = 1; + + of_i8042_aux_irq = irq_of_parse_and_map(parent, 1); + if (!of_i8042_aux_irq) + of_i8042_aux_irq = 12; + + of_node_put(np); + np = parent; + break; } + np = of_find_node_by_type(NULL, "8042"); + /* Pegasos has no device_type on its 8042 node, look for the + * name instead */ + if (!np) + np = of_find_node_by_name(NULL, "8042"); + if (np) { + of_i8042_kbd_irq = 1; + of_i8042_aux_irq = 12; + } + break; + case FDC_BASE: /* FDC1 */ + np = of_find_node_by_type(NULL, "fdc"); + break; + default: + /* ipmi is supposed to fail here */ + break; } -#endif /* CONFIG_SERIAL_8250_CONSOLE */ -#ifdef CONFIG_PPC_PSERIES - else if (strcmp(name, "vty") == 0) { - u32 *reg = (u32 *)get_property(prom_stdout, "reg", NULL); - char *compat = (char *)get_property(prom_stdout, "compatible", NULL); - - if (reg && compat && (strcmp(compat, "hvterm-protocol") == 0)) { - /* Host Virtual Serial Interface */ - switch (reg[0]) { - case 0x30000000: - offset = 0; - break; - case 0x30000001: - offset = 1; - break; - default: - goto not_found; - } - of_node_put(prom_stdout); - DBG("Found hvsi console at offset %d\n", offset); - return add_preferred_console("hvsi", offset, NULL); - } else { - /* pSeries LPAR virtual console */ - of_node_put(prom_stdout); - DBG("Found hvc console\n"); - return add_preferred_console("hvc", 0, NULL); - } 
+ if (!np) + return ret; + parent = of_get_parent(np); + if (parent) { + if (strcmp(parent->type, "isa") == 0) + ret = 0; + of_node_put(parent); } -#endif /* CONFIG_PPC_PSERIES */ -#ifdef CONFIG_SERIAL_PMACZILOG_CONSOLE - else if (strcmp(name, "ch-a") == 0) - offset = 0; - else if (strcmp(name, "ch-b") == 0) - offset = 1; -#endif /* CONFIG_SERIAL_PMACZILOG_CONSOLE */ - else - goto not_found; - of_node_put(prom_stdout); - - DBG("Found serial console at ttyS%d\n", offset); - - if (spd) { - static char __initdata opt[16]; - sprintf(opt, "%d", *spd); - return add_preferred_console("ttyS", offset, opt); - } else - return add_preferred_console("ttyS", offset, NULL); - - not_found: - DBG("No preferred console found !\n"); - of_node_put(prom_stdout); - return -ENODEV; + of_node_put(np); + return ret; +} +EXPORT_SYMBOL(check_legacy_ioport); + +static int ppc_panic_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + /* + * If firmware-assisted dump has been registered then trigger + * firmware-assisted dump and let firmware handle everything else. + */ + crash_fadump(NULL, ptr); + ppc_md.panic(ptr); /* May not return */ + return NOTIFY_DONE; +} + +static struct notifier_block ppc_panic_block = { + .notifier_call = ppc_panic_event, + .priority = INT_MIN /* may not return; must be done last */ +}; + +void __init setup_panic(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &ppc_panic_block); +} + +#ifdef CONFIG_CHECK_CACHE_COHERENCY +/* + * For platforms that have configurable cache-coherency. This function + * checks that the cache coherency setting of the kernel matches the setting + * left by the firmware, as indicated in the device tree. Since a mismatch + * will eventually result in DMA failures, we print * and error and call + * BUG() in that case. 
+ */ + +#ifdef CONFIG_NOT_COHERENT_CACHE +#define KERNEL_COHERENCY 0 +#else +#define KERNEL_COHERENCY 1 +#endif + +static int __init check_cache_coherency(void) +{ + struct device_node *np; + const void *prop; + int devtree_coherency; + + np = of_find_node_by_path("/"); + prop = of_get_property(np, "coherency-off", NULL); + of_node_put(np); + + devtree_coherency = prop ? 0 : 1; + + if (devtree_coherency != KERNEL_COHERENCY) { + printk(KERN_ERR + "kernel coherency:%s != device tree_coherency:%s\n", + KERNEL_COHERENCY ? "on" : "off", + devtree_coherency ? "on" : "off"); + BUG(); + } + + return 0; +} + +late_initcall(check_cache_coherency); +#endif /* CONFIG_CHECK_CACHE_COHERENCY */ + +#ifdef CONFIG_DEBUG_FS +struct dentry *powerpc_debugfs_root; +EXPORT_SYMBOL(powerpc_debugfs_root); + +static int powerpc_debugfs_init(void) +{ + powerpc_debugfs_root = debugfs_create_dir("powerpc", NULL); + + return powerpc_debugfs_root == NULL; +} +arch_initcall(powerpc_debugfs_init); +#endif + +void ppc_printk_progress(char *s, unsigned short hex) +{ + pr_info("%s\n", s); +} + +void arch_setup_pdev_archdata(struct platform_device *pdev) +{ + pdev->archdata.dma_mask = DMA_BIT_MASK(32); + pdev->dev.dma_mask = &pdev->archdata.dma_mask; + set_dma_ops(&pdev->dev, &dma_direct_ops); } -console_initcall(set_preferred_console); -#endif /* CONFIG_PPC_MULTIPLATFORM */ diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 9680ae99b08..ea4fda60e57 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -2,7 +2,6 @@ * Common prep/pmac/chrp boot and setup code. 
*/ -#include <linux/config.h> #include <linux/module.h> #include <linux/string.h> #include <linux/sched.h> @@ -11,21 +10,19 @@ #include <linux/reboot.h> #include <linux/delay.h> #include <linux/initrd.h> -#include <linux/ide.h> #include <linux/tty.h> #include <linux/bootmem.h> #include <linux/seq_file.h> #include <linux/root_dev.h> #include <linux/cpu.h> #include <linux/console.h> +#include <linux/memblock.h> -#include <asm/residual.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/processor.h> #include <asm/pgtable.h> #include <asm/setup.h> -#include <asm/amigappc.h> #include <asm/smp.h> #include <asm/elf.h> #include <asm/cputable.h> @@ -33,63 +30,34 @@ #include <asm/btext.h> #include <asm/machdep.h> #include <asm/uaccess.h> -#include <asm/system.h> #include <asm/pmac_feature.h> #include <asm/sections.h> #include <asm/nvram.h> #include <asm/xmon.h> #include <asm/time.h> +#include <asm/serial.h> +#include <asm/udbg.h> +#include <asm/mmu_context.h> +#include <asm/epapr_hcalls.h> #define DBG(fmt...) 
-#if defined CONFIG_KGDB -#include <asm/kgdb.h> -#endif - -extern void platform_init(void); extern void bootx_init(unsigned long r4, unsigned long phys); -extern void ppc6xx_idle(void); -extern void power4_idle(void); - -boot_infos_t *boot_infos; -struct ide_machdep_calls ppc_ide_md; - -/* XXX should go elsewhere */ -int __irq_offset_value; -EXPORT_SYMBOL(__irq_offset_value); - -int boot_cpuid; -EXPORT_SYMBOL_GPL(boot_cpuid); int boot_cpuid_phys; +EXPORT_SYMBOL_GPL(boot_cpuid_phys); + +int smp_hw_index[NR_CPUS]; unsigned long ISA_DMA_THRESHOLD; unsigned int DMA_MODE_READ; unsigned int DMA_MODE_WRITE; -int have_of = 1; - -#ifdef CONFIG_PPC_MULTIPLATFORM -int _machine = 0; - -extern void prep_init(void); -extern void pmac_init(void); -extern void chrp_init(void); - -dev_t boot_dev; -#endif /* CONFIG_PPC_MULTIPLATFORM */ - -#ifdef CONFIG_MAGIC_SYSRQ -unsigned long SYSRQ_KEY = 0x54; -#endif /* CONFIG_MAGIC_SYSRQ */ - #ifdef CONFIG_VGA_CONSOLE unsigned long vgacon_remap_base; +EXPORT_SYMBOL(vgacon_remap_base); #endif -struct machdep_calls ppc_md; -EXPORT_SYMBOL(ppc_md); - /* * These are used in binfmt_elf.c to put aux entries on the stack * for each elf executable being started. @@ -107,66 +75,39 @@ int ucache_bsize; * from the address that it was linked at, so we must use RELOC/PTRRELOC * to access static data (including strings). -- paulus */ -unsigned long __init early_init(unsigned long dt_ptr) +notrace unsigned long __init early_init(unsigned long dt_ptr) { unsigned long offset = reloc_offset(); + struct cpu_spec *spec; /* First zero the BSS -- use memset_io, some platforms don't have * caches on yet */ - memset_io(PTRRELOC(&__bss_start), 0, _end - __bss_start); + memset_io((void __iomem *)PTRRELOC(&__bss_start), 0, + __bss_stop - __bss_start); /* * Identify the CPU type and fix up code sections * that depend on which cpu we have. 
*/ - identify_cpu(offset, 0); - do_cpu_ftr_fixups(offset); + spec = identify_cpu(offset, mfspr(SPRN_PVR)); - return KERNELBASE + offset; -} + do_feature_fixups(spec->cpu_features, + PTRRELOC(&__start___ftr_fixup), + PTRRELOC(&__stop___ftr_fixup)); -#ifdef CONFIG_PPC_MULTIPLATFORM -/* - * The PPC_MULTIPLATFORM version of platform_init... - */ -void __init platform_init(void) -{ - /* if we didn't get any bootinfo telling us what we are... */ - if (_machine == 0) { - /* prep boot loader tells us if we're prep or not */ - if ( *(unsigned long *)(KERNELBASE) == (0xdeadc0de) ) - _machine = _MACH_prep; - } + do_feature_fixups(spec->mmu_features, + PTRRELOC(&__start___mmu_ftr_fixup), + PTRRELOC(&__stop___mmu_ftr_fixup)); -#ifdef CONFIG_PPC_PREP - /* not much more to do here, if prep */ - if (_machine == _MACH_prep) { - prep_init(); - return; - } -#endif + do_lwsync_fixups(spec->cpu_features, + PTRRELOC(&__start___lwsync_fixup), + PTRRELOC(&__stop___lwsync_fixup)); -#ifdef CONFIG_ADB - if (strstr(cmd_line, "adb_sync")) { - extern int __adb_probe_sync; - __adb_probe_sync = 1; - } -#endif /* CONFIG_ADB */ + do_final_fixups(); - switch (_machine) { -#ifdef CONFIG_PPC_PMAC - case _MACH_Pmac: - pmac_init(); - break; -#endif -#ifdef CONFIG_PPC_CHRP - case _MACH_chrp: - chrp_init(); - break; -#endif - } + return KERNELBASE + offset; } -#endif + /* * Find out what kind of machine we're on and save any data we need @@ -174,45 +115,39 @@ void __init platform_init(void) * This is called very early on the boot process, after a minimal * MMU environment has been set up but before MMU_init is called. 
*/ -void __init machine_init(unsigned long dt_ptr, unsigned long phys) +notrace void __init machine_init(u64 dt_ptr) { + lockdep_init(); + + /* Enable early debugging if any specified (see udbg.h) */ + udbg_early_init(); + + /* Do some early initialization based on the flat device tree */ early_init_devtree(__va(dt_ptr)); -#ifdef CONFIG_CMDLINE - strlcpy(cmd_line, CONFIG_CMDLINE, sizeof(cmd_line)); -#endif /* CONFIG_CMDLINE */ + epapr_paravirt_early_init(); + + early_init_mmu(); + + probe_machine(); - platform_init(); + setup_kdump_trampoline(); #ifdef CONFIG_6xx - ppc_md.power_save = ppc6xx_idle; + if (cpu_has_feature(CPU_FTR_CAN_DOZE) || + cpu_has_feature(CPU_FTR_CAN_NAP)) + ppc_md.power_save = ppc6xx_idle; #endif +#ifdef CONFIG_E500 + if (cpu_has_feature(CPU_FTR_CAN_DOZE) || + cpu_has_feature(CPU_FTR_CAN_NAP)) + ppc_md.power_save = e500_idle; +#endif if (ppc_md.progress) ppc_md.progress("id mach(): done", 0x200); } -#ifdef CONFIG_BOOKE_WDT -/* Checks wdt=x and wdt_period=xx command-line option */ -int __init early_parse_wdt(char *p) -{ - if (p && strncmp(p, "0", 1) != 0) - booke_wdt_enabled = 1; - - return 0; -} -early_param("wdt", early_parse_wdt); - -int __init early_parse_wdt_period (char *p) -{ - if (p) - booke_wdt_period = simple_strtoul(p, NULL, 0); - - return 0; -} -early_param("wdt_period", early_parse_wdt_period); -#endif /* CONFIG_BOOKE_WDT */ - /* Checks "l2cr=xxxx" command-line option */ int __init ppc_setup_l2cr(char *str) { @@ -226,6 +161,18 @@ int __init ppc_setup_l2cr(char *str) } __setup("l2cr=", ppc_setup_l2cr); +/* Checks "l3cr=xxxx" command-line option */ +int __init ppc_setup_l3cr(char *str) +{ + if (cpu_has_feature(CPU_FTR_L3CR)) { + unsigned long val = simple_strtoul(str, NULL, 0); + printk(KERN_INFO "l3cr set to %lx\n", val); + _set_L3CR(val); /* and enable it */ + } + return 1; +} +__setup("l3cr=", ppc_setup_l3cr); + #ifdef CONFIG_GENERIC_NVRAM /* Generic nvram hooks used by drivers/char/gen_nvram.c */ @@ -244,6 +191,14 @@ void 
nvram_write_byte(unsigned char val, int addr) } EXPORT_SYMBOL(nvram_write_byte); +ssize_t nvram_get_size(void) +{ + if (ppc_md.nvram_size) + return ppc_md.nvram_size(); + return -1; +} +EXPORT_SYMBOL(nvram_get_size); + void nvram_sync(void) { if (ppc_md.nvram_sync) @@ -253,19 +208,11 @@ EXPORT_SYMBOL(nvram_sync); #endif /* CONFIG_NVRAM */ -static struct cpu cpu_devices[NR_CPUS]; - int __init ppc_init(void) { - int i; - /* clear the progress line */ - if ( ppc_md.progress ) ppc_md.progress(" ", 0xffff); - - /* register CPU devices */ - for (i = 0; i < NR_CPUS; i++) - if (cpu_possible(i)) - register_cpu(&cpu_devices[i], i, NULL); + if (ppc_md.progress) + ppc_md.progress(" ", 0xffff); /* call platform init */ if (ppc_md.init != NULL) { @@ -276,97 +223,108 @@ int __init ppc_init(void) arch_initcall(ppc_init); +static void __init irqstack_early_init(void) +{ + unsigned int i; + + /* interrupt stacks must be in lowmem, we get that for free on ppc32 + * as the memblock is limited to lowmem by default */ + for_each_possible_cpu(i) { + softirq_ctx[i] = (struct thread_info *) + __va(memblock_alloc(THREAD_SIZE, THREAD_SIZE)); + hardirq_ctx[i] = (struct thread_info *) + __va(memblock_alloc(THREAD_SIZE, THREAD_SIZE)); + } +} + +#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) +static void __init exc_lvl_early_init(void) +{ + unsigned int i, hw_cpu; + + /* interrupt stacks must be in lowmem, we get that for free on ppc32 + * as the memblock is limited to lowmem by MEMBLOCK_REAL_LIMIT */ + for_each_possible_cpu(i) { +#ifdef CONFIG_SMP + hw_cpu = get_hard_smp_processor_id(i); +#else + hw_cpu = 0; +#endif + + critirq_ctx[hw_cpu] = (struct thread_info *) + __va(memblock_alloc(THREAD_SIZE, THREAD_SIZE)); +#ifdef CONFIG_BOOKE + dbgirq_ctx[hw_cpu] = (struct thread_info *) + __va(memblock_alloc(THREAD_SIZE, THREAD_SIZE)); + mcheckirq_ctx[hw_cpu] = (struct thread_info *) + __va(memblock_alloc(THREAD_SIZE, THREAD_SIZE)); +#endif + } +} +#else +#define exc_lvl_early_init() +#endif + /* 
Warning, IO base is not yet inited */ void __init setup_arch(char **cmdline_p) { - extern char *klimit; - extern void do_init_bootmem(void); + *cmdline_p = cmd_line; /* so udelay does something sensible, assume <= 1000 bogomips */ loops_per_jiffy = 500000000 / HZ; unflatten_device_tree(); - finish_device_tree(); + check_for_initrd(); -#ifdef CONFIG_BOOTX_TEXT - init_boot_display(); -#endif + if (ppc_md.init_early) + ppc_md.init_early(); -#ifdef CONFIG_PPC_PMAC - /* This could be called "early setup arch", it must be done - * now because xmon need it - */ - if (_machine == _MACH_Pmac) - pmac_feature_init(); /* New cool way */ -#endif + find_legacy_serial_ports(); -#ifdef CONFIG_XMON - xmon_map_scc(); - if (strstr(cmd_line, "xmon")) { - xmon_init(1); - debugger(NULL); - } -#endif /* CONFIG_XMON */ - if ( ppc_md.progress ) ppc_md.progress("setup_arch: enter", 0x3eab); - -#if defined(CONFIG_KGDB) - if (ppc_md.kgdb_map_scc) - ppc_md.kgdb_map_scc(); - set_debug_traps(); - if (strstr(cmd_line, "gdb")) { - if (ppc_md.progress) - ppc_md.progress("setup_arch: kgdb breakpoint", 0x4000); - printk("kgdb breakpoint activated\n"); - breakpoint(); - } -#endif + smp_setup_cpu_maps(); + + /* Register early console */ + register_early_udbg_console(); + + xmon_setup(); /* * Set cache line size based on type of cpu as a default. * Systems with OF can look in the properties on the cpu node(s) * for a possibly more accurate value. 
*/ - if (cpu_has_feature(CPU_FTR_SPLIT_ID_CACHE)) { - dcache_bsize = cur_cpu_spec->dcache_bsize; - icache_bsize = cur_cpu_spec->icache_bsize; - ucache_bsize = 0; - } else - ucache_bsize = dcache_bsize = icache_bsize - = cur_cpu_spec->dcache_bsize; - - /* reboot on panic */ - panic_timeout = 180; - - init_mm.start_code = PAGE_OFFSET; + dcache_bsize = cur_cpu_spec->dcache_bsize; + icache_bsize = cur_cpu_spec->icache_bsize; + ucache_bsize = 0; + if (cpu_has_feature(CPU_FTR_UNIFIED_ID_CACHE)) + ucache_bsize = icache_bsize = dcache_bsize; + + if (ppc_md.panic) + setup_panic(); + + init_mm.start_code = (unsigned long)_stext; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; - init_mm.brk = (unsigned long) klimit; + init_mm.brk = klimit; - /* Save unparsed command line copy for /proc/cmdline */ - strlcpy(saved_command_line, cmd_line, COMMAND_LINE_SIZE); - *cmdline_p = cmd_line; + exc_lvl_early_init(); - parse_early_param(); + irqstack_early_init(); /* set up the bootmem stuff with available memory */ do_init_bootmem(); if ( ppc_md.progress ) ppc_md.progress("setup_arch: bootmem", 0x3eab); -#ifdef CONFIG_PPC_OCP - /* Initialize OCP device list */ - ocp_early_init(); - if ( ppc_md.progress ) ppc_md.progress("ocp: exit", 0x3eab); -#endif - #ifdef CONFIG_DUMMY_CONSOLE conswitchp = &dummy_con; #endif - ppc_md.setup_arch(); + if (ppc_md.setup_arch) + ppc_md.setup_arch(); if ( ppc_md.progress ) ppc_md.progress("arch: exit", 0x3eab); paging_init(); - /* this is for modules since _machine can be a define -- Cort */ - ppc_md.ppc_machine = _machine; + /* Initialize the MMU context management stuff */ + mmu_context_init(); } diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 40c48100bf1..ee082d77117 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -10,10 +10,9 @@ * 2 of the License, or (at your option) any later version. 
*/ -#undef DEBUG +#define DEBUG -#include <linux/config.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/string.h> #include <linux/sched.h> #include <linux/init.h> @@ -21,7 +20,6 @@ #include <linux/reboot.h> #include <linux/delay.h> #include <linux/initrd.h> -#include <linux/ide.h> #include <linux/seq_file.h> #include <linux/ioport.h> #include <linux/console.h> @@ -33,7 +31,15 @@ #include <linux/unistd.h> #include <linux/serial.h> #include <linux/serial_8250.h> +#include <linux/bootmem.h> +#include <linux/pci.h> +#include <linux/lockdep.h> +#include <linux/memblock.h> +#include <linux/hugetlb.h> +#include <linux/memory.h> + #include <asm/io.h> +#include <asm/kdump.h> #include <asm/prom.h> #include <asm/processor.h> #include <asm/pgtable.h> @@ -41,25 +47,27 @@ #include <asm/elf.h> #include <asm/machdep.h> #include <asm/paca.h> -#include <asm/ppcdebug.h> #include <asm/time.h> #include <asm/cputable.h> #include <asm/sections.h> #include <asm/btext.h> #include <asm/nvram.h> #include <asm/setup.h> -#include <asm/system.h> #include <asm/rtas.h> #include <asm/iommu.h> #include <asm/serial.h> #include <asm/cache.h> #include <asm/page.h> #include <asm/mmu.h> -#include <asm/lmb.h> -#include <asm/iSeries/ItLpNaca.h> #include <asm/firmware.h> -#include <asm/systemcfg.h> #include <asm/xmon.h> +#include <asm/udbg.h> +#include <asm/kexec.h> +#include <asm/mmu_context.h> +#include <asm/code-patching.h> +#include <asm/kvm_ppc.h> +#include <asm/hugetlb.h> +#include <asm/epapr_hcalls.h> #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) @@ -67,51 +75,18 @@ #define DBG(fmt...) #endif -/* - * Here are some early debugging facilities. 
You can enable one - * but your kernel will not boot on anything else if you do so - */ - -/* This one is for use on LPAR machines that support an HVC console - * on vterm 0 - */ -extern void udbg_init_debug_lpar(void); -/* This one is for use on Apple G5 machines - */ -extern void udbg_init_pmac_realmode(void); -/* That's RTAS panel debug */ -extern void call_rtas_display_status_delay(unsigned char c); -/* Here's maple real mode debug */ -extern void udbg_init_maple_realmode(void); - -#define EARLY_DEBUG_INIT() do {} while(0) - -#if 0 -#define EARLY_DEBUG_INIT() udbg_init_debug_lpar() -#define EARLY_DEBUG_INIT() udbg_init_maple_realmode() -#define EARLY_DEBUG_INIT() udbg_init_pmac_realmode() -#define EARLY_DEBUG_INIT() \ - do { udbg_putc = call_rtas_display_status_delay; } while(0) -#endif - -/* extern void *stab; */ -extern unsigned long klimit; - -extern void mm_init_ppc64(void); -extern void stab_initialize(unsigned long stab); -extern void htab_initialize(void); -extern void early_init_devtree(void *flat_dt); -extern void unflatten_device_tree(void); - -extern void smp_release_cpus(void); - -int have_of = 1; -int boot_cpuid = 0; -int boot_cpuid_phys = 0; -dev_t boot_dev; +int spinning_secondaries; u64 ppc64_pft_size; -struct ppc64_caches ppc64_caches; +/* Pick defaults since we might want to patch instructions + * before we've read this from the device tree. 
+ */ +struct ppc64_caches ppc64_caches = { + .dline_size = 0x40, + .log_dline_size = 6, + .iline_size = 0x40, + .log_iline_size = 6 +}; EXPORT_SYMBOL_GPL(ppc64_caches); /* @@ -122,202 +97,117 @@ int dcache_bsize; int icache_bsize; int ucache_bsize; -/* The main machine-dep calls structure - */ -struct machdep_calls ppc_md; -EXPORT_SYMBOL(ppc_md); +#if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP) +static void setup_tlb_core_data(void) +{ + int cpu; -#ifdef CONFIG_MAGIC_SYSRQ -unsigned long SYSRQ_KEY; -#endif /* CONFIG_MAGIC_SYSRQ */ + BUILD_BUG_ON(offsetof(struct tlb_core_data, lock) != 0); + for_each_possible_cpu(cpu) { + int first = cpu_first_thread_sibling(cpu); -static int ppc64_panic_event(struct notifier_block *, unsigned long, void *); -static struct notifier_block ppc64_panic_block = { - .notifier_call = ppc64_panic_event, - .priority = INT_MIN /* may not return; must be done last */ -}; + paca[cpu].tcd_ptr = &paca[first].tcd; + + /* + * If we have threads, we need either tlbsrx. + * or e6500 tablewalk mode, or else TLB handlers + * will be racy and could produce duplicate entries. + */ + if (smt_enabled_at_boot >= 2 && + !mmu_has_feature(MMU_FTR_USE_TLBRSRV) && + book3e_htw_mode != PPC_HTW_E6500) { + /* Should we panic instead? 
*/ + WARN_ONCE("%s: unsupported MMU configuration -- expect problems\n", + __func__); + } + } +} +#else +static void setup_tlb_core_data(void) +{ +} +#endif #ifdef CONFIG_SMP -static int smt_enabled_cmdline; +static char *smt_enabled_cmdline; /* Look for ibm,smt-enabled OF option */ static void check_smt_enabled(void) { struct device_node *dn; - char *smt_option; - - /* Allow the command line to overrule the OF option */ - if (smt_enabled_cmdline) - return; + const char *smt_option; - dn = of_find_node_by_path("/options"); + /* Default to enabling all threads */ + smt_enabled_at_boot = threads_per_core; - if (dn) { - smt_option = (char *)get_property(dn, "ibm,smt-enabled", NULL); + /* Allow the command line to overrule the OF option */ + if (smt_enabled_cmdline) { + if (!strcmp(smt_enabled_cmdline, "on")) + smt_enabled_at_boot = threads_per_core; + else if (!strcmp(smt_enabled_cmdline, "off")) + smt_enabled_at_boot = 0; + else { + long smt; + int rc; + + rc = strict_strtol(smt_enabled_cmdline, 10, &smt); + if (!rc) + smt_enabled_at_boot = + min(threads_per_core, (int)smt); + } + } else { + dn = of_find_node_by_path("/options"); + if (dn) { + smt_option = of_get_property(dn, "ibm,smt-enabled", + NULL); + + if (smt_option) { + if (!strcmp(smt_option, "on")) + smt_enabled_at_boot = threads_per_core; + else if (!strcmp(smt_option, "off")) + smt_enabled_at_boot = 0; + } - if (smt_option) { - if (!strcmp(smt_option, "on")) - smt_enabled_at_boot = 1; - else if (!strcmp(smt_option, "off")) - smt_enabled_at_boot = 0; - } - } + of_node_put(dn); + } + } } /* Look for smt-enabled= cmdline option */ static int __init early_smt_enabled(char *p) { - smt_enabled_cmdline = 1; - - if (!p) - return 0; - - if (!strcmp(p, "on") || !strcmp(p, "1")) - smt_enabled_at_boot = 1; - else if (!strcmp(p, "off") || !strcmp(p, "0")) - smt_enabled_at_boot = 0; - + smt_enabled_cmdline = p; return 0; } early_param("smt-enabled", early_smt_enabled); -/** - * setup_cpu_maps - initialize the following 
cpu maps: - * cpu_possible_map - * cpu_present_map - * cpu_sibling_map - * - * Having the possible map set up early allows us to restrict allocations - * of things like irqstacks to num_possible_cpus() rather than NR_CPUS. - * - * We do not initialize the online map here; cpus set their own bits in - * cpu_online_map as they come up. - * - * This function is valid only for Open Firmware systems. finish_device_tree - * must be called before using this. - * - * While we're here, we may as well set the "physical" cpu ids in the paca. - */ -static void __init setup_cpu_maps(void) -{ - struct device_node *dn = NULL; - int cpu = 0; - int swap_cpuid = 0; - - check_smt_enabled(); - - while ((dn = of_find_node_by_type(dn, "cpu")) && cpu < NR_CPUS) { - u32 *intserv; - int j, len = sizeof(u32), nthreads; - - intserv = (u32 *)get_property(dn, "ibm,ppc-interrupt-server#s", - &len); - if (!intserv) - intserv = (u32 *)get_property(dn, "reg", NULL); - - nthreads = len / sizeof(u32); - - for (j = 0; j < nthreads && cpu < NR_CPUS; j++) { - cpu_set(cpu, cpu_present_map); - set_hard_smp_processor_id(cpu, intserv[j]); - - if (intserv[j] == boot_cpuid_phys) - swap_cpuid = cpu; - cpu_set(cpu, cpu_possible_map); - cpu++; - } - } - - /* Swap CPU id 0 with boot_cpuid_phys, so we can always assume that - * boot cpu is logical 0. - */ - if (boot_cpuid_phys != get_hard_smp_processor_id(0)) { - u32 tmp; - tmp = get_hard_smp_processor_id(0); - set_hard_smp_processor_id(0, boot_cpuid_phys); - set_hard_smp_processor_id(swap_cpuid, tmp); - } +#else +#define check_smt_enabled() +#endif /* CONFIG_SMP */ - /* - * On pSeries LPAR, we need to know how many cpus - * could possibly be added to this partition. 
- */ - if (systemcfg->platform == PLATFORM_PSERIES_LPAR && - (dn = of_find_node_by_path("/rtas"))) { - int num_addr_cell, num_size_cell, maxcpus; - unsigned int *ireg; - - num_addr_cell = prom_n_addr_cells(dn); - num_size_cell = prom_n_size_cells(dn); - - ireg = (unsigned int *) - get_property(dn, "ibm,lrdr-capacity", NULL); - - if (!ireg) - goto out; - - maxcpus = ireg[num_addr_cell + num_size_cell]; - - /* Double maxcpus for processors which have SMT capability */ - if (cpu_has_feature(CPU_FTR_SMT)) - maxcpus *= 2; - - if (maxcpus > NR_CPUS) { - printk(KERN_WARNING - "Partition configured for %d cpus, " - "operating system maximum is %d.\n", - maxcpus, NR_CPUS); - maxcpus = NR_CPUS; - } else - printk(KERN_INFO "Partition configured for %d cpus.\n", - maxcpus); - - for (cpu = 0; cpu < maxcpus; cpu++) - cpu_set(cpu, cpu_possible_map); - out: - of_node_put(dn); - } +/** Fix up paca fields required for the boot cpu */ +static void fixup_boot_paca(void) +{ + /* The boot cpu is started */ + get_paca()->cpu_start = 1; + /* Allow percpu accesses to work until we setup percpu data */ + get_paca()->data_offset = 0; +} - /* - * Do the sibling map; assume only two threads per processor. - */ - for_each_cpu(cpu) { - cpu_set(cpu, cpu_sibling_map[cpu]); - if (cpu_has_feature(CPU_FTR_SMT)) - cpu_set(cpu ^ 0x1, cpu_sibling_map[cpu]); +static void cpu_ready_for_interrupts(void) +{ + /* Set IR and DR in PACA MSR */ + get_paca()->kernel_msr = MSR_KERNEL; + + /* Enable AIL if supported */ + if (cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_ARCH_207S)) { + unsigned long lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3); } - - systemcfg->processorCount = num_present_cpus(); } -#endif /* CONFIG_SMP */ - -extern struct machdep_calls pSeries_md; -extern struct machdep_calls pmac_md; -extern struct machdep_calls maple_md; -extern struct machdep_calls bpa_md; -extern struct machdep_calls iseries_md; - -/* Ultimately, stuff them in an elf section like initcalls... 
*/ -static struct machdep_calls __initdata *machines[] = { -#ifdef CONFIG_PPC_PSERIES - &pSeries_md, -#endif /* CONFIG_PPC_PSERIES */ -#ifdef CONFIG_PPC_PMAC - &pmac_md, -#endif /* CONFIG_PPC_PMAC */ -#ifdef CONFIG_PPC_MAPLE - &maple_md, -#endif /* CONFIG_PPC_MAPLE */ -#ifdef CONFIG_PPC_BPA - &bpa_md, -#endif -#ifdef CONFIG_PPC_ISERIES - &iseries_md, -#endif - NULL -}; /* * Early initialization entry point. This is called by head.S @@ -325,7 +215,7 @@ static struct machdep_calls __initdata *machines[] = { * the CPU that ignores the top 2 bits of the address in real * mode so we can access kernel globals normally provided we * only toy with things in the RMO region. From here, we do - * some early parsing of the device-tree to setup out LMB + * some early parsing of the device-tree to setup out MEMBLOCK * data structures, and allocate & initialize the hash table * and segment tables so we can start running with translation * enabled. @@ -340,68 +230,137 @@ static struct machdep_calls __initdata *machines[] = { void __init early_setup(unsigned long dt_ptr) { - struct paca_struct *lpaca = get_paca(); - static struct machdep_calls **mach; + static __initdata struct paca_struct boot_paca; + + /* -------- printk is _NOT_ safe to use here ! ------- */ + + /* Identify CPU type */ + identify_cpu(0, mfspr(SPRN_PVR)); + + /* Assume we're on cpu 0 for now. Don't write to the paca yet! */ + initialise_paca(&boot_paca, 0); + setup_paca(&boot_paca); + fixup_boot_paca(); + + /* Initialize lockdep early or else spinlocks will blow */ + lockdep_init(); + + /* -------- printk is now safe to use ------- */ + + /* Enable early debugging if any specified (see udbg.h) */ + udbg_early_init(); + + DBG(" -> early_setup(), dt_ptr: 0x%lx\n", dt_ptr); /* - * Enable early debugging if any specified (see top of - * this file) + * Do early initialization using the flattened device + * tree, such as retrieving the physical memory map or + * calculating/retrieving the hash table size. 
*/ - EARLY_DEBUG_INIT(); + early_init_devtree(__va(dt_ptr)); + + epapr_paravirt_early_init(); + + /* Now we know the logical id of our boot cpu, setup the paca. */ + setup_paca(&paca[boot_cpuid]); + fixup_boot_paca(); + + /* Probe the machine type */ + probe_machine(); - DBG(" -> early_setup()\n"); + setup_kdump_trampoline(); + + DBG("Found, Initializing memory management...\n"); + + /* Initialize the hash table or TLB handling */ + early_init_mmu(); /* - * Fill the default DBG level (do we want to keep - * that old mecanism around forever ?) + * At this point, we can let interrupts switch to virtual mode + * (the MMU has been setup), so adjust the MSR in the PACA to + * have IR and DR set and enable AIL if it exists */ - ppcdbg_initialize(); + cpu_ready_for_interrupts(); + + /* Reserve large chunks of memory for use by CMA for KVM */ + kvm_cma_reserve(); /* - * Do early initializations using the flattened device - * tree, like retreiving the physical memory map or - * calculating/retreiving the hash table size + * Reserve any gigantic pages requested on the command line. + * memblock needs to have been initialized by the time this is + * called since this will reserve memory. */ - early_init_devtree(__va(dt_ptr)); + reserve_hugetlb_gpages(); + + DBG(" <- early_setup()\n"); +#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX /* - * Iterate all ppc_md structures until we find the proper - * one for the current machine type + * This needs to be done *last* (after the above DBG() even) + * + * Right after we return from this function, we turn on the MMU + * which means the real-mode access trick that btext does will + * no longer work, it needs to switch to using a real MMU + * mapping. 
This call will ensure that it does */ - DBG("Probing machine type for platform %x...\n", - systemcfg->platform); + btext_map(); +#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */ +} - for (mach = machines; *mach; mach++) { - if ((*mach)->probe(systemcfg->platform)) - break; - } - /* What can we do if we didn't find ? */ - if (*mach == NULL) { - DBG("No suitable machine found !\n"); - for (;;); - } - ppc_md = **mach; +#ifdef CONFIG_SMP +void early_setup_secondary(void) +{ + /* Mark interrupts enabled in PACA */ + get_paca()->soft_enabled = 0; - DBG("Found, Initializing memory management...\n"); + /* Initialize the hash table or TLB handling */ + early_init_mmu_secondary(); /* - * Initialize stab / SLB management + * At this point, we can let interrupts switch to virtual mode + * (the MMU has been setup), so adjust the MSR in the PACA to + * have IR and DR set. */ - if (!firmware_has_feature(FW_FEATURE_ISERIES)) - stab_initialize(lpaca->stab_real); + cpu_ready_for_interrupts(); +} - /* - * Initialize the MMU Hash table and create the linear mapping - * of memory +#endif /* CONFIG_SMP */ + +#if defined(CONFIG_SMP) || defined(CONFIG_KEXEC) +void smp_release_cpus(void) +{ + unsigned long *ptr; + int i; + + DBG(" -> smp_release_cpus()\n"); + + /* All secondary cpus are spinning on a common spinloop, release them + * all now so they can start to spin on their individual paca + * spinloops. For non SMP kernels, the secondary cpus never get out + * of the common spinloop. 
*/ - htab_initialize(); - DBG(" <- early_setup()\n"); -} + ptr = (unsigned long *)((unsigned long)&__secondary_hold_spinloop + - PHYSICAL_START); + *ptr = ppc_function_entry(generic_secondary_smp_init); + /* And wait a bit for them to catch up */ + for (i = 0; i < 100000; i++) { + mb(); + HMT_low(); + if (spinning_secondaries == 0) + break; + udelay(1); + } + DBG("spinning_secondaries = %d\n", spinning_secondaries); + + DBG(" <- smp_release_cpus()\n"); +} +#endif /* CONFIG_SMP || CONFIG_KEXEC */ /* - * Initialize some remaining members of the ppc64_caches and systemcfg structures + * Initialize some remaining members of the ppc64_caches and systemcfg + * structures * (at least until we get rid of them completely). This is mostly some * cache informations about the CPU that will be used by cache flush * routines and/or provided to userland @@ -413,111 +372,67 @@ static void __init initialize_cache_info(void) DBG(" -> initialize_cache_info()\n"); - for (np = NULL; (np = of_find_node_by_type(np, "cpu"));) { + for_each_node_by_type(np, "cpu") { num_cpus += 1; - /* We're assuming *all* of the CPUs have the same + /* + * We're assuming *all* of the CPUs have the same * d-cache and i-cache sizes... 
-Peter */ - - if ( num_cpus == 1 ) { - u32 *sizep, *lsizep; + if (num_cpus == 1) { + const __be32 *sizep, *lsizep; u32 size, lsize; - const char *dc, *ic; - - /* Then read cache informations */ - if (systemcfg->platform == PLATFORM_POWERMAC) { - dc = "d-cache-block-size"; - ic = "i-cache-block-size"; - } else { - dc = "d-cache-line-size"; - ic = "i-cache-line-size"; - } size = 0; lsize = cur_cpu_spec->dcache_bsize; - sizep = (u32 *)get_property(np, "d-cache-size", NULL); + sizep = of_get_property(np, "d-cache-size", NULL); if (sizep != NULL) - size = *sizep; - lsizep = (u32 *) get_property(np, dc, NULL); + size = be32_to_cpu(*sizep); + lsizep = of_get_property(np, "d-cache-block-size", + NULL); + /* fallback if block size missing */ + if (lsizep == NULL) + lsizep = of_get_property(np, + "d-cache-line-size", + NULL); if (lsizep != NULL) - lsize = *lsizep; - if (sizep == 0 || lsizep == 0) + lsize = be32_to_cpu(*lsizep); + if (sizep == NULL || lsizep == NULL) DBG("Argh, can't find dcache properties ! " "sizep: %p, lsizep: %p\n", sizep, lsizep); - systemcfg->dcache_size = ppc64_caches.dsize = size; - systemcfg->dcache_line_size = - ppc64_caches.dline_size = lsize; + ppc64_caches.dsize = size; + ppc64_caches.dline_size = lsize; ppc64_caches.log_dline_size = __ilog2(lsize); ppc64_caches.dlines_per_page = PAGE_SIZE / lsize; size = 0; lsize = cur_cpu_spec->icache_bsize; - sizep = (u32 *)get_property(np, "i-cache-size", NULL); + sizep = of_get_property(np, "i-cache-size", NULL); if (sizep != NULL) - size = *sizep; - lsizep = (u32 *)get_property(np, ic, NULL); + size = be32_to_cpu(*sizep); + lsizep = of_get_property(np, "i-cache-block-size", + NULL); + if (lsizep == NULL) + lsizep = of_get_property(np, + "i-cache-line-size", + NULL); if (lsizep != NULL) - lsize = *lsizep; - if (sizep == 0 || lsizep == 0) + lsize = be32_to_cpu(*lsizep); + if (sizep == NULL || lsizep == NULL) DBG("Argh, can't find icache properties ! 
" "sizep: %p, lsizep: %p\n", sizep, lsizep); - systemcfg->icache_size = ppc64_caches.isize = size; - systemcfg->icache_line_size = - ppc64_caches.iline_size = lsize; + ppc64_caches.isize = size; + ppc64_caches.iline_size = lsize; ppc64_caches.log_iline_size = __ilog2(lsize); ppc64_caches.ilines_per_page = PAGE_SIZE / lsize; } } - /* Add an eye catcher and the systemcfg layout version number */ - strcpy(systemcfg->eye_catcher, "SYSTEMCFG:PPC64"); - systemcfg->version.major = SYSTEMCFG_MAJOR; - systemcfg->version.minor = SYSTEMCFG_MINOR; - systemcfg->processor = mfspr(SPRN_PVR); - DBG(" <- initialize_cache_info()\n"); } -static void __init check_for_initrd(void) -{ -#ifdef CONFIG_BLK_DEV_INITRD - u64 *prop; - - DBG(" -> check_for_initrd()\n"); - - if (of_chosen) { - prop = (u64 *)get_property(of_chosen, - "linux,initrd-start", NULL); - if (prop != NULL) { - initrd_start = (unsigned long)__va(*prop); - prop = (u64 *)get_property(of_chosen, - "linux,initrd-end", NULL); - if (prop != NULL) { - initrd_end = (unsigned long)__va(*prop); - initrd_below_start_ok = 1; - } else - initrd_start = 0; - } - } - - /* If we were passed an initrd, set the ROOT_DEV properly if the values - * look sensible. If not, clear initrd reference. - */ - if (initrd_start >= KERNELBASE && initrd_end >= KERNELBASE && - initrd_end > initrd_start) - ROOT_DEV = Root_RAM0; - else - initrd_start = initrd_end = 0; - - if (initrd_start) - printk("Found initrd at 0x%lx:0x%lx\n", initrd_start, initrd_end); - - DBG(" <- check_for_initrd()\n"); -#endif /* CONFIG_BLK_DEV_INITRD */ -} /* * Do some initial setup of the system. 
The parameters are those which @@ -527,6 +442,19 @@ void __init setup_system(void) { DBG(" -> setup_system()\n"); + /* Apply the CPUs-specific and firmware specific fixups to kernel + * text (nop out sections not relevant to this CPU or this firmware) + */ + do_feature_fixups(cur_cpu_spec->cpu_features, + &__start___ftr_fixup, &__stop___ftr_fixup); + do_feature_fixups(cur_cpu_spec->mmu_features, + &__start___mmu_ftr_fixup, &__stop___mmu_ftr_fixup); + do_feature_fixups(powerpc_firmware_features, + &__start___fw_ftr_fixup, &__stop___fw_ftr_fixup); + do_lwsync_fixups(cur_cpu_spec->cpu_features, + &__start___lwsync_fixup, &__stop___lwsync_fixup); + do_final_fixups(); + /* * Unflatten the device-tree passed by prom_init or kexec */ @@ -534,13 +462,7 @@ void __init setup_system(void) /* * Fill the ppc64_caches & systemcfg structures with informations - * retreived from the device-tree. Need to be called before - * finish_device_tree() since the later requires some of the - * informations filled up here to properly parse the interrupt - * tree. - * It also sets up the cache line sizes which allows to call - * routines like flush_icache_range (used by the hash init - * later on). + * retrieved from the device-tree. */ initialize_cache_info(); @@ -561,120 +483,136 @@ void __init setup_system(void) * setting up the hash table pointers. It also sets up some interrupt-mapping * related options that will be used by finish_device_tree() */ - ppc_md.init_early(); + if (ppc_md.init_early) + ppc_md.init_early(); - /* - * "Finish" the device-tree, that is do the actual parsing of - * some of the properties like the interrupt map + /* + * We can discover serial ports now since the above did setup the + * hash table management for us, thus ioremap works. 
We do that early + * so that further code can be debugged */ - finish_device_tree(); - -#ifdef CONFIG_BOOTX_TEXT - init_boot_display(); -#endif + find_legacy_serial_ports(); /* - * Initialize xmon - */ -#ifdef CONFIG_XMON_DEFAULT - xmon_init(1); -#endif - /* * Register early console */ register_early_udbg_console(); - /* Save unparsed command line copy for /proc/cmdline */ - strlcpy(saved_command_line, cmd_line, COMMAND_LINE_SIZE); - - parse_early_param(); - -#ifdef CONFIG_SMP /* - * iSeries has already initialized the cpu maps at this point. + * Initialize xmon */ - setup_cpu_maps(); + xmon_setup(); + smp_setup_cpu_maps(); + check_smt_enabled(); + setup_tlb_core_data(); + +#ifdef CONFIG_SMP /* Release secondary cpus out of their spinloops at 0x60 now that * we can map physical -> logical CPU ids */ smp_release_cpus(); #endif - printk("Starting Linux PPC64 %s\n", system_utsname.version); + printk("Starting Linux PPC64 %s\n", init_utsname()->version); printk("-----------------------------------------------------\n"); - printk("ppc64_pft_size = 0x%lx\n", ppc64_pft_size); - printk("ppc64_debug_switch = 0x%lx\n", ppc64_debug_switch); - printk("ppc64_interrupt_controller = 0x%ld\n", ppc64_interrupt_controller); - printk("systemcfg = 0x%p\n", systemcfg); - printk("systemcfg->platform = 0x%x\n", systemcfg->platform); - printk("systemcfg->processorCount = 0x%lx\n", systemcfg->processorCount); - printk("systemcfg->physicalMemorySize = 0x%lx\n", systemcfg->physicalMemorySize); - printk("ppc64_caches.dcache_line_size = 0x%x\n", - ppc64_caches.dline_size); - printk("ppc64_caches.icache_line_size = 0x%x\n", - ppc64_caches.iline_size); - printk("htab_address = 0x%p\n", htab_address); + printk("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); + printk("physicalMemorySize = 0x%llx\n", memblock_phys_mem_size()); + if (ppc64_caches.dline_size != 0x80) + printk("ppc64_caches.dcache_line_size = 0x%x\n", + ppc64_caches.dline_size); + if (ppc64_caches.iline_size != 0x80) + 
printk("ppc64_caches.icache_line_size = 0x%x\n", + ppc64_caches.iline_size); +#ifdef CONFIG_PPC_STD_MMU_64 + if (htab_address) + printk("htab_address = 0x%p\n", htab_address); printk("htab_hash_mask = 0x%lx\n", htab_hash_mask); +#endif /* CONFIG_PPC_STD_MMU_64 */ + if (PHYSICAL_START > 0) + printk("physical_start = 0x%llx\n", + (unsigned long long)PHYSICAL_START); printk("-----------------------------------------------------\n"); - mm_init_ppc64(); - DBG(" <- setup_system()\n"); } -static int ppc64_panic_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - ppc_md.panic((char *)ptr); /* May not return */ - return NOTIFY_DONE; -} - -#ifdef CONFIG_PPC_ISERIES -/* - * On iSeries we just parse the mem=X option from the command line. - * On pSeries it's a bit more complicated, see prom_init_mem() +/* This returns the limit below which memory accesses to the linear + * mapping are guarnateed not to cause a TLB or SLB miss. This is + * used to allocate interrupt or emergency stacks for which our + * exception entry path doesn't deal with being interrupted. */ -static int __init early_parsemem(char *p) +static u64 safe_stack_limit(void) { - if (!p) - return 0; - - memory_limit = ALIGN(memparse(p, &p), PAGE_SIZE); - - return 0; +#ifdef CONFIG_PPC_BOOK3E + /* Freescale BookE bolts the entire linear mapping */ + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) + return linear_map_top; + /* Other BookE, we assume the first GB is bolted */ + return 1ul << 30; +#else + /* BookS, the first segment is bolted */ + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) + return 1UL << SID_SHIFT_1T; + return 1UL << SID_SHIFT; +#endif } -early_param("mem", early_parsemem); -#endif /* CONFIG_PPC_ISERIES */ -#ifdef CONFIG_IRQSTACKS static void __init irqstack_early_init(void) { + u64 limit = safe_stack_limit(); unsigned int i; /* - * interrupt stacks must be under 256MB, we cannot afford to take - * SLB misses on them. 
+ * Interrupt stacks must be in the first segment since we + * cannot afford to take SLB misses on them. */ - for_each_cpu(i) { - softirq_ctx[i] = (struct thread_info *)__va(lmb_alloc_base(THREAD_SIZE, - THREAD_SIZE, 0x10000000)); - hardirq_ctx[i] = (struct thread_info *)__va(lmb_alloc_base(THREAD_SIZE, - THREAD_SIZE, 0x10000000)); + for_each_possible_cpu(i) { + softirq_ctx[i] = (struct thread_info *) + __va(memblock_alloc_base(THREAD_SIZE, + THREAD_SIZE, limit)); + hardirq_ctx[i] = (struct thread_info *) + __va(memblock_alloc_base(THREAD_SIZE, + THREAD_SIZE, limit)); } } + +#ifdef CONFIG_PPC_BOOK3E +static void __init exc_lvl_early_init(void) +{ + unsigned int i; + unsigned long sp; + + for_each_possible_cpu(i) { + sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE); + critirq_ctx[i] = (struct thread_info *)__va(sp); + paca[i].crit_kstack = __va(sp + THREAD_SIZE); + + sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE); + dbgirq_ctx[i] = (struct thread_info *)__va(sp); + paca[i].dbg_kstack = __va(sp + THREAD_SIZE); + + sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE); + mcheckirq_ctx[i] = (struct thread_info *)__va(sp); + paca[i].mc_kstack = __va(sp + THREAD_SIZE); + } + + if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC)) + patch_exception(0x040, exc_debug_debug_book3e); +} #else -#define irqstack_early_init() +#define exc_lvl_early_init() #endif /* * Stack space used when we detect a bad kernel stack pointer, and - * early in SMP boots before relocation is enabled. + * early in SMP boots before relocation is enabled. Exclusive emergency + * stack for machine checks. */ static void __init emergency_stack_init(void) { - unsigned long limit; + u64 limit; unsigned int i; /* @@ -686,49 +624,29 @@ static void __init emergency_stack_init(void) * bringup, we need to get at them in real mode. This means they * must also be within the RMO region. 
*/ - limit = min(0x10000000UL, lmb.rmo_size); - - for_each_cpu(i) - paca[i].emergency_sp = __va(lmb_alloc_base(PAGE_SIZE, 128, - limit)) + PAGE_SIZE; -} - -/* - * Called from setup_arch to initialize the bitmap of available - * syscalls in the systemcfg page - */ -void __init setup_syscall_map(void) -{ - unsigned int i, count64 = 0, count32 = 0; - extern unsigned long *sys_call_table; - extern unsigned long sys_ni_syscall; - - - for (i = 0; i < __NR_syscalls; i++) { - if (sys_call_table[i*2] != sys_ni_syscall) { - count64++; - systemcfg->syscall_map_64[i >> 5] |= - 0x80000000UL >> (i & 0x1f); - } - if (sys_call_table[i*2+1] != sys_ni_syscall) { - count32++; - systemcfg->syscall_map_32[i >> 5] |= - 0x80000000UL >> (i & 0x1f); - } + limit = min(safe_stack_limit(), ppc64_rma_size); + + for_each_possible_cpu(i) { + unsigned long sp; + sp = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit); + sp += THREAD_SIZE; + paca[i].emergency_sp = __va(sp); + +#ifdef CONFIG_PPC_BOOK3S_64 + /* emergency stack for machine check exception handling. */ + sp = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit); + sp += THREAD_SIZE; + paca[i].mc_emergency_sp = __va(sp); +#endif } - printk(KERN_INFO "Syscall map setup, %d 32-bit and %d 64-bit syscalls\n", - count32, count64); } /* - * Called into from start_kernel, after lock_kernel has been called. - * Initializes bootmem, which is unsed to manage page allocation until - * mem_init is called. + * Called into from start_kernel this initializes bootmem, which is used + * to manage page allocation until mem_init is called. 
*/ void __init setup_arch(char **cmdline_p) { - extern void do_init_bootmem(void); - ppc64_boot_msg(0x12, "Setup Arch"); *cmdline_p = cmd_line; @@ -741,42 +659,44 @@ void __init setup_arch(char **cmdline_p) dcache_bsize = ppc64_caches.dline_size; icache_bsize = ppc64_caches.iline_size; - /* reboot on panic */ - panic_timeout = 180; - if (ppc_md.panic) - notifier_chain_register(&panic_notifier_list, &ppc64_panic_block); + setup_panic(); - init_mm.start_code = PAGE_OFFSET; + init_mm.start_code = (unsigned long)_stext; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; - +#ifdef CONFIG_PPC_64K_PAGES + init_mm.context.pte_frag = NULL; +#endif irqstack_early_init(); + exc_lvl_early_init(); emergency_stack_init(); +#ifdef CONFIG_PPC_STD_MMU_64 stabs_alloc(); - +#endif /* set up the bootmem stuff with available memory */ do_init_bootmem(); sparse_init(); - /* initialize the syscall map in systemcfg */ - setup_syscall_map(); - #ifdef CONFIG_DUMMY_CONSOLE conswitchp = &dummy_con; #endif - ppc_md.setup_arch(); - - /* Use the default idle loop if the platform hasn't provided one. 
*/ - if (NULL == ppc_md.idle_loop) { - ppc_md.idle_loop = default_idle; - printk(KERN_INFO "Using default idle loop\n"); - } + if (ppc_md.setup_arch) + ppc_md.setup_arch(); paging_init(); + + /* Initialize the MMU context management stuff */ + mmu_context_init(); + + /* Interrupt code needs to be 64K-aligned */ + if ((unsigned long)_stext & 0xffff) + panic("Kernelbase not 64K-aligned (0x%lx)!\n", + (unsigned long)_stext); + ppc64_boot_msg(0x15, "Setup Done"); } @@ -805,224 +725,73 @@ void ppc64_boot_msg(unsigned int src, const char *msg) printk("[boot]%04x %s\n", src, msg); } -/* Print a termination message (print only -- does not stop the kernel) */ -void ppc64_terminate_msg(unsigned int src, const char *msg) +#ifdef CONFIG_SMP +#define PCPU_DYN_SIZE () + +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - ppc64_do_msg(PPC64_LINUX_FUNCTION|PPC64_TERM_MESSAGE|src, msg); - printk("[terminate]%04x %s\n", src, msg); + return __alloc_bootmem_node(NODE_DATA(cpu_to_node(cpu)), size, align, + __pa(MAX_DMA_ADDRESS)); } -#ifndef CONFIG_PPC_ISERIES -/* - * This function can be used by platforms to "find" legacy serial ports. - * It works for "serial" nodes under an "isa" node, and will try to - * respect the "ibm,aix-loc" property if any. It works with up to 8 - * ports. 
- */ - -#define MAX_LEGACY_SERIAL_PORTS 8 -static struct plat_serial8250_port serial_ports[MAX_LEGACY_SERIAL_PORTS+1]; -static unsigned int old_serial_count; - -void __init generic_find_legacy_serial_ports(u64 *physport, - unsigned int *default_speed) +static void __init pcpu_fc_free(void *ptr, size_t size) { - struct device_node *np; - u32 *sizeprop; - - struct isa_reg_property { - u32 space; - u32 address; - u32 size; - }; - struct pci_reg_property { - struct pci_address addr; - u32 size_hi; - u32 size_lo; - }; - - DBG(" -> generic_find_legacy_serial_port()\n"); - - *physport = 0; - if (default_speed) - *default_speed = 0; - - np = of_find_node_by_path("/"); - if (!np) - return; - - /* First fill our array */ - for (np = NULL; (np = of_find_node_by_type(np, "serial"));) { - struct device_node *isa, *pci; - struct isa_reg_property *reg; - unsigned long phys_size, addr_size, io_base; - u32 *rangesp; - u32 *interrupts, *clk, *spd; - char *typep; - int index, rlen, rentsize; - - /* Ok, first check if it's under an "isa" parent */ - isa = of_get_parent(np); - if (!isa || strcmp(isa->name, "isa")) { - DBG("%s: no isa parent found\n", np->full_name); - continue; - } - - /* Now look for an "ibm,aix-loc" property that gives us ordering - * if any... - */ - typep = (char *)get_property(np, "ibm,aix-loc", NULL); - - /* Get the ISA port number */ - reg = (struct isa_reg_property *)get_property(np, "reg", NULL); - if (reg == NULL) - goto next_port; - /* We assume the interrupt number isn't translated ... */ - interrupts = (u32 *)get_property(np, "interrupts", NULL); - /* get clock freq. 
if present */ - clk = (u32 *)get_property(np, "clock-frequency", NULL); - /* get default speed if present */ - spd = (u32 *)get_property(np, "current-speed", NULL); - /* Default to locate at end of array */ - index = old_serial_count; /* end of the array by default */ - - /* If we have a location index, then use it */ - if (typep && *typep == 'S') { - index = simple_strtol(typep+1, NULL, 0) - 1; - /* if index is out of range, use end of array instead */ - if (index >= MAX_LEGACY_SERIAL_PORTS) - index = old_serial_count; - /* if our index is still out of range, that mean that - * array is full, we could scan for a free slot but that - * make little sense to bother, just skip the port - */ - if (index >= MAX_LEGACY_SERIAL_PORTS) - goto next_port; - if (index >= old_serial_count) - old_serial_count = index + 1; - /* Check if there is a port who already claimed our slot */ - if (serial_ports[index].iobase != 0) { - /* if we still have some room, move it, else override */ - if (old_serial_count < MAX_LEGACY_SERIAL_PORTS) { - DBG("Moved legacy port %d -> %d\n", index, - old_serial_count); - serial_ports[old_serial_count++] = - serial_ports[index]; - } else { - DBG("Replacing legacy port %d\n", index); - } - } - } - if (index >= MAX_LEGACY_SERIAL_PORTS) - goto next_port; - if (index >= old_serial_count) - old_serial_count = index + 1; - - /* Now fill the entry */ - memset(&serial_ports[index], 0, sizeof(struct plat_serial8250_port)); - serial_ports[index].uartclk = clk ? *clk : BASE_BAUD * 16; - serial_ports[index].iobase = reg->address; - serial_ports[index].irq = interrupts ? 
interrupts[0] : 0; - serial_ports[index].flags = ASYNC_BOOT_AUTOCONF; - - DBG("Added legacy port, index: %d, port: %x, irq: %d, clk: %d\n", - index, - serial_ports[index].iobase, - serial_ports[index].irq, - serial_ports[index].uartclk); - - /* Get phys address of IO reg for port 1 */ - if (index != 0) - goto next_port; - - pci = of_get_parent(isa); - if (!pci) { - DBG("%s: no pci parent found\n", np->full_name); - goto next_port; - } - - rangesp = (u32 *)get_property(pci, "ranges", &rlen); - if (rangesp == NULL) { - of_node_put(pci); - goto next_port; - } - rlen /= 4; - - /* we need the #size-cells of the PCI bridge node itself */ - phys_size = 1; - sizeprop = (u32 *)get_property(pci, "#size-cells", NULL); - if (sizeprop != NULL) - phys_size = *sizeprop; - /* we need the parent #addr-cells */ - addr_size = prom_n_addr_cells(pci); - rentsize = 3 + addr_size + phys_size; - io_base = 0; - for (;rlen >= rentsize; rlen -= rentsize,rangesp += rentsize) { - if (((rangesp[0] >> 24) & 0x3) != 1) - continue; /* not IO space */ - io_base = rangesp[3]; - if (addr_size == 2) - io_base = (io_base << 32) | rangesp[4]; - } - if (io_base != 0) { - *physport = io_base + reg->address; - if (default_speed && spd) - *default_speed = *spd; - } - of_node_put(pci); - next_port: - of_node_put(isa); - } - - DBG(" <- generic_find_legacy_serial_port()\n"); + free_bootmem(__pa(ptr), size); } -static struct platform_device serial_device = { - .name = "serial8250", - .id = PLAT8250_DEV_PLATFORM, - .dev = { - .platform_data = serial_ports, - }, -}; - -static int __init serial_dev_init(void) +static int pcpu_cpu_distance(unsigned int from, unsigned int to) { - return platform_device_register(&serial_device); + if (cpu_to_node(from) == cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; } -arch_initcall(serial_dev_init); -#endif /* CONFIG_PPC_ISERIES */ +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); -int check_legacy_ioport(unsigned 
long base_port) +void __init setup_per_cpu_areas(void) { - if (ppc_md.check_legacy_ioport == NULL) - return 0; - return ppc_md.check_legacy_ioport(base_port); -} -EXPORT_SYMBOL(check_legacy_ioport); + const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; + size_t atom_size; + unsigned long delta; + unsigned int cpu; + int rc; -#ifdef CONFIG_XMON -static int __init early_xmon(char *p) -{ - /* ensure xmon is enabled */ - if (p) { - if (strncmp(p, "on", 2) == 0) - xmon_init(1); - if (strncmp(p, "off", 3) == 0) - xmon_init(0); - if (strncmp(p, "early", 5) != 0) - return 0; - } - xmon_init(1); - debugger(NULL); + /* + * Linear mapping is one of 4K, 1M and 16M. For 4K, no need + * to group units. For larger mappings, use 1M atom which + * should be large enough to contain a number of units. + */ + if (mmu_linear_psize == MMU_PAGE_4K) + atom_size = PAGE_SIZE; + else + atom_size = 1 << 20; - return 0; + rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) { + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; + paca[cpu].data_offset = __per_cpu_offset[cpu]; + } } -early_param("xmon", early_xmon); #endif -void cpu_die(void) +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +unsigned long memory_block_size_bytes(void) { - if (ppc_md.cpu_die) - ppc_md.cpu_die(); + if (ppc_md.memory_block_size) + return ppc_md.memory_block_size(); + + return MIN_MEMORY_BLOCK_SIZE; } +#endif + +#if defined(CONFIG_PPC_INDIRECT_PIO) || defined(CONFIG_PPC_INDIRECT_MMIO) +struct ppc_pci_io ppc_pci_io; +EXPORT_SYMBOL(ppc_pci_io); +#endif diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c new file mode 100644 index 00000000000..1c794cef288 --- /dev/null +++ b/arch/powerpc/kernel/signal.c @@ -0,0 +1,212 @@ +/* + * Common signal handling code for both 
32 and 64 bits + * + * Copyright (c) 2007 Benjamin Herrenschmidt, IBM Coproration + * Extracted from signal_32.c and signal_64.c + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file README.legal in the main directory of + * this archive for more details. + */ + +#include <linux/tracehook.h> +#include <linux/signal.h> +#include <linux/uprobes.h> +#include <linux/key.h> +#include <linux/context_tracking.h> +#include <asm/hw_breakpoint.h> +#include <asm/uaccess.h> +#include <asm/unistd.h> +#include <asm/debug.h> +#include <asm/tm.h> + +#include "signal.h" + +/* Log an error when sending an unhandled signal to a process. Controlled + * through debug.exception-trace sysctl. + */ + +int show_unhandled_signals = 1; + +/* + * Allocate space for the signal frame + */ +void __user * get_sigframe(struct k_sigaction *ka, unsigned long sp, + size_t frame_size, int is_32) +{ + unsigned long oldsp, newsp; + + /* Default to using normal stack */ + oldsp = get_clean_sp(sp, is_32); + + /* Check for alt stack */ + if ((ka->sa.sa_flags & SA_ONSTACK) && + current->sas_ss_size && !on_sig_stack(oldsp)) + oldsp = (current->sas_ss_sp + current->sas_ss_size); + + /* Get aligned frame */ + newsp = (oldsp - frame_size) & ~0xFUL; + + /* Check access */ + if (!access_ok(VERIFY_WRITE, (void __user *)newsp, oldsp - newsp)) + return NULL; + + return (void __user *)newsp; +} + +static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka, + int has_handler) +{ + unsigned long ret = regs->gpr[3]; + int restart = 1; + + /* syscall ? */ + if (TRAP(regs) != 0x0C00) + return; + + /* error signalled ? */ + if (!(regs->ccr & 0x10000000)) + return; + + switch (ret) { + case ERESTART_RESTARTBLOCK: + case ERESTARTNOHAND: + /* ERESTARTNOHAND means that the syscall should only be + * restarted if there was no handler for the signal, and since + * we only get here if there is a handler, we dont restart. 
+ */ + restart = !has_handler; + break; + case ERESTARTSYS: + /* ERESTARTSYS means to restart the syscall if there is no + * handler or the handler was registered with SA_RESTART + */ + restart = !has_handler || (ka->sa.sa_flags & SA_RESTART) != 0; + break; + case ERESTARTNOINTR: + /* ERESTARTNOINTR means that the syscall should be + * called again after the signal handler returns. + */ + break; + default: + return; + } + if (restart) { + if (ret == ERESTART_RESTARTBLOCK) + regs->gpr[0] = __NR_restart_syscall; + else + regs->gpr[3] = regs->orig_gpr3; + regs->nip -= 4; + regs->result = 0; + } else { + regs->result = -EINTR; + regs->gpr[3] = EINTR; + regs->ccr |= 0x10000000; + } +} + +static int do_signal(struct pt_regs *regs) +{ + sigset_t *oldset = sigmask_to_save(); + siginfo_t info; + int signr; + struct k_sigaction ka; + int ret; + int is32 = is_32bit_task(); + + signr = get_signal_to_deliver(&info, &ka, regs, NULL); + + /* Is there any syscall restart business here ? */ + check_syscall_restart(regs, &ka, signr > 0); + + if (signr <= 0) { + /* No signal to deliver -- put the saved sigmask back */ + restore_saved_sigmask(); + regs->trap = 0; + return 0; /* no signals delivered */ + } + +#ifndef CONFIG_PPC_ADV_DEBUG_REGS + /* + * Reenable the DABR before delivering the signal to + * user space. The DABR will have been cleared if it + * triggered inside the kernel. 
+ */ + if (current->thread.hw_brk.address && + current->thread.hw_brk.type) + __set_breakpoint(&current->thread.hw_brk); +#endif + /* Re-enable the breakpoints for the signal stack */ + thread_change_pc(current, regs); + + if (is32) { + if (ka.sa.sa_flags & SA_SIGINFO) + ret = handle_rt_signal32(signr, &ka, &info, oldset, + regs); + else + ret = handle_signal32(signr, &ka, &info, oldset, + regs); + } else { + ret = handle_rt_signal64(signr, &ka, &info, oldset, regs); + } + + regs->trap = 0; + if (ret) { + signal_delivered(signr, &info, &ka, regs, + test_thread_flag(TIF_SINGLESTEP)); + } + + return ret; +} + +void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) +{ + user_exit(); + + if (thread_info_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + + if (thread_info_flags & _TIF_SIGPENDING) + do_signal(regs); + + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + + user_enter(); +} + +unsigned long get_tm_stackpointer(struct pt_regs *regs) +{ + /* When in an active transaction that takes a signal, we need to be + * careful with the stack. It's possible that the stack has moved back + * up after the tbegin. The obvious case here is when the tbegin is + * called inside a function that returns before a tend. In this case, + * the stack is part of the checkpointed transactional memory state. + * If we write over this non transactionally or in suspend, we are in + * trouble because if we get a tm abort, the program counter and stack + * pointer will be back at the tbegin but our in memory stack won't be + * valid anymore. + * + * To avoid this, when taking a signal in an active transaction, we + * need to use the stack pointer from the checkpointed state, rather + * than the speculated state. This ensures that the signal context + * (written tm suspended) will be written below the stack required for + * the rollback. 
The transaction is aborted because of the treclaim, + * so any memory written between the tbegin and the signal will be + * rolled back anyway. + * + * For signals taken in non-TM or suspended mode, we use the + * normal/non-checkpointed stack pointer. + */ + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (MSR_TM_ACTIVE(regs->msr)) { + tm_reclaim_current(TM_CAUSE_SIGNAL); + if (MSR_TM_TRANSACTIONAL(regs->msr)) + return current->thread.ckpt_regs.gpr[1]; + } +#endif + return regs->gpr[1]; +} diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h new file mode 100644 index 00000000000..c69b9aeb9f2 --- /dev/null +++ b/arch/powerpc/kernel/signal.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2007 Benjamin Herrenschmidt, IBM Coproration + * Extracted from signal_32.c and signal_64.c + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file README.legal in the main directory of + * this archive for more details. + */ + +#ifndef _POWERPC_ARCH_SIGNAL_H +#define _POWERPC_ARCH_SIGNAL_H + +extern void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags); + +extern void __user * get_sigframe(struct k_sigaction *ka, unsigned long sp, + size_t frame_size, int is_32); + +extern int handle_signal32(unsigned long sig, struct k_sigaction *ka, + siginfo_t *info, sigset_t *oldset, + struct pt_regs *regs); + +extern int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, + siginfo_t *info, sigset_t *oldset, + struct pt_regs *regs); + +extern unsigned long copy_fpr_to_user(void __user *to, + struct task_struct *task); +extern unsigned long copy_transact_fpr_to_user(void __user *to, + struct task_struct *task); +extern unsigned long copy_fpr_from_user(struct task_struct *task, + void __user *from); +extern unsigned long copy_transact_fpr_from_user(struct task_struct *task, + void __user *from); +#ifdef CONFIG_VSX +extern unsigned long copy_vsx_to_user(void __user *to, + struct task_struct *task); 
+extern unsigned long copy_transact_vsx_to_user(void __user *to, + struct task_struct *task); +extern unsigned long copy_vsx_from_user(struct task_struct *task, + void __user *from); +extern unsigned long copy_transact_vsx_from_user(struct task_struct *task, + void __user *from); +#endif + +#ifdef CONFIG_PPC64 + +extern int handle_rt_signal64(int signr, struct k_sigaction *ka, + siginfo_t *info, sigset_t *set, + struct pt_regs *regs); + +#else /* CONFIG_PPC64 */ + +static inline int handle_rt_signal64(int signr, struct k_sigaction *ka, + siginfo_t *info, sigset_t *set, + struct pt_regs *regs) +{ + return -EFAULT; +} + +#endif /* !defined(CONFIG_PPC64) */ + +#endif /* _POWERPC_ARCH_SIGNAL_H */ diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 444c3e81884..1bc5a1755ed 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -17,51 +17,46 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/config.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/kernel.h> #include <linux/signal.h> #include <linux/errno.h> #include <linux/elf.h> +#include <linux/ptrace.h> +#include <linux/ratelimit.h> #ifdef CONFIG_PPC64 #include <linux/syscalls.h> #include <linux/compat.h> -#include <linux/ptrace.h> #else #include <linux/wait.h> -#include <linux/ptrace.h> #include <linux/unistd.h> #include <linux/stddef.h> #include <linux/tty.h> #include <linux/binfmts.h> -#include <linux/suspend.h> #endif #include <asm/uaccess.h> #include <asm/cacheflush.h> +#include <asm/syscalls.h> +#include <asm/sigcontext.h> +#include <asm/vdso.h> +#include <asm/switch_to.h> +#include <asm/tm.h> #ifdef CONFIG_PPC64 -#include <asm/ppc32.h> -#include <asm/ppcdebug.h> +#include "ppc32.h" #include <asm/unistd.h> -#include <asm/vdso.h> #else #include <asm/ucontext.h> #include <asm/pgtable.h> #endif -#undef DEBUG_SIG +#include "signal.h" -#define _BLOCKABLE 
(~(sigmask(SIGKILL) | sigmask(SIGSTOP))) #ifdef CONFIG_PPC64 -#define do_signal do_signal32 -#define sys_sigsuspend compat_sys_sigsuspend -#define sys_rt_sigsuspend compat_sys_rt_sigsuspend #define sys_rt_sigreturn compat_sys_rt_sigreturn -#define sys_sigaction compat_sys_sigaction #define sys_swapcontext compat_sys_swapcontext #define sys_sigreturn compat_sys_sigreturn @@ -70,13 +65,21 @@ #define mcontext mcontext32 #define ucontext ucontext32 +#define __save_altstack __compat_save_altstack + +/* + * Userspace code may pass a ucontext which doesn't include VSX added + * at the end. We need to check for this case. + */ +#define UCONTEXTSIZEWITHOUTVSX \ + (sizeof(struct ucontext) - sizeof(elf_vsrreghalf_t32)) + /* * Returning 0 means we return to userspace via * ret_from_except and thus restore all user * registers from *regs. This is what we need * to do when a signal has been delivered. */ -#define sigreturn_exit(regs) return 0 #define GP_REGS_SIZE min(sizeof(elf_gregset_t32), sizeof(struct pt_regs32)) #undef __SIGNAL_FRAMESIZE @@ -93,7 +96,7 @@ static inline int put_sigset_t(compat_sigset_t __user *uset, sigset_t *set) compat_sigset_t cset; switch (_NSIG_WORDS) { - case 4: cset.sig[5] = set->sig[3] & 0xffffffffull; + case 4: cset.sig[6] = set->sig[3] & 0xffffffffull; cset.sig[7] = set->sig[3] >> 32; case 3: cset.sig[4] = set->sig[2] & 0xffffffffull; cset.sig[5] = set->sig[2] >> 32; @@ -126,28 +129,7 @@ static inline int get_sigset_t(sigset_t *set, return 0; } -static inline int get_old_sigaction(struct k_sigaction *new_ka, - struct old_sigaction __user *act) -{ - compat_old_sigset_t mask; - compat_uptr_t handler, restorer; - - if (get_user(handler, &act->sa_handler) || - __get_user(restorer, &act->sa_restorer) || - __get_user(new_ka->sa.sa_flags, &act->sa_flags) || - __get_user(mask, &act->sa_mask)) - return -EFAULT; - new_ka->sa.sa_handler = compat_ptr(handler); - new_ka->sa.sa_restorer = compat_ptr(restorer); - siginitset(&new_ka->sa.sa_mask, mask); - return 0; 
-} - -static inline compat_uptr_t to_user_ptr(void *kp) -{ - return (compat_uptr_t)(u64)kp; -} - +#define to_user_ptr(p) ptr_to_compat(p) #define from_user_ptr(p) compat_ptr(p) static inline int save_general_regs(struct pt_regs *regs, @@ -156,9 +138,14 @@ static inline int save_general_regs(struct pt_regs *regs, elf_greg_t64 *gregs = (elf_greg_t64 *)regs; int i; - for (i = 0; i <= PT_RESULT; i ++) + WARN_ON(!FULL_REGS(regs)); + + for (i = 0; i <= PT_RESULT; i ++) { + if (i == 14 && !FULL_REGS(regs)) + i = 32; if (__put_user((unsigned int)gregs[i], &frame->mc_gregs[i])) return -EFAULT; + } return 0; } @@ -179,8 +166,6 @@ static inline int restore_general_regs(struct pt_regs *regs, #else /* CONFIG_PPC64 */ -extern void sigreturn_exit(struct pt_regs *); - #define GP_REGS_SIZE min(sizeof(elf_gregset_t), sizeof(struct pt_regs)) static inline int put_sigset_t(sigset_t __user *uset, sigset_t *set) @@ -193,27 +178,13 @@ static inline int get_sigset_t(sigset_t *set, const sigset_t __user *uset) return copy_from_user(set, uset, sizeof(*uset)); } -static inline int get_old_sigaction(struct k_sigaction *new_ka, - struct old_sigaction __user *act) -{ - old_sigset_t mask; - - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || - __get_user(new_ka->sa.sa_handler, &act->sa_handler) || - __get_user(new_ka->sa.sa_restorer, &act->sa_restorer)) - return -EFAULT; - __get_user(new_ka->sa.sa_flags, &act->sa_flags); - __get_user(mask, &act->sa_mask); - siginitset(&new_ka->sa.sa_mask, mask); - return 0; -} - -#define to_user_ptr(p) (p) -#define from_user_ptr(p) (p) +#define to_user_ptr(p) ((unsigned long)(p)) +#define from_user_ptr(p) ((void __user *)(p)) static inline int save_general_regs(struct pt_regs *regs, struct mcontext __user *frame) { + WARN_ON(!FULL_REGS(regs)); return __copy_to_user(&frame->mc_gregs, regs, GP_REGS_SIZE); } @@ -230,124 +201,27 @@ static inline int restore_general_regs(struct pt_regs *regs, return -EFAULT; return 0; } - -#endif /* CONFIG_PPC64 */ - -int 
do_signal(sigset_t *oldset, struct pt_regs *regs); - -/* - * Atomically swap in the new signal mask, and wait for a signal. - */ -long sys_sigsuspend(old_sigset_t mask, int p2, int p3, int p4, int p6, int p7, - struct pt_regs *regs) -{ - sigset_t saveset; - - mask &= _BLOCKABLE; - spin_lock_irq(&current->sighand->siglock); - saveset = current->blocked; - siginitset(&current->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - regs->result = -EINTR; - regs->gpr[3] = EINTR; - regs->ccr |= 0x10000000; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(&saveset, regs)) - sigreturn_exit(regs); - } -} - -long sys_rt_sigsuspend( -#ifdef CONFIG_PPC64 - compat_sigset_t __user *unewset, -#else - sigset_t __user *unewset, -#endif - size_t sigsetsize, int p3, int p4, - int p6, int p7, struct pt_regs *regs) -{ - sigset_t saveset, newset; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (get_sigset_t(&newset, unewset)) - return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - spin_lock_irq(&current->sighand->siglock); - saveset = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - regs->result = -EINTR; - regs->gpr[3] = EINTR; - regs->ccr |= 0x10000000; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(&saveset, regs)) - sigreturn_exit(regs); - } -} - -#ifdef CONFIG_PPC32 -long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, int r5, - int r6, int r7, int r8, struct pt_regs *regs) -{ - return do_sigaltstack(uss, uoss, regs->gpr[1]); -} #endif -long sys_sigaction(int sig, struct old_sigaction __user *act, - struct old_sigaction __user *oact) -{ - struct k_sigaction new_ka, old_ka; - int ret; - -#ifdef CONFIG_PPC64 - if (sig < 0) - sig = -sig; -#endif - - if (act) { - if (get_old_sigaction(&new_ka, act)) - return -EFAULT; - } - - ret = 
do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(to_user_ptr(old_ka.sa.sa_handler), - &oact->sa_handler) || - __put_user(to_user_ptr(old_ka.sa.sa_restorer), - &oact->sa_restorer) || - __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || - __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) - return -EFAULT; - } - - return ret; -} - /* * When we have signals to deliver, we set up on the * user stack, going down from the original stack pointer: - * a sigregs struct + * an ABI gap of 56 words + * an mcontext struct * a sigcontext struct * a gap of __SIGNAL_FRAMESIZE bytes * - * Each of these things must be a multiple of 16 bytes in size. + * Each of these things must be a multiple of 16 bytes in size. The following + * structure represent all of this except the __SIGNAL_FRAMESIZE gap * */ -struct sigregs { +struct sigframe { + struct sigcontext sctx; /* the sigcontext */ struct mcontext mctx; /* all the register values */ +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + struct sigcontext sctx_transact; + struct mcontext mctx_transact; +#endif /* * Programs using the rs6000/xcoff abi can save up to 19 gp * regs and 18 fp regs below sp before decrementing it. @@ -376,6 +250,9 @@ struct rt_sigframe { struct siginfo info; #endif struct ucontext uc; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + struct ucontext uc_transact; +#endif /* * Programs using the rs6000/xcoff abi can save up to 19 gp * regs and 18 fp regs below sp before decrementing it. 
@@ -383,39 +260,174 @@ struct rt_sigframe { int abigap[56]; }; +#ifdef CONFIG_VSX +unsigned long copy_fpr_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NFPREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + buf[i] = task->thread.TS_FPR(i); + buf[i] = task->thread.fp_state.fpscr; + return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double)); +} + +unsigned long copy_fpr_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NFPREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double))) + return 1; + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + task->thread.TS_FPR(i) = buf[i]; + task->thread.fp_state.fpscr = buf[i]; + + return 0; +} + +unsigned long copy_vsx_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < ELF_NVSRHALFREG; i++) + buf[i] = task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; + return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double)); +} + +unsigned long copy_vsx_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double))) + return 1; + for (i = 0; i < ELF_NVSRHALFREG ; i++) + task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + return 0; +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +unsigned long copy_transact_fpr_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NFPREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + buf[i] = task->thread.TS_TRANS_FPR(i); + buf[i] = task->thread.transact_fp.fpscr; + return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double)); +} + +unsigned long copy_transact_fpr_from_user(struct task_struct *task, + void __user *from) +{ + u64 
buf[ELF_NFPREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double))) + return 1; + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + task->thread.TS_TRANS_FPR(i) = buf[i]; + task->thread.transact_fp.fpscr = buf[i]; + + return 0; +} + +unsigned long copy_transact_vsx_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < ELF_NVSRHALFREG; i++) + buf[i] = task->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET]; + return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double)); +} + +unsigned long copy_transact_vsx_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double))) + return 1; + for (i = 0; i < ELF_NVSRHALFREG ; i++) + task->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + return 0; +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ +#else +inline unsigned long copy_fpr_to_user(void __user *to, + struct task_struct *task) +{ + return __copy_to_user(to, task->thread.fp_state.fpr, + ELF_NFPREG * sizeof(double)); +} + +inline unsigned long copy_fpr_from_user(struct task_struct *task, + void __user *from) +{ + return __copy_from_user(task->thread.fp_state.fpr, from, + ELF_NFPREG * sizeof(double)); +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +inline unsigned long copy_transact_fpr_to_user(void __user *to, + struct task_struct *task) +{ + return __copy_to_user(to, task->thread.transact_fp.fpr, + ELF_NFPREG * sizeof(double)); +} + +inline unsigned long copy_transact_fpr_from_user(struct task_struct *task, + void __user *from) +{ + return __copy_from_user(task->thread.transact_fp.fpr, from, + ELF_NFPREG * sizeof(double)); +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ +#endif + /* * Save the current user registers on the user stack. 
* We only save the altivec/spe registers if the process has used * altivec/spe instructions at some point. */ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, - int sigret) + struct mcontext __user *tm_frame, int sigret, + int ctx_has_vsx_region) { -#ifdef CONFIG_PPC32 - CHECK_FULL_REGS(regs); -#endif + unsigned long msr = regs->msr; + /* Make sure floating point registers are stored in regs */ flush_fp_to_thread(current); - /* save general and floating-point registers */ - if (save_general_regs(regs, frame) || - __copy_to_user(&frame->mc_fregs, current->thread.fpr, - ELF_NFPREG * sizeof(double))) + /* save general registers */ + if (save_general_regs(regs, frame)) return 1; - current->thread.fpscr.val = 0; /* turn off all fp exceptions */ - #ifdef CONFIG_ALTIVEC /* save altivec registers */ if (current->thread.used_vr) { flush_altivec_to_thread(current); - if (__copy_to_user(&frame->mc_vregs, current->thread.vr, + if (__copy_to_user(&frame->mc_vregs, &current->thread.vr_state, ELF_NVRREG * sizeof(vector128))) return 1; /* set MSR_VEC in the saved MSR value to indicate that frame->mc_vregs contains valid data */ - if (__put_user(regs->msr | MSR_VEC, &frame->mc_gregs[PT_MSR])) - return 1; + msr |= MSR_VEC; } /* else assert((regs->msr & MSR_VEC) == 0) */ @@ -423,11 +435,35 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, * use altivec. Since VSCR only contains 32 bits saved in the least * significant bits of a vector, we "cheat" and stuff VRSAVE in the * most significant bits of that same vector. --BenH + * Note that the current VRSAVE value is in the SPR at this point. 
*/ + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + current->thread.vrsave = mfspr(SPRN_VRSAVE); if (__put_user(current->thread.vrsave, (u32 __user *)&frame->mc_vregs[32])) return 1; #endif /* CONFIG_ALTIVEC */ + if (copy_fpr_to_user(&frame->mc_fregs, current)) + return 1; + /* + * Clear the MSR VSX bit to indicate there is no valid state attached + * to this context, except in the specific case below where we set it. + */ + msr &= ~MSR_VSX; +#ifdef CONFIG_VSX + /* + * Copy VSR 0-31 upper half from thread_struct to local + * buffer, then write that to userspace. Also set MSR_VSX in + * the saved MSR value to indicate that frame->mc_vregs + * contains valid data + */ + if (current->thread.used_vsr && ctx_has_vsx_region) { + __giveup_vsx(current); + if (copy_vsx_to_user(&frame->mc_vsregs, current)) + return 1; + msr |= MSR_VSX; + } +#endif /* CONFIG_VSX */ #ifdef CONFIG_SPE /* save spe registers */ if (current->thread.used_spe) { @@ -437,8 +473,7 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, return 1; /* set MSR_SPE in the saved MSR value to indicate that frame->mc_vregs contains valid data */ - if (__put_user(regs->msr | MSR_SPE, &frame->mc_gregs[PT_MSR])) - return 1; + msr |= MSR_SPE; } /* else assert((regs->msr & MSR_SPE) == 0) */ @@ -447,6 +482,166 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, return 1; #endif /* CONFIG_SPE */ + if (__put_user(msr, &frame->mc_gregs[PT_MSR])) + return 1; + /* We need to write 0 the MSR top 32 bits in the tm frame so that we + * can check it on the restore to see if TM is active + */ + if (tm_frame && __put_user(0, &tm_frame->mc_gregs[PT_MSR])) + return 1; + + if (sigret) { + /* Set up the sigreturn trampoline: li r0,sigret; sc */ + if (__put_user(0x38000000UL + sigret, &frame->tramp[0]) + || __put_user(0x44000002UL, &frame->tramp[1])) + return 1; + flush_icache_range((unsigned long) &frame->tramp[0], + (unsigned long) &frame->tramp[2]); + } + + return 0; +} + +#ifdef 
CONFIG_PPC_TRANSACTIONAL_MEM +/* + * Save the current user registers on the user stack. + * We only save the altivec/spe registers if the process has used + * altivec/spe instructions at some point. + * We also save the transactional registers to a second ucontext in the + * frame. + * + * See save_user_regs() and signal_64.c:setup_tm_sigcontexts(). + */ +static int save_tm_user_regs(struct pt_regs *regs, + struct mcontext __user *frame, + struct mcontext __user *tm_frame, int sigret) +{ + unsigned long msr = regs->msr; + + /* Remove TM bits from thread's MSR. The MSR in the sigcontext + * just indicates to userland that we were doing a transaction, but we + * don't want to return in transactional state. This also ensures + * that flush_fp_to_thread won't set TIF_RESTORE_TM again. + */ + regs->msr &= ~MSR_TS_MASK; + + /* Make sure floating point registers are stored in regs */ + flush_fp_to_thread(current); + + /* Save both sets of general registers */ + if (save_general_regs(&current->thread.ckpt_regs, frame) + || save_general_regs(regs, tm_frame)) + return 1; + + /* Stash the top half of the 64bit MSR into the 32bit MSR word + * of the transactional mcontext. This way we have a backward-compatible + * MSR in the 'normal' (checkpointed) mcontext and additionally one can + * also look at what type of transaction (T or S) was active at the + * time of the signal. 
+ */ + if (__put_user((msr >> 32), &tm_frame->mc_gregs[PT_MSR])) + return 1; + +#ifdef CONFIG_ALTIVEC + /* save altivec registers */ + if (current->thread.used_vr) { + flush_altivec_to_thread(current); + if (__copy_to_user(&frame->mc_vregs, &current->thread.vr_state, + ELF_NVRREG * sizeof(vector128))) + return 1; + if (msr & MSR_VEC) { + if (__copy_to_user(&tm_frame->mc_vregs, + &current->thread.transact_vr, + ELF_NVRREG * sizeof(vector128))) + return 1; + } else { + if (__copy_to_user(&tm_frame->mc_vregs, + &current->thread.vr_state, + ELF_NVRREG * sizeof(vector128))) + return 1; + } + + /* set MSR_VEC in the saved MSR value to indicate that + * frame->mc_vregs contains valid data + */ + msr |= MSR_VEC; + } + + /* We always copy to/from vrsave, it's 0 if we don't have or don't + * use altivec. Since VSCR only contains 32 bits saved in the least + * significant bits of a vector, we "cheat" and stuff VRSAVE in the + * most significant bits of that same vector. --BenH + */ + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + current->thread.vrsave = mfspr(SPRN_VRSAVE); + if (__put_user(current->thread.vrsave, + (u32 __user *)&frame->mc_vregs[32])) + return 1; + if (msr & MSR_VEC) { + if (__put_user(current->thread.transact_vrsave, + (u32 __user *)&tm_frame->mc_vregs[32])) + return 1; + } else { + if (__put_user(current->thread.vrsave, + (u32 __user *)&tm_frame->mc_vregs[32])) + return 1; + } +#endif /* CONFIG_ALTIVEC */ + + if (copy_fpr_to_user(&frame->mc_fregs, current)) + return 1; + if (msr & MSR_FP) { + if (copy_transact_fpr_to_user(&tm_frame->mc_fregs, current)) + return 1; + } else { + if (copy_fpr_to_user(&tm_frame->mc_fregs, current)) + return 1; + } + +#ifdef CONFIG_VSX + /* + * Copy VSR 0-31 upper half from thread_struct to local + * buffer, then write that to userspace. 
Also set MSR_VSX in + * the saved MSR value to indicate that frame->mc_vregs + * contains valid data + */ + if (current->thread.used_vsr) { + __giveup_vsx(current); + if (copy_vsx_to_user(&frame->mc_vsregs, current)) + return 1; + if (msr & MSR_VSX) { + if (copy_transact_vsx_to_user(&tm_frame->mc_vsregs, + current)) + return 1; + } else { + if (copy_vsx_to_user(&tm_frame->mc_vsregs, current)) + return 1; + } + + msr |= MSR_VSX; + } +#endif /* CONFIG_VSX */ +#ifdef CONFIG_SPE + /* SPE regs are not checkpointed with TM, so this section is + * simply the same as in save_user_regs(). + */ + if (current->thread.used_spe) { + flush_spe_to_thread(current); + if (__copy_to_user(&frame->mc_vregs, current->thread.evr, + ELF_NEVRREG * sizeof(u32))) + return 1; + /* set MSR_SPE in the saved MSR value to indicate that + * frame->mc_vregs contains valid data */ + msr |= MSR_SPE; + } + + /* We always copy to/from spefscr */ + if (__put_user(current->thread.spefscr, (u32 __user *)&frame->mc_vregs + ELF_NEVRREG)) + return 1; +#endif /* CONFIG_SPE */ + + if (__put_user(msr, &frame->mc_gregs[PT_MSR])) + return 1; if (sigret) { /* Set up the sigreturn trampoline: li r0,sigret; sc */ if (__put_user(0x38000000UL + sigret, &frame->tramp[0]) @@ -458,6 +653,7 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, return 0; } +#endif /* * Restore the current user register values from the user stack, @@ -468,8 +664,9 @@ static long restore_user_regs(struct pt_regs *regs, { long err; unsigned int save_r2 = 0; -#if defined(CONFIG_ALTIVEC) || defined(CONFIG_SPE) unsigned long msr; +#ifdef CONFIG_VSX + int i; #endif /* @@ -479,40 +676,78 @@ static long restore_user_regs(struct pt_regs *regs, if (!sig) save_r2 = (unsigned int)regs->gpr[2]; err = restore_general_regs(regs, sr); + regs->trap = 0; + err |= __get_user(msr, &sr->mc_gregs[PT_MSR]); if (!sig) regs->gpr[2] = (unsigned long) save_r2; if (err) return 1; - /* force the process to reload the FP registers from - 
current->thread when it next does FP instructions */ - regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1); - if (__copy_from_user(current->thread.fpr, &sr->mc_fregs, - sizeof(sr->mc_fregs))) - return 1; + /* if doing signal return, restore the previous little-endian mode */ + if (sig) + regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); + + /* + * Do this before updating the thread state in + * current->thread.fpr/vr/evr. That way, if we get preempted + * and another task grabs the FPU/Altivec/SPE, it won't be + * tempted to save the current CPU state into the thread_struct + * and corrupt what we are writing there. + */ + discard_lazy_cpu_state(); #ifdef CONFIG_ALTIVEC - /* force the process to reload the altivec registers from - current->thread when it next does altivec instructions */ + /* + * Force the process to reload the altivec registers from + * current->thread when it next does altivec instructions + */ regs->msr &= ~MSR_VEC; - if (!__get_user(msr, &sr->mc_gregs[PT_MSR]) && (msr & MSR_VEC) != 0) { + if (msr & MSR_VEC) { /* restore altivec registers from the stack */ - if (__copy_from_user(current->thread.vr, &sr->mc_vregs, + if (__copy_from_user(&current->thread.vr_state, &sr->mc_vregs, sizeof(sr->mc_vregs))) return 1; } else if (current->thread.used_vr) - memset(current->thread.vr, 0, ELF_NVRREG * sizeof(vector128)); + memset(&current->thread.vr_state, 0, + ELF_NVRREG * sizeof(vector128)); /* Always get VRSAVE back */ if (__get_user(current->thread.vrsave, (u32 __user *)&sr->mc_vregs[32])) return 1; + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + mtspr(SPRN_VRSAVE, current->thread.vrsave); #endif /* CONFIG_ALTIVEC */ + if (copy_fpr_from_user(current, &sr->mc_fregs)) + return 1; + +#ifdef CONFIG_VSX + /* + * Force the process to reload the VSX registers from + * current->thread when it next does VSX instruction.
+ */ + regs->msr &= ~MSR_VSX; + if (msr & MSR_VSX) { + /* + * Restore altivec registers from the stack to a local + * buffer, then write this out to the thread_struct + */ + if (copy_vsx_from_user(current, &sr->mc_vsregs)) + return 1; + } else if (current->thread.used_vsr) + for (i = 0; i < 32 ; i++) + current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0; +#endif /* CONFIG_VSX */ + /* + * force the process to reload the FP registers from + * current->thread when it next does FP instructions + */ + regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1); #ifdef CONFIG_SPE /* force the process to reload the spe registers from current->thread when it next does spe instructions */ regs->msr &= ~MSR_SPE; - if (!__get_user(msr, &sr->mc_gregs[PT_MSR]) && (msr & MSR_SPE) != 0) { + if (msr & MSR_SPE) { /* restore spe registers from the stack */ if (__copy_from_user(current->thread.evr, &sr->mc_vregs, ELF_NEVRREG * sizeof(u32))) @@ -525,106 +760,154 @@ static long restore_user_regs(struct pt_regs *regs, return 1; #endif /* CONFIG_SPE */ -#ifndef CONFIG_SMP - preempt_disable(); - if (last_task_used_math == current) - last_task_used_math = NULL; - if (last_task_used_altivec == current) - last_task_used_altivec = NULL; -#ifdef CONFIG_SPE - if (last_task_used_spe == current) - last_task_used_spe = NULL; -#endif - preempt_enable(); -#endif return 0; } -#ifdef CONFIG_PPC64 -long compat_sys_rt_sigaction(int sig, const struct sigaction32 __user *act, - struct sigaction32 __user *oact, size_t sigsetsize) +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * Restore the current user register values from the user stack, except for + * MSR, and recheckpoint the original checkpointed register state for processes + * in transactions. 
+ */ +static long restore_tm_user_regs(struct pt_regs *regs, + struct mcontext __user *sr, + struct mcontext __user *tm_sr) { - struct k_sigaction new_ka, old_ka; - int ret; + long err; + unsigned long msr, msr_hi; +#ifdef CONFIG_VSX + int i; +#endif - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; + /* + * restore general registers but not including MSR or SOFTE. Also + * take care of keeping r2 (TLS) intact if not a signal. + * See comment in signal_64.c:restore_tm_sigcontexts(); + * TFHAR is restored from the checkpointed NIP; TEXASR and TFIAR + * were set by the signal delivery. + */ + err = restore_general_regs(regs, tm_sr); + err |= restore_general_regs(&current->thread.ckpt_regs, sr); - if (act) { - compat_uptr_t handler; + err |= __get_user(current->thread.tm_tfhar, &sr->mc_gregs[PT_NIP]); - ret = get_user(handler, &act->sa_handler); - new_ka.sa.sa_handler = compat_ptr(handler); - ret |= get_sigset_t(&new_ka.sa.sa_mask, &act->sa_mask); - ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); - if (ret) - return -EFAULT; - } + err |= __get_user(msr, &sr->mc_gregs[PT_MSR]); + if (err) + return 1; - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - if (!ret && oact) { - ret = put_user((long)old_ka.sa.sa_handler, &oact->sa_handler); - ret |= put_sigset_t(&oact->sa_mask, &old_ka.sa.sa_mask); - ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); - } - return ret; -} + /* Restore the previous little-endian mode */ + regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); -/* - * Note: it is necessary to treat how as an unsigned int, with the - * corresponding cast to a signed int to insure that the proper - * conversion (sign extension) between the register representation - * of a signed int (msr in 32-bit mode) and the register representation - * of a signed int (msr in 64-bit mode) is performed.
- */ -long compat_sys_rt_sigprocmask(u32 how, compat_sigset_t __user *set, - compat_sigset_t __user *oset, size_t sigsetsize) -{ - sigset_t s; - sigset_t __user *up; - int ret; - mm_segment_t old_fs = get_fs(); + /* + * Do this before updating the thread state in + * current->thread.fpr/vr/evr. That way, if we get preempted + * and another task grabs the FPU/Altivec/SPE, it won't be + * tempted to save the current CPU state into the thread_struct + * and corrupt what we are writing there. + */ + discard_lazy_cpu_state(); - if (set) { - if (get_sigset_t(&s, set)) - return -EFAULT; +#ifdef CONFIG_ALTIVEC + regs->msr &= ~MSR_VEC; + if (msr & MSR_VEC) { + /* restore altivec registers from the stack */ + if (__copy_from_user(&current->thread.vr_state, &sr->mc_vregs, + sizeof(sr->mc_vregs)) || + __copy_from_user(&current->thread.transact_vr, + &tm_sr->mc_vregs, + sizeof(sr->mc_vregs))) + return 1; + } else if (current->thread.used_vr) { + memset(&current->thread.vr_state, 0, + ELF_NVRREG * sizeof(vector128)); + memset(&current->thread.transact_vr, 0, + ELF_NVRREG * sizeof(vector128)); } - set_fs(KERNEL_DS); - /* This is valid because of the set_fs() */ - up = (sigset_t __user *) &s; - ret = sys_rt_sigprocmask((int)how, set ? up : NULL, oset ?
up : NULL, - sigsetsize); - set_fs(old_fs); - if (ret) - return ret; - if (oset) { - if (put_sigset_t(oset, &s)) - return -EFAULT; - } - return 0; -} + /* Always get VRSAVE back */ + if (__get_user(current->thread.vrsave, + (u32 __user *)&sr->mc_vregs[32]) || + __get_user(current->thread.transact_vrsave, + (u32 __user *)&tm_sr->mc_vregs[32])) + return 1; + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + mtspr(SPRN_VRSAVE, current->thread.vrsave); +#endif /* CONFIG_ALTIVEC */ -long compat_sys_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) -{ - sigset_t s; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - /* The __user pointer cast is valid because of the set_fs() */ - ret = sys_rt_sigpending((sigset_t __user *) &s, sigsetsize); - set_fs(old_fs); - if (!ret) { - if (put_sigset_t(set, &s)) - return -EFAULT; + regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1); + + if (copy_fpr_from_user(current, &sr->mc_fregs) || + copy_transact_fpr_from_user(current, &tm_sr->mc_fregs)) + return 1; + +#ifdef CONFIG_VSX + regs->msr &= ~MSR_VSX; + if (msr & MSR_VSX) { + /* + * Restore altivec registers from the stack to a local + * buffer, then write this out to the thread_struct + */ + if (copy_vsx_from_user(current, &sr->mc_vsregs) || + copy_transact_vsx_from_user(current, &tm_sr->mc_vsregs)) + return 1; + } else if (current->thread.used_vsr) + for (i = 0; i < 32 ; i++) { + current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0; + current->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET] = 0; + } +#endif /* CONFIG_VSX */ + +#ifdef CONFIG_SPE + /* SPE regs are not checkpointed with TM, so this section is + * simply the same as in restore_user_regs(). 
+ */ + regs->msr &= ~MSR_SPE; + if (msr & MSR_SPE) { + if (__copy_from_user(current->thread.evr, &sr->mc_vregs, + ELF_NEVRREG * sizeof(u32))) + return 1; + } else if (current->thread.used_spe) + memset(current->thread.evr, 0, ELF_NEVRREG * sizeof(u32)); + + /* Always get SPEFSCR back */ + if (__get_user(current->thread.spefscr, (u32 __user *)&sr->mc_vregs + + ELF_NEVRREG)) + return 1; +#endif /* CONFIG_SPE */ + + /* Now, recheckpoint. This loads up all of the checkpointed (older) + * registers, including FP and V[S]Rs. After recheckpointing, the + * transactional versions should be loaded. + */ + tm_enable(); + /* Make sure the transaction is marked as failed */ + current->thread.tm_texasr |= TEXASR_FS; + /* This loads the checkpointed FP/VEC state, if used */ + tm_recheckpoint(&current->thread, msr); + /* Get the top half of the MSR */ + if (__get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR])) + return 1; + /* Pull in MSR TM from user context */ + regs->msr = (regs->msr & ~MSR_TS_MASK) | ((msr_hi<<32) & MSR_TS_MASK); + + /* This loads the speculative FP/VEC state, if used */ + if (msr & MSR_FP) { + do_load_up_transact_fpu(&current->thread); + regs->msr |= (MSR_FP | current->thread.fpexc_mode); } - return ret; -} +#ifdef CONFIG_ALTIVEC + if (msr & MSR_VEC) { + do_load_up_transact_altivec(&current->thread); + regs->msr |= MSR_VEC; + } +#endif + return 0; +} +#endif -int copy_siginfo_to_user32(struct compat_siginfo __user *d, siginfo_t *s) +#ifdef CONFIG_PPC64 +int copy_siginfo_to_user32(struct compat_siginfo __user *d, const siginfo_t *s) { int err; @@ -681,127 +964,46 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *d, siginfo_t *s) #define copy_siginfo_to_user copy_siginfo_to_user32 -/* - * Note: it is necessary to treat pid and sig as unsigned ints, with the - * corresponding cast to a signed int to insure that the proper conversion - * (sign extension) between the register representation of a signed int - * (msr in 32-bit mode) and the register representation of a signed int
- * (msr in 64-bit mode) is performed. - */ -long compat_sys_rt_sigqueueinfo(u32 pid, u32 sig, compat_siginfo_t __user *uinfo) +int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from) { - siginfo_t info; - int ret; - mm_segment_t old_fs = get_fs(); + memset(to, 0, sizeof *to); - if (copy_from_user (&info, uinfo, 3*sizeof(int)) || - copy_from_user (info._sifields._pad, uinfo->_sifields._pad, SI_PAD_SIZE32)) + if (copy_from_user(to, from, 3*sizeof(int)) || + copy_from_user(to->_sifields._pad, + from->_sifields._pad, SI_PAD_SIZE32)) return -EFAULT; - set_fs (KERNEL_DS); - /* The __user pointer cast is valid becasuse of the set_fs() */ - ret = sys_rt_sigqueueinfo((int)pid, (int)sig, (siginfo_t __user *) &info); - set_fs (old_fs); - return ret; -} -/* - * Start Alternate signal stack support - * - * System Calls - * sigaltatck compat_sys_sigaltstack - */ - -int compat_sys_sigaltstack(u32 __new, u32 __old, int r5, - int r6, int r7, int r8, struct pt_regs *regs) -{ - stack_32_t __user * newstack = (stack_32_t __user *)(long) __new; - stack_32_t __user * oldstack = (stack_32_t __user *)(long) __old; - stack_t uss, uoss; - int ret; - mm_segment_t old_fs; - unsigned long sp; - compat_uptr_t ss_sp; - /* - * set sp to the user stack on entry to the system call - * the system call router sets R9 to the saved registers - */ - sp = regs->gpr[1]; - - /* Put new stack info in local 64 bit stack struct */ - if (newstack) { - if (get_user(ss_sp, &newstack->ss_sp) || - __get_user(uss.ss_flags, &newstack->ss_flags) || - __get_user(uss.ss_size, &newstack->ss_size)) - return -EFAULT; - uss.ss_sp = compat_ptr(ss_sp); - } - - old_fs = get_fs(); - set_fs(KERNEL_DS); - /* The __user pointer casts are valid because of the set_fs() */ - ret = do_sigaltstack( - newstack ? (stack_t __user *) &uss : NULL, - oldstack ? 
(stack_t __user *) &uoss : NULL, - sp); - set_fs(old_fs); - /* Copy the stack information to the user output buffer */ - if (!ret && oldstack && - (put_user((long)uoss.ss_sp, &oldstack->ss_sp) || - __put_user(uoss.ss_flags, &oldstack->ss_flags) || - __put_user(uoss.ss_size, &oldstack->ss_size))) - return -EFAULT; - return ret; + return 0; } #endif /* CONFIG_PPC64 */ - -/* - * Restore the user process's signal mask - */ -#ifdef CONFIG_PPC64 -extern void restore_sigmask(sigset_t *set); -#else /* CONFIG_PPC64 */ -static void restore_sigmask(sigset_t *set) -{ - sigdelsetmask(set, ~_BLOCKABLE); - spin_lock_irq(&current->sighand->siglock); - current->blocked = *set; - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); -} -#endif - /* * Set up a signal frame for a "real-time" signal handler * (one which gets siginfo). */ -static int handle_rt_signal(unsigned long sig, struct k_sigaction *ka, +int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, - struct pt_regs *regs, unsigned long newsp) + struct pt_regs *regs) { struct rt_sigframe __user *rt_sf; struct mcontext __user *frame; - unsigned long origsp = newsp; + struct mcontext __user *tm_frame = NULL; + void __user *addr; + unsigned long newsp = 0; + int sigret; + unsigned long tramp; /* Set up Signal Frame */ /* Put a Real Time Context onto stack */ - newsp -= sizeof(*rt_sf); - rt_sf = (struct rt_sigframe __user *)newsp; - - /* create a stack frame for the caller of the handler */ - newsp -= __SIGNAL_FRAMESIZE + 16; - - if (!access_ok(VERIFY_WRITE, (void __user *)newsp, origsp - newsp)) + rt_sf = get_sigframe(ka, get_tm_stackpointer(regs), sizeof(*rt_sf), 1); + addr = rt_sf; + if (unlikely(rt_sf == NULL)) goto badframe; /* Put the siginfo & fill in most of the ucontext */ if (copy_siginfo_to_user(&rt_sf->info, info) || __put_user(0, &rt_sf->uc.uc_flags) - || __put_user(0, &rt_sf->uc.uc_link) - || __put_user(current->sas_ss_sp, &rt_sf->uc.uc_stack.ss_sp) - ||
__put_user(sas_ss_flags(regs->gpr[1]), - &rt_sf->uc.uc_stack.ss_flags) - || __put_user(current->sas_ss_size, &rt_sf->uc.uc_stack.ss_size) + || __save_altstack(&rt_sf->uc.uc_stack, regs->gpr[1]) || __put_user(to_user_ptr(&rt_sf->uc.uc_mcontext), &rt_sf->uc.uc_regs) || put_sigset_t(&rt_sf->uc.uc_sigmask, oldset)) @@ -809,40 +1011,64 @@ static int handle_rt_signal(unsigned long sig, struct k_sigaction *ka, /* Save user registers on the stack */ frame = &rt_sf->uc.uc_mcontext; -#ifdef CONFIG_PPC64 - if (vdso32_rt_sigtramp && current->thread.vdso_base) { - if (save_user_regs(regs, frame, 0)) + addr = frame; + if (vdso32_rt_sigtramp && current->mm->context.vdso_base) { + sigret = 0; + tramp = current->mm->context.vdso_base + vdso32_rt_sigtramp; + } else { + sigret = __NR_rt_sigreturn; + tramp = (unsigned long) frame->tramp; + } + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + tm_frame = &rt_sf->uc_transact.uc_mcontext; + if (MSR_TM_ACTIVE(regs->msr)) { + if (__put_user((unsigned long)&rt_sf->uc_transact, + &rt_sf->uc.uc_link) || + __put_user((unsigned long)tm_frame, + &rt_sf->uc_transact.uc_regs)) goto badframe; - regs->link = current->thread.vdso_base + vdso32_rt_sigtramp; - } else + if (save_tm_user_regs(regs, frame, tm_frame, sigret)) + goto badframe; + } + else #endif { - if (save_user_regs(regs, frame, __NR_rt_sigreturn)) + if (__put_user(0, &rt_sf->uc.uc_link)) + goto badframe; + if (save_user_regs(regs, frame, tm_frame, sigret, 1)) goto badframe; - regs->link = (unsigned long) frame->tramp; } + regs->link = tramp; + + current->thread.fp_state.fpscr = 0; /* turn off all fp exceptions */ + + /* create a stack frame for the caller of the handler */ + newsp = ((unsigned long)rt_sf) - (__SIGNAL_FRAMESIZE + 16); + addr = (void __user *)regs->gpr[1]; if (put_user(regs->gpr[1], (u32 __user *)newsp)) goto badframe; + + /* Fill registers for signal handler */ regs->gpr[1] = newsp; regs->gpr[3] = sig; regs->gpr[4] = (unsigned long) &rt_sf->info; regs->gpr[5] = (unsigned long) 
&rt_sf->uc; regs->gpr[6] = (unsigned long) rt_sf; regs->nip = (unsigned long) ka->sa.sa_handler; - regs->trap = 0; -#ifdef CONFIG_PPC64 - regs->result = 0; - - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); -#endif + /* enter the signal handler in native-endian mode */ + regs->msr &= ~MSR_LE; + regs->msr |= (MSR_KERNEL & MSR_LE); return 1; badframe: -#ifdef DEBUG_SIG - printk("badframe in handle_rt_signal, regs=%p frame=%p newsp=%lx\n", - regs, frame, newsp); -#endif + if (show_unhandled_signals) + printk_ratelimited(KERN_INFO + "%s[%d]: bad frame in handle_rt_signal32: " + "%p nip %08lx lr %08lx\n", + current->comm, current->pid, + addr, regs->nip, regs->link); + force_sigsegv(sig, current); return 0; } @@ -861,43 +1087,121 @@ static int do_setcontext(struct ucontext __user *ucp, struct pt_regs *regs, int if (__get_user(cmcp, &ucp->uc_regs)) return -EFAULT; mcp = (struct mcontext __user *)(u64)cmcp; + /* no need to check access_ok(mcp), since mcp < 4GB */ } #else if (__get_user(mcp, &ucp->uc_regs)) return -EFAULT; + if (!access_ok(VERIFY_READ, mcp, sizeof(*mcp))) + return -EFAULT; #endif - restore_sigmask(&set); + set_current_blocked(&set); if (restore_user_regs(regs, mcp, sig)) return -EFAULT; return 0; } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static int do_setcontext_tm(struct ucontext __user *ucp, + struct ucontext __user *tm_ucp, + struct pt_regs *regs) +{ + sigset_t set; + struct mcontext __user *mcp; + struct mcontext __user *tm_mcp; + u32 cmcp; + u32 tm_cmcp; + + if (get_sigset_t(&set, &ucp->uc_sigmask)) + return -EFAULT; + + if (__get_user(cmcp, &ucp->uc_regs) || + __get_user(tm_cmcp, &tm_ucp->uc_regs)) + return -EFAULT; + mcp = (struct mcontext __user *)(u64)cmcp; + tm_mcp = (struct mcontext __user *)(u64)tm_cmcp; + /* no need to check access_ok(mcp), since mcp < 4GB */ + + set_current_blocked(&set); + if (restore_tm_user_regs(regs, mcp, tm_mcp)) + return -EFAULT; + + return 0; +} +#endif + long sys_swapcontext(struct ucontext __user 
*old_ctx, - struct ucontext __user *new_ctx, - int ctx_size, int r6, int r7, int r8, struct pt_regs *regs) + struct ucontext __user *new_ctx, + int ctx_size, int r6, int r7, int r8, struct pt_regs *regs) { unsigned char tmp; + int ctx_has_vsx_region = 0; + +#ifdef CONFIG_PPC64 + unsigned long new_msr = 0; + if (new_ctx) { + struct mcontext __user *mcp; + u32 cmcp; + + /* + * Get pointer to the real mcontext. No need for + * access_ok since we are dealing with compat + * pointers. + */ + if (__get_user(cmcp, &new_ctx->uc_regs)) + return -EFAULT; + mcp = (struct mcontext __user *)(u64)cmcp; + if (__get_user(new_msr, &mcp->mc_gregs[PT_MSR])) + return -EFAULT; + } + /* + * Check that the context is not smaller than the original + * size (with VMX but without VSX) + */ + if (ctx_size < UCONTEXTSIZEWITHOUTVSX) + return -EINVAL; + /* + * If the new context state sets the MSR VSX bits but + * it doesn't provide VSX state. + */ + if ((ctx_size < sizeof(struct ucontext)) && + (new_msr & MSR_VSX)) + return -EINVAL; + /* Does the context have enough room to store VSX data? */ + if (ctx_size >= sizeof(struct ucontext)) + ctx_has_vsx_region = 1; +#else /* Context size is for future use. Right now, we only make sure * we are passed something we understand */ if (ctx_size < sizeof(struct ucontext)) return -EINVAL; - +#endif if (old_ctx != NULL) { - if (!access_ok(VERIFY_WRITE, old_ctx, sizeof(*old_ctx)) - || save_user_regs(regs, &old_ctx->uc_mcontext, 0) + struct mcontext __user *mctx; + + /* + * old_ctx might not be 16-byte aligned, in which + * case old_ctx->uc_mcontext won't be either. + * Because we have the old_ctx->uc_pad2 field + * before old_ctx->uc_mcontext, we need to round down + * from &old_ctx->uc_mcontext to a 16-byte boundary. 
+ */ + mctx = (struct mcontext __user *) + ((unsigned long) &old_ctx->uc_mcontext & ~0xfUL); + if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size) + || save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region) || put_sigset_t(&old_ctx->uc_sigmask, &current->blocked) - || __put_user(to_user_ptr(&old_ctx->uc_mcontext), - &old_ctx->uc_regs)) + || __put_user(to_user_ptr(mctx), &old_ctx->uc_regs)) return -EFAULT; } if (new_ctx == NULL) return 0; - if (!access_ok(VERIFY_READ, new_ctx, sizeof(*new_ctx)) + if (!access_ok(VERIFY_READ, new_ctx, ctx_size) || __get_user(tmp, (u8 __user *) new_ctx) - || __get_user(tmp, (u8 __user *) (new_ctx + 1) - 1)) + || __get_user(tmp, (u8 __user *) new_ctx + ctx_size - 1)) return -EFAULT; /* @@ -913,8 +1217,8 @@ long sys_swapcontext(struct ucontext __user *old_ctx, */ if (do_setcontext(new_ctx, regs, 0)) do_exit(SIGSEGV); - sigreturn_exit(regs); - /* doesn't actually return back to here */ + + set_thread_flag(TIF_RESTOREALL); return 0; } @@ -922,7 +1226,12 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, struct pt_regs *regs) { struct rt_sigframe __user *rt_sf; - +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + struct ucontext __user *uc_transact; + unsigned long msr_hi; + unsigned long tmp; + int tm_restore = 0; +#endif /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; @@ -930,6 +1239,34 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, (regs->gpr[1] + __SIGNAL_FRAMESIZE + 16); if (!access_ok(VERIFY_READ, rt_sf, sizeof(*rt_sf))) goto bad; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (__get_user(tmp, &rt_sf->uc.uc_link)) + goto bad; + uc_transact = (struct ucontext __user *)(uintptr_t)tmp; + if (uc_transact) { + u32 cmcp; + struct mcontext __user *mcp; + + if (__get_user(cmcp, &uc_transact->uc_regs)) + return -EFAULT; + mcp = (struct mcontext __user *)(u64)cmcp; + /* The top 32 bits of the MSR are stashed in the transactional + *
ucontext. */ + if (__get_user(msr_hi, &mcp->mc_gregs[PT_MSR])) + goto bad; + + if (MSR_TM_ACTIVE(msr_hi<<32)) { + /* We only recheckpoint on return if we're + * transaction. + */ + tm_restore = 1; + if (do_setcontext_tm(&rt_sf->uc, uc_transact, regs)) + goto bad; + } + } + if (!tm_restore) + /* Fall through, for non-TM restore */ +#endif if (do_setcontext(&rt_sf->uc, regs, 1)) goto bad; @@ -941,20 +1278,23 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, * change it. -- paulus */ #ifdef CONFIG_PPC64 - /* - * We use the compat_sys_ version that does the 32/64 bits conversion - * and takes userland pointer directly. What about error checking ? - * nobody does any... - */ - compat_sys_sigaltstack((u32)(u64)&rt_sf->uc.uc_stack, 0, 0, 0, 0, 0, regs); - return (int)regs->result; + if (compat_restore_altstack(&rt_sf->uc.uc_stack)) + goto bad; #else - do_sigaltstack(&rt_sf->uc.uc_stack, NULL, regs->gpr[1]); - sigreturn_exit(regs); /* doesn't return here */ - return 0; + if (restore_altstack(&rt_sf->uc.uc_stack)) + goto bad; #endif + set_thread_flag(TIF_RESTOREALL); + return 0; bad: + if (show_unhandled_signals) + printk_ratelimited(KERN_INFO + "%s[%d]: bad frame in sys_rt_sigreturn: " + "%p nip %08lx lr %08lx\n", + current->comm, current->pid, + rt_sf, regs->nip, regs->link); + force_sig(SIGSEGV, current); return 0; } @@ -967,23 +1307,28 @@ int sys_debug_setcontext(struct ucontext __user *ctx, { struct sig_dbg_op op; int i; + unsigned char tmp; unsigned long new_msr = regs->msr; -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - unsigned long new_dbcr0 = current->thread.dbcr0; +#ifdef CONFIG_PPC_ADV_DEBUG_REGS + unsigned long new_dbcr0 = current->thread.debug.dbcr0; #endif for (i=0; i<ndbg; i++) { - if (__copy_from_user(&op, dbg, sizeof(op))) + if (copy_from_user(&op, dbg + i, sizeof(op))) return -EFAULT; switch (op.dbg_type) { case SIG_DBG_SINGLE_STEPPING: -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) +#ifdef CONFIG_PPC_ADV_DEBUG_REGS if 
(op.dbg_value) { new_msr |= MSR_DE; new_dbcr0 |= (DBCR0_IDM | DBCR0_IC); } else { - new_msr &= ~MSR_DE; - new_dbcr0 &= ~(DBCR0_IDM | DBCR0_IC); + new_dbcr0 &= ~DBCR0_IC; + if (!DBCR_ACTIVE_EVENTS(new_dbcr0, + current->thread.debug.dbcr1)) { + new_msr &= ~MSR_DE; + new_dbcr0 &= ~DBCR0_IDM; + } } #else if (op.dbg_value) @@ -993,7 +1338,7 @@ int sys_debug_setcontext(struct ucontext __user *ctx, #endif break; case SIG_DBG_BRANCH_TRACING: -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) +#ifdef CONFIG_PPC_ADV_DEBUG_REGS return -EINVAL; #else if (op.dbg_value) @@ -1014,10 +1359,15 @@ int sys_debug_setcontext(struct ucontext __user *ctx, failure is a problem, anyway, and it's very unlikely unless the user is really doing something wrong. */ regs->msr = new_msr; -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - current->thread.dbcr0 = new_dbcr0; +#ifdef CONFIG_PPC_ADV_DEBUG_REGS + current->thread.debug.dbcr0 = new_dbcr0; #endif + if (!access_ok(VERIFY_READ, ctx, sizeof(*ctx)) + || __get_user(tmp, (u8 __user *) ctx) + || __get_user(tmp, (u8 __user *) (ctx + 1) - 1)) + return -EFAULT; + /* * If we get a fault copying the context into the kernel's * image of the user's registers, we can't just return -EFAULT @@ -1030,6 +1380,13 @@ int sys_debug_setcontext(struct ucontext __user *ctx, * We kill the task with a SIGSEGV in this situation. */ if (do_setcontext(ctx, regs, 1)) { + if (show_unhandled_signals) + printk_ratelimited(KERN_INFO "%s[%d]: bad frame in " + "sys_debug_setcontext: %p nip %08lx " + "lr %08lx\n", + current->comm, current->pid, + ctx, regs->nip, regs->link); + force_sig(SIGSEGV, current); goto out; } @@ -1041,11 +1398,9 @@ int sys_debug_setcontext(struct ucontext __user *ctx, * always done it up until now so it is probably better not to * change it. 
-- paulus */ - do_sigaltstack(&ctx->uc_stack, NULL, regs->gpr[1]); - - sigreturn_exit(regs); - /* doesn't actually return back to here */ + restore_altstack(&ctx->uc_stack); + set_thread_flag(TIF_RESTOREALL); out: return 0; } @@ -1054,27 +1409,21 @@ int sys_debug_setcontext(struct ucontext __user *ctx, /* * OK, we're invoking a handler */ -static int handle_signal(unsigned long sig, struct k_sigaction *ka, - siginfo_t *info, sigset_t *oldset, struct pt_regs *regs, - unsigned long newsp) +int handle_signal32(unsigned long sig, struct k_sigaction *ka, + siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) { struct sigcontext __user *sc; - struct sigregs __user *frame; - unsigned long origsp = newsp; + struct sigframe __user *frame; + struct mcontext __user *tm_mctx = NULL; + unsigned long newsp = 0; + int sigret; + unsigned long tramp; /* Set up Signal Frame */ - newsp -= sizeof(struct sigregs); - frame = (struct sigregs __user *) newsp; - - /* Put a sigcontext on the stack */ - newsp -= sizeof(*sc); - sc = (struct sigcontext __user *) newsp; - - /* create a stack frame for the caller of the handler */ - newsp -= __SIGNAL_FRAMESIZE; - - if (!access_ok(VERIFY_WRITE, (void __user *) newsp, origsp - newsp)) + frame = get_sigframe(ka, get_tm_stackpointer(regs), sizeof(*frame), 1); + if (unlikely(frame == NULL)) goto badframe; + sc = (struct sigcontext __user *) &frame->sctx; #if _NSIG != 64 #error "Please adjust handle_signal()" @@ -1086,44 +1435,57 @@ static int handle_signal(unsigned long sig, struct k_sigaction *ka, #else || __put_user(oldset->sig[1], &sc->_unused[3]) #endif - || __put_user(to_user_ptr(frame), &sc->regs) + || __put_user(to_user_ptr(&frame->mctx), &sc->regs) || __put_user(sig, &sc->signal)) goto badframe; -#ifdef CONFIG_PPC64 - if (vdso32_sigtramp && current->thread.vdso_base) { - if (save_user_regs(regs, &frame->mctx, 0)) + if (vdso32_sigtramp && current->mm->context.vdso_base) { + sigret = 0; + tramp = current->mm->context.vdso_base + 
vdso32_sigtramp; + } else { + sigret = __NR_sigreturn; + tramp = (unsigned long) frame->mctx.tramp; + } + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + tm_mctx = &frame->mctx_transact; + if (MSR_TM_ACTIVE(regs->msr)) { + if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact, + sigret)) goto badframe; - regs->link = current->thread.vdso_base + vdso32_sigtramp; - } else + } + else #endif { - if (save_user_regs(regs, &frame->mctx, __NR_sigreturn)) + if (save_user_regs(regs, &frame->mctx, tm_mctx, sigret, 1)) goto badframe; - regs->link = (unsigned long) frame->mctx.tramp; } + regs->link = tramp; + + current->thread.fp_state.fpscr = 0; /* turn off all fp exceptions */ + + /* create a stack frame for the caller of the handler */ + newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE; if (put_user(regs->gpr[1], (u32 __user *)newsp)) goto badframe; + regs->gpr[1] = newsp; regs->gpr[3] = sig; regs->gpr[4] = (unsigned long) sc; regs->nip = (unsigned long) ka->sa.sa_handler; - regs->trap = 0; -#ifdef CONFIG_PPC64 - regs->result = 0; - - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); -#endif - + /* enter the signal handler in big-endian mode */ + regs->msr &= ~MSR_LE; return 1; badframe: -#ifdef DEBUG_SIG - printk("badframe in handle_signal, regs=%p frame=%p newsp=%lx\n", - regs, frame, newsp); -#endif + if (show_unhandled_signals) + printk_ratelimited(KERN_INFO + "%s[%d]: bad frame in handle_signal32: " + "%p nip %08lx lr %08lx\n", + current->comm, current->pid, + frame, regs->nip, regs->link); + force_sigsegv(sig, current); return 0; } @@ -1134,15 +1496,23 @@ badframe: long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, struct pt_regs *regs) { + struct sigframe __user *sf; struct sigcontext __user *sc; struct sigcontext sigctx; struct mcontext __user *sr; + void __user *addr; sigset_t set; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + struct mcontext __user *mcp, *tm_mcp; + unsigned long msr_hi; +#endif /* Always make any pending restarted system 
calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; - sc = (struct sigcontext __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); + sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); + sc = &sf->sctx; + addr = sc; if (copy_from_user(&sigctx, sc, sizeof(sigctx))) goto badframe; @@ -1156,114 +1526,39 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, set.sig[0] = sigctx.oldmask; set.sig[1] = sigctx._unused[3]; #endif - restore_sigmask(&set); + set_current_blocked(&set); - sr = (struct mcontext __user *)from_user_ptr(sigctx.regs); - if (!access_ok(VERIFY_READ, sr, sizeof(*sr)) - || restore_user_regs(regs, sr, 1)) +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + mcp = (struct mcontext __user *)&sf->mctx; + tm_mcp = (struct mcontext __user *)&sf->mctx_transact; + if (__get_user(msr_hi, &tm_mcp->mc_gregs[PT_MSR])) goto badframe; + if (MSR_TM_ACTIVE(msr_hi<<32)) { + if (!cpu_has_feature(CPU_FTR_TM)) + goto badframe; + if (restore_tm_user_regs(regs, mcp, tm_mcp)) + goto badframe; + } else +#endif + { + sr = (struct mcontext __user *)from_user_ptr(sigctx.regs); + addr = sr; + if (!access_ok(VERIFY_READ, sr, sizeof(*sr)) + || restore_user_regs(regs, sr, 1)) + goto badframe; + } -#ifdef CONFIG_PPC64 - return (int)regs->result; -#else - sigreturn_exit(regs); /* doesn't return */ + set_thread_flag(TIF_RESTOREALL); return 0; -#endif badframe: + if (show_unhandled_signals) + printk_ratelimited(KERN_INFO + "%s[%d]: bad frame in sys_sigreturn: " + "%p nip %08lx lr %08lx\n", + current->comm, current->pid, + addr, regs->nip, regs->link); + force_sig(SIGSEGV, current); return 0; } - -/* - * Note that 'init' is a special process: it doesn't get signals it doesn't - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake. 
- */ -int do_signal(sigset_t *oldset, struct pt_regs *regs) -{ - siginfo_t info; - struct k_sigaction ka; - unsigned int frame, newsp; - int signr, ret; - -#ifdef CONFIG_PPC32 - if (try_to_freeze()) { - signr = 0; - if (!signal_pending(current)) - goto no_signal; - } -#endif - - if (!oldset) - oldset = &current->blocked; - - newsp = frame = 0; - - signr = get_signal_to_deliver(&info, &ka, regs, NULL); -#ifdef CONFIG_PPC32 -no_signal: -#endif - if (TRAP(regs) == 0x0C00 /* System Call! */ - && regs->ccr & 0x10000000 /* error signalled */ - && ((ret = regs->gpr[3]) == ERESTARTSYS - || ret == ERESTARTNOHAND || ret == ERESTARTNOINTR - || ret == ERESTART_RESTARTBLOCK)) { - - if (signr > 0 - && (ret == ERESTARTNOHAND || ret == ERESTART_RESTARTBLOCK - || (ret == ERESTARTSYS - && !(ka.sa.sa_flags & SA_RESTART)))) { - /* make the system call return an EINTR error */ - regs->result = -EINTR; - regs->gpr[3] = EINTR; - /* note that the cr0.SO bit is already set */ - } else { - regs->nip -= 4; /* Back up & retry system call */ - regs->result = 0; - regs->trap = 0; - if (ret == ERESTART_RESTARTBLOCK) - regs->gpr[0] = __NR_restart_syscall; - else - regs->gpr[3] = regs->orig_gpr3; - } - } - - if (signr == 0) - return 0; /* no signals delivered */ - - if ((ka.sa.sa_flags & SA_ONSTACK) && current->sas_ss_size - && !on_sig_stack(regs->gpr[1])) - newsp = current->sas_ss_sp + current->sas_ss_size; - else - newsp = regs->gpr[1]; - newsp &= ~0xfUL; - -#ifdef CONFIG_PPC64 - /* - * Reenable the DABR before delivering the signal to - * user space. The DABR will have been cleared if it - * triggered inside the kernel. - */ - if (current->thread.dabr) - set_dabr(current->thread.dabr); -#endif - - /* Whee! Actually deliver the signal. 
*/ - if (ka.sa.sa_flags & SA_SIGINFO) - ret = handle_rt_signal(signr, &ka, &info, oldset, regs, newsp); - else - ret = handle_signal(signr, &ka, &info, oldset, regs, newsp); - - if (ret) { - spin_lock_irq(&current->sighand->siglock); - sigorsets(&current->blocked, &current->blocked, - &ka.sa.sa_mask); - if (!(ka.sa.sa_flags & SA_NODEFER)) - sigaddset(&current->blocked, signr); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - } - - return ret; -} diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c new file mode 100644 index 00000000000..97c1e4b683f --- /dev/null +++ b/arch/powerpc/kernel/signal_64.c @@ -0,0 +1,814 @@ +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Derived from "arch/i386/kernel/signal.c" + * Copyright (C) 1991, 1992 Linus Torvalds + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/kernel.h> +#include <linux/signal.h> +#include <linux/errno.h> +#include <linux/wait.h> +#include <linux/unistd.h> +#include <linux/stddef.h> +#include <linux/elf.h> +#include <linux/ptrace.h> +#include <linux/ratelimit.h> + +#include <asm/sigcontext.h> +#include <asm/ucontext.h> +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/unistd.h> +#include <asm/cacheflush.h> +#include <asm/syscalls.h> +#include <asm/vdso.h> +#include <asm/switch_to.h> +#include <asm/tm.h> + +#include "signal.h" + + +#define GP_REGS_SIZE min(sizeof(elf_gregset_t), sizeof(struct pt_regs)) +#define FP_REGS_SIZE sizeof(elf_fpregset_t) + +#define TRAMP_TRACEBACK 3 +#define TRAMP_SIZE 6 + +/* + * When we have signals to deliver, we set up on the user stack, + * going down from the original stack pointer: + * 1) a rt_sigframe struct which contains the ucontext + * 2) a gap of __SIGNAL_FRAMESIZE bytes which acts as a dummy caller + * frame for the signal handler. + */ + +struct rt_sigframe { + /* sys_rt_sigreturn requires the ucontext be the first field */ + struct ucontext uc; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + struct ucontext uc_transact; +#endif + unsigned long _unused[2]; + unsigned int tramp[TRAMP_SIZE]; + struct siginfo __user *pinfo; + void __user *puc; + struct siginfo info; + /* New 64 bit little-endian ABI allows redzone of 512 bytes below sp */ + char abigap[USER_REDZONE_SIZE]; +} __attribute__ ((aligned (16))); + +static const char fmt32[] = KERN_INFO \ + "%s[%d]: bad frame in %s: %08lx nip %08lx lr %08lx\n"; +static const char fmt64[] = KERN_INFO \ + "%s[%d]: bad frame in %s: %016lx nip %016lx lr %016lx\n"; + +/* + * Set up the sigcontext for the signal frame. 
+ */ + +static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, + int signr, sigset_t *set, unsigned long handler, + int ctx_has_vsx_region) +{ + /* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the + * process never used altivec yet (MSR_VEC is zero in pt_regs of + * the context). This is very important because we must ensure we + * don't lose the VRSAVE content that may have been set prior to + * the process doing its first vector operation + * Userland shall check AT_HWCAP to know whether it can rely on the + * v_regs pointer or not + */ +#ifdef CONFIG_ALTIVEC + elf_vrreg_t __user *v_regs = (elf_vrreg_t __user *)(((unsigned long)sc->vmx_reserve + 15) & ~0xful); +#endif + unsigned long msr = regs->msr; + long err = 0; + +#ifdef CONFIG_ALTIVEC + err |= __put_user(v_regs, &sc->v_regs); + + /* save altivec registers */ + if (current->thread.used_vr) { + flush_altivec_to_thread(current); + /* Copy 33 vec registers (vr0..31 and vscr) to the stack */ + err |= __copy_to_user(v_regs, &current->thread.vr_state, + 33 * sizeof(vector128)); + /* set MSR_VEC in the MSR value in the frame to indicate that sc->v_reg) + * contains valid data. + */ + msr |= MSR_VEC; + } + /* We always copy to/from vrsave, it's 0 if we don't have or don't + * use altivec. + */ + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + current->thread.vrsave = mfspr(SPRN_VRSAVE); + err |= __put_user(current->thread.vrsave, (u32 __user *)&v_regs[33]); +#else /* CONFIG_ALTIVEC */ + err |= __put_user(0, &sc->v_regs); +#endif /* CONFIG_ALTIVEC */ + flush_fp_to_thread(current); + /* copy fpr regs and fpscr */ + err |= copy_fpr_to_user(&sc->fp_regs, current); + + /* + * Clear the MSR VSX bit to indicate there is no valid state attached + * to this context, except in the specific case below where we set it. + */ + msr &= ~MSR_VSX; +#ifdef CONFIG_VSX + /* + * Copy VSX low doubleword to local buffer for formatting, + * then out to userspace. Update v_regs to point after the + * VMX data. 
+ */ + if (current->thread.used_vsr && ctx_has_vsx_region) { + __giveup_vsx(current); + v_regs += ELF_NVRREG; + err |= copy_vsx_to_user(v_regs, current); + /* set MSR_VSX in the MSR value in the frame to + * indicate that sc->vs_reg) contains valid data. + */ + msr |= MSR_VSX; + } +#endif /* CONFIG_VSX */ + err |= __put_user(&sc->gp_regs, &sc->regs); + WARN_ON(!FULL_REGS(regs)); + err |= __copy_to_user(&sc->gp_regs, regs, GP_REGS_SIZE); + err |= __put_user(msr, &sc->gp_regs[PT_MSR]); + err |= __put_user(signr, &sc->signal); + err |= __put_user(handler, &sc->handler); + if (set != NULL) + err |= __put_user(set->sig[0], &sc->oldmask); + + return err; +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * As above, but Transactional Memory is in use, so deliver sigcontexts + * containing checkpointed and transactional register states. + * + * To do this, we treclaim (done before entering here) to gather both sets of + * registers and set up the 'normal' sigcontext registers with rolled-back + * register values such that a simple signal handler sees a correct + * checkpointed register state. If interested, a TM-aware sighandler can + * examine the transactional registers in the 2nd sigcontext to determine the + * real origin of the signal. + */ +static long setup_tm_sigcontexts(struct sigcontext __user *sc, + struct sigcontext __user *tm_sc, + struct pt_regs *regs, + int signr, sigset_t *set, unsigned long handler) +{ + /* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the + * process never used altivec yet (MSR_VEC is zero in pt_regs of + * the context). This is very important because we must ensure we + * don't lose the VRSAVE content that may have been set prior to + * the process doing its first vector operation + * Userland shall check AT_HWCAP to know whether it can rely on the + * v_regs pointer or not. 
+ */ +#ifdef CONFIG_ALTIVEC + elf_vrreg_t __user *v_regs = (elf_vrreg_t __user *) + (((unsigned long)sc->vmx_reserve + 15) & ~0xful); + elf_vrreg_t __user *tm_v_regs = (elf_vrreg_t __user *) + (((unsigned long)tm_sc->vmx_reserve + 15) & ~0xful); +#endif + unsigned long msr = regs->msr; + long err = 0; + + BUG_ON(!MSR_TM_ACTIVE(regs->msr)); + + /* Remove TM bits from thread's MSR. The MSR in the sigcontext + * just indicates to userland that we were doing a transaction, but we + * don't want to return in transactional state. This also ensures + * that flush_fp_to_thread won't set TIF_RESTORE_TM again. + */ + regs->msr &= ~MSR_TS_MASK; + + flush_fp_to_thread(current); + +#ifdef CONFIG_ALTIVEC + err |= __put_user(v_regs, &sc->v_regs); + err |= __put_user(tm_v_regs, &tm_sc->v_regs); + + /* save altivec registers */ + if (current->thread.used_vr) { + flush_altivec_to_thread(current); + /* Copy 33 vec registers (vr0..31 and vscr) to the stack */ + err |= __copy_to_user(v_regs, &current->thread.vr_state, + 33 * sizeof(vector128)); + /* If VEC was enabled there are transactional VRs valid too, + * else they're a copy of the checkpointed VRs. + */ + if (msr & MSR_VEC) + err |= __copy_to_user(tm_v_regs, + &current->thread.transact_vr, + 33 * sizeof(vector128)); + else + err |= __copy_to_user(tm_v_regs, + &current->thread.vr_state, + 33 * sizeof(vector128)); + + /* set MSR_VEC in the MSR value in the frame to indicate + * that sc->v_reg contains valid data. + */ + msr |= MSR_VEC; + } + /* We always copy to/from vrsave, it's 0 if we don't have or don't + * use altivec. 
+ */ + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + current->thread.vrsave = mfspr(SPRN_VRSAVE); + err |= __put_user(current->thread.vrsave, (u32 __user *)&v_regs[33]); + if (msr & MSR_VEC) + err |= __put_user(current->thread.transact_vrsave, + (u32 __user *)&tm_v_regs[33]); + else + err |= __put_user(current->thread.vrsave, + (u32 __user *)&tm_v_regs[33]); + +#else /* CONFIG_ALTIVEC */ + err |= __put_user(0, &sc->v_regs); + err |= __put_user(0, &tm_sc->v_regs); +#endif /* CONFIG_ALTIVEC */ + + /* copy fpr regs and fpscr */ + err |= copy_fpr_to_user(&sc->fp_regs, current); + if (msr & MSR_FP) + err |= copy_transact_fpr_to_user(&tm_sc->fp_regs, current); + else + err |= copy_fpr_to_user(&tm_sc->fp_regs, current); + +#ifdef CONFIG_VSX + /* + * Copy VSX low doubleword to local buffer for formatting, + * then out to userspace. Update v_regs to point after the + * VMX data. + */ + if (current->thread.used_vsr) { + __giveup_vsx(current); + v_regs += ELF_NVRREG; + tm_v_regs += ELF_NVRREG; + + err |= copy_vsx_to_user(v_regs, current); + + if (msr & MSR_VSX) + err |= copy_transact_vsx_to_user(tm_v_regs, current); + else + err |= copy_vsx_to_user(tm_v_regs, current); + + /* set MSR_VSX in the MSR value in the frame to + * indicate that sc->vs_reg) contains valid data. + */ + msr |= MSR_VSX; + } +#endif /* CONFIG_VSX */ + + err |= __put_user(&sc->gp_regs, &sc->regs); + err |= __put_user(&tm_sc->gp_regs, &tm_sc->regs); + WARN_ON(!FULL_REGS(regs)); + err |= __copy_to_user(&tm_sc->gp_regs, regs, GP_REGS_SIZE); + err |= __copy_to_user(&sc->gp_regs, + &current->thread.ckpt_regs, GP_REGS_SIZE); + err |= __put_user(msr, &tm_sc->gp_regs[PT_MSR]); + err |= __put_user(msr, &sc->gp_regs[PT_MSR]); + err |= __put_user(signr, &sc->signal); + err |= __put_user(handler, &sc->handler); + if (set != NULL) + err |= __put_user(set->sig[0], &sc->oldmask); + + return err; +} +#endif + +/* + * Restore the sigcontext from the signal frame. 
+ */ + +static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig, + struct sigcontext __user *sc) +{ +#ifdef CONFIG_ALTIVEC + elf_vrreg_t __user *v_regs; +#endif + unsigned long err = 0; + unsigned long save_r13 = 0; + unsigned long msr; +#ifdef CONFIG_VSX + int i; +#endif + + /* If this is not a signal return, we preserve the TLS in r13 */ + if (!sig) + save_r13 = regs->gpr[13]; + + /* copy the GPRs */ + err |= __copy_from_user(regs->gpr, sc->gp_regs, sizeof(regs->gpr)); + err |= __get_user(regs->nip, &sc->gp_regs[PT_NIP]); + /* get MSR separately, transfer the LE bit if doing signal return */ + err |= __get_user(msr, &sc->gp_regs[PT_MSR]); + if (sig) + regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); + err |= __get_user(regs->orig_gpr3, &sc->gp_regs[PT_ORIG_R3]); + err |= __get_user(regs->ctr, &sc->gp_regs[PT_CTR]); + err |= __get_user(regs->link, &sc->gp_regs[PT_LNK]); + err |= __get_user(regs->xer, &sc->gp_regs[PT_XER]); + err |= __get_user(regs->ccr, &sc->gp_regs[PT_CCR]); + /* skip SOFTE */ + regs->trap = 0; + err |= __get_user(regs->dar, &sc->gp_regs[PT_DAR]); + err |= __get_user(regs->dsisr, &sc->gp_regs[PT_DSISR]); + err |= __get_user(regs->result, &sc->gp_regs[PT_RESULT]); + + if (!sig) + regs->gpr[13] = save_r13; + if (set != NULL) + err |= __get_user(set->sig[0], &sc->oldmask); + + /* + * Do this before updating the thread state in + * current->thread.fpr/vr. That way, if we get preempted + * and another task grabs the FPU/Altivec, it won't be + * tempted to save the current CPU state into the thread_struct + * and corrupt what we are writing there. + */ + discard_lazy_cpu_state(); + + /* + * Force reload of FP/VEC. + * This has to be done before copying stuff into current->thread.fpr/vr + * for the reasons explained in the previous comment. 
+ */ + regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX); + +#ifdef CONFIG_ALTIVEC + err |= __get_user(v_regs, &sc->v_regs); + if (err) + return err; + if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128))) + return -EFAULT; + /* Copy 33 vec registers (vr0..31 and vscr) from the stack */ + if (v_regs != NULL && (msr & MSR_VEC) != 0) + err |= __copy_from_user(&current->thread.vr_state, v_regs, + 33 * sizeof(vector128)); + else if (current->thread.used_vr) + memset(&current->thread.vr_state, 0, 33 * sizeof(vector128)); + /* Always get VRSAVE back */ + if (v_regs != NULL) + err |= __get_user(current->thread.vrsave, (u32 __user *)&v_regs[33]); + else + current->thread.vrsave = 0; + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + mtspr(SPRN_VRSAVE, current->thread.vrsave); +#endif /* CONFIG_ALTIVEC */ + /* restore floating point */ + err |= copy_fpr_from_user(current, &sc->fp_regs); +#ifdef CONFIG_VSX + /* + * Get additional VSX data. Update v_regs to point after the + * VMX data. Copy VSX low doubleword from userspace to local + * buffer for formatting, then into the taskstruct. + */ + v_regs += ELF_NVRREG; + if ((msr & MSR_VSX) != 0) + err |= copy_vsx_from_user(current, v_regs); + else + for (i = 0; i < 32 ; i++) + current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0; +#endif + return err; +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * Restore the two sigcontexts from the frame of a transactional processes. + */ + +static long restore_tm_sigcontexts(struct pt_regs *regs, + struct sigcontext __user *sc, + struct sigcontext __user *tm_sc) +{ +#ifdef CONFIG_ALTIVEC + elf_vrreg_t __user *v_regs, *tm_v_regs; +#endif + unsigned long err = 0; + unsigned long msr; +#ifdef CONFIG_VSX + int i; +#endif + /* copy the GPRs */ + err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr)); + err |= __copy_from_user(&current->thread.ckpt_regs, sc->gp_regs, + sizeof(regs->gpr)); + + /* + * TFHAR is restored from the checkpointed 'wound-back' ucontext's NIP. 
+ * TEXASR was set by the signal delivery reclaim, as was TFIAR. + * Users doing anything abhorrent like thread-switching w/ signals for + * TM-Suspended code will have to back TEXASR/TFIAR up themselves. + * For the case of getting a signal and simply returning from it, + * we don't need to re-copy them here. + */ + err |= __get_user(regs->nip, &tm_sc->gp_regs[PT_NIP]); + err |= __get_user(current->thread.tm_tfhar, &sc->gp_regs[PT_NIP]); + + /* get MSR separately, transfer the LE bit if doing signal return */ + err |= __get_user(msr, &sc->gp_regs[PT_MSR]); + /* pull in MSR TM from user context */ + regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK); + + /* pull in MSR LE from user context */ + regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); + + /* The following non-GPR non-FPR non-VR state is also checkpointed: */ + err |= __get_user(regs->ctr, &tm_sc->gp_regs[PT_CTR]); + err |= __get_user(regs->link, &tm_sc->gp_regs[PT_LNK]); + err |= __get_user(regs->xer, &tm_sc->gp_regs[PT_XER]); + err |= __get_user(regs->ccr, &tm_sc->gp_regs[PT_CCR]); + err |= __get_user(current->thread.ckpt_regs.ctr, + &sc->gp_regs[PT_CTR]); + err |= __get_user(current->thread.ckpt_regs.link, + &sc->gp_regs[PT_LNK]); + err |= __get_user(current->thread.ckpt_regs.xer, + &sc->gp_regs[PT_XER]); + err |= __get_user(current->thread.ckpt_regs.ccr, + &sc->gp_regs[PT_CCR]); + + /* These regs are not checkpointed; they can go in 'regs'. */ + err |= __get_user(regs->trap, &sc->gp_regs[PT_TRAP]); + err |= __get_user(regs->dar, &sc->gp_regs[PT_DAR]); + err |= __get_user(regs->dsisr, &sc->gp_regs[PT_DSISR]); + err |= __get_user(regs->result, &sc->gp_regs[PT_RESULT]); + + /* + * Do this before updating the thread state in + * current->thread.fpr/vr. That way, if we get preempted + * and another task grabs the FPU/Altivec, it won't be + * tempted to save the current CPU state into the thread_struct + * and corrupt what we are writing there. 
+ */ + discard_lazy_cpu_state(); + + /* + * Force reload of FP/VEC. + * This has to be done before copying stuff into current->thread.fpr/vr + * for the reasons explained in the previous comment. + */ + regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX); + +#ifdef CONFIG_ALTIVEC + err |= __get_user(v_regs, &sc->v_regs); + err |= __get_user(tm_v_regs, &tm_sc->v_regs); + if (err) + return err; + if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128))) + return -EFAULT; + if (tm_v_regs && !access_ok(VERIFY_READ, + tm_v_regs, 34 * sizeof(vector128))) + return -EFAULT; + /* Copy 33 vec registers (vr0..31 and vscr) from the stack */ + if (v_regs != NULL && tm_v_regs != NULL && (msr & MSR_VEC) != 0) { + err |= __copy_from_user(&current->thread.vr_state, v_regs, + 33 * sizeof(vector128)); + err |= __copy_from_user(&current->thread.transact_vr, tm_v_regs, + 33 * sizeof(vector128)); + } + else if (current->thread.used_vr) { + memset(&current->thread.vr_state, 0, 33 * sizeof(vector128)); + memset(&current->thread.transact_vr, 0, 33 * sizeof(vector128)); + } + /* Always get VRSAVE back */ + if (v_regs != NULL && tm_v_regs != NULL) { + err |= __get_user(current->thread.vrsave, + (u32 __user *)&v_regs[33]); + err |= __get_user(current->thread.transact_vrsave, + (u32 __user *)&tm_v_regs[33]); + } + else { + current->thread.vrsave = 0; + current->thread.transact_vrsave = 0; + } + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + mtspr(SPRN_VRSAVE, current->thread.vrsave); +#endif /* CONFIG_ALTIVEC */ + /* restore floating point */ + err |= copy_fpr_from_user(current, &sc->fp_regs); + err |= copy_transact_fpr_from_user(current, &tm_sc->fp_regs); +#ifdef CONFIG_VSX + /* + * Get additional VSX data. Update v_regs to point after the + * VMX data. Copy VSX low doubleword from userspace to local + * buffer for formatting, then into the taskstruct. 
+ */ + if (v_regs && ((msr & MSR_VSX) != 0)) { + v_regs += ELF_NVRREG; + tm_v_regs += ELF_NVRREG; + err |= copy_vsx_from_user(current, v_regs); + err |= copy_transact_vsx_from_user(current, tm_v_regs); + } else { + for (i = 0; i < 32 ; i++) { + current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0; + current->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET] = 0; + } + } +#endif + tm_enable(); + /* Make sure the transaction is marked as failed */ + current->thread.tm_texasr |= TEXASR_FS; + /* This loads the checkpointed FP/VEC state, if used */ + tm_recheckpoint(&current->thread, msr); + + /* This loads the speculative FP/VEC state, if used */ + if (msr & MSR_FP) { + do_load_up_transact_fpu(&current->thread); + regs->msr |= (MSR_FP | current->thread.fpexc_mode); + } +#ifdef CONFIG_ALTIVEC + if (msr & MSR_VEC) { + do_load_up_transact_altivec(&current->thread); + regs->msr |= MSR_VEC; + } +#endif + + return err; +} +#endif + +/* + * Setup the trampoline code on the stack + */ +static long setup_trampoline(unsigned int syscall, unsigned int __user *tramp) +{ + int i; + long err = 0; + + /* addi r1, r1, __SIGNAL_FRAMESIZE # Pop the dummy stackframe */ + err |= __put_user(0x38210000UL | (__SIGNAL_FRAMESIZE & 0xffff), &tramp[0]); + /* li r0, __NR_[rt_]sigreturn| */ + err |= __put_user(0x38000000UL | (syscall & 0xffff), &tramp[1]); + /* sc */ + err |= __put_user(0x44000002UL, &tramp[2]); + + /* Minimal traceback info */ + for (i=TRAMP_TRACEBACK; i < TRAMP_SIZE ;i++) + err |= __put_user(0, &tramp[i]); + + if (!err) + flush_icache_range((unsigned long) &tramp[0], + (unsigned long) &tramp[TRAMP_SIZE]); + + return err; +} + +/* + * Userspace code may pass a ucontext which doesn't include VSX added + * at the end. We need to check for this case. 
+ */ +#define UCONTEXTSIZEWITHOUTVSX \ + (sizeof(struct ucontext) - 32*sizeof(long)) + +/* + * Handle {get,set,swap}_context operations + */ +int sys_swapcontext(struct ucontext __user *old_ctx, + struct ucontext __user *new_ctx, + long ctx_size, long r6, long r7, long r8, struct pt_regs *regs) +{ + unsigned char tmp; + sigset_t set; + unsigned long new_msr = 0; + int ctx_has_vsx_region = 0; + + if (new_ctx && + get_user(new_msr, &new_ctx->uc_mcontext.gp_regs[PT_MSR])) + return -EFAULT; + /* + * Check that the context is not smaller than the original + * size (with VMX but without VSX) + */ + if (ctx_size < UCONTEXTSIZEWITHOUTVSX) + return -EINVAL; + /* + * If the new context state sets the MSR VSX bits but + * it doesn't provide VSX state. + */ + if ((ctx_size < sizeof(struct ucontext)) && + (new_msr & MSR_VSX)) + return -EINVAL; + /* Does the context have enough room to store VSX data? */ + if (ctx_size >= sizeof(struct ucontext)) + ctx_has_vsx_region = 1; + + if (old_ctx != NULL) { + if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size) + || setup_sigcontext(&old_ctx->uc_mcontext, regs, 0, NULL, 0, + ctx_has_vsx_region) + || __copy_to_user(&old_ctx->uc_sigmask, + &current->blocked, sizeof(sigset_t))) + return -EFAULT; + } + if (new_ctx == NULL) + return 0; + if (!access_ok(VERIFY_READ, new_ctx, ctx_size) + || __get_user(tmp, (u8 __user *) new_ctx) + || __get_user(tmp, (u8 __user *) new_ctx + ctx_size - 1)) + return -EFAULT; + + /* + * If we get a fault copying the context into the kernel's + * image of the user's registers, we can't just return -EFAULT + * because the user's registers will be corrupted. For instance + * the NIP value may have been updated but not some of the + * other registers. Given that we have done the access_ok + * and successfully read the first and last bytes of the region + * above, this should only happen in an out-of-memory situation + * or if another thread unmaps the region containing the context. 
+ * We kill the task with a SIGSEGV in this situation. + */ + + if (__copy_from_user(&set, &new_ctx->uc_sigmask, sizeof(set))) + do_exit(SIGSEGV); + set_current_blocked(&set); + if (restore_sigcontext(regs, NULL, 0, &new_ctx->uc_mcontext)) + do_exit(SIGSEGV); + + /* This returns like rt_sigreturn */ + set_thread_flag(TIF_RESTOREALL); + return 0; +} + + +/* + * Do a signal return; undo the signal stack. + */ + +int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5, + unsigned long r6, unsigned long r7, unsigned long r8, + struct pt_regs *regs) +{ + struct ucontext __user *uc = (struct ucontext __user *)regs->gpr[1]; + sigset_t set; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + unsigned long msr; +#endif + + /* Always make any pending restarted system calls return -EINTR */ + current_thread_info()->restart_block.fn = do_no_restart_syscall; + + if (!access_ok(VERIFY_READ, uc, sizeof(*uc))) + goto badframe; + + if (__copy_from_user(&set, &uc->uc_sigmask, sizeof(set))) + goto badframe; + set_current_blocked(&set); +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR])) + goto badframe; + if (MSR_TM_ACTIVE(msr)) { + /* We recheckpoint on return. */ + struct ucontext __user *uc_transact; + if (__get_user(uc_transact, &uc->uc_link)) + goto badframe; + if (restore_tm_sigcontexts(regs, &uc->uc_mcontext, + &uc_transact->uc_mcontext)) + goto badframe; + } + else + /* Fall through, for non-TM restore */ +#endif + if (restore_sigcontext(regs, NULL, 1, &uc->uc_mcontext)) + goto badframe; + + if (restore_altstack(&uc->uc_stack)) + goto badframe; + + set_thread_flag(TIF_RESTOREALL); + return 0; + +badframe: + if (show_unhandled_signals) + printk_ratelimited(regs->msr & MSR_64BIT ? 
fmt64 : fmt32, + current->comm, current->pid, "rt_sigreturn", + (long)uc, regs->nip, regs->link); + + force_sig(SIGSEGV, current); + return 0; +} + +int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + unsigned long newsp = 0; + long err = 0; + + frame = get_sigframe(ka, get_tm_stackpointer(regs), sizeof(*frame), 0); + if (unlikely(frame == NULL)) + goto badframe; + + err |= __put_user(&frame->info, &frame->pinfo); + err |= __put_user(&frame->uc, &frame->puc); + err |= copy_siginfo_to_user(&frame->info, info); + if (err) + goto badframe; + + /* Create the ucontext. */ + err |= __put_user(0, &frame->uc.uc_flags); + err |= __save_altstack(&frame->uc.uc_stack, regs->gpr[1]); +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (MSR_TM_ACTIVE(regs->msr)) { + /* The ucontext_t passed to userland points to the second + * ucontext_t (for transactional state) with its uc_link ptr. + */ + err |= __put_user(&frame->uc_transact, &frame->uc.uc_link); + err |= setup_tm_sigcontexts(&frame->uc.uc_mcontext, + &frame->uc_transact.uc_mcontext, + regs, signr, + NULL, + (unsigned long)ka->sa.sa_handler); + } else +#endif + { + err |= __put_user(0, &frame->uc.uc_link); + err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, signr, + NULL, (unsigned long)ka->sa.sa_handler, + 1); + } + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) + goto badframe; + + /* Make sure signal handler doesn't get spurious FP exceptions */ + current->thread.fp_state.fpscr = 0; + + /* Set up to return from userspace. */ + if (vdso64_rt_sigtramp && current->mm->context.vdso_base) { + regs->link = current->mm->context.vdso_base + vdso64_rt_sigtramp; + } else { + err |= setup_trampoline(__NR_rt_sigreturn, &frame->tramp[0]); + if (err) + goto badframe; + regs->link = (unsigned long) &frame->tramp[0]; + } + + /* Allocate a dummy caller frame for the signal handler. 
*/ + newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE; + err |= put_user(regs->gpr[1], (unsigned long __user *)newsp); + + /* Set up "regs" so we "return" to the signal handler. */ + if (is_elf2_task()) { + regs->nip = (unsigned long) ka->sa.sa_handler; + regs->gpr[12] = regs->nip; + } else { + /* Handler is *really* a pointer to the function descriptor for + * the signal routine. The first entry in the function + * descriptor is the entry address of signal and the second + * entry is the TOC value we need to use. + */ + func_descr_t __user *funct_desc_ptr = + (func_descr_t __user *) ka->sa.sa_handler; + + err |= get_user(regs->nip, &funct_desc_ptr->entry); + err |= get_user(regs->gpr[2], &funct_desc_ptr->toc); + } + + /* enter the signal handler in native-endian mode */ + regs->msr &= ~MSR_LE; + regs->msr |= (MSR_KERNEL & MSR_LE); + regs->gpr[1] = newsp; + regs->gpr[3] = signr; + regs->result = 0; + if (ka->sa.sa_flags & SA_SIGINFO) { + err |= get_user(regs->gpr[4], (unsigned long __user *)&frame->pinfo); + err |= get_user(regs->gpr[5], (unsigned long __user *)&frame->puc); + regs->gpr[6] = (unsigned long) frame; + } else { + regs->gpr[4] = (unsigned long)&frame->uc.uc_mcontext; + } + if (err) + goto badframe; + + return 1; + +badframe: + if (show_unhandled_signals) + printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32, + current->comm, current->pid, "setup_rt_frame", + (long)frame, regs->nip, regs->link); + + force_sigsegv(signr, current); + return 0; +} diff --git a/arch/powerpc/kernel/smp-tbsync.c b/arch/powerpc/kernel/smp-tbsync.c new file mode 100644 index 00000000000..7a37ecd3afa --- /dev/null +++ b/arch/powerpc/kernel/smp-tbsync.c @@ -0,0 +1,170 @@ +/* + * Smp timebase synchronization for ppc. 
+ * + * Copyright (C) 2003 Samuel Rydh (samuel@ibrium.se) + * + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/unistd.h> +#include <linux/slab.h> +#include <linux/atomic.h> +#include <asm/smp.h> +#include <asm/time.h> + +#define NUM_ITER 300 + +enum { + kExit=0, kSetAndTest, kTest +}; + +static struct { + volatile u64 tb; + volatile u64 mark; + volatile int cmd; + volatile int handshake; + int filler[2]; + + volatile int ack; + int filler2[7]; + + volatile int race_result; +} *tbsync; + +static volatile int running; + +static void enter_contest(u64 mark, long add) +{ + while (get_tb() < mark) + tbsync->race_result = add; +} + +void smp_generic_take_timebase(void) +{ + int cmd; + u64 tb; + unsigned long flags; + + local_irq_save(flags); + while (!running) + barrier(); + rmb(); + + for (;;) { + tbsync->ack = 1; + while (!tbsync->handshake) + barrier(); + rmb(); + + cmd = tbsync->cmd; + tb = tbsync->tb; + mb(); + tbsync->ack = 0; + if (cmd == kExit) + break; + + while (tbsync->handshake) + barrier(); + if (cmd == kSetAndTest) + set_tb(tb >> 32, tb & 0xfffffffful); + enter_contest(tbsync->mark, -1); + } + local_irq_restore(flags); +} + +static int start_contest(int cmd, long offset, int num) +{ + int i, score=0; + u64 tb; + u64 mark; + + tbsync->cmd = cmd; + + local_irq_disable(); + for (i = -3; i < num; ) { + tb = get_tb() + 400; + tbsync->tb = tb + offset; + tbsync->mark = mark = tb + 400; + + wmb(); + + tbsync->handshake = 1; + while (tbsync->ack) + barrier(); + + while (get_tb() <= tb) + barrier(); + tbsync->handshake = 0; + enter_contest(mark, 1); + + while (!tbsync->ack) + barrier(); + + if (i++ > 0) + score += tbsync->race_result; + } + local_irq_enable(); + return score; +} + +void smp_generic_give_timebase(void) +{ + int i, score, score2, old, min=0, max=5000, offset=1000; + + pr_debug("Software timebase sync\n"); + + /* if this fails then this kernel won't work anyway... 
*/ + tbsync = kzalloc( sizeof(*tbsync), GFP_KERNEL ); + mb(); + running = 1; + + while (!tbsync->ack) + barrier(); + + pr_debug("Got ack\n"); + + /* binary search */ + for (old = -1; old != offset ; offset = (min+max) / 2) { + score = start_contest(kSetAndTest, offset, NUM_ITER); + + pr_debug("score %d, offset %d\n", score, offset ); + + if( score > 0 ) + max = offset; + else + min = offset; + old = offset; + } + score = start_contest(kSetAndTest, min, NUM_ITER); + score2 = start_contest(kSetAndTest, max, NUM_ITER); + + pr_debug("Min %d (score %d), Max %d (score %d)\n", + min, score, max, score2); + score = abs(score); + score2 = abs(score2); + offset = (score < score2) ? min : max; + + /* guard against inaccurate mttb */ + for (i = 0; i < 10; i++) { + start_contest(kSetAndTest, offset, NUM_ITER/10); + + if ((score2 = start_contest(kTest, offset, NUM_ITER)) < 0) + score2 = -score2; + if (score2 <= score || score2 < 20) + break; + } + pr_debug("Final offset: %d (%d/%d)\n", offset, score2, NUM_ITER ); + + /* exiting */ + tbsync->cmd = kExit; + wmb(); + tbsync->handshake = 1; + while (tbsync->ack) + barrier(); + tbsync->handshake = 0; + kfree(tbsync); + tbsync = NULL; + running = 0; +} diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c new file mode 100644 index 00000000000..1007fb802e6 --- /dev/null +++ b/arch/powerpc/kernel/smp.c @@ -0,0 +1,840 @@ +/* + * SMP support for ppc. + * + * Written by Cort Dougan (cort@cs.nmt.edu) borrowing a great + * deal of code from the sparc and intel versions. + * + * Copyright (C) 1999 Cort Dougan <cort@cs.nmt.edu> + * + * PowerPC-64 Support added by Dave Engebretsen, Peter Bergner, and + * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
 */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/cache.h> +#include <linux/err.h> +#include <linux/device.h> +#include <linux/cpu.h> +#include <linux/notifier.h> +#include <linux/topology.h> + +#include <asm/ptrace.h> +#include <linux/atomic.h> +#include <asm/irq.h> +#include <asm/hw_irq.h> +#include <asm/kvm_ppc.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/prom.h> +#include <asm/smp.h> +#include <asm/time.h> +#include <asm/machdep.h> +#include <asm/cputhreads.h> +#include <asm/cputable.h> +#include <asm/mpic.h> +#include <asm/vdso_datapage.h> +#ifdef CONFIG_PPC64 +#include <asm/paca.h> +#endif +#include <asm/vdso.h> +#include <asm/debug.h> + +#ifdef DEBUG +#include <asm/udbg.h> +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +#ifdef CONFIG_HOTPLUG_CPU +/* State of each CPU during hotplug phases */ +static DEFINE_PER_CPU(int, cpu_state) = { 0 }; +#endif + +struct thread_info *secondary_ti; + +DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); + +EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); +EXPORT_PER_CPU_SYMBOL(cpu_core_map); + +/* SMP operations for this machine */ +struct smp_ops_t *smp_ops; + +/* Can't be static due to PowerMac hackery */ +volatile unsigned int cpu_callin_map[NR_CPUS]; + +int smt_enabled_at_boot = 1; + +static void (*crash_ipi_function_ptr)(struct pt_regs *) = NULL; + +/* + * Returns 1 if the specified cpu should be brought up during boot. + * Used to inhibit booting threads if they've been disabled or + * limited on the command line. A 0 return is only possible while + * system_state == SYSTEM_BOOTING on a cpu with CPU_FTR_SMT. + */ +int smp_generic_cpu_bootable(unsigned int nr) +{ + /* Special case - we inhibit secondary thread startup + * during boot if the user requests it. 
 */ + if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) { + if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) + return 0; + if (smt_enabled_at_boot + && cpu_thread_in_core(nr) >= smt_enabled_at_boot) + return 0; + } + + return 1; +} + + +#ifdef CONFIG_PPC64 +int smp_generic_kick_cpu(int nr) +{ + BUG_ON(nr < 0 || nr >= NR_CPUS); + + /* + * The processor is currently spinning, waiting for the + * cpu_start field to become non-zero. After we set cpu_start, + * the processor will continue on to secondary_start. + */ + if (!paca[nr].cpu_start) { + paca[nr].cpu_start = 1; + smp_mb(); + return 0; + } + +#ifdef CONFIG_HOTPLUG_CPU + /* + * Ok it's not there, so it might be soft-unplugged, let's + * try to bring it back + */ + generic_set_cpu_up(nr); + smp_wmb(); + smp_send_reschedule(nr); +#endif /* CONFIG_HOTPLUG_CPU */ + + return 0; +} +#endif /* CONFIG_PPC64 */ + +static irqreturn_t call_function_action(int irq, void *data) +{ + generic_smp_call_function_interrupt(); + return IRQ_HANDLED; +} + +static irqreturn_t reschedule_action(int irq, void *data) +{ + scheduler_ipi(); + return IRQ_HANDLED; +} + +static irqreturn_t tick_broadcast_ipi_action(int irq, void *data) +{ + tick_broadcast_ipi_handler(); + return IRQ_HANDLED; +} + +static irqreturn_t debug_ipi_action(int irq, void *data) +{ + if (crash_ipi_function_ptr) { + crash_ipi_function_ptr(get_irq_regs()); + return IRQ_HANDLED; + } + +#ifdef CONFIG_DEBUGGER + debugger_ipi(get_irq_regs()); +#endif /* CONFIG_DEBUGGER */ + + return IRQ_HANDLED; +} + +static irq_handler_t smp_ipi_action[] = { + [PPC_MSG_CALL_FUNCTION] = call_function_action, + [PPC_MSG_RESCHEDULE] = reschedule_action, + [PPC_MSG_TICK_BROADCAST] = tick_broadcast_ipi_action, + [PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action, +}; + +const char *smp_ipi_name[] = { + [PPC_MSG_CALL_FUNCTION] = "ipi call function", + [PPC_MSG_RESCHEDULE] = "ipi reschedule", + [PPC_MSG_TICK_BROADCAST] = "ipi tick-broadcast", + [PPC_MSG_DEBUGGER_BREAK] = "ipi 
debugger", +}; + +/* optional function to request ipi, for controllers with >= 4 ipis; returns -EINVAL for an out-of-range msg, 1 when the debugger-break IPI is compiled out, else the request_irq() result */ +int smp_request_message_ipi(int virq, int msg) +{ + int err; + + if (msg < 0 || msg > PPC_MSG_DEBUGGER_BREAK) { + return -EINVAL; + } +#if !defined(CONFIG_DEBUGGER) && !defined(CONFIG_KEXEC) + if (msg == PPC_MSG_DEBUGGER_BREAK) { + return 1; + } +#endif + err = request_irq(virq, smp_ipi_action[msg], + IRQF_PERCPU | IRQF_NO_THREAD | IRQF_NO_SUSPEND, + smp_ipi_name[msg], NULL); + WARN(err < 0, "unable to request_irq %d for %s (rc %d)\n", + virq, smp_ipi_name[msg], err); + + return err; +} + +#ifdef CONFIG_PPC_SMP_MUXED_IPI +struct cpu_messages { + int messages; /* current messages */ + unsigned long data; /* data for cause ipi */ +}; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message); + +void smp_muxed_ipi_set_data(int cpu, unsigned long data) +{ + struct cpu_messages *info = &per_cpu(ipi_message, cpu); + + info->data = data; +} + +void smp_muxed_ipi_message_pass(int cpu, int msg) +{ + struct cpu_messages *info = &per_cpu(ipi_message, cpu); + char *message = (char *)&info->messages; + + /* + * Order previous accesses before accesses in the IPI handler. + */ + smp_mb(); + message[msg] = 1; + /* + * cause_ipi functions are required to include a full barrier + * before doing whatever causes the IPI. 
 */ + smp_ops->cause_ipi(cpu, info->data); +} +/* IPI_MESSAGE(A): mask for the bit that storing 1 into byte A of cpu_messages.messages sets (smp_muxed_ipi_message_pass() writes that byte), chosen per endianness. */ +#ifdef __BIG_ENDIAN__ +#define IPI_MESSAGE(A) (1 << (24 - 8 * (A))) +#else +#define IPI_MESSAGE(A) (1 << (8 * (A))) +#endif + +irqreturn_t smp_ipi_demux(void) +{ + struct cpu_messages *info = &__get_cpu_var(ipi_message); + unsigned int all; + + mb(); /* order any irq clear */ + + do { + all = xchg(&info->messages, 0); + if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION)) + generic_smp_call_function_interrupt(); + if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE)) + scheduler_ipi(); + if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST)) + tick_broadcast_ipi_handler(); + if (all & IPI_MESSAGE(PPC_MSG_DEBUGGER_BREAK)) + debug_ipi_action(0, NULL); + } while (info->messages); + + return IRQ_HANDLED; +} +#endif /* CONFIG_PPC_SMP_MUXED_IPI */ + +static inline void do_message_pass(int cpu, int msg) +{ + if (smp_ops->message_pass) + smp_ops->message_pass(cpu, msg); +#ifdef CONFIG_PPC_SMP_MUXED_IPI + else + smp_muxed_ipi_message_pass(cpu, msg); +#endif +} + +void smp_send_reschedule(int cpu) +{ + if (likely(smp_ops)) + do_message_pass(cpu, PPC_MSG_RESCHEDULE); +} +EXPORT_SYMBOL_GPL(smp_send_reschedule); + +void arch_send_call_function_single_ipi(int cpu) +{ + do_message_pass(cpu, PPC_MSG_CALL_FUNCTION); +} + +void arch_send_call_function_ipi_mask(const struct cpumask *mask) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) + do_message_pass(cpu, PPC_MSG_CALL_FUNCTION); +} + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +void tick_broadcast(const struct cpumask *mask) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) + do_message_pass(cpu, PPC_MSG_TICK_BROADCAST); +} +#endif + +#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC) +void smp_send_debugger_break(void) +{ + int cpu; + int me = raw_smp_processor_id(); + + if (unlikely(!smp_ops)) + return; + + for_each_online_cpu(cpu) + if (cpu != me) + do_message_pass(cpu, PPC_MSG_DEBUGGER_BREAK); +} +#endif + +#ifdef CONFIG_KEXEC +void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)) +{ 
+ crash_ipi_function_ptr = crash_ipi_callback; + if (crash_ipi_callback) { + mb(); + smp_send_debugger_break(); + } +} +#endif + +static void stop_this_cpu(void *dummy) +{ + /* Remove this CPU from the online map, then spin forever with interrupts disabled */ + set_cpu_online(smp_processor_id(), false); + + local_irq_disable(); + while (1) + ; +} + +void smp_send_stop(void) +{ + smp_call_function(stop_this_cpu, NULL, 0); +} + +struct thread_info *current_set[NR_CPUS]; + +static void smp_store_cpu_info(int id) +{ + per_cpu(cpu_pvr, id) = mfspr(SPRN_PVR); +#ifdef CONFIG_PPC_FSL_BOOK3E + per_cpu(next_tlbcam_idx, id) + = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1; +#endif +} + +void __init smp_prepare_cpus(unsigned int max_cpus) +{ + unsigned int cpu; + + DBG("smp_prepare_cpus\n"); + + /* + * setup_cpu may need to be called on the boot cpu. We haven't + * spun any cpus up but let's be paranoid. + */ + BUG_ON(boot_cpuid != smp_processor_id()); + + /* Fixup boot cpu */ + smp_store_cpu_info(boot_cpuid); + cpu_callin_map[boot_cpuid] = 1; + + for_each_possible_cpu(cpu) { + zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + } + + cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid)); + cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid)); + + if (smp_ops && smp_ops->probe) + smp_ops->probe(); +} + +void smp_prepare_boot_cpu(void) +{ + BUG_ON(smp_processor_id() != boot_cpuid); +#ifdef CONFIG_PPC64 + paca[boot_cpuid].__current = current; +#endif + set_numa_node(numa_cpu_lookup_table[boot_cpuid]); + current_set[boot_cpuid] = task_thread_info(current); +} + +#ifdef CONFIG_HOTPLUG_CPU + +int generic_cpu_disable(void) +{ + unsigned int cpu = smp_processor_id(); + + if (cpu == boot_cpuid) + return -EBUSY; + + set_cpu_online(cpu, false); +#ifdef CONFIG_PPC64 + vdso_data->processorCount--; +#endif + migrate_irqs(); + return 0; +} +/* Poll cpu_state for CPU_DEAD up to 100 times, sleeping 100ms between polls; log an error if it never shows up. */ +void generic_cpu_die(unsigned int cpu) +{ + int i; + + for (i = 0; i < 100; i++) { + 
smp_rmb(); + if (per_cpu(cpu_state, cpu) == CPU_DEAD) + return; + msleep(100); + } + printk(KERN_ERR "CPU%d didn't die...\n", cpu); +} + +void generic_mach_cpu_die(void) +{ + unsigned int cpu; + + local_irq_disable(); + idle_task_exit(); + cpu = smp_processor_id(); + printk(KERN_DEBUG "CPU%d offline\n", cpu); + __get_cpu_var(cpu_state) = CPU_DEAD; + smp_wmb(); + while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE) + cpu_relax(); +} + +void generic_set_cpu_dead(unsigned int cpu) +{ + per_cpu(cpu_state, cpu) = CPU_DEAD; +} + +/* + * The cpu_state should be set to CPU_UP_PREPARE in kick_cpu(), otherwise + * the cpu_state is always CPU_DEAD after calling generic_set_cpu_dead(), + * which makes the delay in generic_cpu_die() not happen. + */ +void generic_set_cpu_up(unsigned int cpu) +{ + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; +} + +int generic_check_cpu_restart(unsigned int cpu) +{ + return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE; +} + +static bool secondaries_inhibited(void) +{ + return kvm_hv_mode_active(); +} + +#else /* HOTPLUG_CPU */ + +#define secondaries_inhibited() 0 + +#endif + +static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) +{ + struct thread_info *ti = task_thread_info(idle); + +#ifdef CONFIG_PPC64 + paca[cpu].__current = idle; + paca[cpu].kstack = (unsigned long)ti + THREAD_SIZE - STACK_FRAME_OVERHEAD; +#endif + ti->cpu = cpu; + secondary_ti = current_set[cpu] = ti; +} + +int __cpu_up(unsigned int cpu, struct task_struct *tidle) +{ + int rc, c; + + /* + * Don't allow secondary threads to come online if inhibited + */ + if (threads_per_core > 1 && secondaries_inhibited() && + cpu_thread_in_subcore(cpu)) + return -EBUSY; + + if (smp_ops == NULL || + (smp_ops->cpu_bootable && !smp_ops->cpu_bootable(cpu))) + return -EINVAL; + + cpu_idle_thread_init(cpu, tidle); + + /* Make sure callin-map entry is 0 (can be left over from a CPU + * hotplug) + */ + cpu_callin_map[cpu] = 0; + + /* The information for processor bringup must + * be written 
out to main store before we release + * the processor. + */ + smp_mb(); + + /* wake up cpus */ + DBG("smp: kicking cpu %d\n", cpu); + rc = smp_ops->kick_cpu(cpu); + if (rc) { + pr_err("smp: failed starting cpu %d (rc %d)\n", cpu, rc); + return rc; + } + + /* + * wait to see if the cpu made a callin (is actually up). + * use this value that I found through experimentation. + * -- Cort + */ + if (system_state < SYSTEM_RUNNING) + for (c = 50000; c && !cpu_callin_map[cpu]; c--) + udelay(100); +#ifdef CONFIG_HOTPLUG_CPU + else + /* + * CPUs can take much longer to come up in the + * hotplug case. Wait five seconds. + */ + for (c = 5000; c && !cpu_callin_map[cpu]; c--) + msleep(1); +#endif + + if (!cpu_callin_map[cpu]) { + printk(KERN_ERR "Processor %u is stuck.\n", cpu); + return -ENOENT; + } + + DBG("Processor %u found.\n", cpu); + + if (smp_ops->give_timebase) + smp_ops->give_timebase(); + + /* Wait until cpu puts itself in the online map */ + while (!cpu_online(cpu)) + cpu_relax(); + + return 0; +} + +/* Return the value of the reg property corresponding to the given + * logical cpu, or -1 if the cpu node or its reg property is not found. + */ +int cpu_to_core_id(int cpu) +{ + struct device_node *np; + const __be32 *reg; + int id = -1; + + np = of_get_cpu_node(cpu, NULL); + if (!np) + goto out; + + reg = of_get_property(np, "reg", NULL); + if (!reg) + goto out; + + id = be32_to_cpup(reg); +out: + of_node_put(np); + return id; +} + +/* Helper routines for cpu to core mapping */ +int cpu_core_index_of_thread(int cpu) +{ + return cpu >> threads_shift; +} +EXPORT_SYMBOL_GPL(cpu_core_index_of_thread); + +int cpu_first_thread_of_core(int core) +{ + return core << threads_shift; +} +EXPORT_SYMBOL_GPL(cpu_first_thread_of_core); + +static void traverse_siblings_chip_id(int cpu, bool add, int chipid) +{ + const struct cpumask *mask; + struct device_node *np; + int i, plen; + const __be32 *prop; + + mask = add ? 
cpu_online_mask : cpu_present_mask; + for_each_cpu(i, mask) { + np = of_get_cpu_node(i, NULL); + if (!np) + continue; + prop = of_get_property(np, "ibm,chip-id", &plen); + if (prop && plen == sizeof(int) && + of_read_number(prop, 1) == chipid) { + if (add) { + cpumask_set_cpu(cpu, cpu_core_mask(i)); + cpumask_set_cpu(i, cpu_core_mask(cpu)); + } else { + cpumask_clear_cpu(cpu, cpu_core_mask(i)); + cpumask_clear_cpu(i, cpu_core_mask(cpu)); + } + } + of_node_put(np); + } +} + +/* Must be called when no change can occur to cpu_present_mask, + * i.e. during cpu online or offline. + */ +static struct device_node *cpu_to_l2cache(int cpu) +{ + struct device_node *np; + struct device_node *cache; + + if (!cpu_present(cpu)) + return NULL; + + np = of_get_cpu_node(cpu, NULL); + if (np == NULL) + return NULL; + + cache = of_find_next_cache_node(np); + + of_node_put(np); + + return cache; +} + +static void traverse_core_siblings(int cpu, bool add) +{ + struct device_node *l2_cache, *np; + const struct cpumask *mask; + int i, chip, plen; + const __be32 *prop; + + /* First see if we have ibm,chip-id properties in cpu nodes */ + np = of_get_cpu_node(cpu, NULL); + if (np) { + chip = -1; + prop = of_get_property(np, "ibm,chip-id", &plen); + if (prop && plen == sizeof(int)) + chip = of_read_number(prop, 1); + of_node_put(np); + if (chip >= 0) { + traverse_siblings_chip_id(cpu, add, chip); + return; + } + } + /* Otherwise fall back to matching cpus by their l2 cache node */ + l2_cache = cpu_to_l2cache(cpu); + mask = add ? cpu_online_mask : cpu_present_mask; + for_each_cpu(i, mask) { + np = cpu_to_l2cache(i); + if (!np) + continue; + if (np == l2_cache) { + if (add) { + cpumask_set_cpu(cpu, cpu_core_mask(i)); + cpumask_set_cpu(i, cpu_core_mask(cpu)); + } else { + cpumask_clear_cpu(cpu, cpu_core_mask(i)); + cpumask_clear_cpu(i, cpu_core_mask(cpu)); + } + } + of_node_put(np); + } + of_node_put(l2_cache); +} + +/* Activate a secondary processor. 
 */ +void start_secondary(void *unused) +{ + unsigned int cpu = smp_processor_id(); + int i, base; + + atomic_inc(&init_mm.mm_count); + current->active_mm = &init_mm; + + smp_store_cpu_info(cpu); + set_dec(tb_ticks_per_jiffy); + preempt_disable(); + cpu_callin_map[cpu] = 1; /* __cpu_up() polls this entry */ + + if (smp_ops->setup_cpu) + smp_ops->setup_cpu(cpu); + if (smp_ops->take_timebase) + smp_ops->take_timebase(); + + secondary_cpu_time_init(); + +#ifdef CONFIG_PPC64 + if (system_state == SYSTEM_RUNNING) + vdso_data->processorCount++; + + vdso_getcpu_init(); +#endif + /* Update sibling maps */ + base = cpu_first_thread_sibling(cpu); + for (i = 0; i < threads_per_core; i++) { + if (cpu_is_offline(base + i) && (cpu != base + i)) + continue; + cpumask_set_cpu(cpu, cpu_sibling_mask(base + i)); + cpumask_set_cpu(base + i, cpu_sibling_mask(cpu)); + + /* cpu_core_map should be a superset of + * cpu_sibling_map even if we don't have cache + * information, so update the former here, too. + */ + cpumask_set_cpu(cpu, cpu_core_mask(base + i)); + cpumask_set_cpu(base + i, cpu_core_mask(cpu)); + } + traverse_core_siblings(cpu, true); + + /* + * numa_node_id() works after this. 
+ */ + set_numa_node(numa_cpu_lookup_table[cpu]); + set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); + + smp_wmb(); + notify_cpu_starting(cpu); + set_cpu_online(cpu, true); + + local_irq_enable(); + + cpu_startup_entry(CPUHP_ONLINE); + + BUG(); +} + +int setup_profiling_timer(unsigned int multiplier) +{ + return 0; +} + +#ifdef CONFIG_SCHED_SMT +/* SMT sched-domain flags; SD_ASYM_PACKING is added on CPUs with asymmetric SMT dependency (CPU_FTR_ASYM_SMT) */ +static int powerpc_smt_flags(void) +{ + int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; + + if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { + printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); + flags |= SD_ASYM_PACKING; + } + return flags; +} +#endif + +static struct sched_domain_topology_level powerpc_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +void __init smp_cpus_done(unsigned int max_cpus) +{ + cpumask_var_t old_mask; + + /* We want the setup_cpu() here to be called from CPU 0, but our + * init thread may have been "borrowed" by another CPU in the meantime + * so we pin ourselves down to CPU 0 (boot_cpuid) for a short while + */ + alloc_cpumask_var(&old_mask, GFP_NOWAIT); /* NOTE(review): the return value is not checked here */ + cpumask_copy(old_mask, tsk_cpus_allowed(current)); + set_cpus_allowed_ptr(current, cpumask_of(boot_cpuid)); + + if (smp_ops && smp_ops->setup_cpu) + smp_ops->setup_cpu(boot_cpuid); + + set_cpus_allowed_ptr(current, old_mask); + + free_cpumask_var(old_mask); + + if (smp_ops && smp_ops->bringup_done) + smp_ops->bringup_done(); + + dump_numa_cpu_topology(); + + set_sched_topology(powerpc_topology); + +} + +#ifdef CONFIG_HOTPLUG_CPU +int __cpu_disable(void) +{ + int cpu = smp_processor_id(); + int base, i; + int err; + + if (!smp_ops->cpu_disable) + return -ENOSYS; + + err = smp_ops->cpu_disable(); + if (err) + return err; + + /* Update sibling maps: drop this cpu from every thread of its core, and those threads from this cpu's masks */ + base = cpu_first_thread_sibling(cpu); + for (i = 0; i < threads_per_core; i++) { + cpumask_clear_cpu(cpu, 
cpu_sibling_mask(base + i)); + cpumask_clear_cpu(base + i, cpu_sibling_mask(cpu)); + cpumask_clear_cpu(cpu, cpu_core_mask(base + i)); + cpumask_clear_cpu(base + i, cpu_core_mask(cpu)); + } + traverse_core_siblings(cpu, false); + + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + if (smp_ops->cpu_die) + smp_ops->cpu_die(cpu); +} + +void cpu_die(void) +{ + if (ppc_md.cpu_die) + ppc_md.cpu_die(); + + /* If we return, we re-enter start_secondary */ + start_secondary_resume(); +} + +#endif diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c new file mode 100644 index 00000000000..3d30ef1038e --- /dev/null +++ b/arch/powerpc/kernel/stacktrace.c @@ -0,0 +1,63 @@ +/* + * Stack trace utility + * + * Copyright 2008 Christoph Hellwig, IBM Corp. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/stacktrace.h> +#include <asm/ptrace.h> +#include <asm/processor.h> + +/* + * Save stack-backtrace addresses into a stack_trace buffer. 
 */ +static void save_context_stack(struct stack_trace *trace, unsigned long sp, + struct task_struct *tsk, int savesched) +{ + for (;;) { + unsigned long *stack = (unsigned long *) sp; + unsigned long newsp, ip; + + if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD)) + return; + + newsp = stack[0]; /* stack pointer of the next frame to visit */ + ip = stack[STACK_FRAME_LR_SAVE]; /* address recorded for this frame */ + + /* addresses inside scheduler functions are dropped unless savesched is set */ + if (savesched || !in_sched_functions(ip)) { + if (!trace->skip) + trace->entries[trace->nr_entries++] = ip; + else + trace->skip--; + } + + if (trace->nr_entries >= trace->max_entries) + return; + + sp = newsp; + } +} + +void save_stack_trace(struct stack_trace *trace) +{ + unsigned long sp; + + asm("mr %0,1" : "=r" (sp)); /* copy register 1 into sp as the starting point of the walk */ + + save_context_stack(trace, sp, current, 1); +} +EXPORT_SYMBOL_GPL(save_stack_trace); + +void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + save_context_stack(trace, tsk->thread.ksp, tsk, 0); +} +EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c new file mode 100644 index 00000000000..0167d53da30 --- /dev/null +++ b/arch/powerpc/kernel/suspend.c @@ -0,0 +1,25 @@ +/* + * Suspend support specific for power. 
+ * + * Distribute under GPLv2 + * + * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz> + * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> + */ + +#include <linux/mm.h> +#include <asm/page.h> + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + * + * Returns nonzero when pfn is at or above the pfn of __nosave_begin and + * below the pfn of the page-aligned __nosave_end. + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c new file mode 100644 index 00000000000..eae33e10b65 --- /dev/null +++ b/arch/powerpc/kernel/swsusp.c @@ -0,0 +1,38 @@ +/* + * Common powerpc suspend code for 32 and 64 bits + * + * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
 */ + +#include <linux/sched.h> +#include <asm/current.h> +#include <asm/mmu_context.h> +#include <asm/switch_to.h> + +void save_processor_state(void) +{ + /* + * flush out all the special registers (the fp, altivec and spe + * flush helpers below) so we don't need to save them in the + * snapshot + */ + flush_fp_to_thread(current); + flush_altivec_to_thread(current); + flush_spe_to_thread(current); + +#ifdef CONFIG_PPC64 + hard_irq_disable(); +#endif + +} +/* Only CONFIG_PPC32 does any work here: it calls switch_mmu_context() for current->active_mm. */ +void restore_processor_state(void) +{ +#ifdef CONFIG_PPC32 + switch_mmu_context(current->active_mm, current->active_mm); +#endif +} diff --git a/arch/powerpc/kernel/swsusp_32.S b/arch/powerpc/kernel/swsusp_32.S new file mode 100644 index 00000000000..ba4dee3d233 --- /dev/null +++ b/arch/powerpc/kernel/swsusp_32.S @@ -0,0 +1,350 @@ +#include <linux/threads.h> +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/thread_info.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/mmu.h> + +/* + * Structure for storing CPU registers on the save area. 
 */ +#define SL_SP 0 +#define SL_PC 4 +#define SL_MSR 8 +#define SL_SDR1 0xc +#define SL_SPRG0 0x10 /* 4 sprg's */ +#define SL_DBAT0 0x20 +#define SL_IBAT0 0x28 +#define SL_DBAT1 0x30 +#define SL_IBAT1 0x38 +#define SL_DBAT2 0x40 +#define SL_IBAT2 0x48 +#define SL_DBAT3 0x50 +#define SL_IBAT3 0x58 +#define SL_TB 0x60 +#define SL_R2 0x68 +#define SL_CR 0x6c +#define SL_LR 0x70 +#define SL_R12 0x74 /* r12 to r31: 20 words, hence the 80 in SL_SIZE */ +#define SL_SIZE (SL_R12 + 80) + + .section .data + .align 5 + +_GLOBAL(swsusp_save_area) + .space SL_SIZE + + + .section .text + .align 5 + +_GLOBAL(swsusp_arch_suspend) + + lis r11,swsusp_save_area@h + ori r11,r11,swsusp_save_area@l + + mflr r0 + stw r0,SL_LR(r11) + mfcr r0 + stw r0,SL_CR(r11) + stw r1,SL_SP(r11) + stw r2,SL_R2(r11) + stmw r12,SL_R12(r11) + + /* Save MSR & SDR1 */ + mfmsr r4 + stw r4,SL_MSR(r11) + mfsdr1 r4 + stw r4,SL_SDR1(r11) + + /* Get a stable timebase and save it: retry if the upper half changed while reading */ +1: mftbu r4 + stw r4,SL_TB(r11) + mftb r5 + stw r5,SL_TB+4(r11) + mftbu r3 + cmpw r3,r4 + bne 1b + + /* Save SPRGs */ + mfsprg r4,0 + stw r4,SL_SPRG0(r11) + mfsprg r4,1 + stw r4,SL_SPRG0+4(r11) + mfsprg r4,2 + stw r4,SL_SPRG0+8(r11) + mfsprg r4,3 + stw r4,SL_SPRG0+12(r11) + + /* Save BATs */ + mfdbatu r4,0 + stw r4,SL_DBAT0(r11) + mfdbatl r4,0 + stw r4,SL_DBAT0+4(r11) + mfdbatu r4,1 + stw r4,SL_DBAT1(r11) + mfdbatl r4,1 + stw r4,SL_DBAT1+4(r11) + mfdbatu r4,2 + stw r4,SL_DBAT2(r11) + mfdbatl r4,2 + stw r4,SL_DBAT2+4(r11) + mfdbatu r4,3 + stw r4,SL_DBAT3(r11) + mfdbatl r4,3 + stw r4,SL_DBAT3+4(r11) + mfibatu r4,0 + stw r4,SL_IBAT0(r11) + mfibatl r4,0 + stw r4,SL_IBAT0+4(r11) + mfibatu r4,1 + stw r4,SL_IBAT1(r11) + mfibatl r4,1 + stw r4,SL_IBAT1+4(r11) + mfibatu r4,2 + stw r4,SL_IBAT2(r11) + mfibatl r4,2 + stw r4,SL_IBAT2+4(r11) + mfibatu r4,3 + stw r4,SL_IBAT3(r11) + mfibatl r4,3 + stw r4,SL_IBAT3+4(r11) + +#if 0 + /* Backup various CPU config stuffs */ + bl __save_cpu_setup +#endif + /* Call the low level suspend stuff (we should probably have made + * a stackframe...) 
+ */ + bl swsusp_save + + /* Restore LR from the save area */ + lis r11,swsusp_save_area@h + ori r11,r11,swsusp_save_area@l + lwz r0,SL_LR(r11) + mtlr r0 + + blr + + +/* Resume code */ +_GLOBAL(swsusp_arch_resume) + +#ifdef CONFIG_ALTIVEC + /* Stop pending altivec streams and memory accesses */ +BEGIN_FTR_SECTION + DSSALL +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif + sync + + /* Disable MSR:DR to make sure we don't take a TLB or + * hash miss during the copy, as our hash table will + * for a while be unusable. For .text, we assume we are + * covered by a BAT. This works only for non-G5 at this + * point. G5 will need a better approach, possibly using + * a small temporary hash table filled with large mappings, + * disabling the MMU completely isn't a good option for + * performance reasons. + * (Note that 750's may have the same performance issue as + * the G5 in this case, we should investigate using moving + * BATs for these CPUs) + */ + mfmsr r0 + sync + rlwinm r0,r0,0,28,26 /* clear MSR_DR */ + mtmsr r0 + sync + isync + + /* Load the pointer to the list of pages to copy into r10 (r3 is derived from it via tophys below) */ + lis r11,(restore_pblist - KERNELBASE)@h + ori r11,r11,restore_pblist@l + lwz r10,0(r11) + + /* Copy the pages. 
This is a very basic implementation, to + * be replaced by something more cache efficient */ +1: + tophys(r3,r10) + li r0,256 /* 256 iterations of the 16-byte inner loop: 4096 bytes copied per list entry */ + mtctr r0 + lwz r11,pbe_address(r3) /* source */ + tophys(r5,r11) + lwz r10,pbe_orig_address(r3) /* destination */ + tophys(r6,r10) +2: + lwz r8,0(r5) + lwz r9,4(r5) + lwz r10,8(r5) + lwz r11,12(r5) + addi r5,r5,16 + stw r8,0(r6) + stw r9,4(r6) + stw r10,8(r6) + stw r11,12(r6) + addi r6,r6,16 + bdnz 2b + lwz r10,pbe_next(r3) + cmpwi 0,r10,0 + bne 1b + + /* Do a very simple cache flush/inval of the L1 to ensure + * coherency of the icache + */ + lis r3,0x0002 + mtctr r3 + li r3, 0 +1: + lwz r0,0(r3) + addi r3,r3,0x0020 + bdnz 1b + isync + sync + + /* Now flush those cache lines */ + lis r3,0x0002 + mtctr r3 + li r3, 0 +1: + dcbf 0,r3 + addi r3,r3,0x0020 + bdnz 1b + sync + + /* Ok, we are now running with the kernel data of the old + * kernel fully restored. We can get to the save area + * easily now. As for the rest of the code, it assumes the + * loader kernel and the booted one are exactly identical + */ + lis r11,swsusp_save_area@h + ori r11,r11,swsusp_save_area@l + tophys(r11,r11) + +#if 0 + /* Restore various CPU config stuffs */ + bl __restore_cpu_setup +#endif + /* Restore the BATs, and SDR1. Then we can turn on the MMU. 
+ * This is a bit hairy as we are running out of those BATs, + * but first, our code is probably in the icache, and we are + * writing the same value to the BAT, so that should be fine, + * though a better solution will have to be found long-term + */ + lwz r4,SL_SDR1(r11) + mtsdr1 r4 + lwz r4,SL_SPRG0(r11) + mtsprg 0,r4 + lwz r4,SL_SPRG0+4(r11) + mtsprg 1,r4 + lwz r4,SL_SPRG0+8(r11) + mtsprg 2,r4 + lwz r4,SL_SPRG0+12(r11) + mtsprg 3,r4 + +#if 0 + lwz r4,SL_DBAT0(r11) + mtdbatu 0,r4 + lwz r4,SL_DBAT0+4(r11) + mtdbatl 0,r4 + lwz r4,SL_DBAT1(r11) + mtdbatu 1,r4 + lwz r4,SL_DBAT1+4(r11) + mtdbatl 1,r4 + lwz r4,SL_DBAT2(r11) + mtdbatu 2,r4 + lwz r4,SL_DBAT2+4(r11) + mtdbatl 2,r4 + lwz r4,SL_DBAT3(r11) + mtdbatu 3,r4 + lwz r4,SL_DBAT3+4(r11) + mtdbatl 3,r4 + lwz r4,SL_IBAT0(r11) + mtibatu 0,r4 + lwz r4,SL_IBAT0+4(r11) + mtibatl 0,r4 + lwz r4,SL_IBAT1(r11) + mtibatu 1,r4 + lwz r4,SL_IBAT1+4(r11) + mtibatl 1,r4 + lwz r4,SL_IBAT2(r11) + mtibatu 2,r4 + lwz r4,SL_IBAT2+4(r11) + mtibatl 2,r4 + lwz r4,SL_IBAT3(r11) + mtibatu 3,r4 + lwz r4,SL_IBAT3+4(r11) + mtibatl 3,r4 +#endif +/* Zero the DBAT4-7 and IBAT4-7 registers (feature section keyed on MMU_FTR_USE_HIGH_BATS) */ +BEGIN_MMU_FTR_SECTION + li r4,0 + mtspr SPRN_DBAT4U,r4 + mtspr SPRN_DBAT4L,r4 + mtspr SPRN_DBAT5U,r4 + mtspr SPRN_DBAT5L,r4 + mtspr SPRN_DBAT6U,r4 + mtspr SPRN_DBAT6L,r4 + mtspr SPRN_DBAT7U,r4 + mtspr SPRN_DBAT7L,r4 + mtspr SPRN_IBAT4U,r4 + mtspr SPRN_IBAT4L,r4 + mtspr SPRN_IBAT5U,r4 + mtspr SPRN_IBAT5L,r4 + mtspr SPRN_IBAT6U,r4 + mtspr SPRN_IBAT6L,r4 + mtspr SPRN_IBAT7U,r4 + mtspr SPRN_IBAT7L,r4 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) + + /* Flush all TLBs */ + lis r4,0x1000 +1: addic. 
r4,r4,-0x1000 + tlbie r4 + bgt 1b + sync + + /* restore the MSR and turn on the MMU */ + lwz r3,SL_MSR(r11) + bl turn_on_mmu + tovirt(r11,r11) + + /* Restore TB: zero the lower half first, then write the saved upper and lower halves */ + li r3,0 + mttbl r3 + lwz r3,SL_TB(r11) + lwz r4,SL_TB+4(r11) + mttbu r3 + mttbl r4 + + /* Kick decrementer */ + li r0,1 + mtdec r0 + + /* Restore the callee-saved registers and return */ + lwz r0,SL_CR(r11) + mtcr r0 + lwz r2,SL_R2(r11) + lmw r12,SL_R12(r11) + lwz r1,SL_SP(r11) + lwz r0,SL_LR(r11) + mtlr r0 + + // XXX Note: we don't really need to call swsusp_resume + + li r3,0 + blr + +/* FIXME: This construct is actually not useful since we don't shut + * down the instruction MMU, we could just flip back MSR-DR on. + */ +turn_on_mmu: + mflr r4 + mtsrr0 r4 + mtsrr1 r3 + sync + isync + rfi + diff --git a/arch/powerpc/kernel/swsusp_64.c b/arch/powerpc/kernel/swsusp_64.c new file mode 100644 index 00000000000..0e899e47c32 --- /dev/null +++ b/arch/powerpc/kernel/swsusp_64.c @@ -0,0 +1,24 @@ +/* + * PowerPC 64-bit swsusp implementation + * + * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> + * + * GPLv2 + */ + +#include <asm/iommu.h> +#include <linux/irq.h> +#include <linux/sched.h> +#include <linux/interrupt.h> + +void do_after_copyback(void) +{ + iommu_restore(); + touch_softlockup_watchdog(); + mb(); +} + +void _iommu_save(void) +{ + iommu_save(); +} diff --git a/arch/powerpc/kernel/swsusp_asm64.S b/arch/powerpc/kernel/swsusp_asm64.S new file mode 100644 index 00000000000..988f38dced0 --- /dev/null +++ b/arch/powerpc/kernel/swsusp_asm64.S @@ -0,0 +1,273 @@ +/* + * PowerPC 64-bit swsusp implementation + * + * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> + * + * GPLv2 + */ + +#include <linux/threads.h> +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/thread_info.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +/* + * Structure for storing CPU registers on the save area. 
+ */ +#define SL_r1 0x00 /* stack pointer */ +#define SL_PC 0x08 +#define SL_MSR 0x10 +#define SL_SDR1 0x18 +#define SL_XER 0x20 +#define SL_TB 0x40 +#define SL_r2 0x48 +#define SL_CR 0x50 +#define SL_LR 0x58 +#define SL_r12 0x60 +#define SL_r13 0x68 +#define SL_r14 0x70 +#define SL_r15 0x78 +#define SL_r16 0x80 +#define SL_r17 0x88 +#define SL_r18 0x90 +#define SL_r19 0x98 +#define SL_r20 0xa0 +#define SL_r21 0xa8 +#define SL_r22 0xb0 +#define SL_r23 0xb8 +#define SL_r24 0xc0 +#define SL_r25 0xc8 +#define SL_r26 0xd0 +#define SL_r27 0xd8 +#define SL_r28 0xe0 +#define SL_r29 0xe8 +#define SL_r30 0xf0 +#define SL_r31 0xf8 +#define SL_SPRG1 0x100 +#define SL_TCR 0x108 +#define SL_SIZE SL_TCR+8 + +/* these macros rely on the save area being + * pointed to by r11 */ + +#define SAVE_SPR(register) \ + mfspr r0, SPRN_##register ;\ + std r0, SL_##register(r11) +#define RESTORE_SPR(register) \ + ld r0, SL_##register(r11) ;\ + mtspr SPRN_##register, r0 +#define SAVE_SPECIAL(special) \ + mf##special r0 ;\ + std r0, SL_##special(r11) +#define RESTORE_SPECIAL(special) \ + ld r0, SL_##special(r11) ;\ + mt##special r0 +#define SAVE_REGISTER(reg) \ + std reg, SL_##reg(r11) +#define RESTORE_REGISTER(reg) \ + ld reg, SL_##reg(r11) + +/* space for storing cpu state */ + .section .data + .align 5 +swsusp_save_area: + .space SL_SIZE + + .section ".toc","aw" +swsusp_save_area_ptr: + .tc swsusp_save_area[TC],swsusp_save_area +restore_pblist_ptr: + .tc restore_pblist[TC],restore_pblist + + .section .text + .align 5 +_GLOBAL(swsusp_arch_suspend) + ld r11,swsusp_save_area_ptr@toc(r2) + SAVE_SPECIAL(LR) + SAVE_REGISTER(r1) + SAVE_SPECIAL(CR) + SAVE_SPECIAL(TB) + SAVE_REGISTER(r2) + SAVE_REGISTER(r12) + SAVE_REGISTER(r13) + SAVE_REGISTER(r14) + SAVE_REGISTER(r15) + SAVE_REGISTER(r16) + SAVE_REGISTER(r17) + SAVE_REGISTER(r18) + SAVE_REGISTER(r19) + SAVE_REGISTER(r20) + SAVE_REGISTER(r21) + SAVE_REGISTER(r22) + SAVE_REGISTER(r23) + SAVE_REGISTER(r24) + SAVE_REGISTER(r25) + SAVE_REGISTER(r26) + 
SAVE_REGISTER(r27) + SAVE_REGISTER(r28) + SAVE_REGISTER(r29) + SAVE_REGISTER(r30) + SAVE_REGISTER(r31) + SAVE_SPECIAL(MSR) + SAVE_SPECIAL(XER) +#ifdef CONFIG_PPC_BOOK3S_64 +BEGIN_FW_FTR_SECTION + SAVE_SPECIAL(SDR1) +END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR) +#else + SAVE_SPR(TCR) + + /* Save SPRG1; SPRG1 is used to save the paca */ + SAVE_SPR(SPRG1) +#endif + + /* we push the stack up 128 bytes but don't store the + * stack pointer on the stack like a real stackframe */ + addi r1,r1,-128 + + bl _iommu_save + bl swsusp_save + + /* restore LR */ + ld r11,swsusp_save_area_ptr@toc(r2) + RESTORE_SPECIAL(LR) + addi r1,r1,128 + + blr + +/* Resume code */ +_GLOBAL(swsusp_arch_resume) + /* Stop pending AltiVec streams and memory accesses */ +BEGIN_FTR_SECTION + DSSALL +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) + sync + + ld r12,restore_pblist_ptr@toc(r2) + ld r12,0(r12) + + cmpdi r12,0 + beq- nothing_to_copy + li r15,PAGE_SIZE>>3 +copyloop: + ld r13,pbe_address(r12) + ld r14,pbe_orig_address(r12) + + mtctr r15 + li r10,0 +copy_page_loop: + ldx r0,r10,r13 + stdx r0,r10,r14 + addi r10,r10,8 + bdnz copy_page_loop + + ld r12,pbe_next(r12) + cmpdi r12,0 + bne+ copyloop +nothing_to_copy: + +#ifdef CONFIG_PPC_BOOK3S_64 + /* flush caches */ + lis r3, 0x10 + mtctr r3 + li r3, 0 + ori r3, r3, CONFIG_KERNEL_START>>48 + li r0, 48 + sld r3, r3, r0 + li r0, 0 +1: + dcbf r0,r3 + addi r3,r3,0x20 + bdnz 1b + + sync + + tlbia +#endif + + ld r11,swsusp_save_area_ptr@toc(r2) + + RESTORE_SPECIAL(CR) + + /* restore timebase */ + /* load saved tb */ + ld r1, SL_TB(r11) + /* get upper 32 bits of it */ + srdi r2, r1, 32 + /* clear tb lower to avoid wrap */ + li r0, 0 + mttbl r0 + /* set tb upper */ + mttbu r2 + /* set tb lower */ + mttbl r1 + + /* restore registers */ + RESTORE_REGISTER(r1) + RESTORE_REGISTER(r2) + RESTORE_REGISTER(r12) + RESTORE_REGISTER(r13) + RESTORE_REGISTER(r14) + RESTORE_REGISTER(r15) + RESTORE_REGISTER(r16) + RESTORE_REGISTER(r17) + RESTORE_REGISTER(r18) + RESTORE_REGISTER(r19) + 
RESTORE_REGISTER(r20) + RESTORE_REGISTER(r21) + RESTORE_REGISTER(r22) + RESTORE_REGISTER(r23) + RESTORE_REGISTER(r24) + RESTORE_REGISTER(r25) + RESTORE_REGISTER(r26) + RESTORE_REGISTER(r27) + RESTORE_REGISTER(r28) + RESTORE_REGISTER(r29) + RESTORE_REGISTER(r30) + RESTORE_REGISTER(r31) + +#ifdef CONFIG_PPC_BOOK3S_64 + /* can't use RESTORE_SPECIAL(MSR) */ + ld r0, SL_MSR(r11) + mtmsrd r0, 0 +BEGIN_FW_FTR_SECTION + RESTORE_SPECIAL(SDR1) +END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR) +#else + /* Restore SPRG1, which is used to save the paca */ + ld r0, SL_SPRG1(r11) + mtsprg 1, r0 + + RESTORE_SPECIAL(MSR) + + /* Restore TCR and clear any pending bits in TSR. */ + RESTORE_SPR(TCR) + lis r0, (TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS)@h + mtspr SPRN_TSR, r0 + + /* Kick decrementer */ + li r0, 1 + mtdec r0 + + /* Invalidate all tlbs */ + bl _tlbil_all +#endif + RESTORE_SPECIAL(XER) + + sync + + addi r1,r1,-128 +#ifdef CONFIG_PPC_BOOK3S_64 + bl slb_flush_and_rebolt +#endif + bl do_after_copyback + addi r1,r1,128 + + ld r11,swsusp_save_area_ptr@toc(r2) + RESTORE_SPECIAL(LR) + + li r3, 0 + blr diff --git a/arch/powerpc/kernel/swsusp_booke.S b/arch/powerpc/kernel/swsusp_booke.S new file mode 100644 index 00000000000..553c1405ee0 --- /dev/null +++ b/arch/powerpc/kernel/swsusp_booke.S @@ -0,0 +1,201 @@ +/* + * Based on swsusp_32.S, modified for FSL BookE by + * Anton Vorontsov <avorontsov@ru.mvista.com> + * Copyright (c) 2009-2010 MontaVista Software, LLC. + */ + +#include <linux/threads.h> +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/thread_info.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/mmu.h> + +/* + * Structure for storing CPU registers on the save area. 
+ */ +#define SL_SP 0 +#define SL_PC 4 +#define SL_MSR 8 +#define SL_TCR 0xc +#define SL_SPRG0 0x10 +#define SL_SPRG1 0x14 +#define SL_SPRG2 0x18 +#define SL_SPRG3 0x1c +#define SL_SPRG4 0x20 +#define SL_SPRG5 0x24 +#define SL_SPRG6 0x28 +#define SL_SPRG7 0x2c +#define SL_TBU 0x30 +#define SL_TBL 0x34 +#define SL_R2 0x38 +#define SL_CR 0x3c +#define SL_LR 0x40 +#define SL_R12 0x44 /* r12 to r31 */ +#define SL_SIZE (SL_R12 + 80) + + .section .data + .align 5 + +_GLOBAL(swsusp_save_area) + .space SL_SIZE + + + .section .text + .align 5 + +_GLOBAL(swsusp_arch_suspend) + lis r11,swsusp_save_area@h + ori r11,r11,swsusp_save_area@l + + mflr r0 + stw r0,SL_LR(r11) + mfcr r0 + stw r0,SL_CR(r11) + stw r1,SL_SP(r11) + stw r2,SL_R2(r11) + stmw r12,SL_R12(r11) + + /* Save MSR & TCR */ + mfmsr r4 + stw r4,SL_MSR(r11) + mfspr r4,SPRN_TCR + stw r4,SL_TCR(r11) + + /* Get a stable timebase and save it */ +1: mfspr r4,SPRN_TBRU + stw r4,SL_TBU(r11) + mfspr r5,SPRN_TBRL + stw r5,SL_TBL(r11) + mfspr r3,SPRN_TBRU + cmpw r3,r4 + bne 1b + + /* Save SPRGs */ + mfspr r4,SPRN_SPRG0 + stw r4,SL_SPRG0(r11) + mfspr r4,SPRN_SPRG1 + stw r4,SL_SPRG1(r11) + mfspr r4,SPRN_SPRG2 + stw r4,SL_SPRG2(r11) + mfspr r4,SPRN_SPRG3 + stw r4,SL_SPRG3(r11) + mfspr r4,SPRN_SPRG4 + stw r4,SL_SPRG4(r11) + mfspr r4,SPRN_SPRG5 + stw r4,SL_SPRG5(r11) + mfspr r4,SPRN_SPRG6 + stw r4,SL_SPRG6(r11) + mfspr r4,SPRN_SPRG7 + stw r4,SL_SPRG7(r11) + + /* Call the low level suspend stuff (we should probably have made + * a stackframe...) + */ + bl swsusp_save + + /* Restore LR from the save area */ + lis r11,swsusp_save_area@h + ori r11,r11,swsusp_save_area@l + lwz r0,SL_LR(r11) + mtlr r0 + + blr + +_GLOBAL(swsusp_arch_resume) + sync + + /* Load the pointer to the list of pages to copy into r3 */ + lis r11,(restore_pblist)@h + ori r11,r11,restore_pblist@l + lwz r3,0(r11) + + /* Copy the pages. 
This is a very basic implementation, to + * be replaced by something more cache efficient */ +1: + li r0,256 + mtctr r0 + lwz r5,pbe_address(r3) /* source */ + lwz r6,pbe_orig_address(r3) /* destination */ +2: + lwz r8,0(r5) + lwz r9,4(r5) + lwz r10,8(r5) + lwz r11,12(r5) + addi r5,r5,16 + stw r8,0(r6) + stw r9,4(r6) + stw r10,8(r6) + stw r11,12(r6) + addi r6,r6,16 + bdnz 2b + lwz r3,pbe_next(r3) + cmpwi 0,r3,0 + bne 1b + + bl flush_dcache_L1 + bl flush_instruction_cache + + lis r11,swsusp_save_area@h + ori r11,r11,swsusp_save_area@l + + /* + * Mappings from virtual addresses to physical addresses may be + * different than they were prior to restoring hibernation state. + * Invalidate the TLB so that the boot CPU is using the new + * mappings. + */ + bl _tlbil_all + + lwz r4,SL_SPRG0(r11) + mtspr SPRN_SPRG0,r4 + lwz r4,SL_SPRG1(r11) + mtspr SPRN_SPRG1,r4 + lwz r4,SL_SPRG2(r11) + mtspr SPRN_SPRG2,r4 + lwz r4,SL_SPRG3(r11) + mtspr SPRN_SPRG3,r4 + lwz r4,SL_SPRG4(r11) + mtspr SPRN_SPRG4,r4 + lwz r4,SL_SPRG5(r11) + mtspr SPRN_SPRG5,r4 + lwz r4,SL_SPRG6(r11) + mtspr SPRN_SPRG6,r4 + lwz r4,SL_SPRG7(r11) + mtspr SPRN_SPRG7,r4 + + /* restore the MSR */ + lwz r3,SL_MSR(r11) + mtmsr r3 + + /* Restore TB */ + li r3,0 + mtspr SPRN_TBWL,r3 + lwz r3,SL_TBU(r11) + lwz r4,SL_TBL(r11) + mtspr SPRN_TBWU,r3 + mtspr SPRN_TBWL,r4 + + /* Restore TCR and clear any pending bits in TSR. 
*/ + lwz r4,SL_TCR(r11) + mtspr SPRN_TCR,r4 + lis r4, (TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS)@h + mtspr SPRN_TSR,r4 + + /* Kick decrementer */ + li r0,1 + mtdec r0 + + /* Restore the callee-saved registers and return */ + lwz r0,SL_CR(r11) + mtcr r0 + lwz r2,SL_R2(r11) + lmw r12,SL_R12(r11) + lwz r1,SL_SP(r11) + lwz r0,SL_LR(r11) + mtlr r0 + + li r3,0 + blr diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index a8210ed5c68..8a285876aef 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -14,7 +14,6 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/fs.h> @@ -23,10 +22,7 @@ #include <linux/signal.h> #include <linux/resource.h> #include <linux/times.h> -#include <linux/utsname.h> -#include <linux/timex.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/sem.h> #include <linux/msg.h> #include <linux/shm.h> @@ -43,75 +39,19 @@ #include <linux/compat.h> #include <linux/ptrace.h> #include <linux/elf.h> +#include <linux/ipc.h> +#include <linux/slab.h> #include <asm/ptrace.h> #include <asm/types.h> -#include <asm/ipc.h> #include <asm/uaccess.h> #include <asm/unistd.h> -#include <asm/semaphore.h> #include <asm/time.h> #include <asm/mmu_context.h> -#include <asm/systemcfg.h> #include <asm/ppc-pci.h> +#include <asm/syscalls.h> +#include <asm/switch_to.h> -/* readdir & getdents */ -#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de))) -#define ROUND_UP(x) (((x)+sizeof(u32)-1) & ~(sizeof(u32)-1)) - -struct old_linux_dirent32 { - u32 d_ino; - u32 d_offset; - unsigned short d_namlen; - char d_name[1]; -}; - -struct readdir_callback32 { - struct old_linux_dirent32 __user * dirent; - int count; -}; - -static int fillonedir(void * __buf, const char * name, int namlen, - off_t offset, ino_t ino, unsigned int d_type) -{ - struct readdir_callback32 * buf = (struct readdir_callback32 
*) __buf; - struct old_linux_dirent32 __user * dirent; - - if (buf->count) - return -EINVAL; - buf->count++; - dirent = buf->dirent; - put_user(ino, &dirent->d_ino); - put_user(offset, &dirent->d_offset); - put_user(namlen, &dirent->d_namlen); - copy_to_user(dirent->d_name, name, namlen); - put_user(0, dirent->d_name + namlen); - return 0; -} - -asmlinkage int old32_readdir(unsigned int fd, struct old_linux_dirent32 __user *dirent, unsigned int count) -{ - int error = -EBADF; - struct file * file; - struct readdir_callback32 buf; - - file = fget(fd); - if (!file) - goto out; - - buf.count = 0; - buf.dirent = dirent; - - error = vfs_readdir(file, (filldir_t)fillonedir, &buf); - if (error < 0) - goto out_putf; - error = buf.count; - -out_putf: - fput(file); -out: - return error; -} asmlinkage long ppc32_select(u32 n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, @@ -121,780 +61,6 @@ asmlinkage long ppc32_select(u32 n, compat_ulong_t __user *inp, return compat_sys_select((int)n, inp, outp, exp, compat_ptr(tvp_x)); } -int cp_compat_stat(struct kstat *stat, struct compat_stat __user *statbuf) -{ - long err; - - if (stat->size > MAX_NON_LFS || !new_valid_dev(stat->dev) || - !new_valid_dev(stat->rdev)) - return -EOVERFLOW; - - err = access_ok(VERIFY_WRITE, statbuf, sizeof(*statbuf)) ? 
0 : -EFAULT; - err |= __put_user(new_encode_dev(stat->dev), &statbuf->st_dev); - err |= __put_user(stat->ino, &statbuf->st_ino); - err |= __put_user(stat->mode, &statbuf->st_mode); - err |= __put_user(stat->nlink, &statbuf->st_nlink); - err |= __put_user(stat->uid, &statbuf->st_uid); - err |= __put_user(stat->gid, &statbuf->st_gid); - err |= __put_user(new_encode_dev(stat->rdev), &statbuf->st_rdev); - err |= __put_user(stat->size, &statbuf->st_size); - err |= __put_user(stat->atime.tv_sec, &statbuf->st_atime); - err |= __put_user(stat->atime.tv_nsec, &statbuf->st_atime_nsec); - err |= __put_user(stat->mtime.tv_sec, &statbuf->st_mtime); - err |= __put_user(stat->mtime.tv_nsec, &statbuf->st_mtime_nsec); - err |= __put_user(stat->ctime.tv_sec, &statbuf->st_ctime); - err |= __put_user(stat->ctime.tv_nsec, &statbuf->st_ctime_nsec); - err |= __put_user(stat->blksize, &statbuf->st_blksize); - err |= __put_user(stat->blocks, &statbuf->st_blocks); - err |= __put_user(0, &statbuf->__unused4[0]); - err |= __put_user(0, &statbuf->__unused4[1]); - - return err; -} - -/* Note: it is necessary to treat option as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sysfs(u32 option, u32 arg1, u32 arg2) -{ - return sys_sysfs((int)option, arg1, arg2); -} - -/* Handle adjtimex compatibility. 
*/ -struct timex32 { - u32 modes; - s32 offset, freq, maxerror, esterror; - s32 status, constant, precision, tolerance; - struct compat_timeval time; - s32 tick; - s32 ppsfreq, jitter, shift, stabil; - s32 jitcnt, calcnt, errcnt, stbcnt; - s32 :32; s32 :32; s32 :32; s32 :32; - s32 :32; s32 :32; s32 :32; s32 :32; - s32 :32; s32 :32; s32 :32; s32 :32; -}; - -extern int do_adjtimex(struct timex *); -extern void ppc_adjtimex(void); - -asmlinkage long compat_sys_adjtimex(struct timex32 __user *utp) -{ - struct timex txc; - int ret; - - memset(&txc, 0, sizeof(struct timex)); - - if(get_user(txc.modes, &utp->modes) || - __get_user(txc.offset, &utp->offset) || - __get_user(txc.freq, &utp->freq) || - __get_user(txc.maxerror, &utp->maxerror) || - __get_user(txc.esterror, &utp->esterror) || - __get_user(txc.status, &utp->status) || - __get_user(txc.constant, &utp->constant) || - __get_user(txc.precision, &utp->precision) || - __get_user(txc.tolerance, &utp->tolerance) || - __get_user(txc.time.tv_sec, &utp->time.tv_sec) || - __get_user(txc.time.tv_usec, &utp->time.tv_usec) || - __get_user(txc.tick, &utp->tick) || - __get_user(txc.ppsfreq, &utp->ppsfreq) || - __get_user(txc.jitter, &utp->jitter) || - __get_user(txc.shift, &utp->shift) || - __get_user(txc.stabil, &utp->stabil) || - __get_user(txc.jitcnt, &utp->jitcnt) || - __get_user(txc.calcnt, &utp->calcnt) || - __get_user(txc.errcnt, &utp->errcnt) || - __get_user(txc.stbcnt, &utp->stbcnt)) - return -EFAULT; - - ret = do_adjtimex(&txc); - - /* adjust the conversion of TB to time of day to track adjtimex */ - ppc_adjtimex(); - - if(put_user(txc.modes, &utp->modes) || - __put_user(txc.offset, &utp->offset) || - __put_user(txc.freq, &utp->freq) || - __put_user(txc.maxerror, &utp->maxerror) || - __put_user(txc.esterror, &utp->esterror) || - __put_user(txc.status, &utp->status) || - __put_user(txc.constant, &utp->constant) || - __put_user(txc.precision, &utp->precision) || - __put_user(txc.tolerance, &utp->tolerance) || - 
__put_user(txc.time.tv_sec, &utp->time.tv_sec) || - __put_user(txc.time.tv_usec, &utp->time.tv_usec) || - __put_user(txc.tick, &utp->tick) || - __put_user(txc.ppsfreq, &utp->ppsfreq) || - __put_user(txc.jitter, &utp->jitter) || - __put_user(txc.shift, &utp->shift) || - __put_user(txc.stabil, &utp->stabil) || - __put_user(txc.jitcnt, &utp->jitcnt) || - __put_user(txc.calcnt, &utp->calcnt) || - __put_user(txc.errcnt, &utp->errcnt) || - __put_user(txc.stbcnt, &utp->stbcnt)) - ret = -EFAULT; - - return ret; -} - -asmlinkage long compat_sys_pause(void) -{ - current->state = TASK_INTERRUPTIBLE; - schedule(); - - return -ERESTARTNOHAND; -} - -static inline long get_ts32(struct timespec *o, struct compat_timeval __user *i) -{ - long usec; - - if (!access_ok(VERIFY_READ, i, sizeof(*i))) - return -EFAULT; - if (__get_user(o->tv_sec, &i->tv_sec)) - return -EFAULT; - if (__get_user(usec, &i->tv_usec)) - return -EFAULT; - o->tv_nsec = usec * 1000; - return 0; -} - -static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i) -{ - return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || - (__put_user(i->tv_sec, &o->tv_sec) | - __put_user(i->tv_usec, &o->tv_usec))); -} - -struct sysinfo32 { - s32 uptime; - u32 loads[3]; - u32 totalram; - u32 freeram; - u32 sharedram; - u32 bufferram; - u32 totalswap; - u32 freeswap; - unsigned short procs; - unsigned short pad; - u32 totalhigh; - u32 freehigh; - u32 mem_unit; - char _f[20-2*sizeof(int)-sizeof(int)]; -}; - -asmlinkage long compat_sys_sysinfo(struct sysinfo32 __user *info) -{ - struct sysinfo s; - int ret, err; - int bitcount=0; - mm_segment_t old_fs = get_fs (); - - /* The __user cast is valid due to set_fs() */ - set_fs (KERNEL_DS); - ret = sys_sysinfo((struct sysinfo __user *)&s); - set_fs (old_fs); - - /* Check to see if any memory value is too large for 32-bit and - * scale down if needed. 
- */ - if ((s.totalram >> 32) || (s.totalswap >> 32)) { - while (s.mem_unit < PAGE_SIZE) { - s.mem_unit <<= 1; - bitcount++; - } - s.totalram >>=bitcount; - s.freeram >>= bitcount; - s.sharedram >>= bitcount; - s.bufferram >>= bitcount; - s.totalswap >>= bitcount; - s.freeswap >>= bitcount; - s.totalhigh >>= bitcount; - s.freehigh >>= bitcount; - } - - err = put_user (s.uptime, &info->uptime); - err |= __put_user (s.loads[0], &info->loads[0]); - err |= __put_user (s.loads[1], &info->loads[1]); - err |= __put_user (s.loads[2], &info->loads[2]); - err |= __put_user (s.totalram, &info->totalram); - err |= __put_user (s.freeram, &info->freeram); - err |= __put_user (s.sharedram, &info->sharedram); - err |= __put_user (s.bufferram, &info->bufferram); - err |= __put_user (s.totalswap, &info->totalswap); - err |= __put_user (s.freeswap, &info->freeswap); - err |= __put_user (s.procs, &info->procs); - err |= __put_user (s.totalhigh, &info->totalhigh); - err |= __put_user (s.freehigh, &info->freehigh); - err |= __put_user (s.mem_unit, &info->mem_unit); - if (err) - return -EFAULT; - - return ret; -} - - - - -/* Translations due to time_t size differences. Which affects all - sorts of things, like timeval and itimerval. */ -extern struct timezone sys_tz; - -asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) -{ - if (tv) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (put_tv32(tv, &ktv)) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - - return 0; -} - - - -asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) -{ - struct timespec kts; - struct timezone ktz; - - if (tv) { - if (get_ts32(&kts, tv)) - return -EFAULT; - } - if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &kts : NULL, tz ? 
&ktz : NULL); -} - -#ifdef CONFIG_SYSVIPC -long compat_sys_ipc(u32 call, u32 first, u32 second, u32 third, compat_uptr_t ptr, - u32 fifth) -{ - int version; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - - case SEMTIMEDOP: - if (fifth) - /* sign extend semid */ - return compat_sys_semtimedop((int)first, - compat_ptr(ptr), second, - compat_ptr(fifth)); - /* else fall through for normal semop() */ - case SEMOP: - /* struct sembuf is the same on 32 and 64bit :)) */ - /* sign extend semid */ - return sys_semtimedop((int)first, compat_ptr(ptr), second, - NULL); - case SEMGET: - /* sign extend key, nsems */ - return sys_semget((int)first, (int)second, third); - case SEMCTL: - /* sign extend semid, semnum */ - return compat_sys_semctl((int)first, (int)second, third, - compat_ptr(ptr)); - - case MSGSND: - /* sign extend msqid */ - return compat_sys_msgsnd((int)first, (int)second, third, - compat_ptr(ptr)); - case MSGRCV: - /* sign extend msqid, msgtyp */ - return compat_sys_msgrcv((int)first, second, (int)fifth, - third, version, compat_ptr(ptr)); - case MSGGET: - /* sign extend key */ - return sys_msgget((int)first, second); - case MSGCTL: - /* sign extend msqid */ - return compat_sys_msgctl((int)first, second, compat_ptr(ptr)); - - case SHMAT: - /* sign extend shmid */ - return compat_sys_shmat((int)first, second, third, version, - compat_ptr(ptr)); - case SHMDT: - return sys_shmdt(compat_ptr(ptr)); - case SHMGET: - /* sign extend key_t */ - return sys_shmget((int)first, second, third); - case SHMCTL: - /* sign extend shmid */ - return compat_sys_shmctl((int)first, second, compat_ptr(ptr)); - - default: - return -ENOSYS; - } - - return -ENOSYS; -} -#endif - -/* Note: it is necessary to treat out_fd and in_fd as unsigned ints, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the 
register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sendfile(u32 out_fd, u32 in_fd, compat_off_t __user * offset, u32 count) -{ - mm_segment_t old_fs = get_fs(); - int ret; - off_t of; - off_t __user *up; - - if (offset && get_user(of, offset)) - return -EFAULT; - - /* The __user pointer cast is valid because of the set_fs() */ - set_fs(KERNEL_DS); - up = offset ? (off_t __user *) &of : NULL; - ret = sys_sendfile((int)out_fd, (int)in_fd, up, count); - set_fs(old_fs); - - if (offset && put_user(of, offset)) - return -EFAULT; - - return ret; -} - -asmlinkage int compat_sys_sendfile64(int out_fd, int in_fd, compat_loff_t __user *offset, s32 count) -{ - mm_segment_t old_fs = get_fs(); - int ret; - loff_t lof; - loff_t __user *up; - - if (offset && get_user(lof, offset)) - return -EFAULT; - - /* The __user pointer cast is valid because of the set_fs() */ - set_fs(KERNEL_DS); - up = offset ? (loff_t __user *) &lof : NULL; - ret = sys_sendfile64(out_fd, in_fd, up, count); - set_fs(old_fs); - - if (offset && put_user(lof, offset)) - return -EFAULT; - - return ret; -} - -long compat_sys_execve(unsigned long a0, unsigned long a1, unsigned long a2, - unsigned long a3, unsigned long a4, unsigned long a5, - struct pt_regs *regs) -{ - int error; - char * filename; - - filename = getname((char __user *) a0); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - goto out; - flush_fp_to_thread(current); - flush_altivec_to_thread(current); - - error = compat_do_execve(filename, compat_ptr(a1), compat_ptr(a2), regs); - - if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); - } - putname(filename); - -out: - return error; -} - -/* Note: it is necessary to treat option as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register 
representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_prctl(u32 option, u32 arg2, u32 arg3, u32 arg4, u32 arg5) -{ - return sys_prctl((int)option, - (unsigned long) arg2, - (unsigned long) arg3, - (unsigned long) arg4, - (unsigned long) arg5); -} - -/* Note: it is necessary to treat pid as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sched_rr_get_interval(u32 pid, struct compat_timespec __user *interval) -{ - struct timespec t; - int ret; - mm_segment_t old_fs = get_fs (); - - /* The __user pointer cast is valid because of the set_fs() */ - set_fs (KERNEL_DS); - ret = sys_sched_rr_get_interval((int)pid, (struct timespec __user *) &t); - set_fs (old_fs); - if (put_compat_timespec(&t, interval)) - return -EFAULT; - return ret; -} - -asmlinkage int compat_sys_pciconfig_read(u32 bus, u32 dfn, u32 off, u32 len, u32 ubuf) -{ - return sys_pciconfig_read((unsigned long) bus, - (unsigned long) dfn, - (unsigned long) off, - (unsigned long) len, - compat_ptr(ubuf)); -} - -asmlinkage int compat_sys_pciconfig_write(u32 bus, u32 dfn, u32 off, u32 len, u32 ubuf) -{ - return sys_pciconfig_write((unsigned long) bus, - (unsigned long) dfn, - (unsigned long) off, - (unsigned long) len, - compat_ptr(ubuf)); -} - -asmlinkage int compat_sys_pciconfig_iobase(u32 which, u32 in_bus, u32 in_devfn) -{ - return sys_pciconfig_iobase(which, in_bus, in_devfn); -} - - -/* Note: it is necessary to treat mode as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is 
performed. - */ -asmlinkage long compat_sys_access(const char __user * filename, u32 mode) -{ - return sys_access(filename, (int)mode); -} - - -/* Note: it is necessary to treat mode as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_creat(const char __user * pathname, u32 mode) -{ - return sys_creat(pathname, (int)mode); -} - - -/* Note: it is necessary to treat pid and options as unsigned ints, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_waitpid(u32 pid, unsigned int __user * stat_addr, u32 options) -{ - return sys_waitpid((int)pid, stat_addr, (int)options); -} - - -/* Note: it is necessary to treat gidsetsize as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_getgroups(u32 gidsetsize, gid_t __user *grouplist) -{ - return sys_getgroups((int)gidsetsize, grouplist); -} - - -/* Note: it is necessary to treat pid as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. 
- */ -asmlinkage long compat_sys_getpgid(u32 pid) -{ - return sys_getpgid((int)pid); -} - - - -/* Note: it is necessary to treat pid as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_getsid(u32 pid) -{ - return sys_getsid((int)pid); -} - - -/* Note: it is necessary to treat pid and sig as unsigned ints, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_kill(u32 pid, u32 sig) -{ - return sys_kill((int)pid, (int)sig); -} - - -/* Note: it is necessary to treat mode as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_mkdir(const char __user * pathname, u32 mode) -{ - return sys_mkdir(pathname, (int)mode); -} - -long compat_sys_nice(u32 increment) -{ - /* sign extend increment */ - return sys_nice((int)increment); -} - -off_t ppc32_lseek(unsigned int fd, u32 offset, unsigned int origin) -{ - /* sign extend n */ - return sys_lseek(fd, (int)offset, origin); -} - -/* Note: it is necessary to treat bufsiz as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. 
- */ -asmlinkage long compat_sys_readlink(const char __user * path, char __user * buf, u32 bufsiz) -{ - return sys_readlink(path, buf, (int)bufsiz); -} - -/* Note: it is necessary to treat option as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sched_get_priority_max(u32 policy) -{ - return sys_sched_get_priority_max((int)policy); -} - - -/* Note: it is necessary to treat policy as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sched_get_priority_min(u32 policy) -{ - return sys_sched_get_priority_min((int)policy); -} - - -/* Note: it is necessary to treat pid as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sched_getparam(u32 pid, struct sched_param __user *param) -{ - return sys_sched_getparam((int)pid, param); -} - - -/* Note: it is necessary to treat pid as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. 
- */ -asmlinkage long compat_sys_sched_getscheduler(u32 pid) -{ - return sys_sched_getscheduler((int)pid); -} - - -/* Note: it is necessary to treat pid as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sched_setparam(u32 pid, struct sched_param __user *param) -{ - return sys_sched_setparam((int)pid, param); -} - - -/* Note: it is necessary to treat pid and policy as unsigned ints, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sched_setscheduler(u32 pid, u32 policy, struct sched_param __user *param) -{ - return sys_sched_setscheduler((int)pid, (int)policy, param); -} - - -/* Note: it is necessary to treat len as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_setdomainname(char __user *name, u32 len) -{ - return sys_setdomainname(name, (int)len); -} - - -/* Note: it is necessary to treat gidsetsize as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. 
- */ -asmlinkage long compat_sys_setgroups(u32 gidsetsize, gid_t __user *grouplist) -{ - return sys_setgroups((int)gidsetsize, grouplist); -} - - -asmlinkage long compat_sys_sethostname(char __user *name, u32 len) -{ - /* sign extend len */ - return sys_sethostname(name, (int)len); -} - - -/* Note: it is necessary to treat pid and pgid as unsigned ints, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_setpgid(u32 pid, u32 pgid) -{ - return sys_setpgid((int)pid, (int)pgid); -} - -long compat_sys_getpriority(u32 which, u32 who) -{ - /* sign extend which and who */ - return sys_getpriority((int)which, (int)who); -} - -long compat_sys_setpriority(u32 which, u32 who, u32 niceval) -{ - /* sign extend which, who and niceval */ - return sys_setpriority((int)which, (int)who, (int)niceval); -} - -long compat_sys_ioprio_get(u32 which, u32 who) -{ - /* sign extend which and who */ - return sys_ioprio_get((int)which, (int)who); -} - -long compat_sys_ioprio_set(u32 which, u32 who, u32 ioprio) -{ - /* sign extend which, who and ioprio */ - return sys_ioprio_set((int)which, (int)who, (int)ioprio); -} - -/* Note: it is necessary to treat newmask as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. 
- */ -asmlinkage long compat_sys_ssetmask(u32 newmask) -{ - return sys_ssetmask((int) newmask); -} - -asmlinkage long compat_sys_syslog(u32 type, char __user * buf, u32 len) -{ - /* sign extend len */ - return sys_syslog(type, buf, (int)len); -} - - -/* Note: it is necessary to treat mask as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_umask(u32 mask) -{ - return sys_umask((int)mask); -} - -#ifdef CONFIG_SYSCTL -struct __sysctl_args32 { - u32 name; - int nlen; - u32 oldval; - u32 oldlenp; - u32 newval; - u32 newlen; - u32 __unused[4]; -}; - -asmlinkage long compat_sys_sysctl(struct __sysctl_args32 __user *args) -{ - struct __sysctl_args32 tmp; - int error; - size_t oldlen; - size_t __user *oldlenp = NULL; - unsigned long addr = (((unsigned long)&args->__unused[0]) + 7) & ~7; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && tmp.oldlenp) { - /* Duh, this is ugly and might not work if sysctl_args - is in read-only memory, but do_sysctl does indirectly - a lot of uaccess in both directions and we'd have to - basically copy the whole sysctl.c here, and - glibc's __sysctl uses rw memory for the structure - anyway. 
*/ - oldlenp = (size_t __user *)addr; - if (get_user(oldlen, (compat_size_t __user *)compat_ptr(tmp.oldlenp)) || - put_user(oldlen, oldlenp)) - return -EFAULT; - } - - lock_kernel(); - error = do_sysctl(compat_ptr(tmp.name), tmp.nlen, - compat_ptr(tmp.oldval), oldlenp, - compat_ptr(tmp.newval), tmp.newlen); - unlock_kernel(); - if (oldlenp) { - if (!error) { - if (get_user(oldlen, oldlenp) || - put_user(oldlen, (compat_size_t __user *)compat_ptr(tmp.oldlenp))) - error = -EFAULT; - } - copy_to_user(args->__unused, tmp.__unused, sizeof(tmp.__unused)); - } - return error; -} -#endif - unsigned long compat_sys_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) @@ -903,12 +69,6 @@ unsigned long compat_sys_mmap2(unsigned long addr, size_t len, return sys_mmap(addr, len, prot, flags, fd, pgoff << 12); } -long compat_sys_tgkill(u32 tgid, u32 pid, int sig) -{ - /* sign extend tgid, pid */ - return sys_tgkill((int)tgid, (int)pid, sig); -} - /* * long long munging: * The 32 bit ABI passes long longs in an odd even register pair. 
@@ -920,7 +80,7 @@ compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_siz return sys_pread64(fd, ubuf, count, ((loff_t)poshi << 32) | poslo); } -compat_ssize_t compat_sys_pwrite64(unsigned int fd, char __user *ubuf, compat_size_t count, +compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count, u32 reg6, u32 poshi, u32 poslo) { return sys_pwrite64(fd, ubuf, count, ((loff_t)poshi << 32) | poslo); @@ -937,17 +97,17 @@ asmlinkage int compat_sys_truncate64(const char __user * path, u32 reg4, return sys_truncate(path, (high << 32) | low); } -asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long high, - unsigned long low) +asmlinkage long compat_sys_fallocate(int fd, int mode, u32 offhi, u32 offlo, + u32 lenhi, u32 lenlo) { - return sys_ftruncate(fd, (high << 32) | low); + return sys_fallocate(fd, mode, ((loff_t)offhi << 32) | offlo, + ((loff_t)lenhi << 32) | lenlo); } -long ppc32_lookup_dcookie(u32 cookie_high, u32 cookie_low, char __user *buf, - size_t len) +asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long high, + unsigned long low) { - return sys_lookup_dcookie((u64)cookie_high << 32 | cookie_low, - buf, len); + return sys_ftruncate(fd, (high << 32) | low); } long ppc32_fadvise64(int fd, u32 unused, u32 offset_high, u32 offset_low, @@ -957,52 +117,12 @@ long ppc32_fadvise64(int fd, u32 unused, u32 offset_high, u32 offset_low, advice); } -long ppc32_timer_create(clockid_t clock, - struct compat_sigevent __user *ev32, - timer_t __user *timer_id) -{ - sigevent_t event; - timer_t t; - long err; - mm_segment_t savefs; - - if (ev32 == NULL) - return sys_timer_create(clock, NULL, timer_id); - - if (get_compat_sigevent(&event, ev32)) - return -EFAULT; - - if (!access_ok(VERIFY_WRITE, timer_id, sizeof(timer_t))) - return -EFAULT; - - savefs = get_fs(); - set_fs(KERNEL_DS); - /* The __user pointer casts are valid due to the set_fs() */ - err = 
sys_timer_create(clock, - (sigevent_t __user *) &event, - (timer_t __user *) &t); - set_fs(savefs); - - if (err == 0) - err = __put_user(t, timer_id); - - return err; -} - -asmlinkage long compat_sys_add_key(const char __user *_type, - const char __user *_description, - const void __user *_payload, - u32 plen, - u32 ringid) +asmlinkage long compat_sys_sync_file_range2(int fd, unsigned int flags, + unsigned offset_hi, unsigned offset_lo, + unsigned nbytes_hi, unsigned nbytes_lo) { - return sys_add_key(_type, _description, _payload, plen, ringid); -} + loff_t offset = ((loff_t)offset_hi << 32) | offset_lo; + loff_t nbytes = ((loff_t)nbytes_hi << 32) | nbytes_lo; -asmlinkage long compat_sys_request_key(const char __user *_type, - const char __user *_description, - const char __user *_callout_info, - u32 destringid) -{ - return sys_request_key(_type, _description, _callout_info, destringid); + return sys_sync_file_range(fd, offset, nbytes, flags); } - diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index f72ced11212..cd9be9aa016 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -23,8 +23,8 @@ #include <linux/sched.h> #include <linux/syscalls.h> #include <linux/mm.h> +#include <linux/fs.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/sem.h> #include <linux/msg.h> #include <linux/shm.h> @@ -34,155 +34,29 @@ #include <linux/ipc.h> #include <linux/utsname.h> #include <linux/file.h> -#include <linux/init.h> #include <linux/personality.h> #include <asm/uaccess.h> -#include <asm/ipc.h> -#include <asm/semaphore.h> +#include <asm/syscalls.h> #include <asm/time.h> #include <asm/unistd.h> -extern unsigned long wall_jiffies; - - -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. 
- */ -int sys_ipc(uint call, int first, unsigned long second, long third, - void __user *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - ret = -ENOSYS; - switch (call) { - case SEMOP: - ret = sys_semtimedop(first, (struct sembuf __user *)ptr, - (unsigned)second, NULL); - break; - case SEMTIMEDOP: - ret = sys_semtimedop(first, (struct sembuf __user *)ptr, - (unsigned)second, - (const struct timespec __user *) fifth); - break; - case SEMGET: - ret = sys_semget (first, (int)second, third); - break; - case SEMCTL: { - union semun fourth; - - ret = -EINVAL; - if (!ptr) - break; - if ((ret = get_user(fourth.__pad, (void __user * __user *)ptr))) - break; - ret = sys_semctl(first, (int)second, third, fourth); - break; - } - case MSGSND: - ret = sys_msgsnd(first, (struct msgbuf __user *)ptr, - (size_t)second, third); - break; - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - - ret = -EINVAL; - if (!ptr) - break; - if ((ret = copy_from_user(&tmp, - (struct ipc_kludge __user *) ptr, - sizeof (tmp)) ? 
-EFAULT : 0)) - break; - ret = sys_msgrcv(first, tmp.msgp, (size_t) second, - tmp.msgtyp, third); - break; - } - default: - ret = sys_msgrcv (first, (struct msgbuf __user *) ptr, - (size_t)second, fifth, third); - break; - } - break; - case MSGGET: - ret = sys_msgget((key_t)first, (int)second); - break; - case MSGCTL: - ret = sys_msgctl(first, (int)second, - (struct msqid_ds __user *)ptr); - break; - case SHMAT: { - ulong raddr; - ret = do_shmat(first, (char __user *)ptr, (int)second, &raddr); - if (ret) - break; - ret = put_user(raddr, (ulong __user *) third); - break; - } - case SHMDT: - ret = sys_shmdt((char __user *)ptr); - break; - case SHMGET: - ret = sys_shmget(first, (size_t)second, third); - break; - case SHMCTL: - ret = sys_shmctl(first, (int)second, - (struct shmid_ds __user *)ptr); - break; - } - - return ret; -} - -/* - * sys_pipe() is the normal C calling standard for creating - * a pipe. It's not the way unix traditionally does this, though. - */ -int sys_pipe(int __user *fildes) -{ - int fd[2]; - int error; - - error = do_pipe(fd); - if (!error) { - if (copy_to_user(fildes, fd, 2*sizeof(int))) - error = -EFAULT; - } - return error; -} - static inline unsigned long do_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off, int shift) { - struct file * file = NULL; unsigned long ret = -EINVAL; + if (!arch_validate_prot(prot)) + goto out; + if (shift) { if (off & ((1 << shift) - 1)) goto out; off >>= shift; } - - ret = -EBADF; - if (!(flags & MAP_ANONYMOUS)) { - if (!(file = fget(fd))) - goto out; - } - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(¤t->mm->mmap_sem); - ret = do_mmap_pgoff(file, addr, len, prot, flags, off); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); + ret = sys_mmap_pgoff(addr, len, prot, flags, fd, off); out: return ret; } @@ -232,110 +106,15 @@ long ppc64_personality(unsigned long personality) long ret; if (personality(current->personality) == 
PER_LINUX32 - && personality == PER_LINUX) - personality = PER_LINUX32; + && personality(personality) == PER_LINUX) + personality = (personality & ~PER_MASK) | PER_LINUX32; ret = sys_personality(personality); - if (ret == PER_LINUX32) - ret = PER_LINUX; + if (personality(ret) == PER_LINUX32) + ret = (ret & ~PER_MASK) | PER_LINUX; return ret; } #endif -#ifdef CONFIG_PPC64 -#define OVERRIDE_MACHINE (personality(current->personality) == PER_LINUX32) -#else -#define OVERRIDE_MACHINE 0 -#endif - -static inline int override_machine(char *mach) -{ - if (OVERRIDE_MACHINE) { - /* change ppc64 to ppc */ - if (__put_user(0, mach+3) || __put_user(0, mach+4)) - return -EFAULT; - } - return 0; -} - -long ppc_newuname(struct new_utsname __user * name) -{ - int err = 0; - - down_read(&uts_sem); - if (copy_to_user(name, &system_utsname, sizeof(*name))) - err = -EFAULT; - up_read(&uts_sem); - if (!err) - err = override_machine(name->machine); - return err; -} - -int sys_uname(struct old_utsname __user *name) -{ - int err = 0; - - down_read(&uts_sem); - if (copy_to_user(name, &system_utsname, sizeof(*name))) - err = -EFAULT; - up_read(&uts_sem); - if (!err) - err = override_machine(name->machine); - return err; -} - -int sys_olduname(struct oldold_utsname __user *name) -{ - int error; - - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; - - down_read(&uts_sem); - error = __copy_to_user(&name->sysname, &system_utsname.sysname, - __OLD_UTS_LEN); - error |= __put_user(0, name->sysname + __OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename, &system_utsname.nodename, - __OLD_UTS_LEN); - error |= __put_user(0, name->nodename + __OLD_UTS_LEN); - error |= __copy_to_user(&name->release, &system_utsname.release, - __OLD_UTS_LEN); - error |= __put_user(0, name->release + __OLD_UTS_LEN); - error |= __copy_to_user(&name->version, &system_utsname.version, - __OLD_UTS_LEN); - error |= __put_user(0, name->version + __OLD_UTS_LEN); - error |= 
__copy_to_user(&name->machine, &system_utsname.machine, - __OLD_UTS_LEN); - error |= override_machine(name->machine); - up_read(&uts_sem); - - return error? -EFAULT: 0; -} - -#ifdef CONFIG_PPC64 -time_t sys64_time(time_t __user * tloc) -{ - time_t secs; - time_t usecs; - - long tb_delta = tb_ticks_since(tb_last_stamp); - tb_delta += (jiffies - wall_jiffies) * tb_ticks_per_jiffy; - - secs = xtime.tv_sec; - usecs = (xtime.tv_nsec/1000) + tb_delta / tb_ticks_per_usec; - while (usecs >= USEC_PER_SEC) { - ++secs; - usecs -= USEC_PER_SEC; - } - - if (tloc) { - if (put_user(secs,tloc)) - secs = -EFAULT; - } - - return secs; -} -#endif - long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, u32 len_high, u32 len_low) { diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c new file mode 100644 index 00000000000..67fd2fd2620 --- /dev/null +++ b/arch/powerpc/kernel/sysfs.c @@ -0,0 +1,1027 @@ +#include <linux/device.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/export.h> +#include <linux/nodemask.h> +#include <linux/cpumask.h> +#include <linux/notifier.h> + +#include <asm/current.h> +#include <asm/processor.h> +#include <asm/cputable.h> +#include <asm/hvcall.h> +#include <asm/prom.h> +#include <asm/machdep.h> +#include <asm/smp.h> +#include <asm/pmc.h> +#include <asm/firmware.h> + +#include "cacheinfo.h" + +#ifdef CONFIG_PPC64 +#include <asm/paca.h> +#include <asm/lppaca.h> +#endif + +static DEFINE_PER_CPU(struct cpu, cpu_devices); + +/* + * SMT snooze delay stuff, 64-bit only for now + */ + +#ifdef CONFIG_PPC64 + +/* Time in microseconds we delay before sleeping in the idle loop */ +DEFINE_PER_CPU(long, smt_snooze_delay) = { 100 }; + +static ssize_t store_smt_snooze_delay(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + ssize_t ret; + long 
snooze; + + ret = sscanf(buf, "%ld", &snooze); + if (ret != 1) + return -EINVAL; + + per_cpu(smt_snooze_delay, cpu->dev.id) = snooze; + return count; +} + +static ssize_t show_smt_snooze_delay(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + + return sprintf(buf, "%ld\n", per_cpu(smt_snooze_delay, cpu->dev.id)); +} + +static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay, + store_smt_snooze_delay); + +static int __init setup_smt_snooze_delay(char *str) +{ + unsigned int cpu; + long snooze; + + if (!cpu_has_feature(CPU_FTR_SMT)) + return 1; + + snooze = simple_strtol(str, NULL, 10); + for_each_possible_cpu(cpu) + per_cpu(smt_snooze_delay, cpu) = snooze; + + return 1; +} +__setup("smt-snooze-delay=", setup_smt_snooze_delay); + +#endif /* CONFIG_PPC64 */ + +#ifdef CONFIG_PPC_FSL_BOOK3E +#define MAX_BIT 63 + +static u64 pw20_wt; +static u64 altivec_idle_wt; + +static unsigned int get_idle_ticks_bit(u64 ns) +{ + u64 cycle; + + if (ns >= 10000) + cycle = div_u64(ns + 500, 1000) * tb_ticks_per_usec; + else + cycle = div_u64(ns * tb_ticks_per_usec, 1000); + + if (!cycle) + return 0; + + return ilog2(cycle); +} + +static void do_show_pwrmgtcr0(void *val) +{ + u32 *value = val; + + *value = mfspr(SPRN_PWRMGTCR0); +} + +static ssize_t show_pw20_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u32 value; + unsigned int cpu = dev->id; + + smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1); + + value &= PWRMGTCR0_PW20_WAIT; + + return sprintf(buf, "%u\n", value ? 
1 : 0); +} + +static void do_store_pw20_state(void *val) +{ + u32 *value = val; + u32 pw20_state; + + pw20_state = mfspr(SPRN_PWRMGTCR0); + + if (*value) + pw20_state |= PWRMGTCR0_PW20_WAIT; + else + pw20_state &= ~PWRMGTCR0_PW20_WAIT; + + mtspr(SPRN_PWRMGTCR0, pw20_state); +} + +static ssize_t store_pw20_state(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u32 value; + unsigned int cpu = dev->id; + + if (kstrtou32(buf, 0, &value)) + return -EINVAL; + + if (value > 1) + return -EINVAL; + + smp_call_function_single(cpu, do_store_pw20_state, &value, 1); + + return count; +} + +static ssize_t show_pw20_wait_time(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u32 value; + u64 tb_cycle = 1; + u64 time; + + unsigned int cpu = dev->id; + + if (!pw20_wt) { + smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1); + value = (value & PWRMGTCR0_PW20_ENT) >> + PWRMGTCR0_PW20_ENT_SHIFT; + + tb_cycle = (tb_cycle << (MAX_BIT - value + 1)); + /* convert ms to ns */ + if (tb_ticks_per_usec > 1000) { + time = div_u64(tb_cycle, tb_ticks_per_usec / 1000); + } else { + u32 rem_us; + + time = div_u64_rem(tb_cycle, tb_ticks_per_usec, + &rem_us); + time = time * 1000 + rem_us * 1000 / tb_ticks_per_usec; + } + } else { + time = pw20_wt; + } + + return sprintf(buf, "%llu\n", time > 0 ? 
time : 0); +} + +static void set_pw20_wait_entry_bit(void *val) +{ + u32 *value = val; + u32 pw20_idle; + + pw20_idle = mfspr(SPRN_PWRMGTCR0); + + /* Set Automatic PW20 Core Idle Count */ + /* clear count */ + pw20_idle &= ~PWRMGTCR0_PW20_ENT; + + /* set count */ + pw20_idle |= ((MAX_BIT - *value) << PWRMGTCR0_PW20_ENT_SHIFT); + + mtspr(SPRN_PWRMGTCR0, pw20_idle); +} + +static ssize_t store_pw20_wait_time(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u32 entry_bit; + u64 value; + + unsigned int cpu = dev->id; + + if (kstrtou64(buf, 0, &value)) + return -EINVAL; + + if (!value) + return -EINVAL; + + entry_bit = get_idle_ticks_bit(value); + if (entry_bit > MAX_BIT) + return -EINVAL; + + pw20_wt = value; + + smp_call_function_single(cpu, set_pw20_wait_entry_bit, + &entry_bit, 1); + + return count; +} + +static ssize_t show_altivec_idle(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u32 value; + unsigned int cpu = dev->id; + + smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1); + + value &= PWRMGTCR0_AV_IDLE_PD_EN; + + return sprintf(buf, "%u\n", value ? 
1 : 0); +} + +static void do_store_altivec_idle(void *val) +{ + u32 *value = val; + u32 altivec_idle; + + altivec_idle = mfspr(SPRN_PWRMGTCR0); + + if (*value) + altivec_idle |= PWRMGTCR0_AV_IDLE_PD_EN; + else + altivec_idle &= ~PWRMGTCR0_AV_IDLE_PD_EN; + + mtspr(SPRN_PWRMGTCR0, altivec_idle); +} + +static ssize_t store_altivec_idle(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u32 value; + unsigned int cpu = dev->id; + + if (kstrtou32(buf, 0, &value)) + return -EINVAL; + + if (value > 1) + return -EINVAL; + + smp_call_function_single(cpu, do_store_altivec_idle, &value, 1); + + return count; +} + +static ssize_t show_altivec_idle_wait_time(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u32 value; + u64 tb_cycle = 1; + u64 time; + + unsigned int cpu = dev->id; + + if (!altivec_idle_wt) { + smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1); + value = (value & PWRMGTCR0_AV_IDLE_CNT) >> + PWRMGTCR0_AV_IDLE_CNT_SHIFT; + + tb_cycle = (tb_cycle << (MAX_BIT - value + 1)); + /* convert ms to ns */ + if (tb_ticks_per_usec > 1000) { + time = div_u64(tb_cycle, tb_ticks_per_usec / 1000); + } else { + u32 rem_us; + + time = div_u64_rem(tb_cycle, tb_ticks_per_usec, + &rem_us); + time = time * 1000 + rem_us * 1000 / tb_ticks_per_usec; + } + } else { + time = altivec_idle_wt; + } + + return sprintf(buf, "%llu\n", time > 0 ? 
time : 0); +} + +static void set_altivec_idle_wait_entry_bit(void *val) +{ + u32 *value = val; + u32 altivec_idle; + + altivec_idle = mfspr(SPRN_PWRMGTCR0); + + /* Set Automatic AltiVec Idle Count */ + /* clear count */ + altivec_idle &= ~PWRMGTCR0_AV_IDLE_CNT; + + /* set count */ + altivec_idle |= ((MAX_BIT - *value) << PWRMGTCR0_AV_IDLE_CNT_SHIFT); + + mtspr(SPRN_PWRMGTCR0, altivec_idle); +} + +static ssize_t store_altivec_idle_wait_time(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u32 entry_bit; + u64 value; + + unsigned int cpu = dev->id; + + if (kstrtou64(buf, 0, &value)) + return -EINVAL; + + if (!value) + return -EINVAL; + + entry_bit = get_idle_ticks_bit(value); + if (entry_bit > MAX_BIT) + return -EINVAL; + + altivec_idle_wt = value; + + smp_call_function_single(cpu, set_altivec_idle_wait_entry_bit, + &entry_bit, 1); + + return count; +} + +/* + * Enable/Disable interface: + * 0, disable. 1, enable. + */ +static DEVICE_ATTR(pw20_state, 0600, show_pw20_state, store_pw20_state); +static DEVICE_ATTR(altivec_idle, 0600, show_altivec_idle, store_altivec_idle); + +/* + * Set wait time interface:(Nanosecond) + * Example: Base on TBfreq is 41MHZ. + * 1~48(ns): TB[63] + * 49~97(ns): TB[62] + * 98~195(ns): TB[61] + * 196~390(ns): TB[60] + * 391~780(ns): TB[59] + * 781~1560(ns): TB[58] + * ... + */ +static DEVICE_ATTR(pw20_wait_time, 0600, + show_pw20_wait_time, + store_pw20_wait_time); +static DEVICE_ATTR(altivec_idle_wait_time, 0600, + show_altivec_idle_wait_time, + store_altivec_idle_wait_time); +#endif + +/* + * Enabling PMCs will slow partition context switch times so we only do + * it the first time we write to the PMCs. 
+ */ + +static DEFINE_PER_CPU(char, pmcs_enabled); + +void ppc_enable_pmcs(void) +{ + ppc_set_pmu_inuse(1); + + /* Only need to enable them once */ + if (__get_cpu_var(pmcs_enabled)) + return; + + __get_cpu_var(pmcs_enabled) = 1; + + if (ppc_md.enable_pmcs) + ppc_md.enable_pmcs(); +} +EXPORT_SYMBOL(ppc_enable_pmcs); + +#define __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, EXTRA) \ +static void read_##NAME(void *val) \ +{ \ + *(unsigned long *)val = mfspr(ADDRESS); \ +} \ +static void write_##NAME(void *val) \ +{ \ + EXTRA; \ + mtspr(ADDRESS, *(unsigned long *)val); \ +} + +#define __SYSFS_SPRSETUP_SHOW_STORE(NAME) \ +static ssize_t show_##NAME(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct cpu *cpu = container_of(dev, struct cpu, dev); \ + unsigned long val; \ + smp_call_function_single(cpu->dev.id, read_##NAME, &val, 1); \ + return sprintf(buf, "%lx\n", val); \ +} \ +static ssize_t __used \ + store_##NAME(struct device *dev, struct device_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + struct cpu *cpu = container_of(dev, struct cpu, dev); \ + unsigned long val; \ + int ret = sscanf(buf, "%lx", &val); \ + if (ret != 1) \ + return -EINVAL; \ + smp_call_function_single(cpu->dev.id, write_##NAME, &val, 1); \ + return count; \ +} + +#define SYSFS_PMCSETUP(NAME, ADDRESS) \ + __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, ppc_enable_pmcs()) \ + __SYSFS_SPRSETUP_SHOW_STORE(NAME) +#define SYSFS_SPRSETUP(NAME, ADDRESS) \ + __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, ) \ + __SYSFS_SPRSETUP_SHOW_STORE(NAME) + +#define SYSFS_SPRSETUP_SHOW_STORE(NAME) \ + __SYSFS_SPRSETUP_SHOW_STORE(NAME) + +/* Let's define all possible registers, we'll only hook up the ones + * that are implemented on the current processor + */ + +#if defined(CONFIG_PPC64) +#define HAS_PPC_PMC_CLASSIC 1 +#define HAS_PPC_PMC_IBM 1 +#define HAS_PPC_PMC_PA6T 1 +#elif defined(CONFIG_6xx) +#define HAS_PPC_PMC_CLASSIC 1 +#define HAS_PPC_PMC_IBM 1 +#define HAS_PPC_PMC_G4 1 
+#endif + + +#ifdef HAS_PPC_PMC_CLASSIC +SYSFS_PMCSETUP(mmcr0, SPRN_MMCR0); +SYSFS_PMCSETUP(mmcr1, SPRN_MMCR1); +SYSFS_PMCSETUP(pmc1, SPRN_PMC1); +SYSFS_PMCSETUP(pmc2, SPRN_PMC2); +SYSFS_PMCSETUP(pmc3, SPRN_PMC3); +SYSFS_PMCSETUP(pmc4, SPRN_PMC4); +SYSFS_PMCSETUP(pmc5, SPRN_PMC5); +SYSFS_PMCSETUP(pmc6, SPRN_PMC6); + +#ifdef HAS_PPC_PMC_G4 +SYSFS_PMCSETUP(mmcr2, SPRN_MMCR2); +#endif + +#ifdef CONFIG_PPC64 +SYSFS_PMCSETUP(pmc7, SPRN_PMC7); +SYSFS_PMCSETUP(pmc8, SPRN_PMC8); + +SYSFS_PMCSETUP(mmcra, SPRN_MMCRA); +SYSFS_SPRSETUP(purr, SPRN_PURR); +SYSFS_SPRSETUP(spurr, SPRN_SPURR); +SYSFS_SPRSETUP(pir, SPRN_PIR); + +/* + Lets only enable read for phyp resources and + enable write when needed with a separate function. + Lets be conservative and default to pseries. +*/ +static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra); +static DEVICE_ATTR(spurr, 0400, show_spurr, NULL); +static DEVICE_ATTR(purr, 0400, show_purr, store_purr); +static DEVICE_ATTR(pir, 0400, show_pir, NULL); + +static unsigned long dscr_default; + +static void read_dscr(void *val) +{ + *(unsigned long *)val = get_paca()->dscr_default; +} + +static void write_dscr(void *val) +{ + get_paca()->dscr_default = *(unsigned long *)val; + if (!current->thread.dscr_inherit) { + current->thread.dscr = *(unsigned long *)val; + mtspr(SPRN_DSCR, *(unsigned long *)val); + } +} + +SYSFS_SPRSETUP_SHOW_STORE(dscr); +static DEVICE_ATTR(dscr, 0600, show_dscr, store_dscr); + +static void add_write_permission_dev_attr(struct device_attribute *attr) +{ + attr->attr.mode |= 0200; +} + +static ssize_t show_dscr_default(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%lx\n", dscr_default); +} + +static ssize_t __used store_dscr_default(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + unsigned long val; + int ret = 0; + + ret = sscanf(buf, "%lx", &val); + if (ret != 1) + return -EINVAL; + dscr_default = val; + + on_each_cpu(write_dscr, &val, 
1); + + return count; +} + +static DEVICE_ATTR(dscr_default, 0600, + show_dscr_default, store_dscr_default); + +static void sysfs_create_dscr_default(void) +{ + int err = 0; + if (cpu_has_feature(CPU_FTR_DSCR)) + err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default); +} +#endif /* CONFIG_PPC64 */ + +#ifdef HAS_PPC_PMC_PA6T +SYSFS_PMCSETUP(pa6t_pmc0, SPRN_PA6T_PMC0); +SYSFS_PMCSETUP(pa6t_pmc1, SPRN_PA6T_PMC1); +SYSFS_PMCSETUP(pa6t_pmc2, SPRN_PA6T_PMC2); +SYSFS_PMCSETUP(pa6t_pmc3, SPRN_PA6T_PMC3); +SYSFS_PMCSETUP(pa6t_pmc4, SPRN_PA6T_PMC4); +SYSFS_PMCSETUP(pa6t_pmc5, SPRN_PA6T_PMC5); +#ifdef CONFIG_DEBUG_KERNEL +SYSFS_SPRSETUP(hid0, SPRN_HID0); +SYSFS_SPRSETUP(hid1, SPRN_HID1); +SYSFS_SPRSETUP(hid4, SPRN_HID4); +SYSFS_SPRSETUP(hid5, SPRN_HID5); +SYSFS_SPRSETUP(ima0, SPRN_PA6T_IMA0); +SYSFS_SPRSETUP(ima1, SPRN_PA6T_IMA1); +SYSFS_SPRSETUP(ima2, SPRN_PA6T_IMA2); +SYSFS_SPRSETUP(ima3, SPRN_PA6T_IMA3); +SYSFS_SPRSETUP(ima4, SPRN_PA6T_IMA4); +SYSFS_SPRSETUP(ima5, SPRN_PA6T_IMA5); +SYSFS_SPRSETUP(ima6, SPRN_PA6T_IMA6); +SYSFS_SPRSETUP(ima7, SPRN_PA6T_IMA7); +SYSFS_SPRSETUP(ima8, SPRN_PA6T_IMA8); +SYSFS_SPRSETUP(ima9, SPRN_PA6T_IMA9); +SYSFS_SPRSETUP(imaat, SPRN_PA6T_IMAAT); +SYSFS_SPRSETUP(btcr, SPRN_PA6T_BTCR); +SYSFS_SPRSETUP(pccr, SPRN_PA6T_PCCR); +SYSFS_SPRSETUP(rpccr, SPRN_PA6T_RPCCR); +SYSFS_SPRSETUP(der, SPRN_PA6T_DER); +SYSFS_SPRSETUP(mer, SPRN_PA6T_MER); +SYSFS_SPRSETUP(ber, SPRN_PA6T_BER); +SYSFS_SPRSETUP(ier, SPRN_PA6T_IER); +SYSFS_SPRSETUP(sier, SPRN_PA6T_SIER); +SYSFS_SPRSETUP(siar, SPRN_PA6T_SIAR); +SYSFS_SPRSETUP(tsr0, SPRN_PA6T_TSR0); +SYSFS_SPRSETUP(tsr1, SPRN_PA6T_TSR1); +SYSFS_SPRSETUP(tsr2, SPRN_PA6T_TSR2); +SYSFS_SPRSETUP(tsr3, SPRN_PA6T_TSR3); +#endif /* CONFIG_DEBUG_KERNEL */ +#endif /* HAS_PPC_PMC_PA6T */ + +#ifdef HAS_PPC_PMC_IBM +static struct device_attribute ibm_common_attrs[] = { + __ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0), + __ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1), +}; +#endif /* HAS_PPC_PMC_G4 */ + +#ifdef 
HAS_PPC_PMC_G4 +static struct device_attribute g4_common_attrs[] = { + __ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0), + __ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1), + __ATTR(mmcr2, 0600, show_mmcr2, store_mmcr2), +}; +#endif /* HAS_PPC_PMC_G4 */ + +static struct device_attribute classic_pmc_attrs[] = { + __ATTR(pmc1, 0600, show_pmc1, store_pmc1), + __ATTR(pmc2, 0600, show_pmc2, store_pmc2), + __ATTR(pmc3, 0600, show_pmc3, store_pmc3), + __ATTR(pmc4, 0600, show_pmc4, store_pmc4), + __ATTR(pmc5, 0600, show_pmc5, store_pmc5), + __ATTR(pmc6, 0600, show_pmc6, store_pmc6), +#ifdef CONFIG_PPC64 + __ATTR(pmc7, 0600, show_pmc7, store_pmc7), + __ATTR(pmc8, 0600, show_pmc8, store_pmc8), +#endif +}; + +#ifdef HAS_PPC_PMC_PA6T +static struct device_attribute pa6t_attrs[] = { + __ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0), + __ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1), + __ATTR(pmc0, 0600, show_pa6t_pmc0, store_pa6t_pmc0), + __ATTR(pmc1, 0600, show_pa6t_pmc1, store_pa6t_pmc1), + __ATTR(pmc2, 0600, show_pa6t_pmc2, store_pa6t_pmc2), + __ATTR(pmc3, 0600, show_pa6t_pmc3, store_pa6t_pmc3), + __ATTR(pmc4, 0600, show_pa6t_pmc4, store_pa6t_pmc4), + __ATTR(pmc5, 0600, show_pa6t_pmc5, store_pa6t_pmc5), +#ifdef CONFIG_DEBUG_KERNEL + __ATTR(hid0, 0600, show_hid0, store_hid0), + __ATTR(hid1, 0600, show_hid1, store_hid1), + __ATTR(hid4, 0600, show_hid4, store_hid4), + __ATTR(hid5, 0600, show_hid5, store_hid5), + __ATTR(ima0, 0600, show_ima0, store_ima0), + __ATTR(ima1, 0600, show_ima1, store_ima1), + __ATTR(ima2, 0600, show_ima2, store_ima2), + __ATTR(ima3, 0600, show_ima3, store_ima3), + __ATTR(ima4, 0600, show_ima4, store_ima4), + __ATTR(ima5, 0600, show_ima5, store_ima5), + __ATTR(ima6, 0600, show_ima6, store_ima6), + __ATTR(ima7, 0600, show_ima7, store_ima7), + __ATTR(ima8, 0600, show_ima8, store_ima8), + __ATTR(ima9, 0600, show_ima9, store_ima9), + __ATTR(imaat, 0600, show_imaat, store_imaat), + __ATTR(btcr, 0600, show_btcr, store_btcr), + __ATTR(pccr, 0600, show_pccr, store_pccr), + 
__ATTR(rpccr, 0600, show_rpccr, store_rpccr), + __ATTR(der, 0600, show_der, store_der), + __ATTR(mer, 0600, show_mer, store_mer), + __ATTR(ber, 0600, show_ber, store_ber), + __ATTR(ier, 0600, show_ier, store_ier), + __ATTR(sier, 0600, show_sier, store_sier), + __ATTR(siar, 0600, show_siar, store_siar), + __ATTR(tsr0, 0600, show_tsr0, store_tsr0), + __ATTR(tsr1, 0600, show_tsr1, store_tsr1), + __ATTR(tsr2, 0600, show_tsr2, store_tsr2), + __ATTR(tsr3, 0600, show_tsr3, store_tsr3), +#endif /* CONFIG_DEBUG_KERNEL */ +}; +#endif /* HAS_PPC_PMC_PA6T */ +#endif /* HAS_PPC_PMC_CLASSIC */ + +static void register_cpu_online(unsigned int cpu) +{ + struct cpu *c = &per_cpu(cpu_devices, cpu); + struct device *s = &c->dev; + struct device_attribute *attrs, *pmc_attrs; + int i, nattrs; + +#ifdef CONFIG_PPC64 + if (cpu_has_feature(CPU_FTR_SMT)) + device_create_file(s, &dev_attr_smt_snooze_delay); +#endif + + /* PMC stuff */ + switch (cur_cpu_spec->pmc_type) { +#ifdef HAS_PPC_PMC_IBM + case PPC_PMC_IBM: + attrs = ibm_common_attrs; + nattrs = sizeof(ibm_common_attrs) / sizeof(struct device_attribute); + pmc_attrs = classic_pmc_attrs; + break; +#endif /* HAS_PPC_PMC_IBM */ +#ifdef HAS_PPC_PMC_G4 + case PPC_PMC_G4: + attrs = g4_common_attrs; + nattrs = sizeof(g4_common_attrs) / sizeof(struct device_attribute); + pmc_attrs = classic_pmc_attrs; + break; +#endif /* HAS_PPC_PMC_G4 */ +#ifdef HAS_PPC_PMC_PA6T + case PPC_PMC_PA6T: + /* PA Semi starts counting at PMC0 */ + attrs = pa6t_attrs; + nattrs = sizeof(pa6t_attrs) / sizeof(struct device_attribute); + pmc_attrs = NULL; + break; +#endif /* HAS_PPC_PMC_PA6T */ + default: + attrs = NULL; + nattrs = 0; + pmc_attrs = NULL; + } + + for (i = 0; i < nattrs; i++) + device_create_file(s, &attrs[i]); + + if (pmc_attrs) + for (i = 0; i < cur_cpu_spec->num_pmcs; i++) + device_create_file(s, &pmc_attrs[i]); + +#ifdef CONFIG_PPC64 + if (cpu_has_feature(CPU_FTR_MMCRA)) + device_create_file(s, &dev_attr_mmcra); + + if (cpu_has_feature(CPU_FTR_PURR)) { 
+ if (!firmware_has_feature(FW_FEATURE_LPAR)) + add_write_permission_dev_attr(&dev_attr_purr); + device_create_file(s, &dev_attr_purr); + } + + if (cpu_has_feature(CPU_FTR_SPURR)) + device_create_file(s, &dev_attr_spurr); + + if (cpu_has_feature(CPU_FTR_DSCR)) + device_create_file(s, &dev_attr_dscr); + + if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2)) + device_create_file(s, &dev_attr_pir); +#endif /* CONFIG_PPC64 */ + +#ifdef CONFIG_PPC_FSL_BOOK3E + if (PVR_VER(cur_cpu_spec->pvr_value) == PVR_VER_E6500) { + device_create_file(s, &dev_attr_pw20_state); + device_create_file(s, &dev_attr_pw20_wait_time); + + device_create_file(s, &dev_attr_altivec_idle); + device_create_file(s, &dev_attr_altivec_idle_wait_time); + } +#endif + cacheinfo_cpu_online(cpu); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void unregister_cpu_online(unsigned int cpu) +{ + struct cpu *c = &per_cpu(cpu_devices, cpu); + struct device *s = &c->dev; + struct device_attribute *attrs, *pmc_attrs; + int i, nattrs; + + BUG_ON(!c->hotpluggable); + +#ifdef CONFIG_PPC64 + if (cpu_has_feature(CPU_FTR_SMT)) + device_remove_file(s, &dev_attr_smt_snooze_delay); +#endif + + /* PMC stuff */ + switch (cur_cpu_spec->pmc_type) { +#ifdef HAS_PPC_PMC_IBM + case PPC_PMC_IBM: + attrs = ibm_common_attrs; + nattrs = sizeof(ibm_common_attrs) / sizeof(struct device_attribute); + pmc_attrs = classic_pmc_attrs; + break; +#endif /* HAS_PPC_PMC_IBM */ +#ifdef HAS_PPC_PMC_G4 + case PPC_PMC_G4: + attrs = g4_common_attrs; + nattrs = sizeof(g4_common_attrs) / sizeof(struct device_attribute); + pmc_attrs = classic_pmc_attrs; + break; +#endif /* HAS_PPC_PMC_G4 */ +#ifdef HAS_PPC_PMC_PA6T + case PPC_PMC_PA6T: + /* PA Semi starts counting at PMC0 */ + attrs = pa6t_attrs; + nattrs = sizeof(pa6t_attrs) / sizeof(struct device_attribute); + pmc_attrs = NULL; + break; +#endif /* HAS_PPC_PMC_PA6T */ + default: + attrs = NULL; + nattrs = 0; + pmc_attrs = NULL; + } + + for (i = 0; i < nattrs; i++) + device_remove_file(s, &attrs[i]); + + if 
(pmc_attrs) + for (i = 0; i < cur_cpu_spec->num_pmcs; i++) + device_remove_file(s, &pmc_attrs[i]); + +#ifdef CONFIG_PPC64 + if (cpu_has_feature(CPU_FTR_MMCRA)) + device_remove_file(s, &dev_attr_mmcra); + + if (cpu_has_feature(CPU_FTR_PURR)) + device_remove_file(s, &dev_attr_purr); + + if (cpu_has_feature(CPU_FTR_SPURR)) + device_remove_file(s, &dev_attr_spurr); + + if (cpu_has_feature(CPU_FTR_DSCR)) + device_remove_file(s, &dev_attr_dscr); + + if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2)) + device_remove_file(s, &dev_attr_pir); +#endif /* CONFIG_PPC64 */ + +#ifdef CONFIG_PPC_FSL_BOOK3E + if (PVR_VER(cur_cpu_spec->pvr_value) == PVR_VER_E6500) { + device_remove_file(s, &dev_attr_pw20_state); + device_remove_file(s, &dev_attr_pw20_wait_time); + + device_remove_file(s, &dev_attr_altivec_idle); + device_remove_file(s, &dev_attr_altivec_idle_wait_time); + } +#endif + cacheinfo_cpu_offline(cpu); +} + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE +ssize_t arch_cpu_probe(const char *buf, size_t count) +{ + if (ppc_md.cpu_probe) + return ppc_md.cpu_probe(buf, count); + + return -EINVAL; +} + +ssize_t arch_cpu_release(const char *buf, size_t count) +{ + if (ppc_md.cpu_release) + return ppc_md.cpu_release(buf, count); + + return -EINVAL; +} +#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ + +#endif /* CONFIG_HOTPLUG_CPU */ + +static int sysfs_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned int)(long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + register_cpu_online(cpu); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + case CPU_DEAD_FROZEN: + unregister_cpu_online(cpu); + break; +#endif + } + return NOTIFY_OK; +} + +static struct notifier_block sysfs_cpu_nb = { + .notifier_call = sysfs_cpu_notify, +}; + +static DEFINE_MUTEX(cpu_mutex); + +int cpu_add_dev_attr(struct device_attribute *attr) +{ + int cpu; + + mutex_lock(&cpu_mutex); + + for_each_possible_cpu(cpu) { + 
device_create_file(get_cpu_device(cpu), attr); + } + + mutex_unlock(&cpu_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(cpu_add_dev_attr); + +int cpu_add_dev_attr_group(struct attribute_group *attrs) +{ + int cpu; + struct device *dev; + int ret; + + mutex_lock(&cpu_mutex); + + for_each_possible_cpu(cpu) { + dev = get_cpu_device(cpu); + ret = sysfs_create_group(&dev->kobj, attrs); + WARN_ON(ret != 0); + } + + mutex_unlock(&cpu_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(cpu_add_dev_attr_group); + + +void cpu_remove_dev_attr(struct device_attribute *attr) +{ + int cpu; + + mutex_lock(&cpu_mutex); + + for_each_possible_cpu(cpu) { + device_remove_file(get_cpu_device(cpu), attr); + } + + mutex_unlock(&cpu_mutex); +} +EXPORT_SYMBOL_GPL(cpu_remove_dev_attr); + +void cpu_remove_dev_attr_group(struct attribute_group *attrs) +{ + int cpu; + struct device *dev; + + mutex_lock(&cpu_mutex); + + for_each_possible_cpu(cpu) { + dev = get_cpu_device(cpu); + sysfs_remove_group(&dev->kobj, attrs); + } + + mutex_unlock(&cpu_mutex); +} +EXPORT_SYMBOL_GPL(cpu_remove_dev_attr_group); + + +/* NUMA stuff */ + +#ifdef CONFIG_NUMA +static void register_nodes(void) +{ + int i; + + for (i = 0; i < MAX_NUMNODES; i++) + register_one_node(i); +} + +int sysfs_add_device_to_node(struct device *dev, int nid) +{ + struct node *node = node_devices[nid]; + return sysfs_create_link(&node->dev.kobj, &dev->kobj, + kobject_name(&dev->kobj)); +} +EXPORT_SYMBOL_GPL(sysfs_add_device_to_node); + +void sysfs_remove_device_from_node(struct device *dev, int nid) +{ + struct node *node = node_devices[nid]; + sysfs_remove_link(&node->dev.kobj, kobject_name(&dev->kobj)); +} +EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node); + +#else +static void register_nodes(void) +{ + return; +} + +#endif + +/* Only valid if CPU is present. 
*/ +static ssize_t show_physical_id(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + + return sprintf(buf, "%d\n", get_hard_smp_processor_id(cpu->dev.id)); +} +static DEVICE_ATTR(physical_id, 0444, show_physical_id, NULL); + +static int __init topology_init(void) +{ + int cpu; + + register_nodes(); + + cpu_notifier_register_begin(); + + for_each_possible_cpu(cpu) { + struct cpu *c = &per_cpu(cpu_devices, cpu); + + /* + * For now, we just see if the system supports making + * the RTAS calls for CPU hotplug. But, there may be a + * more comprehensive way to do this for an individual + * CPU. For instance, the boot cpu might never be valid + * for hotplugging. + */ + if (ppc_md.cpu_die) + c->hotpluggable = 1; + + if (cpu_online(cpu) || c->hotpluggable) { + register_cpu(c, cpu); + + device_create_file(&c->dev, &dev_attr_physical_id); + } + + if (cpu_online(cpu)) + register_cpu_online(cpu); + } + + __register_cpu_notifier(&sysfs_cpu_nb); + + cpu_notifier_register_done(); + +#ifdef CONFIG_PPC64 + sysfs_create_dscr_default(); +#endif /* CONFIG_PPC64 */ + + return 0; +} +subsys_initcall(topology_init); diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S index 65eaea91b49..895c50ca943 100644 --- a/arch/powerpc/kernel/systbl.S +++ b/arch/powerpc/kernel/systbl.S @@ -14,16 +14,15 @@ * 2 of the License, or (at your option) any later version. 
*/ -#include <linux/config.h> #include <asm/ppc_asm.h> #ifdef CONFIG_PPC64 -#define SYSCALL(func) .llong .sys_##func,.sys_##func -#define COMPAT_SYS(func) .llong .sys_##func,.compat_sys_##func -#define PPC_SYS(func) .llong .ppc_##func,.ppc_##func -#define OLDSYS(func) .llong .sys_ni_syscall,.sys_ni_syscall -#define SYS32ONLY(func) .llong .sys_ni_syscall,.compat_sys_##func -#define SYSX(f, f3264, f32) .llong .f,.f3264 +#define SYSCALL(func) .llong DOTSYM(sys_##func),DOTSYM(sys_##func) +#define COMPAT_SYS(func) .llong DOTSYM(sys_##func),DOTSYM(compat_sys_##func) +#define PPC_SYS(func) .llong DOTSYM(ppc_##func),DOTSYM(ppc_##func) +#define OLDSYS(func) .llong DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall) +#define SYS32ONLY(func) .llong DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func) +#define SYSX(f, f3264, f32) .llong DOTSYM(f),DOTSYM(f3264) #else #define SYSCALL(func) .long sys_##func #define COMPAT_SYS(func) .long sys_##func @@ -32,290 +31,21 @@ #define SYS32ONLY(func) .long sys_##func #define SYSX(f, f3264, f32) .long f32 #endif +#define SYSCALL_SPU(func) SYSCALL(func) +#define COMPAT_SYS_SPU(func) COMPAT_SYS(func) +#define PPC_SYS_SPU(func) PPC_SYS(func) +#define SYSX_SPU(f, f3264, f32) SYSX(f, f3264, f32) + +.section .rodata,"a" #ifdef CONFIG_PPC64 #define sys_sigpending sys_ni_syscall #define sys_old_getrlimit sys_ni_syscall -#else -#define ppc_rtas sys_ni_syscall + + .p2align 3 #endif -_GLOBAL(sys_call_table) -SYSCALL(restart_syscall) -SYSCALL(exit) -PPC_SYS(fork) -SYSCALL(read) -SYSCALL(write) -COMPAT_SYS(open) -SYSCALL(close) -COMPAT_SYS(waitpid) -COMPAT_SYS(creat) -SYSCALL(link) -SYSCALL(unlink) -COMPAT_SYS(execve) -SYSCALL(chdir) -SYSX(sys64_time,compat_sys_time,sys_time) -SYSCALL(mknod) -SYSCALL(chmod) -SYSCALL(lchown) -SYSCALL(ni_syscall) -OLDSYS(stat) -SYSX(sys_lseek,ppc32_lseek,sys_lseek) -SYSCALL(getpid) -COMPAT_SYS(mount) -SYSX(sys_ni_syscall,sys_oldumount,sys_oldumount) -SYSCALL(setuid) -SYSCALL(getuid) -COMPAT_SYS(stime) -COMPAT_SYS(ptrace) 
-SYSCALL(alarm) -OLDSYS(fstat) -COMPAT_SYS(pause) -COMPAT_SYS(utime) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -COMPAT_SYS(access) -COMPAT_SYS(nice) -SYSCALL(ni_syscall) -SYSCALL(sync) -COMPAT_SYS(kill) -SYSCALL(rename) -COMPAT_SYS(mkdir) -SYSCALL(rmdir) -SYSCALL(dup) -SYSCALL(pipe) -COMPAT_SYS(times) -SYSCALL(ni_syscall) -SYSCALL(brk) -SYSCALL(setgid) -SYSCALL(getgid) -SYSCALL(signal) -SYSCALL(geteuid) -SYSCALL(getegid) -SYSCALL(acct) -SYSCALL(umount) -SYSCALL(ni_syscall) -COMPAT_SYS(ioctl) -COMPAT_SYS(fcntl) -SYSCALL(ni_syscall) -COMPAT_SYS(setpgid) -SYSCALL(ni_syscall) -SYSX(sys_ni_syscall,sys_olduname, sys_olduname) -COMPAT_SYS(umask) -SYSCALL(chroot) -SYSCALL(ustat) -SYSCALL(dup2) -SYSCALL(getppid) -SYSCALL(getpgrp) -SYSCALL(setsid) -SYS32ONLY(sigaction) -SYSCALL(sgetmask) -COMPAT_SYS(ssetmask) -SYSCALL(setreuid) -SYSCALL(setregid) -SYSX(sys_ni_syscall,ppc32_sigsuspend,ppc_sigsuspend) -COMPAT_SYS(sigpending) -COMPAT_SYS(sethostname) -COMPAT_SYS(setrlimit) -COMPAT_SYS(old_getrlimit) -COMPAT_SYS(getrusage) -COMPAT_SYS(gettimeofday) -COMPAT_SYS(settimeofday) -COMPAT_SYS(getgroups) -COMPAT_SYS(setgroups) -SYSX(sys_ni_syscall,sys_ni_syscall,ppc_select) -SYSCALL(symlink) -OLDSYS(lstat) -COMPAT_SYS(readlink) -SYSCALL(uselib) -SYSCALL(swapon) -SYSCALL(reboot) -SYSX(sys_ni_syscall,old32_readdir,old_readdir) -SYSCALL(mmap) -SYSCALL(munmap) -SYSCALL(truncate) -SYSCALL(ftruncate) -SYSCALL(fchmod) -SYSCALL(fchown) -COMPAT_SYS(getpriority) -COMPAT_SYS(setpriority) -SYSCALL(ni_syscall) -COMPAT_SYS(statfs) -COMPAT_SYS(fstatfs) -SYSCALL(ni_syscall) -COMPAT_SYS(socketcall) -COMPAT_SYS(syslog) -COMPAT_SYS(setitimer) -COMPAT_SYS(getitimer) -COMPAT_SYS(newstat) -COMPAT_SYS(newlstat) -COMPAT_SYS(newfstat) -SYSX(sys_ni_syscall,sys_uname,sys_uname) -SYSCALL(ni_syscall) -SYSCALL(vhangup) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -COMPAT_SYS(wait4) -SYSCALL(swapoff) -COMPAT_SYS(sysinfo) -COMPAT_SYS(ipc) -SYSCALL(fsync) -SYSX(sys_ni_syscall,ppc32_sigreturn,sys_sigreturn) -PPC_SYS(clone) 
-COMPAT_SYS(setdomainname) -PPC_SYS(newuname) -SYSCALL(ni_syscall) -COMPAT_SYS(adjtimex) -SYSCALL(mprotect) -SYSX(sys_ni_syscall,compat_sys_sigprocmask,sys_sigprocmask) -SYSCALL(ni_syscall) -SYSCALL(init_module) -SYSCALL(delete_module) -SYSCALL(ni_syscall) -SYSCALL(quotactl) -COMPAT_SYS(getpgid) -SYSCALL(fchdir) -SYSCALL(bdflush) -COMPAT_SYS(sysfs) -SYSX(ppc64_personality,ppc64_personality,sys_personality) -SYSCALL(ni_syscall) -SYSCALL(setfsuid) -SYSCALL(setfsgid) -SYSCALL(llseek) -COMPAT_SYS(getdents) -SYSX(sys_select,ppc32_select,ppc_select) -SYSCALL(flock) -SYSCALL(msync) -COMPAT_SYS(readv) -COMPAT_SYS(writev) -COMPAT_SYS(getsid) -SYSCALL(fdatasync) -COMPAT_SYS(sysctl) -SYSCALL(mlock) -SYSCALL(munlock) -SYSCALL(mlockall) -SYSCALL(munlockall) -COMPAT_SYS(sched_setparam) -COMPAT_SYS(sched_getparam) -COMPAT_SYS(sched_setscheduler) -COMPAT_SYS(sched_getscheduler) -SYSCALL(sched_yield) -COMPAT_SYS(sched_get_priority_max) -COMPAT_SYS(sched_get_priority_min) -COMPAT_SYS(sched_rr_get_interval) -COMPAT_SYS(nanosleep) -SYSCALL(mremap) -SYSCALL(setresuid) -SYSCALL(getresuid) -SYSCALL(ni_syscall) -SYSCALL(poll) -COMPAT_SYS(nfsservctl) -SYSCALL(setresgid) -SYSCALL(getresgid) -COMPAT_SYS(prctl) -SYSX(ppc64_rt_sigreturn,ppc32_rt_sigreturn,sys_rt_sigreturn) -COMPAT_SYS(rt_sigaction) -COMPAT_SYS(rt_sigprocmask) -COMPAT_SYS(rt_sigpending) -COMPAT_SYS(rt_sigtimedwait) -COMPAT_SYS(rt_sigqueueinfo) -SYSX(ppc64_rt_sigsuspend,ppc32_rt_sigsuspend,ppc_rt_sigsuspend) -COMPAT_SYS(pread64) -COMPAT_SYS(pwrite64) -SYSCALL(chown) -SYSCALL(getcwd) -SYSCALL(capget) -SYSCALL(capset) -COMPAT_SYS(sigaltstack) -SYSX(sys_sendfile64,compat_sys_sendfile,sys_sendfile) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -PPC_SYS(vfork) -COMPAT_SYS(getrlimit) -COMPAT_SYS(readahead) -SYS32ONLY(mmap2) -SYS32ONLY(truncate64) -SYS32ONLY(ftruncate64) -SYSX(sys_ni_syscall,sys_stat64,sys_stat64) -SYSX(sys_ni_syscall,sys_lstat64,sys_lstat64) -SYSX(sys_ni_syscall,sys_fstat64,sys_fstat64) -COMPAT_SYS(pciconfig_read) 
-COMPAT_SYS(pciconfig_write) -COMPAT_SYS(pciconfig_iobase) -SYSCALL(ni_syscall) -SYSCALL(getdents64) -SYSCALL(pivot_root) -SYSX(sys_ni_syscall,compat_sys_fcntl64,sys_fcntl64) -SYSCALL(madvise) -SYSCALL(mincore) -SYSCALL(gettid) -SYSCALL(tkill) -SYSCALL(setxattr) -SYSCALL(lsetxattr) -SYSCALL(fsetxattr) -SYSCALL(getxattr) -SYSCALL(lgetxattr) -SYSCALL(fgetxattr) -SYSCALL(listxattr) -SYSCALL(llistxattr) -SYSCALL(flistxattr) -SYSCALL(removexattr) -SYSCALL(lremovexattr) -SYSCALL(fremovexattr) -COMPAT_SYS(futex) -COMPAT_SYS(sched_setaffinity) -COMPAT_SYS(sched_getaffinity) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -SYS32ONLY(sendfile64) -COMPAT_SYS(io_setup) -SYSCALL(io_destroy) -COMPAT_SYS(io_getevents) -COMPAT_SYS(io_submit) -SYSCALL(io_cancel) -SYSCALL(set_tid_address) -SYSX(sys_fadvise64,ppc32_fadvise64,sys_fadvise64) -SYSCALL(exit_group) -SYSX(sys_lookup_dcookie,ppc32_lookup_dcookie,sys_lookup_dcookie) -SYSCALL(epoll_create) -SYSCALL(epoll_ctl) -SYSCALL(epoll_wait) -SYSCALL(remap_file_pages) -SYSX(sys_timer_create,ppc32_timer_create,sys_timer_create) -COMPAT_SYS(timer_settime) -COMPAT_SYS(timer_gettime) -SYSCALL(timer_getoverrun) -SYSCALL(timer_delete) -COMPAT_SYS(clock_settime) -COMPAT_SYS(clock_gettime) -COMPAT_SYS(clock_getres) -COMPAT_SYS(clock_nanosleep) -SYSX(ppc64_swapcontext,ppc32_swapcontext,ppc_swapcontext) -COMPAT_SYS(tgkill) -COMPAT_SYS(utimes) -COMPAT_SYS(statfs64) -COMPAT_SYS(fstatfs64) -SYSX(sys_ni_syscall, ppc_fadvise64_64, ppc_fadvise64_64) -PPC_SYS(rtas) -OLDSYS(debug_setcontext) -SYSCALL(ni_syscall) -SYSCALL(ni_syscall) -COMPAT_SYS(mbind) -COMPAT_SYS(get_mempolicy) -COMPAT_SYS(set_mempolicy) -COMPAT_SYS(mq_open) -SYSCALL(mq_unlink) -COMPAT_SYS(mq_timedsend) -COMPAT_SYS(mq_timedreceive) -COMPAT_SYS(mq_notify) -COMPAT_SYS(mq_getsetattr) -COMPAT_SYS(kexec_load) -COMPAT_SYS(add_key) -COMPAT_SYS(request_key) -COMPAT_SYS(keyctl) -COMPAT_SYS(waitid) -COMPAT_SYS(ioprio_set) -COMPAT_SYS(ioprio_get) -SYSCALL(inotify_init) -SYSCALL(inotify_add_watch) 
-SYSCALL(inotify_rm_watch) +.globl sys_call_table +sys_call_table: + +#include <asm/systbl.h> diff --git a/arch/powerpc/kernel/systbl_chk.c b/arch/powerpc/kernel/systbl_chk.c new file mode 100644 index 00000000000..238aa63ced8 --- /dev/null +++ b/arch/powerpc/kernel/systbl_chk.c @@ -0,0 +1,58 @@ +/* + * This file, when run through CPP produces a list of syscall numbers + * in the order of systbl.h. That way we can check for gaps and syscalls + * that are out of order. + * + * Unfortunately, we cannot check for the correct ordering of entries + * using SYSX(). + * + * Copyright © IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/unistd.h> + +#define SYSCALL(func) __NR_##func +#define COMPAT_SYS(func) __NR_##func +#define PPC_SYS(func) __NR_##func +#ifdef CONFIG_PPC64 +#define OLDSYS(func) -1 +#define SYS32ONLY(func) -1 +#else +#define OLDSYS(func) __NR_old##func +#define SYS32ONLY(func) __NR_##func +#endif +#define SYSX(f, f3264, f32) -1 + +#define SYSCALL_SPU(func) SYSCALL(func) +#define COMPAT_SYS_SPU(func) COMPAT_SYS(func) +#define PPC_SYS_SPU(func) PPC_SYS(func) +#define SYSX_SPU(f, f3264, f32) SYSX(f, f3264, f32) + +/* Just insert a marker for ni_syscalls */ +#define __NR_ni_syscall -1 + +/* + * These are the known exceptions. + * Hopefully, there will be no more. 
+ */ +#define __NR_llseek __NR__llseek +#undef __NR_umount +#define __NR_umount __NR_umount2 +#define __NR_old_getrlimit __NR_getrlimit +#define __NR_newstat __NR_stat +#define __NR_newlstat __NR_lstat +#define __NR_newfstat __NR_fstat +#define __NR_newuname __NR_uname +#define __NR_sysctl __NR__sysctl +#define __NR_olddebug_setcontext __NR_sys_debug_setcontext + +/* We call sys_ugetrlimit for syscall number __NR_getrlimit */ +#define getrlimit ugetrlimit + +START_TABLE +#include <asm/systbl.h> +END_TABLE __NR_syscalls diff --git a/arch/powerpc/kernel/systbl_chk.sh b/arch/powerpc/kernel/systbl_chk.sh new file mode 100644 index 00000000000..19415e7674a --- /dev/null +++ b/arch/powerpc/kernel/systbl_chk.sh @@ -0,0 +1,33 @@ +#!/bin/sh +# +# Just process the CPP output from systbl_chk.c and complain +# if anything is out of order. +# +# Copyright © 2008 IBM Corporation +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version +# 2 of the License, or (at your option) any later version. 
+ +awk 'BEGIN { num = -1; } # Ignore the beginning of the file + /^#/ { next; } + /^[ \t]*$/ { next; } + /^START_TABLE/ { num = 0; next; } + /^END_TABLE/ { + if (num != $2) { + printf "__NR_syscalls (%s) is not one more than the last syscall (%s)\n", + $2, num - 1; + exit(1); + } + num = -1; # Ignore the rest of the file + } + { + if (num == -1) next; + if (($1 != -1) && ($1 != num)) { + printf "Syscall %s out of order (expected %s)\n", + $1, num; + exit(1); + }; + num++; + }' "$1" diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c new file mode 100644 index 00000000000..a753b72efbc --- /dev/null +++ b/arch/powerpc/kernel/tau_6xx.c @@ -0,0 +1,270 @@ +/* + * temp.c Thermal management for cpu's with Thermal Assist Units + * + * Written by Troy Benjegerdes <hozer@drgw.net> + * + * TODO: + * dynamic power management to limit peak CPU temp (using ICTC) + * calibration??? + * + * Silly, crazy ideas: use cpu load (from scheduler) and ICTC to extend battery + * life in portables, and add a 'performance/watt' metric somewhere in /proc + */ + +#include <linux/errno.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/param.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/init.h> + +#include <asm/io.h> +#include <asm/reg.h> +#include <asm/nvram.h> +#include <asm/cache.h> +#include <asm/8xx_immap.h> +#include <asm/machdep.h> + +static struct tau_temp +{ + int interrupts; + unsigned char low; + unsigned char high; + unsigned char grew; +} tau[NR_CPUS]; + +struct timer_list tau_timer; + +#undef DEBUG + +/* TODO: put these in a /proc interface, with some sanity checks, and maybe + * dynamic adjustment to minimize # of interrupts */ +/* configurable values for step size and how much to expand the window when + * we get an interrupt. 
These are based on the limit that was out of range */ +#define step_size 2 /* step size when temp goes out of range */ +#define window_expand 1 /* expand the window by this much */ +/* configurable values for shrinking the window */ +#define shrink_timer 2*HZ /* period between shrinking the window */ +#define min_window 2 /* minimum window size, degrees C */ + +void set_thresholds(unsigned long cpu) +{ +#ifdef CONFIG_TAU_INT + /* + * setup THRM1, + * threshold, valid bit, enable interrupts, interrupt when below threshold + */ + mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TIE | THRM1_TID); + + /* setup THRM2, + * threshold, valid bit, enable interrupts, interrupt when above threshold + */ + mtspr (SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | THRM1_TIE); +#else + /* same thing but don't enable interrupts */ + mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TID); + mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V); +#endif +} + +void TAUupdate(int cpu) +{ + unsigned thrm; + +#ifdef DEBUG + printk("TAUupdate "); +#endif + + /* if both thresholds are crossed, the step_sizes cancel out + * and the window winds up getting expanded twice. */ + if((thrm = mfspr(SPRN_THRM1)) & THRM1_TIV){ /* is valid? */ + if(thrm & THRM1_TIN){ /* crossed low threshold */ + if (tau[cpu].low >= step_size){ + tau[cpu].low -= step_size; + tau[cpu].high -= (step_size - window_expand); + } + tau[cpu].grew = 1; +#ifdef DEBUG + printk("low threshold crossed "); +#endif + } + } + if((thrm = mfspr(SPRN_THRM2)) & THRM1_TIV){ /* is valid? 
*/ + if(thrm & THRM1_TIN){ /* crossed high threshold */ + if (tau[cpu].high <= 127-step_size){ + tau[cpu].low += (step_size - window_expand); + tau[cpu].high += step_size; + } + tau[cpu].grew = 1; +#ifdef DEBUG + printk("high threshold crossed "); +#endif + } + } + +#ifdef DEBUG + printk("grew = %d\n", tau[cpu].grew); +#endif + +#ifndef CONFIG_TAU_INT /* tau_timeout will do this if not using interrupts */ + set_thresholds(cpu); +#endif + +} + +#ifdef CONFIG_TAU_INT +/* + * TAU interrupts - called when we have a thermal assist unit interrupt + * with interrupts disabled + */ + +void TAUException(struct pt_regs * regs) +{ + int cpu = smp_processor_id(); + + irq_enter(); + tau[cpu].interrupts++; + + TAUupdate(cpu); + + irq_exit(); +} +#endif /* CONFIG_TAU_INT */ + +static void tau_timeout(void * info) +{ + int cpu; + unsigned long flags; + int size; + int shrink; + + /* disabling interrupts *should* be okay */ + local_irq_save(flags); + cpu = smp_processor_id(); + +#ifndef CONFIG_TAU_INT + TAUupdate(cpu); +#endif + + size = tau[cpu].high - tau[cpu].low; + if (size > min_window && ! tau[cpu].grew) { + /* do an exponential shrink of half the amount currently over size */ + shrink = (2 + size - min_window) / 4; + if (shrink) { + tau[cpu].low += shrink; + tau[cpu].high -= shrink; + } else { /* size must have been min_window + 1 */ + tau[cpu].low += 1; +#if 1 /* debug */ + if ((tau[cpu].high - tau[cpu].low) != min_window){ + printk(KERN_ERR "temp.c: line %d, logic error\n", __LINE__); + } +#endif + } + } + + tau[cpu].grew = 0; + + set_thresholds(cpu); + + /* + * Do the enable every time, since otherwise a bunch of (relatively) + * complex sleep code needs to be added. One mtspr every time + * tau_timeout is called is probably not a big deal. + * + * Enable thermal sensor and set up sample interval timer + * need 20 us to do the compare.. until a nice 'cpu_speed' function + * call is implemented, just assume a 500 mhz clock. 
It doesn't really + * matter if we take too long for a compare since it's all interrupt + * driven anyway. + * + * use a extra long time.. (60 us @ 500 mhz) + */ + mtspr(SPRN_THRM3, THRM3_SITV(500*60) | THRM3_E); + + local_irq_restore(flags); +} + +static void tau_timeout_smp(unsigned long unused) +{ + + /* schedule ourselves to be run again */ + mod_timer(&tau_timer, jiffies + shrink_timer) ; + on_each_cpu(tau_timeout, NULL, 0); +} + +/* + * setup the TAU + * + * Set things up to use THRM1 as a temperature lower bound, and THRM2 as an upper bound. + * Start off at zero + */ + +int tau_initialized = 0; + +void __init TAU_init_smp(void * info) +{ + unsigned long cpu = smp_processor_id(); + + /* set these to a reasonable value and let the timer shrink the + * window */ + tau[cpu].low = 5; + tau[cpu].high = 120; + + set_thresholds(cpu); +} + +int __init TAU_init(void) +{ + /* We assume in SMP that if one CPU has TAU support, they + * all have it --BenH + */ + if (!cpu_has_feature(CPU_FTR_TAU)) { + printk("Thermal assist unit not available\n"); + tau_initialized = 0; + return 1; + } + + + /* first, set up the window shrinking timer */ + init_timer(&tau_timer); + tau_timer.function = tau_timeout_smp; + tau_timer.expires = jiffies + shrink_timer; + add_timer(&tau_timer); + + on_each_cpu(TAU_init_smp, NULL, 0); + + printk("Thermal assist unit "); +#ifdef CONFIG_TAU_INT + printk("using interrupts, "); +#else + printk("using timers, "); +#endif + printk("shrink_timer: %d jiffies\n", shrink_timer); + tau_initialized = 1; + + return 0; +} + +__initcall(TAU_init); + +/* + * return current temp + */ + +u32 cpu_temp_both(unsigned long cpu) +{ + return ((tau[cpu].high << 16) | tau[cpu].low); +} + +int cpu_temp(unsigned long cpu) +{ + return ((tau[cpu].high + tau[cpu].low) / 2); +} + +int tau_interrupts(unsigned long cpu) +{ + return (tau[cpu].interrupts); +} diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 23436b6c188..9fff9cdcc51 100644 --- 
a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -17,8 +17,7 @@ * * TODO (not necessarily in this file): * - improve precision and reproducibility of timebase frequency - * measurement at boot time. (for iSeries, we calibrate the timebase - * against the Titan chip's clock.) + * measurement at boot time. * - for astronomical applications: add a new function to get * non ambiguous timestamps even around leap seconds. This needs * a new timestamp format and a good name. @@ -32,9 +31,8 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/config.h> #include <linux/errno.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/param.h> @@ -44,12 +42,19 @@ #include <linux/timex.h> #include <linux/kernel_stat.h> #include <linux/time.h> +#include <linux/clockchips.h> #include <linux/init.h> #include <linux/profile.h> #include <linux/cpu.h> #include <linux/security.h> #include <linux/percpu.h> #include <linux/rtc.h> +#include <linux/jiffies.h> +#include <linux/posix-timers.h> +#include <linux/irq.h> +#include <linux/delay.h> +#include <linux/irq_work.h> +#include <asm/trace.h> #include <asm/io.h> #include <asm/processor.h> @@ -61,26 +66,53 @@ #include <asm/prom.h> #include <asm/irq.h> #include <asm/div64.h> -#ifdef CONFIG_PPC64 -#include <asm/systemcfg.h> +#include <asm/smp.h> +#include <asm/vdso_datapage.h> #include <asm/firmware.h> -#endif -#ifdef CONFIG_PPC_ISERIES -#include <asm/iSeries/ItLpQueue.h> -#include <asm/iSeries/HvCallXm.h> -#endif +#include <asm/cputime.h> -/* keep track of when we need to update the rtc */ -time_t last_rtc_update; -extern int piranha_simulator; -#ifdef CONFIG_PPC_ISERIES -unsigned long iSeries_recal_titan = 0; -unsigned long iSeries_recal_tb = 0; -static unsigned long first_settimeofday = 1; -#endif +/* powerpc clocksource/clockevent code */ + +#include <linux/clockchips.h> +#include <linux/timekeeper_internal.h> + +static cycle_t 
rtc_read(struct clocksource *); +static struct clocksource clocksource_rtc = { + .name = "rtc", + .rating = 400, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .mask = CLOCKSOURCE_MASK(64), + .read = rtc_read, +}; + +static cycle_t timebase_read(struct clocksource *); +static struct clocksource clocksource_timebase = { + .name = "timebase", + .rating = 400, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .mask = CLOCKSOURCE_MASK(64), + .read = timebase_read, +}; + +#define DECREMENTER_MAX 0x7fffffff + +static int decrementer_set_next_event(unsigned long evt, + struct clock_event_device *dev); +static void decrementer_set_mode(enum clock_event_mode mode, + struct clock_event_device *dev); -/* The decrementer counts down by 128 every 128ns on a 601. */ -#define DECREMENTER_COUNT_601 (1000000000 / HZ) +struct clock_event_device decrementer_clockevent = { + .name = "decrementer", + .rating = 200, + .irq = 0, + .set_next_event = decrementer_set_next_event, + .set_mode = decrementer_set_mode, + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP, +}; +EXPORT_SYMBOL(decrementer_clockevent); + +DEFINE_PER_CPU(u64, decrementers_next_tb); +static DEFINE_PER_CPU(struct clock_event_device, decrementers); #define XSEC_PER_SEC (1024*1024) @@ -95,218 +127,289 @@ unsigned long tb_ticks_per_jiffy; unsigned long tb_ticks_per_usec = 100; /* sane default */ EXPORT_SYMBOL(tb_ticks_per_usec); unsigned long tb_ticks_per_sec; -u64 tb_to_xs; -unsigned tb_to_us; -unsigned long processor_freq; +EXPORT_SYMBOL(tb_ticks_per_sec); /* for cputime_t conversions */ + DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL_GPL(rtc_lock); -u64 tb_to_ns_scale; -unsigned tb_to_ns_shift; - -struct gettimeofday_struct do_gtod; - -extern unsigned long wall_jiffies; +static u64 tb_to_ns_scale __read_mostly; +static unsigned tb_to_ns_shift __read_mostly; +static u64 boot_tb __read_mostly; extern struct timezone sys_tz; static long timezone_offset; -void ppc_adjtimex(void); - -static unsigned adjusting_time = 0; - unsigned long 
ppc_proc_freq; +EXPORT_SYMBOL_GPL(ppc_proc_freq); unsigned long ppc_tb_freq; +EXPORT_SYMBOL_GPL(ppc_tb_freq); -#ifdef CONFIG_PPC32 /* XXX for now */ -#define boot_cpuid 0 -#endif +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +/* + * Factors for converting from cputime_t (timebase ticks) to + * jiffies, microseconds, seconds, and clock_t (1/USER_HZ seconds). + * These are all stored as 0.64 fixed-point binary fractions. + */ +u64 __cputime_jiffies_factor; +EXPORT_SYMBOL(__cputime_jiffies_factor); +u64 __cputime_usec_factor; +EXPORT_SYMBOL(__cputime_usec_factor); +u64 __cputime_sec_factor; +EXPORT_SYMBOL(__cputime_sec_factor); +u64 __cputime_clockt_factor; +EXPORT_SYMBOL(__cputime_clockt_factor); +DEFINE_PER_CPU(unsigned long, cputime_last_delta); +DEFINE_PER_CPU(unsigned long, cputime_scaled_last_delta); + +cputime_t cputime_one_jiffy; + +void (*dtl_consumer)(struct dtl_entry *, u64); + +static void calc_cputime_factors(void) +{ + struct div_result res; -u64 tb_last_jiffy __cacheline_aligned_in_smp; -unsigned long tb_last_stamp; + div128_by_32(HZ, 0, tb_ticks_per_sec, &res); + __cputime_jiffies_factor = res.result_low; + div128_by_32(1000000, 0, tb_ticks_per_sec, &res); + __cputime_usec_factor = res.result_low; + div128_by_32(1, 0, tb_ticks_per_sec, &res); + __cputime_sec_factor = res.result_low; + div128_by_32(USER_HZ, 0, tb_ticks_per_sec, &res); + __cputime_clockt_factor = res.result_low; +} /* - * Note that on ppc32 this only stores the bottom 32 bits of - * the timebase value, but that's enough to tell when a jiffy - * has passed. + * Read the SPURR on systems that have it, otherwise the PURR, + * or if that doesn't exist return the timebase value passed in. */ -DEFINE_PER_CPU(unsigned long, last_jiffy); - -static __inline__ void timer_check_rtc(void) -{ - /* - * update the rtc when needed, this should be performed on the - * right fraction of a second. Half or full second ? - * Full second works on mk48t59 clocks, others need testing. 
- * Note that this update is basically only used through - * the adjtimex system calls. Setting the HW clock in - * any other way is a /dev/rtc and userland business. - * This is still wrong by -0.5/+1.5 jiffies because of the - * timer interrupt resolution and possible delay, but here we - * hit a quantization limit which can only be solved by higher - * resolution timers and decoupling time management from timer - * interrupts. This is also wrong on the clocks - * which require being written at the half second boundary. - * We should have an rtc call that only sets the minutes and - * seconds like on Intel to avoid problems with non UTC clocks. - */ - if (ppc_md.set_rtc_time && ntp_synced() && - xtime.tv_sec - last_rtc_update >= 659 && - abs((xtime.tv_nsec/1000) - (1000000-1000000/HZ)) < 500000/HZ && - jiffies - wall_jiffies == 1) { - struct rtc_time tm; - to_tm(xtime.tv_sec + 1 + timezone_offset, &tm); - tm.tm_year -= 1900; - tm.tm_mon -= 1; - if (ppc_md.set_rtc_time(&tm) == 0) - last_rtc_update = xtime.tv_sec + 1; - else - /* Try again one minute later */ - last_rtc_update += 60; - } +static u64 read_spurr(u64 tb) +{ + if (cpu_has_feature(CPU_FTR_SPURR)) + return mfspr(SPRN_SPURR); + if (cpu_has_feature(CPU_FTR_PURR)) + return mfspr(SPRN_PURR); + return tb; } +#ifdef CONFIG_PPC_SPLPAR + /* - * This version of gettimeofday has microsecond resolution. + * Scan the dispatch trace log and count up the stolen time. + * Should be called with interrupts disabled. 
*/ -static inline void __do_gettimeofday(struct timeval *tv, u64 tb_val) +static u64 scan_dispatch_log(u64 stop_tb) { - unsigned long sec, usec; - u64 tb_ticks, xsec; - struct gettimeofday_vars *temp_varp; - u64 temp_tb_to_xs, temp_stamp_xsec; + u64 i = local_paca->dtl_ridx; + struct dtl_entry *dtl = local_paca->dtl_curr; + struct dtl_entry *dtl_end = local_paca->dispatch_log_end; + struct lppaca *vpa = local_paca->lppaca_ptr; + u64 tb_delta; + u64 stolen = 0; + u64 dtb; + + if (!dtl) + return 0; - /* - * These calculations are faster (gets rid of divides) - * if done in units of 1/2^20 rather than microseconds. - * The conversion to microseconds at the end is done - * without a divide (and in fact, without a multiply) + if (i == be64_to_cpu(vpa->dtl_idx)) + return 0; + while (i < be64_to_cpu(vpa->dtl_idx)) { + dtb = be64_to_cpu(dtl->timebase); + tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) + + be32_to_cpu(dtl->ready_to_enqueue_time); + barrier(); + if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) { + /* buffer has overflowed */ + i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG; + dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG); + continue; + } + if (dtb > stop_tb) + break; + if (dtl_consumer) + dtl_consumer(dtl, i); + stolen += tb_delta; + ++i; + ++dtl; + if (dtl == dtl_end) + dtl = local_paca->dispatch_log; + } + local_paca->dtl_ridx = i; + local_paca->dtl_curr = dtl; + return stolen; +} + +/* + * Accumulate stolen time by scanning the dispatch trace log. + * Called on entry from user mode. + */ +void accumulate_stolen_time(void) +{ + u64 sst, ust; + + u8 save_soft_enabled = local_paca->soft_enabled; + + /* We are called early in the exception entry, before + * soft/hard_enabled are sync'ed to the expected state + * for the exception. 
We are hard disabled but the PACA + * needs to reflect that so various debug stuff doesn't + * complain */ - temp_varp = do_gtod.varp; - tb_ticks = tb_val - temp_varp->tb_orig_stamp; - temp_tb_to_xs = temp_varp->tb_to_xs; - temp_stamp_xsec = temp_varp->stamp_xsec; - xsec = temp_stamp_xsec + mulhdu(tb_ticks, temp_tb_to_xs); - sec = xsec / XSEC_PER_SEC; - usec = (unsigned long)xsec & (XSEC_PER_SEC - 1); - usec = SCALE_XSEC(usec, 1000000); + local_paca->soft_enabled = 0; + + sst = scan_dispatch_log(local_paca->starttime_user); + ust = scan_dispatch_log(local_paca->starttime); + local_paca->system_time -= sst; + local_paca->user_time -= ust; + local_paca->stolen_time += ust + sst; - tv->tv_sec = sec; - tv->tv_usec = usec; + local_paca->soft_enabled = save_soft_enabled; } -void do_gettimeofday(struct timeval *tv) +static inline u64 calculate_stolen_time(u64 stop_tb) { - if (__USE_RTC()) { - /* do this the old way */ - unsigned long flags, seq; - unsigned int sec, nsec, usec, lost; + u64 stolen = 0; - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - sec = xtime.tv_sec; - nsec = xtime.tv_nsec + tb_ticks_since(tb_last_stamp); - lost = jiffies - wall_jiffies; - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - usec = nsec / 1000 + lost * (1000000 / HZ); - while (usec >= 1000000) { - usec -= 1000000; - ++sec; - } - tv->tv_sec = sec; - tv->tv_usec = usec; - return; + if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) { + stolen = scan_dispatch_log(stop_tb); + get_paca()->system_time -= stolen; } - __do_gettimeofday(tv, get_tb()); + + stolen += get_paca()->stolen_time; + get_paca()->stolen_time = 0; + return stolen; } -EXPORT_SYMBOL(do_gettimeofday); +#else /* CONFIG_PPC_SPLPAR */ +static inline u64 calculate_stolen_time(u64 stop_tb) +{ + return 0; +} -/* Synchronize xtime with do_gettimeofday */ +#endif /* CONFIG_PPC_SPLPAR */ -static inline void timer_sync_xtime(unsigned long cur_tb) +/* + * Account time for a transition between system, 
hard irq + * or soft irq state. + */ +static u64 vtime_delta(struct task_struct *tsk, + u64 *sys_scaled, u64 *stolen) { -#ifdef CONFIG_PPC64 - /* why do we do this? */ - struct timeval my_tv; + u64 now, nowscaled, deltascaled; + u64 udelta, delta, user_scaled; + + WARN_ON_ONCE(!irqs_disabled()); + + now = mftb(); + nowscaled = read_spurr(now); + get_paca()->system_time += now - get_paca()->starttime; + get_paca()->starttime = now; + deltascaled = nowscaled - get_paca()->startspurr; + get_paca()->startspurr = nowscaled; + + *stolen = calculate_stolen_time(now); - __do_gettimeofday(&my_tv, cur_tb); + delta = get_paca()->system_time; + get_paca()->system_time = 0; + udelta = get_paca()->user_time - get_paca()->utime_sspurr; + get_paca()->utime_sspurr = get_paca()->user_time; - if (xtime.tv_sec <= my_tv.tv_sec) { - xtime.tv_sec = my_tv.tv_sec; - xtime.tv_nsec = my_tv.tv_usec * 1000; + /* + * Because we don't read the SPURR on every kernel entry/exit, + * deltascaled includes both user and system SPURR ticks. + * Apportion these ticks to system SPURR ticks and user + * SPURR ticks in the same ratio as the system time (delta) + * and user time (udelta) values obtained from the timebase + * over the same interval. The system ticks get accounted here; + * the user ticks get saved up in paca->user_time_scaled to be + * used by account_process_tick. + */ + *sys_scaled = delta; + user_scaled = udelta; + if (deltascaled != delta + udelta) { + if (udelta) { + *sys_scaled = deltascaled * delta / (delta + udelta); + user_scaled = deltascaled - *sys_scaled; + } else { + *sys_scaled = deltascaled; + } } -#endif + get_paca()->user_time_scaled += user_scaled; + + return delta; } -/* - * There are two copies of tb_to_xs and stamp_xsec so that no - * lock is needed to access and use these values in - * do_gettimeofday. We alternate the copies and as long as a - * reasonable time elapses between changes, there will never - * be inconsistent values. 
ntpd has a minimum of one minute - * between updates. - */ -static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, - u64 new_tb_to_xs) +void vtime_account_system(struct task_struct *tsk) { - unsigned temp_idx; - struct gettimeofday_vars *temp_varp; + u64 delta, sys_scaled, stolen; - temp_idx = (do_gtod.var_idx == 0); - temp_varp = &do_gtod.vars[temp_idx]; + delta = vtime_delta(tsk, &sys_scaled, &stolen); + account_system_time(tsk, 0, delta, sys_scaled); + if (stolen) + account_steal_time(stolen); +} +EXPORT_SYMBOL_GPL(vtime_account_system); - temp_varp->tb_to_xs = new_tb_to_xs; - temp_varp->tb_orig_stamp = new_tb_stamp; - temp_varp->stamp_xsec = new_stamp_xsec; - smp_mb(); - do_gtod.varp = temp_varp; - do_gtod.var_idx = temp_idx; +void vtime_account_idle(struct task_struct *tsk) +{ + u64 delta, sys_scaled, stolen; -#ifdef CONFIG_PPC64 - /* - * tb_update_count is used to allow the userspace gettimeofday code - * to assure itself that it sees a consistent view of the tb_to_xs and - * stamp_xsec variables. It reads the tb_update_count, then reads - * tb_to_xs and stamp_xsec and then reads tb_update_count again. If - * the two values of tb_update_count match and are even then the - * tb_to_xs and stamp_xsec values are consistent. If not, then it - * loops back and reads them again until this criteria is met. - */ - ++(systemcfg->tb_update_count); - smp_wmb(); - systemcfg->tb_orig_stamp = new_tb_stamp; - systemcfg->stamp_xsec = new_stamp_xsec; - systemcfg->tb_to_xs = new_tb_to_xs; - smp_wmb(); - ++(systemcfg->tb_update_count); -#endif + delta = vtime_delta(tsk, &sys_scaled, &stolen); + account_idle_time(delta + stolen); } /* - * When the timebase - tb_orig_stamp gets too big, we do a manipulation - * between tb_orig_stamp and stamp_xsec. The goal here is to keep the - * difference tb - tb_orig_stamp small enough to always fit inside a - * 32 bits number. This is a requirement of our fast 32 bits userland - * implementation in the vdso. 
If we "miss" a call to this function - * (interrupt latency, CPU locked in a spinlock, ...) and we end up - * with a too big difference, then the vdso will fallback to calling - * the syscall + * Transfer the user time accumulated in the paca + * by the exception entry and exit code to the generic + * process user time records. + * Must be called with interrupts disabled. + * Assumes that vtime_account_system/idle() has been called + * recently (i.e. since the last entry from usermode) so that + * get_paca()->user_time_scaled is up to date. */ -static __inline__ void timer_recalc_offset(u64 cur_tb) +void vtime_account_user(struct task_struct *tsk) { - unsigned long offset; - u64 new_stamp_xsec; + cputime_t utime, utimescaled; + + utime = get_paca()->user_time; + utimescaled = get_paca()->user_time_scaled; + get_paca()->user_time = 0; + get_paca()->user_time_scaled = 0; + get_paca()->utime_sspurr = 0; + account_user_time(tsk, utime, utimescaled); +} - if (__USE_RTC()) - return; - offset = cur_tb - do_gtod.varp->tb_orig_stamp; - if ((offset & 0x80000000u) == 0) - return; - new_stamp_xsec = do_gtod.varp->stamp_xsec - + mulhdu(offset, do_gtod.varp->tb_to_xs); - update_gtod(cur_tb, new_stamp_xsec, do_gtod.varp->tb_to_xs); +#else /* ! 
CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +#define calc_cputime_factors() +#endif + +void __delay(unsigned long loops) +{ + unsigned long start; + int diff; + + if (__USE_RTC()) { + start = get_rtcl(); + do { + /* the RTCL register wraps at 1000000000 */ + diff = get_rtcl() - start; + if (diff < 0) + diff += 1000000000; + } while (diff < loops); + } else { + start = get_tbl(); + while (get_tbl() - start < loops) + HMT_low(); + HMT_medium(); + } } +EXPORT_SYMBOL(__delay); + +void udelay(unsigned long usecs) +{ + __delay(tb_ticks_per_usec * usecs); +} +EXPORT_SYMBOL(udelay); #ifdef CONFIG_SMP unsigned long profile_pc(struct pt_regs *regs) @@ -321,137 +424,90 @@ unsigned long profile_pc(struct pt_regs *regs) EXPORT_SYMBOL(profile_pc); #endif -#ifdef CONFIG_PPC_ISERIES +#ifdef CONFIG_IRQ_WORK -/* - * This function recalibrates the timebase based on the 49-bit time-of-day - * value in the Titan chip. The Titan is much more accurate than the value - * returned by the service processor for the timebase frequency. +/* + * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable... 
*/ +#ifdef CONFIG_PPC64 +static inline unsigned long test_irq_work_pending(void) +{ + unsigned long x; -static void iSeries_tb_recal(void) -{ - struct div_result divres; - unsigned long titan, tb; - tb = get_tb(); - titan = HvCallXm_loadTod(); - if ( iSeries_recal_titan ) { - unsigned long tb_ticks = tb - iSeries_recal_tb; - unsigned long titan_usec = (titan - iSeries_recal_titan) >> 12; - unsigned long new_tb_ticks_per_sec = (tb_ticks * USEC_PER_SEC)/titan_usec; - unsigned long new_tb_ticks_per_jiffy = (new_tb_ticks_per_sec+(HZ/2))/HZ; - long tick_diff = new_tb_ticks_per_jiffy - tb_ticks_per_jiffy; - char sign = '+'; - /* make sure tb_ticks_per_sec and tb_ticks_per_jiffy are consistent */ - new_tb_ticks_per_sec = new_tb_ticks_per_jiffy * HZ; - - if ( tick_diff < 0 ) { - tick_diff = -tick_diff; - sign = '-'; - } - if ( tick_diff ) { - if ( tick_diff < tb_ticks_per_jiffy/25 ) { - printk( "Titan recalibrate: new tb_ticks_per_jiffy = %lu (%c%ld)\n", - new_tb_ticks_per_jiffy, sign, tick_diff ); - tb_ticks_per_jiffy = new_tb_ticks_per_jiffy; - tb_ticks_per_sec = new_tb_ticks_per_sec; - div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres ); - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; - tb_to_xs = divres.result_low; - do_gtod.varp->tb_to_xs = tb_to_xs; - systemcfg->tb_ticks_per_sec = tb_ticks_per_sec; - systemcfg->tb_to_xs = tb_to_xs; - } - else { - printk( "Titan recalibrate: FAILED (difference > 4 percent)\n" - " new tb_ticks_per_jiffy = %lu\n" - " old tb_ticks_per_jiffy = %lu\n", - new_tb_ticks_per_jiffy, tb_ticks_per_jiffy ); - } - } - } - iSeries_recal_titan = titan; - iSeries_recal_tb = tb; + asm volatile("lbz %0,%1(13)" + : "=r" (x) + : "i" (offsetof(struct paca_struct, irq_work_pending))); + return x; } -#endif -/* - * For iSeries shared processors, we have to let the hypervisor - * set the hardware decrementer. 
We set a virtual decrementer - * in the lppaca and call the hypervisor if the virtual - * decrementer is less than the current value in the hardware - * decrementer. (almost always the new decrementer value will - * be greater than the current hardware decementer so the hypervisor - * call will not be needed) - */ +static inline void set_irq_work_pending_flag(void) +{ + asm volatile("stb %0,%1(13)" : : + "r" (1), + "i" (offsetof(struct paca_struct, irq_work_pending))); +} -/* - * timer_interrupt - gets called when the decrementer overflows, - * with interrupts disabled. - */ -void timer_interrupt(struct pt_regs * regs) +static inline void clear_irq_work_pending(void) { - int next_dec; - int cpu = smp_processor_id(); - unsigned long ticks; + asm volatile("stb %0,%1(13)" : : + "r" (0), + "i" (offsetof(struct paca_struct, irq_work_pending))); +} -#ifdef CONFIG_PPC32 - if (atomic_read(&ppc_n_lost_interrupts) != 0) - do_IRQ(regs); -#endif +#else /* 32-bit */ - irq_enter(); +DEFINE_PER_CPU(u8, irq_work_pending); - profile_tick(CPU_PROFILING, regs); +#define set_irq_work_pending_flag() __get_cpu_var(irq_work_pending) = 1 +#define test_irq_work_pending() __get_cpu_var(irq_work_pending) +#define clear_irq_work_pending() __get_cpu_var(irq_work_pending) = 0 -#ifdef CONFIG_PPC_ISERIES - get_paca()->lppaca.int_dword.fields.decr_int = 0; -#endif +#endif /* 32 vs 64 bit */ - while ((ticks = tb_ticks_since(per_cpu(last_jiffy, cpu))) - >= tb_ticks_per_jiffy) { - /* Update last_jiffy */ - per_cpu(last_jiffy, cpu) += tb_ticks_per_jiffy; - /* Handle RTCL overflow on 601 */ - if (__USE_RTC() && per_cpu(last_jiffy, cpu) >= 1000000000) - per_cpu(last_jiffy, cpu) -= 1000000000; - - /* - * We cannot disable the decrementer, so in the period - * between this cpu's being marked offline in cpu_online_map - * and calling stop-self, it is taking timer interrupts. - * Avoid calling into the scheduler rebalancing code if this - * is the case. 
- */ - if (!cpu_is_offline(cpu)) - update_process_times(user_mode(regs)); - - /* - * No need to check whether cpu is offline here; boot_cpuid - * should have been fixed up by now. - */ - if (cpu != boot_cpuid) - continue; +void arch_irq_work_raise(void) +{ + preempt_disable(); + set_irq_work_pending_flag(); + set_dec(1); + preempt_enable(); +} + +#else /* CONFIG_IRQ_WORK */ + +#define test_irq_work_pending() 0 +#define clear_irq_work_pending() + +#endif /* CONFIG_IRQ_WORK */ + +void __timer_interrupt(void) +{ + struct pt_regs *regs = get_irq_regs(); + u64 *next_tb = &__get_cpu_var(decrementers_next_tb); + struct clock_event_device *evt = &__get_cpu_var(decrementers); + u64 now; + + trace_timer_interrupt_entry(regs); - write_seqlock(&xtime_lock); - tb_last_jiffy += tb_ticks_per_jiffy; - tb_last_stamp = per_cpu(last_jiffy, cpu); - timer_recalc_offset(tb_last_jiffy); - do_timer(regs); - timer_sync_xtime(tb_last_jiffy); - timer_check_rtc(); - write_sequnlock(&xtime_lock); - if (adjusting_time && (time_adjust == 0)) - ppc_adjtimex(); + if (test_irq_work_pending()) { + clear_irq_work_pending(); + irq_work_run(); } - - next_dec = tb_ticks_per_jiffy - ticks; - set_dec(next_dec); -#ifdef CONFIG_PPC_ISERIES - if (hvlpevent_is_pending()) - process_hvlpevents(regs); -#endif + now = get_tb_or_rtc(); + if (now >= *next_tb) { + *next_tb = ~(u64)0; + if (evt->event_handler) + evt->event_handler(evt); + __get_cpu_var(irq_stat).timer_irqs_event++; + } else { + now = *next_tb - now; + if (now <= DECREMENTER_MAX) + set_dec((int)now); + /* We may have raced with new irq work */ + if (test_irq_work_pending()) + set_dec(1); + __get_cpu_var(irq_stat).timer_irqs_others++; + } #ifdef CONFIG_PPC64 /* collect purr register values often, for accurate calculations */ @@ -461,36 +517,93 @@ void timer_interrupt(struct pt_regs * regs) } #endif + trace_timer_interrupt_exit(regs); +} + +/* + * timer_interrupt - gets called when the decrementer overflows, + * with interrupts disabled. 
+ */ +void timer_interrupt(struct pt_regs * regs) +{ + struct pt_regs *old_regs; + u64 *next_tb = &__get_cpu_var(decrementers_next_tb); + + /* Ensure a positive value is written to the decrementer, or else + * some CPUs will continue to take decrementer exceptions. + */ + set_dec(DECREMENTER_MAX); + + /* Some implementations of hotplug will get timer interrupts while + * offline, just ignore these and we also need to set + * decrementers_next_tb as MAX to make sure __check_irq_replay + * don't replay timer interrupt when return, otherwise we'll trap + * here infinitely :( + */ + if (!cpu_online(smp_processor_id())) { + *next_tb = ~(u64)0; + return; + } + + /* Conditionally hard-enable interrupts now that the DEC has been + * bumped to its maximum value + */ + may_hard_irq_enable(); + + +#if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC) + if (atomic_read(&ppc_n_lost_interrupts) != 0) + do_IRQ(regs); +#endif + + old_regs = set_irq_regs(regs); + irq_enter(); + + __timer_interrupt(); irq_exit(); + set_irq_regs(old_regs); } -void wakeup_decrementer(void) +/* + * Hypervisor decrementer interrupts shouldn't occur but are sometimes + * left pending on exit from a KVM guest. We don't need to do anything + * to clear them, as they are edge-triggered. + */ +void hdec_interrupt(struct pt_regs *regs) { - int i; +} - set_dec(tb_ticks_per_jiffy); - /* - * We don't expect this to be called on a machine with a 601, - * so using get_tbl is fine. +#ifdef CONFIG_SUSPEND +static void generic_suspend_disable_irqs(void) +{ + /* Disable the decrementer, so that it doesn't interfere + * with suspending. 
*/ - tb_last_stamp = tb_last_jiffy = get_tb(); - for_each_cpu(i) - per_cpu(last_jiffy, i) = tb_last_stamp; + + set_dec(DECREMENTER_MAX); + local_irq_disable(); + set_dec(DECREMENTER_MAX); } -#ifdef CONFIG_SMP -void __init smp_space_timers(unsigned int max_cpus) +static void generic_suspend_enable_irqs(void) { - int i; - unsigned long offset = tb_ticks_per_jiffy / max_cpus; - unsigned long previous_tb = per_cpu(last_jiffy, boot_cpuid); + local_irq_enable(); +} - for_each_cpu(i) { - if (i != boot_cpuid) { - previous_tb += offset; - per_cpu(last_jiffy, i) = previous_tb; - } - } +/* Overrides the weak version in kernel/power/main.c */ +void arch_suspend_disable_irqs(void) +{ + if (ppc_md.suspend_disable_irqs) + ppc_md.suspend_disable_irqs(); + generic_suspend_disable_irqs(); +} + +/* Overrides the weak version in kernel/power/main.c */ +void arch_suspend_enable_irqs(void) +{ + generic_suspend_enable_irqs(); + if (ppc_md.suspend_enable_irqs) + ppc_md.suspend_enable_irqs(); } #endif @@ -505,177 +618,290 @@ unsigned long long sched_clock(void) { if (__USE_RTC()) return get_rtc(); - return mulhdu(get_tb(), tb_to_ns_scale) << tb_to_ns_shift; + return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift; } -int do_settimeofday(struct timespec *tv) +static int __init get_freq(char *name, int cells, unsigned long *val) { - time_t wtm_sec, new_sec = tv->tv_sec; - long wtm_nsec, new_nsec = tv->tv_nsec; - unsigned long flags; - long int tb_delta; - u64 new_xsec, tb_delta_xs; + struct device_node *cpu; + const __be32 *fp; + int found = 0; - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; + /* The cpu node should have timebase and clock frequency properties */ + cpu = of_find_node_by_type(NULL, "cpu"); - write_seqlock_irqsave(&xtime_lock, flags); + if (cpu) { + fp = of_get_property(cpu, name, NULL); + if (fp) { + found = 1; + *val = of_read_ulong(fp, cells); + } - /* - * Updating the RTC is not the job of this code. 
If the time is - * stepped under NTP, the RTC will be updated after STA_UNSYNC - * is cleared. Tools like clock/hwclock either copy the RTC - * to the system time, in which case there is no point in writing - * to the RTC again, or write to the RTC but then they don't call - * settimeofday to perform this operation. - */ -#ifdef CONFIG_PPC_ISERIES - if (first_settimeofday) { - iSeries_tb_recal(); - first_settimeofday = 0; + of_node_put(cpu); } -#endif - tb_delta = tb_ticks_since(tb_last_stamp); - tb_delta += (jiffies - wall_jiffies) * tb_ticks_per_jiffy; - tb_delta_xs = mulhdu(tb_delta, do_gtod.varp->tb_to_xs); - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - new_sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - new_nsec); + return found; +} - set_normalized_timespec(&xtime, new_sec, new_nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); +void start_cpu_decrementer(void) +{ +#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) + /* Clear any pending timer interrupts */ + mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS); - /* In case of a large backwards jump in time with NTP, we want the - * clock to be updated as soon as the PLL is again in lock. 
- */ - last_rtc_update = new_sec - 658; + /* Enable decrementer interrupt */ + mtspr(SPRN_TCR, TCR_DIE); +#endif /* defined(CONFIG_BOOKE) || defined(CONFIG_40x) */ +} - ntp_clear(); +void __init generic_calibrate_decr(void) +{ + ppc_tb_freq = DEFAULT_TB_FREQ; /* hardcoded default */ + + if (!get_freq("ibm,extended-timebase-frequency", 2, &ppc_tb_freq) && + !get_freq("timebase-frequency", 1, &ppc_tb_freq)) { - new_xsec = 0; - if (new_nsec != 0) { - new_xsec = (u64)new_nsec * XSEC_PER_SEC; - do_div(new_xsec, NSEC_PER_SEC); + printk(KERN_ERR "WARNING: Estimating decrementer frequency " + "(not found)\n"); } - new_xsec += (u64)new_sec * XSEC_PER_SEC - tb_delta_xs; - update_gtod(tb_last_jiffy, new_xsec, do_gtod.varp->tb_to_xs); -#ifdef CONFIG_PPC64 - systemcfg->tz_minuteswest = sys_tz.tz_minuteswest; - systemcfg->tz_dsttime = sys_tz.tz_dsttime; -#endif + ppc_proc_freq = DEFAULT_PROC_FREQ; /* hardcoded default */ - write_sequnlock_irqrestore(&xtime_lock, flags); - clock_was_set(); - return 0; + if (!get_freq("ibm,extended-clock-frequency", 2, &ppc_proc_freq) && + !get_freq("clock-frequency", 1, &ppc_proc_freq)) { + + printk(KERN_ERR "WARNING: Estimating processor frequency " + "(not found)\n"); + } } -EXPORT_SYMBOL(do_settimeofday); +int update_persistent_clock(struct timespec now) +{ + struct rtc_time tm; + + if (!ppc_md.set_rtc_time) + return -ENODEV; -void __init generic_calibrate_decr(void) + to_tm(now.tv_sec + 1 + timezone_offset, &tm); + tm.tm_year -= 1900; + tm.tm_mon -= 1; + + return ppc_md.set_rtc_time(&tm); +} + +static void __read_persistent_clock(struct timespec *ts) { - struct device_node *cpu; - unsigned int *fp; - int node_found; + struct rtc_time tm; + static int first = 1; + + ts->tv_nsec = 0; + /* XXX this is a litle fragile but will work okay in the short term */ + if (first) { + first = 0; + if (ppc_md.time_init) + timezone_offset = ppc_md.time_init(); + + /* get_boot_time() isn't guaranteed to be safe to call late */ + if (ppc_md.get_boot_time) { + 
ts->tv_sec = ppc_md.get_boot_time() - timezone_offset; + return; + } + } + if (!ppc_md.get_rtc_time) { + ts->tv_sec = 0; + return; + } + ppc_md.get_rtc_time(&tm); + + ts->tv_sec = mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec); +} + +void read_persistent_clock(struct timespec *ts) +{ + __read_persistent_clock(ts); + + /* Sanitize it in case real time clock is set below EPOCH */ + if (ts->tv_sec < 0) { + ts->tv_sec = 0; + ts->tv_nsec = 0; + } + +} + +/* clocksource code */ +static cycle_t rtc_read(struct clocksource *cs) +{ + return (cycle_t)get_rtc(); +} + +static cycle_t timebase_read(struct clocksource *cs) +{ + return (cycle_t)get_tb(); +} + +void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) +{ + u64 new_tb_to_xs, new_stamp_xsec; + u32 frac_sec; + + if (clock != &clocksource_timebase) + return; + + /* Make userspace gettimeofday spin until we're done. */ + ++vdso_data->tb_update_count; + smp_mb(); + + /* 19342813113834067 ~= 2^(20+64) / 1e9 */ + new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift); + new_stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC; + do_div(new_stamp_xsec, 1000000000); + new_stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC; + + BUG_ON(wall_time->tv_nsec >= NSEC_PER_SEC); + /* this is tv_nsec / 1e9 as a 0.32 fraction */ + frac_sec = ((u64) wall_time->tv_nsec * 18446744073ULL) >> 32; /* - * The cpu node should have a timebase-frequency property - * to tell us the rate at which the decrementer counts. + * tb_update_count is used to allow the userspace gettimeofday code + * to assure itself that it sees a consistent view of the tb_to_xs and + * stamp_xsec variables. It reads the tb_update_count, then reads + * tb_to_xs and stamp_xsec and then reads tb_update_count again. If + * the two values of tb_update_count match and are even then the + * tb_to_xs and stamp_xsec values are consistent. 
If not, then it + * loops back and reads them again until this criteria is met. + * We expect the caller to have done the first increment of + * vdso_data->tb_update_count already. */ - cpu = of_find_node_by_type(NULL, "cpu"); + vdso_data->tb_orig_stamp = clock->cycle_last; + vdso_data->stamp_xsec = new_stamp_xsec; + vdso_data->tb_to_xs = new_tb_to_xs; + vdso_data->wtom_clock_sec = wtm->tv_sec; + vdso_data->wtom_clock_nsec = wtm->tv_nsec; + vdso_data->stamp_xtime = *wall_time; + vdso_data->stamp_sec_fraction = frac_sec; + smp_wmb(); + ++(vdso_data->tb_update_count); +} - ppc_tb_freq = DEFAULT_TB_FREQ; /* hardcoded default */ - node_found = 0; - if (cpu != 0) { - fp = (unsigned int *)get_property(cpu, "timebase-frequency", - NULL); - if (fp != 0) { - node_found = 1; - ppc_tb_freq = *fp; - } - } - if (!node_found) - printk(KERN_ERR "WARNING: Estimating decrementer frequency " - "(not found)\n"); +void update_vsyscall_tz(void) +{ + vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; + vdso_data->tz_dsttime = sys_tz.tz_dsttime; +} - ppc_proc_freq = DEFAULT_PROC_FREQ; - node_found = 0; - if (cpu != 0) { - fp = (unsigned int *)get_property(cpu, "clock-frequency", - NULL); - if (fp != 0) { - node_found = 1; - ppc_proc_freq = *fp; - } +static void __init clocksource_init(void) +{ + struct clocksource *clock; + + if (__USE_RTC()) + clock = &clocksource_rtc; + else + clock = &clocksource_timebase; + + if (clocksource_register_hz(clock, tb_ticks_per_sec)) { + printk(KERN_ERR "clocksource: %s is already registered\n", + clock->name); + return; } -#ifdef CONFIG_BOOKE - /* Set the time base to zero */ - mtspr(SPRN_TBWL, 0); - mtspr(SPRN_TBWU, 0); - /* Clear any pending timer interrupts */ - mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS); + printk(KERN_INFO "clocksource: %s mult[%x] shift[%d] registered\n", + clock->name, clock->mult, clock->shift); +} - /* Enable decrementer interrupt */ - mtspr(SPRN_TCR, TCR_DIE); -#endif - if (!node_found) - printk(KERN_ERR "WARNING: 
Estimating processor frequency " - "(not found)\n"); +static int decrementer_set_next_event(unsigned long evt, + struct clock_event_device *dev) +{ + __get_cpu_var(decrementers_next_tb) = get_tb_or_rtc() + evt; + set_dec(evt); + + /* We may have raced with new irq work */ + if (test_irq_work_pending()) + set_dec(1); - of_node_put(cpu); + return 0; } -unsigned long get_boot_time(void) +static void decrementer_set_mode(enum clock_event_mode mode, + struct clock_event_device *dev) { - struct rtc_time tm; + if (mode != CLOCK_EVT_MODE_ONESHOT) + decrementer_set_next_event(DECREMENTER_MAX, dev); +} - if (ppc_md.get_boot_time) - return ppc_md.get_boot_time(); - if (!ppc_md.get_rtc_time) - return 0; - ppc_md.get_rtc_time(&tm); - return mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec); +/* Interrupt handler for the timer broadcast IPI */ +void tick_broadcast_ipi_handler(void) +{ + u64 *next_tb = &__get_cpu_var(decrementers_next_tb); + + *next_tb = get_tb_or_rtc(); + __timer_interrupt(); +} + +static void register_decrementer_clockevent(int cpu) +{ + struct clock_event_device *dec = &per_cpu(decrementers, cpu); + + *dec = decrementer_clockevent; + dec->cpumask = cpumask_of(cpu); + + printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n", + dec->name, dec->mult, dec->shift, cpu); + + clockevents_register_device(dec); +} + +static void __init init_decrementer_clockevent(void) +{ + int cpu = smp_processor_id(); + + clockevents_calc_mult_shift(&decrementer_clockevent, ppc_tb_freq, 4); + + decrementer_clockevent.max_delta_ns = + clockevent_delta2ns(DECREMENTER_MAX, &decrementer_clockevent); + decrementer_clockevent.min_delta_ns = + clockevent_delta2ns(2, &decrementer_clockevent); + + register_decrementer_clockevent(cpu); +} + +void secondary_cpu_time_init(void) +{ + /* Start the decrementer on CPUs that have manual control + * such as BookE + */ + start_cpu_decrementer(); + + /* FIME: Should make unrelatred change to move 
snapshot_timebase + * call here ! */ + register_decrementer_clockevent(smp_processor_id()); } /* This function is only called on the boot processor */ void __init time_init(void) { - unsigned long flags; - unsigned long tm = 0; struct div_result res; u64 scale; unsigned shift; - if (ppc_md.time_init != NULL) - timezone_offset = ppc_md.time_init(); - if (__USE_RTC()) { /* 601 processor: dec counts down by 128 every 128ns */ ppc_tb_freq = 1000000000; - tb_last_stamp = get_rtcl(); - tb_last_jiffy = tb_last_stamp; } else { /* Normal PowerPC with timebase register */ ppc_md.calibrate_decr(); - printk(KERN_INFO "time_init: decrementer frequency = %lu.%.6lu MHz\n", + printk(KERN_DEBUG "time_init: decrementer frequency = %lu.%.6lu MHz\n", ppc_tb_freq / 1000000, ppc_tb_freq % 1000000); - printk(KERN_INFO "time_init: processor frequency = %lu.%.6lu MHz\n", + printk(KERN_DEBUG "time_init: processor frequency = %lu.%.6lu MHz\n", ppc_proc_freq / 1000000, ppc_proc_freq % 1000000); - tb_last_stamp = tb_last_jiffy = get_tb(); } tb_ticks_per_jiffy = ppc_tb_freq / HZ; - tb_ticks_per_sec = tb_ticks_per_jiffy * HZ; + tb_ticks_per_sec = ppc_tb_freq; tb_ticks_per_usec = ppc_tb_freq / 1000000; - tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000); - div128_by_32(1024*1024, 0, tb_ticks_per_sec, &res); - tb_to_xs = res.result_low; - -#ifdef CONFIG_PPC64 - get_paca()->default_decr = tb_ticks_per_jiffy; -#endif + calc_cputime_factors(); + setup_cputime_one_jiffy(); /* * Compute scale factor for sched_clock. 
@@ -695,167 +921,28 @@ void __init time_init(void) } tb_to_ns_scale = scale; tb_to_ns_shift = shift; - -#ifdef CONFIG_PPC_ISERIES - if (!piranha_simulator) -#endif - tm = get_boot_time(); - - write_seqlock_irqsave(&xtime_lock, flags); - xtime.tv_sec = tm; - xtime.tv_nsec = 0; - do_gtod.varp = &do_gtod.vars[0]; - do_gtod.var_idx = 0; - do_gtod.varp->tb_orig_stamp = tb_last_jiffy; - __get_cpu_var(last_jiffy) = tb_last_stamp; - do_gtod.varp->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; - do_gtod.varp->tb_to_xs = tb_to_xs; - do_gtod.tb_to_us = tb_to_us; -#ifdef CONFIG_PPC64 - systemcfg->tb_orig_stamp = tb_last_jiffy; - systemcfg->tb_update_count = 0; - systemcfg->tb_ticks_per_sec = tb_ticks_per_sec; - systemcfg->stamp_xsec = xtime.tv_sec * XSEC_PER_SEC; - systemcfg->tb_to_xs = tb_to_xs; -#endif - - time_freq = 0; + /* Save the current timebase to pretty up CONFIG_PRINTK_TIME */ + boot_tb = get_tb_or_rtc(); /* If platform provided a timezone (pmac), we correct the time */ - if (timezone_offset) { + if (timezone_offset) { sys_tz.tz_minuteswest = -timezone_offset / 60; sys_tz.tz_dsttime = 0; - xtime.tv_sec -= timezone_offset; - } - - last_rtc_update = xtime.tv_sec; - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - write_sequnlock_irqrestore(&xtime_lock, flags); - - /* Not exact, but the timer interrupt takes care of this */ - set_dec(tb_ticks_per_jiffy); -} - -/* - * After adjtimex is called, adjust the conversion of tb ticks - * to microseconds to keep do_gettimeofday synchronized - * with ntpd. - * - * Use the time_adjust, time_freq and time_offset computed by adjtimex to - * adjust the frequency. 
- */ - -/* #define DEBUG_PPC_ADJTIMEX 1 */ - -void ppc_adjtimex(void) -{ -#ifdef CONFIG_PPC64 - unsigned long den, new_tb_ticks_per_sec, tb_ticks, old_xsec, - new_tb_to_xs, new_xsec, new_stamp_xsec; - unsigned long tb_ticks_per_sec_delta; - long delta_freq, ltemp; - struct div_result divres; - unsigned long flags; - long singleshot_ppm = 0; - - /* - * Compute parts per million frequency adjustment to - * accomplish the time adjustment implied by time_offset to be - * applied over the elapsed time indicated by time_constant. - * Use SHIFT_USEC to get it into the same units as - * time_freq. - */ - if ( time_offset < 0 ) { - ltemp = -time_offset; - ltemp <<= SHIFT_USEC - SHIFT_UPDATE; - ltemp >>= SHIFT_KG + time_constant; - ltemp = -ltemp; - } else { - ltemp = time_offset; - ltemp <<= SHIFT_USEC - SHIFT_UPDATE; - ltemp >>= SHIFT_KG + time_constant; - } - - /* If there is a single shot time adjustment in progress */ - if ( time_adjust ) { -#ifdef DEBUG_PPC_ADJTIMEX - printk("ppc_adjtimex: "); - if ( adjusting_time == 0 ) - printk("starting "); - printk("single shot time_adjust = %ld\n", time_adjust); -#endif - - adjusting_time = 1; - - /* - * Compute parts per million frequency adjustment - * to match time_adjust - */ - singleshot_ppm = tickadj * HZ; - /* - * The adjustment should be tickadj*HZ to match the code in - * linux/kernel/timer.c, but experiments show that this is too - * large. 
3/4 of tickadj*HZ seems about right - */ - singleshot_ppm -= singleshot_ppm / 4; - /* Use SHIFT_USEC to get it into the same units as time_freq */ - singleshot_ppm <<= SHIFT_USEC; - if ( time_adjust < 0 ) - singleshot_ppm = -singleshot_ppm; - } - else { -#ifdef DEBUG_PPC_ADJTIMEX - if ( adjusting_time ) - printk("ppc_adjtimex: ending single shot time_adjust\n"); -#endif - adjusting_time = 0; - } - - /* Add up all of the frequency adjustments */ - delta_freq = time_freq + ltemp + singleshot_ppm; - - /* - * Compute a new value for tb_ticks_per_sec based on - * the frequency adjustment - */ - den = 1000000 * (1 << (SHIFT_USEC - 8)); - if ( delta_freq < 0 ) { - tb_ticks_per_sec_delta = ( tb_ticks_per_sec * ( (-delta_freq) >> (SHIFT_USEC - 8))) / den; - new_tb_ticks_per_sec = tb_ticks_per_sec + tb_ticks_per_sec_delta; - } - else { - tb_ticks_per_sec_delta = ( tb_ticks_per_sec * ( delta_freq >> (SHIFT_USEC - 8))) / den; - new_tb_ticks_per_sec = tb_ticks_per_sec - tb_ticks_per_sec_delta; } - -#ifdef DEBUG_PPC_ADJTIMEX - printk("ppc_adjtimex: ltemp = %ld, time_freq = %ld, singleshot_ppm = %ld\n", ltemp, time_freq, singleshot_ppm); - printk("ppc_adjtimex: tb_ticks_per_sec - base = %ld new = %ld\n", tb_ticks_per_sec, new_tb_ticks_per_sec); -#endif - /* - * Compute a new value of tb_to_xs (used to convert tb to - * microseconds) and a new value of stamp_xsec which is the - * time (in 1/2^20 second units) corresponding to - * tb_orig_stamp. This new value of stamp_xsec compensates - * for the change in frequency (implied by the new tb_to_xs) - * which guarantees that the current time remains the same. 
- */ - write_seqlock_irqsave( &xtime_lock, flags ); - tb_ticks = get_tb() - do_gtod.varp->tb_orig_stamp; - div128_by_32(1024*1024, 0, new_tb_ticks_per_sec, &divres); - new_tb_to_xs = divres.result_low; - new_xsec = mulhdu(tb_ticks, new_tb_to_xs); + vdso_data->tb_update_count = 0; + vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; - old_xsec = mulhdu(tb_ticks, do_gtod.varp->tb_to_xs); - new_stamp_xsec = do_gtod.varp->stamp_xsec + old_xsec - new_xsec; + /* Start the decrementer on CPUs that have manual control + * such as BookE + */ + start_cpu_decrementer(); - update_gtod(do_gtod.varp->tb_orig_stamp, new_stamp_xsec, new_tb_to_xs); + /* Register the clocksource */ + clocksource_init(); - write_sequnlock_irqrestore( &xtime_lock, flags ); -#endif /* CONFIG_PPC64 */ + init_decrementer_clockevent(); + tick_setup_hrtimer_broadcast(); } @@ -938,39 +1025,6 @@ void to_tm(int tim, struct rtc_time * tm) GregorianDay(tm); } -/* Auxiliary function to compute scaling factors */ -/* Actually the choice of a timebase running at 1/4 the of the bus - * frequency giving resolution of a few tens of nanoseconds is quite nice. - * It makes this computation very precise (27-28 bits typically) which - * is optimistic considering the stability of most processor clock - * oscillators and the precision with which the timebase frequency - * is measured but does not harm. - */ -unsigned mulhwu_scale_factor(unsigned inscale, unsigned outscale) -{ - unsigned mlt=0, tmp, err; - /* No concern for performance, it's done once: use a stupid - * but safe and compact method to find the multiplier. - */ - - for (tmp = 1U<<31; tmp != 0; tmp >>= 1) { - if (mulhwu(inscale, mlt|tmp) < outscale) - mlt |= tmp; - } - - /* We might still be off by 1 for the best approximation. - * A side effect of this is that if outscale is too large - * the returned value will be zero. - * Many corner cases have been checked and seem to work, - * some might have been forgotten in the test however. 
- */ - - err = inscale * (mlt+1); - if (err <= inscale/2) - mlt++; - return mlt; -} - /* * Divide a 128-bit dividend by a 32-bit divisor, leaving a 128 bit * result. @@ -1003,3 +1057,26 @@ void div128_by_32(u64 dividend_high, u64 dividend_low, dr->result_low = ((u64)y << 32) + z; } + +/* We don't need to calibrate delay, we use the CPU timebase for that */ +void calibrate_delay(void) +{ + /* Some generic code (such as spinlock debug) use loops_per_jiffy + * as the number of __delay(1) in a jiffy, so make it so + */ + loops_per_jiffy = tb_ticks_per_jiffy; +} + +static int __init rtc_init(void) +{ + struct platform_device *pdev; + + if (!ppc_md.get_rtc_time) + return -ENODEV; + + pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0); + + return PTR_ERR_OR_ZERO(pdev); +} + +module_init(rtc_init); diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S new file mode 100644 index 00000000000..2a324f4cb1b --- /dev/null +++ b/arch/powerpc/kernel/tm.S @@ -0,0 +1,481 @@ +/* + * Transactional memory support routines to reclaim and recheckpoint + * transactional process state. + * + * Copyright 2012 Matt Evans & Michael Neuling, IBM Corporation. 
+ */ + +#include <asm/asm-offsets.h> +#include <asm/ppc_asm.h> +#include <asm/ppc-opcode.h> +#include <asm/ptrace.h> +#include <asm/reg.h> +#include <asm/bug.h> + +#ifdef CONFIG_VSX +/* See fpu.S, this is borrowed from there */ +#define __SAVE_32FPRS_VSRS(n,c,base) \ +BEGIN_FTR_SECTION \ + b 2f; \ +END_FTR_SECTION_IFSET(CPU_FTR_VSX); \ + SAVE_32FPRS(n,base); \ + b 3f; \ +2: SAVE_32VSRS(n,c,base); \ +3: +#define __REST_32FPRS_VSRS(n,c,base) \ +BEGIN_FTR_SECTION \ + b 2f; \ +END_FTR_SECTION_IFSET(CPU_FTR_VSX); \ + REST_32FPRS(n,base); \ + b 3f; \ +2: REST_32VSRS(n,c,base); \ +3: +#else +#define __SAVE_32FPRS_VSRS(n,c,base) SAVE_32FPRS(n, base) +#define __REST_32FPRS_VSRS(n,c,base) REST_32FPRS(n, base) +#endif +#define SAVE_32FPRS_VSRS(n,c,base) \ + __SAVE_32FPRS_VSRS(n,__REG_##c,__REG_##base) +#define REST_32FPRS_VSRS(n,c,base) \ + __REST_32FPRS_VSRS(n,__REG_##c,__REG_##base) + +/* Stack frame offsets for local variables. */ +#define TM_FRAME_L0 TM_FRAME_SIZE-16 +#define TM_FRAME_L1 TM_FRAME_SIZE-8 + + +/* In order to access the TM SPRs, TM must be enabled. So, do so: */ +_GLOBAL(tm_enable) + mfmsr r4 + li r3, MSR_TM >> 32 + sldi r3, r3, 32 + and. r0, r4, r3 + bne 1f + or r4, r4, r3 + mtmsrd r4 +1: blr + +_GLOBAL(tm_save_sprs) + mfspr r0, SPRN_TFHAR + std r0, THREAD_TM_TFHAR(r3) + mfspr r0, SPRN_TEXASR + std r0, THREAD_TM_TEXASR(r3) + mfspr r0, SPRN_TFIAR + std r0, THREAD_TM_TFIAR(r3) + blr + +_GLOBAL(tm_restore_sprs) + ld r0, THREAD_TM_TFHAR(r3) + mtspr SPRN_TFHAR, r0 + ld r0, THREAD_TM_TEXASR(r3) + mtspr SPRN_TEXASR, r0 + ld r0, THREAD_TM_TFIAR(r3) + mtspr SPRN_TFIAR, r0 + blr + + /* Passed an 8-bit failure cause as first argument. */ +_GLOBAL(tm_abort) + TABORT(R3) + blr + +/* void tm_reclaim(struct thread_struct *thread, + * unsigned long orig_msr, + * uint8_t cause) + * + * - Performs a full reclaim. This destroys outstanding + * transactions and updates thread->regs.tm_ckpt_* with the + * original checkpointed state. Note that thread->regs is + * unchanged. 
+ * - FP regs are written back to thread->transact_fpr before + * reclaiming. These are the transactional (current) versions. + * + * Purpose is to both abort transactions of, and preserve the state of, + * a transaction at a context switch. We preserve/restore both sets of process + * state to restore them when the thread's scheduled again. We continue in + * userland as though nothing happened, but when the transaction is resumed + * they will abort back to the checkpointed state we save out here. + * + * Call with IRQs off, stacks get all out of sync for some periods in here! + */ +_GLOBAL(tm_reclaim) + mfcr r6 + mflr r0 + stw r6, 8(r1) + std r0, 16(r1) + std r2, STK_GOT(r1) + stdu r1, -TM_FRAME_SIZE(r1) + + /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. */ + + std r3, STK_PARAM(R3)(r1) + SAVE_NVGPRS(r1) + + /* We need to setup MSR for VSX register save instructions. Here we + * also clear the MSR RI since when we do the treclaim, we won't have a + * valid kernel pointer for a while. We clear RI here as it avoids + * adding another mtmsr closer to the treclaim. This makes the region + * marked as non-recoverable wider than it needs to be but it saves on + * inserting another mtmsrd later. + */ + mfmsr r14 + mr r15, r14 + ori r15, r15, MSR_FP + li r16, MSR_RI + ori r16, r16, MSR_EE /* IRQs hard off */ + andc r15, r15, r16 + oris r15, r15, MSR_VEC@h +#ifdef CONFIG_VSX + BEGIN_FTR_SECTION + oris r15,r15, MSR_VSX@h + END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif + mtmsrd r15 + std r14, TM_FRAME_L0(r1) + + /* Stash the stack pointer away for use after reclaim */ + std r1, PACAR1(r13) + + /* ******************** FPR/VR/VSRs ************ + * Before reclaiming, capture the current/transactional FPR/VR + * versions /if used/. + * + * (If VSX used, FP and VMX are implied. Or, we don't need to look + * at MSR.VSX as copying FP regs if .FP, vector regs if .VMX covers it.) + * + * We're passed the thread's MSR as parameter 2.
+ * + * We enabled VEC/FP/VSX in the msr above, so we can execute these + * instructions! + */ + andis. r0, r4, MSR_VEC@h + beq dont_backup_vec + + addi r7, r3, THREAD_TRANSACT_VRSTATE + SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 transact vr state */ + mfvscr vr0 + li r6, VRSTATE_VSCR + stvx vr0, r7, r6 +dont_backup_vec: + mfspr r0, SPRN_VRSAVE + std r0, THREAD_TRANSACT_VRSAVE(r3) + + andi. r0, r4, MSR_FP + beq dont_backup_fp + + addi r7, r3, THREAD_TRANSACT_FPSTATE + SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 transact fp state */ + + mffs fr0 + stfd fr0,FPSTATE_FPSCR(r7) + +dont_backup_fp: + /* Do sanity check on MSR to make sure we are suspended */ + li r7, (MSR_TS_S)@higher + srdi r6, r14, 32 + and r6, r6, r7 +1: tdeqi r6, 0 + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0 + + /* The moment we treclaim, ALL of our GPRs will switch + * to user register state. (FPRs, CCR etc. also!) + * Use an sprg and a tm_scratch in the PACA to shuffle. + */ + TRECLAIM(R5) /* Cause in r5 */ + + /* ******************** GPRs ******************** */ + /* Stash the checkpointed r13 away in the scratch SPR and get the real + * paca + */ + SET_SCRATCH0(r13) + GET_PACA(r13) + + /* Stash the checkpointed r1 away in paca tm_scratch and get the real + * stack pointer back + */ + std r1, PACATMSCRATCH(r13) + ld r1, PACAR1(r13) + + /* Store the PPR in r11 and reset to decent value */ + std r11, GPR11(r1) /* Temporary stash */ + mfspr r11, SPRN_PPR + HMT_MEDIUM + + /* Now get some more GPRS free */ + std r7, GPR7(r1) /* Temporary stash */ + std r12, GPR12(r1) /* '' '' '' */ + ld r12, STK_PARAM(R3)(r1) /* Param 0, thread_struct * */ + + std r11, THREAD_TM_PPR(r12) /* Store PPR and free r11 */ + + addi r7, r12, PT_CKPT_REGS /* Thread's ckpt_regs */ + + /* Make r7 look like an exception frame so that we + * can use the neat GPRx(n) macros. r7 is NOT a pt_regs ptr! 
+ */ + subi r7, r7, STACK_FRAME_OVERHEAD + + /* Sync the userland GPRs 2-12, 14-31 to thread->regs: */ + SAVE_GPR(0, r7) /* user r0 */ + SAVE_GPR(2, r7) /* user r2 */ + SAVE_4GPRS(3, r7) /* user r3-r6 */ + SAVE_GPR(8, r7) /* user r8 */ + SAVE_GPR(9, r7) /* user r9 */ + SAVE_GPR(10, r7) /* user r10 */ + ld r3, PACATMSCRATCH(r13) /* user r1 */ + ld r4, GPR7(r1) /* user r7 */ + ld r5, GPR11(r1) /* user r11 */ + ld r6, GPR12(r1) /* user r12 */ + GET_SCRATCH0(8) /* user r13 */ + std r3, GPR1(r7) + std r4, GPR7(r7) + std r5, GPR11(r7) + std r6, GPR12(r7) + std r8, GPR13(r7) + + SAVE_NVGPRS(r7) /* user r14-r31 */ + + /* ******************** NIP ******************** */ + mfspr r3, SPRN_TFHAR + std r3, _NIP(r7) /* Returns to failhandler */ + /* The checkpointed NIP is ignored when rescheduling/rechkpting, + * but is used in signal return to 'wind back' to the abort handler. + */ + + /* ******************** CR,LR,CCR,MSR ********** */ + mfctr r3 + mflr r4 + mfcr r5 + mfxer r6 + + std r3, _CTR(r7) + std r4, _LINK(r7) + std r5, _CCR(r7) + std r6, _XER(r7) + + + /* ******************** TAR, DSCR ********** */ + mfspr r3, SPRN_TAR + mfspr r4, SPRN_DSCR + + std r3, THREAD_TM_TAR(r12) + std r4, THREAD_TM_DSCR(r12) + + /* MSR and flags: We don't change CRs, and we don't need to alter + * MSR. + */ + + /* TM regs, incl TEXASR -- these live in thread_struct. Note they've + * been updated by the treclaim, to explain to userland the failure + * cause (aborted). + */ + mfspr r0, SPRN_TEXASR + mfspr r3, SPRN_TFHAR + mfspr r4, SPRN_TFIAR + std r0, THREAD_TM_TEXASR(r12) + std r3, THREAD_TM_TFHAR(r12) + std r4, THREAD_TM_TFIAR(r12) + + /* AMR is checkpointed too, but is unsupported by Linux. 
*/ + + /* Restore original MSR/IRQ state & clear TM mode */ + ld r14, TM_FRAME_L0(r1) /* Orig MSR */ + li r15, 0 + rldimi r14, r15, MSR_TS_LG, (63-MSR_TS_LG)-1 + mtmsrd r14 + + REST_NVGPRS(r1) + + addi r1, r1, TM_FRAME_SIZE + lwz r4, 8(r1) + ld r0, 16(r1) + mtcr r4 + mtlr r0 + ld r2, STK_GOT(r1) + + /* Load CPU's default DSCR */ + ld r0, PACA_DSCR(r13) + mtspr SPRN_DSCR, r0 + + blr + + + /* void tm_recheckpoint(struct thread_struct *thread, + * unsigned long orig_msr) + * - Restore the checkpointed register state saved by tm_reclaim + * when we switch_to a process. + * + * Call with IRQs off, stacks get all out of sync for + * some periods in here! + */ +_GLOBAL(__tm_recheckpoint) + mfcr r5 + mflr r0 + stw r5, 8(r1) + std r0, 16(r1) + std r2, STK_GOT(r1) + stdu r1, -TM_FRAME_SIZE(r1) + + /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. + * This is used for backing up the NVGPRs: + */ + SAVE_NVGPRS(r1) + + /* Load complete register state from ts_ckpt* registers */ + + addi r7, r3, PT_CKPT_REGS /* Thread's ckpt_regs */ + + /* Make r7 look like an exception frame so that we + * can use the neat GPRx(n) macros. r7 is now NOT a pt_regs ptr! + */ + subi r7, r7, STACK_FRAME_OVERHEAD + + SET_SCRATCH0(r1) + + mfmsr r6 + /* R4 = original MSR to indicate whether thread used FP/Vector etc. */ + + /* Enable FP/vec in MSR if necessary! */ + lis r5, MSR_VEC@h + ori r5, r5, MSR_FP + and. r5, r4, r5 + beq restore_gprs /* if neither, skip both */ + +#ifdef CONFIG_VSX + BEGIN_FTR_SECTION + oris r5, r5, MSR_VSX@h + END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif + or r5, r6, r5 /* Set MSR.FP+.VSX/.VEC */ + mtmsr r5 + +#ifdef CONFIG_ALTIVEC + /* FP and VEC registers: These are recheckpointed from thread.fpr[] + * and thread.vr[] respectively. The thread.transact_fpr[] version + * is more modern, and will be loaded subsequently by any FPUnavailable + * trap. + */ + andis. 
r0, r4, MSR_VEC@h + beq dont_restore_vec + + addi r8, r3, THREAD_VRSTATE + li r5, VRSTATE_VSCR + lvx vr0, r8, r5 + mtvscr vr0 + REST_32VRS(0, r5, r8) /* r5 scratch, r8 ptr */ +dont_restore_vec: + ld r5, THREAD_VRSAVE(r3) + mtspr SPRN_VRSAVE, r5 +#endif + + andi. r0, r4, MSR_FP + beq dont_restore_fp + + addi r8, r3, THREAD_FPSTATE + lfd fr0, FPSTATE_FPSCR(r8) + MTFSF_L(fr0) + REST_32FPRS_VSRS(0, R4, R8) + +dont_restore_fp: + mtmsr r6 /* FP/Vec off again! */ + +restore_gprs: + + /* ******************** CR,LR,CCR,MSR ********** */ + ld r4, _CTR(r7) + ld r5, _LINK(r7) + ld r8, _XER(r7) + + mtctr r4 + mtlr r5 + mtxer r8 + + /* ******************** TAR ******************** */ + ld r4, THREAD_TM_TAR(r3) + mtspr SPRN_TAR, r4 + + /* Load up the PPR and DSCR in GPRs only at this stage */ + ld r5, THREAD_TM_DSCR(r3) + ld r6, THREAD_TM_PPR(r3) + + /* Clear the MSR RI since we are about to change R1. EE is already off + */ + li r4, 0 + mtmsrd r4, 1 + + REST_GPR(0, r7) /* GPR0 */ + REST_2GPRS(2, r7) /* GPR2-3 */ + REST_GPR(4, r7) /* GPR4 */ + REST_4GPRS(8, r7) /* GPR8-11 */ + REST_2GPRS(12, r7) /* GPR12-13 */ + + REST_NVGPRS(r7) /* GPR14-31 */ + + /* Load up PPR and DSCR here so we don't run with user values for long + */ + mtspr SPRN_DSCR, r5 + mtspr SPRN_PPR, r6 + + /* Do final sanity check on TEXASR to make sure FS is set. 
Do this + * here before we load up the userspace r1 so any bugs we hit will get + * a call chain */ + mfspr r5, SPRN_TEXASR + srdi r5, r5, 16 + li r6, (TEXASR_FS)@h + and r6, r6, r5 +1: tdeqi r6, 0 + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0 + + /* Do final sanity check on MSR to make sure we are not transactional + * or suspended + */ + mfmsr r6 + li r5, (MSR_TS_MASK)@higher + srdi r6, r6, 32 + and r6, r6, r5 +1: tdnei r6, 0 + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0 + + /* Restore CR */ + ld r6, _CCR(r7) + mtcr r6 + + REST_GPR(1, r7) /* GPR1 */ + REST_GPR(5, r7) /* GPR5-7 */ + REST_GPR(6, r7) + ld r7, GPR7(r7) + + /* Commit register state as checkpointed state: */ + TRECHKPT + + HMT_MEDIUM + + /* Our transactional state has now changed. + * + * Now just get out of here. Transactional (current) state will be + * updated once restore is called on the return path in the _switch-ed + * -to process. + */ + + GET_PACA(r13) + GET_SCRATCH0(r1) + + /* R1 is restored, so we are recoverable again. EE is still off */ + li r4, MSR_RI + mtmsrd r4, 1 + + REST_NVGPRS(r1) + + addi r1, r1, TM_FRAME_SIZE + lwz r4, 8(r1) + ld r0, 16(r1) + mtcr r4 + mtlr r0 + ld r2, STK_GOT(r1) + + /* Load CPU's default DSCR */ + ld r0, PACA_DSCR(r13) + mtspr SPRN_DSCR, r0 + + blr + + /* ****************************************************************** */ diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 5d638ecddbd..239f1cde3ff 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1,5 +1,6 @@ /* * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * Copyright 2007-2010 Freescale Semiconductor, Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -14,7 +15,6 @@ * This file handles the architecture-dependent parts of hardware exceptions */ -#include <linux/config.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -22,203 +22,260 @@ #include <linux/stddef.h> #include <linux/unistd.h> #include <linux/ptrace.h> -#include <linux/slab.h> #include <linux/user.h> -#include <linux/a.out.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/module.h> #include <linux/prctl.h> #include <linux/delay.h> #include <linux/kprobes.h> - -#include <asm/kdebug.h> +#include <linux/kexec.h> +#include <linux/backlight.h> +#include <linux/bug.h> +#include <linux/kdebug.h> +#include <linux/debugfs.h> +#include <linux/ratelimit.h> +#include <linux/context_tracking.h> + +#include <asm/emulated_ops.h> #include <asm/pgtable.h> #include <asm/uaccess.h> -#include <asm/system.h> #include <asm/io.h> #include <asm/machdep.h> #include <asm/rtas.h> -#include <asm/xmon.h> #include <asm/pmc.h> -#ifdef CONFIG_PPC32 #include <asm/reg.h> -#endif #ifdef CONFIG_PMAC_BACKLIGHT #include <asm/backlight.h> #endif #ifdef CONFIG_PPC64 #include <asm/firmware.h> #include <asm/processor.h> -#include <asm/systemcfg.h> -#endif - -#ifdef CONFIG_PPC64 /* XXX */ -#define _IO_BASE pci_io_base +#include <asm/tm.h> #endif - -#ifdef CONFIG_DEBUGGER -int (*__debugger)(struct pt_regs *regs); -int (*__debugger_ipi)(struct pt_regs *regs); -int (*__debugger_bpt)(struct pt_regs *regs); -int (*__debugger_sstep)(struct pt_regs *regs); -int (*__debugger_iabr_match)(struct pt_regs *regs); -int (*__debugger_dabr_match)(struct pt_regs *regs); -int (*__debugger_fault_handler)(struct pt_regs *regs); +#include <asm/kexec.h> +#include <asm/ppc-opcode.h> +#include <asm/rio.h> +#include <asm/fadump.h> +#include <asm/switch_to.h> +#include <asm/tm.h> +#include <asm/debug.h> +#include <sysdev/fsl_pci.h> + +#if 
defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC) +int (*__debugger)(struct pt_regs *regs) __read_mostly; +int (*__debugger_ipi)(struct pt_regs *regs) __read_mostly; +int (*__debugger_bpt)(struct pt_regs *regs) __read_mostly; +int (*__debugger_sstep)(struct pt_regs *regs) __read_mostly; +int (*__debugger_iabr_match)(struct pt_regs *regs) __read_mostly; +int (*__debugger_break_match)(struct pt_regs *regs) __read_mostly; +int (*__debugger_fault_handler)(struct pt_regs *regs) __read_mostly; EXPORT_SYMBOL(__debugger); EXPORT_SYMBOL(__debugger_ipi); EXPORT_SYMBOL(__debugger_bpt); EXPORT_SYMBOL(__debugger_sstep); EXPORT_SYMBOL(__debugger_iabr_match); -EXPORT_SYMBOL(__debugger_dabr_match); +EXPORT_SYMBOL(__debugger_break_match); EXPORT_SYMBOL(__debugger_fault_handler); #endif -struct notifier_block *powerpc_die_chain; -static DEFINE_SPINLOCK(die_notifier_lock); - -int register_die_notifier(struct notifier_block *nb) -{ - int err = 0; - unsigned long flags; - - spin_lock_irqsave(&die_notifier_lock, flags); - err = notifier_chain_register(&powerpc_die_chain, nb); - spin_unlock_irqrestore(&die_notifier_lock, flags); - return err; -} +/* Transactional Memory trap debug */ +#ifdef TM_DEBUG_SW +#define TM_DEBUG(x...) printk(KERN_INFO x) +#else +#define TM_DEBUG(x...) 
do { } while(0) +#endif /* * Trap & Exception support */ -static DEFINE_SPINLOCK(die_lock); +#ifdef CONFIG_PMAC_BACKLIGHT +static void pmac_backlight_unblank(void) +{ + mutex_lock(&pmac_backlight_mutex); + if (pmac_backlight) { + struct backlight_properties *props; + + props = &pmac_backlight->props; + props->brightness = props->max_brightness; + props->power = FB_BLANK_UNBLANK; + backlight_update_status(pmac_backlight); + } + mutex_unlock(&pmac_backlight_mutex); +} +#else +static inline void pmac_backlight_unblank(void) { } +#endif + +static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; +static int die_counter; -int die(const char *str, struct pt_regs *regs, long err) +static unsigned __kprobes long oops_begin(struct pt_regs *regs) { - static int die_counter; - int nl = 0; + int cpu; + unsigned long flags; if (debugger(regs)) return 1; + oops_enter(); + + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!arch_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + arch_spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; console_verbose(); - spin_lock_irq(&die_lock); bust_spinlocks(1); -#ifdef CONFIG_PMAC_BACKLIGHT - if (_machine == _MACH_Pmac) { - set_backlight_enable(1); - set_backlight_level(BACKLIGHT_MAX); + if (machine_is(powermac)) + pmac_backlight_unblank(); + return flags; +} + +static void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, + int signr) +{ + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); + die_nest_count--; + oops_exit(); + printk("\n"); + if (!die_nest_count) + /* Nest count reaches zero, release the lock. 
*/ + arch_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + + crash_fadump(regs, "die oops"); + + /* + * A system reset (0x100) is a request to dump, so we always send + * it through the crashdump code. + */ + if (kexec_should_crash(current) || (TRAP(regs) == 0x100)) { + crash_kexec(regs); + + /* + * We aren't the primary crash CPU. We need to send it + * to a holding pattern to avoid it ending up in the panic + * code. + */ + crash_kexec_secondary(regs); } -#endif + + if (!signr) + return; + + /* + * While our oops output is serialised by a spinlock, output + * from panic() called below can race and corrupt it. If we + * know we are going to panic, delay for 1 second so we have a + * chance to get clean backtraces from all CPUs that are oopsing. + */ + if (in_interrupt() || panic_on_oops || !current->pid || + is_global_init(current)) { + mdelay(MSEC_PER_SEC); + } + + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + do_exit(signr); +} + +static int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); - nl = 1; #endif #ifdef CONFIG_SMP printk("SMP NR_CPUS=%d ", NR_CPUS); - nl = 1; #endif #ifdef CONFIG_DEBUG_PAGEALLOC printk("DEBUG_PAGEALLOC "); - nl = 1; #endif #ifdef CONFIG_NUMA printk("NUMA "); - nl = 1; #endif -#ifdef CONFIG_PPC64 - switch (systemcfg->platform) { - case PLATFORM_PSERIES: - printk("PSERIES "); - nl = 1; - break; - case PLATFORM_PSERIES_LPAR: - printk("PSERIES LPAR "); - nl = 1; - break; - case PLATFORM_ISERIES_LPAR: - printk("ISERIES LPAR "); - nl = 1; - break; - case PLATFORM_POWERMAC: - printk("POWERMAC "); - nl = 1; - break; - case PLATFORM_BPA: - printk("BPA "); - nl = 1; - break; - } -#endif - if (nl) - printk("\n"); + printk("%s\n", ppc_md.name ? 
ppc_md.name : ""); + + if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP) + return 1; + print_modules(); show_regs(regs); - bust_spinlocks(0); - spin_unlock_irq(&die_lock); - if (in_interrupt()) - panic("Fatal exception in interrupt"); + return 0; +} - if (panic_on_oops) { -#ifdef CONFIG_PPC64 - printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n"); - ssleep(5); -#endif - panic("Fatal exception"); - } - do_exit(err); +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(regs); - return 0; + if (__die(str, regs, err)) + err = 0; + oops_end(flags, regs, err); +} + +void user_single_step_siginfo(struct task_struct *tsk, + struct pt_regs *regs, siginfo_t *info) +{ + memset(info, 0, sizeof(*info)); + info->si_signo = SIGTRAP; + info->si_code = TRAP_TRACE; + info->si_addr = (void __user *)regs->nip; } void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) { siginfo_t info; + const char fmt32[] = KERN_INFO "%s[%d]: unhandled signal %d " \ + "at %08lx nip %08lx lr %08lx code %x\n"; + const char fmt64[] = KERN_INFO "%s[%d]: unhandled signal %d " \ + "at %016lx nip %016lx lr %016lx code %x\n"; if (!user_mode(regs)) { - if (die("Exception in kernel mode", regs, signr)) - return; + die("Exception in kernel mode", regs, signr); + return; + } + + if (show_unhandled_signals && unhandled_signal(current, signr)) { + printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32, + current->comm, current->pid, signr, + addr, regs->nip, regs->link, code); } + if (arch_irqs_disabled() && !arch_irq_disabled_regs(regs)) + local_irq_enable(); + + current->thread.trap_nr = code; memset(&info, 0, sizeof(info)); info.si_signo = signr; info.si_code = code; info.si_addr = (void __user *) addr; force_sig_info(signr, &info, current); - - /* - * Init gets no signals that it doesn't have a handler for. 
- * That's all very well, but if it has caused a synchronous - * exception and we ignore the resulting signal, it will just - * generate the same exception over and over again and we get - * nowhere. Better to kill it and let the kernel panic. - */ - if (current->pid == 1) { - __sighandler_t handler; - - spin_lock_irq(&current->sighand->siglock); - handler = current->sighand->action[signr-1].sa.sa_handler; - spin_unlock_irq(&current->sighand->siglock); - if (handler == SIG_DFL) { - /* init has generated a synchronous exception - and it doesn't have a handler for the signal */ - printk(KERN_CRIT "init has generated signal %d " - "but has no handler for it\n", signr); - do_exit(signr); - } - } } #ifdef CONFIG_PPC64 void system_reset_exception(struct pt_regs *regs) { /* See if any machine dependent calls */ - if (ppc_md.system_reset_exception) - ppc_md.system_reset_exception(regs); + if (ppc_md.system_reset_exception) { + if (ppc_md.system_reset_exception(regs)) + return; + } die("System Reset", regs, SIGABRT); @@ -228,6 +285,23 @@ void system_reset_exception(struct pt_regs *regs) /* What should we do here? We could issue a shutdown or hard reset. */ } + +/* + * This function is called in real mode. Strictly no printk's please. + * + * regs->nip and regs->msr contain srr0 and srr1.
+ */ +long machine_check_early(struct pt_regs *regs) +{ + long handled = 0; + + __get_cpu_var(irq_stat).mce_exceptions++; + + if (cur_cpu_spec && cur_cpu_spec->machine_check_early) + handled = cur_cpu_spec->machine_check_early(regs); + return handled; +} + #endif /* @@ -242,7 +316,7 @@ void system_reset_exception(struct pt_regs *regs) */ static inline int check_io_access(struct pt_regs *regs) { -#ifdef CONFIG_PPC_PMAC +#ifdef CONFIG_PPC32 unsigned long msr = regs->msr; const struct exception_table_entry *entry; unsigned int *nip = (unsigned int *)regs->nip; @@ -275,11 +349,11 @@ static inline int check_io_access(struct pt_regs *regs) return 1; } } -#endif /* CONFIG_PPC_PMAC */ +#endif /* CONFIG_PPC32 */ return 0; } -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) +#ifdef CONFIG_PPC_ADV_DEBUG_REGS /* On 4xx, the reason for the machine check or program exception is in the ESR. */ #define get_reason(regs) ((regs)->dsisr) @@ -294,14 +368,15 @@ static inline int check_io_access(struct pt_regs *regs) #define REASON_TRAP ESR_PTR /* single-step stuff */ -#define single_stepping(regs) (current->thread.dbcr0 & DBCR0_IC) -#define clear_single_step(regs) (current->thread.dbcr0 &= ~DBCR0_IC) +#define single_stepping(regs) (current->thread.debug.dbcr0 & DBCR0_IC) +#define clear_single_step(regs) (current->thread.debug.dbcr0 &= ~DBCR0_IC) #else /* On non-4xx, the reason for the machine check or program exception is in the MSR. 
*/ #define get_reason(regs) ((regs)->msr) #define get_mc_reason(regs) ((regs)->msr) +#define REASON_TM 0x200000 #define REASON_FP 0x100000 #define REASON_ILLEGAL 0x80000 #define REASON_PRIVILEGED 0x40000 @@ -311,57 +386,25 @@ static inline int check_io_access(struct pt_regs *regs) #define clear_single_step(regs) ((regs)->msr &= ~MSR_SE) #endif -/* - * This is "fall-back" implementation for configurations - * which don't provide platform-specific machine check info - */ -void __attribute__ ((weak)) -platform_machine_check(struct pt_regs *regs) -{ -} - -void machine_check_exception(struct pt_regs *regs) +#if defined(CONFIG_4xx) +int machine_check_4xx(struct pt_regs *regs) { -#ifdef CONFIG_PPC64 - int recover = 0; - - /* See if any machine dependent calls */ - if (ppc_md.machine_check_exception) - recover = ppc_md.machine_check_exception(regs); - - if (recover) - return; -#else unsigned long reason = get_mc_reason(regs); - if (user_mode(regs)) { - regs->msr |= MSR_RI; - _exception(SIGBUS, regs, BUS_ADRERR, regs->nip); - return; - } - -#if defined(CONFIG_8xx) && defined(CONFIG_PCI) - /* the qspan pci read routines can cause machine checks -- Cort */ - bad_page_fault(regs, regs->dar, SIGBUS); - return; -#endif - - if (debugger_fault_handler(regs)) { - regs->msr |= MSR_RI; - return; - } - - if (check_io_access(regs)) - return; - -#if defined(CONFIG_4xx) && !defined(CONFIG_440A) if (reason & ESR_IMCP) { printk("Instruction"); mtspr(SPRN_ESR, reason & ~ESR_IMCP); } else printk("Data"); printk(" machine check in kernel mode.\n"); -#elif defined(CONFIG_440A) + + return 0; +} + +int machine_check_440A(struct pt_regs *regs) +{ + unsigned long reason = get_mc_reason(regs); + printk("Machine check in kernel mode.\n"); if (reason & ESR_IMCP){ printk("Instruction Synchronous Machine Check exception\n"); @@ -391,7 +434,159 @@ void machine_check_exception(struct pt_regs *regs) /* Clear MCSR */ mtspr(SPRN_MCSR, mcsr); } -#elif defined (CONFIG_E500) + return 0; +} + +int 
machine_check_47x(struct pt_regs *regs) +{ + unsigned long reason = get_mc_reason(regs); + u32 mcsr; + + printk(KERN_ERR "Machine check in kernel mode.\n"); + if (reason & ESR_IMCP) { + printk(KERN_ERR + "Instruction Synchronous Machine Check exception\n"); + mtspr(SPRN_ESR, reason & ~ESR_IMCP); + return 0; + } + mcsr = mfspr(SPRN_MCSR); + if (mcsr & MCSR_IB) + printk(KERN_ERR "Instruction Read PLB Error\n"); + if (mcsr & MCSR_DRB) + printk(KERN_ERR "Data Read PLB Error\n"); + if (mcsr & MCSR_DWB) + printk(KERN_ERR "Data Write PLB Error\n"); + if (mcsr & MCSR_TLBP) + printk(KERN_ERR "TLB Parity Error\n"); + if (mcsr & MCSR_ICP) { + flush_instruction_cache(); + printk(KERN_ERR "I-Cache Parity Error\n"); + } + if (mcsr & MCSR_DCSP) + printk(KERN_ERR "D-Cache Search Parity Error\n"); + if (mcsr & PPC47x_MCSR_GPR) + printk(KERN_ERR "GPR Parity Error\n"); + if (mcsr & PPC47x_MCSR_FPR) + printk(KERN_ERR "FPR Parity Error\n"); + if (mcsr & PPC47x_MCSR_IPR) + printk(KERN_ERR "Machine Check exception is imprecise\n"); + + /* Clear MCSR */ + mtspr(SPRN_MCSR, mcsr); + + return 0; +} +#elif defined(CONFIG_E500) +int machine_check_e500mc(struct pt_regs *regs) +{ + unsigned long mcsr = mfspr(SPRN_MCSR); + unsigned long reason = mcsr; + int recoverable = 1; + + if (reason & MCSR_LD) { + recoverable = fsl_rio_mcheck_exception(regs); + if (recoverable == 1) + goto silent_out; + } + + printk("Machine check in kernel mode.\n"); + printk("Caused by (from MCSR=%lx): ", reason); + + if (reason & MCSR_MCP) + printk("Machine Check Signal\n"); + + if (reason & MCSR_ICPERR) { + printk("Instruction Cache Parity Error\n"); + + /* + * This is recoverable by invalidating the i-cache. + */ + mtspr(SPRN_L1CSR1, mfspr(SPRN_L1CSR1) | L1CSR1_ICFI); + while (mfspr(SPRN_L1CSR1) & L1CSR1_ICFI) + ; + + /* + * This will generally be accompanied by an instruction + * fetch error report -- only treat MCSR_IF as fatal + * if it wasn't due to an L1 parity error. 
+ */ + reason &= ~MCSR_IF; + } + + if (reason & MCSR_DCPERR_MC) { + printk("Data Cache Parity Error\n"); + + /* + * In write shadow mode we auto-recover from the error, but it + * may still get logged and cause a machine check. We should + * only treat the non-write shadow case as non-recoverable. + */ + if (!(mfspr(SPRN_L1CSR2) & L1CSR2_DCWS)) + recoverable = 0; + } + + if (reason & MCSR_L2MMU_MHIT) { + printk("Hit on multiple TLB entries\n"); + recoverable = 0; + } + + if (reason & MCSR_NMI) + printk("Non-maskable interrupt\n"); + + if (reason & MCSR_IF) { + printk("Instruction Fetch Error Report\n"); + recoverable = 0; + } + + if (reason & MCSR_LD) { + printk("Load Error Report\n"); + recoverable = 0; + } + + if (reason & MCSR_ST) { + printk("Store Error Report\n"); + recoverable = 0; + } + + if (reason & MCSR_LDG) { + printk("Guarded Load Error Report\n"); + recoverable = 0; + } + + if (reason & MCSR_TLBSYNC) + printk("Simultaneous tlbsync operations\n"); + + if (reason & MCSR_BSL2_ERR) { + printk("Level 2 Cache Error\n"); + recoverable = 0; + } + + if (reason & MCSR_MAV) { + u64 addr; + + addr = mfspr(SPRN_MCAR); + addr |= (u64)mfspr(SPRN_MCARU) << 32; + + printk("Machine Check %s Address: %#llx\n", + reason & MCSR_MEA ? 
"Effective" : "Physical", addr); + } + +silent_out: + mtspr(SPRN_MCSR, mcsr); + return mfspr(SPRN_MCSR) == 0 && recoverable; +} + +int machine_check_e500(struct pt_regs *regs) +{ + unsigned long reason = get_mc_reason(regs); + + if (reason & MCSR_BUS_RBERR) { + if (fsl_rio_mcheck_exception(regs)) + return 1; + if (fsl_pci_mcheck_exception(regs)) + return 1; + } + printk("Machine check in kernel mode.\n"); printk("Caused by (from MCSR=%lx): ", reason); @@ -403,8 +598,6 @@ void machine_check_exception(struct pt_regs *regs) printk("Data Cache Push Parity Error\n"); if (reason & MCSR_DCPERR) printk("Data Cache Parity Error\n"); - if (reason & MCSR_GL_CI) - printk("Guarded Load or Cache-Inhibited stwcx.\n"); if (reason & MCSR_BUS_IAERR) printk("Bus - Instruction Address Error\n"); if (reason & MCSR_BUS_RAERR) @@ -421,7 +614,19 @@ void machine_check_exception(struct pt_regs *regs) printk("Bus - Instruction Parity Error\n"); if (reason & MCSR_BUS_RPERR) printk("Bus - Read Parity Error\n"); -#elif defined (CONFIG_E200) + + return 0; +} + +int machine_check_generic(struct pt_regs *regs) +{ + return 0; +} +#elif defined(CONFIG_E200) +int machine_check_e200(struct pt_regs *regs) +{ + unsigned long reason = get_mc_reason(regs); + printk("Machine check in kernel mode.\n"); printk("Caused by (from MCSR=%lx): ", reason); @@ -439,7 +644,14 @@ void machine_check_exception(struct pt_regs *regs) printk("Bus - Read Bus Error on data load\n"); if (reason & MCSR_BUS_WRERR) printk("Bus - Write Bus Error on buffered store or cache line push\n"); -#else /* !CONFIG_4xx && !CONFIG_E500 && !CONFIG_E200 */ + + return 0; +} +#else +int machine_check_generic(struct pt_regs *regs) +{ + unsigned long reason = get_mc_reason(regs); + printk("Machine check in kernel mode.\n"); printk("Caused by (from SRR1=%lx): ", reason); switch (reason & 0x601F0000) { @@ -469,22 +681,56 @@ void machine_check_exception(struct pt_regs *regs) default: printk("Unknown values in msr\n"); } -#endif /* CONFIG_4xx */ + 
return 0; +} +#endif /* everything else */ - /* - * Optional platform-provided routine to print out - * additional info, e.g. bus error registers. +void machine_check_exception(struct pt_regs *regs) +{ + enum ctx_state prev_state = exception_enter(); + int recover = 0; + + __get_cpu_var(irq_stat).mce_exceptions++; + + /* See if any machine dependent calls. In theory, we would want + * to call the CPU first, and call the ppc_md. one if the CPU + * one returns a positive number. However there is existing code + * that assumes the board gets a first chance, so let's keep it + * that way for now and fix things later. --BenH. */ - platform_machine_check(regs); -#endif /* CONFIG_PPC64 */ + if (ppc_md.machine_check_exception) + recover = ppc_md.machine_check_exception(regs); + else if (cur_cpu_spec->machine_check) + recover = cur_cpu_spec->machine_check(regs); + + if (recover > 0) + goto bail; + +#if defined(CONFIG_8xx) && defined(CONFIG_PCI) + /* the qspan pci read routines can cause machine checks -- Cort + * + * yuck !!! that totally needs to go away ! There are better ways + * to deal with that than having a wart in the mcheck handler. 
+ * -- BenH + */ + bad_page_fault(regs, regs->dar, SIGBUS); + goto bail; +#endif if (debugger_fault_handler(regs)) - return; + goto bail; + + if (check_io_access(regs)) + goto bail; + die("Machine check", regs, SIGBUS); /* Must die if the interrupt is not recoverable */ if (!(regs->msr & MSR_RI)) panic("Unrecoverable Machine check"); + +bail: + exception_exit(prev_state); } void SMIException(struct pt_regs *regs) @@ -494,20 +740,29 @@ void SMIException(struct pt_regs *regs) void unknown_exception(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); + printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n", regs->nip, regs->msr, regs->trap); _exception(SIGTRAP, regs, 0, 0); + + exception_exit(prev_state); } void instruction_breakpoint_exception(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); + if (notify_die(DIE_IABR_MATCH, "iabr_match", regs, 5, 5, SIGTRAP) == NOTIFY_STOP) - return; + goto bail; if (debugger_iabr_match(regs)) - return; + goto bail; _exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip); + +bail: + exception_exit(prev_state); } void RunModeException(struct pt_regs *regs) @@ -517,15 +772,20 @@ void RunModeException(struct pt_regs *regs) void __kprobes single_step_exception(struct pt_regs *regs) { - regs->msr &= ~(MSR_SE | MSR_BE); /* Turn off 'trace' bits */ + enum ctx_state prev_state = exception_enter(); + + clear_single_step(regs); if (notify_die(DIE_SSTEP, "single_step", regs, 5, 5, SIGTRAP) == NOTIFY_STOP) - return; + goto bail; if (debugger_sstep(regs)) - return; + goto bail; _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip); + +bail: + exception_exit(prev_state); } /* @@ -536,40 +796,44 @@ void __kprobes single_step_exception(struct pt_regs *regs) */ static void emulate_single_step(struct pt_regs *regs) { - if (single_stepping(regs)) { - clear_single_step(regs); - _exception(SIGTRAP, regs, TRAP_TRACE, 0); - } + if (single_stepping(regs)) + single_step_exception(regs); } -static void parse_fpe(struct pt_regs 
*regs) +static inline int __parse_fpscr(unsigned long fpscr) { - int code = 0; - unsigned long fpscr; - - flush_fp_to_thread(current); - - fpscr = current->thread.fpscr.val; + int ret = 0; /* Invalid operation */ if ((fpscr & FPSCR_VE) && (fpscr & FPSCR_VX)) - code = FPE_FLTINV; + ret = FPE_FLTINV; /* Overflow */ else if ((fpscr & FPSCR_OE) && (fpscr & FPSCR_OX)) - code = FPE_FLTOVF; + ret = FPE_FLTOVF; /* Underflow */ else if ((fpscr & FPSCR_UE) && (fpscr & FPSCR_UX)) - code = FPE_FLTUND; + ret = FPE_FLTUND; /* Divide by zero */ else if ((fpscr & FPSCR_ZE) && (fpscr & FPSCR_ZX)) - code = FPE_FLTDIV; + ret = FPE_FLTDIV; /* Inexact result */ else if ((fpscr & FPSCR_XE) && (fpscr & FPSCR_XX)) - code = FPE_FLTRES; + ret = FPE_FLTRES; + + return ret; +} + +static void parse_fpe(struct pt_regs *regs) +{ + int code = 0; + + flush_fp_to_thread(current); + + code = __parse_fpscr(current->thread.fp_state.fpscr); _exception(SIGFPE, regs, code, regs->nip); } @@ -585,23 +849,6 @@ static void parse_fpe(struct pt_regs *regs) * bits is faster and easier. * */ -#define INST_MFSPR_PVR 0x7c1f42a6 -#define INST_MFSPR_PVR_MASK 0xfc1fffff - -#define INST_DCBA 0x7c0005ec -#define INST_DCBA_MASK 0x7c0007fe - -#define INST_MCRXR 0x7c000400 -#define INST_MCRXR_MASK 0x7c0007fe - -#define INST_STRING 0x7c00042a -#define INST_STRING_MASK 0x7c0007fe -#define INST_STRING_GEN_MASK 0x7c00067e -#define INST_LSWI 0x7c0004aa -#define INST_LSWX 0x7c00042a -#define INST_STSWI 0x7c0005aa -#define INST_STSWX 0x7c00052a - static int emulate_string_inst(struct pt_regs *regs, u32 instword) { u8 rT = (instword >> 21) & 0x1f; @@ -612,20 +859,20 @@ static int emulate_string_inst(struct pt_regs *regs, u32 instword) int pos = 0; /* Early out if we are an invalid form of lswx */ - if ((instword & INST_STRING_MASK) == INST_LSWX) + if ((instword & PPC_INST_STRING_MASK) == PPC_INST_LSWX) if ((rT == rA) || (rT == NB_RB)) return -EINVAL; EA = (rA == 0) ? 
0 : regs->gpr[rA]; - switch (instword & INST_STRING_MASK) { - case INST_LSWX: - case INST_STSWX: + switch (instword & PPC_INST_STRING_MASK) { + case PPC_INST_LSWX: + case PPC_INST_STSWX: EA += NB_RB; num_bytes = regs->xer & 0x7f; break; - case INST_LSWI: - case INST_STSWI: + case PPC_INST_LSWI: + case PPC_INST_STSWI: num_bytes = (NB_RB == 0) ? 32 : NB_RB; break; default: @@ -637,9 +884,13 @@ static int emulate_string_inst(struct pt_regs *regs, u32 instword) u8 val; u32 shift = 8 * (3 - (pos & 0x3)); - switch ((instword & INST_STRING_MASK)) { - case INST_LSWX: - case INST_LSWI: + /* if process is 32-bit, clear upper 32 bits of EA */ + if ((regs->msr & MSR_64BIT) == 0) + EA &= 0xFFFFFFFF; + + switch ((instword & PPC_INST_STRING_MASK)) { + case PPC_INST_LSWX: + case PPC_INST_LSWI: if (get_user(val, (u8 __user *)EA)) return -EFAULT; /* first time updating this reg, @@ -648,8 +899,8 @@ static int emulate_string_inst(struct pt_regs *regs, u32 instword) regs->gpr[rT] = 0; regs->gpr[rT] |= val << shift; break; - case INST_STSWI: - case INST_STSWX: + case PPC_INST_STSWI: + case PPC_INST_STSWX: val = regs->gpr[rT] >> shift; if (put_user(val, (u8 __user *)EA)) return -EFAULT; @@ -670,6 +921,62 @@ static int emulate_string_inst(struct pt_regs *regs, u32 instword) return 0; } +static int emulate_popcntb_inst(struct pt_regs *regs, u32 instword) +{ + u32 ra,rs; + unsigned long tmp; + + ra = (instword >> 16) & 0x1f; + rs = (instword >> 21) & 0x1f; + + tmp = regs->gpr[rs]; + tmp = tmp - ((tmp >> 1) & 0x5555555555555555ULL); + tmp = (tmp & 0x3333333333333333ULL) + ((tmp >> 2) & 0x3333333333333333ULL); + tmp = (tmp + (tmp >> 4)) & 0x0f0f0f0f0f0f0f0fULL; + regs->gpr[ra] = tmp; + + return 0; +} + +static int emulate_isel(struct pt_regs *regs, u32 instword) +{ + u8 rT = (instword >> 21) & 0x1f; + u8 rA = (instword >> 16) & 0x1f; + u8 rB = (instword >> 11) & 0x1f; + u8 BC = (instword >> 6) & 0x1f; + u8 bit; + unsigned long tmp; + + tmp = (rA == 0) ? 
0 : regs->gpr[rA]; + bit = (regs->ccr >> (31 - BC)) & 0x1; + + regs->gpr[rT] = bit ? tmp : regs->gpr[rB]; + + return 0; +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static inline bool tm_abort_check(struct pt_regs *regs, int cause) +{ + /* If we're emulating a load/store in an active transaction, we cannot + * emulate it as the kernel operates in transaction suspended context. + * We need to abort the transaction. This creates a persistent TM + * abort so tell the user what caused it with a new code. + */ + if (MSR_TM_TRANSACTIONAL(regs->msr)) { + tm_enable(); + tm_abort(cause); + return true; + } + return false; +} +#else +static inline bool tm_abort_check(struct pt_regs *regs, int reason) +{ + return false; +} +#endif + static int emulate_instruction(struct pt_regs *regs) { u32 instword; @@ -683,131 +990,213 @@ static int emulate_instruction(struct pt_regs *regs) return -EFAULT; /* Emulate the mfspr rD, PVR. */ - if ((instword & INST_MFSPR_PVR_MASK) == INST_MFSPR_PVR) { + if ((instword & PPC_INST_MFSPR_PVR_MASK) == PPC_INST_MFSPR_PVR) { + PPC_WARN_EMULATED(mfpvr, regs); rd = (instword >> 21) & 0x1f; regs->gpr[rd] = mfspr(SPRN_PVR); return 0; } /* Emulating the dcba insn is just a no-op. */ - if ((instword & INST_DCBA_MASK) == INST_DCBA) + if ((instword & PPC_INST_DCBA_MASK) == PPC_INST_DCBA) { + PPC_WARN_EMULATED(dcba, regs); return 0; + } /* Emulate the mcrxr insn. */ - if ((instword & INST_MCRXR_MASK) == INST_MCRXR) { + if ((instword & PPC_INST_MCRXR_MASK) == PPC_INST_MCRXR) { int shift = (instword >> 21) & 0x1c; unsigned long msk = 0xf0000000UL >> shift; + PPC_WARN_EMULATED(mcrxr, regs); regs->ccr = (regs->ccr & ~msk) | ((regs->xer >> shift) & msk); regs->xer &= ~0xf0000000UL; return 0; } /* Emulate load/store string insn. 
*/ - if ((instword & INST_STRING_GEN_MASK) == INST_STRING) + if ((instword & PPC_INST_STRING_GEN_MASK) == PPC_INST_STRING) { + if (tm_abort_check(regs, + TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT)) + return -EINVAL; + PPC_WARN_EMULATED(string, regs); return emulate_string_inst(regs, instword); + } - return -EINVAL; -} + /* Emulate the popcntb (Population Count Bytes) instruction. */ + if ((instword & PPC_INST_POPCNTB_MASK) == PPC_INST_POPCNTB) { + PPC_WARN_EMULATED(popcntb, regs); + return emulate_popcntb_inst(regs, instword); + } -/* - * Look through the list of trap instructions that are used for BUG(), - * BUG_ON() and WARN_ON() and see if we hit one. At this point we know - * that the exception was caused by a trap instruction of some kind. - * Returns 1 if we should continue (i.e. it was a WARN_ON) or 0 - * otherwise. - */ -extern struct bug_entry __start___bug_table[], __stop___bug_table[]; + /* Emulate isel (Integer Select) instruction */ + if ((instword & PPC_INST_ISEL_MASK) == PPC_INST_ISEL) { + PPC_WARN_EMULATED(isel, regs); + return emulate_isel(regs, instword); + } -#ifndef CONFIG_MODULES -#define module_find_bug(x) NULL + /* Emulate sync instruction variants */ + if ((instword & PPC_INST_SYNC_MASK) == PPC_INST_SYNC) { + PPC_WARN_EMULATED(sync, regs); + asm volatile("sync"); + return 0; + } + +#ifdef CONFIG_PPC64 + /* Emulate the mfspr rD, DSCR. */ + if ((((instword & PPC_INST_MFSPR_DSCR_USER_MASK) == + PPC_INST_MFSPR_DSCR_USER) || + ((instword & PPC_INST_MFSPR_DSCR_MASK) == + PPC_INST_MFSPR_DSCR)) && + cpu_has_feature(CPU_FTR_DSCR)) { + PPC_WARN_EMULATED(mfdscr, regs); + rd = (instword >> 21) & 0x1f; + regs->gpr[rd] = mfspr(SPRN_DSCR); + return 0; + } + /* Emulate the mtspr DSCR, rD. 
*/ + if ((((instword & PPC_INST_MTSPR_DSCR_USER_MASK) == + PPC_INST_MTSPR_DSCR_USER) || + ((instword & PPC_INST_MTSPR_DSCR_MASK) == + PPC_INST_MTSPR_DSCR)) && + cpu_has_feature(CPU_FTR_DSCR)) { + PPC_WARN_EMULATED(mtdscr, regs); + rd = (instword >> 21) & 0x1f; + current->thread.dscr = regs->gpr[rd]; + current->thread.dscr_inherit = 1; + mtspr(SPRN_DSCR, current->thread.dscr); + return 0; + } #endif -struct bug_entry *find_bug(unsigned long bugaddr) -{ - struct bug_entry *bug; + return -EINVAL; +} - for (bug = __start___bug_table; bug < __stop___bug_table; ++bug) - if (bugaddr == bug->bug_addr) - return bug; - return module_find_bug(bugaddr); +int is_valid_bugaddr(unsigned long addr) +{ + return is_kernel_addr(addr); } -static int check_bug_trap(struct pt_regs *regs) +#ifdef CONFIG_MATH_EMULATION +static int emulate_math(struct pt_regs *regs) { - struct bug_entry *bug; - unsigned long addr; + int ret; + extern int do_mathemu(struct pt_regs *regs); + + ret = do_mathemu(regs); + if (ret >= 0) + PPC_WARN_EMULATED(math, regs); - if (regs->msr & MSR_PR) - return 0; /* not in kernel */ - addr = regs->nip; /* address of trap instruction */ - if (addr < PAGE_OFFSET) + switch (ret) { + case 0: + emulate_single_step(regs); return 0; - bug = find_bug(regs->nip); - if (bug == NULL) + case 1: { + int code = 0; + code = __parse_fpscr(current->thread.fp_state.fpscr); + _exception(SIGFPE, regs, code, regs->nip); + return 0; + } + case -EFAULT: + _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip); return 0; - if (bug->line & BUG_WARNING_TRAP) { - /* this is a WARN_ON rather than BUG/BUG_ON */ -#ifdef CONFIG_XMON - xmon_printf(KERN_ERR "Badness in %s at %s:%d\n", - bug->function, bug->file, - bug->line & ~BUG_WARNING_TRAP); -#endif /* CONFIG_XMON */ - printk(KERN_ERR "Badness in %s at %s:%d\n", - bug->function, bug->file, - bug->line & ~BUG_WARNING_TRAP); - dump_stack(); - return 1; } -#ifdef CONFIG_XMON - xmon_printf(KERN_CRIT "kernel BUG in %s at %s:%d!\n", - bug->function, 
bug->file, bug->line); - xmon(regs); -#endif /* CONFIG_XMON */ - printk(KERN_CRIT "kernel BUG in %s at %s:%d!\n", - bug->function, bug->file, bug->line); - return 0; + return -1; } +#else +static inline int emulate_math(struct pt_regs *regs) { return -1; } +#endif void __kprobes program_check_exception(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); unsigned int reason = get_reason(regs); - extern int do_mathemu(struct pt_regs *regs); -#ifdef CONFIG_MATH_EMULATION - /* (reason & REASON_ILLEGAL) would be the obvious thing here, - * but there seems to be a hardware bug on the 405GP (RevD) - * that means ESR is sometimes set incorrectly - either to - * ESR_DST (!?) or 0. In the process of chasing this with the - * hardware people - not sure if it can happen on any illegal - * instruction or only on FP instructions, whether there is a - * pattern to occurences etc. -dgibson 31/Mar/2003 */ - if (!(reason & REASON_TRAP) && do_mathemu(regs) == 0) { - emulate_single_step(regs); - return; - } -#endif /* CONFIG_MATH_EMULATION */ + /* We can now get here via a FP Unavailable exception if the core + * has no FPU, in that case the reason flags will be 0 */ if (reason & REASON_FP) { /* IEEE FP exception */ parse_fpe(regs); - return; + goto bail; } if (reason & REASON_TRAP) { + /* Debugger is first in line to stop recursive faults in + * rcu_lock, notify_die, or atomic_notifier_call_chain */ + if (debugger_bpt(regs)) + goto bail; + /* trap exception */ if (notify_die(DIE_BPT, "breakpoint", regs, 5, 5, SIGTRAP) == NOTIFY_STOP) - return; - if (debugger_bpt(regs)) - return; - if (check_bug_trap(regs)) { + goto bail; + + if (!(regs->msr & MSR_PR) && /* not user-mode */ + report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) { regs->nip += 4; - return; + goto bail; } _exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip); - return; + goto bail; + } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (reason & REASON_TM) { + /* This is a TM "Bad Thing Exception" program check. 
+ * This occurs when: + * - An rfid/hrfid/mtmsrd attempts to cause an illegal + * transition in TM states. + * - A trechkpt is attempted when transactional. + * - A treclaim is attempted when non transactional. + * - A tend is illegally attempted. + * - writing a TM SPR when transactional. + */ + if (!user_mode(regs) && + report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) { + regs->nip += 4; + goto bail; + } + /* If usermode caused this, it's done something illegal and + * gets a SIGILL slap on the wrist. We call it an illegal + * operand to distinguish from the instruction just being bad + * (e.g. executing a 'tend' on a CPU without TM!); it's an + * illegal /placement/ of a valid instruction. + */ + if (user_mode(regs)) { + _exception(SIGILL, regs, ILL_ILLOPN, regs->nip); + goto bail; + } else { + printk(KERN_EMERG "Unexpected TM Bad Thing exception " + "at %lx (msr 0x%x)\n", regs->nip, reason); + die("Unrecoverable exception", regs, SIGABRT); + } } +#endif + + /* + * If we took the program check in the kernel skip down to sending a + * SIGILL. The subsequent cases all relate to emulating instructions + * which we should only do for userspace. We also do not want to enable + * interrupts for kernel faults because that might lead to further + * faults, and loose the context of the original exception. + */ + if (!user_mode(regs)) + goto sigill; + + /* We restore the interrupt state now */ + if (!arch_irq_disabled_regs(regs)) + local_irq_enable(); + + /* (reason & REASON_ILLEGAL) would be the obvious thing here, + * but there seems to be a hardware bug on the 405GP (RevD) + * that means ESR is sometimes set incorrectly - either to + * ESR_DST (!?) or 0. In the process of chasing this with the + * hardware people - not sure if it can happen on any illegal + * instruction or only on FP instructions, whether there is a + * pattern to occurrences etc. -dgibson 31/Mar/2003 + */ + if (!emulate_math(regs)) + goto bail; /* Try to emulate it if we should. 
*/ if (reason & (REASON_ILLEGAL | REASON_PRIVILEGED)) { @@ -815,41 +1204,70 @@ void __kprobes program_check_exception(struct pt_regs *regs) case 0: regs->nip += 4; emulate_single_step(regs); - return; + goto bail; case -EFAULT: _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip); - return; + goto bail; } } +sigill: if (reason & REASON_PRIVILEGED) _exception(SIGILL, regs, ILL_PRVOPC, regs->nip); else _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); + +bail: + exception_exit(prev_state); +} + +/* + * This occurs when running in hypervisor mode on POWER6 or later + * and an illegal instruction is encountered. + */ +void __kprobes emulation_assist_interrupt(struct pt_regs *regs) +{ + regs->msr |= REASON_ILLEGAL; + program_check_exception(regs); } void alignment_exception(struct pt_regs *regs) { - int fixed; + enum ctx_state prev_state = exception_enter(); + int sig, code, fixed = 0; + + /* We restore the interrupt state now */ + if (!arch_irq_disabled_regs(regs)) + local_irq_enable(); - fixed = fix_alignment(regs); + if (tm_abort_check(regs, TM_CAUSE_ALIGNMENT | TM_CAUSE_PERSISTENT)) + goto bail; + + /* we don't implement logging of alignment exceptions */ + if (!(current->thread.align_ctl & PR_UNALIGN_SIGBUS)) + fixed = fix_alignment(regs); if (fixed == 1) { regs->nip += 4; /* skip over emulated instruction */ emulate_single_step(regs); - return; + goto bail; } /* Operand address was bad */ if (fixed == -EFAULT) { - if (user_mode(regs)) - _exception(SIGSEGV, regs, SEGV_ACCERR, regs->dar); - else - /* Search exception table */ - bad_page_fault(regs, regs->dar, SIGSEGV); - return; + sig = SIGSEGV; + code = SEGV_ACCERR; + } else { + sig = SIGBUS; + code = BUS_ADRALN; } - _exception(SIGBUS, regs, BUS_ADRALN, regs->dar); + if (user_mode(regs)) + _exception(sig, regs, code, regs->dar); + else + bad_page_fault(regs, regs->dar, sig); + +bail: + exception_exit(prev_state); } void StackOverflow(struct pt_regs *regs) @@ -872,94 +1290,356 @@ void nonrecoverable_exception(struct 
pt_regs *regs) void trace_syscall(struct pt_regs *regs) { printk("Task: %p(%d), PC: %08lX/%08lX, Syscall: %3ld, Result: %s%ld %s\n", - current, current->pid, regs->nip, regs->link, regs->gpr[0], + current, task_pid_nr(current), regs->nip, regs->link, regs->gpr[0], regs->ccr&0x10000000?"Error=":"", regs->gpr[3], print_tainted()); } void kernel_fp_unavailable_exception(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); + printk(KERN_EMERG "Unrecoverable FP Unavailable Exception " "%lx at %lx\n", regs->trap, regs->nip); die("Unrecoverable FP Unavailable Exception", regs, SIGABRT); + + exception_exit(prev_state); } void altivec_unavailable_exception(struct pt_regs *regs) { -#if !defined(CONFIG_ALTIVEC) + enum ctx_state prev_state = exception_enter(); + if (user_mode(regs)) { /* A user program has executed an altivec instruction, but this kernel doesn't support altivec. */ _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); - return; + goto bail; } -#endif + printk(KERN_EMERG "Unrecoverable VMX/Altivec Unavailable Exception " "%lx at %lx\n", regs->trap, regs->nip); die("Unrecoverable VMX/Altivec Unavailable Exception", regs, SIGABRT); + +bail: + exception_exit(prev_state); +} + +void vsx_unavailable_exception(struct pt_regs *regs) +{ + if (user_mode(regs)) { + /* A user program has executed an vsx instruction, + but this kernel doesn't support vsx. 
*/ + _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); + return; + } + + printk(KERN_EMERG "Unrecoverable VSX Unavailable Exception " + "%lx at %lx\n", regs->trap, regs->nip); + die("Unrecoverable VSX Unavailable Exception", regs, SIGABRT); } #ifdef CONFIG_PPC64 -extern perf_irq_t perf_irq; +void facility_unavailable_exception(struct pt_regs *regs) +{ + static char *facility_strings[] = { + [FSCR_FP_LG] = "FPU", + [FSCR_VECVSX_LG] = "VMX/VSX", + [FSCR_DSCR_LG] = "DSCR", + [FSCR_PM_LG] = "PMU SPRs", + [FSCR_BHRB_LG] = "BHRB", + [FSCR_TM_LG] = "TM", + [FSCR_EBB_LG] = "EBB", + [FSCR_TAR_LG] = "TAR", + }; + char *facility = "unknown"; + u64 value; + u8 status; + bool hv; + + hv = (regs->trap == 0xf80); + if (hv) + value = mfspr(SPRN_HFSCR); + else + value = mfspr(SPRN_FSCR); + + status = value >> 56; + if (status == FSCR_DSCR_LG) { + /* User is acessing the DSCR. Set the inherit bit and allow + * the user to set it directly in future by setting via the + * FSCR DSCR bit. We always leave HFSCR DSCR set. + */ + current->thread.dscr_inherit = 1; + mtspr(SPRN_FSCR, value | FSCR_DSCR); + return; + } + + if ((status < ARRAY_SIZE(facility_strings)) && + facility_strings[status]) + facility = facility_strings[status]; + + /* We restore the interrupt state now */ + if (!arch_irq_disabled_regs(regs)) + local_irq_enable(); + + pr_err_ratelimited( + "%sFacility '%s' unavailable, exception at 0x%lx, MSR=%lx\n", + hv ? "Hypervisor " : "", facility, regs->nip, regs->msr); + + if (user_mode(regs)) { + _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); + return; + } + + die("Unexpected facility unavailable exception", regs, SIGABRT); +} #endif -#if defined(CONFIG_PPC64) || defined(CONFIG_E500) +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + +void fp_unavailable_tm(struct pt_regs *regs) +{ + /* Note: This does not handle any kind of FP laziness. 
*/ + + TM_DEBUG("FP Unavailable trap whilst transactional at 0x%lx, MSR=%lx\n", + regs->nip, regs->msr); + + /* We can only have got here if the task started using FP after + * beginning the transaction. So, the transactional regs are just a + * copy of the checkpointed ones. But, we still need to recheckpoint + * as we're enabling FP for the process; it will return, abort the + * transaction, and probably retry but now with FP enabled. So the + * checkpointed FP registers need to be loaded. + */ + tm_reclaim_current(TM_CAUSE_FAC_UNAV); + /* Reclaim didn't save out any FPRs to transact_fprs. */ + + /* Enable FP for the task: */ + regs->msr |= (MSR_FP | current->thread.fpexc_mode); + + /* This loads and recheckpoints the FP registers from + * thread.fpr[]. They will remain in registers after the + * checkpoint so we don't need to reload them after. + * If VMX is in use, the VRs now hold checkpointed values, + * so we don't want to load the VRs from the thread_struct. + */ + tm_recheckpoint(¤t->thread, MSR_FP); + + /* If VMX is in use, get the transactional values back */ + if (regs->msr & MSR_VEC) { + do_load_up_transact_altivec(¤t->thread); + /* At this point all the VSX state is loaded, so enable it */ + regs->msr |= MSR_VSX; + } +} + +void altivec_unavailable_tm(struct pt_regs *regs) +{ + /* See the comments in fp_unavailable_tm(). This function operates + * the same way. + */ + + TM_DEBUG("Vector Unavailable trap whilst transactional at 0x%lx," + "MSR=%lx\n", + regs->nip, regs->msr); + tm_reclaim_current(TM_CAUSE_FAC_UNAV); + regs->msr |= MSR_VEC; + tm_recheckpoint(¤t->thread, MSR_VEC); + current->thread.used_vr = 1; + + if (regs->msr & MSR_FP) { + do_load_up_transact_fpu(¤t->thread); + regs->msr |= MSR_VSX; + } +} + +void vsx_unavailable_tm(struct pt_regs *regs) +{ + unsigned long orig_msr = regs->msr; + + /* See the comments in fp_unavailable_tm(). This works similarly, + * though we're loading both FP and VEC registers in here. 
+ * + * If FP isn't in use, load FP regs. If VEC isn't in use, load VEC + * regs. Either way, set MSR_VSX. + */ + + TM_DEBUG("VSX Unavailable trap whilst transactional at 0x%lx," + "MSR=%lx\n", + regs->nip, regs->msr); + + current->thread.used_vsr = 1; + + /* If FP and VMX are already loaded, we have all the state we need */ + if ((orig_msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC)) { + regs->msr |= MSR_VSX; + return; + } + + /* This reclaims FP and/or VR regs if they're already enabled */ + tm_reclaim_current(TM_CAUSE_FAC_UNAV); + + regs->msr |= MSR_VEC | MSR_FP | current->thread.fpexc_mode | + MSR_VSX; + + /* This loads & recheckpoints FP and VRs; but we have + * to be sure not to overwrite previously-valid state. + */ + tm_recheckpoint(¤t->thread, regs->msr & ~orig_msr); + + if (orig_msr & MSR_FP) + do_load_up_transact_fpu(¤t->thread); + if (orig_msr & MSR_VEC) + do_load_up_transact_altivec(¤t->thread); +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + void performance_monitor_exception(struct pt_regs *regs) { + __get_cpu_var(irq_stat).pmu_irqs++; + perf_irq(regs); } -#endif #ifdef CONFIG_8xx void SoftwareEmulation(struct pt_regs *regs) { - extern int do_mathemu(struct pt_regs *); - extern int Soft_emulate_8xx(struct pt_regs *); - int errcode; - CHECK_FULL_REGS(regs); if (!user_mode(regs)) { debugger(regs); - die("Kernel Mode Software FPU Emulation", regs, SIGFPE); + die("Kernel Mode Unimplemented Instruction or SW FPU Emulation", + regs, SIGFPE); } -#ifdef CONFIG_MATH_EMULATION - errcode = do_mathemu(regs); -#else - errcode = Soft_emulate_8xx(regs); -#endif - if (errcode) { - if (errcode > 0) - _exception(SIGFPE, regs, 0, 0); - else if (errcode == -EFAULT) - _exception(SIGSEGV, regs, 0, 0); - else - _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); - } else - emulate_single_step(regs); + if (!emulate_math(regs)) + return; + + _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); } #endif /* CONFIG_8xx */ -#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) +#ifdef 
CONFIG_PPC_ADV_DEBUG_REGS +static void handle_debug(struct pt_regs *regs, unsigned long debug_status) +{ + int changed = 0; + /* + * Determine the cause of the debug event, clear the + * event flags and send a trap to the handler. Torez + */ + if (debug_status & (DBSR_DAC1R | DBSR_DAC1W)) { + dbcr_dac(current) &= ~(DBCR_DAC1R | DBCR_DAC1W); +#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE + current->thread.debug.dbcr2 &= ~DBCR2_DAC12MODE; +#endif + do_send_trap(regs, mfspr(SPRN_DAC1), debug_status, TRAP_HWBKPT, + 5); + changed |= 0x01; + } else if (debug_status & (DBSR_DAC2R | DBSR_DAC2W)) { + dbcr_dac(current) &= ~(DBCR_DAC2R | DBCR_DAC2W); + do_send_trap(regs, mfspr(SPRN_DAC2), debug_status, TRAP_HWBKPT, + 6); + changed |= 0x01; + } else if (debug_status & DBSR_IAC1) { + current->thread.debug.dbcr0 &= ~DBCR0_IAC1; + dbcr_iac_range(current) &= ~DBCR_IAC12MODE; + do_send_trap(regs, mfspr(SPRN_IAC1), debug_status, TRAP_HWBKPT, + 1); + changed |= 0x01; + } else if (debug_status & DBSR_IAC2) { + current->thread.debug.dbcr0 &= ~DBCR0_IAC2; + do_send_trap(regs, mfspr(SPRN_IAC2), debug_status, TRAP_HWBKPT, + 2); + changed |= 0x01; + } else if (debug_status & DBSR_IAC3) { + current->thread.debug.dbcr0 &= ~DBCR0_IAC3; + dbcr_iac_range(current) &= ~DBCR_IAC34MODE; + do_send_trap(regs, mfspr(SPRN_IAC3), debug_status, TRAP_HWBKPT, + 3); + changed |= 0x01; + } else if (debug_status & DBSR_IAC4) { + current->thread.debug.dbcr0 &= ~DBCR0_IAC4; + do_send_trap(regs, mfspr(SPRN_IAC4), debug_status, TRAP_HWBKPT, + 4); + changed |= 0x01; + } + /* + * At the point this routine was called, the MSR(DE) was turned off. + * Check all other debug flags and see if that bit needs to be turned + * back on or not. 
+ */ + if (DBCR_ACTIVE_EVENTS(current->thread.debug.dbcr0, + current->thread.debug.dbcr1)) + regs->msr |= MSR_DE; + else + /* Make sure the IDM flag is off */ + current->thread.debug.dbcr0 &= ~DBCR0_IDM; + + if (changed & 0x01) + mtspr(SPRN_DBCR0, current->thread.debug.dbcr0); +} -void DebugException(struct pt_regs *regs, unsigned long debug_status) +void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status) { - if (debug_status & DBSR_IC) { /* instruction completion */ + current->thread.debug.dbsr = debug_status; + + /* Hack alert: On BookE, Branch Taken stops on the branch itself, while + * on server, it stops on the target of the branch. In order to simulate + * the server behaviour, we thus restart right away with a single step + * instead of stopping here when hitting a BT + */ + if (debug_status & DBSR_BT) { regs->msr &= ~MSR_DE; + + /* Disable BT */ + mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~DBCR0_BT); + /* Clear the BT event */ + mtspr(SPRN_DBSR, DBSR_BT); + + /* Do the single step trick only when coming from userspace */ if (user_mode(regs)) { - current->thread.dbcr0 &= ~DBCR0_IC; - } else { - /* Disable instruction completion */ - mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~DBCR0_IC); - /* Clear the instruction completion event */ - mtspr(SPRN_DBSR, DBSR_IC); - if (debugger_sstep(regs)) - return; + current->thread.debug.dbcr0 &= ~DBCR0_BT; + current->thread.debug.dbcr0 |= DBCR0_IDM | DBCR0_IC; + regs->msr |= MSR_DE; + return; } - _exception(SIGTRAP, regs, TRAP_TRACE, 0); - } + + if (notify_die(DIE_SSTEP, "block_step", regs, 5, + 5, SIGTRAP) == NOTIFY_STOP) { + return; + } + if (debugger_sstep(regs)) + return; + } else if (debug_status & DBSR_IC) { /* Instruction complete */ + regs->msr &= ~MSR_DE; + + /* Disable instruction completion */ + mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~DBCR0_IC); + /* Clear the instruction completion event */ + mtspr(SPRN_DBSR, DBSR_IC); + + if (notify_die(DIE_SSTEP, "single_step", regs, 5, + 5, SIGTRAP) == 
NOTIFY_STOP) { + return; + } + + if (debugger_sstep(regs)) + return; + + if (user_mode(regs)) { + current->thread.debug.dbcr0 &= ~DBCR0_IC; + if (DBCR_ACTIVE_EVENTS(current->thread.debug.dbcr0, + current->thread.debug.dbcr1)) + regs->msr |= MSR_DE; + else + /* Make sure the IDM bit is off */ + current->thread.debug.dbcr0 &= ~DBCR0_IDM; + } + + _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip); + } else + handle_debug(regs, debug_status); } -#endif /* CONFIG_4xx || CONFIG_BOOKE */ +#endif /* CONFIG_PPC_ADV_DEBUG_REGS */ #if !defined(CONFIG_TAU_INT) void TAUException(struct pt_regs *regs) @@ -982,6 +1662,7 @@ void altivec_assist_exception(struct pt_regs *regs) flush_altivec_to_thread(current); + PPC_WARN_EMULATED(altivec, regs); err = emulate_altivec(regs); if (err == 0) { regs->nip += 4; /* skip emulated instruction */ @@ -995,14 +1676,28 @@ void altivec_assist_exception(struct pt_regs *regs) } else { /* didn't recognize the instruction */ /* XXX quick hack for now: set the non-Java bit in the VSCR */ - if (printk_ratelimit()) - printk(KERN_ERR "Unrecognized altivec instruction " - "in %s at %lx\n", current->comm, regs->nip); - current->thread.vscr.u[3] |= 0x10000; + printk_ratelimited(KERN_ERR "Unrecognized altivec instruction " + "in %s at %lx\n", current->comm, regs->nip); + current->thread.vr_state.vscr.u[3] |= 0x10000; } } #endif /* CONFIG_ALTIVEC */ +#ifdef CONFIG_VSX +void vsx_assist_exception(struct pt_regs *regs) +{ + if (!user_mode(regs)) { + printk(KERN_EMERG "VSX assist exception in kernel mode" + " at %lx\n", regs->nip); + die("Kernel VSX assist exception", regs, SIGILL); + } + + flush_vsx_to_thread(current); + printk(KERN_INFO "VSX assist not supported at %lx\n", regs->nip); + _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); +} +#endif /* CONFIG_VSX */ + #ifdef CONFIG_FSL_BOOKE void CacheLockingException(struct pt_regs *regs, unsigned long address, unsigned long error_code) @@ -1020,37 +1715,82 @@ void CacheLockingException(struct pt_regs *regs, 
unsigned long address, #ifdef CONFIG_SPE void SPEFloatingPointException(struct pt_regs *regs) { + extern int do_spe_mathemu(struct pt_regs *regs); unsigned long spefscr; int fpexc_mode; int code = 0; + int err; + + flush_spe_to_thread(current); spefscr = current->thread.spefscr; fpexc_mode = current->thread.fpexc_mode; - /* Hardware does not neccessarily set sticky - * underflow/overflow/invalid flags */ if ((spefscr & SPEFSCR_FOVF) && (fpexc_mode & PR_FP_EXC_OVF)) { code = FPE_FLTOVF; - spefscr |= SPEFSCR_FOVFS; } else if ((spefscr & SPEFSCR_FUNF) && (fpexc_mode & PR_FP_EXC_UND)) { code = FPE_FLTUND; - spefscr |= SPEFSCR_FUNFS; } else if ((spefscr & SPEFSCR_FDBZ) && (fpexc_mode & PR_FP_EXC_DIV)) code = FPE_FLTDIV; else if ((spefscr & SPEFSCR_FINV) && (fpexc_mode & PR_FP_EXC_INV)) { code = FPE_FLTINV; - spefscr |= SPEFSCR_FINVS; } else if ((spefscr & (SPEFSCR_FG | SPEFSCR_FX)) && (fpexc_mode & PR_FP_EXC_RES)) code = FPE_FLTRES; - current->thread.spefscr = spefscr; + err = do_spe_mathemu(regs); + if (err == 0) { + regs->nip += 4; /* skip emulated instruction */ + emulate_single_step(regs); + return; + } + + if (err == -EFAULT) { + /* got an error reading the instruction */ + _exception(SIGSEGV, regs, SEGV_ACCERR, regs->nip); + } else if (err == -EINVAL) { + /* didn't recognize the instruction */ + printk(KERN_ERR "unrecognized spe instruction " + "in %s at %lx\n", current->comm, regs->nip); + } else { + _exception(SIGFPE, regs, code, regs->nip); + } - _exception(SIGFPE, regs, code, regs->nip); return; } + +void SPEFloatingPointRoundException(struct pt_regs *regs) +{ + extern int speround_handler(struct pt_regs *regs); + int err; + + preempt_disable(); + if (regs->msr & MSR_SPE) + giveup_spe(current); + preempt_enable(); + + regs->nip -= 4; + err = speround_handler(regs); + if (err == 0) { + regs->nip += 4; /* skip emulated instruction */ + emulate_single_step(regs); + return; + } + + if (err == -EFAULT) { + /* got an error reading the instruction */ + 
_exception(SIGSEGV, regs, SEGV_ACCERR, regs->nip); + } else if (err == -EINVAL) { + /* didn't recognize the instruction */ + printk(KERN_ERR "unrecognized spe instruction " + "in %s at %lx\n", current->comm, regs->nip); + } else { + _exception(SIGFPE, regs, 0, regs->nip); + return; + } +} #endif /* @@ -1066,7 +1806,7 @@ void unrecoverable_exception(struct pt_regs *regs) die("Unrecoverable exception", regs, SIGABRT); } -#ifdef CONFIG_BOOKE_WDT +#if defined(CONFIG_BOOKE_WDT) || defined(CONFIG_40x) /* * Default handler for a Watchdog exception, * spins until a reboot occurs @@ -1099,3 +1839,82 @@ void kernel_bad_stack(struct pt_regs *regs) void __init trap_init(void) { } + + +#ifdef CONFIG_PPC_EMULATED_STATS + +#define WARN_EMULATED_SETUP(type) .type = { .name = #type } + +struct ppc_emulated ppc_emulated = { +#ifdef CONFIG_ALTIVEC + WARN_EMULATED_SETUP(altivec), +#endif + WARN_EMULATED_SETUP(dcba), + WARN_EMULATED_SETUP(dcbz), + WARN_EMULATED_SETUP(fp_pair), + WARN_EMULATED_SETUP(isel), + WARN_EMULATED_SETUP(mcrxr), + WARN_EMULATED_SETUP(mfpvr), + WARN_EMULATED_SETUP(multiple), + WARN_EMULATED_SETUP(popcntb), + WARN_EMULATED_SETUP(spe), + WARN_EMULATED_SETUP(string), + WARN_EMULATED_SETUP(sync), + WARN_EMULATED_SETUP(unaligned), +#ifdef CONFIG_MATH_EMULATION + WARN_EMULATED_SETUP(math), +#endif +#ifdef CONFIG_VSX + WARN_EMULATED_SETUP(vsx), +#endif +#ifdef CONFIG_PPC64 + WARN_EMULATED_SETUP(mfdscr), + WARN_EMULATED_SETUP(mtdscr), + WARN_EMULATED_SETUP(lq_stq), +#endif +}; + +u32 ppc_warn_emulated; + +void ppc_warn_emulated_print(const char *type) +{ + pr_warn_ratelimited("%s used emulated %s instruction\n", current->comm, + type); +} + +static int __init ppc_warn_emulated_init(void) +{ + struct dentry *dir, *d; + unsigned int i; + struct ppc_emulated_entry *entries = (void *)&ppc_emulated; + + if (!powerpc_debugfs_root) + return -ENODEV; + + dir = debugfs_create_dir("emulated_instructions", + powerpc_debugfs_root); + if (!dir) + return -ENOMEM; + + d = 
debugfs_create_u32("do_warn", S_IRUGO | S_IWUSR, dir, + &ppc_warn_emulated); + if (!d) + goto fail; + + for (i = 0; i < sizeof(ppc_emulated)/sizeof(*entries); i++) { + d = debugfs_create_u32(entries[i].name, S_IRUGO | S_IWUSR, dir, + (u32 *)&entries[i].val.counter); + if (!d) + goto fail; + } + + return 0; + +fail: + debugfs_remove_recursive(dir); + return -ENOMEM; +} + +device_initcall(ppc_warn_emulated_init); + +#endif /* CONFIG_PPC_EMULATED_STATS */ diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c new file mode 100644 index 00000000000..b7aa07279a6 --- /dev/null +++ b/arch/powerpc/kernel/udbg.c @@ -0,0 +1,182 @@ +/* + * polling mode stateless debugging stuff, originally for NS16550 Serial Ports + * + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <stdarg.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/console.h> +#include <linux/init.h> +#include <asm/processor.h> +#include <asm/udbg.h> + +void (*udbg_putc)(char c); +void (*udbg_flush)(void); +int (*udbg_getc)(void); +int (*udbg_getc_poll)(void); + +/* + * Early debugging facilities. You can enable _one_ of these via .config, + * if you do so your kernel _will not boot_ on anything else. Be careful. 
+ */ +void __init udbg_early_init(void) +{ +#if defined(CONFIG_PPC_EARLY_DEBUG_LPAR) + /* For LPAR machines that have an HVC console on vterm 0 */ + udbg_init_debug_lpar(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_LPAR_HVSI) + /* For LPAR machines that have an HVSI console on vterm 0 */ + udbg_init_debug_lpar_hvsi(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_G5) + /* For use on Apple G5 machines */ + udbg_init_pmac_realmode(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_RTAS_PANEL) + /* RTAS panel debug */ + udbg_init_rtas_panel(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_RTAS_CONSOLE) + /* RTAS console debug */ + udbg_init_rtas_console(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_MAPLE) + /* Maple real mode debug */ + udbg_init_maple_realmode(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_BEAT) + udbg_init_debug_beat(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_PAS_REALMODE) + udbg_init_pas_realmode(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) + udbg_init_btext(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_44x) + /* PPC44x debug */ + udbg_init_44x_as1(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_40x) + /* PPC40x debug */ + udbg_init_40x_realmode(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_CPM) + udbg_init_cpm(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_USBGECKO) + udbg_init_usbgecko(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_MEMCONS) + /* In memory console */ + udbg_init_memcons(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_EHV_BC) + udbg_init_ehv_bc(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_PS3GELIC) + udbg_init_ps3gelic(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_OPAL_RAW) + udbg_init_debug_opal_raw(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_OPAL_HVSI) + udbg_init_debug_opal_hvsi(); +#endif + +#ifdef CONFIG_PPC_EARLY_DEBUG + console_loglevel = 10; + + register_early_udbg_console(); +#endif +} + +/* udbg library, used by xmon et al */ +void udbg_puts(const char *s) +{ + if (udbg_putc) { + char c; + + if (s && *s != '\0') { + while ((c = *s++) != '\0') + udbg_putc(c); + } + + if (udbg_flush) + udbg_flush(); + } +#if 0 
+ else { + printk("%s", s); + } +#endif +} + +int udbg_write(const char *s, int n) +{ + int remain = n; + char c; + + if (!udbg_putc) + return 0; + + if (s && *s != '\0') { + while (((c = *s++) != '\0') && (remain-- > 0)) { + udbg_putc(c); + } + } + + if (udbg_flush) + udbg_flush(); + + return n - remain; +} + +#define UDBG_BUFSIZE 256 +void udbg_printf(const char *fmt, ...) +{ + char buf[UDBG_BUFSIZE]; + va_list args; + + va_start(args, fmt); + vsnprintf(buf, UDBG_BUFSIZE, fmt, args); + udbg_puts(buf); + va_end(args); +} + +void __init udbg_progress(char *s, unsigned short hex) +{ + udbg_puts(s); + udbg_puts("\n"); +} + +/* + * Early boot console based on udbg + */ +static void udbg_console_write(struct console *con, const char *s, + unsigned int n) +{ + udbg_write(s, n); +} + +static struct console udbg_console = { + .name = "udbg", + .write = udbg_console_write, + .flags = CON_PRINTBUFFER | CON_ENABLED | CON_BOOT | CON_ANYTIME, + .index = 0, +}; + +/* + * Called by setup_system after ppc_md->probe and ppc_md->early_init. + * Call it again after setting udbg_putc in ppc_md->setup_arch. 
+ */ +void __init register_early_udbg_console(void) +{ + if (early_console) + return; + + if (!udbg_putc) + return; + + if (strstr(boot_command_line, "udbg-immortal")) { + printk(KERN_INFO "early console immortal !\n"); + udbg_console.flags &= ~CON_BOOT; + } + early_console = &udbg_console; + register_console(&udbg_console); +} + +#if 0 /* if you want to use this as a regular output console */ +console_initcall(register_udbg_console); +#endif diff --git a/arch/powerpc/kernel/udbg_16550.c b/arch/powerpc/kernel/udbg_16550.c new file mode 100644 index 00000000000..6e7c4923b5e --- /dev/null +++ b/arch/powerpc/kernel/udbg_16550.c @@ -0,0 +1,298 @@ +/* + * udbg for NS16550 compatible serial ports + * + * Copyright (C) 2001-2005 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <linux/types.h> +#include <asm/udbg.h> +#include <asm/io.h> +#include <asm/reg_a2.h> + +extern u8 real_readb(volatile u8 __iomem *addr); +extern void real_writeb(u8 data, volatile u8 __iomem *addr); +extern u8 real_205_readb(volatile u8 __iomem *addr); +extern void real_205_writeb(u8 data, volatile u8 __iomem *addr); + +#define UART_RBR 0 +#define UART_IER 1 +#define UART_FCR 2 +#define UART_LCR 3 +#define UART_MCR 4 +#define UART_LSR 5 +#define UART_MSR 6 +#define UART_SCR 7 +#define UART_THR UART_RBR +#define UART_IIR UART_FCR +#define UART_DLL UART_RBR +#define UART_DLM UART_IER +#define UART_DLAB UART_LCR + +#define LSR_DR 0x01 /* Data ready */ +#define LSR_OE 0x02 /* Overrun */ +#define LSR_PE 0x04 /* Parity error */ +#define LSR_FE 0x08 /* Framing error */ +#define LSR_BI 0x10 /* Break */ +#define LSR_THRE 0x20 /* Xmit holding register empty */ +#define LSR_TEMT 0x40 /* Xmitter empty */ +#define LSR_ERR 0x80 /* Error */ + +#define LCR_DLAB 0x80 + +static u8 (*udbg_uart_in)(unsigned int reg); +static void (*udbg_uart_out)(unsigned int reg, u8 data); + +static void udbg_uart_flush(void) +{ + if (!udbg_uart_in) + return; + + /* wait for idle */ + while ((udbg_uart_in(UART_LSR) & LSR_THRE) == 0) + cpu_relax(); +} + +static void udbg_uart_putc(char c) +{ + if (!udbg_uart_out) + return; + + if (c == '\n') + udbg_uart_putc('\r'); + udbg_uart_flush(); + udbg_uart_out(UART_THR, c); +} + +static int udbg_uart_getc_poll(void) +{ + if (!udbg_uart_in || !(udbg_uart_in(UART_LSR) & LSR_DR)) + return udbg_uart_in(UART_RBR); + return -1; +} + +static int udbg_uart_getc(void) +{ + if (!udbg_uart_in) + return -1; + /* wait for char */ + while (!(udbg_uart_in(UART_LSR) & LSR_DR)) + cpu_relax(); + return udbg_uart_in(UART_RBR); +} + +static void udbg_use_uart(void) +{ + udbg_putc = udbg_uart_putc; + udbg_flush = udbg_uart_flush; + udbg_getc = udbg_uart_getc; + udbg_getc_poll = udbg_uart_getc_poll; +} + +void udbg_uart_setup(unsigned int speed, unsigned int clock) 
+{ + unsigned int dll, base_bauds; + + if (!udbg_uart_out) + return; + + if (clock == 0) + clock = 1843200; + if (speed == 0) + speed = 9600; + + base_bauds = clock / 16; + dll = base_bauds / speed; + + udbg_uart_out(UART_LCR, 0x00); + udbg_uart_out(UART_IER, 0xff); + udbg_uart_out(UART_IER, 0x00); + udbg_uart_out(UART_LCR, LCR_DLAB); + udbg_uart_out(UART_DLL, dll & 0xff); + udbg_uart_out(UART_DLM, dll >> 8); + /* 8 data, 1 stop, no parity */ + udbg_uart_out(UART_LCR, 0x3); + /* RTS/DTR */ + udbg_uart_out(UART_MCR, 0x3); + /* Clear & enable FIFOs */ + udbg_uart_out(UART_FCR, 0x7); +} + +unsigned int udbg_probe_uart_speed(unsigned int clock) +{ + unsigned int dll, dlm, divisor, prescaler, speed; + u8 old_lcr; + + old_lcr = udbg_uart_in(UART_LCR); + + /* select divisor latch registers. */ + udbg_uart_out(UART_LCR, old_lcr | LCR_DLAB); + + /* now, read the divisor */ + dll = udbg_uart_in(UART_DLL); + dlm = udbg_uart_in(UART_DLM); + divisor = dlm << 8 | dll; + + /* check prescaling */ + if (udbg_uart_in(UART_MCR) & 0x80) + prescaler = 4; + else + prescaler = 1; + + /* restore the LCR */ + udbg_uart_out(UART_LCR, old_lcr); + + /* calculate speed */ + speed = (clock / prescaler) / (divisor * 16); + + /* sanity check */ + if (speed > (clock / 16)) + speed = 9600; + + return speed; +} + +static union { + unsigned char __iomem *mmio_base; + unsigned long pio_base; +} udbg_uart; + +static unsigned int udbg_uart_stride = 1; + +static u8 udbg_uart_in_pio(unsigned int reg) +{ + return inb(udbg_uart.pio_base + (reg * udbg_uart_stride)); +} + +static void udbg_uart_out_pio(unsigned int reg, u8 data) +{ + outb(data, udbg_uart.pio_base + (reg * udbg_uart_stride)); +} + +void udbg_uart_init_pio(unsigned long port, unsigned int stride) +{ + if (!port) + return; + udbg_uart.pio_base = port; + udbg_uart_stride = stride; + udbg_uart_in = udbg_uart_in_pio; + udbg_uart_out = udbg_uart_out_pio; + udbg_use_uart(); +} + +static u8 udbg_uart_in_mmio(unsigned int reg) +{ + return 
in_8(udbg_uart.mmio_base + (reg * udbg_uart_stride)); +} + +static void udbg_uart_out_mmio(unsigned int reg, u8 data) +{ + out_8(udbg_uart.mmio_base + (reg * udbg_uart_stride), data); +} + + +void udbg_uart_init_mmio(void __iomem *addr, unsigned int stride) +{ + if (!addr) + return; + udbg_uart.mmio_base = addr; + udbg_uart_stride = stride; + udbg_uart_in = udbg_uart_in_mmio; + udbg_uart_out = udbg_uart_out_mmio; + udbg_use_uart(); +} + +#ifdef CONFIG_PPC_MAPLE + +#define UDBG_UART_MAPLE_ADDR ((void __iomem *)0xf40003f8) + +static u8 udbg_uart_in_maple(unsigned int reg) +{ + return real_readb(UDBG_UART_MAPLE_ADDR + reg); +} + +static void udbg_uart_out_maple(unsigned int reg, u8 val) +{ + real_writeb(val, UDBG_UART_MAPLE_ADDR + reg); +} + +void __init udbg_init_maple_realmode(void) +{ + udbg_uart_in = udbg_uart_in_maple; + udbg_uart_out = udbg_uart_out_maple; + udbg_use_uart(); +} + +#endif /* CONFIG_PPC_MAPLE */ + +#ifdef CONFIG_PPC_PASEMI + +#define UDBG_UART_PAS_ADDR ((void __iomem *)0xfcff03f8UL) + +static u8 udbg_uart_in_pas(unsigned int reg) +{ + return real_205_readb(UDBG_UART_PAS_ADDR + reg); +} + +static void udbg_uart_out_pas(unsigned int reg, u8 val) +{ + real_205_writeb(val, UDBG_UART_PAS_ADDR + reg); +} + +void __init udbg_init_pas_realmode(void) +{ + udbg_uart_in = udbg_uart_in_pas; + udbg_uart_out = udbg_uart_out_pas; + udbg_use_uart(); +} + +#endif /* CONFIG_PPC_PASEMI */ + +#ifdef CONFIG_PPC_EARLY_DEBUG_44x + +#include <platforms/44x/44x.h> + +static u8 udbg_uart_in_44x_as1(unsigned int reg) +{ + return as1_readb((void __iomem *)PPC44x_EARLY_DEBUG_VIRTADDR + reg); +} + +static void udbg_uart_out_44x_as1(unsigned int reg, u8 val) +{ + as1_writeb(val, (void __iomem *)PPC44x_EARLY_DEBUG_VIRTADDR + reg); +} + +void __init udbg_init_44x_as1(void) +{ + udbg_uart_in = udbg_uart_in_44x_as1; + udbg_uart_out = udbg_uart_out_44x_as1; + udbg_use_uart(); +} + +#endif /* CONFIG_PPC_EARLY_DEBUG_44x */ + +#ifdef CONFIG_PPC_EARLY_DEBUG_40x + +static u8 
udbg_uart_in_40x(unsigned int reg) +{ + return real_readb((void __iomem *)CONFIG_PPC_EARLY_DEBUG_40x_PHYSADDR + + reg); +} + +static void udbg_uart_out_40x(unsigned int reg, u8 val) +{ + real_writeb(val, (void __iomem *)CONFIG_PPC_EARLY_DEBUG_40x_PHYSADDR + + reg); +} + +void __init udbg_init_40x_realmode(void) +{ + udbg_uart_in = udbg_uart_in_40x; + udbg_uart_out = udbg_uart_out_40x; + udbg_use_uart(); +} + +#endif /* CONFIG_PPC_EARLY_DEBUG_40x */ diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c new file mode 100644 index 00000000000..003b20964ea --- /dev/null +++ b/arch/powerpc/kernel/uprobes.c @@ -0,0 +1,207 @@ +/* + * User-space Probes (UProbes) for powerpc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2007-2012 + * + * Adapted from the x86 port by Ananth N Mavinakayanahalli <ananth@in.ibm.com> + */ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/ptrace.h> +#include <linux/uprobes.h> +#include <linux/uaccess.h> +#include <linux/kdebug.h> + +#include <asm/sstep.h> + +#define UPROBE_TRAP_NR UINT_MAX + +/** + * is_trap_insn - check if the instruction is a trap variant + * @insn: instruction to be checked. + * Returns true if @insn is a trap variant. 
+ */ +bool is_trap_insn(uprobe_opcode_t *insn) +{ + return (is_trap(*insn)); +} + +/** + * arch_uprobe_analyze_insn + * @mm: the probed address space. + * @arch_uprobe: the probepoint information. + * @addr: vaddr to probe. + * Return 0 on success or a -ve number on error. + */ +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, + struct mm_struct *mm, unsigned long addr) +{ + if (addr & 0x03) + return -EINVAL; + + return 0; +} + +/* + * arch_uprobe_pre_xol - prepare to execute out of line. + * @auprobe: the probepoint information. + * @regs: reflects the saved user state of current task. + */ +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct arch_uprobe_task *autask = ¤t->utask->autask; + + autask->saved_trap_nr = current->thread.trap_nr; + current->thread.trap_nr = UPROBE_TRAP_NR; + regs->nip = current->utask->xol_vaddr; + + user_enable_single_step(current); + return 0; +} + +/** + * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs + * @regs: Reflects the saved state of the task after it has hit a breakpoint + * instruction. + * Return the address of the breakpoint instruction. + */ +unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) +{ + return instruction_pointer(regs); +} + +/* + * If xol insn itself traps and generates a signal (SIGILL/SIGSEGV/etc), + * then detect the case where a singlestepped instruction jumps back to its + * own address. It is assumed that anything like do_page_fault/do_trap/etc + * sets thread.trap_nr != UINT_MAX. + * + * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr, + * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to + * UPROBE_TRAP_NR == UINT_MAX set by arch_uprobe_pre_xol(). + */ +bool arch_uprobe_xol_was_trapped(struct task_struct *t) +{ + if (t->thread.trap_nr != UPROBE_TRAP_NR) + return true; + + return false; +} + +/* + * Called after single-stepping. 
To avoid the SMP problems that can + * occur when we temporarily put back the original opcode to + * single-step, we single-stepped a copy of the instruction. + * + * This function prepares to resume execution after the single-step. + */ +int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); + + current->thread.trap_nr = utask->autask.saved_trap_nr; + + /* + * On powerpc, except for loads and stores, most instructions + * including ones that alter code flow (branches, calls, returns) + * are emulated in the kernel. We get here only if the emulation + * support doesn't exist and have to fix-up the next instruction + * to be executed. + */ + regs->nip = utask->vaddr + MAX_UINSN_BYTES; + + user_disable_single_step(current); + return 0; +} + +/* callback routine for handling exceptions. */ +int arch_uprobe_exception_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = data; + struct pt_regs *regs = args->regs; + + /* regs == NULL is a kernel bug */ + if (WARN_ON(!regs)) + return NOTIFY_DONE; + + /* We are only interested in userspace traps */ + if (!user_mode(regs)) + return NOTIFY_DONE; + + switch (val) { + case DIE_BPT: + if (uprobe_pre_sstep_notifier(regs)) + return NOTIFY_STOP; + break; + case DIE_SSTEP: + if (uprobe_post_sstep_notifier(regs)) + return NOTIFY_STOP; + default: + break; + } + return NOTIFY_DONE; +} + +/* + * This function gets called when XOL instruction either gets trapped or + * the thread has a fatal signal, so reset the instruction pointer to its + * probed address. 
+ */ +void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + current->thread.trap_nr = utask->autask.saved_trap_nr; + instruction_pointer_set(regs, utask->vaddr); + + user_disable_single_step(current); +} + +/* + * See if the instruction can be emulated. + * Returns true if instruction was emulated, false otherwise. + */ +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + int ret; + + /* + * emulate_step() returns 1 if the insn was successfully emulated. + * For all other cases, we need to single-step in hardware. + */ + ret = emulate_step(regs, auprobe->insn); + if (ret > 0) + return true; + + return false; +} + +unsigned long +arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) +{ + unsigned long orig_ret_vaddr; + + orig_ret_vaddr = regs->link; + + /* Replace the return addr with trampoline addr */ + regs->link = trampoline_vaddr; + + return orig_ret_vaddr; +} diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c new file mode 100644 index 00000000000..ce74c335a6a --- /dev/null +++ b/arch/powerpc/kernel/vdso.c @@ -0,0 +1,858 @@ + +/* + * Copyright (C) 2004 Benjamin Herrenschmidt, IBM Corp. + * <benh@kernel.crashing.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/stddef.h> +#include <linux/unistd.h> +#include <linux/slab.h> +#include <linux/user.h> +#include <linux/elf.h> +#include <linux/security.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> + +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/mmu.h> +#include <asm/mmu_context.h> +#include <asm/prom.h> +#include <asm/machdep.h> +#include <asm/cputable.h> +#include <asm/sections.h> +#include <asm/firmware.h> +#include <asm/vdso.h> +#include <asm/vdso_datapage.h> +#include <asm/setup.h> + +#undef DEBUG + +#ifdef DEBUG +#define DBG(fmt...) printk(fmt) +#else +#define DBG(fmt...) +#endif + +/* Max supported size for symbol names */ +#define MAX_SYMNAME 64 + +/* The alignment of the vDSO */ +#define VDSO_ALIGNMENT (1 << 16) + +extern char vdso32_start, vdso32_end; +static void *vdso32_kbase = &vdso32_start; +static unsigned int vdso32_pages; +static struct page **vdso32_pagelist; +unsigned long vdso32_sigtramp; +unsigned long vdso32_rt_sigtramp; + +#ifdef CONFIG_PPC64 +extern char vdso64_start, vdso64_end; +static void *vdso64_kbase = &vdso64_start; +static unsigned int vdso64_pages; +static struct page **vdso64_pagelist; +unsigned long vdso64_rt_sigtramp; +#endif /* CONFIG_PPC64 */ + +static int vdso_ready; + +/* + * The vdso data page (aka. systemcfg for old ppc64 fans) is here. 
+ * Once the early boot kernel code no longer needs to muck around + * with it, it will become dynamically allocated + */ +static union { + struct vdso_data data; + u8 page[PAGE_SIZE]; +} vdso_data_store __page_aligned_data; +struct vdso_data *vdso_data = &vdso_data_store.data; + +/* Format of the patch table */ +struct vdso_patch_def +{ + unsigned long ftr_mask, ftr_value; + const char *gen_name; + const char *fix_name; +}; + +/* Table of functions to patch based on the CPU type/revision + * + * Currently, we only change sync_dicache to do nothing on processors + * with a coherent icache + */ +static struct vdso_patch_def vdso_patches[] = { + { + CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE, + "__kernel_sync_dicache", "__kernel_sync_dicache_p5" + }, + { + CPU_FTR_USE_TB, 0, + "__kernel_gettimeofday", NULL + }, + { + CPU_FTR_USE_TB, 0, + "__kernel_clock_gettime", NULL + }, + { + CPU_FTR_USE_TB, 0, + "__kernel_clock_getres", NULL + }, + { + CPU_FTR_USE_TB, 0, + "__kernel_get_tbfreq", NULL + }, + { + CPU_FTR_USE_TB, 0, + "__kernel_time", NULL + }, +}; + +/* + * Some infos carried around for each of them during parsing at + * boot time. 
+ */ +struct lib32_elfinfo +{ + Elf32_Ehdr *hdr; /* ptr to ELF */ + Elf32_Sym *dynsym; /* ptr to .dynsym section */ + unsigned long dynsymsize; /* size of .dynsym section */ + char *dynstr; /* ptr to .dynstr section */ + unsigned long text; /* offset of .text section in .so */ +}; + +struct lib64_elfinfo +{ + Elf64_Ehdr *hdr; + Elf64_Sym *dynsym; + unsigned long dynsymsize; + char *dynstr; + unsigned long text; +}; + + +#ifdef __DEBUG +static void dump_one_vdso_page(struct page *pg, struct page *upg) +{ + printk("kpg: %p (c:%d,f:%08lx)", __va(page_to_pfn(pg) << PAGE_SHIFT), + page_count(pg), + pg->flags); + if (upg && !IS_ERR(upg) /* && pg != upg*/) { + printk(" upg: %p (c:%d,f:%08lx)", __va(page_to_pfn(upg) + << PAGE_SHIFT), + page_count(upg), + upg->flags); + } + printk("\n"); +} + +static void dump_vdso_pages(struct vm_area_struct * vma) +{ + int i; + + if (!vma || is_32bit_task()) { + printk("vDSO32 @ %016lx:\n", (unsigned long)vdso32_kbase); + for (i=0; i<vdso32_pages; i++) { + struct page *pg = virt_to_page(vdso32_kbase + + i*PAGE_SIZE); + struct page *upg = (vma && vma->vm_mm) ? + follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0) + : NULL; + dump_one_vdso_page(pg, upg); + } + } + if (!vma || !is_32bit_task()) { + printk("vDSO64 @ %016lx:\n", (unsigned long)vdso64_kbase); + for (i=0; i<vdso64_pages; i++) { + struct page *pg = virt_to_page(vdso64_kbase + + i*PAGE_SIZE); + struct page *upg = (vma && vma->vm_mm) ? 
+ follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0) + : NULL; + dump_one_vdso_page(pg, upg); + } + } +} +#endif /* DEBUG */ + +/* + * This is called from binfmt_elf, we create the special vma for the + * vDSO and insert it into the mm struct tree + */ +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + struct mm_struct *mm = current->mm; + struct page **vdso_pagelist; + unsigned long vdso_pages; + unsigned long vdso_base; + int rc; + + if (!vdso_ready) + return 0; + +#ifdef CONFIG_PPC64 + if (is_32bit_task()) { + vdso_pagelist = vdso32_pagelist; + vdso_pages = vdso32_pages; + vdso_base = VDSO32_MBASE; + } else { + vdso_pagelist = vdso64_pagelist; + vdso_pages = vdso64_pages; + /* + * On 64bit we don't have a preferred map address. This + * allows get_unmapped_area to find an area near other mmaps + * and most likely share a SLB entry. + */ + vdso_base = 0; + } +#else + vdso_pagelist = vdso32_pagelist; + vdso_pages = vdso32_pages; + vdso_base = VDSO32_MBASE; +#endif + + current->mm->context.vdso_base = 0; + + /* vDSO has a problem and was disabled, just don't "enable" it for the + * process + */ + if (vdso_pages == 0) + return 0; + /* Add a page to the vdso size for the data page */ + vdso_pages ++; + + /* + * pick a base address for the vDSO in process space. We try to put it + * at vdso_base which is the "natural" base for it, but we might fail + * and end up putting it elsewhere. + * Add enough to the size so that the result can be aligned. + */ + down_write(&mm->mmap_sem); + vdso_base = get_unmapped_area(NULL, vdso_base, + (vdso_pages << PAGE_SHIFT) + + ((VDSO_ALIGNMENT - 1) & PAGE_MASK), + 0, 0); + if (IS_ERR_VALUE(vdso_base)) { + rc = vdso_base; + goto fail_mmapsem; + } + + /* Add required alignment. */ + vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT); + + /* + * Put vDSO base into mm struct. 
We need to do this before calling + * install_special_mapping or the perf counter mmap tracking code + * will fail to recognise it as a vDSO (since arch_vma_name fails). + */ + current->mm->context.vdso_base = vdso_base; + + /* + * our vma flags don't have VM_WRITE so by default, the process isn't + * allowed to write those pages. + * gdb can break that with ptrace interface, and thus trigger COW on + * those pages but it's then your responsibility to never do that on + * the "data" page of the vDSO or you'll stop getting kernel updates + * and your nice userland gettimeofday will be totally dead. + * It's fine to use that for setting breakpoints in the vDSO code + * pages though. + */ + rc = install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + vdso_pagelist); + if (rc) { + current->mm->context.vdso_base = 0; + goto fail_mmapsem; + } + + up_write(&mm->mmap_sem); + return 0; + + fail_mmapsem: + up_write(&mm->mmap_sem); + return rc; +} + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == vma->vm_mm->context.vdso_base) + return "[vdso]"; + return NULL; +} + + + +/* + * Look up the section named @secname in the 32-bit ELF image at @ehdr. + * Returns a pointer to the section contents, or NULL if no section has + * that name. When @size is non-NULL it receives the section size, or 0 + * on failure. + */ +static void * __init find_section32(Elf32_Ehdr *ehdr, const char *secname, + unsigned long *size) +{ + Elf32_Shdr *sechdrs; + unsigned int i; + char *secnames; + + /* Grab section headers and strings so we can tell who is who */ + sechdrs = (void *)ehdr + ehdr->e_shoff; + secnames = (void *)ehdr + sechdrs[ehdr->e_shstrndx].sh_offset; + + /* Find the section they want */ + for (i = 1; i < ehdr->e_shnum; i++) { + if (strcmp(secnames+sechdrs[i].sh_name, secname) == 0) { + if (size) + *size = sechdrs[i].sh_size; + return (void *)ehdr + sechdrs[i].sh_offset; + } + } + /* @size is optional, as in the match path above */ + if (size) + *size = 0; + return NULL; +} + +static Elf32_Sym * __init find_symbol32(struct lib32_elfinfo *lib, + const char *symname) +{ + unsigned int i; + char name[MAX_SYMNAME], *c; + + for (i = 0; i < (lib->dynsymsize / sizeof(Elf32_Sym)); i++) { 
+ if (lib->dynsym[i].st_name == 0) + continue; + strlcpy(name, lib->dynstr + lib->dynsym[i].st_name, + MAX_SYMNAME); + c = strchr(name, '@'); + if (c) + *c = 0; + if (strcmp(symname, name) == 0) + return &lib->dynsym[i]; + } + return NULL; +} + +/* Note that we assume the section is .text and the symbol is relative to + * the library base + */ +static unsigned long __init find_function32(struct lib32_elfinfo *lib, + const char *symname) +{ + Elf32_Sym *sym = find_symbol32(lib, symname); + + if (sym == NULL) { + printk(KERN_WARNING "vDSO32: function %s not found !\n", + symname); + return 0; + } + return sym->st_value - VDSO32_LBASE; +} + +static int __init vdso_do_func_patch32(struct lib32_elfinfo *v32, + struct lib64_elfinfo *v64, + const char *orig, const char *fix) +{ + Elf32_Sym *sym32_gen, *sym32_fix; + + sym32_gen = find_symbol32(v32, orig); + if (sym32_gen == NULL) { + printk(KERN_ERR "vDSO32: Can't find symbol %s !\n", orig); + return -1; + } + if (fix == NULL) { + sym32_gen->st_name = 0; + return 0; + } + sym32_fix = find_symbol32(v32, fix); + if (sym32_fix == NULL) { + printk(KERN_ERR "vDSO32: Can't find symbol %s !\n", fix); + return -1; + } + sym32_gen->st_value = sym32_fix->st_value; + sym32_gen->st_size = sym32_fix->st_size; + sym32_gen->st_info = sym32_fix->st_info; + sym32_gen->st_other = sym32_fix->st_other; + sym32_gen->st_shndx = sym32_fix->st_shndx; + + return 0; +} + + +#ifdef CONFIG_PPC64 + +static void * __init find_section64(Elf64_Ehdr *ehdr, const char *secname, + unsigned long *size) +{ + Elf64_Shdr *sechdrs; + unsigned int i; + char *secnames; + + /* Grab section headers and strings so we can tell who is who */ + sechdrs = (void *)ehdr + ehdr->e_shoff; + secnames = (void *)ehdr + sechdrs[ehdr->e_shstrndx].sh_offset; + + /* Find the section they want */ + for (i = 1; i < ehdr->e_shnum; i++) { + if (strcmp(secnames+sechdrs[i].sh_name, secname) == 0) { + if (size) + *size = sechdrs[i].sh_size; + return (void *)ehdr + sechdrs[i].sh_offset; + 
} + } + if (size) + *size = 0; + return NULL; +} + +static Elf64_Sym * __init find_symbol64(struct lib64_elfinfo *lib, + const char *symname) +{ + unsigned int i; + char name[MAX_SYMNAME], *c; + + for (i = 0; i < (lib->dynsymsize / sizeof(Elf64_Sym)); i++) { + if (lib->dynsym[i].st_name == 0) + continue; + strlcpy(name, lib->dynstr + lib->dynsym[i].st_name, + MAX_SYMNAME); + c = strchr(name, '@'); + if (c) + *c = 0; + if (strcmp(symname, name) == 0) + return &lib->dynsym[i]; + } + return NULL; +} + +/* Note that we assume the section is .text and the symbol is relative to + * the library base + */ +static unsigned long __init find_function64(struct lib64_elfinfo *lib, + const char *symname) +{ + Elf64_Sym *sym = find_symbol64(lib, symname); + + if (sym == NULL) { + printk(KERN_WARNING "vDSO64: function %s not found !\n", + symname); + return 0; + } +#ifdef VDS64_HAS_DESCRIPTORS + return *((u64 *)(vdso64_kbase + sym->st_value - VDSO64_LBASE)) - + VDSO64_LBASE; +#else + return sym->st_value - VDSO64_LBASE; +#endif +} + +static int __init vdso_do_func_patch64(struct lib32_elfinfo *v32, + struct lib64_elfinfo *v64, + const char *orig, const char *fix) +{ + Elf64_Sym *sym64_gen, *sym64_fix; + + sym64_gen = find_symbol64(v64, orig); + if (sym64_gen == NULL) { + printk(KERN_ERR "vDSO64: Can't find symbol %s !\n", orig); + return -1; + } + if (fix == NULL) { + sym64_gen->st_name = 0; + return 0; + } + sym64_fix = find_symbol64(v64, fix); + if (sym64_fix == NULL) { + printk(KERN_ERR "vDSO64: Can't find symbol %s !\n", fix); + return -1; + } + sym64_gen->st_value = sym64_fix->st_value; + sym64_gen->st_size = sym64_fix->st_size; + sym64_gen->st_info = sym64_fix->st_info; + sym64_gen->st_other = sym64_fix->st_other; + sym64_gen->st_shndx = sym64_fix->st_shndx; + + return 0; +} + +#endif /* CONFIG_PPC64 */ + + +static __init int vdso_do_find_sections(struct lib32_elfinfo *v32, + struct lib64_elfinfo *v64) +{ + void *sect; + + /* + * Locate symbol tables & text section + */ + + 
v32->dynsym = find_section32(v32->hdr, ".dynsym", &v32->dynsymsize); + v32->dynstr = find_section32(v32->hdr, ".dynstr", NULL); + if (v32->dynsym == NULL || v32->dynstr == NULL) { + printk(KERN_ERR "vDSO32: required symbol section not found\n"); + return -1; + } + sect = find_section32(v32->hdr, ".text", NULL); + if (sect == NULL) { + printk(KERN_ERR "vDSO32: the .text section was not found\n"); + return -1; + } + v32->text = sect - vdso32_kbase; + +#ifdef CONFIG_PPC64 + v64->dynsym = find_section64(v64->hdr, ".dynsym", &v64->dynsymsize); + v64->dynstr = find_section64(v64->hdr, ".dynstr", NULL); + if (v64->dynsym == NULL || v64->dynstr == NULL) { + printk(KERN_ERR "vDSO64: required symbol section not found\n"); + return -1; + } + sect = find_section64(v64->hdr, ".text", NULL); + if (sect == NULL) { + printk(KERN_ERR "vDSO64: the .text section was not found\n"); + return -1; + } + v64->text = sect - vdso64_kbase; +#endif /* CONFIG_PPC64 */ + + return 0; +} + +static __init void vdso_setup_trampolines(struct lib32_elfinfo *v32, + struct lib64_elfinfo *v64) +{ + /* + * Find signal trampolines + */ + +#ifdef CONFIG_PPC64 + vdso64_rt_sigtramp = find_function64(v64, "__kernel_sigtramp_rt64"); +#endif + vdso32_sigtramp = find_function32(v32, "__kernel_sigtramp32"); + vdso32_rt_sigtramp = find_function32(v32, "__kernel_sigtramp_rt32"); +} + +static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32, + struct lib64_elfinfo *v64) +{ + Elf32_Sym *sym32; +#ifdef CONFIG_PPC64 + Elf64_Sym *sym64; + + sym64 = find_symbol64(v64, "__kernel_datapage_offset"); + if (sym64 == NULL) { + printk(KERN_ERR "vDSO64: Can't find symbol " + "__kernel_datapage_offset !\n"); + return -1; + } + *((int *)(vdso64_kbase + sym64->st_value - VDSO64_LBASE)) = + (vdso64_pages << PAGE_SHIFT) - + (sym64->st_value - VDSO64_LBASE); +#endif /* CONFIG_PPC64 */ + + sym32 = find_symbol32(v32, "__kernel_datapage_offset"); + if (sym32 == NULL) { + printk(KERN_ERR "vDSO32: Can't find symbol " + 
"__kernel_datapage_offset !\n"); + return -1; + } + *((int *)(vdso32_kbase + (sym32->st_value - VDSO32_LBASE))) = + (vdso32_pages << PAGE_SHIFT) - + (sym32->st_value - VDSO32_LBASE); + + return 0; +} + + +static __init int vdso_fixup_features(struct lib32_elfinfo *v32, + struct lib64_elfinfo *v64) +{ + void *start32; + unsigned long size32; + +#ifdef CONFIG_PPC64 + void *start64; + unsigned long size64; + + start64 = find_section64(v64->hdr, "__ftr_fixup", &size64); + if (start64) + do_feature_fixups(cur_cpu_spec->cpu_features, + start64, start64 + size64); + + start64 = find_section64(v64->hdr, "__mmu_ftr_fixup", &size64); + if (start64) + do_feature_fixups(cur_cpu_spec->mmu_features, + start64, start64 + size64); + + start64 = find_section64(v64->hdr, "__fw_ftr_fixup", &size64); + if (start64) + do_feature_fixups(powerpc_firmware_features, + start64, start64 + size64); + + start64 = find_section64(v64->hdr, "__lwsync_fixup", &size64); + if (start64) + do_lwsync_fixups(cur_cpu_spec->cpu_features, + start64, start64 + size64); +#endif /* CONFIG_PPC64 */ + + start32 = find_section32(v32->hdr, "__ftr_fixup", &size32); + if (start32) + do_feature_fixups(cur_cpu_spec->cpu_features, + start32, start32 + size32); + + start32 = find_section32(v32->hdr, "__mmu_ftr_fixup", &size32); + if (start32) + do_feature_fixups(cur_cpu_spec->mmu_features, + start32, start32 + size32); + +#ifdef CONFIG_PPC64 + start32 = find_section32(v32->hdr, "__fw_ftr_fixup", &size32); + if (start32) + do_feature_fixups(powerpc_firmware_features, + start32, start32 + size32); +#endif /* CONFIG_PPC64 */ + + start32 = find_section32(v32->hdr, "__lwsync_fixup", &size32); + if (start32) + do_lwsync_fixups(cur_cpu_spec->cpu_features, + start32, start32 + size32); + + return 0; +} + +static __init int vdso_fixup_alt_funcs(struct lib32_elfinfo *v32, + struct lib64_elfinfo *v64) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(vdso_patches); i++) { + struct vdso_patch_def *patch = &vdso_patches[i]; + int match = 
(cur_cpu_spec->cpu_features & patch->ftr_mask) + == patch->ftr_value; + if (!match) + continue; + + DBG("replacing %s with %s...\n", patch->gen_name, + patch->fix_name ? patch->fix_name : "NONE"); + + /* + * Patch the 32 bits and 64 bits symbols. Note that we do not + * patch the "." symbol on 64 bits. + * It would be easy to do, but doesn't seem to be necessary, + * patching the OPD symbol is enough. + */ + vdso_do_func_patch32(v32, v64, patch->gen_name, + patch->fix_name); +#ifdef CONFIG_PPC64 + vdso_do_func_patch64(v32, v64, patch->gen_name, + patch->fix_name); +#endif /* CONFIG_PPC64 */ + } + + return 0; +} + + +static __init int vdso_setup(void) +{ + struct lib32_elfinfo v32; + struct lib64_elfinfo v64; + + v32.hdr = vdso32_kbase; +#ifdef CONFIG_PPC64 + v64.hdr = vdso64_kbase; +#endif + if (vdso_do_find_sections(&v32, &v64)) + return -1; + + if (vdso_fixup_datapage(&v32, &v64)) + return -1; + + if (vdso_fixup_features(&v32, &v64)) + return -1; + + if (vdso_fixup_alt_funcs(&v32, &v64)) + return -1; + + vdso_setup_trampolines(&v32, &v64); + + return 0; +} + +/* + * Called from setup_arch to initialize the bitmap of available + * syscalls in the systemcfg page + */ +static void __init vdso_setup_syscall_map(void) +{ + unsigned int i; + extern unsigned long *sys_call_table; + extern unsigned long sys_ni_syscall; + + + for (i = 0; i < __NR_syscalls; i++) { +#ifdef CONFIG_PPC64 + if (sys_call_table[i*2] != sys_ni_syscall) + vdso_data->syscall_map_64[i >> 5] |= + 0x80000000UL >> (i & 0x1f); + if (sys_call_table[i*2+1] != sys_ni_syscall) + vdso_data->syscall_map_32[i >> 5] |= + 0x80000000UL >> (i & 0x1f); +#else /* CONFIG_PPC64 */ + if (sys_call_table[i] != sys_ni_syscall) + vdso_data->syscall_map_32[i >> 5] |= + 0x80000000UL >> (i & 0x1f); +#endif /* CONFIG_PPC64 */ + } +} + +#ifdef CONFIG_PPC64 +int vdso_getcpu_init(void) +{ + unsigned long cpu, node, val; + + /* + * SPRG_VDSO contains the CPU in the bottom 16 bits and the NUMA node + * in the next 16 bits. 
The VDSO uses this to implement getcpu(). + */ + cpu = get_cpu(); + WARN_ON_ONCE(cpu > 0xffff); + + node = cpu_to_node(cpu); + WARN_ON_ONCE(node > 0xffff); + + val = (cpu & 0xffff) | ((node & 0xffff) << 16); + mtspr(SPRN_SPRG_VDSO_WRITE, val); + get_paca()->sprg_vdso = val; + + put_cpu(); + + return 0; +} +/* We need to call this before SMP init */ +early_initcall(vdso_getcpu_init); +#endif + +static int __init vdso_init(void) +{ + int i; + +#ifdef CONFIG_PPC64 + /* + * Fill up the "systemcfg" stuff for backward compatibility + */ + strcpy((char *)vdso_data->eye_catcher, "SYSTEMCFG:PPC64"); + vdso_data->version.major = SYSTEMCFG_MAJOR; + vdso_data->version.minor = SYSTEMCFG_MINOR; + vdso_data->processor = mfspr(SPRN_PVR); + /* + * Fake the old platform number for pSeries and add + * in LPAR bit if necessary + */ + vdso_data->platform = 0x100; + if (firmware_has_feature(FW_FEATURE_LPAR)) + vdso_data->platform |= 1; + vdso_data->physicalMemorySize = memblock_phys_mem_size(); + vdso_data->dcache_size = ppc64_caches.dsize; + vdso_data->dcache_line_size = ppc64_caches.dline_size; + vdso_data->icache_size = ppc64_caches.isize; + vdso_data->icache_line_size = ppc64_caches.iline_size; + + /* XXXOJN: Blocks should be added to ppc64_caches and used instead */ + vdso_data->dcache_block_size = ppc64_caches.dline_size; + vdso_data->icache_block_size = ppc64_caches.iline_size; + vdso_data->dcache_log_block_size = ppc64_caches.log_dline_size; + vdso_data->icache_log_block_size = ppc64_caches.log_iline_size; + + /* + * Calculate the size of the 64 bits vDSO + */ + vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT; + DBG("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages); +#else + vdso_data->dcache_block_size = L1_CACHE_BYTES; + vdso_data->dcache_log_block_size = L1_CACHE_SHIFT; + vdso_data->icache_block_size = L1_CACHE_BYTES; + vdso_data->icache_log_block_size = L1_CACHE_SHIFT; +#endif /* CONFIG_PPC64 */ + + + /* + * Calculate the size of the 32 bits vDSO + */ + 
vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT; + DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages); + + + /* + * Setup the syscall map in the vDSO + */ + vdso_setup_syscall_map(); + + /* + * Initialize the vDSO images in memory, that is do necessary + * fixups of vDSO symbols, locate trampolines, etc... + */ + if (vdso_setup()) { + printk(KERN_ERR "vDSO setup failure, not enabled !\n"); + vdso32_pages = 0; +#ifdef CONFIG_PPC64 + vdso64_pages = 0; +#endif + return 0; + } + + /* Make sure pages are in the correct state */ + vdso32_pagelist = kzalloc(sizeof(struct page *) * (vdso32_pages + 2), + GFP_KERNEL); + BUG_ON(vdso32_pagelist == NULL); + for (i = 0; i < vdso32_pages; i++) { + struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); + ClearPageReserved(pg); + get_page(pg); + vdso32_pagelist[i] = pg; + } + vdso32_pagelist[i++] = virt_to_page(vdso_data); + vdso32_pagelist[i] = NULL; + +#ifdef CONFIG_PPC64 + vdso64_pagelist = kzalloc(sizeof(struct page *) * (vdso64_pages + 2), + GFP_KERNEL); + BUG_ON(vdso64_pagelist == NULL); + for (i = 0; i < vdso64_pages; i++) { + struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); + ClearPageReserved(pg); + get_page(pg); + vdso64_pagelist[i] = pg; + } + vdso64_pagelist[i++] = virt_to_page(vdso_data); + vdso64_pagelist[i] = NULL; +#endif /* CONFIG_PPC64 */ + + get_page(virt_to_page(vdso_data)); + + smp_wmb(); + vdso_ready = 1; + + return 0; +} +arch_initcall(vdso_init); + +int in_gate_area_no_mm(unsigned long addr) +{ + return 0; +} + +int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + return 0; +} + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ + return NULL; +} + diff --git a/arch/powerpc/kernel/vdso32/.gitignore b/arch/powerpc/kernel/vdso32/.gitignore new file mode 100644 index 00000000000..fea5809857a --- /dev/null +++ b/arch/powerpc/kernel/vdso32/.gitignore @@ -0,0 +1,2 @@ +vdso32.lds +vdso32.so.dbg diff --git a/arch/powerpc/kernel/vdso32/Makefile 
b/arch/powerpc/kernel/vdso32/Makefile new file mode 100644 index 00000000000..53e6c9b979e --- /dev/null +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -0,0 +1,58 @@ + +# List of files in the vdso, has to be asm only for now + +obj-vdso32-$(CONFIG_PPC64) = getcpu.o +obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o \ + $(obj-vdso32-y) + +# Build rules + +ifeq ($(CONFIG_PPC32),y) +CROSS32CC := $(CC) +endif + +targets := $(obj-vdso32) vdso32.so vdso32.so.dbg +obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) + +GCOV_PROFILE := n + +ccflags-y := -shared -fno-common -fno-builtin +ccflags-y += -nostdlib -Wl,-soname=linux-vdso32.so.1 \ + $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) +asflags-y := -D__VDSO32__ -s + +obj-y += vdso32_wrapper.o +extra-y += vdso32.lds +CPPFLAGS_vdso32.lds += -P -C -Upowerpc + +# Force dependency (incbin is bad) +$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so + +# link rule for the .so file, .lds has to be first +$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) + $(call if_changed,vdso32ld) + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +# assembly rules for the .S files +$(obj-vdso32): %.o: %.S + $(call if_changed_dep,vdso32as) + +# actual build commands +quiet_cmd_vdso32ld = VDSO32L $@ + cmd_vdso32ld = $(CROSS32CC) $(c_flags) -Wl,-T $^ -o $@ +quiet_cmd_vdso32as = VDSO32A $@ + cmd_vdso32as = $(CROSS32CC) $(a_flags) -c -o $@ $< + +# install commands for the unstripped file +quiet_cmd_vdso_install = INSTALL $@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ + +vdso32.so: $(obj)/vdso32.so.dbg + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +vdso_install: vdso32.so diff --git a/arch/powerpc/kernel/vdso32/cacheflush.S b/arch/powerpc/kernel/vdso32/cacheflush.S new file mode 100644 index 00000000000..1ba6feb71b3 --- /dev/null +++ b/arch/powerpc/kernel/vdso32/cacheflush.S @@ -0,0 +1,85 @@ +/* + * vDSO provided cache flush 
routines + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), + * IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/vdso.h> +#include <asm/asm-offsets.h> + + .text + +/* + * Default "generic" version of __kernel_sync_dicache. + * + * void __kernel_sync_dicache(unsigned long start, unsigned long end) + * + * Flushes the data cache & invalidate the instruction cache for the + * provided range [start, end[ + */ +V_FUNCTION_BEGIN(__kernel_sync_dicache) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + mr r11,r3 + bl __get_datapage@local + mtlr r12 + mr r10,r3 + + lwz r7,CFG_DCACHE_BLOCKSZ(r10) + addi r5,r7,-1 + andc r6,r11,r5 /* round low to line bdy */ + subf r8,r6,r4 /* compute length */ + add r8,r8,r5 /* ensure we get enough */ + lwz r9,CFG_DCACHE_LOGBLOCKSZ(r10) + srw. r8,r8,r9 /* compute line count */ + crclr cr0*4+so + beqlr /* nothing to do? */ + mtctr r8 +1: dcbst 0,r6 + add r6,r6,r7 + bdnz 1b + sync + +/* Now invalidate the instruction cache */ + + lwz r7,CFG_ICACHE_BLOCKSZ(r10) + addi r5,r7,-1 + andc r6,r11,r5 /* round low to line bdy */ + subf r8,r6,r4 /* compute length */ + add r8,r8,r5 + lwz r9,CFG_ICACHE_LOGBLOCKSZ(r10) + srw. r8,r8,r9 /* compute line count */ + crclr cr0*4+so + beqlr /* nothing to do? 
*/ + mtctr r8 +2: icbi 0,r6 + add r6,r6,r7 + bdnz 2b + isync + li r3,0 + blr + .cfi_endproc +V_FUNCTION_END(__kernel_sync_dicache) + + +/* + * POWER5 version of __kernel_sync_dicache + */ +V_FUNCTION_BEGIN(__kernel_sync_dicache_p5) + .cfi_startproc + crclr cr0*4+so + sync + isync + li r3,0 + blr + .cfi_endproc +V_FUNCTION_END(__kernel_sync_dicache_p5) + diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S new file mode 100644 index 00000000000..dc21e891d2e --- /dev/null +++ b/arch/powerpc/kernel/vdso32/datapage.S @@ -0,0 +1,85 @@ +/* + * Access to the shared data page by the vDSO & syscall map + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> +#include <asm/vdso.h> + + .text +V_FUNCTION_BEGIN(__get_datapage) + .cfi_startproc + /* We don't want that exposed or overridable as we want other objects + * to be able to bl directly to here + */ + .protected __get_datapage + .hidden __get_datapage + + mflr r0 + .cfi_register lr,r0 + + bcl 20,31,1f + .global __kernel_datapage_offset; +__kernel_datapage_offset: + .long 0 +1: + mflr r3 + mtlr r0 + lwz r0,0(r3) + add r3,r0,r3 + blr + .cfi_endproc +V_FUNCTION_END(__get_datapage) + +/* + * void *__kernel_get_syscall_map(unsigned int *syscall_count) ; + * + * returns a pointer to the syscall map. the map is agnostic to the + * size of "long", unlike kernel bitops, it stores bits from top to + * bottom so that memory actually contains a linear bitmap + * check for syscall N by testing bit (0x80000000 >> (N & 0x1f)) of + * 32 bits int at N >> 5. 
+ */ +V_FUNCTION_BEGIN(__kernel_get_syscall_map) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + mr r4,r3 + bl __get_datapage@local + mtlr r12 + addi r3,r3,CFG_SYSCALL_MAP32 + cmpli cr0,r4,0 + beqlr + li r0,__NR_syscalls + stw r0,0(r4) + crclr cr0*4+so + blr + .cfi_endproc +V_FUNCTION_END(__kernel_get_syscall_map) + +/* + * unsigned long long __kernel_get_tbfreq(void); + * + * returns the timebase frequency in HZ + */ +V_FUNCTION_BEGIN(__kernel_get_tbfreq) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + bl __get_datapage@local + lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3) + lwz r3,CFG_TB_TICKS_PER_SEC(r3) + mtlr r12 + crclr cr0*4+so + blr + .cfi_endproc +V_FUNCTION_END(__kernel_get_tbfreq) diff --git a/arch/powerpc/kernel/vdso32/getcpu.S b/arch/powerpc/kernel/vdso32/getcpu.S new file mode 100644 index 00000000000..23eb9a9441b --- /dev/null +++ b/arch/powerpc/kernel/vdso32/getcpu.S @@ -0,0 +1,45 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/ppc_asm.h> +#include <asm/vdso.h> + + .text +/* + * Exact prototype of getcpu + * + * int __kernel_getcpu(unsigned *cpu, unsigned *node); + * + */ +V_FUNCTION_BEGIN(__kernel_getcpu) + .cfi_startproc + mfspr r5,SPRN_SPRG_VDSO_READ + cmpwi cr0,r3,0 + cmpwi cr1,r4,0 + clrlwi r6,r5,16 + rlwinm r7,r5,16,31-15,31-0 + beq cr0,1f + stw r6,0(r3) +1: beq cr1,2f + stw r7,0(r4) +2: crclr cr0*4+so + li r3,0 /* always success */ + blr + .cfi_endproc +V_FUNCTION_END(__kernel_getcpu) diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S new file mode 100644 index 00000000000..6b2b69616e7 --- /dev/null +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -0,0 +1,298 @@ +/* + * Userland implementation of gettimeofday() for 32 bits processes in a + * ppc64 kernel for use in the vDSO + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org, + * IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/vdso.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> + +/* Offset for the low 32-bit part of a field of long type */ +#ifdef CONFIG_PPC64 +#define LOPART 4 +#define TSPEC_TV_SEC TSPC64_TV_SEC+LOPART +#else +#define LOPART 0 +#define TSPEC_TV_SEC TSPC32_TV_SEC +#endif + + .text +/* + * Exact prototype of gettimeofday + * + * int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); + * + */ +V_FUNCTION_BEGIN(__kernel_gettimeofday) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + + mr r10,r3 /* r10 saves tv */ + mr r11,r4 /* r11 saves tz */ + bl __get_datapage@local /* get data page */ + mr r9, r3 /* datapage ptr in r9 */ + cmplwi r10,0 /* check if tv is NULL */ + beq 3f + lis r7,1000000@ha /* load up USEC_PER_SEC */ + addi r7,r7,1000000@l /* so we get microseconds in r4 */ + bl __do_get_tspec@local /* get sec/usec from tb & kernel */ + stw r3,TVAL32_TV_SEC(r10) + stw r4,TVAL32_TV_USEC(r10) + +3: cmplwi r11,0 /* check if tz is NULL */ + beq 1f + lwz r4,CFG_TZ_MINUTEWEST(r9)/* fill tz */ + lwz r5,CFG_TZ_DSTTIME(r9) + stw r4,TZONE_TZ_MINWEST(r11) + stw r5,TZONE_TZ_DSTTIME(r11) + +1: mtlr r12 + crclr cr0*4+so + li r3,0 + blr + .cfi_endproc +V_FUNCTION_END(__kernel_gettimeofday) + +/* + * Exact prototype of clock_gettime() + * + * int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); + * + */ +V_FUNCTION_BEGIN(__kernel_clock_gettime) + .cfi_startproc + /* Check for supported clock IDs */ + cmpli cr0,r3,CLOCK_REALTIME + cmpli cr1,r3,CLOCK_MONOTONIC + cror cr0*4+eq,cr0*4+eq,cr1*4+eq + bne cr0,99f + + mflr r12 /* r12 saves lr */ + .cfi_register lr,r12 + mr r11,r4 /* r11 saves tp */ + bl __get_datapage@local /* get data page */ + mr r9,r3 /* datapage ptr in r9 */ + lis r7,NSEC_PER_SEC@h /* want nanoseconds */ + ori r7,r7,NSEC_PER_SEC@l +50: bl __do_get_tspec@local /* get sec/nsec from tb & kernel */ + bne cr1,80f /* not monotonic -> all done */ + + /* + * 
CLOCK_MONOTONIC + */ + + /* now we must fixup using wall to monotonic. We need to snapshot + * that value and do the counter trick again. Fortunately, we still + * have the counter value in r8 that was returned by __do_get_tspec. + * At this point, r3,r4 contain our sec/nsec values, r5 and r6 + * can be used, r7 contains NSEC_PER_SEC. + */ + + lwz r5,WTOM_CLOCK_SEC(r9) + lwz r6,WTOM_CLOCK_NSEC(r9) + + /* We now have our offset in r5,r6. We create a fake dependency + * on that value and re-check the counter + */ + or r0,r6,r5 + xor r0,r0,r0 + add r9,r9,r0 + lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9) + cmpl cr0,r8,r0 /* check if updated */ + bne- 50b + + /* Calculate and store result. Note that this mimics the C code, + * which may cause funny results if nsec goes negative... is that + * possible at all ? + */ + add r3,r3,r5 + add r4,r4,r6 + cmpw cr0,r4,r7 + cmpwi cr1,r4,0 + blt 1f + subf r4,r7,r4 + addi r3,r3,1 +1: bge cr1,80f + addi r3,r3,-1 + add r4,r4,r7 + +80: stw r3,TSPC32_TV_SEC(r11) + stw r4,TSPC32_TV_NSEC(r11) + + mtlr r12 + crclr cr0*4+so + li r3,0 + blr + + /* + * syscall fallback + */ +99: + li r0,__NR_clock_gettime + sc + blr + .cfi_endproc +V_FUNCTION_END(__kernel_clock_gettime) + + +/* + * Exact prototype of clock_getres() + * + * int __kernel_clock_getres(clockid_t clock_id, struct timespec *res); + * + */ +V_FUNCTION_BEGIN(__kernel_clock_getres) + .cfi_startproc + /* Check for supported clock IDs */ + cmpwi cr0,r3,CLOCK_REALTIME + cmpwi cr1,r3,CLOCK_MONOTONIC + cror cr0*4+eq,cr0*4+eq,cr1*4+eq + bne cr0,99f + + li r3,0 + cmpli cr0,r4,0 + crclr cr0*4+so + beqlr + lis r5,CLOCK_REALTIME_RES@h + ori r5,r5,CLOCK_REALTIME_RES@l + stw r3,TSPC32_TV_SEC(r4) + stw r5,TSPC32_TV_NSEC(r4) + blr + + /* + * syscall fallback + */ +99: + li r0,__NR_clock_getres + sc + blr + .cfi_endproc +V_FUNCTION_END(__kernel_clock_getres) + + +/* + * Exact prototype of time() + * + * time_t time(time_t *t); + * + */ +V_FUNCTION_BEGIN(__kernel_time) + .cfi_startproc + mflr r12 + 
.cfi_register lr,r12 + + mr r11,r3 /* r11 holds t */ + bl __get_datapage@local + mr r9, r3 /* datapage ptr in r9 */ + + lwz r3,STAMP_XTIME+TSPEC_TV_SEC(r9) + + cmplwi r11,0 /* check if t is NULL */ + beq 2f + stw r3,0(r11) /* store result at *t */ +2: mtlr r12 + crclr cr0*4+so + blr + .cfi_endproc +V_FUNCTION_END(__kernel_time) + +/* + * This is the core of clock_gettime() and gettimeofday(), + * it returns the current time in r3 (seconds) and r4. + * On entry, r7 gives the resolution of r4, either USEC_PER_SEC + * or NSEC_PER_SEC, giving r4 in microseconds or nanoseconds. + * It expects the datapage ptr in r9 and doesn't clobber it. + * It clobbers r0, r5 and r6. + * On return, r8 contains the counter value that can be reused. + * This clobbers cr0 but not any other cr field. + */ +__do_get_tspec: + .cfi_startproc + /* Check for update count & load values. We use the low + * order 32 bits of the update count + */ +1: lwz r8,(CFG_TB_UPDATE_COUNT+LOPART)(r9) + andi. r0,r8,1 /* pending update ? loop */ + bne- 1b + xor r0,r8,r8 /* create dependency */ + add r9,r9,r0 + + /* Load orig stamp (offset to TB) */ + lwz r5,CFG_TB_ORIG_STAMP(r9) + lwz r6,(CFG_TB_ORIG_STAMP+4)(r9) + + /* Get a stable TB value */ +#ifdef CONFIG_8xx +2: mftbu r3 + mftbl r4 + mftbu r0 +#else +2: mfspr r3, SPRN_TBRU + mfspr r4, SPRN_TBRL + mfspr r0, SPRN_TBRU +#endif + cmplw cr0,r3,r0 + bne- 2b + + /* Subtract tb orig stamp and shift left 12 bits. + */ + subfc r4,r6,r4 + subfe r0,r5,r3 + slwi r0,r0,12 + rlwimi. r0,r4,12,20,31 + slwi r4,r4,12 + + /* + * Load scale factor & do multiplication. + * We only use the high 32 bits of the tb_to_xs value. + * Even with a 1GHz timebase clock, the high 32 bits of + * tb_to_xs will be at least 4 million, so the error from + * ignoring the low 32 bits will be no more than 0.25ppm. 
+ * The error will just make the clock run very very slightly + * slow until the next time the kernel updates the VDSO data, + * at which point the clock will catch up to the kernel's value, + * so there is no long-term error accumulation. + */ + lwz r5,CFG_TB_TO_XS(r9) /* load values */ + mulhwu r4,r4,r5 + li r3,0 + + beq+ 4f /* skip high part computation if 0 */ + mulhwu r3,r0,r5 + mullw r5,r0,r5 + addc r4,r4,r5 + addze r3,r3 +4: + /* At this point, we have seconds since the xtime stamp + * as a 32.32 fixed-point number in r3 and r4. + * Load & add the xtime stamp. + */ + lwz r5,STAMP_XTIME+TSPEC_TV_SEC(r9) + lwz r6,STAMP_SEC_FRAC(r9) + addc r4,r4,r6 + adde r3,r3,r5 + + /* We create a fake dependency on the result in r3/r4 + * and re-check the counter + */ + or r6,r4,r3 + xor r0,r6,r6 + add r9,r9,r0 + lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9) + cmplw cr0,r8,r0 /* check if updated */ + bne- 1b + + mulhwu r4,r4,r7 /* convert to micro or nanoseconds */ + + blr + .cfi_endproc diff --git a/arch/powerpc/kernel/vdso32/note.S b/arch/powerpc/kernel/vdso32/note.S new file mode 100644 index 00000000000..d4b5be4f3d5 --- /dev/null +++ b/arch/powerpc/kernel/vdso32/note.S @@ -0,0 +1,25 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. 
+ */ + +#include <linux/uts.h> +#include <linux/version.h> + +#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \ + .section name, flags; \ + .balign 4; \ + .long 1f - 0f; /* name length */ \ + .long 3f - 2f; /* data length */ \ + .long type; /* note type */ \ +0: .asciz vendor; /* vendor name */ \ +1: .balign 4; \ +2: + +#define ASM_ELF_NOTE_END \ +3: .balign 4; /* pad out section */ \ + .previous + + ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0) + .long LINUX_VERSION_CODE + ASM_ELF_NOTE_END diff --git a/arch/powerpc/kernel/vdso32/sigtramp.S b/arch/powerpc/kernel/vdso32/sigtramp.S new file mode 100644 index 00000000000..cf0c9c9c24f --- /dev/null +++ b/arch/powerpc/kernel/vdso32/sigtramp.S @@ -0,0 +1,299 @@ +/* + * Signal trampolines for 32 bits processes in a ppc64 kernel for + * use in the vDSO + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp. + * Copyright (C) 2004 Alan Modra (amodra@au.ibm.com)), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/unistd.h> +#include <asm/vdso.h> + + .text + +/* The nop here is a hack. The dwarf2 unwind routines subtract 1 from + the return address to get an address in the middle of the presumed + call instruction. Since we don't have a call here, we artificially + extend the range covered by the unwind info by adding a nop before + the real start. */ + nop +V_FUNCTION_BEGIN(__kernel_sigtramp32) +.Lsig_start = . 
- 4 + li r0,__NR_sigreturn + sc +.Lsig_end: +V_FUNCTION_END(__kernel_sigtramp32) + +.Lsigrt_start: + nop +V_FUNCTION_BEGIN(__kernel_sigtramp_rt32) + li r0,__NR_rt_sigreturn + sc +.Lsigrt_end: +V_FUNCTION_END(__kernel_sigtramp_rt32) + + .section .eh_frame,"a",@progbits + +/* Register r1 can be found at offset 4 of a pt_regs structure. + A pointer to the pt_regs is stored in memory at the old sp plus PTREGS. */ +#define cfa_save \ + .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x23; .uleb128 RSIZE; /* DW_OP_plus_uconst */ \ + .byte 0x06; /* DW_OP_deref */ \ +9: + +/* Register REGNO can be found at offset OFS of a pt_regs structure. + A pointer to the pt_regs is stored in memory at the old sp plus PTREGS. */ +#define rsave(regno, ofs) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .ifne ofs; \ + .byte 0x23; .uleb128 ofs; /* DW_OP_plus_uconst */ \ + .endif; \ +9: + +/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16 + of the VMX reg struct. The VMX reg struct is at offset VREGS of + the pt_regs struct. This macro is for REGNO == 0, and contains + 'subroutines' that the other macros jump to. 
*/ +#define vsave_msr0(regno) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x30 + regno; /* DW_OP_lit0 */ \ +2: \ + .byte 0x40; /* DW_OP_lit16 */ \ + .byte 0x1e; /* DW_OP_mul */ \ +3: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x12; /* DW_OP_dup */ \ + .byte 0x23; /* DW_OP_plus_uconst */ \ + .uleb128 33*RSIZE; /* msr offset */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x0c; .long 1 << 25; /* DW_OP_const4u */ \ + .byte 0x1a; /* DW_OP_and */ \ + .byte 0x12; /* DW_OP_dup, ret 0 if bra taken */ \ + .byte 0x30; /* DW_OP_lit0 */ \ + .byte 0x29; /* DW_OP_eq */ \ + .byte 0x28; .short 0x7fff; /* DW_OP_bra to end */ \ + .byte 0x13; /* DW_OP_drop, pop the 0 */ \ + .byte 0x23; .uleb128 VREGS; /* DW_OP_plus_uconst */ \ + .byte 0x22; /* DW_OP_plus */ \ + .byte 0x2f; .short 0x7fff; /* DW_OP_skip to end */ \ +9: + +/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16 + of the VMX reg struct. REGNO is 1 thru 31. */ +#define vsave_msr1(regno) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x30 + regno; /* DW_OP_lit n */ \ + .byte 0x2f; .short 2b - 9f; /* DW_OP_skip */ \ +9: + +/* If msr bit 1<<25 is set, then VMX register REGNO is at offset OFS of + the VMX save block. */ +#define vsave_msr2(regno, ofs) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x0a; .short ofs; /* DW_OP_const2u */ \ + .byte 0x2f; .short 3b - 9f; /* DW_OP_skip */ \ +9: + +/* VMX register REGNO is at offset OFS of the VMX save area. 
*/ +#define vsave(regno, ofs) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x23; .uleb128 VREGS; /* DW_OP_plus_uconst */ \ + .byte 0x23; .uleb128 ofs; /* DW_OP_plus_uconst */ \ +9: + +/* This is where the pt_regs pointer can be found on the stack. */ +#define PTREGS 64+28 + +/* Size of regs. */ +#define RSIZE 4 + +/* This is the offset of the VMX regs. */ +#define VREGS 48*RSIZE+34*8 + +/* Describe where general purpose regs are saved. */ +#define EH_FRAME_GEN \ + cfa_save; \ + rsave ( 0, 0*RSIZE); \ + rsave ( 2, 2*RSIZE); \ + rsave ( 3, 3*RSIZE); \ + rsave ( 4, 4*RSIZE); \ + rsave ( 5, 5*RSIZE); \ + rsave ( 6, 6*RSIZE); \ + rsave ( 7, 7*RSIZE); \ + rsave ( 8, 8*RSIZE); \ + rsave ( 9, 9*RSIZE); \ + rsave (10, 10*RSIZE); \ + rsave (11, 11*RSIZE); \ + rsave (12, 12*RSIZE); \ + rsave (13, 13*RSIZE); \ + rsave (14, 14*RSIZE); \ + rsave (15, 15*RSIZE); \ + rsave (16, 16*RSIZE); \ + rsave (17, 17*RSIZE); \ + rsave (18, 18*RSIZE); \ + rsave (19, 19*RSIZE); \ + rsave (20, 20*RSIZE); \ + rsave (21, 21*RSIZE); \ + rsave (22, 22*RSIZE); \ + rsave (23, 23*RSIZE); \ + rsave (24, 24*RSIZE); \ + rsave (25, 25*RSIZE); \ + rsave (26, 26*RSIZE); \ + rsave (27, 27*RSIZE); \ + rsave (28, 28*RSIZE); \ + rsave (29, 29*RSIZE); \ + rsave (30, 30*RSIZE); \ + rsave (31, 31*RSIZE); \ + rsave (67, 32*RSIZE); /* ap, used as temp for nip */ \ + rsave (65, 36*RSIZE); /* lr */ \ + rsave (70, 38*RSIZE) /* cr */ + +/* Describe where the FP regs are saved. 
*/ +#define EH_FRAME_FP \ + rsave (32, 48*RSIZE + 0*8); \ + rsave (33, 48*RSIZE + 1*8); \ + rsave (34, 48*RSIZE + 2*8); \ + rsave (35, 48*RSIZE + 3*8); \ + rsave (36, 48*RSIZE + 4*8); \ + rsave (37, 48*RSIZE + 5*8); \ + rsave (38, 48*RSIZE + 6*8); \ + rsave (39, 48*RSIZE + 7*8); \ + rsave (40, 48*RSIZE + 8*8); \ + rsave (41, 48*RSIZE + 9*8); \ + rsave (42, 48*RSIZE + 10*8); \ + rsave (43, 48*RSIZE + 11*8); \ + rsave (44, 48*RSIZE + 12*8); \ + rsave (45, 48*RSIZE + 13*8); \ + rsave (46, 48*RSIZE + 14*8); \ + rsave (47, 48*RSIZE + 15*8); \ + rsave (48, 48*RSIZE + 16*8); \ + rsave (49, 48*RSIZE + 17*8); \ + rsave (50, 48*RSIZE + 18*8); \ + rsave (51, 48*RSIZE + 19*8); \ + rsave (52, 48*RSIZE + 20*8); \ + rsave (53, 48*RSIZE + 21*8); \ + rsave (54, 48*RSIZE + 22*8); \ + rsave (55, 48*RSIZE + 23*8); \ + rsave (56, 48*RSIZE + 24*8); \ + rsave (57, 48*RSIZE + 25*8); \ + rsave (58, 48*RSIZE + 26*8); \ + rsave (59, 48*RSIZE + 27*8); \ + rsave (60, 48*RSIZE + 28*8); \ + rsave (61, 48*RSIZE + 29*8); \ + rsave (62, 48*RSIZE + 30*8); \ + rsave (63, 48*RSIZE + 31*8) + +/* Describe where the VMX regs are saved. 
*/ +#ifdef CONFIG_ALTIVEC +#define EH_FRAME_VMX \ + vsave_msr0 ( 0); \ + vsave_msr1 ( 1); \ + vsave_msr1 ( 2); \ + vsave_msr1 ( 3); \ + vsave_msr1 ( 4); \ + vsave_msr1 ( 5); \ + vsave_msr1 ( 6); \ + vsave_msr1 ( 7); \ + vsave_msr1 ( 8); \ + vsave_msr1 ( 9); \ + vsave_msr1 (10); \ + vsave_msr1 (11); \ + vsave_msr1 (12); \ + vsave_msr1 (13); \ + vsave_msr1 (14); \ + vsave_msr1 (15); \ + vsave_msr1 (16); \ + vsave_msr1 (17); \ + vsave_msr1 (18); \ + vsave_msr1 (19); \ + vsave_msr1 (20); \ + vsave_msr1 (21); \ + vsave_msr1 (22); \ + vsave_msr1 (23); \ + vsave_msr1 (24); \ + vsave_msr1 (25); \ + vsave_msr1 (26); \ + vsave_msr1 (27); \ + vsave_msr1 (28); \ + vsave_msr1 (29); \ + vsave_msr1 (30); \ + vsave_msr1 (31); \ + vsave_msr2 (33, 32*16+12); \ + vsave (32, 32*16) +#else +#define EH_FRAME_VMX +#endif + +.Lcie: + .long .Lcie_end - .Lcie_start +.Lcie_start: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zRS" /* NUL-terminated augmentation string */ + .uleb128 4 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 67 /* Return address register column, ap */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel | DW_EH_PE_sdata4. */ + .byte 0x0c,1,0 /* DW_CFA_def_cfa: r1 ofs 0 */ + .balign 4 +.Lcie_end: + + .long .Lfde0_end - .Lfde0_start +.Lfde0_start: + .long .Lfde0_start - .Lcie /* CIE pointer. */ + .long .Lsig_start - . /* PC start, length */ + .long .Lsig_end - .Lsig_start + .uleb128 0 /* Augmentation */ + EH_FRAME_GEN + EH_FRAME_FP + EH_FRAME_VMX + .balign 4 +.Lfde0_end: + +/* We have a different stack layout for rt_sigreturn. */ +#undef PTREGS +#define PTREGS 64+16+128+20+28 + + .long .Lfde1_end - .Lfde1_start +.Lfde1_start: + .long .Lfde1_start - .Lcie /* CIE pointer. */ + .long .Lsigrt_start - . 
/* PC start, length */ + .long .Lsigrt_end - .Lsigrt_start + .uleb128 0 /* Augmentation */ + EH_FRAME_GEN + EH_FRAME_FP + EH_FRAME_VMX + .balign 4 +.Lfde1_end: diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S new file mode 100644 index 00000000000..e58ee10fa5c --- /dev/null +++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S @@ -0,0 +1,161 @@ +/* + * This is the infamous ld script for the 32 bits vdso + * library + */ +#include <asm/vdso.h> + +#ifdef __LITTLE_ENDIAN__ +OUTPUT_FORMAT("elf32-powerpcle", "elf32-powerpcle", "elf32-powerpcle") +#else +OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", "elf32-powerpc") +#endif +OUTPUT_ARCH(powerpc:common) +ENTRY(_start) + +SECTIONS +{ + . = VDSO32_LBASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + . = ALIGN(16); + .text : { + *(.text .stub .text.* .gnu.linkonce.t.* __ftr_alt_*) + } :text + PROVIDE(__etext = .); + PROVIDE(_etext = .); + PROVIDE(etext = .); + + . = ALIGN(8); + __ftr_fixup : { *(__ftr_fixup) } + + . = ALIGN(8); + __mmu_ftr_fixup : { *(__mmu_ftr_fixup) } + + . = ALIGN(8); + __lwsync_fixup : { *(__lwsync_fixup) } + +#ifdef CONFIG_PPC64 + . = ALIGN(8); + __fw_ftr_fixup : { *(__fw_ftr_fixup) } +#endif + + /* + * Other stuff is appended to the text segment: + */ + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .gcc_except_table : { *(.gcc_except_table) } + .fixup : { *(.fixup) } + + .dynamic : { *(.dynamic) } :text :dynamic + .got : { *(.got) } :text + .plt : { *(.plt) } + + _end = .; + __end = .; + PROVIDE(end = .); + + /* + * Stabs debugging sections are here too. 
+ */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + + /* + * DWARF debug sections. + * Symbols in the DWARF debugging sections are relative to the beginning + * of the section so we begin them at 0. + */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + + /DISCARD/ : { + *(.note.GNU-stack) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} + +/* + * This controls what symbols we export from the DSO. 
+ */ +VERSION +{ + VDSO_VERSION_STRING { + global: + /* + * Has to be there for the kernel to find + */ + __kernel_datapage_offset; + + __kernel_get_syscall_map; + __kernel_gettimeofday; + __kernel_clock_gettime; + __kernel_clock_getres; + __kernel_get_tbfreq; + __kernel_sync_dicache; + __kernel_sync_dicache_p5; + __kernel_sigtramp32; + __kernel_sigtramp_rt32; +#ifdef CONFIG_PPC64 + __kernel_getcpu; +#endif + __kernel_time; + + local: *; + }; +} diff --git a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S new file mode 100644 index 00000000000..6ac107ac402 --- /dev/null +++ b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S @@ -0,0 +1,13 @@ +#include <linux/linkage.h> +#include <asm/page.h> + + __PAGE_ALIGNED_DATA + + .globl vdso32_start, vdso32_end + .balign PAGE_SIZE +vdso32_start: + .incbin "arch/powerpc/kernel/vdso32/vdso32.so.dbg" + .balign PAGE_SIZE +vdso32_end: + + .previous diff --git a/arch/powerpc/kernel/vdso64/.gitignore b/arch/powerpc/kernel/vdso64/.gitignore new file mode 100644 index 00000000000..77a0b423642 --- /dev/null +++ b/arch/powerpc/kernel/vdso64/.gitignore @@ -0,0 +1,2 @@ +vdso64.lds +vdso64.so.dbg diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile new file mode 100644 index 00000000000..effca9404b1 --- /dev/null +++ b/arch/powerpc/kernel/vdso64/Makefile @@ -0,0 +1,51 @@ +# List of files in the vdso, has to be asm only for now + +obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o + +# Build rules + +targets := $(obj-vdso64) vdso64.so vdso64.so.dbg +obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) + +GCOV_PROFILE := n + +ccflags-y := -shared -fno-common -fno-builtin +ccflags-y += -nostdlib -Wl,-soname=linux-vdso64.so.1 \ + $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) +asflags-y := -D__VDSO64__ -s + +obj-y += vdso64_wrapper.o +extra-y += vdso64.lds +CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) + +# Force dependency (incbin is bad) 
+$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so + +# link rule for the .so file, .lds has to be first +$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) + $(call if_changed,vdso64ld) + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +# assembly rules for the .S files +$(obj-vdso64): %.o: %.S + $(call if_changed_dep,vdso64as) + +# actual build commands +quiet_cmd_vdso64ld = VDSO64L $@ + cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $^ -o $@ +quiet_cmd_vdso64as = VDSO64A $@ + cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $< + +# install commands for the unstripped file +quiet_cmd_vdso_install = INSTALL $@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ + +vdso64.so: $(obj)/vdso64.so.dbg + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +vdso_install: vdso64.so diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S new file mode 100644 index 00000000000..69c5af2b3c9 --- /dev/null +++ b/arch/powerpc/kernel/vdso64/cacheflush.S @@ -0,0 +1,84 @@ +/* + * vDSO provided cache flush routines + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), + * IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/vdso.h> +#include <asm/asm-offsets.h> + + .text + +/* + * Default "generic" version of __kernel_sync_dicache. 
+ * + * void __kernel_sync_dicache(unsigned long start, unsigned long end) + * + * Flushes the data cache & invalidate the instruction cache for the + * provided range [start, end[ + */ +V_FUNCTION_BEGIN(__kernel_sync_dicache) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + mr r11,r3 + bl V_LOCAL_FUNC(__get_datapage) + mtlr r12 + mr r10,r3 + + lwz r7,CFG_DCACHE_BLOCKSZ(r10) + addi r5,r7,-1 + andc r6,r11,r5 /* round low to line bdy */ + subf r8,r6,r4 /* compute length */ + add r8,r8,r5 /* ensure we get enough */ + lwz r9,CFG_DCACHE_LOGBLOCKSZ(r10) + srw. r8,r8,r9 /* compute line count */ + crclr cr0*4+so + beqlr /* nothing to do? */ + mtctr r8 +1: dcbst 0,r6 + add r6,r6,r7 + bdnz 1b + sync + +/* Now invalidate the instruction cache */ + + lwz r7,CFG_ICACHE_BLOCKSZ(r10) + addi r5,r7,-1 + andc r6,r11,r5 /* round low to line bdy */ + subf r8,r6,r4 /* compute length */ + add r8,r8,r5 + lwz r9,CFG_ICACHE_LOGBLOCKSZ(r10) + srw. r8,r8,r9 /* compute line count */ + crclr cr0*4+so + beqlr /* nothing to do? */ + mtctr r8 +2: icbi 0,r6 + add r6,r6,r7 + bdnz 2b + isync + li r3,0 + blr + .cfi_endproc +V_FUNCTION_END(__kernel_sync_dicache) + + +/* + * POWER5 version of __kernel_sync_dicache + */ +V_FUNCTION_BEGIN(__kernel_sync_dicache_p5) + .cfi_startproc + crclr cr0*4+so + sync + isync + li r3,0 + blr + .cfi_endproc +V_FUNCTION_END(__kernel_sync_dicache_p5) diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S new file mode 100644 index 00000000000..79796de1173 --- /dev/null +++ b/arch/powerpc/kernel/vdso64/datapage.S @@ -0,0 +1,85 @@ +/* + * Access to the shared data page by the vDSO & syscall map + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> +#include <asm/vdso.h> + + .text +V_FUNCTION_BEGIN(__get_datapage) + .cfi_startproc + /* We don't want that exposed or overridable as we want other objects + * to be able to bl directly to here + */ + .protected __get_datapage + .hidden __get_datapage + + mflr r0 + .cfi_register lr,r0 + + bcl 20,31,1f + .global __kernel_datapage_offset; +__kernel_datapage_offset: + .long 0 +1: + mflr r3 + mtlr r0 + lwz r0,0(r3) + add r3,r0,r3 + blr + .cfi_endproc +V_FUNCTION_END(__get_datapage) + +/* + * void *__kernel_get_syscall_map(unsigned int *syscall_count) ; + * + * returns a pointer to the syscall map. the map is agnostic to the + * size of "long", unlike kernel bitops, it stores bits from top to + * bottom so that memory actually contains a linear bitmap + * check for syscall N by testing bit (0x80000000 >> (N & 0x1f)) of + * 32 bits int at N >> 5. 
+ */ +V_FUNCTION_BEGIN(__kernel_get_syscall_map) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + mr r4,r3 /* r4 = syscall_count; r3 is reused for the datapage pointer and the return value */ + bl V_LOCAL_FUNC(__get_datapage) + mtlr r12 + addi r3,r3,CFG_SYSCALL_MAP64 + cmpldi cr0,r4,0 /* doubleword NULL check: plain cmpli assembles as a 32-bit compare and would treat a 64-bit pointer with a zero low word as NULL */ + crclr cr0*4+so + beqlr + li r0,__NR_syscalls + stw r0,0(r4) + blr + .cfi_endproc +V_FUNCTION_END(__kernel_get_syscall_map) + + +/* + * unsigned long __kernel_get_tbfreq(void); + * + * returns the timebase frequency in HZ + */ +V_FUNCTION_BEGIN(__kernel_get_tbfreq) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + bl V_LOCAL_FUNC(__get_datapage) + ld r3,CFG_TB_TICKS_PER_SEC(r3) + mtlr r12 + crclr cr0*4+so + blr + .cfi_endproc +V_FUNCTION_END(__kernel_get_tbfreq) diff --git a/arch/powerpc/kernel/vdso64/getcpu.S b/arch/powerpc/kernel/vdso64/getcpu.S new file mode 100644 index 00000000000..23eb9a9441b --- /dev/null +++ b/arch/powerpc/kernel/vdso64/getcpu.S @@ -0,0 +1,45 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/ppc_asm.h> +#include <asm/vdso.h> + + .text +/* + * Exact prototype of getcpu + * + * int __kernel_getcpu(unsigned *cpu, unsigned *node); + * + */ +V_FUNCTION_BEGIN(__kernel_getcpu) + .cfi_startproc + mfspr r5,SPRN_SPRG_VDSO_READ + cmpdi cr0,r3,0 + cmpdi cr1,r4,0 + clrlwi r6,r5,16 + rlwinm r7,r5,16,31-15,31-0 + beq cr0,1f + stw r6,0(r3) +1: beq cr1,2f + stw r7,0(r4) +2: crclr cr0*4+so + li r3,0 /* always success */ + blr + .cfi_endproc +V_FUNCTION_END(__kernel_getcpu) diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S new file mode 100644 index 00000000000..a76b4af37ef --- /dev/null +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S @@ -0,0 +1,244 @@ +/* + * Userland implementation of gettimeofday() for 64 bits processes in a + * ppc64 kernel for use in the vDSO + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), + * IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/vdso.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> + + .text +/* + * Exact prototype of gettimeofday + * + * int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); + * + */ +V_FUNCTION_BEGIN(__kernel_gettimeofday) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + + mr r11,r3 /* r11 holds tv */ + mr r10,r4 /* r10 holds tz */ + bl V_LOCAL_FUNC(__get_datapage) /* get data page */ + cmpldi r11,0 /* check if tv is NULL */ + beq 2f + lis r7,1000000@ha /* load up USEC_PER_SEC */ + addi r7,r7,1000000@l + bl V_LOCAL_FUNC(__do_get_tspec) /* get sec/us from tb & kernel */ + std r4,TVAL64_TV_SEC(r11) /* store sec in tv */ + std r5,TVAL64_TV_USEC(r11) /* store usec in tv */ +2: cmpldi r10,0 /* check if tz is NULL */ + beq 1f + lwz r4,CFG_TZ_MINUTEWEST(r3)/* fill tz */ + lwz r5,CFG_TZ_DSTTIME(r3) + stw r4,TZONE_TZ_MINWEST(r10) + stw r5,TZONE_TZ_DSTTIME(r10) +1: mtlr r12 + crclr cr0*4+so + li r3,0 /* always success */ + blr + .cfi_endproc +V_FUNCTION_END(__kernel_gettimeofday) + + +/* + * Exact prototype of clock_gettime() + * + * int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); + * + */ +V_FUNCTION_BEGIN(__kernel_clock_gettime) + .cfi_startproc + /* Check for supported clock IDs */ + cmpwi cr0,r3,CLOCK_REALTIME + cmpwi cr1,r3,CLOCK_MONOTONIC + cror cr0*4+eq,cr0*4+eq,cr1*4+eq + bne cr0,99f + + mflr r12 /* r12 saves lr */ + .cfi_register lr,r12 + mr r11,r4 /* r11 saves tp */ + bl V_LOCAL_FUNC(__get_datapage) /* get data page */ + lis r7,NSEC_PER_SEC@h /* want nanoseconds */ + ori r7,r7,NSEC_PER_SEC@l +50: bl V_LOCAL_FUNC(__do_get_tspec) /* get time from tb & kernel */ + bne cr1,80f /* if not monotonic, all done */ + + /* + * CLOCK_MONOTONIC + */ + + /* now we must fixup using wall to monotonic. We need to snapshot + * that value and do the counter trick again. Fortunately, we still + * have the counter value in r8 that was returned by __do_get_tspec. 
+ * At this point, r4,r5 contain our sec/nsec values. + */ + + lwa r6,WTOM_CLOCK_SEC(r3) + lwa r9,WTOM_CLOCK_NSEC(r3) + + /* We now have our result in r6,r9. We create a fake dependency + * on that result and re-check the counter + */ + or r0,r6,r9 + xor r0,r0,r0 + add r3,r3,r0 + ld r0,CFG_TB_UPDATE_COUNT(r3) + cmpld cr0,r0,r8 /* check if updated */ + bne- 50b + + /* Add wall->monotonic offset and check for overflow or underflow. + */ + add r4,r4,r6 + add r5,r5,r9 + cmpd cr0,r5,r7 + cmpdi cr1,r5,0 + blt 1f + subf r5,r7,r5 + addi r4,r4,1 +1: bge cr1,80f + addi r4,r4,-1 + add r5,r5,r7 + +80: std r4,TSPC64_TV_SEC(r11) + std r5,TSPC64_TV_NSEC(r11) + + mtlr r12 + crclr cr0*4+so + li r3,0 + blr + + /* + * syscall fallback + */ +99: + li r0,__NR_clock_gettime + sc + blr + .cfi_endproc +V_FUNCTION_END(__kernel_clock_gettime) + + +/* + * Exact prototype of clock_getres() + * + * int __kernel_clock_getres(clockid_t clock_id, struct timespec *res); + * + */ +V_FUNCTION_BEGIN(__kernel_clock_getres) + .cfi_startproc + /* Check for supported clock IDs */ + cmpwi cr0,r3,CLOCK_REALTIME + cmpwi cr1,r3,CLOCK_MONOTONIC + cror cr0*4+eq,cr0*4+eq,cr1*4+eq + bne cr0,99f + + li r3,0 + cmpldi cr0,r4,0 /* doubleword NULL check of res: plain cmpli assembles as a 32-bit compare and would skip the store for a pointer with a zero low word */ + crclr cr0*4+so + beqlr + lis r5,CLOCK_REALTIME_RES@h + ori r5,r5,CLOCK_REALTIME_RES@l + std r3,TSPC64_TV_SEC(r4) + std r5,TSPC64_TV_NSEC(r4) + blr + + /* + * syscall fallback + */ +99: + li r0,__NR_clock_getres + sc + blr + .cfi_endproc +V_FUNCTION_END(__kernel_clock_getres) + +/* + * Exact prototype of time() + * + * time_t time(time_t *t); + * + */ +V_FUNCTION_BEGIN(__kernel_time) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + + mr r11,r3 /* r11 holds t */ + bl V_LOCAL_FUNC(__get_datapage) + + ld r4,STAMP_XTIME+TSPC64_TV_SEC(r3) + + cmpldi r11,0 /* check if t is NULL */ + beq 2f + std r4,0(r11) /* store result at *t */ +2: mtlr r12 + crclr cr0*4+so + mr r3,r4 + blr + .cfi_endproc +V_FUNCTION_END(__kernel_time) + + +/* + * This is the core of clock_gettime() and gettimeofday(), + * it 
returns the current time in r4 (seconds) and r5. + * On entry, r7 gives the resolution of r5, either USEC_PER_SEC + * or NSEC_PER_SEC, giving r5 in microseconds or nanoseconds. + * It expects the datapage ptr in r3 and doesn't clobber it. + * It clobbers r0, r6 and r9. + * On return, r8 contains the counter value that can be reused. + * This clobbers cr0 but not any other cr field. + */ +V_FUNCTION_BEGIN(__do_get_tspec) + .cfi_startproc + /* check for update count & load values */ +1: ld r8,CFG_TB_UPDATE_COUNT(r3) + andi. r0,r8,1 /* pending update ? loop */ + bne- 1b + xor r0,r8,r8 /* create dependency */ + add r3,r3,r0 + + /* Get TB & offset it. We use the MFTB macro which will generate + * workaround code for Cell. + */ + MFTB(r6) + ld r9,CFG_TB_ORIG_STAMP(r3) + subf r6,r9,r6 + + /* Scale result */ + ld r5,CFG_TB_TO_XS(r3) + sldi r6,r6,12 /* compute time since stamp_xtime */ + mulhdu r6,r6,r5 /* in units of 2^-32 seconds */ + + /* Add stamp since epoch */ + ld r4,STAMP_XTIME+TSPC64_TV_SEC(r3) + lwz r5,STAMP_SEC_FRAC(r3) + or r0,r4,r5 + or r0,r0,r6 + xor r0,r0,r0 + add r3,r3,r0 + ld r0,CFG_TB_UPDATE_COUNT(r3) + cmpld r0,r8 /* check if updated */ + bne- 1b /* reload if so */ + + /* convert to seconds & nanoseconds and add to stamp */ + add r6,r6,r5 /* add on fractional seconds of xtime */ + mulhwu r5,r6,r7 /* compute micro or nanoseconds and */ + srdi r6,r6,32 /* seconds since stamp_xtime */ + clrldi r5,r5,32 + add r4,r4,r6 + blr + .cfi_endproc +V_FUNCTION_END(__do_get_tspec) diff --git a/arch/powerpc/kernel/vdso64/note.S b/arch/powerpc/kernel/vdso64/note.S new file mode 100644 index 00000000000..dc2a509f7e8 --- /dev/null +++ b/arch/powerpc/kernel/vdso64/note.S @@ -0,0 +1 @@ +#include "../vdso32/note.S" diff --git a/arch/powerpc/kernel/vdso64/sigtramp.S b/arch/powerpc/kernel/vdso64/sigtramp.S new file mode 100644 index 00000000000..542c6f422e4 --- /dev/null +++ b/arch/powerpc/kernel/vdso64/sigtramp.S @@ -0,0 +1,311 @@ +/* + * Signal trampoline for 64 bits processes 
in a ppc64 kernel for + * use in the vDSO + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp. + * Copyright (C) 2004 Alan Modra (amodra@au.ibm.com)), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/unistd.h> +#include <asm/vdso.h> +#include <asm/ptrace.h> /* XXX for __SIGNAL_FRAMESIZE */ + + .text + +/* The nop here is a hack. The dwarf2 unwind routines subtract 1 from + the return address to get an address in the middle of the presumed + call instruction. Since we don't have a call here, we artificially + extend the range covered by the unwind info by padding before the + real start. */ + nop + .balign 8 +V_FUNCTION_BEGIN(__kernel_sigtramp_rt64) +.Lsigrt_start = . - 4 + addi r1, r1, __SIGNAL_FRAMESIZE + li r0,__NR_rt_sigreturn + sc +.Lsigrt_end: +V_FUNCTION_END(__kernel_sigtramp_rt64) +/* The ".balign 8" above and the following zeros mimic the old stack + trampoline layout. The last magic value is the ucontext pointer, + chosen in such a way that older libgcc unwind code returns a zero + for a sigcontext pointer. */ + .long 0,0,0 + .quad 0,-21*8 + +/* Register r1 can be found at offset 8 of a pt_regs structure. + A pointer to the pt_regs is stored in memory at the old sp plus PTREGS. */ +#define cfa_save \ + .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x23; .uleb128 RSIZE; /* DW_OP_plus_uconst */ \ + .byte 0x06; /* DW_OP_deref */ \ +9: + +/* Register REGNO can be found at offset OFS of a pt_regs structure. + A pointer to the pt_regs is stored in memory at the old sp plus PTREGS. 
*/ +#define rsave(regno, ofs) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .ifne ofs; \ + .byte 0x23; .uleb128 ofs; /* DW_OP_plus_uconst */ \ + .endif; \ +9: + +/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16 + of the VMX reg struct. A pointer to the VMX reg struct is at VREGS in + the pt_regs struct. This macro is for REGNO == 0, and contains + 'subroutines' that the other macros jump to. */ +#define vsave_msr0(regno) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x30 + regno; /* DW_OP_lit0 */ \ +2: \ + .byte 0x40; /* DW_OP_lit16 */ \ + .byte 0x1e; /* DW_OP_mul */ \ +3: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x12; /* DW_OP_dup */ \ + .byte 0x23; /* DW_OP_plus_uconst */ \ + .uleb128 33*RSIZE; /* msr offset */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x0c; .long 1 << 25; /* DW_OP_const4u */ \ + .byte 0x1a; /* DW_OP_and */ \ + .byte 0x12; /* DW_OP_dup, ret 0 if bra taken */ \ + .byte 0x30; /* DW_OP_lit0 */ \ + .byte 0x29; /* DW_OP_eq */ \ + .byte 0x28; .short 0x7fff; /* DW_OP_bra to end */ \ + .byte 0x13; /* DW_OP_drop, pop the 0 */ \ + .byte 0x23; .uleb128 VREGS; /* DW_OP_plus_uconst */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x22; /* DW_OP_plus */ \ + .byte 0x2f; .short 0x7fff; /* DW_OP_skip to end */ \ +9: + +/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16 + of the VMX reg struct. REGNO is 1 thru 31. 
*/ +#define vsave_msr1(regno) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x30 + regno; /* DW_OP_lit n */ \ + .byte 0x2f; .short 2b - 9f; /* DW_OP_skip */ \ +9: + +/* If msr bit 1<<25 is set, then VMX register REGNO is at offset OFS of + the VMX save block. */ +#define vsave_msr2(regno, ofs) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x0a; .short ofs; /* DW_OP_const2u */ \ + .byte 0x2f; .short 3b - 9f; /* DW_OP_skip */ \ +9: + +/* VMX register REGNO is at offset OFS of the VMX save area. */ +#define vsave(regno, ofs) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x23; .uleb128 VREGS; /* DW_OP_plus_uconst */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x23; .uleb128 ofs; /* DW_OP_plus_uconst */ \ +9: + +/* This is where the pt_regs pointer can be found on the stack. */ +#define PTREGS 128+168+56 + +/* Size of regs. */ +#define RSIZE 8 + +/* Size of CR reg in DWARF unwind info. */ +#define CRSIZE 4 + +/* Offset of CR reg within a full word. */ +#ifdef __LITTLE_ENDIAN__ +#define CROFF 0 +#else +#define CROFF (RSIZE - CRSIZE) +#endif + +/* This is the offset of the VMX reg pointer. */ +#define VREGS 48*RSIZE+33*8 + +/* Describe where general purpose regs are saved. 
*/ +#define EH_FRAME_GEN \ + cfa_save; \ + rsave ( 0, 0*RSIZE); \ + rsave ( 2, 2*RSIZE); \ + rsave ( 3, 3*RSIZE); \ + rsave ( 4, 4*RSIZE); \ + rsave ( 5, 5*RSIZE); \ + rsave ( 6, 6*RSIZE); \ + rsave ( 7, 7*RSIZE); \ + rsave ( 8, 8*RSIZE); \ + rsave ( 9, 9*RSIZE); \ + rsave (10, 10*RSIZE); \ + rsave (11, 11*RSIZE); \ + rsave (12, 12*RSIZE); \ + rsave (13, 13*RSIZE); \ + rsave (14, 14*RSIZE); \ + rsave (15, 15*RSIZE); \ + rsave (16, 16*RSIZE); \ + rsave (17, 17*RSIZE); \ + rsave (18, 18*RSIZE); \ + rsave (19, 19*RSIZE); \ + rsave (20, 20*RSIZE); \ + rsave (21, 21*RSIZE); \ + rsave (22, 22*RSIZE); \ + rsave (23, 23*RSIZE); \ + rsave (24, 24*RSIZE); \ + rsave (25, 25*RSIZE); \ + rsave (26, 26*RSIZE); \ + rsave (27, 27*RSIZE); \ + rsave (28, 28*RSIZE); \ + rsave (29, 29*RSIZE); \ + rsave (30, 30*RSIZE); \ + rsave (31, 31*RSIZE); \ + rsave (67, 32*RSIZE); /* ap, used as temp for nip */ \ + rsave (65, 36*RSIZE); /* lr */ \ + rsave (68, 38*RSIZE + CROFF); /* cr fields */ \ + rsave (69, 38*RSIZE + CROFF); \ + rsave (70, 38*RSIZE + CROFF); \ + rsave (71, 38*RSIZE + CROFF); \ + rsave (72, 38*RSIZE + CROFF); \ + rsave (73, 38*RSIZE + CROFF); \ + rsave (74, 38*RSIZE + CROFF); \ + rsave (75, 38*RSIZE + CROFF) + +/* Describe where the FP regs are saved. 
*/ +#define EH_FRAME_FP \ + rsave (32, 48*RSIZE + 0*8); \ + rsave (33, 48*RSIZE + 1*8); \ + rsave (34, 48*RSIZE + 2*8); \ + rsave (35, 48*RSIZE + 3*8); \ + rsave (36, 48*RSIZE + 4*8); \ + rsave (37, 48*RSIZE + 5*8); \ + rsave (38, 48*RSIZE + 6*8); \ + rsave (39, 48*RSIZE + 7*8); \ + rsave (40, 48*RSIZE + 8*8); \ + rsave (41, 48*RSIZE + 9*8); \ + rsave (42, 48*RSIZE + 10*8); \ + rsave (43, 48*RSIZE + 11*8); \ + rsave (44, 48*RSIZE + 12*8); \ + rsave (45, 48*RSIZE + 13*8); \ + rsave (46, 48*RSIZE + 14*8); \ + rsave (47, 48*RSIZE + 15*8); \ + rsave (48, 48*RSIZE + 16*8); \ + rsave (49, 48*RSIZE + 17*8); \ + rsave (50, 48*RSIZE + 18*8); \ + rsave (51, 48*RSIZE + 19*8); \ + rsave (52, 48*RSIZE + 20*8); \ + rsave (53, 48*RSIZE + 21*8); \ + rsave (54, 48*RSIZE + 22*8); \ + rsave (55, 48*RSIZE + 23*8); \ + rsave (56, 48*RSIZE + 24*8); \ + rsave (57, 48*RSIZE + 25*8); \ + rsave (58, 48*RSIZE + 26*8); \ + rsave (59, 48*RSIZE + 27*8); \ + rsave (60, 48*RSIZE + 28*8); \ + rsave (61, 48*RSIZE + 29*8); \ + rsave (62, 48*RSIZE + 30*8); \ + rsave (63, 48*RSIZE + 31*8) + +/* Describe where the VMX regs are saved. 
*/ +#ifdef CONFIG_ALTIVEC +#define EH_FRAME_VMX \ + vsave_msr0 ( 0); \ + vsave_msr1 ( 1); \ + vsave_msr1 ( 2); \ + vsave_msr1 ( 3); \ + vsave_msr1 ( 4); \ + vsave_msr1 ( 5); \ + vsave_msr1 ( 6); \ + vsave_msr1 ( 7); \ + vsave_msr1 ( 8); \ + vsave_msr1 ( 9); \ + vsave_msr1 (10); \ + vsave_msr1 (11); \ + vsave_msr1 (12); \ + vsave_msr1 (13); \ + vsave_msr1 (14); \ + vsave_msr1 (15); \ + vsave_msr1 (16); \ + vsave_msr1 (17); \ + vsave_msr1 (18); \ + vsave_msr1 (19); \ + vsave_msr1 (20); \ + vsave_msr1 (21); \ + vsave_msr1 (22); \ + vsave_msr1 (23); \ + vsave_msr1 (24); \ + vsave_msr1 (25); \ + vsave_msr1 (26); \ + vsave_msr1 (27); \ + vsave_msr1 (28); \ + vsave_msr1 (29); \ + vsave_msr1 (30); \ + vsave_msr1 (31); \ + vsave_msr2 (33, 32*16+12); \ + vsave (32, 33*16) +#else +#define EH_FRAME_VMX +#endif + + .section .eh_frame,"a",@progbits +.Lcie: + .long .Lcie_end - .Lcie_start +.Lcie_start: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zRS" /* NUL-terminated augmentation string */ + .uleb128 4 /* Code alignment factor */ + .sleb128 -8 /* Data alignment factor */ + .byte 67 /* Return address register column, ap */ + .uleb128 1 /* Augmentation value length */ + .byte 0x14 /* DW_EH_PE_pcrel | DW_EH_PE_udata8. */ + .byte 0x0c,1,0 /* DW_CFA_def_cfa: r1 ofs 0 */ + .balign 8 +.Lcie_end: + + .long .Lfde0_end - .Lfde0_start +.Lfde0_start: + .long .Lfde0_start - .Lcie /* CIE pointer. */ + .quad .Lsigrt_start - . /* PC start, length */ + .quad .Lsigrt_end - .Lsigrt_start + .uleb128 0 /* Augmentation */ + EH_FRAME_GEN + EH_FRAME_FP + EH_FRAME_VMX +# Do we really need to describe the frame at this point? ie. will +# we ever have some call chain that returns somewhere past the addi? +# I don't think so, since gcc doesn't support async signals. 
+# .byte 0x41 /* DW_CFA_advance_loc 1*4 */ +#undef PTREGS +#define PTREGS 168+56 +# EH_FRAME_GEN +# EH_FRAME_FP +# EH_FRAME_VMX + .balign 8 +.Lfde0_end: diff --git a/arch/powerpc/kernel/vdso64/vdso64.lds.S b/arch/powerpc/kernel/vdso64/vdso64.lds.S new file mode 100644 index 00000000000..64fb183a47c --- /dev/null +++ b/arch/powerpc/kernel/vdso64/vdso64.lds.S @@ -0,0 +1,158 @@ +/* + * This is the infamous ld script for the 64 bits vdso + * library + */ +#include <asm/vdso.h> + +#ifdef __LITTLE_ENDIAN__ +OUTPUT_FORMAT("elf64-powerpcle", "elf64-powerpcle", "elf64-powerpcle") +#else +OUTPUT_FORMAT("elf64-powerpc", "elf64-powerpc", "elf64-powerpc") +#endif +OUTPUT_ARCH(powerpc:common64) +ENTRY(_start) + +SECTIONS +{ + . = VDSO64_LBASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + . = ALIGN(16); + .text : { + *(.text .stub .text.* .gnu.linkonce.t.* __ftr_alt_*) + *(.sfpr .glink) + } :text + PROVIDE(__etext = .); + PROVIDE(_etext = .); + PROVIDE(etext = .); + + . = ALIGN(8); + __ftr_fixup : { *(__ftr_fixup) } + + . = ALIGN(8); + __mmu_ftr_fixup : { *(__mmu_ftr_fixup) } + + . = ALIGN(8); + __lwsync_fixup : { *(__lwsync_fixup) } + + . = ALIGN(8); + __fw_ftr_fixup : { *(__fw_ftr_fixup) } + + /* + * Other stuff is appended to the text segment: + */ + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + + .dynamic : { *(.dynamic) } :text :dynamic + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .gcc_except_table : { *(.gcc_except_table) } + .rela.dyn ALIGN(8) : { *(.rela.dyn) } + + .opd ALIGN(8) : { KEEP (*(.opd)) } + .got ALIGN(8) : { *(.got .toc) } + + _end = .; + PROVIDE(end = .); + + /* + * Stabs debugging sections are here too. 
+ */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + + /* + * DWARF debug sections. + * Symbols in the DWARF debugging sections are relative to the beginning + * of the section so we begin them at 0. + */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + + /DISCARD/ : { + *(.note.GNU-stack) + *(.branch_lt) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} + +/* + * This controls what symbols we export from the DSO. 
+ */ +VERSION +{ + VDSO_VERSION_STRING { + global: + /* + * Has to be there for the kernel to find + */ + __kernel_datapage_offset; + + __kernel_get_syscall_map; + __kernel_gettimeofday; + __kernel_clock_gettime; + __kernel_clock_getres; + __kernel_get_tbfreq; + __kernel_sync_dicache; + __kernel_sync_dicache_p5; + __kernel_sigtramp_rt64; + __kernel_getcpu; + __kernel_time; + + local: *; + }; +} diff --git a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S new file mode 100644 index 00000000000..df60fca6a13 --- /dev/null +++ b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S @@ -0,0 +1,13 @@ +#include <linux/linkage.h> +#include <asm/page.h> + + __PAGE_ALIGNED_DATA + + .globl vdso64_start, vdso64_end + .balign PAGE_SIZE +vdso64_start: + .incbin "arch/powerpc/kernel/vdso64/vdso64.so.dbg" + .balign PAGE_SIZE +vdso64_end: + + .previous diff --git a/arch/powerpc/kernel/vecemu.c b/arch/powerpc/kernel/vecemu.c index 604d0947cb2..c4bfadb2606 100644 --- a/arch/powerpc/kernel/vecemu.c +++ b/arch/powerpc/kernel/vecemu.c @@ -271,7 +271,7 @@ int emulate_altivec(struct pt_regs *regs) vb = (instr >> 11) & 0x1f; vc = (instr >> 6) & 0x1f; - vrs = current->thread.vr; + vrs = current->thread.vr_state.vr; switch (instr & 0x3f) { case 10: switch (vc) { @@ -320,12 +320,12 @@ int emulate_altivec(struct pt_regs *regs) case 14: /* vctuxs */ for (i = 0; i < 4; ++i) vrs[vd].u[i] = ctuxs(vrs[vb].u[i], va, - &current->thread.vscr.u[3]); + &current->thread.vr_state.vscr.u[3]); break; case 15: /* vctsxs */ for (i = 0; i < 4; ++i) vrs[vd].u[i] = ctsxs(vrs[vb].u[i], va, - &current->thread.vscr.u[3]); + &current->thread.vr_state.vscr.u[3]); break; default: return -EINVAL; diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index 66b3d03c5fa..74f8050518d 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -1,6 +1,295 @@ -#include <linux/config.h> +#include <asm/processor.h> #include <asm/ppc_asm.h> #include <asm/reg.h> +#include
<asm/asm-offsets.h> +#include <asm/cputable.h> +#include <asm/thread_info.h> +#include <asm/page.h> +#include <asm/ptrace.h> + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* void do_load_up_transact_altivec(struct thread_struct *thread) + * + * This is similar to load_up_altivec but for the transactional version of the + * vector regs. It doesn't mess with the task MSR or valid flags. + * Furthermore, VEC laziness is not supported with TM currently. + */ +_GLOBAL(do_load_up_transact_altivec) + mfmsr r6 + oris r5,r6,MSR_VEC@h + MTMSRD(r5) + isync + + li r4,1 + stw r4,THREAD_USED_VR(r3) + + li r10,THREAD_TRANSACT_VRSTATE+VRSTATE_VSCR + lvx vr0,r10,r3 + mtvscr vr0 + addi r10,r3,THREAD_TRANSACT_VRSTATE + REST_32VRS(0,r4,r10) + + /* Disable VEC again. */ + MTMSRD(r6) + isync + + blr +#endif + +/* + * Enable use of VMX/Altivec for the caller. + */ +_GLOBAL(vec_enable) + mfmsr r3 + oris r3,r3,MSR_VEC@h + MTMSRD(r3) + isync + blr + +/* + * Load state from memory into VMX registers including VSCR. + * Assumes the caller has enabled VMX in the MSR. + */ +_GLOBAL(load_vr_state) + li r4,VRSTATE_VSCR + lvx vr0,r4,r3 + mtvscr vr0 + REST_32VRS(0,r4,r3) + blr + +/* + * Store VMX state into memory, including VSCR. + * Assumes the caller has enabled VMX in the MSR. + */ +_GLOBAL(store_vr_state) + SAVE_32VRS(0, r4, r3) + mfvscr vr0 + li r4, VRSTATE_VSCR + stvx vr0, r4, r3 + blr + +/* + * Disable VMX for the task which had it previously, + * and save its vector registers in its thread_struct. + * Enables the VMX for use in the kernel on return. + * On SMP we know the VMX is free, since we give it up every + * switch (ie, no lazy save of the vector registers). + * + * Note that on 32-bit this can only use registers that will be + * restored by fast_exception_return, i.e. r3 - r6, r10 and r11. 
+ */ +_GLOBAL(load_up_altivec) + mfmsr r5 /* grab the current MSR */ + oris r5,r5,MSR_VEC@h + MTMSRD(r5) /* enable use of AltiVec now */ + isync + +/* + * For SMP, we don't do lazy VMX switching because it just gets too + * horrendously complex, especially when a task switches from one CPU + * to another. Instead we call giveup_altivec in switch_to. + * VRSAVE isn't dealt with here, that is done in the normal context + * switch code. Note that we could rely on vrsave value to eventually + * avoid saving all of the VREGs here... + */ +#ifndef CONFIG_SMP + LOAD_REG_ADDRBASE(r3, last_task_used_altivec) + toreal(r3) + PPC_LL r4,ADDROFF(last_task_used_altivec)(r3) + PPC_LCMPI 0,r4,0 + beq 1f + + /* Save VMX state to last_task_used_altivec's THREAD struct */ + toreal(r4) + addi r4,r4,THREAD + addi r6,r4,THREAD_VRSTATE + SAVE_32VRS(0,r5,r6) + mfvscr vr0 + li r10,VRSTATE_VSCR + stvx vr0,r10,r6 + /* Disable VMX for last_task_used_altivec */ + PPC_LL r5,PT_REGS(r4) + toreal(r5) + PPC_LL r4,_MSR-STACK_FRAME_OVERHEAD(r5) + lis r10,MSR_VEC@h + andc r4,r4,r10 + PPC_STL r4,_MSR-STACK_FRAME_OVERHEAD(r5) +1: +#endif /* CONFIG_SMP */ + + /* Hack: if we get an altivec unavailable trap with VRSAVE + * set to all zeros, we assume this is a broken application + * that fails to set it properly, and thus we switch it to + * all 1's + */ + mfspr r4,SPRN_VRSAVE + cmpwi 0,r4,0 + bne+ 1f + li r4,-1 + mtspr SPRN_VRSAVE,r4 +1: + /* enable use of VMX after return */ +#ifdef CONFIG_PPC32 + mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ + oris r9,r9,MSR_VEC@h +#else + ld r4,PACACURRENT(r13) + addi r5,r4,THREAD /* Get THREAD */ + oris r12,r12,MSR_VEC@h + std r12,_MSR(r1) +#endif + addi r6,r5,THREAD_VRSTATE + li r4,1 + li r10,VRSTATE_VSCR + stw r4,THREAD_USED_VR(r5) + lvx vr0,r10,r6 + mtvscr vr0 + REST_32VRS(0,r4,r6) +#ifndef CONFIG_SMP + /* Update last_task_used_altivec to 'current' */ + subi r4,r5,THREAD /* Back to 'current' */ + fromreal(r4) + PPC_STL
r4,ADDROFF(last_task_used_altivec)(r3) +#endif /* CONFIG_SMP */ + /* restore registers and return */ + blr + +_GLOBAL(giveup_altivec_notask) + mfmsr r3 + andis. r4,r3,MSR_VEC@h + bnelr /* Already enabled? */ + oris r3,r3,MSR_VEC@h + SYNC + MTMSRD(r3) /* enable use of VMX now */ + isync + blr + +/* + * giveup_altivec(tsk) + * Disable VMX for the task given as the argument, + * and save the vector registers in its thread_struct. + * Enables the VMX for use in the kernel on return. + */ +_GLOBAL(giveup_altivec) + mfmsr r5 + oris r5,r5,MSR_VEC@h + SYNC + MTMSRD(r5) /* enable use of VMX now */ + isync + PPC_LCMPI 0,r3,0 + beqlr /* if no previous owner, done */ + addi r3,r3,THREAD /* want THREAD of task */ + PPC_LL r7,THREAD_VRSAVEAREA(r3) + PPC_LL r5,PT_REGS(r3) + PPC_LCMPI 0,r7,0 + bne 2f + addi r7,r3,THREAD_VRSTATE +2: PPC_LCMPI 0,r5,0 + SAVE_32VRS(0,r4,r7) + mfvscr vr0 + li r4,VRSTATE_VSCR + stvx vr0,r4,r7 + beq 1f + PPC_LL r4,_MSR-STACK_FRAME_OVERHEAD(r5) +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + lis r3,(MSR_VEC|MSR_VSX)@h +FTR_SECTION_ELSE + lis r3,MSR_VEC@h +ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX) +#else + lis r3,MSR_VEC@h +#endif + andc r4,r4,r3 /* disable VMX (and VSX) for previous task */ + PPC_STL r4,_MSR-STACK_FRAME_OVERHEAD(r5) +1: +#ifndef CONFIG_SMP + li r5,0 + LOAD_REG_ADDRBASE(r4,last_task_used_altivec) + PPC_STL r5,ADDROFF(last_task_used_altivec)(r4) +#endif /* CONFIG_SMP */ + blr + +#ifdef CONFIG_VSX + +#ifdef CONFIG_PPC32 +#error This asm code isn't ready for 32-bit kernels +#endif + +/* + * load_up_vsx(unused, unused, tsk) + * Disable VSX for the task which had it previously, + * and save its vector registers in its thread_struct. + * Reuse the fp and vsx saves, but first check to see if they have + * been saved already. + */ +_GLOBAL(load_up_vsx) +/* Load FP and VSX registers if they haven't been done yet */ + andi. r5,r12,MSR_FP + beql+ load_up_fpu /* skip if already loaded */ + andis.
r5,r12,MSR_VEC@h + beql+ load_up_altivec /* skip if already loaded */ + +#ifndef CONFIG_SMP + ld r3,last_task_used_vsx@got(r2) + ld r4,0(r3) + cmpdi 0,r4,0 + beq 1f + /* Disable VSX for last_task_used_vsx */ + addi r4,r4,THREAD + ld r5,PT_REGS(r4) + ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) + lis r6,MSR_VSX@h + andc r6,r4,r6 + std r6,_MSR-STACK_FRAME_OVERHEAD(r5) +1: +#endif /* CONFIG_SMP */ + ld r4,PACACURRENT(r13) + addi r4,r4,THREAD /* Get THREAD */ + li r6,1 + stw r6,THREAD_USED_VSR(r4) /* ... also set thread used vsr */ + /* enable use of VSX after return */ + oris r12,r12,MSR_VSX@h + std r12,_MSR(r1) +#ifndef CONFIG_SMP + /* Update last_task_used_vsx to 'current' */ + ld r4,PACACURRENT(r13) + std r4,0(r3) +#endif /* CONFIG_SMP */ + b fast_exception_return + +/* + * __giveup_vsx(tsk) + * Disable VSX for the task given as the argument. + * Does NOT save vsx registers. + * Enables the VSX for use in the kernel on return. + */ +_GLOBAL(__giveup_vsx) + mfmsr r5 + oris r5,r5,MSR_VSX@h + mtmsrd r5 /* enable use of VSX now */ + isync + + cmpdi 0,r3,0 + beqlr- /* if no previous owner, done */ + addi r3,r3,THREAD /* want THREAD of task */ + ld r5,PT_REGS(r3) + cmpdi 0,r5,0 + beq 1f + ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) + lis r3,MSR_VSX@h + andc r4,r4,r3 /* disable VSX for previous task */ + std r4,_MSR-STACK_FRAME_OVERHEAD(r5) +1: +#ifndef CONFIG_SMP + li r5,0 + ld r4,last_task_used_vsx@got(r2) + std r5,0(r4) +#endif /* CONFIG_SMP */ + blr + +#endif /* CONFIG_VSX */ + /* * The routines below are in assembler so we can closely control the @@ -53,12 +342,12 @@ fpenable: stfd fr31,8(r1) LDCONST(fr1, fpzero) mffs fr31 - mtfsf 0xff,fr1 + MTFSF_L(fr1) blr fpdisable: mtlr r12 - mtfsf 0xff,fr31 + MTFSF_L(fr31) lfd fr31,8(r1) lfd fr1,16(r1) lfd fr0,24(r1) diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 97082a4203a..904c66128fa 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -1,11 +1,12 @@ /* * IBM PowerPC Virtual I/O Infrastructure 
Support. * - * Copyright (c) 2003-2005 IBM Corp. + * Copyright (c) 2003,2008 IBM Corp. * Dave Engebretsen engebret@us.ibm.com * Santiago Leon santil@us.ibm.com * Hollis Blanchard <hollisb@us.ibm.com> * Stephen Rothwell + * Robert Jennings <rcjenn@us.ibm.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -13,26 +14,1213 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/cpu.h> +#include <linux/types.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/device.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/console.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mm.h> #include <linux/dma-mapping.h> +#include <linux/kobject.h> + #include <asm/iommu.h> #include <asm/dma.h> #include <asm/vio.h> +#include <asm/prom.h> +#include <asm/firmware.h> +#include <asm/tce.h> +#include <asm/page.h> +#include <asm/hvcall.h> -static const struct vio_device_id *vio_match_device( - const struct vio_device_id *, const struct vio_dev *); - -struct vio_dev vio_bus_device = { /* fake "parent" device */ - .name = vio_bus_device.dev.bus_id, +static struct vio_dev vio_bus_device = { /* fake "parent" device */ + .name = "vio", .type = "", - .dev.bus_id = "vio", + .dev.init_name = "vio", .dev.bus = &vio_bus_type, }; -static struct vio_bus_ops vio_bus_ops; +#ifdef CONFIG_PPC_SMLPAR +/** + * vio_cmo_pool - A pool of IO memory for CMO use + * + * @size: The size of the pool in bytes + * @free: The amount of free memory in the pool + */ +struct vio_cmo_pool { + size_t size; + size_t free; +}; + +/* How many ms to delay queued balance work */ +#define VIO_CMO_BALANCE_DELAY 100 + +/* Portion out IO memory to CMO devices by this chunk size */ +#define VIO_CMO_BALANCE_CHUNK 131072 + +/** + * vio_cmo_dev_entry - A device that is CMO-enabled and requires entitlement + * + * @vio_dev: struct vio_dev pointer + * @list: pointer to 
other devices on bus that are being tracked + */ +struct vio_cmo_dev_entry { + struct vio_dev *viodev; + struct list_head list; +}; + +/** + * vio_cmo - VIO bus accounting structure for CMO entitlement + * + * @lock: spinlock for entire structure + * @balance_q: work queue for balancing system entitlement + * @device_list: list of CMO-enabled devices requiring entitlement + * @entitled: total system entitlement in bytes + * @reserve: pool of memory from which devices reserve entitlement, incl. spare + * @excess: pool of excess entitlement not needed for device reserves or spare + * @spare: IO memory for device hotplug functionality + * @min: minimum necessary for system operation + * @desired: desired memory for system operation + * @curr: bytes currently allocated + * @high: high water mark for IO data usage + */ +struct vio_cmo { + spinlock_t lock; + struct delayed_work balance_q; + struct list_head device_list; + size_t entitled; + struct vio_cmo_pool reserve; + struct vio_cmo_pool excess; + size_t spare; + size_t min; + size_t desired; + size_t curr; + size_t high; +} vio_cmo; + +/** + * vio_cmo_num_OF_devs - Count the number of OF devices that have DMA windows + */ +static int vio_cmo_num_OF_devs(void) +{ + struct device_node *node_vroot; + int count = 0; + + /* + * Count the number of vdevice entries with an + * ibm,my-dma-window OF property + */ + node_vroot = of_find_node_by_name(NULL, "vdevice"); + if (node_vroot) { + struct device_node *of_node; + struct property *prop; + + for_each_child_of_node(node_vroot, of_node) { + prop = of_find_property(of_node, "ibm,my-dma-window", + NULL); + if (prop) + count++; + } + } + of_node_put(node_vroot); + return count; +} + +/** + * vio_cmo_alloc - allocate IO memory for CMO-enabled devices + * + * @viodev: VIO device requesting IO memory + * @size: size of allocation requested + * + * Allocations come from memory reserved for the devices and any excess + * IO memory available to all devices.
The spare pool used to service + * hotplug must be equal to %VIO_CMO_MIN_ENT for the excess pool to be + * made available. + * + * Return codes: + * 0 for successful allocation and -ENOMEM for a failure + */ +static inline int vio_cmo_alloc(struct vio_dev *viodev, size_t size) +{ + unsigned long flags; + size_t reserve_free = 0; + size_t excess_free = 0; + int ret = -ENOMEM; + + spin_lock_irqsave(&vio_cmo.lock, flags); + + /* Determine the amount of free entitlement available in reserve */ + if (viodev->cmo.entitled > viodev->cmo.allocated) + reserve_free = viodev->cmo.entitled - viodev->cmo.allocated; + + /* If spare is not fulfilled, the excess pool can not be used. */ + if (vio_cmo.spare >= VIO_CMO_MIN_ENT) + excess_free = vio_cmo.excess.free; + + /* The request can be satisfied */ + if ((reserve_free + excess_free) >= size) { + vio_cmo.curr += size; + if (vio_cmo.curr > vio_cmo.high) + vio_cmo.high = vio_cmo.curr; + viodev->cmo.allocated += size; + size -= min(reserve_free, size); + vio_cmo.excess.free -= size; + ret = 0; + } + + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return ret; +} + +/** + * vio_cmo_dealloc - deallocate IO memory from CMO-enable devices + * @viodev: VIO device freeing IO memory + * @size: size of deallocation + * + * IO memory is freed by the device back to the correct memory pools. + * The spare pool is replenished first from either memory pool, then + * the reserve pool is used to reduce device entitlement, the excess + * pool is used to increase the reserve pool toward the desired entitlement + * target, and then the remaining memory is returned to the pools. 
+ * + */ +static inline void vio_cmo_dealloc(struct vio_dev *viodev, size_t size) +{ + unsigned long flags; + size_t spare_needed = 0; + size_t excess_freed = 0; + size_t reserve_freed = size; + size_t tmp; + int balance = 0; + + spin_lock_irqsave(&vio_cmo.lock, flags); + vio_cmo.curr -= size; + + /* Amount of memory freed from the excess pool */ + if (viodev->cmo.allocated > viodev->cmo.entitled) { + excess_freed = min(reserve_freed, (viodev->cmo.allocated - + viodev->cmo.entitled)); + reserve_freed -= excess_freed; + } + + /* Remove allocation from device */ + viodev->cmo.allocated -= (reserve_freed + excess_freed); + + /* Spare is a subset of the reserve pool, replenish it first. */ + spare_needed = VIO_CMO_MIN_ENT - vio_cmo.spare; + + /* + * Replenish the spare in the reserve pool from the excess pool. + * This moves entitlement into the reserve pool. + */ + if (spare_needed && excess_freed) { + tmp = min(excess_freed, spare_needed); + vio_cmo.excess.size -= tmp; + vio_cmo.reserve.size += tmp; + vio_cmo.spare += tmp; + excess_freed -= tmp; + spare_needed -= tmp; + balance = 1; + } + + /* + * Replenish the spare in the reserve pool from the reserve pool. + * This removes entitlement from the device down to VIO_CMO_MIN_ENT, + * if needed, and gives it to the spare pool. The amount of used + * memory in this pool does not change. + */ + if (spare_needed && reserve_freed) { + tmp = min3(spare_needed, reserve_freed, (viodev->cmo.entitled - VIO_CMO_MIN_ENT)); + + vio_cmo.spare += tmp; + viodev->cmo.entitled -= tmp; + reserve_freed -= tmp; + spare_needed -= tmp; + balance = 1; + } + + /* + * Increase the reserve pool until the desired allocation is met. + * Move an allocation freed from the excess pool into the reserve + * pool and schedule a balance operation. 
+ */ + if (excess_freed && (vio_cmo.desired > vio_cmo.reserve.size)) { + tmp = min(excess_freed, (vio_cmo.desired - vio_cmo.reserve.size)); + + vio_cmo.excess.size -= tmp; + vio_cmo.reserve.size += tmp; + excess_freed -= tmp; + balance = 1; + } + + /* Return memory from the excess pool to that pool */ + if (excess_freed) + vio_cmo.excess.free += excess_freed; + + if (balance) + schedule_delayed_work(&vio_cmo.balance_q, VIO_CMO_BALANCE_DELAY); + spin_unlock_irqrestore(&vio_cmo.lock, flags); +} + +/** + * vio_cmo_entitlement_update - Manage system entitlement changes + * + * @new_entitlement: new system entitlement to attempt to accommodate + * + * Increases in entitlement will be used to fulfill the spare entitlement + * and the rest is given to the excess pool. Decreases, if they are + * possible, come from the excess pool and from unused device entitlement + * + * Returns: 0 on success, -ENOMEM when change can not be made + */ +int vio_cmo_entitlement_update(size_t new_entitlement) +{ + struct vio_dev *viodev; + struct vio_cmo_dev_entry *dev_ent; + unsigned long flags; + size_t avail, delta, tmp; + + spin_lock_irqsave(&vio_cmo.lock, flags); + + /* Entitlement increases */ + if (new_entitlement > vio_cmo.entitled) { + delta = new_entitlement - vio_cmo.entitled; + + /* Fulfill spare allocation */ + if (vio_cmo.spare < VIO_CMO_MIN_ENT) { + tmp = min(delta, (VIO_CMO_MIN_ENT - vio_cmo.spare)); + vio_cmo.spare += tmp; + vio_cmo.reserve.size += tmp; + delta -= tmp; + } + + /* Remaining new allocation goes to the excess pool */ + vio_cmo.entitled += delta; + vio_cmo.excess.size += delta; + vio_cmo.excess.free += delta; + + goto out; + } + + /* Entitlement decreases */ + delta = vio_cmo.entitled - new_entitlement; + avail = vio_cmo.excess.free; + + /* + * Need to check how much unused entitlement each device can + * sacrifice to fulfill entitlement change. 
+ */ + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + if (avail >= delta) + break; + + viodev = dev_ent->viodev; + if ((viodev->cmo.entitled > viodev->cmo.allocated) && + (viodev->cmo.entitled > VIO_CMO_MIN_ENT)) + avail += viodev->cmo.entitled - + max_t(size_t, viodev->cmo.allocated, + VIO_CMO_MIN_ENT); + } + + if (delta <= avail) { + vio_cmo.entitled -= delta; + + /* Take entitlement from the excess pool first */ + tmp = min(vio_cmo.excess.free, delta); + vio_cmo.excess.size -= tmp; + vio_cmo.excess.free -= tmp; + delta -= tmp; + + /* + * Remove all but VIO_CMO_MIN_ENT bytes from devices + * until entitlement change is served + */ + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + if (!delta) + break; + + viodev = dev_ent->viodev; + tmp = 0; + if ((viodev->cmo.entitled > viodev->cmo.allocated) && + (viodev->cmo.entitled > VIO_CMO_MIN_ENT)) + tmp = viodev->cmo.entitled - + max_t(size_t, viodev->cmo.allocated, + VIO_CMO_MIN_ENT); + viodev->cmo.entitled -= min(tmp, delta); + delta -= min(tmp, delta); + } + } else { + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return -ENOMEM; + } + +out: + schedule_delayed_work(&vio_cmo.balance_q, 0); + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return 0; +} + +/** + * vio_cmo_balance - Balance entitlement among devices + * + * @work: work queue structure for this operation + * + * Any system entitlement above the minimum needed for devices, or + * already allocated to devices, can be distributed to the devices. + * The list of devices is iterated through to recalculate the desired + * entitlement level and to determine how much entitlement above the + * minimum entitlement is allocated to devices. + * + * Small chunks of the available entitlement are given to devices until + * their requirements are fulfilled or there is no entitlement left to give. + * Upon completion sizes of the reserve and excess pools are calculated. + * + * The system minimum entitlement level is also recalculated here. 
+ * Entitlement will be reserved for devices even after vio_bus_remove to + * accommodate reloading the driver. The OF tree is walked to count the + * number of devices present and this will remove entitlement for devices + * that have actually left the system after having vio_bus_remove called. + */ +static void vio_cmo_balance(struct work_struct *work) +{ + struct vio_cmo *cmo; + struct vio_dev *viodev; + struct vio_cmo_dev_entry *dev_ent; + unsigned long flags; + size_t avail = 0, level, chunk, need; + int devcount = 0, fulfilled; + + cmo = container_of(work, struct vio_cmo, balance_q.work); + + spin_lock_irqsave(&vio_cmo.lock, flags); + + /* Calculate minimum entitlement and fulfill spare */ + cmo->min = vio_cmo_num_OF_devs() * VIO_CMO_MIN_ENT; + BUG_ON(cmo->min > cmo->entitled); + cmo->spare = min_t(size_t, VIO_CMO_MIN_ENT, (cmo->entitled - cmo->min)); + cmo->min += cmo->spare; + cmo->desired = cmo->min; + + /* + * Determine how much entitlement is available and reset device + * entitlements + */ + avail = cmo->entitled - cmo->spare; + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + viodev = dev_ent->viodev; + devcount++; + viodev->cmo.entitled = VIO_CMO_MIN_ENT; + cmo->desired += (viodev->cmo.desired - VIO_CMO_MIN_ENT); + avail -= max_t(size_t, viodev->cmo.allocated, VIO_CMO_MIN_ENT); + } + + /* + * Having provided each device with the minimum entitlement, loop + * over the devices portioning out the remaining entitlement + * until there is nothing left. + */ + level = VIO_CMO_MIN_ENT; + while (avail) { + fulfilled = 0; + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + viodev = dev_ent->viodev; + + if (viodev->cmo.desired <= level) { + fulfilled++; + continue; + } + + /* + * Give the device up to VIO_CMO_BALANCE_CHUNK + * bytes of entitlement, but do not exceed the + * desired level of entitlement for the device. 
+ */ + chunk = min_t(size_t, avail, VIO_CMO_BALANCE_CHUNK); + chunk = min(chunk, (viodev->cmo.desired - + viodev->cmo.entitled)); + viodev->cmo.entitled += chunk; + + /* + * If the memory for this entitlement increase was + * already allocated to the device it does not come + * from the available pool being portioned out. + */ + need = max(viodev->cmo.allocated, viodev->cmo.entitled)- + max(viodev->cmo.allocated, level); + avail -= need; + + } + if (fulfilled == devcount) + break; + level += VIO_CMO_BALANCE_CHUNK; + } + + /* Calculate new reserve and excess pool sizes */ + cmo->reserve.size = cmo->min; + cmo->excess.free = 0; + cmo->excess.size = 0; + need = 0; + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + viodev = dev_ent->viodev; + /* Calculated reserve size above the minimum entitlement */ + if (viodev->cmo.entitled) + cmo->reserve.size += (viodev->cmo.entitled - + VIO_CMO_MIN_ENT); + /* Calculated used excess entitlement */ + if (viodev->cmo.allocated > viodev->cmo.entitled) + need += viodev->cmo.allocated - viodev->cmo.entitled; + } + cmo->excess.size = cmo->entitled - cmo->reserve.size; + cmo->excess.free = cmo->excess.size - need; + + cancel_delayed_work(to_delayed_work(work)); + spin_unlock_irqrestore(&vio_cmo.lock, flags); +} + +static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + struct dma_attrs *attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + void *ret; + + if (vio_cmo_alloc(viodev, roundup(size, PAGE_SIZE))) { + atomic_inc(&viodev->cmo.allocs_failed); + return NULL; + } + + ret = dma_iommu_ops.alloc(dev, size, dma_handle, flag, attrs); + if (unlikely(ret == NULL)) { + vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE)); + atomic_inc(&viodev->cmo.allocs_failed); + } + + return ret; +} + +static void vio_dma_iommu_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle, + struct dma_attrs *attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); 
+ + dma_iommu_ops.free(dev, size, vaddr, dma_handle, attrs); + + vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE)); +} + +static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + struct iommu_table *tbl; + dma_addr_t ret = DMA_ERROR_CODE; + + tbl = get_iommu_table_base(dev); + if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) { + atomic_inc(&viodev->cmo.allocs_failed); + return ret; + } + + ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs); + if (unlikely(dma_mapping_error(dev, ret))) { + vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); + atomic_inc(&viodev->cmo.allocs_failed); + } + + return ret; +} + +static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, + size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + struct iommu_table *tbl; + + tbl = get_iommu_table_base(dev); + dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs); + + vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); +} + +static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + struct iommu_table *tbl; + struct scatterlist *sgl; + int ret, count = 0; + size_t alloc_size = 0; + + tbl = get_iommu_table_base(dev); + for (sgl = sglist; count < nelems; count++, sgl++) + alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl)); + + if (vio_cmo_alloc(viodev, alloc_size)) { + atomic_inc(&viodev->cmo.allocs_failed); + return 0; + } + + ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs); + + if (unlikely(!ret)) { + vio_cmo_dealloc(viodev, alloc_size); + atomic_inc(&viodev->cmo.allocs_failed); + return ret; + } + 
+ for (sgl = sglist, count = 0; count < ret; count++, sgl++) + alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); + if (alloc_size) + vio_cmo_dealloc(viodev, alloc_size); + + return ret; +} + +static void vio_dma_iommu_unmap_sg(struct device *dev, + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + struct iommu_table *tbl; + struct scatterlist *sgl; + size_t alloc_size = 0; + int count = 0; + + tbl = get_iommu_table_base(dev); + for (sgl = sglist; count < nelems; count++, sgl++) + alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); + + dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs); + + vio_cmo_dealloc(viodev, alloc_size); +} + +static int vio_dma_iommu_dma_supported(struct device *dev, u64 mask) +{ + return dma_iommu_ops.dma_supported(dev, mask); +} + +static u64 vio_dma_get_required_mask(struct device *dev) +{ + return dma_iommu_ops.get_required_mask(dev); +} + +struct dma_map_ops vio_dma_mapping_ops = { + .alloc = vio_dma_iommu_alloc_coherent, + .free = vio_dma_iommu_free_coherent, + .mmap = dma_direct_mmap_coherent, + .map_sg = vio_dma_iommu_map_sg, + .unmap_sg = vio_dma_iommu_unmap_sg, + .map_page = vio_dma_iommu_map_page, + .unmap_page = vio_dma_iommu_unmap_page, + .dma_supported = vio_dma_iommu_dma_supported, + .get_required_mask = vio_dma_get_required_mask, +}; + +/** + * vio_cmo_set_dev_desired - Set desired entitlement for a device + * + * @viodev: struct vio_dev for device to alter + * @desired: new desired entitlement level in bytes + * + * For use by devices to request a change to their entitlement at runtime or + * through sysfs. The desired entitlement level is changed and a balancing + * of system resources is scheduled to run in the future. 
+ */
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired)
+{
+	unsigned long flags;
+	struct vio_cmo_dev_entry *dev_ent;
+	int found = 0;
+
+	if (!firmware_has_feature(FW_FEATURE_CMO))
+		return;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+	if (desired < VIO_CMO_MIN_ENT)
+		desired = VIO_CMO_MIN_ENT;
+
+	/*
+	 * Changes will not be made for devices not in the device list.
+	 * If it is not in the device list, then no driver is loaded
+	 * for the device and it can not receive entitlement.
+	 */
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+		if (viodev == dev_ent->viodev) {
+			found = 1;
+			break;
+		}
+	if (!found) {
+		spin_unlock_irqrestore(&vio_cmo.lock, flags);
+		return;
+	}
+
+	/* Increase/decrease in desired device entitlement */
+	if (desired >= viodev->cmo.desired) {
+		/* Just bump the bus and device values prior to a balance*/
+		vio_cmo.desired += desired - viodev->cmo.desired;
+		viodev->cmo.desired = desired;
+	} else {
+		/* Decrease bus and device values for desired entitlement */
+		vio_cmo.desired -= viodev->cmo.desired - desired;
+		viodev->cmo.desired = desired;
+		/*
+		 * If less entitlement is desired than current entitlement, move
+		 * any reserve memory in the change region to the excess pool.
+		 */
+		if (viodev->cmo.entitled > desired) {
+			vio_cmo.reserve.size -= viodev->cmo.entitled - desired;
+			vio_cmo.excess.size += viodev->cmo.entitled - desired;
+			/*
+			 * If entitlement moving from the reserve pool to the
+			 * excess pool is currently unused, add to the excess
+			 * free counter.
+			 */
+			if (viodev->cmo.allocated < viodev->cmo.entitled)
+				vio_cmo.excess.free += viodev->cmo.entitled -
+					max(viodev->cmo.allocated, desired);
+			viodev->cmo.entitled = desired;
+		}
+	}
+	schedule_delayed_work(&vio_cmo.balance_q, 0);
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+/**
+ * vio_cmo_bus_probe - Handle CMO specific bus probe activities
+ *
+ * @viodev - Pointer to struct vio_dev for device
+ *
+ * Determine the devices IO memory entitlement needs, attempting
+ * to satisfy the system minimum entitlement at first and scheduling
+ * a balance operation to take care of the rest at a later time.
+ *
+ * For a DMA capable device a struct vio_cmo_dev_entry is allocated and
+ * linked into vio_cmo.device_list. If the probe subsequently fails for
+ * lack of entitlement, that entry is unlinked and freed again so that
+ * no entry for a device that failed to probe is left on the list.
+ *
+ * Returns: 0 on success, -EINVAL when device doesn't support CMO, and
+ * -ENOMEM when entitlement is not available for device or
+ * device entry.
+ *
+ */
+static int vio_cmo_bus_probe(struct vio_dev *viodev)
+{
+	/* Stays NULL for devices that are not DMA capable */
+	struct vio_cmo_dev_entry *dev_ent = NULL;
+	struct device *dev = &viodev->dev;
+	struct iommu_table *tbl;
+	struct vio_driver *viodrv = to_vio_driver(dev->driver);
+	unsigned long flags;
+	size_t size;
+	bool dma_capable = false;
+
+	tbl = get_iommu_table_base(dev);
+
+	/* A device requires entitlement if it has a DMA window property */
+	switch (viodev->family) {
+	case VDEVICE:
+		if (of_get_property(viodev->dev.of_node,
+					"ibm,my-dma-window", NULL))
+			dma_capable = true;
+		break;
+	case PFO:
+		dma_capable = false;
+		break;
+	default:
+		dev_warn(dev, "unknown device family: %d\n", viodev->family);
+		BUG();
+		break;
+	}
+
+	/* Configure entitlement for the device. */
+	if (dma_capable) {
+		/* Check that the driver is CMO enabled and get desired DMA */
+		if (!viodrv->get_desired_dma) {
+			dev_err(dev, "%s: device driver does not support CMO\n",
+				__func__);
+			return -EINVAL;
+		}
+
+		viodev->cmo.desired =
+			IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev), tbl);
+		if (viodev->cmo.desired < VIO_CMO_MIN_ENT)
+			viodev->cmo.desired = VIO_CMO_MIN_ENT;
+		size = VIO_CMO_MIN_ENT;
+
+		/* Allocate before taking the lock: GFP_KERNEL may sleep */
+		dev_ent = kmalloc(sizeof(struct vio_cmo_dev_entry),
+				  GFP_KERNEL);
+		if (!dev_ent)
+			return -ENOMEM;
+
+		dev_ent->viodev = viodev;
+		spin_lock_irqsave(&vio_cmo.lock, flags);
+		list_add(&dev_ent->list, &vio_cmo.device_list);
+	} else {
+		viodev->cmo.desired = 0;
+		size = 0;
+		spin_lock_irqsave(&vio_cmo.lock, flags);
+	}
+
+	/*
+	 * If the needs for vio_cmo.min have not changed since they
+	 * were last set, the number of devices in the OF tree has
+	 * been constant and the IO memory for this is already in
+	 * the reserve pool.
+	 */
+	if (vio_cmo.min == ((vio_cmo_num_OF_devs() + 1) *
+			    VIO_CMO_MIN_ENT)) {
+		/* Updated desired entitlement if device requires it */
+		if (size)
+			vio_cmo.desired += (viodev->cmo.desired -
+					VIO_CMO_MIN_ENT);
+	} else {
+		size_t tmp;
+
+		tmp = vio_cmo.spare + vio_cmo.excess.free;
+		if (tmp < size) {
+			/* tmp already contains spare; do not add it twice */
+			dev_err(dev, "%s: insufficient free "
+				"entitlement to add device. "
+				"Need %lu, have %lu\n", __func__,
+				size, tmp);
+			/*
+			 * Undo the list_add() above. The caller does not
+			 * run vio_cmo_bus_remove() when this probe fails,
+			 * so the entry would otherwise stay on the list.
+			 */
+			if (dev_ent)
+				list_del(&dev_ent->list);
+			spin_unlock_irqrestore(&vio_cmo.lock, flags);
+			kfree(dev_ent);
+			return -ENOMEM;
+		}
+
+		/* Use excess pool first to fulfill request */
+		tmp = min(size, vio_cmo.excess.free);
+		vio_cmo.excess.free -= tmp;
+		vio_cmo.excess.size -= tmp;
+		vio_cmo.reserve.size += tmp;
+
+		/* Use spare if excess pool was insufficient */
+		vio_cmo.spare -= size - tmp;
+
+		/* Update bus accounting */
+		vio_cmo.min += size;
+		vio_cmo.desired += viodev->cmo.desired;
+	}
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+	return 0;
+}
+
+/**
+ * vio_cmo_bus_remove - Handle CMO specific bus removal activities
+ *
+ * @viodev - Pointer to struct vio_dev for device
+ *
+ * Remove the device from the cmo device list. The minimum entitlement
+ * will be reserved for the device as long as it is in the system. The
+ * rest of the entitlement the device had been allocated will be returned
+ * to the system.
+ */
+static void vio_cmo_bus_remove(struct vio_dev *viodev)
+{
+	struct vio_cmo_dev_entry *dev_ent;
+	unsigned long flags;
+	size_t tmp;
+
+	spin_lock_irqsave(&vio_cmo.lock, flags);
+	if (viodev->cmo.allocated) {
+		dev_err(&viodev->dev, "%s: device had %lu bytes of IO "
+			"allocated after remove operation.\n",
+			__func__, viodev->cmo.allocated);
+		BUG();
+	}
+
+	/*
+	 * Remove the device from the device list being maintained for
+	 * CMO enabled devices.
+	 */
+	list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+		if (viodev == dev_ent->viodev) {
+			list_del(&dev_ent->list);
+			kfree(dev_ent);
+			break;
+		}
+
+	/*
+	 * Devices may not require any entitlement and they do not need
+	 * to be processed. Otherwise, return the device's entitlement
+	 * back to the pools.
+	 */
+	if (viodev->cmo.entitled) {
+		/*
+		 * This device has not yet left the OF tree, its
+		 * minimum entitlement remains in vio_cmo.min and
+		 * vio_cmo.desired
+		 */
+		vio_cmo.desired -= (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+
+		/*
+		 * Save min allocation for device in reserve as long
+		 * as it exists in OF tree as determined by later
+		 * balance operation
+		 */
+		viodev->cmo.entitled -= VIO_CMO_MIN_ENT;
+
+		/* Replenish spare from freed reserve pool */
+		if (viodev->cmo.entitled && (vio_cmo.spare < VIO_CMO_MIN_ENT)) {
+			tmp = min(viodev->cmo.entitled, (VIO_CMO_MIN_ENT -
+							 vio_cmo.spare));
+			vio_cmo.spare += tmp;
+			viodev->cmo.entitled -= tmp;
+		}
+
+		/* Remaining reserve goes to excess pool */
+		vio_cmo.excess.size += viodev->cmo.entitled;
+		vio_cmo.excess.free += viodev->cmo.entitled;
+		vio_cmo.reserve.size -= viodev->cmo.entitled;
+
+		/*
+		 * Until the device is removed it will keep a
+		 * minimum entitlement; this will guarantee that
+		 * a module unload/load will result in a success.
+		 */
+		viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+		viodev->cmo.desired = VIO_CMO_MIN_ENT;
+		atomic_set(&viodev->cmo.allocs_failed, 0);
+	}
+
+	spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev)
+{
+	set_dma_ops(&viodev->dev, &vio_dma_mapping_ops);
+}
+
+/**
+ * vio_cmo_bus_init - CMO entitlement initialization at bus init time
+ *
+ * Set up the reserve and excess entitlement pools based on available
+ * system entitlement and the number of devices in the OF tree that
+ * require entitlement in the reserve pool.
+ */ +static void vio_cmo_bus_init(void) +{ + struct hvcall_mpp_data mpp_data; + int err; + + memset(&vio_cmo, 0, sizeof(struct vio_cmo)); + spin_lock_init(&vio_cmo.lock); + INIT_LIST_HEAD(&vio_cmo.device_list); + INIT_DELAYED_WORK(&vio_cmo.balance_q, vio_cmo_balance); + + /* Get current system entitlement */ + err = h_get_mpp(&mpp_data); + + /* + * On failure, continue with entitlement set to 0, will panic() + * later when spare is reserved. + */ + if (err != H_SUCCESS) { + printk(KERN_ERR "%s: unable to determine system IO "\ + "entitlement. (%d)\n", __func__, err); + vio_cmo.entitled = 0; + } else { + vio_cmo.entitled = mpp_data.entitled_mem; + } + + /* Set reservation and check against entitlement */ + vio_cmo.spare = VIO_CMO_MIN_ENT; + vio_cmo.reserve.size = vio_cmo.spare; + vio_cmo.reserve.size += (vio_cmo_num_OF_devs() * + VIO_CMO_MIN_ENT); + if (vio_cmo.reserve.size > vio_cmo.entitled) { + printk(KERN_ERR "%s: insufficient system entitlement\n", + __func__); + panic("%s: Insufficient system entitlement", __func__); + } + + /* Set the remaining accounting variables */ + vio_cmo.excess.size = vio_cmo.entitled - vio_cmo.reserve.size; + vio_cmo.excess.free = vio_cmo.excess.size; + vio_cmo.min = vio_cmo.reserve.size; + vio_cmo.desired = vio_cmo.reserve.size; +} + +/* sysfs device functions and data structures for CMO */ + +#define viodev_cmo_rd_attr(name) \ +static ssize_t viodev_cmo_##name##_show(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + return sprintf(buf, "%lu\n", to_vio_dev(dev)->cmo.name); \ +} + +static ssize_t viodev_cmo_allocs_failed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct vio_dev *viodev = to_vio_dev(dev); + return sprintf(buf, "%d\n", atomic_read(&viodev->cmo.allocs_failed)); +} + +static ssize_t viodev_cmo_allocs_failed_reset(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct vio_dev *viodev = to_vio_dev(dev); + 
atomic_set(&viodev->cmo.allocs_failed, 0); + return count; +} + +static ssize_t viodev_cmo_desired_set(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct vio_dev *viodev = to_vio_dev(dev); + size_t new_desired; + int ret; + + ret = strict_strtoul(buf, 10, &new_desired); + if (ret) + return ret; + + vio_cmo_set_dev_desired(viodev, new_desired); + return count; +} + +viodev_cmo_rd_attr(desired); +viodev_cmo_rd_attr(entitled); +viodev_cmo_rd_attr(allocated); + +static ssize_t name_show(struct device *, struct device_attribute *, char *); +static ssize_t devspec_show(struct device *, struct device_attribute *, char *); +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf); +static struct device_attribute vio_cmo_dev_attrs[] = { + __ATTR_RO(name), + __ATTR_RO(devspec), + __ATTR_RO(modalias), + __ATTR(cmo_desired, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH, + viodev_cmo_desired_show, viodev_cmo_desired_set), + __ATTR(cmo_entitled, S_IRUGO, viodev_cmo_entitled_show, NULL), + __ATTR(cmo_allocated, S_IRUGO, viodev_cmo_allocated_show, NULL), + __ATTR(cmo_allocs_failed, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH, + viodev_cmo_allocs_failed_show, viodev_cmo_allocs_failed_reset), + __ATTR_NULL +}; + +/* sysfs bus functions and data structures for CMO */ + +#define viobus_cmo_rd_attr(name) \ +static ssize_t cmo_##name##_show(struct bus_type *bt, char *buf) \ +{ \ + return sprintf(buf, "%lu\n", vio_cmo.name); \ +} \ +static BUS_ATTR_RO(cmo_##name) + +#define viobus_cmo_pool_rd_attr(name, var) \ +static ssize_t \ +cmo_##name##_##var##_show(struct bus_type *bt, char *buf) \ +{ \ + return sprintf(buf, "%lu\n", vio_cmo.name.var); \ +} \ +static BUS_ATTR_RO(cmo_##name##_##var) + +viobus_cmo_rd_attr(entitled); +viobus_cmo_rd_attr(spare); +viobus_cmo_rd_attr(min); +viobus_cmo_rd_attr(desired); +viobus_cmo_rd_attr(curr); +viobus_cmo_pool_rd_attr(reserve, size); +viobus_cmo_pool_rd_attr(excess, size); 
+viobus_cmo_pool_rd_attr(excess, free); + +static ssize_t cmo_high_show(struct bus_type *bt, char *buf) +{ + return sprintf(buf, "%lu\n", vio_cmo.high); +} + +static ssize_t cmo_high_store(struct bus_type *bt, const char *buf, + size_t count) +{ + unsigned long flags; + + spin_lock_irqsave(&vio_cmo.lock, flags); + vio_cmo.high = vio_cmo.curr; + spin_unlock_irqrestore(&vio_cmo.lock, flags); + + return count; +} +static BUS_ATTR_RW(cmo_high); + +static struct attribute *vio_bus_attrs[] = { + &bus_attr_cmo_entitled.attr, + &bus_attr_cmo_spare.attr, + &bus_attr_cmo_min.attr, + &bus_attr_cmo_desired.attr, + &bus_attr_cmo_curr.attr, + &bus_attr_cmo_high.attr, + &bus_attr_cmo_reserve_size.attr, + &bus_attr_cmo_excess_size.attr, + &bus_attr_cmo_excess_free.attr, + NULL, +}; +ATTRIBUTE_GROUPS(vio_bus); + +static void vio_cmo_sysfs_init(void) +{ + vio_bus_type.dev_attrs = vio_cmo_dev_attrs; + vio_bus_type.bus_groups = vio_bus_groups; +} +#else /* CONFIG_PPC_SMLPAR */ +int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; } +void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {} +static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; } +static void vio_cmo_bus_remove(struct vio_dev *viodev) {} +static void vio_cmo_set_dma_ops(struct vio_dev *viodev) {} +static void vio_cmo_bus_init(void) {} +static void vio_cmo_sysfs_init(void) { } +#endif /* CONFIG_PPC_SMLPAR */ +EXPORT_SYMBOL(vio_cmo_entitlement_update); +EXPORT_SYMBOL(vio_cmo_set_dev_desired); + + +/* + * Platform Facilities Option (PFO) support + */ + +/** + * vio_h_cop_sync - Perform a synchronous PFO co-processor operation + * + * @vdev - Pointer to a struct vio_dev for device + * @op - Pointer to a struct vio_pfo_op for the operation parameters + * + * Calls the hypervisor to synchronously perform the PFO operation + * described in @op. 
+ * In the case of a busy response from the hypervisor,
+ * the operation will be re-submitted indefinitely unless a non-zero timeout
+ * is specified or an error occurs. The timeout places a limit on when to
+ * stop re-submitting a operation, the total time can be exceeded if an
+ * operation is in progress.
+ *
+ * On return, op->hcall_err holds the return value of the last H_COP
+ * hcall that was issued.
+ *
+ * Returns:
+ *	0 on success,
+ *	-EINVAL if the h_call fails due to an invalid parameter,
+ *	-E2BIG if the h_call can not be performed synchronously,
+ *	-EBUSY if a timeout is specified and has elapsed,
+ *	-EACCES if the memory area for data/status has been rescinded, or
+ *	-EPERM if a hardware fault has been indicated
+ */
+int vio_h_cop_sync(struct vio_dev *vdev, struct vio_pfo_op *op)
+{
+	struct device *dev = &vdev->dev;
+	unsigned long deadline = 0;
+	long hret = 0;
+	int ret = 0;
+
+	if (op->timeout)
+		deadline = jiffies + msecs_to_jiffies(op->timeout);
+
+	while (true) {
+		hret = plpar_hcall_norets(H_COP, op->flags,
+				vdev->resource_id,
+				op->in, op->inlen, op->out,
+				op->outlen, op->csbcpb);
+
+		/*
+		 * Stop on success, on any non-busy error, or once the
+		 * deadline has passed. time_after(a, b) is true when a is
+		 * later than b, so jiffies must be the first argument to
+		 * detect an elapsed timeout.
+		 */
+		if (hret == H_SUCCESS ||
+		    (hret != H_NOT_ENOUGH_RESOURCES &&
+		     hret != H_BUSY && hret != H_RESOURCE) ||
+		    (op->timeout && time_after(jiffies, deadline)))
+			break;
+
+		dev_dbg(dev, "%s: hcall ret(%ld), retrying.\n", __func__, hret);
+	}
+
+	/* Translate the hypervisor return code into an errno value */
+	switch (hret) {
+	case H_SUCCESS:
+		ret = 0;
+		break;
+	case H_OP_MODE:
+	case H_TOO_BIG:
+		ret = -E2BIG;
+		break;
+	case H_RESCINDED:
+		ret = -EACCES;
+		break;
+	case H_HARDWARE:
+		ret = -EPERM;
+		break;
+	case H_NOT_ENOUGH_RESOURCES:
+	case H_RESOURCE:
+	case H_BUSY:
+		/* Still busy when the retry loop gave up */
+		ret = -EBUSY;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	if (ret)
+		dev_dbg(dev, "%s: Sync h_cop_op failure (ret:%d) (hret:%ld)\n",
+				__func__, ret, hret);
+
+	op->hcall_err = hret;
+	return ret;
+}
+EXPORT_SYMBOL(vio_h_cop_sync);
+
+static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
+{
+	const __be32 *dma_window;
+	struct iommu_table *tbl;
+	unsigned long offset, size;
+
+	dma_window = of_get_property(dev->dev.of_node,
+				  "ibm,my-dma-window", NULL);
+	if (!dma_window)
+		return NULL;
+
+	tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
+	if (tbl == NULL)
+		return NULL;
+
+	of_parse_dma_window(dev->dev.of_node, dma_window,
+			    &tbl->it_index, &offset, &size);
+
+	/* TCE table size - measured in tce entries */
+	tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
+	tbl->it_size = size >> tbl->it_page_shift;
+	/* offset for VIO should always be 0 */
+	tbl->it_offset = offset >> tbl->it_page_shift;
+	tbl->it_busno = 0;
+	tbl->it_type = TCE_VB;
+	tbl->it_blocksize = 16;
+
+	return iommu_init_table(tbl, -1);
+}
+
+/**
+ * vio_match_device: - Tell if a VIO device has a matching
+ *			VIO device id structure.
+ * @ids:	array of VIO device id structures to search in
+ * @dev:	the VIO device structure to match against
+ *
+ * Used by a driver to check whether a VIO device present in the
+ * system is in its list of supported devices. Returns the matching
+ * vio_device_id structure or NULL if there is no match.
+ */
+static const struct vio_device_id *vio_match_device(
+		const struct vio_device_id *ids, const struct vio_dev *dev)
+{
+	while (ids->type[0] != '\0') {
+		if ((strncmp(dev->type, ids->type, strlen(ids->type)) == 0) &&
+		    of_device_is_compatible(dev->dev.of_node,
+					 ids->compat))
+			return ids;
+		ids++;
+	}
+	return NULL;
+}
 
 /*
  * Convert from struct device to struct vio_dev and pass to driver.
@@ -50,8 +1238,17 @@ static int vio_bus_probe(struct device *dev) return error; id = vio_match_device(viodrv->id_table, viodev); - if (id) + if (id) { + memset(&viodev->cmo, 0, sizeof(viodev->cmo)); + if (firmware_has_feature(FW_FEATURE_CMO)) { + error = vio_cmo_bus_probe(viodev); + if (error) + return error; + } error = viodrv->probe(viodev, id); + if (error && firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_bus_remove(viodev); + } return error; } @@ -61,46 +1258,48 @@ static int vio_bus_remove(struct device *dev) { struct vio_dev *viodev = to_vio_dev(dev); struct vio_driver *viodrv = to_vio_driver(dev->driver); + struct device *devptr; + int ret = 1; - if (viodrv->remove) - return viodrv->remove(viodev); + /* + * Hold a reference to the device after the remove function is called + * to allow for CMO accounting cleanup for the device. + */ + devptr = get_device(dev); - /* driver can't remove */ - return 1; -} + if (viodrv->remove) + ret = viodrv->remove(viodev); -/* convert from struct device to struct vio_dev and pass to driver. */ -static void vio_bus_shutdown(struct device *dev) -{ - struct vio_dev *viodev = to_vio_dev(dev); - struct vio_driver *viodrv = to_vio_driver(dev->driver); + if (!ret && firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_bus_remove(viodev); - if (viodrv->shutdown) - viodrv->shutdown(viodev); + put_device(devptr); + return ret; } /** * vio_register_driver: - Register a new vio driver - * @drv: The vio_driver structure to be registered. + * @viodrv: The vio_driver structure to be registered. 
*/ -int vio_register_driver(struct vio_driver *viodrv) +int __vio_register_driver(struct vio_driver *viodrv, struct module *owner, + const char *mod_name) { - printk(KERN_DEBUG "%s: driver %s registering\n", __FUNCTION__, - viodrv->driver.name); + pr_debug("%s: driver %s registering\n", __func__, viodrv->name); /* fill in 'struct driver' fields */ + viodrv->driver.name = viodrv->name; + viodrv->driver.pm = viodrv->pm; viodrv->driver.bus = &vio_bus_type; - viodrv->driver.probe = vio_bus_probe; - viodrv->driver.remove = vio_bus_remove; - viodrv->driver.shutdown = vio_bus_shutdown; + viodrv->driver.owner = owner; + viodrv->driver.mod_name = mod_name; return driver_register(&viodrv->driver); } -EXPORT_SYMBOL(vio_register_driver); +EXPORT_SYMBOL(__vio_register_driver); /** * vio_unregister_driver - Remove registration of vio driver. - * @driver: The vio_driver struct to be removed form registration + * @viodrv: The vio_driver struct to be removed form registration */ void vio_unregister_driver(struct vio_driver *viodrv) { @@ -108,35 +1307,192 @@ void vio_unregister_driver(struct vio_driver *viodrv) } EXPORT_SYMBOL(vio_unregister_driver); +/* vio_dev refcount hit 0 */ +static void vio_dev_release(struct device *dev) +{ + struct iommu_table *tbl = get_iommu_table_base(dev); + + if (tbl) + iommu_free_table(tbl, of_node_full_name(dev->of_node)); + of_node_put(dev->of_node); + kfree(to_vio_dev(dev)); +} + /** - * vio_match_device: - Tell if a VIO device has a matching - * VIO device id structure. - * @ids: array of VIO device id structures to search in - * @dev: the VIO device structure to match against + * vio_register_device_node: - Register a new vio device. + * @of_node: The OF node for this device. * - * Used by a driver to check whether a VIO device present in the - * system is in its list of supported devices. Returns the matching - * vio_device_id structure or NULL if there is no match. 
+ * Creates and initializes a vio_dev structure from the data in + * of_node and adds it to the list of virtual devices. + * Returns a pointer to the created vio_dev or NULL if node has + * NULL device_type or compatible fields. */ -static const struct vio_device_id *vio_match_device( - const struct vio_device_id *ids, const struct vio_dev *dev) +struct vio_dev *vio_register_device_node(struct device_node *of_node) { - while (ids->type[0] != '\0') { - if (vio_bus_ops.match(ids, dev)) - return ids; - ids++; + struct vio_dev *viodev; + struct device_node *parent_node; + const __be32 *prop; + enum vio_dev_family family; + const char *of_node_name = of_node->name ? of_node->name : "<unknown>"; + + /* + * Determine if this node is a under the /vdevice node or under the + * /ibm,platform-facilities node. This decides the device's family. + */ + parent_node = of_get_parent(of_node); + if (parent_node) { + if (!strcmp(parent_node->full_name, "/ibm,platform-facilities")) + family = PFO; + else if (!strcmp(parent_node->full_name, "/vdevice")) + family = VDEVICE; + else { + pr_warn("%s: parent(%s) of %s not recognized.\n", + __func__, + parent_node->full_name, + of_node_name); + of_node_put(parent_node); + return NULL; + } + of_node_put(parent_node); + } else { + pr_warn("%s: could not determine the parent of node %s.\n", + __func__, of_node_name); + return NULL; + } + + if (family == PFO) { + if (of_get_property(of_node, "interrupt-controller", NULL)) { + pr_debug("%s: Skipping the interrupt controller %s.\n", + __func__, of_node_name); + return NULL; + } + } + + /* allocate a vio_dev for this node */ + viodev = kzalloc(sizeof(struct vio_dev), GFP_KERNEL); + if (viodev == NULL) { + pr_warn("%s: allocation failure for VIO device.\n", __func__); + return NULL; + } + + /* we need the 'device_type' property, in order to match with drivers */ + viodev->family = family; + if (viodev->family == VDEVICE) { + unsigned int unit_address; + + if (of_node->type != NULL) + viodev->type = 
of_node->type; + else { + pr_warn("%s: node %s is missing the 'device_type' " + "property.\n", __func__, of_node_name); + goto out; + } + + prop = of_get_property(of_node, "reg", NULL); + if (prop == NULL) { + pr_warn("%s: node %s missing 'reg'\n", + __func__, of_node_name); + goto out; + } + unit_address = of_read_number(prop, 1); + dev_set_name(&viodev->dev, "%x", unit_address); + viodev->irq = irq_of_parse_and_map(of_node, 0); + viodev->unit_address = unit_address; + } else { + /* PFO devices need their resource_id for submitting COP_OPs + * This is an optional field for devices, but is required when + * performing synchronous ops */ + prop = of_get_property(of_node, "ibm,resource-id", NULL); + if (prop != NULL) + viodev->resource_id = of_read_number(prop, 1); + + dev_set_name(&viodev->dev, "%s", of_node_name); + viodev->type = of_node_name; + viodev->irq = 0; + } + + viodev->name = of_node->name; + viodev->dev.of_node = of_node_get(of_node); + + set_dev_node(&viodev->dev, of_node_to_nid(of_node)); + + /* init generic 'struct device' fields: */ + viodev->dev.parent = &vio_bus_device.dev; + viodev->dev.bus = &vio_bus_type; + viodev->dev.release = vio_dev_release; + + if (of_get_property(viodev->dev.of_node, "ibm,my-dma-window", NULL)) { + if (firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_set_dma_ops(viodev); + else + set_dma_ops(&viodev->dev, &dma_iommu_ops); + + set_iommu_table_base(&viodev->dev, + vio_build_iommu_table(viodev)); + + /* needed to ensure proper operation of coherent allocations + * later, in case driver doesn't set it explicitly */ + viodev->dev.coherent_dma_mask = DMA_BIT_MASK(64); + viodev->dev.dma_mask = &viodev->dev.coherent_dma_mask; + } + + /* register with generic device framework */ + if (device_register(&viodev->dev)) { + printk(KERN_ERR "%s: failed to register device %s\n", + __func__, dev_name(&viodev->dev)); + put_device(&viodev->dev); + return NULL; } + + return viodev; + +out: /* Use this exit point for any return prior to 
device_register */ + kfree(viodev); + return NULL; } +EXPORT_SYMBOL(vio_register_device_node); + +/* + * vio_bus_scan_for_devices - Scan OF and register each child device + * @root_name - OF node name for the root of the subtree to search. + * This must be non-NULL + * + * Starting from the root node provide, register the device node for + * each child beneath the root. + */ +static void vio_bus_scan_register_devices(char *root_name) +{ + struct device_node *node_root, *node_child; + + if (!root_name) + return; + + node_root = of_find_node_by_name(NULL, root_name); + if (node_root) { + + /* + * Create struct vio_devices for each virtual device in + * the device tree. Drivers will associate with them later. + */ + node_child = of_get_next_child(node_root, NULL); + while (node_child) { + vio_register_device_node(node_child); + node_child = of_get_next_child(node_root, node_child); + } + of_node_put(node_root); + } +} /** * vio_bus_init: - Initialize the virtual IO bus */ -int __init vio_bus_init(struct vio_bus_ops *ops) +static int __init vio_bus_init(void) { int err; - vio_bus_ops = *ops; + if (firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_sysfs_init(); err = bus_register(&vio_bus_type); if (err) { @@ -151,121 +1507,191 @@ int __init vio_bus_init(struct vio_bus_ops *ops) err = device_register(&vio_bus_device.dev); if (err) { printk(KERN_WARNING "%s: device_register returned %i\n", - __FUNCTION__, err); + __func__, err); return err; } + if (firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_bus_init(); + return 0; } +postcore_initcall(vio_bus_init); -/* vio_dev refcount hit 0 */ -static void __devinit vio_dev_release(struct device *dev) +static int __init vio_device_init(void) { - if (vio_bus_ops.release_device) - vio_bus_ops.release_device(dev); - kfree(to_vio_dev(dev)); + vio_bus_scan_register_devices("vdevice"); + vio_bus_scan_register_devices("ibm,platform-facilities"); + + return 0; } +device_initcall(vio_device_init); -static ssize_t viodev_show_name(struct 
device *dev, +static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_vio_dev(dev)->name); } -DEVICE_ATTR(name, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_name, NULL); -struct vio_dev * __devinit vio_register_device(struct vio_dev *viodev) +static ssize_t devspec_show(struct device *dev, + struct device_attribute *attr, char *buf) { - /* init generic 'struct device' fields: */ - viodev->dev.parent = &vio_bus_device.dev; - viodev->dev.bus = &vio_bus_type; - viodev->dev.release = vio_dev_release; + struct device_node *of_node = dev->of_node; - /* register with generic device framework */ - if (device_register(&viodev->dev)) { - printk(KERN_ERR "%s: failed to register device %s\n", - __FUNCTION__, viodev->dev.bus_id); - return NULL; + return sprintf(buf, "%s\n", of_node_full_name(of_node)); +} + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + const struct vio_dev *vio_dev = to_vio_dev(dev); + struct device_node *dn; + const char *cp; + + dn = dev->of_node; + if (!dn) { + strcpy(buf, "\n"); + return strlen(buf); + } + cp = of_get_property(dn, "compatible", NULL); + if (!cp) { + strcpy(buf, "\n"); + return strlen(buf); } - device_create_file(&viodev->dev, &dev_attr_name); - return viodev; + return sprintf(buf, "vio:T%sS%s\n", vio_dev->type, cp); } -void __devinit vio_unregister_device(struct vio_dev *viodev) +static struct device_attribute vio_dev_attrs[] = { + __ATTR_RO(name), + __ATTR_RO(devspec), + __ATTR_RO(modalias), + __ATTR_NULL +}; + +void vio_unregister_device(struct vio_dev *viodev) { - if (vio_bus_ops.unregister_device) - vio_bus_ops.unregister_device(viodev); - device_remove_file(&viodev->dev, &dev_attr_name); device_unregister(&viodev->dev); } EXPORT_SYMBOL(vio_unregister_device); -static dma_addr_t vio_map_single(struct device *dev, void *vaddr, - size_t size, enum dma_data_direction direction) +static int vio_bus_match(struct device *dev, 
struct device_driver *drv) { - return iommu_map_single(to_vio_dev(dev)->iommu_table, vaddr, size, - direction); -} + const struct vio_dev *vio_dev = to_vio_dev(dev); + struct vio_driver *vio_drv = to_vio_driver(drv); + const struct vio_device_id *ids = vio_drv->id_table; -static void vio_unmap_single(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction direction) -{ - iommu_unmap_single(to_vio_dev(dev)->iommu_table, dma_handle, size, - direction); + return (ids != NULL) && (vio_match_device(ids, vio_dev) != NULL); } -static int vio_map_sg(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction direction) +static int vio_hotplug(struct device *dev, struct kobj_uevent_env *env) { - return iommu_map_sg(dev, to_vio_dev(dev)->iommu_table, sglist, - nelems, direction); + const struct vio_dev *vio_dev = to_vio_dev(dev); + struct device_node *dn; + const char *cp; + + dn = dev->of_node; + if (!dn) + return -ENODEV; + cp = of_get_property(dn, "compatible", NULL); + if (!cp) + return -ENODEV; + + add_uevent_var(env, "MODALIAS=vio:T%sS%s", vio_dev->type, cp); + return 0; } -static void vio_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction direction) +struct bus_type vio_bus_type = { + .name = "vio", + .dev_attrs = vio_dev_attrs, + .uevent = vio_hotplug, + .match = vio_bus_match, + .probe = vio_bus_probe, + .remove = vio_bus_remove, +}; + +/** + * vio_get_attribute: - get attribute for virtual device + * @vdev: The vio device to get property. + * @which: The property/attribute to be extracted. + * @length: Pointer to length of returned data size (unused if NULL). 
+ * + * Calls prom.c's of_get_property() to return the value of the + * attribute specified by @which +*/ +const void *vio_get_attribute(struct vio_dev *vdev, char *which, int *length) { - iommu_unmap_sg(to_vio_dev(dev)->iommu_table, sglist, nelems, direction); + return of_get_property(vdev->dev.of_node, which, length); } +EXPORT_SYMBOL(vio_get_attribute); -static void *vio_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag) +#ifdef CONFIG_PPC_PSERIES +/* vio_find_name() - internal because only vio.c knows how we formatted the + * kobject name + */ +static struct vio_dev *vio_find_name(const char *name) { - return iommu_alloc_coherent(to_vio_dev(dev)->iommu_table, size, - dma_handle, flag); + struct device *found; + + found = bus_find_device_by_name(&vio_bus_type, NULL, name); + if (!found) + return NULL; + + return to_vio_dev(found); } -static void vio_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) +/** + * vio_find_node - find an already-registered vio_dev + * @vnode: device_node of the virtual device we're looking for + */ +struct vio_dev *vio_find_node(struct device_node *vnode) { - iommu_free_coherent(to_vio_dev(dev)->iommu_table, size, vaddr, - dma_handle); + char kobj_name[20]; + struct device_node *vnode_parent; + const char *dev_type; + + vnode_parent = of_get_parent(vnode); + if (!vnode_parent) + return NULL; + + dev_type = of_get_property(vnode_parent, "device_type", NULL); + of_node_put(vnode_parent); + if (!dev_type) + return NULL; + + /* construct the kobject name from the device node */ + if (!strcmp(dev_type, "vdevice")) { + const __be32 *prop; + + prop = of_get_property(vnode, "reg", NULL); + if (!prop) + return NULL; + snprintf(kobj_name, sizeof(kobj_name), "%x", + (uint32_t)of_read_number(prop, 1)); + } else if (!strcmp(dev_type, "ibm,platform-facilities")) + snprintf(kobj_name, sizeof(kobj_name), "%s", vnode->name); + else + return NULL; + + return vio_find_name(kobj_name); 
} +EXPORT_SYMBOL(vio_find_node); -static int vio_dma_supported(struct device *dev, u64 mask) +int vio_enable_interrupts(struct vio_dev *dev) { - return 1; + int rc = h_vio_signal(dev->unit_address, VIO_IRQ_ENABLE); + if (rc != H_SUCCESS) + printk(KERN_ERR "vio: Error 0x%x enabling interrupts\n", rc); + return rc; } +EXPORT_SYMBOL(vio_enable_interrupts); -struct dma_mapping_ops vio_dma_ops = { - .alloc_coherent = vio_alloc_coherent, - .free_coherent = vio_free_coherent, - .map_single = vio_map_single, - .unmap_single = vio_unmap_single, - .map_sg = vio_map_sg, - .unmap_sg = vio_unmap_sg, - .dma_supported = vio_dma_supported, -}; - -static int vio_bus_match(struct device *dev, struct device_driver *drv) +int vio_disable_interrupts(struct vio_dev *dev) { - const struct vio_dev *vio_dev = to_vio_dev(dev); - struct vio_driver *vio_drv = to_vio_driver(drv); - const struct vio_device_id *ids = vio_drv->id_table; - - return (ids != NULL) && (vio_match_device(ids, vio_dev) != NULL); + int rc = h_vio_signal(dev->unit_address, VIO_IRQ_DISABLE); + if (rc != H_SUCCESS) + printk(KERN_ERR "vio: Error 0x%x disabling interrupts\n", rc); + return rc; } - -struct bus_type vio_bus_type = { - .name = "vio", - .match = vio_bus_match, -}; +EXPORT_SYMBOL(vio_disable_interrupts); +#endif /* CONFIG_PPC_PSERIES */ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index d4dfcfbce27..f096e72262f 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -1,10 +1,33 @@ -#include <linux/config.h> #ifdef CONFIG_PPC64 -#include <asm/page.h> +#define PROVIDE32(x) PROVIDE(__unused__##x) #else -#define PAGE_SIZE 4096 +#define PROVIDE32(x) PROVIDE(x) #endif +#include <asm/page.h> #include <asm-generic/vmlinux.lds.h> +#include <asm/cache.h> +#include <asm/thread_info.h> + +ENTRY(_stext) + +PHDRS { + kernel PT_LOAD FLAGS(7); /* RWX */ + notes PT_NOTE FLAGS(0); + dummy PT_NOTE FLAGS(0); + + /* binutils < 2.18 has a bug that makes it 
misbehave when taking an + ELF file with all segments at load address 0 as input. This + happens when running "strip" on vmlinux, because of the AT() magic + in this linker script. People using GCC >= 4.2 won't run into + this problem, because the "build-id" support will put some data + into the "notes" segment (at a non-zero load address). + + To work around this, we force some data into both the "dummy" + segment and the kernel segment, so the dummy segment will get a + non-zero load address. It's not enough to always create the + "notes" segment, since if nothing gets assigned to it, its load + address will be zero. */ +} #ifdef CONFIG_PPC64 OUTPUT_ARCH(powerpc:common64) @@ -15,265 +38,227 @@ jiffies = jiffies_64 + 4; #endif SECTIONS { - /* Sections to be discarded. */ - /DISCARD/ : { - *(.exitcall.exit) - *(.exit.data) - } - + . = KERNELBASE; + +/* + * Text, read only data and other permanent read-only sections + */ + + /* Text and gots */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { + ALIGN_FUNCTION(); + HEAD_TEXT + _text = .; + /* careful! __ftr_alt_* sections need to be close to .text */ + *(.text .fixup __ftr_alt_* .ref.text) + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT - /* Read-only sections, merged into text segment: */ -#ifdef CONFIG_PPC32 - . 
= + SIZEOF_HEADERS; - .interp : { *(.interp) } - .hash : { *(.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .rel.text : { *(.rel.text) } - .rela.text : { *(.rela.text) } - .rel.data : { *(.rel.data) } - .rela.data : { *(.rela.data) } - .rel.rodata : { *(.rel.rodata) } - .rela.rodata : { *(.rela.rodata) } - .rel.got : { *(.rel.got) } - .rela.got : { *(.rela.got) } - .rel.ctors : { *(.rel.ctors) } - .rela.ctors : { *(.rela.ctors) } - .rel.dtors : { *(.rel.dtors) } - .rela.dtors : { *(.rela.dtors) } - .rel.bss : { *(.rel.bss) } - .rela.bss : { *(.rela.bss) } - .rel.plt : { *(.rel.plt) } - .rela.plt : { *(.rela.plt) } -/* .init : { *(.init) } =0*/ - .plt : { *(.plt) } -#endif - .text : { - *(.text .text.*) - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - *(.fixup) -#ifdef CONFIG_PPC32 - *(.got1) - __got2_start = .; - *(.got2) - __got2_end = .; -#else - . = ALIGN(PAGE_SIZE); - _etext = .; -#endif - } #ifdef CONFIG_PPC32 - _etext = .; - PROVIDE (etext = .); + *(.got1) + __got2_start = .; + *(.got2) + __got2_end = .; +#endif /* CONFIG_PPC32 */ + + } :kernel + + . = ALIGN(PAGE_SIZE); + _etext = .; + PROVIDE32 (etext = .); + + /* Read-only data */ + RODATA + + EXCEPTION_TABLE(0) + + NOTES :kernel :notes + + /* The dummy segment contents for the bug workaround mentioned above + near PHDRS. */ + .dummy : AT(ADDR(.dummy) - LOAD_OFFSET) { + LONG(0) + LONG(0) + LONG(0) + } :kernel :dummy + +/* + * Init sections discarded at runtime + */ + . 
= ALIGN(PAGE_SIZE); + __init_begin = .; + INIT_TEXT_SECTION(PAGE_SIZE) :kernel + + /* .exit.text is discarded at runtime, not link time, + * to deal with references from __bug_table + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } - RODATA - .fini : { *(.fini) } =0 - .ctors : { *(.ctors) } - .dtors : { *(.dtors) } + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + INIT_DATA + __vtop_table_begin = .; + *(.vtop_fixup); + __vtop_table_end = .; + __ptov_table_begin = .; + *(.ptov_fixup); + __ptov_table_end = .; + } - .fixup : { *(.fixup) } -#endif + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + INIT_SETUP(16) + } - __ex_table : { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + INIT_CALLS } - __bug_table : { - __start___bug_table = .; - *(__bug_table) - __stop___bug_table = .; + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + CON_INITCALL } -#ifdef CONFIG_PPC64 - __ftr_fixup : { + SECURITY_INIT + + . = ALIGN(8); + __ftr_fixup : AT(ADDR(__ftr_fixup) - LOAD_OFFSET) { __start___ftr_fixup = .; *(__ftr_fixup) __stop___ftr_fixup = .; } - - RODATA + . = ALIGN(8); + __mmu_ftr_fixup : AT(ADDR(__mmu_ftr_fixup) - LOAD_OFFSET) { + __start___mmu_ftr_fixup = .; + *(__mmu_ftr_fixup) + __stop___mmu_ftr_fixup = .; + } + . = ALIGN(8); + __lwsync_fixup : AT(ADDR(__lwsync_fixup) - LOAD_OFFSET) { + __start___lwsync_fixup = .; + *(__lwsync_fixup) + __stop___lwsync_fixup = .; + } +#ifdef CONFIG_PPC64 + . = ALIGN(8); + __fw_ftr_fixup : AT(ADDR(__fw_ftr_fixup) - LOAD_OFFSET) { + __start___fw_ftr_fixup = .; + *(__fw_ftr_fixup) + __stop___fw_ftr_fixup = .; + } #endif + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + INIT_RAM_FS + } -#ifdef CONFIG_PPC32 - /* Read-write section, merged into data segment: */ - . 
= ALIGN(PAGE_SIZE); - _sdata = .; - .data : - { - *(.data) - *(.data1) - *(.sdata) - *(.sdata2) - *(.got.plt) *(.got) - *(.dynamic) - CONSTRUCTORS - } - - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - .data_nosave : { *(.data.nosave) } - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - - . = ALIGN(32); - .data.cacheline_aligned : { *(.data.cacheline_aligned) } - - _edata = .; - PROVIDE (edata = .); - - . = ALIGN(8192); - .data.init_task : { *(.data.init_task) } -#endif + PERCPU_SECTION(L1_CACHE_BYTES) - /* will be freed after init */ - . = ALIGN(PAGE_SIZE); - __init_begin = .; - .init.text : { - _sinittext = .; - *(.init.text) - _einittext = .; - } -#ifdef CONFIG_PPC32 - /* .exit.text is discarded at runtime, not link time, - to deal with references from __bug_table */ - .exit.text : { *(.exit.text) } + . = ALIGN(8); + .machine.desc : AT(ADDR(.machine.desc) - LOAD_OFFSET) { + __machine_desc_start = . ; + *(.machine.desc) + __machine_desc_end = . ; + } +#ifdef CONFIG_RELOCATABLE + . = ALIGN(8); + .dynsym : AT(ADDR(.dynsym) - LOAD_OFFSET) + { +#ifdef CONFIG_RELOCATABLE_PPC32 + __dynamic_symtab = .; #endif - .init.data : { - *(.init.data); - __vtop_table_begin = .; - *(.vtop_fixup); - __vtop_table_end = .; - __ptov_table_begin = .; - *(.ptov_fixup); - __ptov_table_end = .; - } - - . 
= ALIGN(16); - .init.setup : { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - - .initcall.init : { - __initcall_start = .; - *(.initcall1.init) - *(.initcall2.init) - *(.initcall3.init) - *(.initcall4.init) - *(.initcall5.init) - *(.initcall6.init) - *(.initcall7.init) - __initcall_end = .; - } - - .con_initcall.init : { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - - SECURITY_INIT + *(.dynsym) + } + .dynstr : AT(ADDR(.dynstr) - LOAD_OFFSET) { *(.dynstr) } + .dynamic : AT(ADDR(.dynamic) - LOAD_OFFSET) + { + __dynamic_start = .; + *(.dynamic) + } + .hash : AT(ADDR(.hash) - LOAD_OFFSET) { *(.hash) } + .interp : AT(ADDR(.interp) - LOAD_OFFSET) { *(.interp) } + .rela.dyn : AT(ADDR(.rela.dyn) - LOAD_OFFSET) + { + __rela_dyn_start = .; + *(.rela*) + } +#endif + + /* freed after init ends here */ + . = ALIGN(PAGE_SIZE); + __init_end = .; + +/* + * And now the various read/write data + */ + + . = ALIGN(PAGE_SIZE); + _sdata = .; #ifdef CONFIG_PPC32 - __start___ftr_fixup = .; - __ftr_fixup : { *(__ftr_fixup) } - __stop___ftr_fixup = .; + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + *(.sdata) + *(.got.plt) *(.got) + } #else - . = ALIGN(PAGE_SIZE); - .init.ramfs : { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + *(.data.rel*) + *(.toc1) + *(.branch_lt) + } -#ifdef CONFIG_PPC32 - . = ALIGN(32); + .opd : AT(ADDR(.opd) - LOAD_OFFSET) { + *(.opd) + } + + .got : AT(ADDR(.got) - LOAD_OFFSET) { + __toc_start = .; +#ifndef CONFIG_RELOCATABLE + __prom_init_toc_start = .; + arch/powerpc/kernel/prom_init.o*(.toc .got) + __prom_init_toc_end = .; +#endif + *(.got) + *(.toc) + } #endif - .data.percpu : { - __per_cpu_start = .; - *(.data.percpu) - __per_cpu_end = .; - } - . = ALIGN(PAGE_SIZE); -#ifdef CONFIG_PPC64 - . = ALIGN(16384); - __init_end = .; - /* freed after init ends here */ - - /* Read/write sections */ - . = ALIGN(PAGE_SIZE); - . 
= ALIGN(16384); - _sdata = .; - /* The initial task and kernel stack */ - .data.init_task : { - *(.data.init_task) - } - - . = ALIGN(PAGE_SIZE); - .data.page_aligned : { - *(.data.page_aligned) - } - - .data.cacheline_aligned : { - *(.data.cacheline_aligned) - } - - .data : { - *(.data .data.rel* .toc1) - *(.branch_lt) - } - - .opd : { - *(.opd) - } - - .got : { - __toc_start = .; - *(.got) - *(.toc) - . = ALIGN(PAGE_SIZE); - _edata = .; - } - - . = ALIGN(PAGE_SIZE); -#else - __initramfs_start = .; - .init.ramfs : { - *(.init.ramfs) - } - __initramfs_end = .; + /* The initial task and kernel stack */ + INIT_TASK_DATA_SECTION(THREAD_SIZE) - . = ALIGN(4096); - __init_end = .; + .data..page_aligned : AT(ADDR(.data..page_aligned) - LOAD_OFFSET) { + PAGE_ALIGNED_DATA(PAGE_SIZE) + } - . = ALIGN(4096); - _sextratext = .; - _eextratext = .; + .data..cacheline_aligned : AT(ADDR(.data..cacheline_aligned) - LOAD_OFFSET) { + CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES) + } - __bss_start = .; -#endif + .data..read_mostly : AT(ADDR(.data..read_mostly) - LOAD_OFFSET) { + READ_MOSTLY_DATA(L1_CACHE_BYTES) + } - .bss : { - __bss_start = .; - *(.sbss) *(.scommon) - *(.dynbss) - *(.bss) - *(COMMON) - __bss_stop = .; - } + . = ALIGN(PAGE_SIZE); + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + NOSAVE_DATA + } -#ifdef CONFIG_PPC64 - . = ALIGN(PAGE_SIZE); -#endif - _end = . ; -#ifdef CONFIG_PPC32 - PROVIDE (end = .); -#endif + . = ALIGN(PAGE_SIZE); + _edata = .; + PROVIDE32 (edata = .); + +/* + * And finally the bss + */ + + BSS_SECTION(0, 0, 0) + + . = ALIGN(PAGE_SIZE); + _end = . ; + PROVIDE32 (end = .); + + /* Sections to be discarded. */ + DISCARDS } |
