diff options
Diffstat (limited to 'arch/powerpc/kernel')
125 files changed, 13952 insertions, 6304 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 8f619342f14..670c312d914 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -2,12 +2,13 @@ # Makefile for the linux kernel. # +CFLAGS_prom.o = -I$(src)/../../../scripts/dtc/libfdt CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror ifeq ($(CONFIG_PPC64),y) -CFLAGS_prom_init.o += -mno-minimal-toc +CFLAGS_prom_init.o += $(NO_MINIMAL_TOC) endif ifeq ($(CONFIG_PPC32),y) CFLAGS_prom_init.o += -fPIC @@ -39,15 +40,14 @@ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o +obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o obj64-$(CONFIG_RELOCATABLE) += reloc_64.o obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o -obj-$(CONFIG_PPC_A2) += cpu_setup_a2.o obj-$(CONFIG_PPC64) += vdso64/ obj-$(CONFIG_ALTIVEC) += vecemu.o obj-$(CONFIG_PPC_970_NAP) += idle_power4.o obj-$(CONFIG_PPC_P7_NAP) += idle_power7.o obj-$(CONFIG_PPC_OF) += of_platform.o prom_parse.o -obj-$(CONFIG_PPC_CLOCK) += clock.o procfs-y := proc_powerpc.o obj-$(CONFIG_PROC_FS) += $(procfs-y) rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI) := rtas_pci.o @@ -55,9 +55,10 @@ obj-$(CONFIG_PPC_RTAS) += rtas.o rtas-rtc.o $(rtaspci-y-y) obj-$(CONFIG_PPC_RTAS_DAEMON) += rtasd.o obj-$(CONFIG_RTAS_FLASH) += rtas_flash.o obj-$(CONFIG_RTAS_PROC) += rtas-proc.o -obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o +obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \ + eeh_driver.o eeh_event.o eeh_sysfs.o obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_FA_DUMP) += fadump.o @@ -75,8 +76,8 @@ endif obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o obj-$(CONFIG_MODULES) += module.o module_$(CONFIG_WORD_SIZE).o obj-$(CONFIG_44x) += cpu_setup_44x.o -obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o dbell.o -obj-$(CONFIG_PPC_BOOK3E_64) += dbell.o +obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o +obj-$(CONFIG_PPC_DOORBELL) += dbell.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o extra-y := head_$(CONFIG_WORD_SIZE).o @@ -91,7 +92,6 @@ obj-$(CONFIG_RELOCATABLE_PPC32) += reloc_32.o obj-$(CONFIG_PPC32) += entry_32.o setup_32.o obj-$(CONFIG_PPC64) += dma-iommu.o iommu.o obj-$(CONFIG_KGDB) += kgdb.o -obj-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init.o obj-$(CONFIG_MODULES) += ppc_ksyms.o obj-$(CONFIG_BOOTX_TEXT) += btext.o obj-$(CONFIG_SMP) += smp.o @@ -101,7 +101,7 @@ obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o -pci64-$(CONFIG_PPC64) += pci_dn.o isa-bridge.o +pci64-$(CONFIG_PPC64) += pci_dn.o pci-hotplug.o isa-bridge.o obj-$(CONFIG_PCI) += pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \ pci-common.o pci_of_scan.o obj-$(CONFIG_PCI_MSI) += msi.o @@ -116,12 +116,12 @@ obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o -obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o - -ifneq ($(CONFIG_PPC_INDIRECT_IO),y) +ifneq ($(CONFIG_PPC_INDIRECT_PIO),y) obj-y += iomap.o endif +obj64-$(CONFIG_PPC_TRANSACTIONAL_MEM) += tm.o + obj-$(CONFIG_PPC64) += $(obj64-y) obj-$(CONFIG_PPC32) += $(obj32-y) @@ -142,6 +142,7 @@ GCOV_PROFILE_kprobes.o := n extra-$(CONFIG_PPC_FPU) += fpu.o extra-$(CONFIG_ALTIVEC) += vector.o extra-$(CONFIG_PPC64) += entry_64.o +extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init.o extra-y += systbl_chk.i $(obj)/systbl.o: systbl_chk diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index ee5b690a0be..34f55524d45 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -25,14 +25,13 @@ #include <asm/cputable.h> #include <asm/emulated_ops.h> #include <asm/switch_to.h> +#include <asm/disassemble.h> struct aligninfo { unsigned char len; unsigned char flags; }; -#define IS_XFORM(inst) (((inst) >> 26) == 31) -#define IS_DSFORM(inst) (((inst) >> 26) >= 56) #define INVALID { 0, 0 } @@ -54,8 +53,6 @@ struct aligninfo { /* DSISR bits reported for a DCBZ instruction: */ #define DCBZ 0x5f /* 8xx/82xx dcbz faults when cache not enabled */ -#define SWAP(a, b) (t = (a), (a) = (b), (b) = t) - /* * The PowerPC stores certain bits of the instruction that caused the * alignment exception in the DSISR register. This array maps those @@ -75,7 +72,7 @@ static struct aligninfo aligninfo[128] = { { 8, LD+F }, /* 00 0 1001: lfd */ { 4, ST+F+S }, /* 00 0 1010: stfs */ { 8, ST+F }, /* 00 0 1011: stfd */ - INVALID, /* 00 0 1100 */ + { 16, LD }, /* 00 0 1100: lq */ { 8, LD }, /* 00 0 1101: ld/ldu/lwa */ INVALID, /* 00 0 1110 */ { 8, ST }, /* 00 0 1111: std/stdu */ @@ -142,7 +139,7 @@ static struct aligninfo aligninfo[128] = { { 2, LD+SW }, /* 10 0 1100: lhbrx */ { 4, LD+SE }, /* 10 0 1101 lwa */ { 2, ST+SW }, /* 10 0 1110: sthbrx */ - INVALID, /* 10 0 1111 */ + { 16, ST }, /* 10 0 1111: stq */ INVALID, /* 10 1 0000 */ INVALID, /* 10 1 0001 */ INVALID, /* 10 1 0010 */ @@ -194,37 +191,6 @@ static struct aligninfo aligninfo[128] = { }; /* - * Create a DSISR value from the instruction - */ -static inline unsigned make_dsisr(unsigned instr) -{ - unsigned dsisr; - - - /* bits 6:15 --> 22:31 */ - dsisr = (instr & 0x03ff0000) >> 16; - - if (IS_XFORM(instr)) { - /* bits 29:30 --> 15:16 */ - dsisr |= (instr & 0x00000006) << 14; - /* bit 25 --> 17 */ - dsisr |= (instr & 0x00000040) << 8; - /* bits 21:24 --> 18:21 */ - dsisr |= (instr & 0x00000780) << 3; - } else { - /* bit 5 --> 17 */ - dsisr |= (instr & 0x04000000) >> 12; - /* bits 1: 4 --> 18:21 */ - dsisr |= (instr & 0x78000000) >> 17; - /* bits 30:31 --> 12:13 */ - if (IS_DSFORM(instr)) - dsisr |= (instr & 0x00000003) << 18; - } - - return dsisr; -} - -/* * The dcbz (data cache block zero) instruction * gives an alignment fault if used on non-cacheable * memory. We handle the fault mainly for the @@ -256,11 +222,17 @@ static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr) * bottom 4 bytes of each register, and the loads clear the * top 4 bytes of the affected register. */ +#ifdef __BIG_ENDIAN__ #ifdef CONFIG_PPC64 #define REG_BYTE(rp, i) *((u8 *)((rp) + ((i) >> 2)) + ((i) & 3) + 4) #else #define REG_BYTE(rp, i) *((u8 *)(rp) + (i)) #endif +#endif + +#ifdef __LITTLE_ENDIAN__ +#define REG_BYTE(rp, i) (*(((u8 *)((rp) + ((i)>>2)) + ((i)&3)))) +#endif #define SWIZ_PTR(p) ((unsigned char __user *)((p) ^ swiz)) @@ -305,6 +277,15 @@ static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, nb0 = nb + reg * 4 - 128; nb = 128 - reg * 4; } +#ifdef __LITTLE_ENDIAN__ + /* + * String instructions are endian neutral but the code + * below is not. Force byte swapping on so that the + * effects of swizzling are undone in the load/store + * loops below. + */ + flags ^= SW; +#endif } else { /* lwm, stmw */ nb = (32 - reg) * 4; @@ -372,8 +353,6 @@ static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg, char *ptr1 = (char *) ¤t->thread.TS_FPR(reg+1); int i, ret, sw = 0; - if (!(flags & F)) - return 0; if (reg & 1) return 0; /* invalid form: FRS/FRT must be even */ if (flags & SW) @@ -393,6 +372,34 @@ static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg, return 1; /* exception handled and fixed up */ } +#ifdef CONFIG_PPC64 +static int emulate_lq_stq(struct pt_regs *regs, unsigned char __user *addr, + unsigned int reg, unsigned int flags) +{ + char *ptr0 = (char *)®s->gpr[reg]; + char *ptr1 = (char *)®s->gpr[reg+1]; + int i, ret, sw = 0; + + if (reg & 1) + return 0; /* invalid form: GPR must be even */ + if (flags & SW) + sw = 7; + ret = 0; + for (i = 0; i < 8; ++i) { + if (!(flags & ST)) { + ret |= __get_user(ptr0[i^sw], addr + i); + ret |= __get_user(ptr1[i^sw], addr + i + 8); + } else { + ret |= __put_user(ptr0[i^sw], addr + i); + ret |= __put_user(ptr1[i^sw], addr + i + 8); + } + } + if (ret) + return -EFAULT; + return 1; /* exception handled and fixed up */ +} +#endif /* CONFIG_PPC64 */ + #ifdef CONFIG_SPE static struct aligninfo spe_aligninfo[32] = { @@ -458,7 +465,7 @@ static struct aligninfo spe_aligninfo[32] = { static int emulate_spe(struct pt_regs *regs, unsigned int reg, unsigned int instr) { - int t, ret; + int ret; union { u64 ll; u32 w[2]; @@ -581,24 +588,18 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg, if (flags & SW) { switch (flags & 0xf0) { case E8: - SWAP(data.v[0], data.v[7]); - SWAP(data.v[1], data.v[6]); - SWAP(data.v[2], data.v[5]); - SWAP(data.v[3], data.v[4]); + data.ll = swab64(data.ll); break; case E4: - - SWAP(data.v[0], data.v[3]); - SWAP(data.v[1], data.v[2]); - SWAP(data.v[4], data.v[7]); - SWAP(data.v[5], data.v[6]); + data.w[0] = swab32(data.w[0]); + data.w[1] = swab32(data.w[1]); break; /* Its half word endian */ default: - SWAP(data.v[0], data.v[1]); - SWAP(data.v[2], data.v[3]); - SWAP(data.v[4], data.v[5]); - SWAP(data.v[6], data.v[7]); + data.h[0] = swab16(data.h[0]); + data.h[1] = swab16(data.h[1]); + data.h[2] = swab16(data.h[2]); + data.h[3] = swab16(data.h[3]); break; } } @@ -651,17 +652,38 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg, int sw = 0; int i, j; + /* userland only */ + if (unlikely(!user_mode(regs))) + return 0; + flush_vsx_to_thread(current); if (reg < 32) - ptr = (char *) ¤t->thread.TS_FPR(reg); + ptr = (char *) ¤t->thread.fp_state.fpr[reg][0]; else - ptr = (char *) ¤t->thread.vr[reg - 32]; + ptr = (char *) ¤t->thread.vr_state.vr[reg - 32]; lptr = (unsigned long *) ptr; +#ifdef __LITTLE_ENDIAN__ + if (flags & SW) { + elsize = length; + sw = length-1; + } else { + /* + * The elements are BE ordered, even in LE mode, so process + * them in reverse order. + */ + addr += length - elsize; + + /* 8 byte memory accesses go in the top 8 bytes of the VR */ + if (length == 8) + ptr += 8; + } +#else if (flags & SW) sw = elsize-1; +#endif for (j = 0; j < length; j += elsize) { for (i = 0; i < elsize; ++i) { @@ -671,19 +693,31 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg, ret |= __get_user(ptr[i^sw], addr + i); } ptr += elsize; +#ifdef __LITTLE_ENDIAN__ + addr -= elsize; +#else addr += elsize; +#endif } +#ifdef __BIG_ENDIAN__ +#define VSX_HI 0 +#define VSX_LO 1 +#else +#define VSX_HI 1 +#define VSX_LO 0 +#endif + if (!ret) { if (flags & U) regs->gpr[areg] = regs->dar; /* Splat load copies the same data to top and bottom 8 bytes */ if (flags & SPLT) - lptr[1] = lptr[0]; - /* For 8 byte loads, zero the top 8 bytes */ + lptr[VSX_LO] = lptr[VSX_HI]; + /* For 8 byte loads, zero the low 8 bytes */ else if (!(flags & ST) && (8 == length)) - lptr[1] = 0; + lptr[VSX_LO] = 0; } else return -EFAULT; @@ -706,18 +740,28 @@ int fix_alignment(struct pt_regs *regs) unsigned int dsisr; unsigned char __user *addr; unsigned long p, swiz; - int ret, t; - union { + int ret, i; + union data { u64 ll; double dd; unsigned char v[8]; struct { +#ifdef __LITTLE_ENDIAN__ + int low32; + unsigned hi32; +#else unsigned hi32; int low32; +#endif } x32; struct { +#ifdef __LITTLE_ENDIAN__ + short low16; + unsigned char hi48[6]; +#else unsigned char hi48[6]; short low16; +#endif } x16; } data; @@ -764,10 +808,21 @@ int fix_alignment(struct pt_regs *regs) nb = aligninfo[instr].len; flags = aligninfo[instr].flags; + /* ldbrx/stdbrx overlap lfs/stfs in the DSISR unfortunately */ + if (IS_XFORM(instruction) && ((instruction >> 1) & 0x3ff) == 532) { + nb = 8; + flags = LD+SW; + } else if (IS_XFORM(instruction) && + ((instruction >> 1) & 0x3ff) == 660) { + nb = 8; + flags = ST+SW; + } + /* Byteswap little endian loads and stores */ swiz = 0; - if (regs->msr & MSR_LE) { + if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) { flags ^= SW; +#ifdef __BIG_ENDIAN__ /* * So-called "PowerPC little endian" mode works by * swizzling addresses rather than by actually doing @@ -780,6 +835,7 @@ int fix_alignment(struct pt_regs *regs) */ if (cpu_has_feature(CPU_FTR_PPC_LE)) swiz = 7; +#endif } /* DAR has the operand effective address */ @@ -804,7 +860,7 @@ int fix_alignment(struct pt_regs *regs) elsize = 8; flags = 0; - if (regs->msr & MSR_LE) + if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) flags |= SW; if (instruction & 0x100) flags |= ST; @@ -852,10 +908,20 @@ int fix_alignment(struct pt_regs *regs) flush_fp_to_thread(current); } - /* Special case for 16-byte FP loads and stores */ - if (nb == 16) { - PPC_WARN_ALIGNMENT(fp_pair, regs); - return emulate_fp_pair(addr, reg, flags); + if ((nb == 16)) { + if (flags & F) { + /* Special case for 16-byte FP loads and stores */ + PPC_WARN_ALIGNMENT(fp_pair, regs); + return emulate_fp_pair(addr, reg, flags); + } else { +#ifdef CONFIG_PPC64 + /* Special case for 16-byte loads and stores */ + PPC_WARN_ALIGNMENT(lq_stq, regs); + return emulate_lq_stq(regs, addr, reg, flags); +#else + return 0; +#endif + } } PPC_WARN_ALIGNMENT(unaligned, regs); @@ -864,32 +930,36 @@ int fix_alignment(struct pt_regs *regs) * get it from register values */ if (!(flags & ST)) { - data.ll = 0; - ret = 0; - p = (unsigned long) addr; + unsigned int start = 0; + switch (nb) { - case 8: - ret |= __get_user_inatomic(data.v[0], SWIZ_PTR(p++)); - ret |= __get_user_inatomic(data.v[1], SWIZ_PTR(p++)); - ret |= __get_user_inatomic(data.v[2], SWIZ_PTR(p++)); - ret |= __get_user_inatomic(data.v[3], SWIZ_PTR(p++)); case 4: - ret |= __get_user_inatomic(data.v[4], SWIZ_PTR(p++)); - ret |= __get_user_inatomic(data.v[5], SWIZ_PTR(p++)); + start = offsetof(union data, x32.low32); + break; case 2: - ret |= __get_user_inatomic(data.v[6], SWIZ_PTR(p++)); - ret |= __get_user_inatomic(data.v[7], SWIZ_PTR(p++)); - if (unlikely(ret)) - return -EFAULT; + start = offsetof(union data, x16.low16); + break; } + + data.ll = 0; + ret = 0; + p = (unsigned long)addr; + + for (i = 0; i < nb; i++) + ret |= __get_user_inatomic(data.v[start + i], + SWIZ_PTR(p++)); + + if (unlikely(ret)) + return -EFAULT; + } else if (flags & F) { - data.dd = current->thread.TS_FPR(reg); + data.ll = current->thread.TS_FPR(reg); if (flags & S) { /* Single-precision FP store requires conversion... */ #ifdef CONFIG_PPC_FPU preempt_disable(); enable_kernel_fp(); - cvt_df(&data.dd, (float *)&data.v[4]); + cvt_df(&data.dd, (float *)&data.x32.low32); preempt_enable(); #else return 0; @@ -901,17 +971,13 @@ int fix_alignment(struct pt_regs *regs) if (flags & SW) { switch (nb) { case 8: - SWAP(data.v[0], data.v[7]); - SWAP(data.v[1], data.v[6]); - SWAP(data.v[2], data.v[5]); - SWAP(data.v[3], data.v[4]); + data.ll = swab64(data.ll); break; case 4: - SWAP(data.v[4], data.v[7]); - SWAP(data.v[5], data.v[6]); + data.x32.low32 = swab32(data.x32.low32); break; case 2: - SWAP(data.v[6], data.v[7]); + data.x16.low16 = swab16(data.x16.low16); break; } } @@ -933,7 +999,7 @@ int fix_alignment(struct pt_regs *regs) #ifdef CONFIG_PPC_FPU preempt_disable(); enable_kernel_fp(); - cvt_fd((float *)&data.v[4], &data.dd); + cvt_fd((float *)&data.x32.low32, &data.dd); preempt_enable(); #else return 0; @@ -943,25 +1009,28 @@ int fix_alignment(struct pt_regs *regs) /* Store result to memory or update registers */ if (flags & ST) { - ret = 0; - p = (unsigned long) addr; + unsigned int start = 0; + switch (nb) { - case 8: - ret |= __put_user_inatomic(data.v[0], SWIZ_PTR(p++)); - ret |= __put_user_inatomic(data.v[1], SWIZ_PTR(p++)); - ret |= __put_user_inatomic(data.v[2], SWIZ_PTR(p++)); - ret |= __put_user_inatomic(data.v[3], SWIZ_PTR(p++)); case 4: - ret |= __put_user_inatomic(data.v[4], SWIZ_PTR(p++)); - ret |= __put_user_inatomic(data.v[5], SWIZ_PTR(p++)); + start = offsetof(union data, x32.low32); + break; case 2: - ret |= __put_user_inatomic(data.v[6], SWIZ_PTR(p++)); - ret |= __put_user_inatomic(data.v[7], SWIZ_PTR(p++)); + start = offsetof(union data, x16.low16); + break; } + + ret = 0; + p = (unsigned long)addr; + + for (i = 0; i < nb; i++) + ret |= __put_user_inatomic(data.v[start + i], + SWIZ_PTR(p++)); + if (unlikely(ret)) return -EFAULT; } else if (flags & F) - current->thread.TS_FPR(reg) = data.dd; + current->thread.TS_FPR(reg) = data.ll; else regs->gpr[reg] = data.ll; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 4e23ba2f3ca..f5995a91221 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -54,6 +54,7 @@ #endif #if defined(CONFIG_KVM) && defined(CONFIG_PPC_BOOK3S) #include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> #endif #ifdef CONFIG_PPC32 @@ -77,36 +78,36 @@ int main(void) DEFINE(NMI_MASK, NMI_MASK); DEFINE(THREAD_DSCR, offsetof(struct thread_struct, dscr)); DEFINE(THREAD_DSCR_INHERIT, offsetof(struct thread_struct, dscr_inherit)); + DEFINE(TASKTHREADPPR, offsetof(struct task_struct, thread.ppr)); #else DEFINE(THREAD_INFO, offsetof(struct task_struct, stack)); + DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16)); + DEFINE(KSP_LIMIT, offsetof(struct thread_struct, ksp_limit)); #endif /* CONFIG_PPC64 */ DEFINE(KSP, offsetof(struct thread_struct, ksp)); - DEFINE(KSP_LIMIT, offsetof(struct thread_struct, ksp_limit)); DEFINE(PT_REGS, offsetof(struct thread_struct, regs)); #ifdef CONFIG_BOOKE DEFINE(THREAD_NORMSAVES, offsetof(struct thread_struct, normsave[0])); #endif DEFINE(THREAD_FPEXC_MODE, offsetof(struct thread_struct, fpexc_mode)); - DEFINE(THREAD_FPR0, offsetof(struct thread_struct, fpr[0])); - DEFINE(THREAD_FPSCR, offsetof(struct thread_struct, fpscr)); + DEFINE(THREAD_FPSTATE, offsetof(struct thread_struct, fp_state)); + DEFINE(THREAD_FPSAVEAREA, offsetof(struct thread_struct, fp_save_area)); + DEFINE(FPSTATE_FPSCR, offsetof(struct thread_fp_state, fpscr)); #ifdef CONFIG_ALTIVEC - DEFINE(THREAD_VR0, offsetof(struct thread_struct, vr[0])); + DEFINE(THREAD_VRSTATE, offsetof(struct thread_struct, vr_state)); + DEFINE(THREAD_VRSAVEAREA, offsetof(struct thread_struct, vr_save_area)); DEFINE(THREAD_VRSAVE, offsetof(struct thread_struct, vrsave)); - DEFINE(THREAD_VSCR, offsetof(struct thread_struct, vscr)); DEFINE(THREAD_USED_VR, offsetof(struct thread_struct, used_vr)); + DEFINE(VRSTATE_VSCR, offsetof(struct thread_vr_state, vscr)); #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_VSX - DEFINE(THREAD_VSR0, offsetof(struct thread_struct, fpr)); DEFINE(THREAD_USED_VSR, offsetof(struct thread_struct, used_vsr)); #endif /* CONFIG_VSX */ #ifdef CONFIG_PPC64 DEFINE(KSP_VSID, offsetof(struct thread_struct, ksp_vsid)); #else /* CONFIG_PPC64 */ DEFINE(PGDIR, offsetof(struct thread_struct, pgdir)); -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0)); -#endif #ifdef CONFIG_SPE DEFINE(THREAD_EVR0, offsetof(struct thread_struct, evr[0])); DEFINE(THREAD_ACC, offsetof(struct thread_struct, acc)); @@ -114,13 +115,47 @@ int main(void) DEFINE(THREAD_USED_SPE, offsetof(struct thread_struct, used_spe)); #endif /* CONFIG_SPE */ #endif /* CONFIG_PPC64 */ +#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) + DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, debug.dbcr0)); +#endif #ifdef CONFIG_KVM_BOOK3S_32_HANDLER DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu)); #endif -#ifdef CONFIG_KVM_BOOKE_HV +#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu)); #endif +#ifdef CONFIG_PPC_BOOK3S_64 + DEFINE(THREAD_TAR, offsetof(struct thread_struct, tar)); + DEFINE(THREAD_BESCR, offsetof(struct thread_struct, bescr)); + DEFINE(THREAD_EBBHR, offsetof(struct thread_struct, ebbhr)); + DEFINE(THREAD_EBBRR, offsetof(struct thread_struct, ebbrr)); + DEFINE(THREAD_SIAR, offsetof(struct thread_struct, siar)); + DEFINE(THREAD_SDAR, offsetof(struct thread_struct, sdar)); + DEFINE(THREAD_SIER, offsetof(struct thread_struct, sier)); + DEFINE(THREAD_MMCR0, offsetof(struct thread_struct, mmcr0)); + DEFINE(THREAD_MMCR2, offsetof(struct thread_struct, mmcr2)); +#endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + DEFINE(PACATMSCRATCH, offsetof(struct paca_struct, tm_scratch)); + DEFINE(THREAD_TM_TFHAR, offsetof(struct thread_struct, tm_tfhar)); + DEFINE(THREAD_TM_TEXASR, offsetof(struct thread_struct, tm_texasr)); + DEFINE(THREAD_TM_TFIAR, offsetof(struct thread_struct, tm_tfiar)); + DEFINE(THREAD_TM_TAR, offsetof(struct thread_struct, tm_tar)); + DEFINE(THREAD_TM_PPR, offsetof(struct thread_struct, tm_ppr)); + DEFINE(THREAD_TM_DSCR, offsetof(struct thread_struct, tm_dscr)); + DEFINE(PT_CKPT_REGS, offsetof(struct thread_struct, ckpt_regs)); + DEFINE(THREAD_TRANSACT_VRSTATE, offsetof(struct thread_struct, + transact_vr)); + DEFINE(THREAD_TRANSACT_VRSAVE, offsetof(struct thread_struct, + transact_vrsave)); + DEFINE(THREAD_TRANSACT_FPSTATE, offsetof(struct thread_struct, + transact_fp)); + /* Local pt_regs on stack for Transactional Memory funcs. */ + DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD + + sizeof(struct pt_regs) + 16); +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); @@ -169,6 +204,15 @@ int main(void) DEFINE(PACA_MC_STACK, offsetof(struct paca_struct, mc_kstack)); DEFINE(PACA_CRIT_STACK, offsetof(struct paca_struct, crit_kstack)); DEFINE(PACA_DBG_STACK, offsetof(struct paca_struct, dbg_kstack)); + DEFINE(PACA_TCD_PTR, offsetof(struct paca_struct, tcd_ptr)); + + DEFINE(TCD_ESEL_NEXT, + offsetof(struct tlb_core_data, esel_next)); + DEFINE(TCD_ESEL_MAX, + offsetof(struct tlb_core_data, esel_max)); + DEFINE(TCD_ESEL_FIRST, + offsetof(struct tlb_core_data, esel_first)); + DEFINE(TCD_LOCK, offsetof(struct tlb_core_data, lock)); #endif /* CONFIG_PPC_BOOK3E */ #ifdef CONFIG_PPC_STD_MMU_64 @@ -198,15 +242,20 @@ int main(void) DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx)); #endif /* CONFIG_PPC_STD_MMU_64 */ DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp)); +#ifdef CONFIG_PPC_BOOK3S_64 + DEFINE(PACAMCEMERGSP, offsetof(struct paca_struct, mc_emergency_sp)); + DEFINE(PACA_IN_MCE, offsetof(struct paca_struct, in_mce)); +#endif DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id)); DEFINE(PACAKEXECSTATE, offsetof(struct paca_struct, kexec_state)); + DEFINE(PACA_DSCR, offsetof(struct paca_struct, dscr_default)); DEFINE(PACA_STARTTIME, offsetof(struct paca_struct, starttime)); DEFINE(PACA_STARTTIME_USER, offsetof(struct paca_struct, starttime_user)); DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time)); DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time)); DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save)); DEFINE(PACA_NAPSTATELOST, offsetof(struct paca_struct, nap_state_lost)); - DEFINE(PACA_SPRG3, offsetof(struct paca_struct, sprg3)); + DEFINE(PACA_SPRG_VDSO, offsetof(struct paca_struct, sprg_vdso)); #endif /* CONFIG_PPC64 */ /* RTAS */ @@ -391,21 +440,19 @@ int main(void) DEFINE(VCPU_GUEST_PID, offsetof(struct kvm_vcpu, arch.pid)); DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave)); - DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr)); - DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr)); + DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fp.fpr)); #ifdef CONFIG_ALTIVEC - DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr)); - DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr)); -#endif -#ifdef CONFIG_VSX - DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr)); + DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr.vr)); #endif DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer)); DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr)); DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr)); +#ifdef CONFIG_PPC_BOOK3S + DEFINE(VCPU_TAR, offsetof(struct kvm_vcpu, arch.tar)); +#endif DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc)); -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.shregs.msr)); DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.shregs.srr0)); DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.shregs.srr1)); @@ -414,6 +461,7 @@ int main(void) DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2)); DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3)); #endif + DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3)); DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4)); DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5)); DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6)); @@ -423,6 +471,9 @@ int main(void) DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared)); DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr)); DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr)); +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) + DEFINE(VCPU_SHAREDBE, offsetof(struct kvm_vcpu, arch.shared_big_endian)); +#endif DEFINE(VCPU_SHARED_MAS0, offsetof(struct kvm_vcpu_arch_shared, mas0)); DEFINE(VCPU_SHARED_MAS1, offsetof(struct kvm_vcpu_arch_shared, mas1)); @@ -435,7 +486,7 @@ int main(void) DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid)); /* book3s */ -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1)); DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid)); DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); @@ -448,16 +499,24 @@ int main(void) DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr)); + DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty)); #endif #ifdef CONFIG_PPC_BOOK3S DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id)); DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr)); DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr)); + DEFINE(VCPU_IC, offsetof(struct kvm_vcpu, arch.ic)); + DEFINE(VCPU_VTB, offsetof(struct kvm_vcpu, arch.vtb)); DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr)); DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr)); DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor)); + DEFINE(VCPU_IAMR, offsetof(struct kvm_vcpu, arch.iamr)); DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl)); DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr)); + DEFINE(VCPU_DABRX, offsetof(struct kvm_vcpu, arch.dabrx)); + DEFINE(VCPU_DAWR, offsetof(struct kvm_vcpu, arch.dawr)); + DEFINE(VCPU_DAWRX, offsetof(struct kvm_vcpu, arch.dawrx)); + DEFINE(VCPU_CIABR, offsetof(struct kvm_vcpu, arch.ciabr)); DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags)); DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec)); DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires)); @@ -466,26 +525,64 @@ int main(void) DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded)); DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr)); DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc)); + DEFINE(VCPU_SPMC, offsetof(struct kvm_vcpu, arch.spmc)); + DEFINE(VCPU_SIAR, offsetof(struct kvm_vcpu, arch.siar)); + DEFINE(VCPU_SDAR, offsetof(struct kvm_vcpu, arch.sdar)); + DEFINE(VCPU_SIER, offsetof(struct kvm_vcpu, arch.sier)); DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb)); DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max)); DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); + DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr)); DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap)); - DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid)); + DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar)); + DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr)); + DEFINE(VCPU_FSCR, offsetof(struct kvm_vcpu, arch.fscr)); + DEFINE(VCPU_SHADOW_FSCR, offsetof(struct kvm_vcpu, arch.shadow_fscr)); + DEFINE(VCPU_PSPB, offsetof(struct kvm_vcpu, arch.pspb)); + DEFINE(VCPU_EBBHR, offsetof(struct kvm_vcpu, arch.ebbhr)); + DEFINE(VCPU_EBBRR, offsetof(struct kvm_vcpu, arch.ebbrr)); + DEFINE(VCPU_BESCR, offsetof(struct kvm_vcpu, arch.bescr)); + DEFINE(VCPU_CSIGR, offsetof(struct kvm_vcpu, arch.csigr)); + DEFINE(VCPU_TACR, offsetof(struct kvm_vcpu, arch.tacr)); + DEFINE(VCPU_TCSCR, offsetof(struct kvm_vcpu, arch.tcscr)); + DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop)); + DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort)); + DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1)); DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count)); DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads)); - DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) - - offsetof(struct kvmppc_vcpu_book3s, vcpu)); + DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm)); + DEFINE(VCORE_TB_OFFSET, offsetof(struct kvmppc_vcore, tb_offset)); + DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr)); + DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr)); + DEFINE(VCORE_DPDES, offsetof(struct kvmppc_vcore, dpdes)); DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige)); DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv)); DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb)); +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + DEFINE(VCPU_TFHAR, offsetof(struct kvm_vcpu, arch.tfhar)); + DEFINE(VCPU_TFIAR, offsetof(struct kvm_vcpu, arch.tfiar)); + DEFINE(VCPU_TEXASR, offsetof(struct kvm_vcpu, arch.texasr)); + DEFINE(VCPU_GPR_TM, offsetof(struct kvm_vcpu, arch.gpr_tm)); + DEFINE(VCPU_FPRS_TM, offsetof(struct kvm_vcpu, arch.fp_tm.fpr)); + DEFINE(VCPU_VRS_TM, offsetof(struct kvm_vcpu, arch.vr_tm.vr)); + DEFINE(VCPU_VRSAVE_TM, offsetof(struct kvm_vcpu, arch.vrsave_tm)); + DEFINE(VCPU_CR_TM, offsetof(struct kvm_vcpu, arch.cr_tm)); + DEFINE(VCPU_LR_TM, offsetof(struct kvm_vcpu, arch.lr_tm)); + DEFINE(VCPU_CTR_TM, offsetof(struct kvm_vcpu, arch.ctr_tm)); + DEFINE(VCPU_AMR_TM, offsetof(struct kvm_vcpu, arch.amr_tm)); + DEFINE(VCPU_PPR_TM, offsetof(struct kvm_vcpu, arch.ppr_tm)); + DEFINE(VCPU_DSCR_TM, offsetof(struct kvm_vcpu, arch.dscr_tm)); + DEFINE(VCPU_TAR_TM, offsetof(struct kvm_vcpu, arch.tar_tm)); +#endif #ifdef CONFIG_PPC_BOOK3S_64 -#ifdef CONFIG_KVM_BOOK3S_PR +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + DEFINE(PACA_SVCPU, offsetof(struct paca_struct, shadow_vcpu)); # define SVCPU_FIELD(x, f) DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f)) #else # define SVCPU_FIELD(x, f) @@ -525,6 +622,7 @@ int main(void) #ifdef CONFIG_PPC64 SVCPU_FIELD(SVCPU_SLB, slb); SVCPU_FIELD(SVCPU_SLB_MAX, slb_max); + SVCPU_FIELD(SVCPU_SHADOW_FSCR, shadow_fscr); #endif HSTATE_FIELD(HSTATE_HOST_R1, host_r1); @@ -533,16 +631,20 @@ int main(void) HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler); HSTATE_FIELD(HSTATE_SCRATCH0, scratch0); HSTATE_FIELD(HSTATE_SCRATCH1, scratch1); + HSTATE_FIELD(HSTATE_SCRATCH2, scratch2); HSTATE_FIELD(HSTATE_IN_GUEST, in_guest); HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5); HSTATE_FIELD(HSTATE_NAPPING, napping); -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE HSTATE_FIELD(HSTATE_HWTHREAD_REQ, hwthread_req); HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state); HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore); HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys); + HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr); + HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi); + HSTATE_FIELD(HSTATE_PTID, ptid); HSTATE_FIELD(HSTATE_MMCR, host_mmcr); HSTATE_FIELD(HSTATE_PMC, host_pmc); HSTATE_FIELD(HSTATE_PURR, host_purr); @@ -551,7 +653,13 @@ int main(void) HSTATE_FIELD(HSTATE_DABR, dabr); HSTATE_FIELD(HSTATE_DECEXP, dec_expires); DEFINE(IPI_PRIORITY, IPI_PRIORITY); -#endif /* CONFIG_KVM_BOOK3S_64_HV */ +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + +#ifdef CONFIG_PPC_BOOK3S_64 + HSTATE_FIELD(HSTATE_CFAR, cfar); + HSTATE_FIELD(HSTATE_PPR, ppr); + HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr); +#endif /* CONFIG_PPC_BOOK3S_64 */ #else /* CONFIG_PPC_BOOK3S */ DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr)); @@ -562,6 +670,7 @@ int main(void) DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); + DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, arch.crit_save)); #endif /* CONFIG_PPC_BOOK3S */ #endif /* CONFIG_KVM */ diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index ac8f52732fd..41c011cb607 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -25,11 +25,6 @@ static void scrollscreen(void); #endif -static void draw_byte(unsigned char c, long locX, long locY); -static void draw_byte_32(unsigned char *bits, unsigned int *base, int rb); -static void draw_byte_16(unsigned char *bits, unsigned int *base, int rb); -static void draw_byte_8(unsigned char *bits, unsigned int *base, int rb); - #define __force_data __attribute__((__section__(".data"))) static int g_loc_X __force_data; @@ -52,6 +47,26 @@ static unsigned char vga_font[cmapsz]; int boot_text_mapped __force_data = 0; int force_printk_to_btext = 0; +extern void rmci_on(void); +extern void rmci_off(void); + +static inline void rmci_maybe_on(void) +{ +#if defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) && defined(CONFIG_PPC64) + if (!(mfmsr() & MSR_DR)) + rmci_on(); +#endif +} + +static inline void rmci_maybe_off(void) +{ +#if defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) && defined(CONFIG_PPC64) + if (!(mfmsr() & MSR_DR)) + rmci_off(); +#endif +} + + #ifdef CONFIG_PPC32 /* Calc BAT values for mapping the display and store them * in disp_BAT. Those values are then used from head.S to map @@ -134,7 +149,7 @@ void __init btext_unmap(void) * changes. */ -static void map_boot_text(void) +void btext_map(void) { unsigned long base, offset, size; unsigned char *vbase; @@ -209,7 +224,7 @@ int btext_initialize(struct device_node *np) dispDeviceRect[2] = width; dispDeviceRect[3] = height; - map_boot_text(); + btext_map(); return 0; } @@ -283,7 +298,7 @@ void btext_update_display(unsigned long phys, int width, int height, iounmap(logicalDisplayBase); boot_text_mapped = 0; } - map_boot_text(); + btext_map(); g_loc_X = 0; g_loc_Y = 0; g_max_loc_X = width / 8; @@ -298,6 +313,7 @@ void btext_clearscreen(void) (dispDeviceDepth >> 3)) >> 2; int i,j; + rmci_maybe_on(); for (i=0; i<(dispDeviceRect[3] - dispDeviceRect[1]); i++) { unsigned int *ptr = base; @@ -305,6 +321,7 @@ void btext_clearscreen(void) *(ptr++) = 0; base += (dispDeviceRowBytes >> 2); } + rmci_maybe_off(); } void btext_flushscreen(void) @@ -355,6 +372,8 @@ static void scrollscreen(void) (dispDeviceDepth >> 3)) >> 2; int i,j; + rmci_maybe_on(); + for (i=0; i<(dispDeviceRect[3] - dispDeviceRect[1] - 16); i++) { unsigned int *src_ptr = src; @@ -371,9 +390,116 @@ static void scrollscreen(void) *(dst_ptr++) = 0; dst += (dispDeviceRowBytes >> 2); } + + rmci_maybe_off(); } #endif /* ndef NO_SCROLL */ +static unsigned int expand_bits_8[16] = { + 0x00000000, + 0x000000ff, + 0x0000ff00, + 0x0000ffff, + 0x00ff0000, + 0x00ff00ff, + 0x00ffff00, + 0x00ffffff, + 0xff000000, + 0xff0000ff, + 0xff00ff00, + 0xff00ffff, + 0xffff0000, + 0xffff00ff, + 0xffffff00, + 0xffffffff +}; + +static unsigned int expand_bits_16[4] = { + 0x00000000, + 0x0000ffff, + 0xffff0000, + 0xffffffff +}; + + +static void draw_byte_32(unsigned char *font, unsigned int *base, int rb) +{ + int l, bits; + int fg = 0xFFFFFFFFUL; + int bg = 0x00000000UL; + + for (l = 0; l < 16; ++l) + { + bits = *font++; + base[0] = (-(bits >> 7) & fg) ^ bg; + base[1] = (-((bits >> 6) & 1) & fg) ^ bg; + base[2] = (-((bits >> 5) & 1) & fg) ^ bg; + base[3] = (-((bits >> 4) & 1) & fg) ^ bg; + base[4] = (-((bits >> 3) & 1) & fg) ^ bg; + base[5] = (-((bits >> 2) & 1) & fg) ^ bg; + base[6] = (-((bits >> 1) & 1) & fg) ^ bg; + base[7] = (-(bits & 1) & fg) ^ bg; + base = (unsigned int *) ((char *)base + rb); + } +} + +static inline void draw_byte_16(unsigned char *font, unsigned int *base, int rb) +{ + int l, bits; + int fg = 0xFFFFFFFFUL; + int bg = 0x00000000UL; + unsigned int *eb = (int *)expand_bits_16; + + for (l = 0; l < 16; ++l) + { + bits = *font++; + base[0] = (eb[bits >> 6] & fg) ^ bg; + base[1] = (eb[(bits >> 4) & 3] & fg) ^ bg; + base[2] = (eb[(bits >> 2) & 3] & fg) ^ bg; + base[3] = (eb[bits & 3] & fg) ^ bg; + base = (unsigned int *) ((char *)base + rb); + } +} + +static inline void draw_byte_8(unsigned char *font, unsigned int *base, int rb) +{ + int l, bits; + int fg = 0x0F0F0F0FUL; + int bg = 0x00000000UL; + unsigned int *eb = (int *)expand_bits_8; + + for (l = 0; l < 16; ++l) + { + bits = *font++; + base[0] = (eb[bits >> 4] & fg) ^ bg; + base[1] = (eb[bits & 0xf] & fg) ^ bg; + base = (unsigned int *) ((char *)base + rb); + } +} + +static noinline void draw_byte(unsigned char c, long locX, long locY) +{ + unsigned char *base = calc_base(locX << 3, locY << 4); + unsigned char *font = &vga_font[((unsigned int)c) * 16]; + int rb = dispDeviceRowBytes; + + rmci_maybe_on(); + switch(dispDeviceDepth) { + case 24: + case 32: + draw_byte_32(font, (unsigned int *)base, rb); + break; + case 15: + case 16: + draw_byte_16(font, (unsigned int *)base, rb); + break; + case 8: + draw_byte_8(font, (unsigned int *)base, rb); + break; + } + rmci_maybe_off(); +} + void btext_drawchar(char c) { int cline = 0; @@ -465,107 +591,12 @@ void btext_drawhex(unsigned long v) btext_drawchar(' '); } -static void draw_byte(unsigned char c, long locX, long locY) -{ - unsigned char *base = calc_base(locX << 3, locY << 4); - unsigned char *font = &vga_font[((unsigned int)c) * 16]; - int rb = dispDeviceRowBytes; - - switch(dispDeviceDepth) { - case 24: - case 32: - draw_byte_32(font, (unsigned int *)base, rb); - break; - case 15: - case 16: - draw_byte_16(font, (unsigned int *)base, rb); - break; - case 8: - draw_byte_8(font, (unsigned int *)base, rb); - break; - } -} - -static unsigned int expand_bits_8[16] = { - 0x00000000, - 0x000000ff, - 0x0000ff00, - 0x0000ffff, - 0x00ff0000, - 0x00ff00ff, - 0x00ffff00, - 0x00ffffff, - 0xff000000, - 0xff0000ff, - 0xff00ff00, - 0xff00ffff, - 0xffff0000, - 0xffff00ff, - 0xffffff00, - 0xffffffff -}; - -static unsigned int expand_bits_16[4] = { - 0x00000000, - 0x0000ffff, - 0xffff0000, - 0xffffffff -}; - - -static void draw_byte_32(unsigned char *font, unsigned int *base, int rb) -{ - int l, bits; - int fg = 0xFFFFFFFFUL; - int bg = 0x00000000UL; - - for (l = 0; l < 16; ++l) - { - bits = *font++; - base[0] = (-(bits >> 7) & fg) ^ bg; - base[1] = (-((bits >> 6) & 1) & fg) ^ bg; - base[2] = (-((bits >> 5) & 1) & fg) ^ bg; - base[3] = (-((bits >> 4) & 1) & fg) ^ bg; - base[4] = (-((bits >> 3) & 1) & fg) ^ bg; - base[5] = (-((bits >> 2) & 1) & fg) ^ bg; - base[6] = (-((bits >> 1) & 1) & fg) ^ bg; - base[7] = (-(bits & 1) & fg) ^ bg; - base = (unsigned int *) ((char *)base + rb); - } -} - -static void draw_byte_16(unsigned char *font, unsigned int *base, int rb) -{ - int l, bits; - int fg = 0xFFFFFFFFUL; - int bg = 0x00000000UL; - unsigned int *eb = (int *)expand_bits_16; - - for (l = 0; l < 16; ++l) - { - bits = *font++; - base[0] = (eb[bits >> 6] & fg) ^ bg; - base[1] = (eb[(bits >> 4) & 3] & fg) ^ bg; - base[2] = (eb[(bits >> 2) & 3] & fg) ^ bg; - base[3] = (eb[bits & 3] & fg) ^ bg; - base = (unsigned int *) ((char *)base + rb); - } -} - -static void draw_byte_8(unsigned char *font, unsigned int *base, int rb) +void __init udbg_init_btext(void) { - int l, bits; - int fg = 0x0F0F0F0FUL; - int bg = 0x00000000UL; - unsigned int *eb = (int *)expand_bits_8; - - for (l = 0; l < 16; ++l) - { - bits = *font++; - base[0] = (eb[bits >> 4] & fg) ^ bg; - base[1] = (eb[bits & 0xf] & fg) ^ bg; - base = (unsigned int *) ((char *)base + rb); - } + /* If btext is enabled, we might have a BAT setup for early display, + * thus we do enable some very basic udbg output + */ + udbg_putc = btext_drawchar; } static unsigned char vga_font[cmapsz] = { @@ -913,10 +944,3 @@ static unsigned char vga_font[cmapsz] = { 0x00, 0x00, 0x00, 0x00, }; -void __init udbg_init_btext(void) -{ - /* If btext is enabled, we might have a BAT setup for early display, - * thus we do enable some very basic udbg output - */ - udbg_putc = btext_drawchar; -} diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index 92c6b008dd2..40198d50b4c 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -12,7 +12,6 @@ #include <linux/cpu.h> #include <linux/cpumask.h> -#include <linux/init.h> #include <linux/kernel.h> #include <linux/kobject.h> #include <linux/list.h> @@ -131,7 +130,8 @@ static const char *cache_type_string(const struct cache *cache) return cache_type_info[cache->type].name; } -static void __cpuinit cache_init(struct cache *cache, int type, int level, struct device_node *ofnode) +static void cache_init(struct cache *cache, int type, int level, + struct device_node *ofnode) { cache->type = type; cache->level = level; @@ -140,7 +140,7 @@ static void __cpuinit cache_init(struct cache *cache, int type, int level, struc list_add(&cache->list, &cache_list); } -static struct cache *__cpuinit new_cache(int type, int level, struct device_node *ofnode) +static struct cache *new_cache(int type, int level, struct device_node *ofnode) { struct cache *cache; @@ -195,7 +195,7 @@ static void cache_cpu_set(struct cache *cache, int cpu) static int cache_size(const struct cache *cache, unsigned int *ret) { const char *propname; - const u32 *cache_size; + const __be32 *cache_size; propname = cache_type_info[cache->type].size_prop; @@ -203,7 +203,7 @@ static int cache_size(const struct cache *cache, unsigned int *ret) if (!cache_size) return -ENODEV; - *ret = *cache_size; + *ret = of_read_number(cache_size, 1); return 0; } @@ -221,7 +221,7 @@ static int cache_size_kb(const struct cache *cache, unsigned int *ret) /* not cache_line_size() because that's a macro in include/linux/cache.h */ static int cache_get_line_size(const struct cache *cache, unsigned int *ret) { - const u32 *line_size; + const __be32 *line_size; int i, lim; lim = ARRAY_SIZE(cache_type_info[cache->type].line_size_props); @@ -238,14 +238,14 @@ static int cache_get_line_size(const struct cache *cache, unsigned int *ret) if (!line_size) return -ENODEV; - *ret = *line_size; + *ret = of_read_number(line_size, 1); return 0; } static int cache_nr_sets(const struct cache *cache, unsigned int *ret) { const char *propname; - const u32 *nr_sets; + const __be32 *nr_sets; propname = cache_type_info[cache->type].nr_sets_prop; @@ -253,7 +253,7 @@ static int cache_nr_sets(const struct cache *cache, unsigned int *ret) if (!nr_sets) return -ENODEV; - *ret = *nr_sets; + *ret = of_read_number(nr_sets, 1); return 0; } @@ -324,7 +324,8 @@ static bool cache_node_is_unified(const struct device_node *np) return of_get_property(np, "cache-unified", NULL); } -static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *node, int level) +static struct cache *cache_do_one_devnode_unified(struct device_node *node, + int level) { struct cache *cache; @@ -335,7 +336,8 @@ static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node * return cache; } -static struct cache *__cpuinit cache_do_one_devnode_split(struct device_node *node, int level) +static struct cache *cache_do_one_devnode_split(struct device_node *node, + int level) { struct cache *dcache, *icache; @@ -357,7 +359,7 @@ err: return NULL; } -static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, int level) +static struct cache *cache_do_one_devnode(struct device_node *node, int level) { struct cache *cache; @@ -369,7 +371,8 @@ static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, in return cache; } -static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *node, int level) +static struct cache *cache_lookup_or_instantiate(struct device_node *node, + int level) { struct cache *cache; @@ -385,7 +388,7 @@ static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *n return cache; } -static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigger) +static void link_cache_lists(struct cache *smaller, struct cache *bigger) { while (smaller->next_local) { if (smaller->next_local == bigger) @@ -396,13 +399,13 @@ static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigg smaller->next_local = bigger; } -static void __cpuinit do_subsidiary_caches_debugcheck(struct cache *cache) +static void do_subsidiary_caches_debugcheck(struct cache *cache) { WARN_ON_ONCE(cache->level != 1); WARN_ON_ONCE(strcmp(cache->ofnode->type, "cpu")); } -static void __cpuinit do_subsidiary_caches(struct cache *cache) +static void do_subsidiary_caches(struct cache *cache) { struct device_node *subcache_node; int level = cache->level; @@ -423,7 +426,7 @@ static void __cpuinit do_subsidiary_caches(struct cache *cache) } } -static struct cache *__cpuinit cache_chain_instantiate(unsigned int cpu_id) +static struct cache *cache_chain_instantiate(unsigned int cpu_id) { struct device_node *cpu_node; struct cache *cpu_cache = NULL; @@ -448,7 +451,7 @@ out: return cpu_cache; } -static struct cache_dir *__cpuinit cacheinfo_create_cache_dir(unsigned int cpu_id) +static struct cache_dir *cacheinfo_create_cache_dir(unsigned int cpu_id) { struct cache_dir *cache_dir; struct device *dev; @@ -653,7 +656,7 @@ static struct kobj_type cache_index_type = { .default_attrs = cache_index_default_attrs, }; -static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) +static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) { const char *cache_name; const char *cache_type; @@ -696,7 +699,8 @@ static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *d kfree(buf); } -static void __cpuinit cacheinfo_create_index_dir(struct cache *cache, int index, struct cache_dir *cache_dir) +static void cacheinfo_create_index_dir(struct cache *cache, int index, + struct cache_dir *cache_dir) { struct cache_index_dir *index_dir; int rc; @@ -722,7 +726,8 @@ err: kfree(index_dir); } -static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache *cache_list) +static void cacheinfo_sysfs_populate(unsigned int cpu_id, + struct cache *cache_list) { struct cache_dir *cache_dir; struct cache *cache; @@ -740,7 +745,7 @@ static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache } } -void __cpuinit cacheinfo_cpu_online(unsigned int cpu_id) +void cacheinfo_cpu_online(unsigned int cpu_id) { struct cache *cache; @@ -751,7 +756,10 @@ void __cpuinit cacheinfo_cpu_online(unsigned int cpu_id) cacheinfo_sysfs_populate(cpu_id, cache); } -#ifdef CONFIG_HOTPLUG_CPU /* functions needed for cpu offline */ +/* functions needed to remove cache entry for cpu offline or suspend/resume */ + +#if (defined(CONFIG_PPC_PSERIES) && defined(CONFIG_SUSPEND)) || \ + defined(CONFIG_HOTPLUG_CPU) static struct cache *cache_lookup_by_cpu(unsigned int cpu_id) { @@ -788,6 +796,9 @@ static void remove_cache_dir(struct cache_dir *cache_dir) { remove_index_dirs(cache_dir); + /* Remove cache dir from sysfs */ + kobject_del(cache_dir->kobj); + kobject_put(cache_dir->kobj); kfree(cache_dir); @@ -835,4 +846,4 @@ void cacheinfo_cpu_offline(unsigned int cpu_id) if (cache) cache_cpu_clear(cache, cpu_id); } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* (CONFIG_PPC_PSERIES && CONFIG_SUSPEND) || CONFIG_HOTPLUG_CPU */ diff --git a/arch/powerpc/kernel/clock.c b/arch/powerpc/kernel/clock.c deleted file mode 100644 index a764b47791e..00000000000 --- a/arch/powerpc/kernel/clock.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Dummy clk implementations for powerpc. - * These need to be overridden in platform code. - */ - -#include <linux/clk.h> -#include <linux/err.h> -#include <linux/errno.h> -#include <linux/export.h> -#include <asm/clk_interface.h> - -struct clk_interface clk_functions; - -struct clk *clk_get(struct device *dev, const char *id) -{ - if (clk_functions.clk_get) - return clk_functions.clk_get(dev, id); - return ERR_PTR(-ENOSYS); -} -EXPORT_SYMBOL(clk_get); - -void clk_put(struct clk *clk) -{ - if (clk_functions.clk_put) - clk_functions.clk_put(clk); -} -EXPORT_SYMBOL(clk_put); - -int clk_enable(struct clk *clk) -{ - if (clk_functions.clk_enable) - return clk_functions.clk_enable(clk); - return -ENOSYS; -} -EXPORT_SYMBOL(clk_enable); - -void clk_disable(struct clk *clk) -{ - if (clk_functions.clk_disable) - clk_functions.clk_disable(clk); -} -EXPORT_SYMBOL(clk_disable); - -unsigned long clk_get_rate(struct clk *clk) -{ - if (clk_functions.clk_get_rate) - return clk_functions.clk_get_rate(clk); - return 0; -} -EXPORT_SYMBOL(clk_get_rate); - -long clk_round_rate(struct clk *clk, unsigned long rate) -{ - if (clk_functions.clk_round_rate) - return clk_functions.clk_round_rate(clk, rate); - return -ENOSYS; -} -EXPORT_SYMBOL(clk_round_rate); - -int clk_set_rate(struct clk *clk, unsigned long rate) -{ - if (clk_functions.clk_set_rate) - return clk_functions.clk_set_rate(clk, rate); - return -ENOSYS; -} -EXPORT_SYMBOL(clk_set_rate); - -struct clk *clk_get_parent(struct clk *clk) -{ - if (clk_functions.clk_get_parent) - return clk_functions.clk_get_parent(clk); - return ERR_PTR(-ENOSYS); -} -EXPORT_SYMBOL(clk_get_parent); - -int clk_set_parent(struct clk *clk, struct clk *parent) -{ - if (clk_functions.clk_set_parent) - return clk_functions.clk_set_parent(clk, parent); - return -ENOSYS; -} -EXPORT_SYMBOL(clk_set_parent); diff --git a/arch/powerpc/kernel/cpu_setup_a2.S b/arch/powerpc/kernel/cpu_setup_a2.S deleted file mode 100644 index 61f079e05b6..00000000000 --- a/arch/powerpc/kernel/cpu_setup_a2.S +++ /dev/null @@ -1,120 +0,0 @@ -/* - * A2 specific assembly support code - * - * Copyright 2009 Ben Herrenschmidt, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/asm-offsets.h> -#include <asm/ppc_asm.h> -#include <asm/ppc-opcode.h> -#include <asm/processor.h> -#include <asm/reg_a2.h> -#include <asm/reg.h> -#include <asm/thread_info.h> - -/* - * Disable thdid and class fields in ERATs to bump PID to full 14 bits capacity. - * This also prevents external LPID accesses but that isn't a problem when not a - * guest. Under PV, this setting will be ignored and MMUCR will return the right - * number of PID bits we can use. - */ -#define MMUCR1_EXTEND_PID \ - (MMUCR1_ICTID | MMUCR1_ITTID | MMUCR1_DCTID | \ - MMUCR1_DTTID | MMUCR1_DCCD) - -/* - * Use extended PIDs if enabled. - * Don't clear the ERATs on context sync events and enable I & D LRU. - * Enable ERAT back invalidate when tlbwe overwrites an entry. - */ -#define INITIAL_MMUCR1 \ - (MMUCR1_EXTEND_PID | MMUCR1_CSINV_NEVER | MMUCR1_IRRE | \ - MMUCR1_DRRE | MMUCR1_TLBWE_BINV) - -_GLOBAL(__setup_cpu_a2) - /* Some of these are actually thread local and some are - * core local but doing it always won't hurt - */ - -#ifdef CONFIG_PPC_ICSWX - /* Make sure ACOP starts out as zero */ - li r3,0 - mtspr SPRN_ACOP,r3 - - /* Skip the following if we are in Guest mode */ - mfmsr r3 - andis. r0,r3,MSR_GS@h - bne _icswx_skip_guest - - /* Enable icswx instruction */ - mfspr r3,SPRN_A2_CCR2 - ori r3,r3,A2_CCR2_ENABLE_ICSWX - mtspr SPRN_A2_CCR2,r3 - - /* Unmask all CTs in HACOP */ - li r3,-1 - mtspr SPRN_HACOP,r3 -_icswx_skip_guest: -#endif /* CONFIG_PPC_ICSWX */ - - /* Enable doorbell */ - mfspr r3,SPRN_A2_CCR2 - oris r3,r3,A2_CCR2_ENABLE_PC@h - mtspr SPRN_A2_CCR2,r3 - isync - - /* Setup CCR0 to disable power saving for now as it's busted - * in the current implementations. Setup CCR1 to wake on - * interrupts normally (we write the default value but who - * knows what FW may have clobbered...) - */ - li r3,0 - mtspr SPRN_A2_CCR0, r3 - LOAD_REG_IMMEDIATE(r3,0x0f0f0f0f) - mtspr SPRN_A2_CCR1, r3 - - /* Initialise MMUCR1 */ - lis r3,INITIAL_MMUCR1@h - ori r3,r3,INITIAL_MMUCR1@l - mtspr SPRN_MMUCR1,r3 - - /* Set MMUCR2 to enable 4K, 64K, 1M, 16M and 1G pages */ - LOAD_REG_IMMEDIATE(r3, 0x000a7531) - mtspr SPRN_MMUCR2,r3 - - /* Set MMUCR3 to write all thids bit to the TLB */ - LOAD_REG_IMMEDIATE(r3, 0x0000000f) - mtspr SPRN_MMUCR3,r3 - - /* Don't do ERAT stuff if running guest mode */ - mfmsr r3 - andis. r0,r3,MSR_GS@h - bne 1f - - /* Now set the I-ERAT watermark to 15 */ - lis r4,(MMUCR0_TLBSEL_I|MMUCR0_ECL)@h - mtspr SPRN_MMUCR0, r4 - li r4,A2_IERAT_SIZE-1 - PPC_ERATWE(R4,R4,3) - - /* Now set the D-ERAT watermark to 31 */ - lis r4,(MMUCR0_TLBSEL_D|MMUCR0_ECL)@h - mtspr SPRN_MMUCR0, r4 - li r4,A2_DERAT_SIZE-1 - PPC_ERATWE(R4,R4,3) - - /* And invalidate the beast just in case. That won't get rid of - * a bolted entry though it will be in LRU and so will go away eventually - * but let's not bother for now - */ - PPC_ERATILX(0,0,R0) -1: - blr - -_GLOBAL(__restore_cpu_a2) - b __setup_cpu_a2 diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S index dcd881937f7..4f1393d2007 100644 --- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S +++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S @@ -53,6 +53,61 @@ _GLOBAL(__e500_dcache_setup) isync blr +/* + * FIXME - we haven't yet done testing to determine a reasonable default + * value for PW20_WAIT_IDLE_BIT. + */ +#define PW20_WAIT_IDLE_BIT 50 /* 1ms, TB frequency is 41.66MHZ */ +_GLOBAL(setup_pw20_idle) + mfspr r3, SPRN_PWRMGTCR0 + + /* Set PW20_WAIT bit, enable pw20 state*/ + ori r3, r3, PWRMGTCR0_PW20_WAIT + li r11, PW20_WAIT_IDLE_BIT + + /* Set Automatic PW20 Core Idle Count */ + rlwimi r3, r11, PWRMGTCR0_PW20_ENT_SHIFT, PWRMGTCR0_PW20_ENT + + mtspr SPRN_PWRMGTCR0, r3 + + blr + +/* + * FIXME - we haven't yet done testing to determine a reasonable default + * value for AV_WAIT_IDLE_BIT. + */ +#define AV_WAIT_IDLE_BIT 50 /* 1ms, TB frequency is 41.66MHZ */ +_GLOBAL(setup_altivec_idle) + mfspr r3, SPRN_PWRMGTCR0 + + /* Enable Altivec Idle */ + oris r3, r3, PWRMGTCR0_AV_IDLE_PD_EN@h + li r11, AV_WAIT_IDLE_BIT + + /* Set Automatic AltiVec Idle Count */ + rlwimi r3, r11, PWRMGTCR0_AV_IDLE_CNT_SHIFT, PWRMGTCR0_AV_IDLE_CNT + + mtspr SPRN_PWRMGTCR0, r3 + + blr + +_GLOBAL(__setup_cpu_e6500) + mflr r6 +#ifdef CONFIG_PPC64 + bl setup_altivec_ivors + /* Touch IVOR42 only if the CPU supports E.HV category */ + mfspr r10,SPRN_MMUCFG + rlwinm. r10,r10,0,MMUCFG_LPIDSIZE + beq 1f + bl setup_lrat_ivor +1: +#endif + bl setup_pw20_idle + bl setup_altivec_idle + bl __setup_cpu_e5500 + mtlr r6 + blr + #ifdef CONFIG_PPC32 _GLOBAL(__setup_cpu_e200) /* enable dedicated debug exception handling resources (Debug APU) */ @@ -66,7 +121,7 @@ _GLOBAL(__setup_cpu_e500v2) bl __e500_icache_setup bl __e500_dcache_setup bl __setup_e500_ivors -#ifdef CONFIG_FSL_RIO +#if defined(CONFIG_FSL_RIO) || defined(CONFIG_FSL_PCI) /* Ensure that RFXE is set */ mfspr r3,SPRN_HID1 oris r3,r3,HID1_RFXE@h @@ -107,13 +162,28 @@ _GLOBAL(__setup_cpu_e5500) #endif #ifdef CONFIG_PPC_BOOK3E_64 +_GLOBAL(__restore_cpu_e6500) + mflr r5 + bl setup_altivec_ivors + /* Touch IVOR42 only if the CPU supports E.HV category */ + mfspr r10,SPRN_MMUCFG + rlwinm. r10,r10,0,MMUCFG_LPIDSIZE + beq 1f + bl setup_lrat_ivor +1: + bl setup_pw20_idle + bl setup_altivec_idle + bl __restore_cpu_e5500 + mtlr r5 + blr + _GLOBAL(__restore_cpu_e5500) mflr r4 bl __e500_icache_setup bl __e500_dcache_setup - bl .__setup_base_ivors - bl .setup_perfmon_ivor - bl .setup_doorbell_ivors + bl __setup_base_ivors + bl setup_perfmon_ivor + bl setup_doorbell_ivors /* * We only want to touch IVOR38-41 if we're running on hardware * that supports category E.HV. The architectural way to determine @@ -122,7 +192,7 @@ _GLOBAL(__restore_cpu_e5500) mfspr r10,SPRN_MMUCFG rlwinm. r10,r10,0,MMUCFG_LPIDSIZE beq 1f - bl .setup_ehv_ivors + bl setup_ehv_ivors 1: mtlr r4 blr @@ -131,9 +201,9 @@ _GLOBAL(__setup_cpu_e5500) mflr r5 bl __e500_icache_setup bl __e500_dcache_setup - bl .__setup_base_ivors - bl .setup_perfmon_ivor - bl .setup_doorbell_ivors + bl __setup_base_ivors + bl setup_perfmon_ivor + bl setup_doorbell_ivors /* * We only want to touch IVOR38-41 if we're running on hardware * that supports category E.HV. The architectural way to determine @@ -142,7 +212,7 @@ _GLOBAL(__setup_cpu_e5500) mfspr r10,SPRN_MMUCFG rlwinm. r10,r10,0,MMUCFG_LPIDSIZE beq 1f - bl .setup_ehv_ivors + bl setup_ehv_ivors b 2f 1: ld r10,CPU_SPEC_FEATURES(r4) diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index 57cf14065ae..46733535cc0 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -29,7 +29,7 @@ _GLOBAL(__setup_cpu_power7) mtspr SPRN_LPID,r0 mfspr r3,SPRN_LPCR bl __init_LPCR - bl __init_TLB + bl __init_tlb_power7 mtlr r11 blr @@ -42,35 +42,44 @@ _GLOBAL(__restore_cpu_power7) mtspr SPRN_LPID,r0 mfspr r3,SPRN_LPCR bl __init_LPCR - bl __init_TLB + bl __init_tlb_power7 mtlr r11 blr _GLOBAL(__setup_cpu_power8) mflr r11 + bl __init_FSCR + bl __init_PMU bl __init_hvmode_206 mtlr r11 beqlr li r0,0 mtspr SPRN_LPID,r0 mfspr r3,SPRN_LPCR - oris r3, r3, LPCR_AIL_3@h + ori r3, r3, LPCR_PECEDH bl __init_LPCR - bl __init_TLB + bl __init_HFSCR + bl __init_tlb_power8 + bl __init_PMU_HV mtlr r11 blr _GLOBAL(__restore_cpu_power8) mflr r11 + bl __init_FSCR + bl __init_PMU mfmsr r3 rldicl. r0,r3,4,63 + mtlr r11 beqlr li r0,0 mtspr SPRN_LPID,r0 mfspr r3,SPRN_LPCR - oris r3, r3, LPCR_AIL_3@h + ori r3, r3, LPCR_PECEDH bl __init_LPCR - bl __init_TLB + bl __init_HFSCR + bl __init_tlb_power8 + bl __init_PMU_HV mtlr r11 blr @@ -112,14 +121,62 @@ __init_LPCR: isync blr -__init_TLB: - /* Clear the TLB */ +__init_FSCR: + mfspr r3,SPRN_FSCR + ori r3,r3,FSCR_TAR|FSCR_DSCR|FSCR_EBB + mtspr SPRN_FSCR,r3 + blr + +__init_HFSCR: + mfspr r3,SPRN_HFSCR + ori r3,r3,HFSCR_TAR|HFSCR_TM|HFSCR_BHRB|HFSCR_PM|\ + HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB + mtspr SPRN_HFSCR,r3 + blr + +/* + * Clear the TLB using the specified IS form of tlbiel instruction + * (invalidate by congruence class). P7 has 128 CCs., P8 has 512. + * + * r3 = IS field + */ +__init_tlb_power7: + li r3,0xc00 /* IS field = 0b11 */ +_GLOBAL(__flush_tlb_power7) li r6,128 mtctr r6 - li r7,0xc00 /* IS field = 0b11 */ + mr r7,r3 /* IS field */ + ptesync +2: tlbiel r7 + addi r7,r7,0x1000 + bdnz 2b + ptesync +1: blr + +__init_tlb_power8: + li r3,0xc00 /* IS field = 0b11 */ +_GLOBAL(__flush_tlb_power8) + li r6,512 + mtctr r6 + mr r7,r3 /* IS field */ ptesync 2: tlbiel r7 addi r7,r7,0x1000 bdnz 2b ptesync 1: blr + +__init_PMU_HV: + li r5,0 + mtspr SPRN_MMCRC,r5 + mtspr SPRN_MMCRH,r5 + blr + +__init_PMU: + li r5,0 + mtspr SPRN_MMCRS,r5 + mtspr SPRN_MMCRA,r5 + mtspr SPRN_MMCR0,r5 + mtspr SPRN_MMCR1,r5 + mtspr SPRN_MMCR2,r5 + blr diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 75a3d71b895..0c157642c2a 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -71,10 +71,16 @@ extern void __restore_cpu_power7(void); extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec); extern void __restore_cpu_power8(void); extern void __restore_cpu_a2(void); +extern void __flush_tlb_power7(unsigned long inval_selector); +extern void __flush_tlb_power8(unsigned long inval_selector); +extern long __machine_check_early_realmode_p7(struct pt_regs *regs); +extern long __machine_check_early_realmode_p8(struct pt_regs *regs); #endif /* CONFIG_PPC64 */ #if defined(CONFIG_E500) extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec); +extern void __setup_cpu_e6500(unsigned long offset, struct cpu_spec* spec); extern void __restore_cpu_e5500(void); +extern void __restore_cpu_e6500(void); #endif /* CONFIG_E500 */ /* This table only contains "desktop" CPUs, it need to be filled with embedded @@ -96,10 +102,15 @@ extern void __restore_cpu_e5500(void); PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \ PPC_FEATURE_TRUE_LE | \ PPC_FEATURE_PSERIES_PERFMON_COMPAT) +#define COMMON_USER2_POWER7 (PPC_FEATURE2_DSCR) #define COMMON_USER_POWER8 (COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_06 |\ PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \ PPC_FEATURE_TRUE_LE | \ PPC_FEATURE_PSERIES_PERFMON_COMPAT) +#define COMMON_USER2_POWER8 (PPC_FEATURE2_ARCH_2_07 | \ + PPC_FEATURE2_HTM_COMP | PPC_FEATURE2_DSCR | \ + PPC_FEATURE2_ISEL | PPC_FEATURE2_TAR | \ + PPC_FEATURE2_VEC_CRYPTO) #define COMMON_USER_PA6T (COMMON_USER_PPC64 | PPC_FEATURE_PA6T |\ PPC_FEATURE_TRUE_LE | \ PPC_FEATURE_HAS_ALTIVEC_COMP) @@ -275,7 +286,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_features = CPU_FTRS_PPC970, .cpu_user_features = COMMON_USER_POWER4 | PPC_FEATURE_HAS_ALTIVEC_COMP, - .mmu_features = MMU_FTR_HPTE_TABLE, + .mmu_features = MMU_FTRS_PPC970, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 8, @@ -426,6 +437,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_name = "POWER7 (architected)", .cpu_features = CPU_FTRS_POWER7, .cpu_user_features = COMMON_USER_POWER7, + .cpu_user_features2 = COMMON_USER2_POWER7, .mmu_features = MMU_FTRS_POWER7, .icache_bsize = 128, .dcache_bsize = 128, @@ -433,6 +445,8 @@ static struct cpu_spec __initdata cpu_specs[] = { .oprofile_cpu_type = "ppc64/ibm-compat-v1", .cpu_setup = __setup_cpu_power7, .cpu_restore = __restore_cpu_power7, + .flush_tlb = __flush_tlb_power7, + .machine_check_early = __machine_check_early_realmode_p7, .platform = "power7", }, { /* 2.07-compliant processor, i.e. Power8 "architected" mode */ @@ -441,13 +455,16 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_name = "POWER8 (architected)", .cpu_features = CPU_FTRS_POWER8, .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, .mmu_features = MMU_FTRS_POWER8, .icache_bsize = 128, .dcache_bsize = 128, - .oprofile_type = PPC_OPROFILE_POWER4, + .oprofile_type = PPC_OPROFILE_INVALID, .oprofile_cpu_type = "ppc64/ibm-compat-v1", .cpu_setup = __setup_cpu_power8, .cpu_restore = __restore_cpu_power8, + .flush_tlb = __flush_tlb_power8, + .machine_check_early = __machine_check_early_realmode_p8, .platform = "power8", }, { /* Power7 */ @@ -456,6 +473,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_name = "POWER7 (raw)", .cpu_features = CPU_FTRS_POWER7, .cpu_user_features = COMMON_USER_POWER7, + .cpu_user_features2 = COMMON_USER2_POWER7, .mmu_features = MMU_FTRS_POWER7, .icache_bsize = 128, .dcache_bsize = 128, @@ -465,6 +483,8 @@ static struct cpu_spec __initdata cpu_specs[] = { .oprofile_type = PPC_OPROFILE_POWER4, .cpu_setup = __setup_cpu_power7, .cpu_restore = __restore_cpu_power7, + .flush_tlb = __flush_tlb_power7, + .machine_check_early = __machine_check_early_realmode_p7, .platform = "power7", }, { /* Power7+ */ @@ -473,6 +493,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_name = "POWER7+ (raw)", .cpu_features = CPU_FTRS_POWER7, .cpu_user_features = COMMON_USER_POWER7, + .cpu_user_features2 = COMMON_USER2_POWER7, .mmu_features = MMU_FTRS_POWER7, .icache_bsize = 128, .dcache_bsize = 128, @@ -482,23 +503,68 @@ static struct cpu_spec __initdata cpu_specs[] = { .oprofile_type = PPC_OPROFILE_POWER4, .cpu_setup = __setup_cpu_power7, .cpu_restore = __restore_cpu_power7, + .flush_tlb = __flush_tlb_power7, + .machine_check_early = __machine_check_early_realmode_p7, .platform = "power7+", }, - { /* Power8 */ + { /* Power8E */ .pvr_mask = 0xffff0000, .pvr_value = 0x004b0000, + .cpu_name = "POWER8E (raw)", + .cpu_features = CPU_FTRS_POWER8E, + .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, + .mmu_features = MMU_FTRS_POWER8, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power8", + .oprofile_type = PPC_OPROFILE_INVALID, + .cpu_setup = __setup_cpu_power8, + .cpu_restore = __restore_cpu_power8, + .flush_tlb = __flush_tlb_power8, + .machine_check_early = __machine_check_early_realmode_p8, + .platform = "power8", + }, + { /* Power8 DD1: Does not support doorbell IPIs */ + .pvr_mask = 0xffffff00, + .pvr_value = 0x004d0100, + .cpu_name = "POWER8 (raw)", + .cpu_features = CPU_FTRS_POWER8_DD1, + .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, + .mmu_features = MMU_FTRS_POWER8, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power8", + .oprofile_type = PPC_OPROFILE_INVALID, + .cpu_setup = __setup_cpu_power8, + .cpu_restore = __restore_cpu_power8, + .flush_tlb = __flush_tlb_power8, + .machine_check_early = __machine_check_early_realmode_p8, + .platform = "power8", + }, + { /* Power8 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004d0000, .cpu_name = "POWER8 (raw)", .cpu_features = CPU_FTRS_POWER8, .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, .mmu_features = MMU_FTRS_POWER8, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 6, .pmc_type = PPC_PMC_IBM, .oprofile_cpu_type = "ppc64/power8", - .oprofile_type = PPC_OPROFILE_POWER4, + .oprofile_type = PPC_OPROFILE_INVALID, .cpu_setup = __setup_cpu_power8, .cpu_restore = __restore_cpu_power8, + .flush_tlb = __flush_tlb_power8, + .machine_check_early = __machine_check_early_realmode_p8, .platform = "power8", }, { /* Cell Broadband Engine */ @@ -1993,6 +2059,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_SPE_COMP | PPC_FEATURE_HAS_EFP_SINGLE_COMP, + .cpu_user_features2 = PPC_FEATURE2_ISEL, .mmu_features = MMU_FTR_TYPE_FSL_E, .icache_bsize = 32, .dcache_bsize = 32, @@ -2012,6 +2079,7 @@ static struct cpu_spec __initdata cpu_specs[] = { PPC_FEATURE_HAS_SPE_COMP | PPC_FEATURE_HAS_EFP_SINGLE_COMP | PPC_FEATURE_HAS_EFP_DOUBLE_COMP, + .cpu_user_features2 = PPC_FEATURE2_ISEL, .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS, .icache_bsize = 32, .dcache_bsize = 32, @@ -2028,6 +2096,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_name = "e500mc", .cpu_features = CPU_FTRS_E500MC, .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .cpu_user_features2 = PPC_FEATURE2_ISEL, .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS | MMU_FTR_USE_TLBILX, .icache_bsize = 64, @@ -2046,6 +2115,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_name = "e5500", .cpu_features = CPU_FTRS_E5500, .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .cpu_user_features2 = PPC_FEATURE2_ISEL, .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS | MMU_FTR_USE_TLBILX, .icache_bsize = 64, @@ -2065,17 +2135,19 @@ static struct cpu_spec __initdata cpu_specs[] = { .pvr_value = 0x80400000, .cpu_name = "e6500", .cpu_features = CPU_FTRS_E6500, - .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU | + PPC_FEATURE_HAS_ALTIVEC_COMP, + .cpu_user_features2 = PPC_FEATURE2_ISEL, .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS | MMU_FTR_USE_TLBILX, .icache_bsize = 64, .dcache_bsize = 64, - .num_pmcs = 4, + .num_pmcs = 6, .oprofile_cpu_type = "ppc/e6500", .oprofile_type = PPC_OPROFILE_FSL_EMB, - .cpu_setup = __setup_cpu_e5500, + .cpu_setup = __setup_cpu_e6500, #ifndef CONFIG_PPC32 - .cpu_restore = __restore_cpu_e5500, + .cpu_restore = __restore_cpu_e6500, #endif .machine_check = machine_check_e500mc, .platform = "ppce6500", @@ -2097,44 +2169,6 @@ static struct cpu_spec __initdata cpu_specs[] = { } #endif /* CONFIG_PPC32 */ #endif /* CONFIG_E500 */ - -#ifdef CONFIG_PPC_A2 - { /* Standard A2 (>= DD2) + FPU core */ - .pvr_mask = 0xffff0000, - .pvr_value = 0x00480000, - .cpu_name = "A2 (>= DD2)", - .cpu_features = CPU_FTRS_A2, - .cpu_user_features = COMMON_USER_PPC64, - .mmu_features = MMU_FTRS_A2, - .icache_bsize = 64, - .dcache_bsize = 64, - .num_pmcs = 0, - .cpu_setup = __setup_cpu_a2, - .cpu_restore = __restore_cpu_a2, - .machine_check = machine_check_generic, - .platform = "ppca2", - }, - { /* This is a default entry to get going, to be replaced by - * a real one at some stage - */ -#define CPU_FTRS_BASE_BOOK3E (CPU_FTR_USE_TB | \ - CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_SMT | \ - CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE) - .pvr_mask = 0x00000000, - .pvr_value = 0x00000000, - .cpu_name = "Book3E", - .cpu_features = CPU_FTRS_BASE_BOOK3E, - .cpu_user_features = COMMON_USER_PPC64, - .mmu_features = MMU_FTR_TYPE_3E | MMU_FTR_USE_TLBILX | - MMU_FTR_USE_TLBIVAX_BCAST | - MMU_FTR_LOCK_BCAST_INVAL, - .icache_bsize = 64, - .dcache_bsize = 64, - .num_pmcs = 0, - .machine_check = machine_check_generic, - .platform = "power6", - }, -#endif /* CONFIG_PPC_A2 */ }; static struct cpu_spec the_cpu_spec; diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index fdcd8f551af..51dbace3269 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -17,7 +17,6 @@ #include <linux/export.h> #include <linux/crash_dump.h> #include <linux/delay.h> -#include <linux/init.h> #include <linux/irq.h> #include <linux/types.h> @@ -82,7 +81,7 @@ void crash_ipi_callback(struct pt_regs *regs) } atomic_inc(&cpus_in_crash); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* * Starting the kdump boot. diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index b3ba5163eae..7a13f378ca2 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -69,16 +69,6 @@ void __init setup_kdump_trampoline(void) } #endif /* CONFIG_NONSTATIC_KERNEL */ -static int __init parse_savemaxmem(char *p) -{ - if (p) - saved_max_pfn = (memparse(p, &p) >> PAGE_SHIFT) - 1; - - return 1; -} -__setup("savemaxmem=", parse_savemaxmem); - - static size_t copy_oldmem_vaddr(void *vaddr, char *buf, size_t csize, unsigned long offset, int userbuf) { @@ -108,17 +98,19 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, unsigned long offset, int userbuf) { void *vaddr; + phys_addr_t paddr; if (!csize) return 0; csize = min_t(size_t, csize, PAGE_SIZE); + paddr = pfn << PAGE_SHIFT; - if ((min_low_pfn < pfn) && (pfn < max_pfn)) { - vaddr = __va(pfn << PAGE_SHIFT); + if (memblock_is_region_memory(paddr, csize)) { + vaddr = __va(paddr); csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); } else { - vaddr = __ioremap(pfn << PAGE_SHIFT, PAGE_SIZE, 0); + vaddr = __ioremap(paddr, PAGE_SIZE, 0); csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); iounmap(vaddr); } @@ -134,15 +126,15 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) { unsigned long addr; - const u32 *basep, *sizep; + const __be32 *basep, *sizep; unsigned int rtas_start = 0, rtas_end = 0; basep = of_get_property(rtas.dev, "linux,rtas-base", NULL); sizep = of_get_property(rtas.dev, "rtas-size", NULL); if (basep && sizep) { - rtas_start = *basep; - rtas_end = *basep + *sizep; + rtas_start = be32_to_cpup(basep); + rtas_end = rtas_start + be32_to_cpup(sizep); } for (addr = begin; addr < end; addr += PAGE_SIZE) { @@ -150,10 +142,7 @@ void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) if (addr <= rtas_end && ((addr + PAGE_SIZE) > rtas_start)) continue; - ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); - init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); - free_page((unsigned long)__va(addr)); - totalram_pages++; + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); } } #endif diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index a892680668d..d55c76c571f 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -21,7 +21,7 @@ #ifdef CONFIG_SMP void doorbell_setup_this_cpu(void) { - unsigned long tag = mfspr(SPRN_PIR) & 0x3fff; + unsigned long tag = mfspr(SPRN_DOORBELL_CPUTAG) & PPC_DBELL_TAG_MASK; smp_muxed_ipi_set_data(smp_processor_id(), tag); } @@ -30,7 +30,7 @@ void doorbell_cause_ipi(int cpu, unsigned long data) { /* Order previous accesses vs. msgsnd, which is treated as a store */ mb(); - ppc_msgsnd(PPC_DBELL, 0, data); + ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, data); } void doorbell_exception(struct pt_regs *regs) @@ -41,6 +41,8 @@ void doorbell_exception(struct pt_regs *regs) may_hard_irq_enable(); + __get_cpu_var(irq_stat).doorbell_irqs++; + smp_ipi_demux(); irq_exit(); diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index e4897523de4..54d0116256f 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -83,10 +83,10 @@ static int dma_iommu_dma_supported(struct device *dev, u64 mask) return 0; } - if (tbl->it_offset > (mask >> IOMMU_PAGE_SHIFT)) { + if (tbl->it_offset > (mask >> tbl->it_page_shift)) { dev_info(dev, "Warning: IOMMU offset too big for device mask\n"); dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n", - mask, tbl->it_offset << IOMMU_PAGE_SHIFT); + mask, tbl->it_offset << tbl->it_page_shift); return 0; } else return 1; diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 8032b97ccdc..ee78f6e49d6 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -191,12 +191,10 @@ EXPORT_SYMBOL(dma_direct_ops); #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) -int dma_set_mask(struct device *dev, u64 dma_mask) +int __dma_set_mask(struct device *dev, u64 dma_mask) { struct dma_map_ops *dma_ops = get_dma_ops(dev); - if (ppc_md.dma_set_mask) - return ppc_md.dma_set_mask(dev, dma_mask); if ((dma_ops != NULL) && (dma_ops->set_dma_mask != NULL)) return dma_ops->set_dma_mask(dev, dma_mask); if (!dev->dma_mask || !dma_supported(dev, dma_mask)) @@ -204,6 +202,12 @@ int dma_set_mask(struct device *dev, u64 dma_mask) *dev->dma_mask = dma_mask; return 0; } +int dma_set_mask(struct device *dev, u64 dma_mask) +{ + if (ppc_md.dma_set_mask) + return ppc_md.dma_set_mask(dev, dma_mask); + return __dma_set_mask(dev, dma_mask); +} EXPORT_SYMBOL(dma_set_mask); u64 dma_get_required_mask(struct device *dev) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c new file mode 100644 index 00000000000..86e25702aac --- /dev/null +++ b/arch/powerpc/kernel/eeh.c @@ -0,0 +1,1183 @@ +/* + * Copyright IBM Corporation 2001, 2005, 2006 + * Copyright Dave Engebretsen & Todd Inglett 2001 + * Copyright Linas Vepstas 2005, 2006 + * Copyright 2001-2012 IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com> + */ + +#include <linux/delay.h> +#include <linux/debugfs.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/proc_fs.h> +#include <linux/rbtree.h> +#include <linux/reboot.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/export.h> +#include <linux/of.h> + +#include <linux/atomic.h> +#include <asm/debug.h> +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/io.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> +#include <asm/rtas.h> + + +/** Overview: + * EEH, or "Extended Error Handling" is a PCI bridge technology for + * dealing with PCI bus errors that can't be dealt with within the + * usual PCI framework, except by check-stopping the CPU. Systems + * that are designed for high-availability/reliability cannot afford + * to crash due to a "mere" PCI error, thus the need for EEH. + * An EEH-capable bridge operates by converting a detected error + * into a "slot freeze", taking the PCI adapter off-line, making + * the slot behave, from the OS'es point of view, as if the slot + * were "empty": all reads return 0xff's and all writes are silently + * ignored. EEH slot isolation events can be triggered by parity + * errors on the address or data busses (e.g. during posted writes), + * which in turn might be caused by low voltage on the bus, dust, + * vibration, humidity, radioactivity or plain-old failed hardware. + * + * Note, however, that one of the leading causes of EEH slot + * freeze events are buggy device drivers, buggy device microcode, + * or buggy device hardware. This is because any attempt by the + * device to bus-master data to a memory address that is not + * assigned to the device will trigger a slot freeze. (The idea + * is to prevent devices-gone-wild from corrupting system memory). + * Buggy hardware/drivers will have a miserable time co-existing + * with EEH. + * + * Ideally, a PCI device driver, when suspecting that an isolation + * event has occurred (e.g. by reading 0xff's), will then ask EEH + * whether this is the case, and then take appropriate steps to + * reset the PCI slot, the PCI device, and then resume operations. + * However, until that day, the checking is done here, with the + * eeh_check_failure() routine embedded in the MMIO macros. If + * the slot is found to be isolated, an "EEH Event" is synthesized + * and sent out for processing. + */ + +/* If a device driver keeps reading an MMIO register in an interrupt + * handler after a slot isolation event, it might be broken. + * This sets the threshold for how many read attempts we allow + * before printing an error message. + */ +#define EEH_MAX_FAILS 2100000 + +/* Time to wait for a PCI slot to report status, in milliseconds */ +#define PCI_BUS_RESET_WAIT_MSEC (5*60*1000) + +/* + * EEH probe mode support, which is part of the flags, + * is to support multiple platforms for EEH. Some platforms + * like pSeries do PCI emunation based on device tree. + * However, other platforms like powernv probe PCI devices + * from hardware. The flag is used to distinguish that. + * In addition, struct eeh_ops::probe would be invoked for + * particular OF node or PCI device so that the corresponding + * PE would be created there. + */ +int eeh_subsystem_flags; +EXPORT_SYMBOL(eeh_subsystem_flags); + +/* Platform dependent EEH operations */ +struct eeh_ops *eeh_ops = NULL; + +/* Lock to avoid races due to multiple reports of an error */ +DEFINE_RAW_SPINLOCK(confirm_error_lock); + +/* Buffer for reporting pci register dumps. Its here in BSS, and + * not dynamically alloced, so that it ends up in RMO where RTAS + * can access it. + */ +#define EEH_PCI_REGS_LOG_LEN 4096 +static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN]; + +/* + * The struct is used to maintain the EEH global statistic + * information. Besides, the EEH global statistics will be + * exported to user space through procfs + */ +struct eeh_stats { + u64 no_device; /* PCI device not found */ + u64 no_dn; /* OF node not found */ + u64 no_cfg_addr; /* Config address not found */ + u64 ignored_check; /* EEH check skipped */ + u64 total_mmio_ffs; /* Total EEH checks */ + u64 false_positives; /* Unnecessary EEH checks */ + u64 slot_resets; /* PE reset */ +}; + +static struct eeh_stats eeh_stats; + +#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE) + +static int __init eeh_setup(char *str) +{ + if (!strcmp(str, "off")) + eeh_subsystem_flags |= EEH_FORCE_DISABLED; + + return 1; +} +__setup("eeh=", eeh_setup); + +/** + * eeh_gather_pci_data - Copy assorted PCI config space registers to buff + * @edev: device to report data for + * @buf: point to buffer in which to log + * @len: amount of room in buffer + * + * This routine captures assorted PCI configuration space data, + * and puts them into a buffer for RTAS error logging. + */ +static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len) +{ + struct device_node *dn = eeh_dev_to_of_node(edev); + u32 cfg; + int cap, i; + int n = 0; + + n += scnprintf(buf+n, len-n, "%s\n", dn->full_name); + pr_warn("EEH: of node=%s\n", dn->full_name); + + eeh_ops->read_config(dn, PCI_VENDOR_ID, 4, &cfg); + n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); + pr_warn("EEH: PCI device/vendor: %08x\n", cfg); + + eeh_ops->read_config(dn, PCI_COMMAND, 4, &cfg); + n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg); + pr_warn("EEH: PCI cmd/status register: %08x\n", cfg); + + /* Gather bridge-specific registers */ + if (edev->mode & EEH_DEV_BRIDGE) { + eeh_ops->read_config(dn, PCI_SEC_STATUS, 2, &cfg); + n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg); + pr_warn("EEH: Bridge secondary status: %04x\n", cfg); + + eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &cfg); + n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg); + pr_warn("EEH: Bridge control: %04x\n", cfg); + } + + /* Dump out the PCI-X command and status regs */ + cap = edev->pcix_cap; + if (cap) { + eeh_ops->read_config(dn, cap, 4, &cfg); + n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg); + pr_warn("EEH: PCI-X cmd: %08x\n", cfg); + + eeh_ops->read_config(dn, cap+4, 4, &cfg); + n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg); + pr_warn("EEH: PCI-X status: %08x\n", cfg); + } + + /* If PCI-E capable, dump PCI-E cap 10 */ + cap = edev->pcie_cap; + if (cap) { + n += scnprintf(buf+n, len-n, "pci-e cap10:\n"); + pr_warn("EEH: PCI-E capabilities and status follow:\n"); + + for (i=0; i<=8; i++) { + eeh_ops->read_config(dn, cap+4*i, 4, &cfg); + n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); + pr_warn("EEH: PCI-E %02x: %08x\n", i, cfg); + } + } + + /* If AER capable, dump it */ + cap = edev->aer_cap; + if (cap) { + n += scnprintf(buf+n, len-n, "pci-e AER:\n"); + pr_warn("EEH: PCI-E AER capability register set follows:\n"); + + for (i=0; i<14; i++) { + eeh_ops->read_config(dn, cap+4*i, 4, &cfg); + n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); + pr_warn("EEH: PCI-E AER %02x: %08x\n", i, cfg); + } + } + + return n; +} + +/** + * eeh_slot_error_detail - Generate combined log including driver log and error log + * @pe: EEH PE + * @severity: temporary or permanent error log + * + * This routine should be called to generate the combined log, which + * is comprised of driver log and error log. The driver log is figured + * out from the config space of the corresponding PCI device, while + * the error log is fetched through platform dependent function call. + */ +void eeh_slot_error_detail(struct eeh_pe *pe, int severity) +{ + size_t loglen = 0; + struct eeh_dev *edev, *tmp; + + /* + * When the PHB is fenced or dead, it's pointless to collect + * the data from PCI config space because it should return + * 0xFF's. For ER, we still retrieve the data from the PCI + * config space. + * + * For pHyp, we have to enable IO for log retrieval. Otherwise, + * 0xFF's is always returned from PCI config space. + */ + if (!(pe->type & EEH_PE_PHB)) { + if (eeh_probe_mode_devtree()) + eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); + eeh_ops->configure_bridge(pe); + eeh_pe_restore_bars(pe); + + pci_regs_buf[0] = 0; + eeh_pe_for_each_dev(pe, edev, tmp) { + loglen += eeh_gather_pci_data(edev, pci_regs_buf + loglen, + EEH_PCI_REGS_LOG_LEN - loglen); + } + } + + eeh_ops->get_log(pe, severity, pci_regs_buf, loglen); +} + +/** + * eeh_token_to_phys - Convert EEH address token to phys address + * @token: I/O token, should be address in the form 0xA.... + * + * This routine should be called to convert virtual I/O address + * to physical one. + */ +static inline unsigned long eeh_token_to_phys(unsigned long token) +{ + pte_t *ptep; + unsigned long pa; + int hugepage_shift; + + /* + * We won't find hugepages here, iomem + */ + ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift); + if (!ptep) + return token; + WARN_ON(hugepage_shift); + pa = pte_pfn(*ptep) << PAGE_SHIFT; + + return pa | (token & (PAGE_SIZE-1)); +} + +/* + * On PowerNV platform, we might already have fenced PHB there. + * For that case, it's meaningless to recover frozen PE. Intead, + * We have to handle fenced PHB firstly. + */ +static int eeh_phb_check_failure(struct eeh_pe *pe) +{ + struct eeh_pe *phb_pe; + unsigned long flags; + int ret; + + if (!eeh_probe_mode_dev()) + return -EPERM; + + /* Find the PHB PE */ + phb_pe = eeh_phb_pe_get(pe->phb); + if (!phb_pe) { + pr_warning("%s Can't find PE for PHB#%d\n", + __func__, pe->phb->global_number); + return -EEXIST; + } + + /* If the PHB has been in problematic state */ + eeh_serialize_lock(&flags); + if (phb_pe->state & EEH_PE_ISOLATED) { + ret = 0; + goto out; + } + + /* Check PHB state */ + ret = eeh_ops->get_state(phb_pe, NULL); + if ((ret < 0) || + (ret == EEH_STATE_NOT_SUPPORT) || + (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) == + (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) { + ret = 0; + goto out; + } + + /* Isolate the PHB and send event */ + eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); + eeh_serialize_unlock(flags); + + pr_err("EEH: PHB#%x failure detected, location: %s\n", + phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe)); + dump_stack(); + eeh_send_failure_event(phb_pe); + + return 1; +out: + eeh_serialize_unlock(flags); + return ret; +} + +/** + * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze + * @edev: eeh device + * + * Check for an EEH failure for the given device node. Call this + * routine if the result of a read was all 0xff's and you want to + * find out if this is due to an EEH slot freeze. This routine + * will query firmware for the EEH status. + * + * Returns 0 if there has not been an EEH error; otherwise returns + * a non-zero value and queues up a slot isolation event notification. + * + * It is safe to call this routine in an interrupt context. + */ +int eeh_dev_check_failure(struct eeh_dev *edev) +{ + int ret; + int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); + unsigned long flags; + struct device_node *dn; + struct pci_dev *dev; + struct eeh_pe *pe, *parent_pe, *phb_pe; + int rc = 0; + const char *location; + + eeh_stats.total_mmio_ffs++; + + if (!eeh_enabled()) + return 0; + + if (!edev) { + eeh_stats.no_dn++; + return 0; + } + dn = eeh_dev_to_of_node(edev); + dev = eeh_dev_to_pci_dev(edev); + pe = edev->pe; + + /* Access to IO BARs might get this far and still not want checking. */ + if (!pe) { + eeh_stats.ignored_check++; + pr_debug("EEH: Ignored check for %s %s\n", + eeh_pci_name(dev), dn->full_name); + return 0; + } + + if (!pe->addr && !pe->config_addr) { + eeh_stats.no_cfg_addr++; + return 0; + } + + /* + * On PowerNV platform, we might already have fenced PHB + * there and we need take care of that firstly. + */ + ret = eeh_phb_check_failure(pe); + if (ret > 0) + return ret; + + /* If we already have a pending isolation event for this + * slot, we know it's bad already, we don't need to check. + * Do this checking under a lock; as multiple PCI devices + * in one slot might report errors simultaneously, and we + * only want one error recovery routine running. + */ + eeh_serialize_lock(&flags); + rc = 1; + if (pe->state & EEH_PE_ISOLATED) { + pe->check_count++; + if (pe->check_count % EEH_MAX_FAILS == 0) { + location = of_get_property(dn, "ibm,loc-code", NULL); + printk(KERN_ERR "EEH: %d reads ignored for recovering device at " + "location=%s driver=%s pci addr=%s\n", + pe->check_count, location, + eeh_driver_name(dev), eeh_pci_name(dev)); + printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n", + eeh_driver_name(dev)); + dump_stack(); + } + goto dn_unlock; + } + + /* + * Now test for an EEH failure. This is VERY expensive. + * Note that the eeh_config_addr may be a parent device + * in the case of a device behind a bridge, or it may be + * function zero of a multi-function device. + * In any case they must share a common PHB. + */ + ret = eeh_ops->get_state(pe, NULL); + + /* Note that config-io to empty slots may fail; + * they are empty when they don't have children. + * We will punt with the following conditions: Failure to get + * PE's state, EEH not support and Permanently unavailable + * state, PE is in good state. + */ + if ((ret < 0) || + (ret == EEH_STATE_NOT_SUPPORT) || + ((ret & active_flags) == active_flags)) { + eeh_stats.false_positives++; + pe->false_positives++; + rc = 0; + goto dn_unlock; + } + + /* + * It should be corner case that the parent PE has been + * put into frozen state as well. We should take care + * that at first. + */ + parent_pe = pe->parent; + while (parent_pe) { + /* Hit the ceiling ? */ + if (parent_pe->type & EEH_PE_PHB) + break; + + /* Frozen parent PE ? */ + ret = eeh_ops->get_state(parent_pe, NULL); + if (ret > 0 && + (ret & active_flags) != active_flags) + pe = parent_pe; + + /* Next parent level */ + parent_pe = parent_pe->parent; + } + + eeh_stats.slot_resets++; + + /* Avoid repeated reports of this failure, including problems + * with other functions on this device, and functions under + * bridges. + */ + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_serialize_unlock(flags); + + /* Most EEH events are due to device driver bugs. Having + * a stack trace will help the device-driver authors figure + * out what happened. So print that out. + */ + phb_pe = eeh_phb_pe_get(pe->phb); + pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", + pe->phb->global_number, pe->addr); + pr_err("EEH: PE location: %s, PHB location: %s\n", + eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); + dump_stack(); + + eeh_send_failure_event(pe); + + return 1; + +dn_unlock: + eeh_serialize_unlock(flags); + return rc; +} + +EXPORT_SYMBOL_GPL(eeh_dev_check_failure); + +/** + * eeh_check_failure - Check if all 1's data is due to EEH slot freeze + * @token: I/O token, should be address in the form 0xA.... + * @val: value, should be all 1's (XXX why do we need this arg??) + * + * Check for an EEH failure at the given token address. Call this + * routine if the result of a read was all 0xff's and you want to + * find out if this is due to an EEH slot freeze event. This routine + * will query firmware for the EEH status. + * + * Note this routine is safe to call in an interrupt context. + */ +unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val) +{ + unsigned long addr; + struct eeh_dev *edev; + + /* Finding the phys addr + pci device; this is pretty quick. */ + addr = eeh_token_to_phys((unsigned long __force) token); + edev = eeh_addr_cache_get_dev(addr); + if (!edev) { + eeh_stats.no_device++; + return val; + } + + eeh_dev_check_failure(edev); + return val; +} + +EXPORT_SYMBOL(eeh_check_failure); + + +/** + * eeh_pci_enable - Enable MMIO or DMA transfers for this slot + * @pe: EEH PE + * + * This routine should be called to reenable frozen MMIO or DMA + * so that it would work correctly again. It's useful while doing + * recovery or log collection on the indicated device. + */ +int eeh_pci_enable(struct eeh_pe *pe, int function) +{ + int rc, flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); + + /* + * pHyp doesn't allow to enable IO or DMA on unfrozen PE. + * Also, it's pointless to enable them on unfrozen PE. So + * we have the check here. + */ + if (function == EEH_OPT_THAW_MMIO || + function == EEH_OPT_THAW_DMA) { + rc = eeh_ops->get_state(pe, NULL); + if (rc < 0) + return rc; + + /* Needn't to enable or already enabled */ + if ((rc == EEH_STATE_NOT_SUPPORT) || + ((rc & flags) == flags)) + return 0; + } + + rc = eeh_ops->set_option(pe, function); + if (rc) + pr_warn("%s: Unexpected state change %d on " + "PHB#%d-PE#%x, err=%d\n", + __func__, function, pe->phb->global_number, + pe->addr, rc); + + rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); + if (rc <= 0) + return rc; + + if ((function == EEH_OPT_THAW_MMIO) && + (rc & EEH_STATE_MMIO_ENABLED)) + return 0; + + if ((function == EEH_OPT_THAW_DMA) && + (rc & EEH_STATE_DMA_ENABLED)) + return 0; + + return rc; +} + +/** + * pcibios_set_pcie_slot_reset - Set PCI-E reset state + * @dev: pci device struct + * @state: reset state to enter + * + * Return value: + * 0 if success + */ +int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(dev); + struct eeh_pe *pe = edev->pe; + + if (!pe) { + pr_err("%s: No PE found on PCI device %s\n", + __func__, pci_name(dev)); + return -EINVAL; + } + + switch (state) { + case pcie_deassert_reset: + eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); + break; + case pcie_hot_reset: + eeh_ops->reset(pe, EEH_RESET_HOT); + break; + case pcie_warm_reset: + eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL); + break; + default: + return -EINVAL; + }; + + return 0; +} + +/** + * eeh_set_pe_freset - Check the required reset for the indicated device + * @data: EEH device + * @flag: return value + * + * Each device might have its preferred reset type: fundamental or + * hot reset. The routine is used to collected the information for + * the indicated device and its children so that the bunch of the + * devices could be reset properly. + */ +static void *eeh_set_dev_freset(void *data, void *flag) +{ + struct pci_dev *dev; + unsigned int *freset = (unsigned int *)flag; + struct eeh_dev *edev = (struct eeh_dev *)data; + + dev = eeh_dev_to_pci_dev(edev); + if (dev) + *freset |= dev->needs_freset; + + return NULL; +} + +/** + * eeh_reset_pe_once - Assert the pci #RST line for 1/4 second + * @pe: EEH PE + * + * Assert the PCI #RST line for 1/4 second. + */ +static void eeh_reset_pe_once(struct eeh_pe *pe) +{ + unsigned int freset = 0; + + /* Determine type of EEH reset required for + * Partitionable Endpoint, a hot-reset (1) + * or a fundamental reset (3). + * A fundamental reset required by any device under + * Partitionable Endpoint trumps hot-reset. + */ + eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset); + + if (freset) + eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL); + else + eeh_ops->reset(pe, EEH_RESET_HOT); + + eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); +} + +/** + * eeh_reset_pe - Reset the indicated PE + * @pe: EEH PE + * + * This routine should be called to reset indicated device, including + * PE. A PE might include multiple PCI devices and sometimes PCI bridges + * might be involved as well. + */ +int eeh_reset_pe(struct eeh_pe *pe) +{ + int flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); + int i, rc; + + /* Take three shots at resetting the bus */ + for (i=0; i<3; i++) { + eeh_reset_pe_once(pe); + + /* + * EEH_PE_ISOLATED is expected to be removed after + * BAR restore. + */ + rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); + if ((rc & flags) == flags) + return 0; + + if (rc < 0) { + pr_err("%s: Unrecoverable slot failure on PHB#%d-PE#%x", + __func__, pe->phb->global_number, pe->addr); + return -1; + } + pr_err("EEH: bus reset %d failed on PHB#%d-PE#%x, rc=%d\n", + i+1, pe->phb->global_number, pe->addr, rc); + } + + return -1; +} + +/** + * eeh_save_bars - Save device bars + * @edev: PCI device associated EEH device + * + * Save the values of the device bars. Unlike the restore + * routine, this routine is *not* recursive. This is because + * PCI devices are added individually; but, for the restore, + * an entire slot is reset at a time. + */ +void eeh_save_bars(struct eeh_dev *edev) +{ + int i; + struct device_node *dn; + + if (!edev) + return; + dn = eeh_dev_to_of_node(edev); + + for (i = 0; i < 16; i++) + eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]); + + /* + * For PCI bridges including root port, we need enable bus + * master explicitly. Otherwise, it can't fetch IODA table + * entries correctly. So we cache the bit in advance so that + * we can restore it after reset, either PHB range or PE range. + */ + if (edev->mode & EEH_DEV_BRIDGE) + edev->config_space[1] |= PCI_COMMAND_MASTER; +} + +/** + * eeh_ops_register - Register platform dependent EEH operations + * @ops: platform dependent EEH operations + * + * Register the platform dependent EEH operation callback + * functions. The platform should call this function before + * any other EEH operations. + */ +int __init eeh_ops_register(struct eeh_ops *ops) +{ + if (!ops->name) { + pr_warning("%s: Invalid EEH ops name for %p\n", + __func__, ops); + return -EINVAL; + } + + if (eeh_ops && eeh_ops != ops) { + pr_warning("%s: EEH ops of platform %s already existing (%s)\n", + __func__, eeh_ops->name, ops->name); + return -EEXIST; + } + + eeh_ops = ops; + + return 0; +} + +/** + * eeh_ops_unregister - Unreigster platform dependent EEH operations + * @name: name of EEH platform operations + * + * Unregister the platform dependent EEH operation callback + * functions. + */ +int __exit eeh_ops_unregister(const char *name) +{ + if (!name || !strlen(name)) { + pr_warning("%s: Invalid EEH ops name\n", + __func__); + return -EINVAL; + } + + if (eeh_ops && !strcmp(eeh_ops->name, name)) { + eeh_ops = NULL; + return 0; + } + + return -EEXIST; +} + +static int eeh_reboot_notifier(struct notifier_block *nb, + unsigned long action, void *unused) +{ + eeh_set_enable(false); + return NOTIFY_DONE; +} + +static struct notifier_block eeh_reboot_nb = { + .notifier_call = eeh_reboot_notifier, +}; + +/** + * eeh_init - EEH initialization + * + * Initialize EEH by trying to enable it for all of the adapters in the system. + * As a side effect we can determine here if eeh is supported at all. + * Note that we leave EEH on so failed config cycles won't cause a machine + * check. If a user turns off EEH for a particular adapter they are really + * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't + * grant access to a slot if EEH isn't enabled, and so we always enable + * EEH for all slots/all devices. + * + * The eeh-force-off option disables EEH checking globally, for all slots. + * Even if force-off is set, the EEH hardware is still enabled, so that + * newer systems can boot. + */ +int eeh_init(void) +{ + struct pci_controller *hose, *tmp; + struct device_node *phb; + static int cnt = 0; + int ret = 0; + + /* + * We have to delay the initialization on PowerNV after + * the PCI hierarchy tree has been built because the PEs + * are figured out based on PCI devices instead of device + * tree nodes + */ + if (machine_is(powernv) && cnt++ <= 0) + return ret; + + /* Register reboot notifier */ + ret = register_reboot_notifier(&eeh_reboot_nb); + if (ret) { + pr_warn("%s: Failed to register notifier (%d)\n", + __func__, ret); + return ret; + } + + /* call platform initialization function */ + if (!eeh_ops) { + pr_warning("%s: Platform EEH operation not found\n", + __func__); + return -EEXIST; + } else if ((ret = eeh_ops->init())) { + pr_warning("%s: Failed to call platform init function (%d)\n", + __func__, ret); + return ret; + } + + /* Initialize EEH event */ + ret = eeh_event_init(); + if (ret) + return ret; + + /* Enable EEH for all adapters */ + if (eeh_probe_mode_devtree()) { + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) { + phb = hose->dn; + traverse_pci_devices(phb, eeh_ops->of_probe, NULL); + } + } else if (eeh_probe_mode_dev()) { + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) + pci_walk_bus(hose->bus, eeh_ops->dev_probe, NULL); + } else { + pr_warn("%s: Invalid probe mode %x", + __func__, eeh_subsystem_flags); + return -EINVAL; + } + + /* + * Call platform post-initialization. Actually, It's good chance + * to inform platform that EEH is ready to supply service if the + * I/O cache stuff has been built up. + */ + if (eeh_ops->post_init) { + ret = eeh_ops->post_init(); + if (ret) + return ret; + } + + if (eeh_enabled()) + pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); + else + pr_warning("EEH: No capable adapters found\n"); + + return ret; +} + +core_initcall_sync(eeh_init); + +/** + * eeh_add_device_early - Enable EEH for the indicated device_node + * @dn: device node for which to set up EEH + * + * This routine must be used to perform EEH initialization for PCI + * devices that were added after system boot (e.g. hotplug, dlpar). + * This routine must be called before any i/o is performed to the + * adapter (inluding any config-space i/o). + * Whether this actually enables EEH or not for this device depends + * on the CEC architecture, type of the device, on earlier boot + * command-line arguments & etc. + */ +void eeh_add_device_early(struct device_node *dn) +{ + struct pci_controller *phb; + + /* + * If we're doing EEH probe based on PCI device, we + * would delay the probe until late stage because + * the PCI device isn't available this moment. + */ + if (!eeh_probe_mode_devtree()) + return; + + if (!of_node_to_eeh_dev(dn)) + return; + phb = of_node_to_eeh_dev(dn)->phb; + + /* USB Bus children of PCI devices will not have BUID's */ + if (NULL == phb || 0 == phb->buid) + return; + + eeh_ops->of_probe(dn, NULL); +} + +/** + * eeh_add_device_tree_early - Enable EEH for the indicated device + * @dn: device node + * + * This routine must be used to perform EEH initialization for the + * indicated PCI device that was added after system boot (e.g. + * hotplug, dlpar). + */ +void eeh_add_device_tree_early(struct device_node *dn) +{ + struct device_node *sib; + + for_each_child_of_node(dn, sib) + eeh_add_device_tree_early(sib); + eeh_add_device_early(dn); +} +EXPORT_SYMBOL_GPL(eeh_add_device_tree_early); + +/** + * eeh_add_device_late - Perform EEH initialization for the indicated pci device + * @dev: pci device for which to set up EEH + * + * This routine must be used to complete EEH initialization for PCI + * devices that were added after system boot (e.g. hotplug, dlpar). + */ +void eeh_add_device_late(struct pci_dev *dev) +{ + struct device_node *dn; + struct eeh_dev *edev; + + if (!dev || !eeh_enabled()) + return; + + pr_debug("EEH: Adding device %s\n", pci_name(dev)); + + dn = pci_device_to_OF_node(dev); + edev = of_node_to_eeh_dev(dn); + if (edev->pdev == dev) { + pr_debug("EEH: Already referenced !\n"); + return; + } + + /* + * The EEH cache might not be removed correctly because of + * unbalanced kref to the device during unplug time, which + * relies on pcibios_release_device(). So we have to remove + * that here explicitly. + */ + if (edev->pdev) { + eeh_rmv_from_parent_pe(edev); + eeh_addr_cache_rmv_dev(edev->pdev); + eeh_sysfs_remove_device(edev->pdev); + edev->mode &= ~EEH_DEV_SYSFS; + + /* + * We definitely should have the PCI device removed + * though it wasn't correctly. So we needn't call + * into error handler afterwards. + */ + edev->mode |= EEH_DEV_NO_HANDLER; + + edev->pdev = NULL; + dev->dev.archdata.edev = NULL; + } + + edev->pdev = dev; + dev->dev.archdata.edev = edev; + + /* + * We have to do the EEH probe here because the PCI device + * hasn't been created yet in the early stage. + */ + if (eeh_probe_mode_dev()) + eeh_ops->dev_probe(dev, NULL); + + eeh_addr_cache_insert_dev(dev); +} + +/** + * eeh_add_device_tree_late - Perform EEH initialization for the indicated PCI bus + * @bus: PCI bus + * + * This routine must be used to perform EEH initialization for PCI + * devices which are attached to the indicated PCI bus. The PCI bus + * is added after system boot through hotplug or dlpar. + */ +void eeh_add_device_tree_late(struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + eeh_add_device_late(dev); + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { + struct pci_bus *subbus = dev->subordinate; + if (subbus) + eeh_add_device_tree_late(subbus); + } + } +} +EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); + +/** + * eeh_add_sysfs_files - Add EEH sysfs files for the indicated PCI bus + * @bus: PCI bus + * + * This routine must be used to add EEH sysfs files for PCI + * devices which are attached to the indicated PCI bus. The PCI bus + * is added after system boot through hotplug or dlpar. + */ +void eeh_add_sysfs_files(struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + eeh_sysfs_add_device(dev); + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { + struct pci_bus *subbus = dev->subordinate; + if (subbus) + eeh_add_sysfs_files(subbus); + } + } +} +EXPORT_SYMBOL_GPL(eeh_add_sysfs_files); + +/** + * eeh_remove_device - Undo EEH setup for the indicated pci device + * @dev: pci device to be removed + * + * This routine should be called when a device is removed from + * a running system (e.g. by hotplug or dlpar). It unregisters + * the PCI device from the EEH subsystem. I/O errors affecting + * this device will no longer be detected after this call; thus, + * i/o errors affecting this slot may leave this device unusable. + */ +void eeh_remove_device(struct pci_dev *dev) +{ + struct eeh_dev *edev; + + if (!dev || !eeh_enabled()) + return; + edev = pci_dev_to_eeh_dev(dev); + + /* Unregister the device with the EEH/PCI address search system */ + pr_debug("EEH: Removing device %s\n", pci_name(dev)); + + if (!edev || !edev->pdev || !edev->pe) { + pr_debug("EEH: Not referenced !\n"); + return; + } + + /* + * During the hotplug for EEH error recovery, we need the EEH + * device attached to the parent PE in order for BAR restore + * a bit later. So we keep it for BAR restore and remove it + * from the parent PE during the BAR resotre. + */ + edev->pdev = NULL; + dev->dev.archdata.edev = NULL; + if (!(edev->pe->state & EEH_PE_KEEP)) + eeh_rmv_from_parent_pe(edev); + else + edev->mode |= EEH_DEV_DISCONNECTED; + + /* + * We're removing from the PCI subsystem, that means + * the PCI device driver can't support EEH or not + * well. So we rely on hotplug completely to do recovery + * for the specific PCI device. + */ + edev->mode |= EEH_DEV_NO_HANDLER; + + eeh_addr_cache_rmv_dev(dev); + eeh_sysfs_remove_device(dev); + edev->mode &= ~EEH_DEV_SYSFS; +} + +static int proc_eeh_show(struct seq_file *m, void *v) +{ + if (!eeh_enabled()) { + seq_printf(m, "EEH Subsystem is globally disabled\n"); + seq_printf(m, "eeh_total_mmio_ffs=%llu\n", eeh_stats.total_mmio_ffs); + } else { + seq_printf(m, "EEH Subsystem is enabled\n"); + seq_printf(m, + "no device=%llu\n" + "no device node=%llu\n" + "no config address=%llu\n" + "check not wanted=%llu\n" + "eeh_total_mmio_ffs=%llu\n" + "eeh_false_positives=%llu\n" + "eeh_slot_resets=%llu\n", + eeh_stats.no_device, + eeh_stats.no_dn, + eeh_stats.no_cfg_addr, + eeh_stats.ignored_check, + eeh_stats.total_mmio_ffs, + eeh_stats.false_positives, + eeh_stats.slot_resets); + } + + return 0; +} + +static int proc_eeh_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_eeh_show, NULL); +} + +static const struct file_operations proc_eeh_operations = { + .open = proc_eeh_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef CONFIG_DEBUG_FS +static int eeh_enable_dbgfs_set(void *data, u64 val) +{ + if (val) + eeh_subsystem_flags &= ~EEH_FORCE_DISABLED; + else + eeh_subsystem_flags |= EEH_FORCE_DISABLED; + + /* Notify the backend */ + if (eeh_ops->post_init) + eeh_ops->post_init(); + + return 0; +} + +static int eeh_enable_dbgfs_get(void *data, u64 *val) +{ + if (eeh_enabled()) + *val = 0x1ul; + else + *val = 0x0ul; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get, + eeh_enable_dbgfs_set, "0x%llx\n"); +#endif + +static int __init eeh_init_proc(void) +{ + if (machine_is(pseries) || machine_is(powernv)) { + proc_create("powerpc/eeh", 0, NULL, &proc_eeh_operations); +#ifdef CONFIG_DEBUG_FS + debugfs_create_file("eeh_enable", 0600, + powerpc_debugfs_root, NULL, + &eeh_enable_dbgfs_ops); +#endif + } + + return 0; +} +__initcall(eeh_init_proc); diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c new file mode 100644 index 00000000000..e8c9fd546a5 --- /dev/null +++ b/arch/powerpc/kernel/eeh_cache.c @@ -0,0 +1,310 @@ +/* + * PCI address cache; allows the lookup of PCI devices based on I/O address + * + * Copyright IBM Corporation 2004 + * Copyright Linas Vepstas <linas@austin.ibm.com> 2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/rbtree.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> + + +/** + * The pci address cache subsystem. This subsystem places + * PCI device address resources into a red-black tree, sorted + * according to the address range, so that given only an i/o + * address, the corresponding PCI device can be **quickly** + * found. It is safe to perform an address lookup in an interrupt + * context; this ability is an important feature. + * + * Currently, the only customer of this code is the EEH subsystem; + * thus, this code has been somewhat tailored to suit EEH better. + * In particular, the cache does *not* hold the addresses of devices + * for which EEH is not enabled. + * + * (Implementation Note: The RB tree seems to be better/faster + * than any hash algo I could think of for this problem, even + * with the penalty of slow pointer chases for d-cache misses). + */ +struct pci_io_addr_range { + struct rb_node rb_node; + unsigned long addr_lo; + unsigned long addr_hi; + struct eeh_dev *edev; + struct pci_dev *pcidev; + unsigned int flags; +}; + +static struct pci_io_addr_cache { + struct rb_root rb_root; + spinlock_t piar_lock; +} pci_io_addr_cache_root; + +static inline struct eeh_dev *__eeh_addr_cache_get_device(unsigned long addr) +{ + struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node; + + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + + if (addr < piar->addr_lo) + n = n->rb_left; + else if (addr > piar->addr_hi) + n = n->rb_right; + else + return piar->edev; + } + + return NULL; +} + +/** + * eeh_addr_cache_get_dev - Get device, given only address + * @addr: mmio (PIO) phys address or i/o port number + * + * Given an mmio phys address, or a port number, find a pci device + * that implements this address. Be sure to pci_dev_put the device + * when finished. I/O port numbers are assumed to be offset + * from zero (that is, they do *not* have pci_io_addr added in). + * It is safe to call this function within an interrupt. + */ +struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr) +{ + struct eeh_dev *edev; + unsigned long flags; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + edev = __eeh_addr_cache_get_device(addr); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); + return edev; +} + +#ifdef DEBUG +/* + * Handy-dandy debug print routine, does nothing more + * than print out the contents of our addr cache. + */ +static void eeh_addr_cache_print(struct pci_io_addr_cache *cache) +{ + struct rb_node *n; + int cnt = 0; + + n = rb_first(&cache->rb_root); + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + pr_debug("PCI: %s addr range %d [%lx-%lx]: %s\n", + (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt, + piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev)); + cnt++; + n = rb_next(n); + } +} +#endif + +/* Insert address range into the rb tree. */ +static struct pci_io_addr_range * +eeh_addr_cache_insert(struct pci_dev *dev, unsigned long alo, + unsigned long ahi, unsigned int flags) +{ + struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node; + struct rb_node *parent = NULL; + struct pci_io_addr_range *piar; + + /* Walk tree, find a place to insert into tree */ + while (*p) { + parent = *p; + piar = rb_entry(parent, struct pci_io_addr_range, rb_node); + if (ahi < piar->addr_lo) { + p = &parent->rb_left; + } else if (alo > piar->addr_hi) { + p = &parent->rb_right; + } else { + if (dev != piar->pcidev || + alo != piar->addr_lo || ahi != piar->addr_hi) { + pr_warning("PIAR: overlapping address range\n"); + } + return piar; + } + } + piar = kzalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC); + if (!piar) + return NULL; + + piar->addr_lo = alo; + piar->addr_hi = ahi; + piar->edev = pci_dev_to_eeh_dev(dev); + piar->pcidev = dev; + piar->flags = flags; + +#ifdef DEBUG + pr_debug("PIAR: insert range=[%lx:%lx] dev=%s\n", + alo, ahi, pci_name(dev)); +#endif + + rb_link_node(&piar->rb_node, parent, p); + rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); + + return piar; +} + +static void __eeh_addr_cache_insert_dev(struct pci_dev *dev) +{ + struct device_node *dn; + struct eeh_dev *edev; + int i; + + dn = pci_device_to_OF_node(dev); + if (!dn) { + pr_warning("PCI: no pci dn found for dev=%s\n", pci_name(dev)); + return; + } + + edev = of_node_to_eeh_dev(dn); + if (!edev) { + pr_warning("PCI: no EEH dev found for dn=%s\n", + dn->full_name); + return; + } + + /* Skip any devices for which EEH is not enabled. */ + if (!eeh_probe_mode_dev() && !edev->pe) { +#ifdef DEBUG + pr_info("PCI: skip building address cache for=%s - %s\n", + pci_name(dev), dn->full_name); +#endif + return; + } + + /* Walk resources on this device, poke them into the tree */ + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + unsigned long start = pci_resource_start(dev,i); + unsigned long end = pci_resource_end(dev,i); + unsigned int flags = pci_resource_flags(dev,i); + + /* We are interested only bus addresses, not dma or other stuff */ + if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM))) + continue; + if (start == 0 || ~start == 0 || end == 0 || ~end == 0) + continue; + eeh_addr_cache_insert(dev, start, end, flags); + } +} + +/** + * eeh_addr_cache_insert_dev - Add a device to the address cache + * @dev: PCI device whose I/O addresses we are interested in. + * + * In order to support the fast lookup of devices based on addresses, + * we maintain a cache of devices that can be quickly searched. + * This routine adds a device to that cache. + */ +void eeh_addr_cache_insert_dev(struct pci_dev *dev) +{ + unsigned long flags; + + /* Ignore PCI bridges */ + if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE) + return; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + __eeh_addr_cache_insert_dev(dev); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); +} + +static inline void __eeh_addr_cache_rmv_dev(struct pci_dev *dev) +{ + struct rb_node *n; + +restart: + n = rb_first(&pci_io_addr_cache_root.rb_root); + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + + if (piar->pcidev == dev) { + rb_erase(n, &pci_io_addr_cache_root.rb_root); + kfree(piar); + goto restart; + } + n = rb_next(n); + } +} + +/** + * eeh_addr_cache_rmv_dev - remove pci device from addr cache + * @dev: device to remove + * + * Remove a device from the addr-cache tree. + * This is potentially expensive, since it will walk + * the tree multiple times (once per resource). + * But so what; device removal doesn't need to be that fast. + */ +void eeh_addr_cache_rmv_dev(struct pci_dev *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + __eeh_addr_cache_rmv_dev(dev); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); +} + +/** + * eeh_addr_cache_build - Build a cache of I/O addresses + * + * Build a cache of pci i/o addresses. This cache will be used to + * find the pci device that corresponds to a given address. + * This routine scans all pci busses to build the cache. + * Must be run late in boot process, after the pci controllers + * have been scanned for devices (after all device resources are known). + */ +void eeh_addr_cache_build(void) +{ + struct device_node *dn; + struct eeh_dev *edev; + struct pci_dev *dev = NULL; + + spin_lock_init(&pci_io_addr_cache_root.piar_lock); + + for_each_pci_dev(dev) { + dn = pci_device_to_OF_node(dev); + if (!dn) + continue; + + edev = of_node_to_eeh_dev(dn); + if (!edev) + continue; + + dev->dev.archdata.edev = edev; + edev->pdev = dev; + + eeh_addr_cache_insert_dev(dev); + eeh_sysfs_add_device(dev); + } + +#ifdef DEBUG + /* Verify tree built up above, echo back the list of addrs. */ + eeh_addr_cache_print(&pci_io_addr_cache_root); +#endif +} diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c new file mode 100644 index 00000000000..1efa28f5fc5 --- /dev/null +++ b/arch/powerpc/kernel/eeh_dev.c @@ -0,0 +1,112 @@ +/* + * The file intends to implement dynamic creation of EEH device, which will + * be bound with OF node and PCI device simutaneously. The EEH devices would + * be foundamental information for EEH core components to work proerly. Besides, + * We have to support multiple situations where dynamic creation of EEH device + * is required: + * + * 1) Before PCI emunation starts, we need create EEH devices according to the + * PCI sensitive OF nodes. + * 2) When PCI emunation is done, we need do the binding between PCI device and + * the associated EEH device. + * 3) DR (Dynamic Reconfiguration) would create PCI sensitive OF node. EEH device + * will be created while PCI sensitive OF node is detected from DR. + * 4) PCI hotplug needs redoing the binding between PCI device and EEH device. If + * PHB is newly inserted, we also need create EEH devices accordingly. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/export.h> +#include <linux/gfp.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> + +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> + +/** + * eeh_dev_init - Create EEH device according to OF node + * @dn: device node + * @data: PHB + * + * It will create EEH device according to the given OF node. The function + * might be called by PCI emunation, DR, PHB hotplug. + */ +void *eeh_dev_init(struct device_node *dn, void *data) +{ + struct pci_controller *phb = data; + struct eeh_dev *edev; + + /* Allocate EEH device */ + edev = kzalloc(sizeof(*edev), GFP_KERNEL); + if (!edev) { + pr_warning("%s: out of memory\n", __func__); + return NULL; + } + + /* Associate EEH device with OF node */ + PCI_DN(dn)->edev = edev; + edev->dn = dn; + edev->phb = phb; + INIT_LIST_HEAD(&edev->list); + + return NULL; +} + +/** + * eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB + * @phb: PHB + * + * Scan the PHB OF node and its child association, then create the + * EEH devices accordingly + */ +void eeh_dev_phb_init_dynamic(struct pci_controller *phb) +{ + struct device_node *dn = phb->dn; + + /* EEH PE for PHB */ + eeh_phb_pe_create(phb); + + /* EEH device for PHB */ + eeh_dev_init(dn, phb); + + /* EEH devices for children OF nodes */ + traverse_pci_devices(dn, eeh_dev_init, phb); +} + +/** + * eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs + * + * Scan all the existing PHBs and create EEH devices for their OF + * nodes and their children OF nodes + */ +static int __init eeh_dev_phb_init(void) +{ + struct pci_controller *phb, *tmp; + + list_for_each_entry_safe(phb, tmp, &hose_list, list_node) + eeh_dev_phb_init_dynamic(phb); + + pr_info("EEH: devices created\n"); + + return 0; +} + +core_initcall(eeh_dev_phb_init); diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c new file mode 100644 index 00000000000..420da61d4ce --- /dev/null +++ b/arch/powerpc/kernel/eeh_driver.c @@ -0,0 +1,870 @@ +/* + * PCI Error Recovery Driver for RPA-compliant PPC64 platform. + * Copyright IBM Corp. 2004 2005 + * Copyright Linas Vepstas <linas@linas.org> 2004, 2005 + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> + */ +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/ppc-pci.h> +#include <asm/pci-bridge.h> +#include <asm/prom.h> +#include <asm/rtas.h> + +/** + * eeh_pcid_name - Retrieve name of PCI device driver + * @pdev: PCI device + * + * This routine is used to retrieve the name of PCI device driver + * if that's valid. + */ +static inline const char *eeh_pcid_name(struct pci_dev *pdev) +{ + if (pdev && pdev->dev.driver) + return pdev->dev.driver->name; + return ""; +} + +/** + * eeh_pcid_get - Get the PCI device driver + * @pdev: PCI device + * + * The function is used to retrieve the PCI device driver for + * the indicated PCI device. Besides, we will increase the reference + * of the PCI device driver to prevent that being unloaded on + * the fly. Otherwise, kernel crash would be seen. + */ +static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) +{ + if (!pdev || !pdev->driver) + return NULL; + + if (!try_module_get(pdev->driver->driver.owner)) + return NULL; + + return pdev->driver; +} + +/** + * eeh_pcid_put - Dereference on the PCI device driver + * @pdev: PCI device + * + * The function is called to do dereference on the PCI device + * driver of the indicated PCI device. + */ +static inline void eeh_pcid_put(struct pci_dev *pdev) +{ + if (!pdev || !pdev->driver) + return; + + module_put(pdev->driver->driver.owner); +} + +#if 0 +static void print_device_node_tree(struct pci_dn *pdn, int dent) +{ + int i; + struct device_node *pc; + + if (!pdn) + return; + for (i = 0; i < dent; i++) + printk(" "); + printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n", + pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr, + pdn->eeh_pe_config_addr, pdn->node->full_name); + dent += 3; + pc = pdn->node->child; + while (pc) { + print_device_node_tree(PCI_DN(pc), dent); + pc = pc->sibling; + } +} +#endif + +/** + * eeh_disable_irq - Disable interrupt for the recovering device + * @dev: PCI device + * + * This routine must be called when reporting temporary or permanent + * error to the particular PCI device to disable interrupt of that + * device. If the device has enabled MSI or MSI-X interrupt, we needn't + * do real work because EEH should freeze DMA transfers for those PCI + * devices encountering EEH errors, which includes MSI or MSI-X. + */ +static void eeh_disable_irq(struct pci_dev *dev) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(dev); + + /* Don't disable MSI and MSI-X interrupts. They are + * effectively disabled by the DMA Stopped state + * when an EEH error occurs. + */ + if (dev->msi_enabled || dev->msix_enabled) + return; + + if (!irq_has_action(dev->irq)) + return; + + edev->mode |= EEH_DEV_IRQ_DISABLED; + disable_irq_nosync(dev->irq); +} + +/** + * eeh_enable_irq - Enable interrupt for the recovering device + * @dev: PCI device + * + * This routine must be called to enable interrupt while failed + * device could be resumed. + */ +static void eeh_enable_irq(struct pci_dev *dev) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(dev); + + if ((edev->mode) & EEH_DEV_IRQ_DISABLED) { + edev->mode &= ~EEH_DEV_IRQ_DISABLED; + /* + * FIXME !!!!! + * + * This is just ass backwards. This maze has + * unbalanced irq_enable/disable calls. So instead of + * finding the root cause it works around the warning + * in the irq_enable code by conditionally calling + * into it. + * + * That's just wrong.The warning in the core code is + * there to tell people to fix their assymetries in + * their own code, not by abusing the core information + * to avoid it. + * + * I so wish that the assymetry would be the other way + * round and a few more irq_disable calls render that + * shit unusable forever. + * + * tglx + */ + if (irqd_irq_disabled(irq_get_irq_data(dev->irq))) + enable_irq(dev->irq); + } +} + +static bool eeh_dev_removed(struct eeh_dev *edev) +{ + /* EEH device removed ? */ + if (!edev || (edev->mode & EEH_DEV_REMOVED)) + return true; + + return false; +} + +/** + * eeh_report_error - Report pci error to each device driver + * @data: eeh device + * @userdata: return value + * + * Report an EEH error to each device driver, collect up and + * merge the device driver responses. Cumulative response + * passed back in "userdata". + */ +static void *eeh_report_error(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + enum pci_ers_result rc, *res = userdata; + struct pci_driver *driver; + + if (!dev || eeh_dev_removed(edev)) + return NULL; + dev->error_state = pci_channel_io_frozen; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + eeh_disable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->error_detected) { + eeh_pcid_put(dev); + return NULL; + } + + rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); + + /* A driver that needs a reset trumps all others */ + if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + if (*res == PCI_ERS_RESULT_NONE) *res = rc; + + eeh_pcid_put(dev); + return NULL; +} + +/** + * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled + * @data: eeh device + * @userdata: return value + * + * Tells each device driver that IO ports, MMIO and config space I/O + * are now enabled. Collects up and merges the device driver responses. + * Cumulative response passed back in "userdata". + */ +static void *eeh_report_mmio_enabled(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + enum pci_ers_result rc, *res = userdata; + struct pci_driver *driver; + + if (!dev || eeh_dev_removed(edev)) + return NULL; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + if (!driver->err_handler || + !driver->err_handler->mmio_enabled || + (edev->mode & EEH_DEV_NO_HANDLER)) { + eeh_pcid_put(dev); + return NULL; + } + + rc = driver->err_handler->mmio_enabled(dev); + + /* A driver that needs a reset trumps all others */ + if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + if (*res == PCI_ERS_RESULT_NONE) *res = rc; + + eeh_pcid_put(dev); + return NULL; +} + +/** + * eeh_report_reset - Tell device that slot has been reset + * @data: eeh device + * @userdata: return value + * + * This routine must be called while EEH tries to reset particular + * PCI device so that the associated PCI device driver could take + * some actions, usually to save data the driver needs so that the + * driver can work again while the device is recovered. + */ +static void *eeh_report_reset(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + enum pci_ers_result rc, *res = userdata; + struct pci_driver *driver; + + if (!dev || eeh_dev_removed(edev)) + return NULL; + dev->error_state = pci_channel_io_normal; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + eeh_enable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->slot_reset || + (edev->mode & EEH_DEV_NO_HANDLER)) { + eeh_pcid_put(dev); + return NULL; + } + + rc = driver->err_handler->slot_reset(dev); + if ((*res == PCI_ERS_RESULT_NONE) || + (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc; + if (*res == PCI_ERS_RESULT_DISCONNECT && + rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + + eeh_pcid_put(dev); + return NULL; +} + +/** + * eeh_report_resume - Tell device to resume normal operations + * @data: eeh device + * @userdata: return value + * + * This routine must be called to notify the device driver that it + * could resume so that the device driver can do some initialization + * to make the recovered device work again. + */ +static void *eeh_report_resume(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + struct pci_driver *driver; + + if (!dev || eeh_dev_removed(edev)) + return NULL; + dev->error_state = pci_channel_io_normal; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + eeh_enable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->resume || + (edev->mode & EEH_DEV_NO_HANDLER)) { + edev->mode &= ~EEH_DEV_NO_HANDLER; + eeh_pcid_put(dev); + return NULL; + } + + driver->err_handler->resume(dev); + + eeh_pcid_put(dev); + return NULL; +} + +/** + * eeh_report_failure - Tell device driver that device is dead. + * @data: eeh device + * @userdata: return value + * + * This informs the device driver that the device is permanently + * dead, and that no further recovery attempts will be made on it. + */ +static void *eeh_report_failure(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + struct pci_driver *driver; + + if (!dev || eeh_dev_removed(edev)) + return NULL; + dev->error_state = pci_channel_io_perm_failure; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + eeh_disable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->error_detected) { + eeh_pcid_put(dev); + return NULL; + } + + driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); + + eeh_pcid_put(dev); + return NULL; +} + +static void *eeh_rmv_device(void *data, void *userdata) +{ + struct pci_driver *driver; + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + int *removed = (int *)userdata; + + /* + * Actually, we should remove the PCI bridges as well. + * However, that's lots of complexity to do that, + * particularly some of devices under the bridge might + * support EEH. So we just care about PCI devices for + * simplicity here. + */ + if (!dev || (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE)) + return NULL; + + /* + * We rely on count-based pcibios_release_device() to + * detach permanently offlined PEs. Unfortunately, that's + * not reliable enough. We might have the permanently + * offlined PEs attached, but we needn't take care of + * them and their child devices. + */ + if (eeh_dev_removed(edev)) + return NULL; + + driver = eeh_pcid_get(dev); + if (driver) { + eeh_pcid_put(dev); + if (driver->err_handler) + return NULL; + } + + /* Remove it from PCI subsystem */ + pr_debug("EEH: Removing %s without EEH sensitive driver\n", + pci_name(dev)); + edev->bus = dev->bus; + edev->mode |= EEH_DEV_DISCONNECTED; + (*removed)++; + + pci_lock_rescan_remove(); + pci_stop_and_remove_bus_device(dev); + pci_unlock_rescan_remove(); + + return NULL; +} + +static void *eeh_pe_detach_dev(void *data, void *userdata) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + struct eeh_dev *edev, *tmp; + + eeh_pe_for_each_dev(pe, edev, tmp) { + if (!(edev->mode & EEH_DEV_DISCONNECTED)) + continue; + + edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED); + eeh_rmv_from_parent_pe(edev); + } + + return NULL; +} + +/* + * Explicitly clear PE's frozen state for PowerNV where + * we have frozen PE until BAR restore is completed. It's + * harmless to clear it for pSeries. To be consistent with + * PE reset (for 3 times), we try to clear the frozen state + * for 3 times as well. + */ +static void *__eeh_clear_pe_frozen_state(void *data, void *flag) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + int i, rc; + + for (i = 0; i < 3; i++) { + rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); + if (rc) + continue; + rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); + if (!rc) + break; + } + + /* The PE has been isolated, clear it */ + if (rc) { + pr_warn("%s: Can't clear frozen PHB#%x-PE#%x (%d)\n", + __func__, pe->phb->global_number, pe->addr, rc); + return (void *)pe; + } + + return NULL; +} + +static int eeh_clear_pe_frozen_state(struct eeh_pe *pe) +{ + void *rc; + + rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, NULL); + if (!rc) + eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + + return rc ? -EIO : 0; +} + +/** + * eeh_reset_device - Perform actual reset of a pci slot + * @pe: EEH PE + * @bus: PCI bus corresponding to the isolcated slot + * + * This routine must be called to do reset on the indicated PE. + * During the reset, udev might be invoked because those affected + * PCI devices will be removed and then added. + */ +static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) +{ + struct pci_bus *frozen_bus = eeh_pe_bus_get(pe); + struct timeval tstamp; + int cnt, rc, removed = 0; + + /* pcibios will clear the counter; save the value */ + cnt = pe->freeze_count; + tstamp = pe->tstamp; + + /* + * We don't remove the corresponding PE instances because + * we need the information afterwords. The attached EEH + * devices are expected to be attached soon when calling + * into pcibios_add_pci_devices(). + */ + eeh_pe_state_mark(pe, EEH_PE_KEEP); + if (bus) { + pci_lock_rescan_remove(); + pcibios_remove_pci_devices(bus); + pci_unlock_rescan_remove(); + } else if (frozen_bus) { + eeh_pe_dev_traverse(pe, eeh_rmv_device, &removed); + } + + /* + * Reset the pci controller. (Asserts RST#; resets config space). + * Reconfigure bridges and devices. Don't try to bring the system + * up if the reset failed for some reason. + * + * During the reset, it's very dangerous to have uncontrolled PCI + * config accesses. So we prefer to block them. However, controlled + * PCI config accesses initiated from EEH itself are allowed. + */ + eeh_pe_state_mark(pe, EEH_PE_RESET); + rc = eeh_reset_pe(pe); + if (rc) { + eeh_pe_state_clear(pe, EEH_PE_RESET); + return rc; + } + + pci_lock_rescan_remove(); + + /* Restore PE */ + eeh_ops->configure_bridge(pe); + eeh_pe_restore_bars(pe); + eeh_pe_state_clear(pe, EEH_PE_RESET); + + /* Clear frozen state */ + rc = eeh_clear_pe_frozen_state(pe); + if (rc) + return rc; + + /* Give the system 5 seconds to finish running the user-space + * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, + * this is a hack, but if we don't do this, and try to bring + * the device up before the scripts have taken it down, + * potentially weird things happen. + */ + if (bus) { + pr_info("EEH: Sleep 5s ahead of complete hotplug\n"); + ssleep(5); + + /* + * The EEH device is still connected with its parent + * PE. We should disconnect it so the binding can be + * rebuilt when adding PCI devices. + */ + eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); + pcibios_add_pci_devices(bus); + } else if (frozen_bus && removed) { + pr_info("EEH: Sleep 5s ahead of partial hotplug\n"); + ssleep(5); + + eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); + pcibios_add_pci_devices(frozen_bus); + } + eeh_pe_state_clear(pe, EEH_PE_KEEP); + + pe->tstamp = tstamp; + pe->freeze_count = cnt; + + pci_unlock_rescan_remove(); + return 0; +} + +/* The longest amount of time to wait for a pci device + * to come back on line, in seconds. + */ +#define MAX_WAIT_FOR_RECOVERY 300 + +static void eeh_handle_normal_event(struct eeh_pe *pe) +{ + struct pci_bus *frozen_bus; + int rc = 0; + enum pci_ers_result result = PCI_ERS_RESULT_NONE; + + frozen_bus = eeh_pe_bus_get(pe); + if (!frozen_bus) { + pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n", + __func__, pe->phb->global_number, pe->addr); + return; + } + + eeh_pe_update_time_stamp(pe); + pe->freeze_count++; + if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) + goto excess_failures; + pr_warning("EEH: This PCI device has failed %d times in the last hour\n", + pe->freeze_count); + + /* Walk the various device drivers attached to this slot through + * a reset sequence, giving each an opportunity to do what it needs + * to accomplish the reset. Each child gets a report of the + * status ... if any child can't handle the reset, then the entire + * slot is dlpar removed and added. + */ + pr_info("EEH: Notify device drivers to shutdown\n"); + eeh_pe_dev_traverse(pe, eeh_report_error, &result); + + /* Get the current PCI slot state. This can take a long time, + * sometimes over 3 seconds for certain systems. + */ + rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); + if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { + pr_warning("EEH: Permanent failure\n"); + goto hard_fail; + } + + /* Since rtas may enable MMIO when posting the error log, + * don't post the error log until after all dev drivers + * have been informed. + */ + pr_info("EEH: Collect temporary log\n"); + eeh_slot_error_detail(pe, EEH_LOG_TEMP); + + /* If all device drivers were EEH-unaware, then shut + * down all of the device drivers, and hope they + * go down willingly, without panicing the system. + */ + if (result == PCI_ERS_RESULT_NONE) { + pr_info("EEH: Reset with hotplug activity\n"); + rc = eeh_reset_device(pe, frozen_bus); + if (rc) { + pr_warning("%s: Unable to reset, err=%d\n", + __func__, rc); + goto hard_fail; + } + } + + /* If all devices reported they can proceed, then re-enable MMIO */ + if (result == PCI_ERS_RESULT_CAN_RECOVER) { + pr_info("EEH: Enable I/O for affected devices\n"); + rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); + + if (rc < 0) + goto hard_fail; + if (rc) { + result = PCI_ERS_RESULT_NEED_RESET; + } else { + pr_info("EEH: Notify device drivers to resume I/O\n"); + eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result); + } + } + + /* If all devices reported they can proceed, then re-enable DMA */ + if (result == PCI_ERS_RESULT_CAN_RECOVER) { + pr_info("EEH: Enabled DMA for affected devices\n"); + rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); + + if (rc < 0) + goto hard_fail; + if (rc) { + result = PCI_ERS_RESULT_NEED_RESET; + } else { + /* + * We didn't do PE reset for the case. The PE + * is still in frozen state. Clear it before + * resuming the PE. + */ + eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + result = PCI_ERS_RESULT_RECOVERED; + } + } + + /* If any device has a hard failure, then shut off everything. */ + if (result == PCI_ERS_RESULT_DISCONNECT) { + pr_warning("EEH: Device driver gave up\n"); + goto hard_fail; + } + + /* If any device called out for a reset, then reset the slot */ + if (result == PCI_ERS_RESULT_NEED_RESET) { + pr_info("EEH: Reset without hotplug activity\n"); + rc = eeh_reset_device(pe, NULL); + if (rc) { + pr_warning("%s: Cannot reset, err=%d\n", + __func__, rc); + goto hard_fail; + } + + pr_info("EEH: Notify device drivers " + "the completion of reset\n"); + result = PCI_ERS_RESULT_NONE; + eeh_pe_dev_traverse(pe, eeh_report_reset, &result); + } + + /* All devices should claim they have recovered by now. */ + if ((result != PCI_ERS_RESULT_RECOVERED) && + (result != PCI_ERS_RESULT_NONE)) { + pr_warning("EEH: Not recovered\n"); + goto hard_fail; + } + + /* Tell all device drivers that they can resume operations */ + pr_info("EEH: Notify device driver to resume\n"); + eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); + + return; + +excess_failures: + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + pr_err("EEH: PHB#%d-PE#%x has failed %d times in the\n" + "last hour and has been permanently disabled.\n" + "Please try reseating or replacing it.\n", + pe->phb->global_number, pe->addr, + pe->freeze_count); + goto perm_error; + +hard_fail: + pr_err("EEH: Unable to recover from failure from PHB#%d-PE#%x.\n" + "Please try reseating or replacing it\n", + pe->phb->global_number, pe->addr); + +perm_error: + eeh_slot_error_detail(pe, EEH_LOG_PERM); + + /* Notify all devices that they're about to go down. */ + eeh_pe_dev_traverse(pe, eeh_report_failure, NULL); + + /* Mark the PE to be removed permanently */ + pe->freeze_count = EEH_MAX_ALLOWED_FREEZES + 1; + + /* + * Shut down the device drivers for good. We mark + * all removed devices correctly to avoid access + * the their PCI config any more. + */ + if (frozen_bus) { + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + + pci_lock_rescan_remove(); + pcibios_remove_pci_devices(frozen_bus); + pci_unlock_rescan_remove(); + } +} + +static void eeh_handle_special_event(void) +{ + struct eeh_pe *pe, *phb_pe; + struct pci_bus *bus; + struct pci_controller *hose; + unsigned long flags; + int rc; + + + do { + rc = eeh_ops->next_error(&pe); + + switch (rc) { + case EEH_NEXT_ERR_DEAD_IOC: + /* Mark all PHBs in dead state */ + eeh_serialize_lock(&flags); + + /* Purge all events */ + eeh_remove_event(NULL, true); + + list_for_each_entry(hose, &hose_list, list_node) { + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe) continue; + + eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); + } + + eeh_serialize_unlock(flags); + + break; + case EEH_NEXT_ERR_FROZEN_PE: + case EEH_NEXT_ERR_FENCED_PHB: + case EEH_NEXT_ERR_DEAD_PHB: + /* Mark the PE in fenced state */ + eeh_serialize_lock(&flags); + + /* Purge all events of the PHB */ + eeh_remove_event(pe, true); + + if (rc == EEH_NEXT_ERR_DEAD_PHB) + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + else + eeh_pe_state_mark(pe, + EEH_PE_ISOLATED | EEH_PE_RECOVERING); + + eeh_serialize_unlock(flags); + + break; + case EEH_NEXT_ERR_NONE: + return; + default: + pr_warn("%s: Invalid value %d from next_error()\n", + __func__, rc); + return; + } + + /* + * For fenced PHB and frozen PE, it's handled as normal + * event. We have to remove the affected PHBs for dead + * PHB and IOC + */ + if (rc == EEH_NEXT_ERR_FROZEN_PE || + rc == EEH_NEXT_ERR_FENCED_PHB) { + eeh_handle_normal_event(pe); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + } else { + pci_lock_rescan_remove(); + list_for_each_entry(hose, &hose_list, list_node) { + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe || + !(phb_pe->state & EEH_PE_ISOLATED) || + (phb_pe->state & EEH_PE_RECOVERING)) + continue; + + /* Notify all devices to be down */ + bus = eeh_pe_bus_get(phb_pe); + eeh_pe_dev_traverse(pe, + eeh_report_failure, NULL); + pcibios_remove_pci_devices(bus); + } + pci_unlock_rescan_remove(); + } + + /* + * If we have detected dead IOC, we needn't proceed + * any more since all PHBs would have been removed + */ + if (rc == EEH_NEXT_ERR_DEAD_IOC) + break; + } while (rc != EEH_NEXT_ERR_NONE); +} + +/** + * eeh_handle_event - Reset a PCI device after hard lockup. + * @pe: EEH PE + * + * While PHB detects address or data parity errors on particular PCI + * slot, the associated PE will be frozen. Besides, DMA's occurring + * to wild addresses (which usually happen due to bugs in device + * drivers or in PCI adapter firmware) can cause EEH error. #SERR, + * #PERR or other misc PCI-related errors also can trigger EEH errors. + * + * Recovery process consists of unplugging the device driver (which + * generated hotplug events to userspace), then issuing a PCI #RST to + * the device, then reconfiguring the PCI config space for all bridges + * & devices under this slot, and then finally restarting the device + * drivers (which cause a second set of hotplug events to go out to + * userspace). + */ +void eeh_handle_event(struct eeh_pe *pe) +{ + if (pe) + eeh_handle_normal_event(pe); + else + eeh_handle_special_event(); +} diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c new file mode 100644 index 00000000000..4eefb6e34db --- /dev/null +++ b/arch/powerpc/kernel/eeh_event.c @@ -0,0 +1,196 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2005 Linas Vepstas <linas@linas.org> + */ + +#include <linux/delay.h> +#include <linux/list.h> +#include <linux/sched.h> +#include <linux/semaphore.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/kthread.h> +#include <asm/eeh_event.h> +#include <asm/ppc-pci.h> + +/** Overview: + * EEH error states may be detected within exception handlers; + * however, the recovery processing needs to occur asynchronously + * in a normal kernel context and not an interrupt context. + * This pair of routines creates an event and queues it onto a + * work-queue, where a worker thread can drive recovery. + */ + +static DEFINE_SPINLOCK(eeh_eventlist_lock); +static struct semaphore eeh_eventlist_sem; +LIST_HEAD(eeh_eventlist); + +/** + * eeh_event_handler - Dispatch EEH events. + * @dummy - unused + * + * The detection of a frozen slot can occur inside an interrupt, + * where it can be hard to do anything about it. The goal of this + * routine is to pull these detection events out of the context + * of the interrupt handler, and re-dispatch them for processing + * at a later time in a normal context. + */ +static int eeh_event_handler(void * dummy) +{ + unsigned long flags; + struct eeh_event *event; + struct eeh_pe *pe; + + while (!kthread_should_stop()) { + if (down_interruptible(&eeh_eventlist_sem)) + break; + + /* Fetch EEH event from the queue */ + spin_lock_irqsave(&eeh_eventlist_lock, flags); + event = NULL; + if (!list_empty(&eeh_eventlist)) { + event = list_entry(eeh_eventlist.next, + struct eeh_event, list); + list_del(&event->list); + } + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + if (!event) + continue; + + /* We might have event without binding PE */ + pe = event->pe; + if (pe) { + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + if (pe->type & EEH_PE_PHB) + pr_info("EEH: Detected error on PHB#%d\n", + pe->phb->global_number); + else + pr_info("EEH: Detected PCI bus error on " + "PHB#%d-PE#%x\n", + pe->phb->global_number, pe->addr); + eeh_handle_event(pe); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + } else { + eeh_handle_event(NULL); + } + + kfree(event); + } + + return 0; +} + +/** + * eeh_event_init - Start kernel thread to handle EEH events + * + * This routine is called to start the kernel thread for processing + * EEH event. + */ +int eeh_event_init(void) +{ + struct task_struct *t; + int ret = 0; + + /* Initialize semaphore */ + sema_init(&eeh_eventlist_sem, 0); + + t = kthread_run(eeh_event_handler, NULL, "eehd"); + if (IS_ERR(t)) { + ret = PTR_ERR(t); + pr_err("%s: Failed to start EEH daemon (%d)\n", + __func__, ret); + return ret; + } + + return 0; +} + +/** + * eeh_send_failure_event - Generate a PCI error event + * @pe: EEH PE + * + * This routine can be called within an interrupt context; + * the actual event will be delivered in a normal context + * (from a workqueue). + */ +int eeh_send_failure_event(struct eeh_pe *pe) +{ + unsigned long flags; + struct eeh_event *event; + + event = kzalloc(sizeof(*event), GFP_ATOMIC); + if (!event) { + pr_err("EEH: out of memory, event not handled\n"); + return -ENOMEM; + } + event->pe = pe; + + /* We may or may not be called in an interrupt context */ + spin_lock_irqsave(&eeh_eventlist_lock, flags); + list_add(&event->list, &eeh_eventlist); + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + + /* For EEH deamon to knick in */ + up(&eeh_eventlist_sem); + + return 0; +} + +/** + * eeh_remove_event - Remove EEH event from the queue + * @pe: Event binding to the PE + * @force: Event will be removed unconditionally + * + * On PowerNV platform, we might have subsequent coming events + * is part of the former one. For that case, those subsequent + * coming events are totally duplicated and unnecessary, thus + * they should be removed. + */ +void eeh_remove_event(struct eeh_pe *pe, bool force) +{ + unsigned long flags; + struct eeh_event *event, *tmp; + + /* + * If we have NULL PE passed in, we have dead IOC + * or we're sure we can report all existing errors + * by the caller. + * + * With "force", the event with associated PE that + * have been isolated, the event won't be removed + * to avoid event lost. + */ + spin_lock_irqsave(&eeh_eventlist_lock, flags); + list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) { + if (!force && event->pe && + (event->pe->state & EEH_PE_ISOLATED)) + continue; + + if (!pe) { + list_del(&event->list); + kfree(event); + } else if (pe->type & EEH_PE_PHB) { + if (event->pe && event->pe->phb == pe->phb) { + list_del(&event->list); + kfree(event); + } + } else if (event->pe == pe) { + list_del(&event->list); + kfree(event); + } + } + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); +} diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c new file mode 100644 index 00000000000..fbd01eba447 --- /dev/null +++ b/arch/powerpc/kernel/eeh_pe.c @@ -0,0 +1,887 @@ +/* + * The file intends to implement PE based on the information from + * platforms. Basically, there have 3 types of PEs: PHB/Bus/Device. + * All the PEs should be organized as hierarchy tree. The first level + * of the tree will be associated to existing PHBs since the particular + * PE is only meaningful in one PHB domain. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> + +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> + +static LIST_HEAD(eeh_phb_pe); + +/** + * eeh_pe_alloc - Allocate PE + * @phb: PCI controller + * @type: PE type + * + * Allocate PE instance dynamically. + */ +static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type) +{ + struct eeh_pe *pe; + + /* Allocate PHB PE */ + pe = kzalloc(sizeof(struct eeh_pe), GFP_KERNEL); + if (!pe) return NULL; + + /* Initialize PHB PE */ + pe->type = type; + pe->phb = phb; + INIT_LIST_HEAD(&pe->child_list); + INIT_LIST_HEAD(&pe->child); + INIT_LIST_HEAD(&pe->edevs); + + return pe; +} + +/** + * eeh_phb_pe_create - Create PHB PE + * @phb: PCI controller + * + * The function should be called while the PHB is detected during + * system boot or PCI hotplug in order to create PHB PE. + */ +int eeh_phb_pe_create(struct pci_controller *phb) +{ + struct eeh_pe *pe; + + /* Allocate PHB PE */ + pe = eeh_pe_alloc(phb, EEH_PE_PHB); + if (!pe) { + pr_err("%s: out of memory!\n", __func__); + return -ENOMEM; + } + + /* Put it into the list */ + list_add_tail(&pe->child, &eeh_phb_pe); + + pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number); + + return 0; +} + +/** + * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB + * @phb: PCI controller + * + * The overall PEs form hierarchy tree. The first layer of the + * hierarchy tree is composed of PHB PEs. The function is used + * to retrieve the corresponding PHB PE according to the given PHB. + */ +struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb) +{ + struct eeh_pe *pe; + + list_for_each_entry(pe, &eeh_phb_pe, child) { + /* + * Actually, we needn't check the type since + * the PE for PHB has been determined when that + * was created. + */ + if ((pe->type & EEH_PE_PHB) && pe->phb == phb) + return pe; + } + + return NULL; +} + +/** + * eeh_pe_next - Retrieve the next PE in the tree + * @pe: current PE + * @root: root PE + * + * The function is used to retrieve the next PE in the + * hierarchy PE tree. + */ +static struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, + struct eeh_pe *root) +{ + struct list_head *next = pe->child_list.next; + + if (next == &pe->child_list) { + while (1) { + if (pe == root) + return NULL; + next = pe->child.next; + if (next != &pe->parent->child_list) + break; + pe = pe->parent; + } + } + + return list_entry(next, struct eeh_pe, child); +} + +/** + * eeh_pe_traverse - Traverse PEs in the specified PHB + * @root: root PE + * @fn: callback + * @flag: extra parameter to callback + * + * The function is used to traverse the specified PE and its + * child PEs. The traversing is to be terminated once the + * callback returns something other than NULL, or no more PEs + * to be traversed. + */ +void *eeh_pe_traverse(struct eeh_pe *root, + eeh_traverse_func fn, void *flag) +{ + struct eeh_pe *pe; + void *ret; + + for (pe = root; pe; pe = eeh_pe_next(pe, root)) { + ret = fn(pe, flag); + if (ret) return ret; + } + + return NULL; +} + +/** + * eeh_pe_dev_traverse - Traverse the devices from the PE + * @root: EEH PE + * @fn: function callback + * @flag: extra parameter to callback + * + * The function is used to traverse the devices of the specified + * PE and its child PEs. + */ +void *eeh_pe_dev_traverse(struct eeh_pe *root, + eeh_traverse_func fn, void *flag) +{ + struct eeh_pe *pe; + struct eeh_dev *edev, *tmp; + void *ret; + + if (!root) { + pr_warning("%s: Invalid PE %p\n", __func__, root); + return NULL; + } + + /* Traverse root PE */ + for (pe = root; pe; pe = eeh_pe_next(pe, root)) { + eeh_pe_for_each_dev(pe, edev, tmp) { + ret = fn(edev, flag); + if (ret) + return ret; + } + } + + return NULL; +} + +/** + * __eeh_pe_get - Check the PE address + * @data: EEH PE + * @flag: EEH device + * + * For one particular PE, it can be identified by PE address + * or tranditional BDF address. BDF address is composed of + * Bus/Device/Function number. The extra data referred by flag + * indicates which type of address should be used. + */ +static void *__eeh_pe_get(void *data, void *flag) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + struct eeh_dev *edev = (struct eeh_dev *)flag; + + /* Unexpected PHB PE */ + if (pe->type & EEH_PE_PHB) + return NULL; + + /* We prefer PE address */ + if (edev->pe_config_addr && + (edev->pe_config_addr == pe->addr)) + return pe; + + /* Try BDF address */ + if (edev->config_addr && + (edev->config_addr == pe->config_addr)) + return pe; + + return NULL; +} + +/** + * eeh_pe_get - Search PE based on the given address + * @edev: EEH device + * + * Search the corresponding PE based on the specified address which + * is included in the eeh device. The function is used to check if + * the associated PE has been created against the PE address. It's + * notable that the PE address has 2 format: traditional PE address + * which is composed of PCI bus/device/function number, or unified + * PE address. + */ +struct eeh_pe *eeh_pe_get(struct eeh_dev *edev) +{ + struct eeh_pe *root = eeh_phb_pe_get(edev->phb); + struct eeh_pe *pe; + + pe = eeh_pe_traverse(root, __eeh_pe_get, edev); + + return pe; +} + +/** + * eeh_pe_get_parent - Retrieve the parent PE + * @edev: EEH device + * + * The whole PEs existing in the system are organized as hierarchy + * tree. The function is used to retrieve the parent PE according + * to the parent EEH device. + */ +static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev) +{ + struct device_node *dn; + struct eeh_dev *parent; + + /* + * It might have the case for the indirect parent + * EEH device already having associated PE, but + * the direct parent EEH device doesn't have yet. + */ + dn = edev->dn->parent; + while (dn) { + /* We're poking out of PCI territory */ + if (!PCI_DN(dn)) return NULL; + + parent = of_node_to_eeh_dev(dn); + /* We're poking out of PCI territory */ + if (!parent) return NULL; + + if (parent->pe) + return parent->pe; + + dn = dn->parent; + } + + return NULL; +} + +/** + * eeh_add_to_parent_pe - Add EEH device to parent PE + * @edev: EEH device + * + * Add EEH device to the parent PE. If the parent PE already + * exists, the PE type will be changed to EEH_PE_BUS. Otherwise, + * we have to create new PE to hold the EEH device and the new + * PE will be linked to its parent PE as well. + */ +int eeh_add_to_parent_pe(struct eeh_dev *edev) +{ + struct eeh_pe *pe, *parent; + + /* + * Search the PE has been existing or not according + * to the PE address. If that has been existing, the + * PE should be composed of PCI bus and its subordinate + * components. + */ + pe = eeh_pe_get(edev); + if (pe && !(pe->type & EEH_PE_INVALID)) { + if (!edev->pe_config_addr) { + pr_err("%s: PE with addr 0x%x already exists\n", + __func__, edev->config_addr); + return -EEXIST; + } + + /* Mark the PE as type of PCI bus */ + pe->type = EEH_PE_BUS; + edev->pe = pe; + + /* Put the edev to PE */ + list_add_tail(&edev->list, &pe->edevs); + pr_debug("EEH: Add %s to Bus PE#%x\n", + edev->dn->full_name, pe->addr); + + return 0; + } else if (pe && (pe->type & EEH_PE_INVALID)) { + list_add_tail(&edev->list, &pe->edevs); + edev->pe = pe; + /* + * We're running to here because of PCI hotplug caused by + * EEH recovery. We need clear EEH_PE_INVALID until the top. + */ + parent = pe; + while (parent) { + if (!(parent->type & EEH_PE_INVALID)) + break; + parent->type &= ~(EEH_PE_INVALID | EEH_PE_KEEP); + parent = parent->parent; + } + pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n", + edev->dn->full_name, pe->addr, pe->parent->addr); + + return 0; + } + + /* Create a new EEH PE */ + pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE); + if (!pe) { + pr_err("%s: out of memory!\n", __func__); + return -ENOMEM; + } + pe->addr = edev->pe_config_addr; + pe->config_addr = edev->config_addr; + + /* + * While doing PE reset, we probably hot-reset the + * upstream bridge. However, the PCI devices including + * the associated EEH devices might be removed when EEH + * core is doing recovery. So that won't safe to retrieve + * the bridge through downstream EEH device. We have to + * trace the parent PCI bus, then the upstream bridge. + */ + if (eeh_probe_mode_dev()) + pe->bus = eeh_dev_to_pci_dev(edev)->bus; + + /* + * Put the new EEH PE into hierarchy tree. If the parent + * can't be found, the newly created PE will be attached + * to PHB directly. Otherwise, we have to associate the + * PE with its parent. + */ + parent = eeh_pe_get_parent(edev); + if (!parent) { + parent = eeh_phb_pe_get(edev->phb); + if (!parent) { + pr_err("%s: No PHB PE is found (PHB Domain=%d)\n", + __func__, edev->phb->global_number); + edev->pe = NULL; + kfree(pe); + return -EEXIST; + } + } + pe->parent = parent; + + /* + * Put the newly created PE into the child list and + * link the EEH device accordingly. + */ + list_add_tail(&pe->child, &parent->child_list); + list_add_tail(&edev->list, &pe->edevs); + edev->pe = pe; + pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n", + edev->dn->full_name, pe->addr, pe->parent->addr); + + return 0; +} + +/** + * eeh_rmv_from_parent_pe - Remove one EEH device from the associated PE + * @edev: EEH device + * + * The PE hierarchy tree might be changed when doing PCI hotplug. + * Also, the PCI devices or buses could be removed from the system + * during EEH recovery. So we have to call the function remove the + * corresponding PE accordingly if necessary. + */ +int eeh_rmv_from_parent_pe(struct eeh_dev *edev) +{ + struct eeh_pe *pe, *parent, *child; + int cnt; + + if (!edev->pe) { + pr_debug("%s: No PE found for EEH device %s\n", + __func__, edev->dn->full_name); + return -EEXIST; + } + + /* Remove the EEH device */ + pe = edev->pe; + edev->pe = NULL; + list_del(&edev->list); + + /* + * Check if the parent PE includes any EEH devices. + * If not, we should delete that. Also, we should + * delete the parent PE if it doesn't have associated + * child PEs and EEH devices. + */ + while (1) { + parent = pe->parent; + if (pe->type & EEH_PE_PHB) + break; + + if (!(pe->state & EEH_PE_KEEP)) { + if (list_empty(&pe->edevs) && + list_empty(&pe->child_list)) { + list_del(&pe->child); + kfree(pe); + } else { + break; + } + } else { + if (list_empty(&pe->edevs)) { + cnt = 0; + list_for_each_entry(child, &pe->child_list, child) { + if (!(child->type & EEH_PE_INVALID)) { + cnt++; + break; + } + } + + if (!cnt) + pe->type |= EEH_PE_INVALID; + else + break; + } + } + + pe = parent; + } + + return 0; +} + +/** + * eeh_pe_update_time_stamp - Update PE's frozen time stamp + * @pe: EEH PE + * + * We have time stamp for each PE to trace its time of getting + * frozen in last hour. The function should be called to update + * the time stamp on first error of the specific PE. On the other + * handle, we needn't account for errors happened in last hour. + */ +void eeh_pe_update_time_stamp(struct eeh_pe *pe) +{ + struct timeval tstamp; + + if (!pe) return; + + if (pe->freeze_count <= 0) { + pe->freeze_count = 0; + do_gettimeofday(&pe->tstamp); + } else { + do_gettimeofday(&tstamp); + if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) { + pe->tstamp = tstamp; + pe->freeze_count = 0; + } + } +} + +/** + * __eeh_pe_state_mark - Mark the state for the PE + * @data: EEH PE + * @flag: state + * + * The function is used to mark the indicated state for the given + * PE. Also, the associated PCI devices will be put into IO frozen + * state as well. + */ +static void *__eeh_pe_state_mark(void *data, void *flag) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + int state = *((int *)flag); + struct eeh_dev *edev, *tmp; + struct pci_dev *pdev; + + /* Keep the state of permanently removed PE intact */ + if ((pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) && + (state & (EEH_PE_ISOLATED | EEH_PE_RECOVERING))) + return NULL; + + pe->state |= state; + + /* Offline PCI devices if applicable */ + if (state != EEH_PE_ISOLATED) + return NULL; + + eeh_pe_for_each_dev(pe, edev, tmp) { + pdev = eeh_dev_to_pci_dev(edev); + if (pdev) + pdev->error_state = pci_channel_io_frozen; + } + + return NULL; +} + +/** + * eeh_pe_state_mark - Mark specified state for PE and its associated device + * @pe: EEH PE + * + * EEH error affects the current PE and its child PEs. The function + * is used to mark appropriate state for the affected PEs and the + * associated devices. + */ +void eeh_pe_state_mark(struct eeh_pe *pe, int state) +{ + eeh_pe_traverse(pe, __eeh_pe_state_mark, &state); +} + +static void *__eeh_pe_dev_mode_mark(void *data, void *flag) +{ + struct eeh_dev *edev = data; + int mode = *((int *)flag); + + edev->mode |= mode; + + return NULL; +} + +/** + * eeh_pe_dev_state_mark - Mark state for all device under the PE + * @pe: EEH PE + * + * Mark specific state for all child devices of the PE. + */ +void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode) +{ + eeh_pe_dev_traverse(pe, __eeh_pe_dev_mode_mark, &mode); +} + +/** + * __eeh_pe_state_clear - Clear state for the PE + * @data: EEH PE + * @flag: state + * + * The function is used to clear the indicated state from the + * given PE. Besides, we also clear the check count of the PE + * as well. + */ +static void *__eeh_pe_state_clear(void *data, void *flag) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + int state = *((int *)flag); + + /* Keep the state of permanently removed PE intact */ + if ((pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) && + (state & EEH_PE_ISOLATED)) + return NULL; + + pe->state &= ~state; + + /* Clear check count since last isolation */ + if (state & EEH_PE_ISOLATED) + pe->check_count = 0; + + return NULL; +} + +/** + * eeh_pe_state_clear - Clear state for the PE and its children + * @pe: PE + * @state: state to be cleared + * + * When the PE and its children has been recovered from error, + * we need clear the error state for that. The function is used + * for the purpose. + */ +void eeh_pe_state_clear(struct eeh_pe *pe, int state) +{ + eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); +} + +/* + * Some PCI bridges (e.g. PLX bridges) have primary/secondary + * buses assigned explicitly by firmware, and we probably have + * lost that after reset. So we have to delay the check until + * the PCI-CFG registers have been restored for the parent + * bridge. + * + * Don't use normal PCI-CFG accessors, which probably has been + * blocked on normal path during the stage. So we need utilize + * eeh operations, which is always permitted. + */ +static void eeh_bridge_check_link(struct eeh_dev *edev, + struct device_node *dn) +{ + int cap; + uint32_t val; + int timeout = 0; + + /* + * We only check root port and downstream ports of + * PCIe switches + */ + if (!(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT))) + return; + + pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n", + __func__, edev->phb->global_number, + edev->config_addr >> 8, + PCI_SLOT(edev->config_addr & 0xFF), + PCI_FUNC(edev->config_addr & 0xFF)); + + /* Check slot status */ + cap = edev->pcie_cap; + eeh_ops->read_config(dn, cap + PCI_EXP_SLTSTA, 2, &val); + if (!(val & PCI_EXP_SLTSTA_PDS)) { + pr_debug(" No card in the slot (0x%04x) !\n", val); + return; + } + + /* Check power status if we have the capability */ + eeh_ops->read_config(dn, cap + PCI_EXP_SLTCAP, 2, &val); + if (val & PCI_EXP_SLTCAP_PCP) { + eeh_ops->read_config(dn, cap + PCI_EXP_SLTCTL, 2, &val); + if (val & PCI_EXP_SLTCTL_PCC) { + pr_debug(" In power-off state, power it on ...\n"); + val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC); + val |= (0x0100 & PCI_EXP_SLTCTL_PIC); + eeh_ops->write_config(dn, cap + PCI_EXP_SLTCTL, 2, val); + msleep(2 * 1000); + } + } + + /* Enable link */ + eeh_ops->read_config(dn, cap + PCI_EXP_LNKCTL, 2, &val); + val &= ~PCI_EXP_LNKCTL_LD; + eeh_ops->write_config(dn, cap + PCI_EXP_LNKCTL, 2, val); + + /* Check link */ + eeh_ops->read_config(dn, cap + PCI_EXP_LNKCAP, 4, &val); + if (!(val & PCI_EXP_LNKCAP_DLLLARC)) { + pr_debug(" No link reporting capability (0x%08x) \n", val); + msleep(1000); + return; + } + + /* Wait the link is up until timeout (5s) */ + timeout = 0; + while (timeout < 5000) { + msleep(20); + timeout += 20; + + eeh_ops->read_config(dn, cap + PCI_EXP_LNKSTA, 2, &val); + if (val & PCI_EXP_LNKSTA_DLLLA) + break; + } + + if (val & PCI_EXP_LNKSTA_DLLLA) + pr_debug(" Link up (%s)\n", + (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB"); + else + pr_debug(" Link not ready (0x%04x)\n", val); +} + +#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) +#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)]) + +static void eeh_restore_bridge_bars(struct eeh_dev *edev, + struct device_node *dn) +{ + int i; + + /* + * Device BARs: 0x10 - 0x18 + * Bus numbers and windows: 0x18 - 0x30 + */ + for (i = 4; i < 13; i++) + eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]); + /* Rom: 0x38 */ + eeh_ops->write_config(dn, 14*4, 4, edev->config_space[14]); + + /* Cache line & Latency timer: 0xC 0xD */ + eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1, + SAVED_BYTE(PCI_CACHE_LINE_SIZE)); + eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1, + SAVED_BYTE(PCI_LATENCY_TIMER)); + /* Max latency, min grant, interrupt ping and line: 0x3C */ + eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]); + + /* PCI Command: 0x4 */ + eeh_ops->write_config(dn, PCI_COMMAND, 4, edev->config_space[1]); + + /* Check the PCIe link is ready */ + eeh_bridge_check_link(edev, dn); +} + +static void eeh_restore_device_bars(struct eeh_dev *edev, + struct device_node *dn) +{ + int i; + u32 cmd; + + for (i = 4; i < 10; i++) + eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]); + /* 12 == Expansion ROM Address */ + eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]); + + eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1, + SAVED_BYTE(PCI_CACHE_LINE_SIZE)); + eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1, + SAVED_BYTE(PCI_LATENCY_TIMER)); + + /* max latency, min grant, interrupt pin and line */ + eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]); + + /* + * Restore PERR & SERR bits, some devices require it, + * don't touch the other command bits + */ + eeh_ops->read_config(dn, PCI_COMMAND, 4, &cmd); + if (edev->config_space[1] & PCI_COMMAND_PARITY) + cmd |= PCI_COMMAND_PARITY; + else + cmd &= ~PCI_COMMAND_PARITY; + if (edev->config_space[1] & PCI_COMMAND_SERR) + cmd |= PCI_COMMAND_SERR; + else + cmd &= ~PCI_COMMAND_SERR; + eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd); +} + +/** + * eeh_restore_one_device_bars - Restore the Base Address Registers for one device + * @data: EEH device + * @flag: Unused + * + * Loads the PCI configuration space base address registers, + * the expansion ROM base address, the latency timer, and etc. + * from the saved values in the device node. + */ +static void *eeh_restore_one_device_bars(void *data, void *flag) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct device_node *dn = eeh_dev_to_of_node(edev); + + /* Do special restore for bridges */ + if (edev->mode & EEH_DEV_BRIDGE) + eeh_restore_bridge_bars(edev, dn); + else + eeh_restore_device_bars(edev, dn); + + if (eeh_ops->restore_config) + eeh_ops->restore_config(dn); + + return NULL; +} + +/** + * eeh_pe_restore_bars - Restore the PCI config space info + * @pe: EEH PE + * + * This routine performs a recursive walk to the children + * of this device as well. + */ +void eeh_pe_restore_bars(struct eeh_pe *pe) +{ + /* + * We needn't take the EEH lock since eeh_pe_dev_traverse() + * will take that. + */ + eeh_pe_dev_traverse(pe, eeh_restore_one_device_bars, NULL); +} + +/** + * eeh_pe_loc_get - Retrieve location code binding to the given PE + * @pe: EEH PE + * + * Retrieve the location code of the given PE. If the primary PE bus + * is root bus, we will grab location code from PHB device tree node + * or root port. Otherwise, the upstream bridge's device tree node + * of the primary PE bus will be checked for the location code. + */ +const char *eeh_pe_loc_get(struct eeh_pe *pe) +{ + struct pci_controller *hose; + struct pci_bus *bus = eeh_pe_bus_get(pe); + struct pci_dev *pdev; + struct device_node *dn; + const char *loc; + + if (!bus) + return "N/A"; + + /* PHB PE or root PE ? */ + if (pci_is_root_bus(bus)) { + hose = pci_bus_to_host(bus); + loc = of_get_property(hose->dn, + "ibm,loc-code", NULL); + if (loc) + return loc; + loc = of_get_property(hose->dn, + "ibm,io-base-loc-code", NULL); + if (loc) + return loc; + + pdev = pci_get_slot(bus, 0x0); + } else { + pdev = bus->self; + } + + if (!pdev) { + loc = "N/A"; + goto out; + } + + dn = pci_device_to_OF_node(pdev); + if (!dn) { + loc = "N/A"; + goto out; + } + + loc = of_get_property(dn, "ibm,loc-code", NULL); + if (!loc) + loc = of_get_property(dn, "ibm,slot-location-code", NULL); + if (!loc) + loc = "N/A"; + +out: + if (pci_is_root_bus(bus) && pdev) + pci_dev_put(pdev); + return loc; +} + +/** + * eeh_pe_bus_get - Retrieve PCI bus according to the given PE + * @pe: EEH PE + * + * Retrieve the PCI bus according to the given PE. Basically, + * there're 3 types of PEs: PHB/Bus/Device. For PHB PE, the + * primary PCI bus will be retrieved. The parent bus will be + * returned for BUS PE. However, we don't have associated PCI + * bus for DEVICE PE. + */ +struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) +{ + struct pci_bus *bus = NULL; + struct eeh_dev *edev; + struct pci_dev *pdev; + + if (pe->type & EEH_PE_PHB) { + bus = pe->phb->bus; + } else if (pe->type & EEH_PE_BUS || + pe->type & EEH_PE_DEVICE) { + if (pe->bus) { + bus = pe->bus; + goto out; + } + + edev = list_first_entry(&pe->edevs, struct eeh_dev, list); + pdev = eeh_dev_to_pci_dev(edev); + if (pdev) + bus = pdev->bus; + } + +out: + return bus; +} diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c new file mode 100644 index 00000000000..e2595ba4b72 --- /dev/null +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -0,0 +1,98 @@ +/* + * Sysfs entries for PCI Error Recovery for PAPR-compliant platform. + * Copyright IBM Corporation 2007 + * Copyright Linas Vepstas <linas@austin.ibm.com> 2007 + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> + */ +#include <linux/pci.h> +#include <linux/stat.h> +#include <asm/ppc-pci.h> +#include <asm/pci-bridge.h> + +/** + * EEH_SHOW_ATTR -- Create sysfs entry for eeh statistic + * @_name: name of file in sysfs directory + * @_memb: name of member in struct pci_dn to access + * @_format: printf format for display + * + * All of the attributes look very similar, so just + * auto-gen a cut-n-paste routine to display them. + */ +#define EEH_SHOW_ATTR(_name,_memb,_format) \ +static ssize_t eeh_show_##_name(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct pci_dev *pdev = to_pci_dev(dev); \ + struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); \ + \ + if (!edev) \ + return 0; \ + \ + return sprintf(buf, _format "\n", edev->_memb); \ +} \ +static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL); + +EEH_SHOW_ATTR(eeh_mode, mode, "0x%x"); +EEH_SHOW_ATTR(eeh_config_addr, config_addr, "0x%x"); +EEH_SHOW_ATTR(eeh_pe_config_addr, pe_config_addr, "0x%x"); + +void eeh_sysfs_add_device(struct pci_dev *pdev) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); + int rc=0; + + if (!eeh_enabled()) + return; + + if (edev && (edev->mode & EEH_DEV_SYSFS)) + return; + + rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode); + rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr); + rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); + + if (rc) + printk(KERN_WARNING "EEH: Unable to create sysfs entries\n"); + else if (edev) + edev->mode |= EEH_DEV_SYSFS; +} + +void eeh_sysfs_remove_device(struct pci_dev *pdev) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); + + /* + * The parent directory might have been removed. We needn't + * continue for that case. + */ + if (!pdev->dev.kobj.sd) { + if (edev) + edev->mode &= ~EEH_DEV_SYSFS; + return; + } + + device_remove_file(&pdev->dev, &dev_attr_eeh_mode); + device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr); + device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); + + if (edev) + edev->mode &= ~EEH_DEV_SYSFS; +} diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index d22e73e4618..22b45a4955c 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -849,7 +849,7 @@ resume_kernel: /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ CURRENT_THREAD_INFO(r9, r1) lwz r8,TI_FLAGS(r9) - andis. r8,r8,_TIF_EMULATE_STACK_STORE@h + andis. r0,r8,_TIF_EMULATE_STACK_STORE@h beq+ 1f addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index b310a057362..6528c5e2cc4 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -33,13 +33,14 @@ #include <asm/irqflags.h> #include <asm/ftrace.h> #include <asm/hw_irq.h> +#include <asm/context_tracking.h> /* * System calls. */ .section ".toc","aw" -.SYS_CALL_TABLE: - .tc .sys_call_table[TC],.sys_call_table +SYS_CALL_TABLE: + .tc sys_call_table[TC],sys_call_table /* This value is used to mark exception frames on the stack. */ exception_marker: @@ -62,8 +63,9 @@ system_call_common: std r12,_MSR(r1) std r0,GPR0(r1) std r10,GPR1(r1) + beq 2f /* if from kernel mode */ ACCOUNT_CPU_USER_ENTRY(r10, r11) - std r2,GPR2(r1) +2: std r2,GPR2(r1) std r3,GPR3(r1) mfcr r2 std r4,GPR4(r1) @@ -94,23 +96,24 @@ system_call_common: addi r9,r1,STACK_FRAME_OVERHEAD ld r11,exception_marker@toc(r2) std r11,-16(r9) /* "regshere" marker */ -#if defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR) +#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR) BEGIN_FW_FTR_SECTION beq 33f /* if from user, see if there are any DTL entries to process */ ld r10,PACALPPACAPTR(r13) /* get ptr to VPA */ ld r11,PACA_DTL_RIDX(r13) /* get log read index */ - ld r10,LPPACA_DTLIDX(r10) /* get log write index */ + addi r10,r10,LPPACA_DTLIDX + LDX_BE r10,0,r10 /* get log write index */ cmpd cr1,r11,r10 beq+ cr1,33f - bl .accumulate_stolen_time + bl accumulate_stolen_time REST_GPR(0,r1) REST_4GPRS(3,r1) REST_2GPRS(7,r1) addi r9,r1,STACK_FRAME_OVERHEAD 33: END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) -#endif /* CONFIG_VIRT_CPU_ACCOUNTING && CONFIG_PPC_SPLPAR */ +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE && CONFIG_PPC_SPLPAR */ /* * A syscall should always be called with interrupts enabled @@ -140,7 +143,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) std r10,SOFTE(r1) #ifdef SHOW_SYSCALLS - bl .do_show_syscall + bl do_show_syscall REST_GPR(0,r1) REST_4GPRS(3,r1) REST_2GPRS(7,r1) @@ -149,7 +152,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) CURRENT_THREAD_INFO(r11, r1) ld r10,TI_FLAGS(r11) andi. r11,r10,_TIF_SYSCALL_T_OR_A - bne- syscall_dotrace + bne syscall_dotrace .Lsyscall_dotrace_cont: cmpldi 0,r0,NR_syscalls bge- syscall_enosys @@ -159,7 +162,7 @@ system_call: /* label this so stack traces look sane */ * Need to vector to 32 Bit or default sys_call_table here, * based on caller's run-mode / personality. */ - ld r11,.SYS_CALL_TABLE@toc(2) + ld r11,SYS_CALL_TABLE@toc(2) andi. r10,r10,_TIF_32BIT beq 15f addi r11,r11,8 /* use 32-bit syscall entries */ @@ -171,14 +174,14 @@ system_call: /* label this so stack traces look sane */ clrldi r8,r8,32 15: slwi r0,r0,4 - ldx r10,r11,r0 /* Fetch system call handler [ptr] */ - mtctr r10 + ldx r12,r11,r0 /* Fetch system call handler [ptr] */ + mtctr r12 bctrl /* Call handler */ syscall_exit: std r3,RESULT(r1) #ifdef SHOW_SYSCALLS - bl .do_show_syscall_exit + bl do_show_syscall_exit ld r3,RESULT(r1) #endif CURRENT_THREAD_INFO(r12, r1) @@ -226,6 +229,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) beq- 1f ACCOUNT_CPU_USER_EXIT(r11, r12) + HMT_MEDIUM_LOW_HAS_PPR ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ 1: ld r2,GPR2(r1) ld r1,GPR1(r1) @@ -244,9 +248,9 @@ syscall_error: /* Traced system call support */ syscall_dotrace: - bl .save_nvgprs + bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_syscall_trace_enter + bl do_syscall_trace_enter /* * Restore argument registers possibly just changed. * We use the return value of do_syscall_trace_enter @@ -302,8 +306,9 @@ syscall_exit_work: subi r12,r12,TI_FLAGS 4: /* Anything else left to do? */ + SET_DEFAULT_THREAD_PPR(r3, r10) /* Set thread.ppr = 3 */ andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SINGLESTEP) - beq .ret_from_except_lite + beq ret_from_except_lite /* Re-enable interrupts */ #ifdef CONFIG_PPC_BOOK3E @@ -314,10 +319,10 @@ syscall_exit_work: mtmsrd r10,1 #endif /* CONFIG_PPC_BOOK3E */ - bl .save_nvgprs + bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_syscall_trace_leave - b .ret_from_except + bl do_syscall_trace_leave + b ret_from_except /* Save non-volatile GPRs, if not already saved. */ _GLOBAL(save_nvgprs) @@ -340,54 +345,48 @@ _GLOBAL(save_nvgprs) */ _GLOBAL(ppc_fork) - bl .save_nvgprs - bl .sys_fork + bl save_nvgprs + bl sys_fork b syscall_exit _GLOBAL(ppc_vfork) - bl .save_nvgprs - bl .sys_vfork + bl save_nvgprs + bl sys_vfork b syscall_exit _GLOBAL(ppc_clone) - bl .save_nvgprs - bl .sys_clone + bl save_nvgprs + bl sys_clone b syscall_exit _GLOBAL(ppc32_swapcontext) - bl .save_nvgprs - bl .compat_sys_swapcontext + bl save_nvgprs + bl compat_sys_swapcontext b syscall_exit _GLOBAL(ppc64_swapcontext) - bl .save_nvgprs - bl .sys_swapcontext + bl save_nvgprs + bl sys_swapcontext b syscall_exit _GLOBAL(ret_from_fork) - bl .schedule_tail + bl schedule_tail REST_NVGPRS(r1) li r3,0 b syscall_exit _GLOBAL(ret_from_kernel_thread) - bl .schedule_tail + bl schedule_tail REST_NVGPRS(r1) - li r3,0 - std r3,0(r1) - ld r14, 0(r14) mtlr r14 mr r3,r15 +#if defined(_CALL_ELF) && _CALL_ELF == 2 + mr r12,r14 +#endif blrl li r3,0 b syscall_exit - .section ".toc","aw" -DSCR_DEFAULT: - .tc dscr_default[TC],dscr_default - - .section ".text" - /* * This routine switches between two different tasks. The process * state of one is saved on its kernel stack. Then the state @@ -429,12 +428,6 @@ BEGIN_FTR_SECTION std r24,THREAD_VRSAVE(r3) END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif /* CONFIG_ALTIVEC */ -#ifdef CONFIG_PPC64 -BEGIN_FTR_SECTION - mfspr r25,SPRN_DSCR - std r25,THREAD_DSCR(r3) -END_FTR_SECTION_IFSET(CPU_FTR_DSCR) -#endif and. r0,r0,r22 beq+ 1f andc r22,r22,r0 @@ -445,6 +438,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_DSCR) std r23,_CCR(r1) std r1,KSP(r3) /* Set old stack pointer */ +#ifdef CONFIG_PPC_BOOK3S_64 +BEGIN_FTR_SECTION + /* Event based branch registers */ + mfspr r0, SPRN_BESCR + std r0, THREAD_BESCR(r3) + mfspr r0, SPRN_EBBHR + std r0, THREAD_EBBHR(r3) + mfspr r0, SPRN_EBBRR + std r0, THREAD_EBBRR(r3) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +#endif + #ifdef CONFIG_SMP /* We need a sync somewhere here to make sure that if the * previous task gets rescheduled on another CPU, it sees all @@ -464,6 +469,13 @@ BEGIN_FTR_SECTION ldarx r6,0,r1 END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS) +#ifdef CONFIG_PPC_BOOK3S +/* Cancel all explict user streams as they will have no use after context + * switch and will stop the HW from creating streams itself + */ + DCBT_STOP_ALL_STREAM_IDS(r6) +#endif + addi r6,r4,-THREAD /* Convert THREAD to 'current' */ std r6,PACACURRENT(r13) /* Set new 'current' */ @@ -501,9 +513,11 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) */ ld r9,PACA_SLBSHADOWPTR(r13) li r12,0 - std r12,SLBSHADOW_STACKESID(r9) /* Clear ESID */ - std r7,SLBSHADOW_STACKVSID(r9) /* Save VSID */ - std r0,SLBSHADOW_STACKESID(r9) /* Save ESID */ + std r12,SLBSHADOW_STACKESID(r9) /* Clear ESID */ + li r12,SLBSHADOW_STACKVSID + STDX_BE r7,r12,r9 /* Save VSID */ + li r12,SLBSHADOW_STACKESID + STDX_BE r0,r12,r9 /* Save ESID */ /* No need to check for MMU_FTR_NO_SLBIE_B here, since when * we have 1TB segments, the only CPUs known to have the errata @@ -527,6 +541,21 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) mr r1,r8 /* start using new stack pointer */ std r7,PACAKSAVE(r13) +#ifdef CONFIG_PPC_BOOK3S_64 +BEGIN_FTR_SECTION + /* Event based branch registers */ + ld r0, THREAD_BESCR(r4) + mtspr SPRN_BESCR, r0 + ld r0, THREAD_EBBHR(r4) + mtspr SPRN_EBBHR, r0 + ld r0, THREAD_EBBRR(r4) + mtspr SPRN_EBBRR, r0 + + ld r0,THREAD_TAR(r4) + mtspr SPRN_TAR,r0 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +#endif + #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION ld r0,THREAD_VRSAVE(r4) @@ -536,12 +565,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #ifdef CONFIG_PPC64 BEGIN_FTR_SECTION lwz r6,THREAD_DSCR_INHERIT(r4) - ld r7,DSCR_DEFAULT@toc(2) ld r0,THREAD_DSCR(r4) cmpwi r6,0 bne 1f - ld r0,0(r7) -1: cmpd r0,r25 + ld r0,PACA_DSCR(r13) +1: +BEGIN_FTR_SECTION_NESTED(70) + mfspr r8, SPRN_FSCR + rldimi r8, r6, FSCR_DSCR_LG, (63 - FSCR_DSCR_LG) + mtspr SPRN_FSCR, r8 +END_FTR_SECTION_NESTED(CPU_FTR_ARCH_207S, CPU_FTR_ARCH_207S, 70) + cmpd r0,r25 beq 2f mtspr SPRN_DSCR,r0 2: @@ -566,7 +600,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_DSCR) _GLOBAL(ret_from_except) ld r11,_TRAP(r1) andi. r0,r11,1 - bne .ret_from_except_lite + bne ret_from_except_lite REST_NVGPRS(r1) _GLOBAL(ret_from_except_lite) @@ -584,31 +618,59 @@ _GLOBAL(ret_from_except_lite) CURRENT_THREAD_INFO(r9, r1) ld r3,_MSR(r1) +#ifdef CONFIG_PPC_BOOK3E + ld r10,PACACURRENT(r13) +#endif /* CONFIG_PPC_BOOK3E */ ld r4,TI_FLAGS(r9) andi. r3,r3,MSR_PR beq resume_kernel +#ifdef CONFIG_PPC_BOOK3E + lwz r3,(THREAD+THREAD_DBCR0)(r10) +#endif /* CONFIG_PPC_BOOK3E */ /* Check current_thread_info()->flags */ andi. r0,r4,_TIF_USER_WORK_MASK +#ifdef CONFIG_PPC_BOOK3E + bne 1f + /* + * Check to see if the dbcr0 register is set up to debug. + * Use the internal debug mode bit to do this. + */ + andis. r0,r3,DBCR0_IDM@h beq restore - - andi. r0,r4,_TIF_NEED_RESCHED - beq 1f - bl .restore_interrupts - bl .schedule - b .ret_from_except_lite - -1: bl .save_nvgprs - bl .restore_interrupts + mfmsr r0 + rlwinm r0,r0,0,~MSR_DE /* Clear MSR.DE */ + mtmsr r0 + mtspr SPRN_DBCR0,r3 + li r10, -1 + mtspr SPRN_DBSR,r10 + b restore +#else + beq restore +#endif +1: andi. r0,r4,_TIF_NEED_RESCHED + beq 2f + bl restore_interrupts + SCHEDULE_USER + b ret_from_except_lite +2: +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + andi. r0,r4,_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM + bne 3f /* only restore TM if nothing else to do */ + addi r3,r1,STACK_FRAME_OVERHEAD + bl restore_tm_state + b restore +3: +#endif + bl save_nvgprs + bl restore_interrupts addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_notify_resume - b .ret_from_except + bl do_notify_resume + b ret_from_except resume_kernel: /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ - CURRENT_THREAD_INFO(r9, r1) - ld r8,TI_FLAGS(r9) - andis. r8,r8,_TIF_EMULATE_STACK_STORE@h + andis. r8,r4,_TIF_EMULATE_STACK_STORE@h beq+ 1f addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */ @@ -634,7 +696,7 @@ resume_kernel: /* Clear _TIF_EMULATE_STACK_STORE flag */ lis r11,_TIF_EMULATE_STACK_STORE@h addi r5,r9,TI_FLAGS - ldarx r4,0,r5 +0: ldarx r4,0,r5 andc r4,r4,r11 stdcx. r4,0,r5 bne- 0b @@ -654,16 +716,29 @@ resume_kernel: /* * Here we are preempting the current task. We want to make - * sure we are soft-disabled first + * sure we are soft-disabled first and reconcile irq state. */ - SOFT_DISABLE_INTS(r3,r4) -1: bl .preempt_schedule_irq + RECONCILE_IRQ_STATE(r3,r4) +1: bl preempt_schedule_irq /* Re-test flags and eventually loop */ CURRENT_THREAD_INFO(r9, r1) ld r4,TI_FLAGS(r9) andi. r0,r4,_TIF_NEED_RESCHED bne 1b + + /* + * arch_local_irq_restore() from preempt_schedule_irq above may + * enable hard interrupt but we really should disable interrupts + * when we return from the interrupt, and so that we don't get + * interrupted after loading SRR0/1. + */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 0 +#else + ld r10,PACAKMSR(r13) /* Get kernel MSR without EE */ + mtmsrd r10,1 /* Update machine state */ +#endif /* CONFIG_PPC_BOOK3E */ #endif /* CONFIG_PREEMPT */ .globl fast_exc_return_irq @@ -706,7 +781,7 @@ restore_no_replay: */ do_restore: #ifdef CONFIG_PPC_BOOK3E - b .exception_return_book3e + b exception_return_book3e #else /* * Clear the reservation. If we know the CPU tracks the address of @@ -740,6 +815,12 @@ fast_exception_return: andi. r0,r3,MSR_RI beq- unrecov_restore + /* Load PPR from thread struct before we clear MSR:RI */ +BEGIN_FTR_SECTION + ld r2,PACACURRENT(r13) + ld r2,TASKTHREADPPR(r2) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + /* * Clear RI before restoring r13. If we are returning to * userspace and we take an exception after restoring r13, @@ -749,6 +830,10 @@ fast_exception_return: andc r4,r4,r0 /* r0 contains MSR_RI here */ mtmsrd r4,1 +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + /* TM debug */ + std r3, PACATMSCRATCH(r13) /* Stash returned-to MSR */ +#endif /* * r13 is our per cpu area, only restore it if we are returning to * userspace the value stored in the stack frame may belong to @@ -756,6 +841,9 @@ fast_exception_return: */ andi. r0,r3,MSR_PR beq 1f +BEGIN_FTR_SECTION + mtspr SPRN_PPR,r2 /* Restore PPR */ +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ACCOUNT_CPU_USER_EXIT(r2, r4) REST_GPR(13, r1) 1: @@ -808,7 +896,7 @@ restore_check_irq_replay: * * Still, this might be useful for things like hash_page */ - bl .__check_irq_replay + bl __check_irq_replay cmpwi cr0,r3,0 beq restore_no_replay @@ -829,25 +917,34 @@ restore_check_irq_replay: cmpwi cr0,r3,0x500 bne 1f addi r3,r1,STACK_FRAME_OVERHEAD; - bl .do_IRQ - b .ret_from_except + bl do_IRQ + b ret_from_except 1: cmpwi cr0,r3,0x900 bne 1f addi r3,r1,STACK_FRAME_OVERHEAD; - bl .timer_interrupt - b .ret_from_except + bl timer_interrupt + b ret_from_except +#ifdef CONFIG_PPC_DOORBELL +1: #ifdef CONFIG_PPC_BOOK3E -1: cmpwi cr0,r3,0x280 + cmpwi cr0,r3,0x280 +#else + BEGIN_FTR_SECTION + cmpwi cr0,r3,0xe80 + FTR_SECTION_ELSE + cmpwi cr0,r3,0xa00 + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) +#endif /* CONFIG_PPC_BOOK3E */ bne 1f addi r3,r1,STACK_FRAME_OVERHEAD; - bl .doorbell_exception - b .ret_from_except -#endif /* CONFIG_PPC_BOOK3E */ -1: b .ret_from_except /* What else to do here ? */ + bl doorbell_exception + b ret_from_except +#endif /* CONFIG_PPC_DOORBELL */ +1: b ret_from_except /* What else to do here ? */ unrecov_restore: addi r3,r1,STACK_FRAME_OVERHEAD - bl .unrecoverable_exception + bl unrecoverable_exception b unrecov_restore #ifdef CONFIG_PPC_RTAS @@ -913,7 +1010,7 @@ _GLOBAL(enter_rtas) std r6,PACASAVEDMSR(r13) /* Setup our real return addr */ - LOAD_REG_ADDR(r4,.rtas_return_loc) + LOAD_REG_ADDR(r4,rtas_return_loc) clrldi r4,r4,2 /* convert to realmode address */ mtlr r4 @@ -923,7 +1020,7 @@ _GLOBAL(enter_rtas) li r9,1 rldicr r9,r9,MSR_SF_LG,(63-MSR_SF_LG) - ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI + ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI|MSR_LE andc r6,r0,r9 sync /* disable interrupts so SRR0/1 */ mtmsrd r0 /* don't get trashed */ @@ -937,14 +1034,16 @@ _GLOBAL(enter_rtas) rfid b . /* prevent speculative execution */ -_STATIC(rtas_return_loc) +rtas_return_loc: + FIXUP_ENDIAN + /* relocation is off at this point */ GET_PACA(r4) clrldi r4,r4,2 /* convert to realmode address */ bcl 20,31,$+4 0: mflr r3 - ld r3,(1f-0b)(r3) /* get &.rtas_restore_regs */ + ld r3,(1f-0b)(r3) /* get &rtas_restore_regs */ mfmsr r6 li r0,MSR_RI @@ -961,9 +1060,9 @@ _STATIC(rtas_return_loc) b . /* prevent speculative execution */ .align 3 -1: .llong .rtas_restore_regs +1: .llong rtas_restore_regs -_STATIC(rtas_restore_regs) +rtas_restore_regs: /* relocation is on at this point */ REST_GPR(2, r1) /* Restore the TOC */ REST_GPR(13, r1) /* Restore paca */ @@ -1009,28 +1108,30 @@ _GLOBAL(enter_prom) std r10,_CCR(r1) std r11,_MSR(r1) - /* Get the PROM entrypoint */ - mtlr r4 + /* Put PROM address in SRR0 */ + mtsrr0 r4 - /* Switch MSR to 32 bits mode + /* Setup our trampoline return addr in LR */ + bcl 20,31,$+4 +0: mflr r4 + addi r4,r4,(1f - 0b) + mtlr r4 + + /* Prepare a 32-bit mode big endian MSR */ #ifdef CONFIG_PPC_BOOK3E rlwinm r11,r11,0,1,31 - mtmsr r11 + mtsrr1 r11 + rfi #else /* CONFIG_PPC_BOOK3E */ - mfmsr r11 - li r12,1 - rldicr r12,r12,MSR_SF_LG,(63-MSR_SF_LG) - andc r11,r11,r12 - li r12,1 - rldicr r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG) - andc r11,r11,r12 - mtmsrd r11 + LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE) + andc r11,r11,r12 + mtsrr1 r11 + rfid #endif /* CONFIG_PPC_BOOK3E */ - isync - /* Enter PROM here... */ - blrl +1: /* Return from OF */ + FIXUP_ENDIAN /* Just make sure that r1 top 32 bits didn't get * corrupt by OF @@ -1061,7 +1162,7 @@ _GLOBAL(mcount) _GLOBAL(_mcount) blr -_GLOBAL(ftrace_caller) +_GLOBAL_TOC(ftrace_caller) /* Taken from output of objdump from lib64/glibc */ mflr r3 ld r11, 0(r1) @@ -1085,10 +1186,7 @@ _GLOBAL(ftrace_graph_stub) _GLOBAL(ftrace_stub) blr #else -_GLOBAL(mcount) - blr - -_GLOBAL(_mcount) +_GLOBAL_TOC(_mcount) /* Taken from output of objdump from lib64/glibc */ mflr r3 ld r11, 0(r1) @@ -1126,7 +1224,7 @@ _GLOBAL(ftrace_graph_caller) ld r11, 112(r1) addi r3, r11, 16 - bl .prepare_ftrace_return + bl prepare_ftrace_return nop ld r0, 128(r1) @@ -1142,7 +1240,7 @@ _GLOBAL(return_to_handler) mr r31, r1 stdu r1, -112(r1) - bl .ftrace_return_to_handler + bl ftrace_return_to_handler nop /* return value has real return address */ @@ -1172,7 +1270,7 @@ _GLOBAL(mod_return_to_handler) */ ld r2, PACATOC(r13) - bl .ftrace_return_to_handler + bl ftrace_return_to_handler nop /* return value has real return address */ diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S index 62c0dc23782..9f1ebf7338f 100644 --- a/arch/powerpc/kernel/epapr_hcalls.S +++ b/arch/powerpc/kernel/epapr_hcalls.S @@ -17,6 +17,7 @@ #include <asm/asm-compat.h> #include <asm/asm-offsets.h> +#ifndef CONFIG_PPC64 /* epapr_ev_idle() was derived from e500_idle() */ _GLOBAL(epapr_ev_idle) CURRENT_THREAD_INFO(r3, r1) @@ -42,6 +43,7 @@ epapr_ev_idle_start: * _TLF_NAPPING. */ b idle_loop +#endif /* Hypercall entry point. Will be patched with device tree instructions. */ .global epapr_hypercall_start diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c index f3eab8594d9..59e4ba74975 100644 --- a/arch/powerpc/kernel/epapr_paravirt.c +++ b/arch/powerpc/kernel/epapr_paravirt.c @@ -18,44 +18,68 @@ */ #include <linux/of.h> +#include <linux/of_fdt.h> #include <asm/epapr_hcalls.h> #include <asm/cacheflush.h> #include <asm/code-patching.h> #include <asm/machdep.h> +#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) extern void epapr_ev_idle(void); extern u32 epapr_ev_idle_start[]; +#endif bool epapr_paravirt_enabled; +static bool __maybe_unused epapr_has_idle; -static int __init epapr_paravirt_init(void) +static int __init early_init_dt_scan_epapr(unsigned long node, + const char *uname, + int depth, void *data) { - struct device_node *hyper_node; const u32 *insts; - int len, i; + int len; + int i; - hyper_node = of_find_node_by_path("/hypervisor"); - if (!hyper_node) - return -ENODEV; - - insts = of_get_property(hyper_node, "hcall-instructions", &len); + insts = of_get_flat_dt_prop(node, "hcall-instructions", &len); if (!insts) - return -ENODEV; + return 0; if (len % 4 || len > (4 * 4)) - return -ENODEV; + return -1; for (i = 0; i < (len / 4); i++) { - patch_instruction(epapr_hypercall_start + i, insts[i]); - patch_instruction(epapr_ev_idle_start + i, insts[i]); + u32 inst = be32_to_cpu(insts[i]); + patch_instruction(epapr_hypercall_start + i, inst); +#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) + patch_instruction(epapr_ev_idle_start + i, inst); +#endif } - if (of_get_property(hyper_node, "has-idle", NULL)) - ppc_md.power_save = epapr_ev_idle; +#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) + if (of_get_flat_dt_prop(node, "has-idle", NULL)) + epapr_has_idle = true; +#endif epapr_paravirt_enabled = true; + return 1; +} + +int __init epapr_paravirt_early_init(void) +{ + of_scan_flat_dt(early_init_dt_scan_epapr, NULL); + + return 0; +} + +static int __init epapr_idle_init(void) +{ +#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64) + if (epapr_has_idle) + ppc_md.power_save = epapr_ev_idle; +#endif + return 0; } -early_initcall(epapr_paravirt_init); +postcore_initcall(epapr_idle_init); diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 4684e33a26c..bb9cac6c805 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -34,7 +34,250 @@ * special interrupts from within a non-standard level will probably * blow you up */ -#define SPECIAL_EXC_FRAME_SIZE INT_FRAME_SIZE +#define SPECIAL_EXC_SRR0 0 +#define SPECIAL_EXC_SRR1 1 +#define SPECIAL_EXC_SPRG_GEN 2 +#define SPECIAL_EXC_SPRG_TLB 3 +#define SPECIAL_EXC_MAS0 4 +#define SPECIAL_EXC_MAS1 5 +#define SPECIAL_EXC_MAS2 6 +#define SPECIAL_EXC_MAS3 7 +#define SPECIAL_EXC_MAS6 8 +#define SPECIAL_EXC_MAS7 9 +#define SPECIAL_EXC_MAS5 10 /* E.HV only */ +#define SPECIAL_EXC_MAS8 11 /* E.HV only */ +#define SPECIAL_EXC_IRQHAPPENED 12 +#define SPECIAL_EXC_DEAR 13 +#define SPECIAL_EXC_ESR 14 +#define SPECIAL_EXC_SOFTE 15 +#define SPECIAL_EXC_CSRR0 16 +#define SPECIAL_EXC_CSRR1 17 +/* must be even to keep 16-byte stack alignment */ +#define SPECIAL_EXC_END 18 + +#define SPECIAL_EXC_FRAME_SIZE (INT_FRAME_SIZE + SPECIAL_EXC_END * 8) +#define SPECIAL_EXC_FRAME_OFFS (INT_FRAME_SIZE - 288) + +#define SPECIAL_EXC_STORE(reg, name) \ + std reg, (SPECIAL_EXC_##name * 8 + SPECIAL_EXC_FRAME_OFFS)(r1) + +#define SPECIAL_EXC_LOAD(reg, name) \ + ld reg, (SPECIAL_EXC_##name * 8 + SPECIAL_EXC_FRAME_OFFS)(r1) + +special_reg_save: + lbz r9,PACAIRQHAPPENED(r13) + RECONCILE_IRQ_STATE(r3,r4) + + /* + * We only need (or have stack space) to save this stuff if + * we interrupted the kernel. + */ + ld r3,_MSR(r1) + andi. r3,r3,MSR_PR + bnelr + + /* Copy info into temporary exception thread info */ + ld r11,PACAKSAVE(r13) + CURRENT_THREAD_INFO(r11, r11) + CURRENT_THREAD_INFO(r12, r1) + ld r10,TI_FLAGS(r11) + std r10,TI_FLAGS(r12) + ld r10,TI_PREEMPT(r11) + std r10,TI_PREEMPT(r12) + ld r10,TI_TASK(r11) + std r10,TI_TASK(r12) + + /* + * Advance to the next TLB exception frame for handler + * types that don't do it automatically. + */ + LOAD_REG_ADDR(r11,extlb_level_exc) + lwz r12,0(r11) + mfspr r10,SPRN_SPRG_TLB_EXFRAME + add r10,r10,r12 + mtspr SPRN_SPRG_TLB_EXFRAME,r10 + + /* + * Save registers needed to allow nesting of certain exceptions + * (such as TLB misses) inside special exception levels + */ + mfspr r10,SPRN_SRR0 + SPECIAL_EXC_STORE(r10,SRR0) + mfspr r10,SPRN_SRR1 + SPECIAL_EXC_STORE(r10,SRR1) + mfspr r10,SPRN_SPRG_GEN_SCRATCH + SPECIAL_EXC_STORE(r10,SPRG_GEN) + mfspr r10,SPRN_SPRG_TLB_SCRATCH + SPECIAL_EXC_STORE(r10,SPRG_TLB) + mfspr r10,SPRN_MAS0 + SPECIAL_EXC_STORE(r10,MAS0) + mfspr r10,SPRN_MAS1 + SPECIAL_EXC_STORE(r10,MAS1) + mfspr r10,SPRN_MAS2 + SPECIAL_EXC_STORE(r10,MAS2) + mfspr r10,SPRN_MAS3 + SPECIAL_EXC_STORE(r10,MAS3) + mfspr r10,SPRN_MAS6 + SPECIAL_EXC_STORE(r10,MAS6) + mfspr r10,SPRN_MAS7 + SPECIAL_EXC_STORE(r10,MAS7) +BEGIN_FTR_SECTION + mfspr r10,SPRN_MAS5 + SPECIAL_EXC_STORE(r10,MAS5) + mfspr r10,SPRN_MAS8 + SPECIAL_EXC_STORE(r10,MAS8) + + /* MAS5/8 could have inappropriate values if we interrupted KVM code */ + li r10,0 + mtspr SPRN_MAS5,r10 + mtspr SPRN_MAS8,r10 +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) + SPECIAL_EXC_STORE(r9,IRQHAPPENED) + + mfspr r10,SPRN_DEAR + SPECIAL_EXC_STORE(r10,DEAR) + mfspr r10,SPRN_ESR + SPECIAL_EXC_STORE(r10,ESR) + + lbz r10,PACASOFTIRQEN(r13) + SPECIAL_EXC_STORE(r10,SOFTE) + ld r10,_NIP(r1) + SPECIAL_EXC_STORE(r10,CSRR0) + ld r10,_MSR(r1) + SPECIAL_EXC_STORE(r10,CSRR1) + + blr + +ret_from_level_except: + ld r3,_MSR(r1) + andi. r3,r3,MSR_PR + beq 1f + b ret_from_except +1: + + LOAD_REG_ADDR(r11,extlb_level_exc) + lwz r12,0(r11) + mfspr r10,SPRN_SPRG_TLB_EXFRAME + sub r10,r10,r12 + mtspr SPRN_SPRG_TLB_EXFRAME,r10 + + /* + * It's possible that the special level exception interrupted a + * TLB miss handler, and inserted the same entry that the + * interrupted handler was about to insert. On CPUs without TLB + * write conditional, this can result in a duplicate TLB entry. + * Wipe all non-bolted entries to be safe. + * + * Note that this doesn't protect against any TLB misses + * we may take accessing the stack from here to the end of + * the special level exception. It's not clear how we can + * reasonably protect against that, but only CPUs with + * neither TLB write conditional nor bolted kernel memory + * are affected. Do any such CPUs even exist? + */ + PPC_TLBILX_ALL(0,R0) + + REST_NVGPRS(r1) + + SPECIAL_EXC_LOAD(r10,SRR0) + mtspr SPRN_SRR0,r10 + SPECIAL_EXC_LOAD(r10,SRR1) + mtspr SPRN_SRR1,r10 + SPECIAL_EXC_LOAD(r10,SPRG_GEN) + mtspr SPRN_SPRG_GEN_SCRATCH,r10 + SPECIAL_EXC_LOAD(r10,SPRG_TLB) + mtspr SPRN_SPRG_TLB_SCRATCH,r10 + SPECIAL_EXC_LOAD(r10,MAS0) + mtspr SPRN_MAS0,r10 + SPECIAL_EXC_LOAD(r10,MAS1) + mtspr SPRN_MAS1,r10 + SPECIAL_EXC_LOAD(r10,MAS2) + mtspr SPRN_MAS2,r10 + SPECIAL_EXC_LOAD(r10,MAS3) + mtspr SPRN_MAS3,r10 + SPECIAL_EXC_LOAD(r10,MAS6) + mtspr SPRN_MAS6,r10 + SPECIAL_EXC_LOAD(r10,MAS7) + mtspr SPRN_MAS7,r10 +BEGIN_FTR_SECTION + SPECIAL_EXC_LOAD(r10,MAS5) + mtspr SPRN_MAS5,r10 + SPECIAL_EXC_LOAD(r10,MAS8) + mtspr SPRN_MAS8,r10 +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) + + lbz r6,PACASOFTIRQEN(r13) + ld r5,SOFTE(r1) + + /* Interrupts had better not already be enabled... */ + twnei r6,0 + + cmpwi cr0,r5,0 + beq 1f + + TRACE_ENABLE_INTS + stb r5,PACASOFTIRQEN(r13) +1: + /* + * Restore PACAIRQHAPPENED rather than setting it based on + * the return MSR[EE], since we could have interrupted + * __check_irq_replay() or other inconsistent transitory + * states that must remain that way. + */ + SPECIAL_EXC_LOAD(r10,IRQHAPPENED) + stb r10,PACAIRQHAPPENED(r13) + + SPECIAL_EXC_LOAD(r10,DEAR) + mtspr SPRN_DEAR,r10 + SPECIAL_EXC_LOAD(r10,ESR) + mtspr SPRN_ESR,r10 + + stdcx. r0,0,r1 /* to clear the reservation */ + + REST_4GPRS(2, r1) + REST_4GPRS(6, r1) + + ld r10,_CTR(r1) + ld r11,_XER(r1) + mtctr r10 + mtxer r11 + + blr + +.macro ret_from_level srr0 srr1 paca_ex scratch + bl ret_from_level_except + + ld r10,_LINK(r1) + ld r11,_CCR(r1) + ld r0,GPR13(r1) + mtlr r10 + mtcr r11 + + ld r10,GPR10(r1) + ld r11,GPR11(r1) + ld r12,GPR12(r1) + mtspr \scratch,r0 + + std r10,\paca_ex+EX_R10(r13); + std r11,\paca_ex+EX_R11(r13); + ld r10,_NIP(r1) + ld r11,_MSR(r1) + ld r0,GPR0(r1) + ld r1,GPR1(r1) + mtspr \srr0,r10 + mtspr \srr1,r11 + ld r10,\paca_ex+EX_R10(r13) + ld r11,\paca_ex+EX_R11(r13) + mfspr r13,\scratch +.endm + +ret_from_crit_except: + ret_from_level SPRN_CSRR0 SPRN_CSRR1 PACA_EXCRIT SPRN_SPRG_CRIT_SCRATCH + rfci + +ret_from_mc_except: + ret_from_level SPRN_MCSRR0 SPRN_MCSRR1 PACA_EXMC SPRN_SPRG_MC_SCRATCH + rfmci /* Exception prolog code for all exceptions */ #define EXCEPTION_PROLOG(n, intnum, type, addition) \ @@ -42,7 +285,6 @@ mfspr r13,SPRN_SPRG_PACA; /* get PACA */ \ std r10,PACA_EX##type+EX_R10(r13); \ std r11,PACA_EX##type+EX_R11(r13); \ - PROLOG_STORE_RESTORE_SCRATCH_##type; \ mfcr r10; /* save CR */ \ mfspr r11,SPRN_##type##_SRR1;/* what are we coming from */ \ DO_KVM intnum,SPRN_##type##_SRR1; /* KVM hook */ \ @@ -69,19 +311,19 @@ #define CRIT_SET_KSTACK \ ld r1,PACA_CRIT_STACK(r13); \ - subi r1,r1,SPECIAL_EXC_FRAME_SIZE; + subi r1,r1,SPECIAL_EXC_FRAME_SIZE #define SPRN_CRIT_SRR0 SPRN_CSRR0 #define SPRN_CRIT_SRR1 SPRN_CSRR1 #define DBG_SET_KSTACK \ ld r1,PACA_DBG_STACK(r13); \ - subi r1,r1,SPECIAL_EXC_FRAME_SIZE; + subi r1,r1,SPECIAL_EXC_FRAME_SIZE #define SPRN_DBG_SRR0 SPRN_DSRR0 #define SPRN_DBG_SRR1 SPRN_DSRR1 #define MC_SET_KSTACK \ ld r1,PACA_MC_STACK(r13); \ - subi r1,r1,SPECIAL_EXC_FRAME_SIZE; + subi r1,r1,SPECIAL_EXC_FRAME_SIZE #define SPRN_MC_SRR0 SPRN_MCSRR0 #define SPRN_MC_SRR1 SPRN_MCSRR1 @@ -100,20 +342,6 @@ #define GDBELL_EXCEPTION_PROLOG(n, intnum, addition) \ EXCEPTION_PROLOG(n, intnum, GDBELL, addition##_GDBELL(n)) -/* - * Store user-visible scratch in PACA exception slots and restore proper value - */ -#define PROLOG_STORE_RESTORE_SCRATCH_GEN -#define PROLOG_STORE_RESTORE_SCRATCH_GDBELL -#define PROLOG_STORE_RESTORE_SCRATCH_DBG -#define PROLOG_STORE_RESTORE_SCRATCH_MC - -#define PROLOG_STORE_RESTORE_SCRATCH_CRIT \ - mfspr r10,SPRN_SPRG_CRIT_SCRATCH; /* get r13 */ \ - std r10,PACA_EXCRIT+EX_R13(r13); \ - ld r11,PACA_SPRG3(r13); \ - mtspr SPRN_SPRG_CRIT_SCRATCH,r11; - /* Variants of the "addition" argument for the prolog */ #define PROLOG_ADDITION_NONE_GEN(n) @@ -147,10 +375,8 @@ std r15,PACA_EXMC+EX_R15(r13) -/* Core exception code for all exceptions except TLB misses. - * XXX: Needs to make SPRN_SPRG_GEN depend on exception type - */ -#define EXCEPTION_COMMON(n, excf, ints) \ +/* Core exception code for all exceptions except TLB misses. */ +#define EXCEPTION_COMMON_LVL(n, scratch, excf) \ exc_##n##_common: \ std r0,GPR0(r1); /* save r0 in stackframe */ \ std r2,GPR2(r1); /* save r2 in stackframe */ \ @@ -159,10 +385,11 @@ exc_##n##_common: \ std r9,GPR9(r1); /* save r9 in stackframe */ \ std r10,_NIP(r1); /* save SRR0 to stackframe */ \ std r11,_MSR(r1); /* save SRR1 to stackframe */ \ + beq 2f; /* if from kernel mode */ \ ACCOUNT_CPU_USER_ENTRY(r10,r11);/* accounting (uses cr0+eq) */ \ - ld r3,excf+EX_R10(r13); /* get back r10 */ \ +2: ld r3,excf+EX_R10(r13); /* get back r10 */ \ ld r4,excf+EX_R11(r13); /* get back r11 */ \ - mfspr r5,SPRN_SPRG_GEN_SCRATCH;/* get back r13 */ \ + mfspr r5,scratch; /* get back r13 */ \ std r12,GPR12(r1); /* save r12 in stackframe */ \ ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \ mflr r6; /* save LR in stackframe */ \ @@ -186,24 +413,29 @@ exc_##n##_common: \ std r11,SOFTE(r1); /* and save it to stackframe */ \ std r12,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ \ std r3,_TRAP(r1); /* set trap number */ \ - std r0,RESULT(r1); /* clear regs->result */ \ - ints; + std r0,RESULT(r1); /* clear regs->result */ -/* Variants for the "ints" argument. This one does nothing when we want - * to keep interrupts in their original state - */ -#define INTS_KEEP +#define EXCEPTION_COMMON(n) \ + EXCEPTION_COMMON_LVL(n, SPRN_SPRG_GEN_SCRATCH, PACA_EXGEN) +#define EXCEPTION_COMMON_CRIT(n) \ + EXCEPTION_COMMON_LVL(n, SPRN_SPRG_CRIT_SCRATCH, PACA_EXCRIT) +#define EXCEPTION_COMMON_MC(n) \ + EXCEPTION_COMMON_LVL(n, SPRN_SPRG_MC_SCRATCH, PACA_EXMC) +#define EXCEPTION_COMMON_DBG(n) \ + EXCEPTION_COMMON_LVL(n, SPRN_SPRG_DBG_SCRATCH, PACA_EXDBG) -/* This second version is meant for exceptions that don't immediately - * hard-enable. We set a bit in paca->irq_happened to ensure that - * a subsequent call to arch_local_irq_restore() will properly - * hard-enable and avoid the fast-path +/* + * This is meant for exceptions that don't immediately hard-enable. We + * set a bit in paca->irq_happened to ensure that a subsequent call to + * arch_local_irq_restore() will properly hard-enable and avoid the + * fast-path, and then reconcile irq state. */ -#define INTS_DISABLE SOFT_DISABLE_INTS(r3,r4) +#define INTS_DISABLE RECONCILE_IRQ_STATE(r3,r4) -/* This is called by exceptions that used INTS_KEEP (that did not touch - * irq indicators in the PACA). This will restore MSR:EE to it's previous - * value +/* + * This is called by exceptions that don't use INTS_DISABLE (that did not + * touch irq indicators in the PACA). This will restore MSR:EE to it's + * previous value * * XXX In the long run, we may want to open-code it in order to separate the * load from the wrtee, thus limiting the latency caused by the dependency @@ -261,12 +493,13 @@ exc_##n##_bad_stack: \ #define MASKABLE_EXCEPTION(trapnum, intnum, label, hdlr, ack) \ START_EXCEPTION(label); \ NORMAL_EXCEPTION_PROLOG(trapnum, intnum, PROLOG_ADDITION_MASKABLE)\ - EXCEPTION_COMMON(trapnum, PACA_EXGEN, INTS_DISABLE) \ + EXCEPTION_COMMON(trapnum) \ + INTS_DISABLE; \ ack(r8); \ CHECK_NAPPING(); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ bl hdlr; \ - b .ret_from_except_lite; + b ret_from_except_lite; /* This value is used to mark exception frames on the stack. */ .section ".toc","aw" @@ -282,8 +515,8 @@ exception_marker: .balign 0x1000 .globl interrupt_base_book3e interrupt_base_book3e: /* fake trap */ - EXCEPTION_STUB(0x000, machine_check) /* 0x0200 */ - EXCEPTION_STUB(0x020, critical_input) /* 0x0580 */ + EXCEPTION_STUB(0x000, machine_check) + EXCEPTION_STUB(0x020, critical_input) /* 0x0100 */ EXCEPTION_STUB(0x040, debug_crit) /* 0x0d00 */ EXCEPTION_STUB(0x060, data_storage) /* 0x0300 */ EXCEPTION_STUB(0x080, instruction_storage) /* 0x0400 */ @@ -298,6 +531,8 @@ interrupt_base_book3e: /* fake trap */ EXCEPTION_STUB(0x1a0, watchdog) /* 0x09f0 */ EXCEPTION_STUB(0x1c0, data_tlb_miss) EXCEPTION_STUB(0x1e0, instruction_tlb_miss) + EXCEPTION_STUB(0x200, altivec_unavailable) + EXCEPTION_STUB(0x220, altivec_assist) EXCEPTION_STUB(0x260, perfmon) EXCEPTION_STUB(0x280, doorbell) EXCEPTION_STUB(0x2a0, doorbell_crit) @@ -305,6 +540,7 @@ interrupt_base_book3e: /* fake trap */ EXCEPTION_STUB(0x2e0, guest_doorbell_crit) EXCEPTION_STUB(0x300, hypercall) EXCEPTION_STUB(0x320, ehpriv) + EXCEPTION_STUB(0x340, lrat_error) .globl interrupt_end_book3e interrupt_end_book3e: @@ -313,25 +549,25 @@ interrupt_end_book3e: START_EXCEPTION(critical_input); CRIT_EXCEPTION_PROLOG(0x100, BOOKE_INTERRUPT_CRITICAL, PROLOG_ADDITION_NONE) -// EXCEPTION_COMMON(0x100, PACA_EXCRIT, INTS_DISABLE) -// bl special_reg_save_crit -// CHECK_NAPPING(); -// addi r3,r1,STACK_FRAME_OVERHEAD -// bl .critical_exception -// b ret_from_crit_except - b . + EXCEPTION_COMMON_CRIT(0x100) + bl save_nvgprs + bl special_reg_save + CHECK_NAPPING(); + addi r3,r1,STACK_FRAME_OVERHEAD + bl unknown_exception + b ret_from_crit_except /* Machine Check Interrupt */ START_EXCEPTION(machine_check); - MC_EXCEPTION_PROLOG(0x200, BOOKE_INTERRUPT_MACHINE_CHECK, + MC_EXCEPTION_PROLOG(0x000, BOOKE_INTERRUPT_MACHINE_CHECK, PROLOG_ADDITION_NONE) -// EXCEPTION_COMMON(0x200, PACA_EXMC, INTS_DISABLE) -// bl special_reg_save_mc -// addi r3,r1,STACK_FRAME_OVERHEAD -// CHECK_NAPPING(); -// bl .machine_check_exception -// b ret_from_mc_except - b . + EXCEPTION_COMMON_MC(0x000) + bl save_nvgprs + bl special_reg_save + CHECK_NAPPING(); + addi r3,r1,STACK_FRAME_OVERHEAD + bl machine_check_exception + b ret_from_mc_except /* Data Storage Interrupt */ START_EXCEPTION(data_storage) @@ -339,7 +575,8 @@ interrupt_end_book3e: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR - EXCEPTION_COMMON(0x300, PACA_EXGEN, INTS_DISABLE) + EXCEPTION_COMMON(0x300) + INTS_DISABLE b storage_fault_common /* Instruction Storage Interrupt */ @@ -348,12 +585,13 @@ interrupt_end_book3e: PROLOG_ADDITION_2REGS) li r15,0 mr r14,r10 - EXCEPTION_COMMON(0x400, PACA_EXGEN, INTS_DISABLE) + EXCEPTION_COMMON(0x400) + INTS_DISABLE b storage_fault_common /* External Input Interrupt */ MASKABLE_EXCEPTION(0x500, BOOKE_INTERRUPT_EXTERNAL, - external_input, .do_IRQ, ACK_NONE) + external_input, do_IRQ, ACK_NONE) /* Alignment */ START_EXCEPTION(alignment); @@ -361,7 +599,7 @@ interrupt_end_book3e: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR - EXCEPTION_COMMON(0x600, PACA_EXGEN, INTS_KEEP) + EXCEPTION_COMMON(0x600) b alignment_more /* no room, go out of line */ /* Program Interrupt */ @@ -369,50 +607,96 @@ interrupt_end_book3e: NORMAL_EXCEPTION_PROLOG(0x700, BOOKE_INTERRUPT_PROGRAM, PROLOG_ADDITION_1REG) mfspr r14,SPRN_ESR - EXCEPTION_COMMON(0x700, PACA_EXGEN, INTS_DISABLE) + EXCEPTION_COMMON(0x700) + INTS_DISABLE std r14,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD ld r14,PACA_EXGEN+EX_R14(r13) - bl .save_nvgprs - bl .program_check_exception - b .ret_from_except + bl save_nvgprs + bl program_check_exception + b ret_from_except /* Floating Point Unavailable Interrupt */ START_EXCEPTION(fp_unavailable); NORMAL_EXCEPTION_PROLOG(0x800, BOOKE_INTERRUPT_FP_UNAVAIL, PROLOG_ADDITION_NONE) /* we can probably do a shorter exception entry for that one... */ - EXCEPTION_COMMON(0x800, PACA_EXGEN, INTS_KEEP) + EXCEPTION_COMMON(0x800) ld r12,_MSR(r1) andi. r0,r12,MSR_PR; beq- 1f - bl .load_up_fpu + bl load_up_fpu b fast_exception_return 1: INTS_DISABLE - bl .save_nvgprs + bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD - bl .kernel_fp_unavailable_exception - b .ret_from_except + bl kernel_fp_unavailable_exception + b ret_from_except + +/* Altivec Unavailable Interrupt */ + START_EXCEPTION(altivec_unavailable); + NORMAL_EXCEPTION_PROLOG(0x200, BOOKE_INTERRUPT_SPE_ALTIVEC_UNAVAIL, + PROLOG_ADDITION_NONE) + /* we can probably do a shorter exception entry for that one... */ + EXCEPTION_COMMON(0x200) +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + ld r12,_MSR(r1) + andi. r0,r12,MSR_PR; + beq- 1f + bl load_up_altivec + b fast_exception_return +1: +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif + INTS_DISABLE + bl save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + bl altivec_unavailable_exception + b ret_from_except + +/* AltiVec Assist */ + START_EXCEPTION(altivec_assist); + NORMAL_EXCEPTION_PROLOG(0x220, + BOOKE_INTERRUPT_SPE_FP_DATA_ALTIVEC_ASSIST, + PROLOG_ADDITION_NONE) + EXCEPTION_COMMON(0x220) + INTS_DISABLE + bl save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + bl altivec_assist_exception +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#else + bl unknown_exception +#endif + b ret_from_except + /* Decrementer Interrupt */ MASKABLE_EXCEPTION(0x900, BOOKE_INTERRUPT_DECREMENTER, - decrementer, .timer_interrupt, ACK_DEC) + decrementer, timer_interrupt, ACK_DEC) /* Fixed Interval Timer Interrupt */ MASKABLE_EXCEPTION(0x980, BOOKE_INTERRUPT_FIT, - fixed_interval, .unknown_exception, ACK_FIT) + fixed_interval, unknown_exception, ACK_FIT) /* Watchdog Timer Interrupt */ START_EXCEPTION(watchdog); CRIT_EXCEPTION_PROLOG(0x9f0, BOOKE_INTERRUPT_WATCHDOG, PROLOG_ADDITION_NONE) -// EXCEPTION_COMMON(0x9f0, PACA_EXCRIT, INTS_DISABLE) -// bl special_reg_save_crit -// CHECK_NAPPING(); -// addi r3,r1,STACK_FRAME_OVERHEAD -// bl .unknown_exception -// b ret_from_crit_except - b . + EXCEPTION_COMMON_CRIT(0x9f0) + bl save_nvgprs + bl special_reg_save + CHECK_NAPPING(); + addi r3,r1,STACK_FRAME_OVERHEAD +#ifdef CONFIG_BOOKE_WDT + bl WatchdogException +#else + bl unknown_exception +#endif + b ret_from_crit_except /* System Call Interrupt */ START_EXCEPTION(system_call) @@ -426,11 +710,12 @@ interrupt_end_book3e: START_EXCEPTION(ap_unavailable); NORMAL_EXCEPTION_PROLOG(0xf20, BOOKE_INTERRUPT_AP_UNAVAIL, PROLOG_ADDITION_NONE) - EXCEPTION_COMMON(0xf20, PACA_EXGEN, INTS_DISABLE) - bl .save_nvgprs + EXCEPTION_COMMON(0xf20) + INTS_DISABLE + bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD - bl .unknown_exception - b .ret_from_except + bl unknown_exception + b ret_from_except /* Debug exception as a critical interrupt*/ START_EXCEPTION(debug_crit); @@ -447,7 +732,7 @@ interrupt_end_book3e: */ mfspr r14,SPRN_DBSR /* check single-step/branch taken */ - andis. r15,r14,DBSR_IC@h + andis. r15,r14,(DBSR_IC|DBSR_BT)@h beq+ 1f LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e) @@ -458,7 +743,7 @@ interrupt_end_book3e: bge+ cr1,1f /* here it looks like we got an inappropriate debug exception. */ - lis r14,DBSR_IC@h /* clear the IC event */ + lis r14,(DBSR_IC|DBSR_BT)@h /* clear the event */ rlwinm r11,r11,0,~MSR_DE /* clear DE in the CSRR1 value */ mtspr SPRN_DBSR,r14 mtspr SPRN_CSRR1,r11 @@ -469,7 +754,7 @@ interrupt_end_book3e: mtcr r10 ld r10,PACA_EXCRIT+EX_R10(r13) /* restore registers */ ld r11,PACA_EXCRIT+EX_R11(r13) - ld r13,PACA_EXCRIT+EX_R13(r13) + mfspr r13,SPRN_SPRG_CRIT_SCRATCH rfci /* Normal debug exception */ @@ -482,18 +767,16 @@ interrupt_end_book3e: /* Now we mash up things to make it look like we are coming on a * normal exception */ - ld r15,PACA_EXCRIT+EX_R13(r13) - mtspr SPRN_SPRG_GEN_SCRATCH,r15 mfspr r14,SPRN_DBSR - EXCEPTION_COMMON(0xd00, PACA_EXCRIT, INTS_DISABLE) + EXCEPTION_COMMON_CRIT(0xd00) std r14,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD mr r4,r14 ld r14,PACA_EXCRIT+EX_R14(r13) ld r15,PACA_EXCRIT+EX_R15(r13) - bl .save_nvgprs - bl .DebugException - b .ret_from_except + bl save_nvgprs + bl DebugException + b ret_from_except kernel_dbg_exc: b . /* NYI */ @@ -513,7 +796,7 @@ kernel_dbg_exc: */ mfspr r14,SPRN_DBSR /* check single-step/branch taken */ - andis. r15,r14,DBSR_IC@h + andis. r15,r14,(DBSR_IC|DBSR_BT)@h beq+ 1f LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e) @@ -524,7 +807,7 @@ kernel_dbg_exc: bge+ cr1,1f /* here it looks like we got an inappropriate debug exception. */ - lis r14,DBSR_IC@h /* clear the IC event */ + lis r14,(DBSR_IC|DBSR_BT)@h /* clear the event */ rlwinm r11,r11,0,~MSR_DE /* clear DE in the DSRR1 value */ mtspr SPRN_DBSR,r14 mtspr SPRN_DSRR1,r11 @@ -548,42 +831,43 @@ kernel_dbg_exc: /* Now we mash up things to make it look like we are coming on a * normal exception */ - mfspr r15,SPRN_SPRG_DBG_SCRATCH - mtspr SPRN_SPRG_GEN_SCRATCH,r15 mfspr r14,SPRN_DBSR - EXCEPTION_COMMON(0xd08, PACA_EXDBG, INTS_DISABLE) + EXCEPTION_COMMON_DBG(0xd08) + INTS_DISABLE std r14,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD mr r4,r14 ld r14,PACA_EXDBG+EX_R14(r13) ld r15,PACA_EXDBG+EX_R15(r13) - bl .save_nvgprs - bl .DebugException - b .ret_from_except + bl save_nvgprs + bl DebugException + b ret_from_except START_EXCEPTION(perfmon); NORMAL_EXCEPTION_PROLOG(0x260, BOOKE_INTERRUPT_PERFORMANCE_MONITOR, PROLOG_ADDITION_NONE) - EXCEPTION_COMMON(0x260, PACA_EXGEN, INTS_DISABLE) + EXCEPTION_COMMON(0x260) + INTS_DISABLE + CHECK_NAPPING() addi r3,r1,STACK_FRAME_OVERHEAD - bl .performance_monitor_exception - b .ret_from_except_lite + bl performance_monitor_exception + b ret_from_except_lite /* Doorbell interrupt */ MASKABLE_EXCEPTION(0x280, BOOKE_INTERRUPT_DOORBELL, - doorbell, .doorbell_exception, ACK_NONE) + doorbell, doorbell_exception, ACK_NONE) /* Doorbell critical Interrupt */ START_EXCEPTION(doorbell_crit); CRIT_EXCEPTION_PROLOG(0x2a0, BOOKE_INTERRUPT_DOORBELL_CRITICAL, PROLOG_ADDITION_NONE) -// EXCEPTION_COMMON(0x2a0, PACA_EXCRIT, INTS_DISABLE) -// bl special_reg_save_crit -// CHECK_NAPPING(); -// addi r3,r1,STACK_FRAME_OVERHEAD -// bl .doorbell_critical_exception -// b ret_from_crit_except - b . + EXCEPTION_COMMON_CRIT(0x2a0) + bl save_nvgprs + bl special_reg_save + CHECK_NAPPING(); + addi r3,r1,STACK_FRAME_OVERHEAD + bl unknown_exception + b ret_from_crit_except /* * Guest doorbell interrupt @@ -592,41 +876,52 @@ kernel_dbg_exc: START_EXCEPTION(guest_doorbell); GDBELL_EXCEPTION_PROLOG(0x2c0, BOOKE_INTERRUPT_GUEST_DBELL, PROLOG_ADDITION_NONE) - EXCEPTION_COMMON(0x2c0, PACA_EXGEN, INTS_KEEP) + EXCEPTION_COMMON(0x2c0) addi r3,r1,STACK_FRAME_OVERHEAD - bl .save_nvgprs + bl save_nvgprs INTS_RESTORE_HARD - bl .unknown_exception - b .ret_from_except + bl unknown_exception + b ret_from_except /* Guest Doorbell critical Interrupt */ START_EXCEPTION(guest_doorbell_crit); CRIT_EXCEPTION_PROLOG(0x2e0, BOOKE_INTERRUPT_GUEST_DBELL_CRIT, PROLOG_ADDITION_NONE) -// EXCEPTION_COMMON(0x2e0, PACA_EXCRIT, INTS_DISABLE) -// bl special_reg_save_crit -// CHECK_NAPPING(); -// addi r3,r1,STACK_FRAME_OVERHEAD -// bl .guest_doorbell_critical_exception -// b ret_from_crit_except - b . + EXCEPTION_COMMON_CRIT(0x2e0) + bl save_nvgprs + bl special_reg_save + CHECK_NAPPING(); + addi r3,r1,STACK_FRAME_OVERHEAD + bl unknown_exception + b ret_from_crit_except /* Hypervisor call */ START_EXCEPTION(hypercall); NORMAL_EXCEPTION_PROLOG(0x310, BOOKE_INTERRUPT_HV_SYSCALL, PROLOG_ADDITION_NONE) - EXCEPTION_COMMON(0x310, PACA_EXGEN, INTS_KEEP) + EXCEPTION_COMMON(0x310) addi r3,r1,STACK_FRAME_OVERHEAD - bl .save_nvgprs + bl save_nvgprs INTS_RESTORE_HARD - bl .unknown_exception - b .ret_from_except + bl unknown_exception + b ret_from_except /* Embedded Hypervisor priviledged */ START_EXCEPTION(ehpriv); NORMAL_EXCEPTION_PROLOG(0x320, BOOKE_INTERRUPT_HV_PRIV, PROLOG_ADDITION_NONE) - EXCEPTION_COMMON(0x320, PACA_EXGEN, INTS_KEEP) + EXCEPTION_COMMON(0x320) + addi r3,r1,STACK_FRAME_OVERHEAD + bl save_nvgprs + INTS_RESTORE_HARD + bl unknown_exception + b ret_from_except + +/* LRAT Error interrupt */ + START_EXCEPTION(lrat_error); + NORMAL_EXCEPTION_PROLOG(0x340, BOOKE_INTERRUPT_LRAT_ERROR, + PROLOG_ADDITION_NONE) + EXCEPTION_COMMON(0x340) addi r3,r1,STACK_FRAME_OVERHEAD bl .save_nvgprs INTS_RESTORE_HARD @@ -719,16 +1014,16 @@ storage_fault_common: mr r5,r15 ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) - bl .do_page_fault + bl do_page_fault cmpdi r3,0 bne- 1f - b .ret_from_except_lite -1: bl .save_nvgprs + b ret_from_except_lite +1: bl save_nvgprs mr r5,r3 addi r3,r1,STACK_FRAME_OVERHEAD ld r4,_DAR(r1) - bl .bad_page_fault - b .ret_from_except + bl bad_page_fault + b ret_from_except /* * Alignment exception doesn't fit entirely in the 0x100 bytes so it @@ -740,10 +1035,10 @@ alignment_more: addi r3,r1,STACK_FRAME_OVERHEAD ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) - bl .save_nvgprs + bl save_nvgprs INTS_RESTORE_HARD - bl .alignment_exception - b .ret_from_except + bl alignment_exception + b ret_from_except /* * We branch here from entry_64.S for the last stage of the exception @@ -806,6 +1101,7 @@ fast_exception_return: BAD_STACK_TRAMPOLINE(0x000) BAD_STACK_TRAMPOLINE(0x100) BAD_STACK_TRAMPOLINE(0x200) +BAD_STACK_TRAMPOLINE(0x220) BAD_STACK_TRAMPOLINE(0x260) BAD_STACK_TRAMPOLINE(0x280) BAD_STACK_TRAMPOLINE(0x2a0) @@ -814,6 +1110,7 @@ BAD_STACK_TRAMPOLINE(0x2e0) BAD_STACK_TRAMPOLINE(0x300) BAD_STACK_TRAMPOLINE(0x310) BAD_STACK_TRAMPOLINE(0x320) +BAD_STACK_TRAMPOLINE(0x340) BAD_STACK_TRAMPOLINE(0x400) BAD_STACK_TRAMPOLINE(0x500) BAD_STACK_TRAMPOLINE(0x600) @@ -875,7 +1172,7 @@ bad_stack_book3e: std r12,0(r11) ld r2,PACATOC(r13) 1: addi r3,r1,STACK_FRAME_OVERHEAD - bl .kernel_bad_stack + bl kernel_bad_stack b 1b /* @@ -1010,12 +1307,9 @@ skpinv: addi r6,r6,1 /* Increment */ mtspr SPRN_MAS0,r3 tlbre mfspr r6,SPRN_MAS1 - rlwinm r6,r6,0,2,0 /* clear IPROT */ + rlwinm r6,r6,0,2,31 /* clear IPROT and VALID */ mtspr SPRN_MAS1,r6 tlbwe - - /* Invalidate TLB1 */ - PPC_TLBILX_ALL(0,R0) sync isync @@ -1069,12 +1363,9 @@ skpinv: addi r6,r6,1 /* Increment */ mtspr SPRN_MAS0,r4 tlbre mfspr r5,SPRN_MAS1 - rlwinm r5,r5,0,2,0 /* clear IPROT */ + rlwinm r5,r5,0,2,31 /* clear IPROT and VALID */ mtspr SPRN_MAS1,r5 tlbwe - - /* Invalidate TLB1 */ - PPC_TLBILX_ALL(0,R0) sync isync @@ -1176,22 +1467,6 @@ a2_tlbinit_after_linear_map: .globl a2_tlbinit_after_iprot_flush a2_tlbinit_after_iprot_flush: -#ifdef CONFIG_PPC_EARLY_DEBUG_WSP - /* Now establish early debug mappings if applicable */ - /* Restore the MAS0 we used for linear mapping load */ - mtspr SPRN_MAS0,r11 - - lis r3,(MAS1_VALID | MAS1_IPROT)@h - ori r3,r3,(BOOK3E_PAGESZ_4K << MAS1_TSIZE_SHIFT) - mtspr SPRN_MAS1,r3 - LOAD_REG_IMMEDIATE(r3, WSP_UART_VIRT | MAS2_I | MAS2_G) - mtspr SPRN_MAS2,r3 - LOAD_REG_IMMEDIATE(r3, WSP_UART_PHYS | MAS3_SR | MAS3_SW) - mtspr SPRN_MAS7_MAS3,r3 - /* re-use the MAS8 value from the linear mapping */ - tlbwe -#endif /* CONFIG_PPC_EARLY_DEBUG_WSP */ - PPC_TLBILX(0,0,R0) sync isync @@ -1230,13 +1505,13 @@ _GLOBAL(start_initialization_book3e) * and always use AS 0, so we just set it up to match our link * address and never use 0 based addresses. */ - bl .initial_tlb_book3e + bl initial_tlb_book3e /* Init global core bits */ - bl .init_core_book3e + bl init_core_book3e /* Init per-thread bits */ - bl .init_thread_book3e + bl init_thread_book3e /* Return to common init code */ tovirt(r28,r28) @@ -1257,7 +1532,7 @@ _GLOBAL(start_initialization_book3e) */ _GLOBAL(book3e_secondary_core_init_tlb_set) li r4,1 - b .generic_secondary_smp_init + b generic_secondary_smp_init _GLOBAL(book3e_secondary_core_init) mflr r28 @@ -1267,18 +1542,18 @@ _GLOBAL(book3e_secondary_core_init) bne 2f /* Setup TLB for this core */ - bl .initial_tlb_book3e + bl initial_tlb_book3e /* We can return from the above running at a different * address, so recalculate r2 (TOC) */ - bl .relative_toc + bl relative_toc /* Init global core bits */ -2: bl .init_core_book3e +2: bl init_core_book3e /* Init per-thread bits */ -3: bl .init_thread_book3e +3: bl init_thread_book3e /* Return to common init code at proper virtual address. * @@ -1305,14 +1580,14 @@ _GLOBAL(book3e_secondary_thread_init) mflr r28 b 3b -_STATIC(init_core_book3e) +init_core_book3e: /* Establish the interrupt vector base */ LOAD_REG_IMMEDIATE(r3, interrupt_base_book3e) mtspr SPRN_IVPR,r3 sync blr -_STATIC(init_thread_book3e) +init_thread_book3e: lis r3,(SPRN_EPCR_ICM | SPRN_EPCR_GICM)@h mtspr SPRN_EPCR,r3 @@ -1349,6 +1624,11 @@ _GLOBAL(__setup_base_ivors) blr +_GLOBAL(setup_altivec_ivors) + SET_IVOR(32, 0x200) /* AltiVec Unavailable */ + SET_IVOR(33, 0x220) /* AltiVec Assist */ + blr + _GLOBAL(setup_perfmon_ivor) SET_IVOR(35, 0x260) /* Performance Monitor */ blr @@ -1364,3 +1644,7 @@ _GLOBAL(setup_ehv_ivors) SET_IVOR(38, 0x2c0) /* Guest Processor Doorbell */ SET_IVOR(39, 0x2e0) /* Guest Processor Doorbell Crit/MC */ blr + +_GLOBAL(setup_lrat_ivor) + SET_IVOR(42, 0x340) /* LRAT Error */ + blr diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 4665e82fa37..a7d36b19221 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -54,14 +54,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ xori r12,r12,MSR_LE ; \ mtspr SPRN_SRR1,r12 ; \ rfid ; /* return to userspace */ \ - b . ; \ -2: mfspr r12,SPRN_SRR1 ; \ - andi. r12,r12,MSR_PR ; \ - bne 0b ; \ - mtspr SPRN_SRR0,r3 ; \ - mtspr SPRN_SRR1,r4 ; \ - mtspr SPRN_SDR1,r5 ; \ - rfid ; \ b . ; /* prevent speculative execution */ #if defined(CONFIG_RELOCATABLE) @@ -74,13 +66,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ mflr r10 ; \ ld r12,PACAKBASE(r13) ; \ LOAD_HANDLER(r12, system_call_entry_direct) ; \ - mtlr r12 ; \ + mtctr r12 ; \ mfspr r12,SPRN_SRR1 ; \ /* Re-use of r13... No spare regs to do this */ \ li r13,MSR_RI ; \ mtmsrd r13,1 ; \ GET_PACA(r13) ; /* get r13 back */ \ - blr ; + bctr ; #else /* We can branch directly */ #define SYSCALL_PSERIES_2_DIRECT \ @@ -104,7 +96,7 @@ __start_interrupts: .globl system_reset_pSeries; system_reset_pSeries: - HMT_MEDIUM; + HMT_MEDIUM_PPR_DISCARD SET_SCRATCH0(r13) #ifdef CONFIG_PPC_P7_NAP BEGIN_FTR_SECTION @@ -121,12 +113,13 @@ BEGIN_FTR_SECTION cmpwi cr1,r13,2 /* Total loss of HV state is fatal, we could try to use the * PIR to locate a PACA, then use an emergency stack etc... - * but for now, let's just stay stuck here + * OPAL v3 based powernv platforms have new idle states + * which fall in this catagory. */ - bgt cr1,. + bgt cr1,8f GET_PACA(r13) -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE li r0,KVM_HWTHREAD_IN_KERNEL stb r0,HSTATE_HWTHREAD_STATE(r13) /* Order setting hwthread_state vs. testing hwthread_req */ @@ -139,8 +132,13 @@ BEGIN_FTR_SECTION #endif beq cr1,2f - b .power7_wakeup_noloss -2: b .power7_wakeup_loss + b power7_wakeup_noloss +2: b power7_wakeup_loss + + /* Fast Sleep wakeup on PowerNV */ +8: GET_PACA(r13) + b power7_wakeup_tb_loss + 9: END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #endif /* CONFIG_PPC_P7_NAP */ @@ -153,12 +151,42 @@ machine_check_pSeries_1: * some code path might still want to branch into the original * vector */ - b machine_check_pSeries + HMT_MEDIUM_PPR_DISCARD + SET_SCRATCH0(r13) /* save r13 */ +#ifdef CONFIG_PPC_P7_NAP +BEGIN_FTR_SECTION + /* Running native on arch 2.06 or later, check if we are + * waking up from nap. We only handle no state loss and + * supervisor state loss. We do -not- handle hypervisor + * state loss at this time. + */ + mfspr r13,SPRN_SRR1 + rlwinm. r13,r13,47-31,30,31 + OPT_GET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR) + beq 9f + + mfspr r13,SPRN_SRR1 + rlwinm. r13,r13,47-31,30,31 + /* waking up from powersave (nap) state */ + cmpwi cr1,r13,2 + /* Total loss of HV state is fatal. let's just stay stuck here */ + OPT_GET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR) + bgt cr1,. +9: + OPT_SET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR) +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) +#endif /* CONFIG_PPC_P7_NAP */ + EXCEPTION_PROLOG_0(PACA_EXMC) +BEGIN_FTR_SECTION + b machine_check_pSeries_early +FTR_SECTION_ELSE + b machine_check_pSeries_0 +ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) . = 0x300 .globl data_access_pSeries data_access_pSeries: - HMT_MEDIUM + HMT_MEDIUM_PPR_DISCARD SET_SCRATCH0(r13) BEGIN_FTR_SECTION b data_access_check_stab @@ -170,8 +198,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB) . = 0x380 .globl data_access_slb_pSeries data_access_slb_pSeries: - HMT_MEDIUM + HMT_MEDIUM_PPR_DISCARD SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380) std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_DAR @@ -182,16 +211,16 @@ data_access_slb_pSeries: #endif /* __DISABLED__ */ mfspr r12,SPRN_SRR1 #ifndef CONFIG_RELOCATABLE - b .slb_miss_realmode + b slb_miss_realmode #else /* - * We can't just use a direct branch to .slb_miss_realmode + * We can't just use a direct branch to slb_miss_realmode * because the distance from here to there depends on where * the kernel ends up being put. */ mfctr r11 ld r10,PACAKBASE(r13) - LOAD_HANDLER(r10, .slb_miss_realmode) + LOAD_HANDLER(r10, slb_miss_realmode) mtctr r10 bctr #endif @@ -201,8 +230,9 @@ data_access_slb_pSeries: . = 0x480 .globl instruction_access_slb_pSeries instruction_access_slb_pSeries: - HMT_MEDIUM + HMT_MEDIUM_PPR_DISCARD SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480) std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ @@ -213,11 +243,11 @@ instruction_access_slb_pSeries: #endif /* __DISABLED__ */ mfspr r12,SPRN_SRR1 #ifndef CONFIG_RELOCATABLE - b .slb_miss_realmode + b slb_miss_realmode #else mfctr r11 ld r10,PACAKBASE(r13) - LOAD_HANDLER(r10, .slb_miss_realmode) + LOAD_HANDLER(r10, slb_miss_realmode) mtctr r10 bctr #endif @@ -230,6 +260,7 @@ instruction_access_slb_pSeries: .globl hardware_interrupt_hv; hardware_interrupt_pSeries: hardware_interrupt_hv: + HMT_MEDIUM_PPR_DISCARD BEGIN_FTR_SECTION _MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt, EXC_HV, SOFTEN_TEST_HV) @@ -249,10 +280,14 @@ hardware_interrupt_hv: STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800) - MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer) + . = 0x900 + .globl decrementer_pSeries +decrementer_pSeries: + _MASKABLE_EXCEPTION_PSERIES(0x900, decrementer, EXC_STD, SOFTEN_TEST_PR) + STD_EXCEPTION_HV(0x980, 0x982, hdecrementer) - STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a) + MASKABLE_EXCEPTION_PSERIES(0xa00, 0xa00, doorbell_super) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xa00) STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b) @@ -283,34 +318,70 @@ system_call_pSeries: * out of line to handle them */ . = 0xe00 -hv_exception_trampoline: +hv_data_storage_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) b h_data_storage_hv + . = 0xe20 +hv_instr_storage_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) b h_instr_storage_hv + . = 0xe40 +emulation_assist_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) b emulation_assist_hv - . = 0xe50 - b hmi_exception_hv + . = 0xe60 +hv_exception_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) b hmi_exception_hv + . = 0xe80 +hv_doorbell_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b h_doorbell_hv + /* We need to deal with the Altivec unavailable exception * here which is at 0xf20, thus in the middle of the * prolog code of the PerformanceMonitor one. A little * trickery is thus necessary */ -performance_monitor_pSeries_1: . = 0xf00 +performance_monitor_pseries_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) b performance_monitor_pSeries -altivec_unavailable_pSeries_1: . = 0xf20 +altivec_unavailable_pseries_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) b altivec_unavailable_pSeries -vsx_unavailable_pSeries_1: . = 0xf40 +vsx_unavailable_pseries_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) b vsx_unavailable_pSeries + . = 0xf60 +facility_unavailable_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b facility_unavailable_pSeries + + . = 0xf80 +hv_facility_unavailable_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b facility_unavailable_hv + #ifdef CONFIG_CBE_RAS STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error) KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202) @@ -322,16 +393,10 @@ vsx_unavailable_pSeries_1: . = 0x1500 .global denorm_exception_hv denorm_exception_hv: - HMT_MEDIUM + HMT_MEDIUM_PPR_DISCARD mtspr SPRN_SPRG_HSCRATCH0,r13 - mfspr r13,SPRN_SPRG_HPACA - std r9,PACA_EXGEN+EX_R9(r13) - std r10,PACA_EXGEN+EX_R10(r13) - std r11,PACA_EXGEN+EX_R11(r13) - std r12,PACA_EXGEN+EX_R12(r13) - mfspr r9,SPRN_SPRG_HSCRATCH0 - std r9,PACA_EXGEN+EX_R13(r13) - mfcr r9 + EXCEPTION_PROLOG_0(PACA_EXGEN) + EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x1500) #ifdef CONFIG_PPC_DENORMALISATION mfspr r10,SPRN_HSRR1 @@ -341,6 +406,7 @@ denorm_exception_hv: bne+ denorm_assist #endif + KVMTEST(0x1500) EXCEPTION_PROLOG_PSERIES_1(denorm_common, EXC_HV) KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1500) @@ -364,13 +430,89 @@ denorm_exception_hv: .align 7 /* moved from 0x200 */ +machine_check_pSeries_early: +BEGIN_FTR_SECTION + EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) + /* + * Register contents: + * R13 = PACA + * R9 = CR + * Original R9 to R13 is saved on PACA_EXMC + * + * Switch to mc_emergency stack and handle re-entrancy (we limit + * the nested MCE upto level 4 to avoid stack overflow). + * Save MCE registers srr1, srr0, dar and dsisr and then set ME=1 + * + * We use paca->in_mce to check whether this is the first entry or + * nested machine check. We increment paca->in_mce to track nested + * machine checks. + * + * If this is the first entry then set stack pointer to + * paca->mc_emergency_sp, otherwise r1 is already pointing to + * stack frame on mc_emergency stack. + * + * NOTE: We are here with MSR_ME=0 (off), which means we risk a + * checkstop if we get another machine check exception before we do + * rfid with MSR_ME=1. + */ + mr r11,r1 /* Save r1 */ + lhz r10,PACA_IN_MCE(r13) + cmpwi r10,0 /* Are we in nested machine check */ + bne 0f /* Yes, we are. */ + /* First machine check entry */ + ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */ +0: subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ + addi r10,r10,1 /* increment paca->in_mce */ + sth r10,PACA_IN_MCE(r13) + /* Limit nested MCE to level 4 to avoid stack overflow */ + cmpwi r10,4 + bgt 2f /* Check if we hit limit of 4 */ + std r11,GPR1(r1) /* Save r1 on the stack. */ + std r11,0(r1) /* make stack chain pointer */ + mfspr r11,SPRN_SRR0 /* Save SRR0 */ + std r11,_NIP(r1) + mfspr r11,SPRN_SRR1 /* Save SRR1 */ + std r11,_MSR(r1) + mfspr r11,SPRN_DAR /* Save DAR */ + std r11,_DAR(r1) + mfspr r11,SPRN_DSISR /* Save DSISR */ + std r11,_DSISR(r1) + std r9,_CCR(r1) /* Save CR in stackframe */ + /* Save r9 through r13 from EXMC save area to stack frame. */ + EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) + mfmsr r11 /* get MSR value */ + ori r11,r11,MSR_ME /* turn on ME bit */ + ori r11,r11,MSR_RI /* turn on RI bit */ + ld r12,PACAKBASE(r13) /* get high part of &label */ + LOAD_HANDLER(r12, machine_check_handle_early) +1: mtspr SPRN_SRR0,r12 + mtspr SPRN_SRR1,r11 + rfid + b . /* prevent speculative execution */ +2: + /* Stack overflow. Stay on emergency stack and panic. + * Keep the ME bit off while panic-ing, so that if we hit + * another machine check we checkstop. + */ + addi r1,r1,INT_FRAME_SIZE /* go back to previous stack frame */ + ld r11,PACAKMSR(r13) + ld r12,PACAKBASE(r13) + LOAD_HANDLER(r12, unrecover_mce) + li r10,MSR_ME + andc r11,r11,r10 /* Turn off MSR_ME */ + b 1b + b . /* prevent speculative execution */ +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) + machine_check_pSeries: .globl machine_check_fwnmi machine_check_fwnmi: - HMT_MEDIUM + HMT_MEDIUM_PPR_DISCARD SET_SCRATCH0(r13) /* save r13 */ - EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common, - EXC_STD, KVMTEST, 0x200) + EXCEPTION_PROLOG_0(PACA_EXMC) +machine_check_pSeries_0: + EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST, 0x200) + EXCEPTION_PROLOG_PSERIES_1(machine_check_common, EXC_STD) KVM_HANDLER_SKIP(PACA_EXMC, EXC_STD, 0x200) /* moved from 0x300 */ @@ -382,7 +524,7 @@ data_access_check_stab: mfspr r9,SPRN_DSISR srdi r10,r10,60 rlwimi r10,r9,16,0x20 -#ifdef CONFIG_KVM_BOOK3S_PR +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE lbz r9,HSTATE_IN_GUEST(r13) rlwimi r10,r9,8,0x300 #endif @@ -398,7 +540,7 @@ do_stab_bolted_pSeries: std r12,PACA_EXSLB+EX_R12(r13) GET_SCRATCH0(r10) std r10,PACA_EXSLB+EX_R13(r13) - EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD) + EXCEPTION_PROLOG_PSERIES_1(do_stab_bolted, EXC_STD) KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300) KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380) @@ -419,38 +561,14 @@ BEGIN_FTR_SECTION xori r10,r10,(MSR_FE0|MSR_FE1) mtmsrd r10 sync - fmr 0,0 - fmr 1,1 - fmr 2,2 - fmr 3,3 - fmr 4,4 - fmr 5,5 - fmr 6,6 - fmr 7,7 - fmr 8,8 - fmr 9,9 - fmr 10,10 - fmr 11,11 - fmr 12,12 - fmr 13,13 - fmr 14,14 - fmr 15,15 - fmr 16,16 - fmr 17,17 - fmr 18,18 - fmr 19,19 - fmr 20,20 - fmr 21,21 - fmr 22,22 - fmr 23,23 - fmr 24,24 - fmr 25,25 - fmr 26,26 - fmr 27,27 - fmr 28,28 - fmr 29,29 - fmr 30,30 - fmr 31,31 + +#define FMR2(n) fmr (n), (n) ; fmr n+1, n+1 +#define FMR4(n) FMR2(n) ; FMR2(n+2) +#define FMR8(n) FMR4(n) ; FMR4(n+4) +#define FMR16(n) FMR8(n) ; FMR8(n+8) +#define FMR32(n) FMR16(n) ; FMR16(n+16) + FMR32(0) + FTR_SECTION_ELSE /* * To denormalise we need to move a copy of the register to itself. @@ -460,42 +578,33 @@ FTR_SECTION_ELSE oris r10,r10,MSR_VSX@h mtmsrd r10 sync - XVCPSGNDP(0,0,0) - XVCPSGNDP(1,1,1) - XVCPSGNDP(2,2,2) - XVCPSGNDP(3,3,3) - XVCPSGNDP(4,4,4) - XVCPSGNDP(5,5,5) - XVCPSGNDP(6,6,6) - XVCPSGNDP(7,7,7) - XVCPSGNDP(8,8,8) - XVCPSGNDP(9,9,9) - XVCPSGNDP(10,10,10) - XVCPSGNDP(11,11,11) - XVCPSGNDP(12,12,12) - XVCPSGNDP(13,13,13) - XVCPSGNDP(14,14,14) - XVCPSGNDP(15,15,15) - XVCPSGNDP(16,16,16) - XVCPSGNDP(17,17,17) - XVCPSGNDP(18,18,18) - XVCPSGNDP(19,19,19) - XVCPSGNDP(20,20,20) - XVCPSGNDP(21,21,21) - XVCPSGNDP(22,22,22) - XVCPSGNDP(23,23,23) - XVCPSGNDP(24,24,24) - XVCPSGNDP(25,25,25) - XVCPSGNDP(26,26,26) - XVCPSGNDP(27,27,27) - XVCPSGNDP(28,28,28) - XVCPSGNDP(29,29,29) - XVCPSGNDP(30,30,30) - XVCPSGNDP(31,31,31) + +#define XVCPSGNDP2(n) XVCPSGNDP(n,n,n) ; XVCPSGNDP(n+1,n+1,n+1) +#define XVCPSGNDP4(n) XVCPSGNDP2(n) ; XVCPSGNDP2(n+2) +#define XVCPSGNDP8(n) XVCPSGNDP4(n) ; XVCPSGNDP4(n+4) +#define XVCPSGNDP16(n) XVCPSGNDP8(n) ; XVCPSGNDP8(n+8) +#define XVCPSGNDP32(n) XVCPSGNDP16(n) ; XVCPSGNDP16(n+16) + XVCPSGNDP32(0) + ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_206) + +BEGIN_FTR_SECTION + b denorm_done +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) +/* + * To denormalise we need to move a copy of the register to itself. + * For POWER8 we need to do that for all 64 VSX registers + */ + XVCPSGNDP32(32) +denorm_done: mtspr SPRN_HSRR0,r11 mtcrf 0x80,r9 ld r9,PACA_EXGEN+EX_R9(r13) + RESTORE_PPR_PACA(PACA_EXGEN, r10) +BEGIN_FTR_SECTION + ld r10,PACA_EXGEN+EX_CFAR(r13) + mtspr SPRN_CFAR,r10 +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) ld r10,PACA_EXGEN+EX_R10(r13) ld r11,PACA_EXGEN+EX_R11(r13) ld r12,PACA_EXGEN+EX_R12(r13) @@ -506,28 +615,36 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_206) .align 7 /* moved from 0xe00 */ - STD_EXCEPTION_HV(., 0xe02, h_data_storage) + STD_EXCEPTION_HV_OOL(0xe02, h_data_storage) KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0xe02) - STD_EXCEPTION_HV(., 0xe22, h_instr_storage) + STD_EXCEPTION_HV_OOL(0xe22, h_instr_storage) KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe22) - STD_EXCEPTION_HV(., 0xe42, emulation_assist) + STD_EXCEPTION_HV_OOL(0xe42, emulation_assist) KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe42) - STD_EXCEPTION_HV(., 0xe62, hmi_exception) /* need to flush cache ? */ + STD_EXCEPTION_HV_OOL(0xe62, hmi_exception) /* need to flush cache ? */ KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62) + MASKABLE_EXCEPTION_HV_OOL(0xe82, h_doorbell) + KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe82) /* moved from 0xf00 */ - STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor) + STD_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf00) - STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable) + STD_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20) - STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable) + STD_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40) + STD_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) + KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf60) + STD_EXCEPTION_HV_OOL(0xf82, facility_unavailable) + KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xf82) /* - * An interrupt came in while soft-disabled. We set paca->irq_happened, - * then, if it was a decrementer interrupt, we bump the dec to max and - * and return, else we hard disable and return. This is called with - * r10 containing the value to OR to the paca field. + * An interrupt came in while soft-disabled. We set paca->irq_happened, then: + * - If it was a decrementer interrupt, we bump the dec to max and and return. + * - If it was a doorbell we return immediately since doorbells are edge + * triggered and won't automatically refire. + * - else we hard disable and return. + * This is called with r10 containing the value to OR to the paca field. */ #define MASKED_INTERRUPT(_H) \ masked_##_H##interrupt: \ @@ -535,13 +652,15 @@ masked_##_H##interrupt: \ lbz r11,PACAIRQHAPPENED(r13); \ or r11,r11,r10; \ stb r11,PACAIRQHAPPENED(r13); \ - andi. r10,r10,PACA_IRQ_DEC; \ - beq 1f; \ + cmpwi r10,PACA_IRQ_DEC; \ + bne 1f; \ lis r10,0x7fff; \ ori r10,r10,0xffff; \ mtspr SPRN_DEC,r10; \ b 2f; \ -1: mfspr r10,SPRN_##_H##SRR1; \ +1: cmpwi r10,PACA_IRQ_DBELL; \ + beq 2f; \ + mfspr r10,SPRN_##_H##SRR1; \ rldicl r10,r10,48,1; /* clear MSR_EE */ \ rotldi r10,r10,16; \ mtspr SPRN_##_H##SRR1,r10; \ @@ -558,8 +677,8 @@ masked_##_H##interrupt: \ /* * Called from arch_local_irq_enable when an interrupt needs - * to be resent. r3 contains 0x500 or 0x900 to indicate which - * kind of interrupt. MSR:EE is already off. We generate a + * to be resent. r3 contains 0x500, 0x900, 0xa00 or 0xe80 to indicate + * which kind of interrupt. MSR:EE is already off. We generate a * stackframe like if a real interrupt had happened. * * Note: While MSR:EE is off, we need to make sure that _MSR @@ -575,9 +694,18 @@ _GLOBAL(__replay_interrupt) mflr r11 mfcr r9 ori r12,r12,MSR_EE - andi. r3,r3,0x0800 - bne decrementer_common - b hardware_interrupt_common + cmpwi r3,0x900 + beq decrementer_common + cmpwi r3,0x500 + beq hardware_interrupt_common +BEGIN_FTR_SECTION + cmpwi r3,0xe80 + beq h_doorbell_common +FTR_SECTION_ELSE + cmpwi r3,0xa00 + beq doorbell_super_common +ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) + blr #ifdef CONFIG_PPC_PSERIES /* @@ -586,7 +714,7 @@ _GLOBAL(__replay_interrupt) .globl system_reset_fwnmi .align 7 system_reset_fwnmi: - HMT_MEDIUM + HMT_MEDIUM_PPR_DISCARD SET_SCRATCH0(r13) /* save r13 */ EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD, NOTEST, 0x100) @@ -621,6 +749,32 @@ slb_miss_user_pseries: b . /* prevent spec. execution */ #endif /* __DISABLED__ */ +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER +kvmppc_skip_interrupt: + /* + * Here all GPRs are unchanged from when the interrupt happened + * except for r13, which is saved in SPRG_SCRATCH0. + */ + mfspr r13, SPRN_SRR0 + addi r13, r13, 4 + mtspr SPRN_SRR0, r13 + GET_SCRATCH0(r13) + rfid + b . + +kvmppc_skip_Hinterrupt: + /* + * Here all GPRs are unchanged from when the interrupt happened + * except for r13, which is saved in SPRG_SCRATCH0. + */ + mfspr r13, SPRN_HSRR0 + addi r13, r13, 4 + mtspr SPRN_HSRR0, r13 + GET_SCRATCH0(r13) + hrfid + b . +#endif + /* * Code from here down to __end_handlers is invoked from the * exception prologs above. Because the prologs assemble the @@ -631,44 +785,38 @@ slb_miss_user_pseries: /*** Common interrupt handlers ***/ - STD_EXCEPTION_COMMON(0x100, system_reset, .system_reset_exception) - - /* - * Machine check is different because we use a different - * save area: PACA_EXMC instead of PACA_EXGEN. - */ - .align 7 - .globl machine_check_common -machine_check_common: - EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC) - FINISH_NAP - DISABLE_INTS - bl .save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl .machine_check_exception - b .ret_from_except + STD_EXCEPTION_COMMON(0x100, system_reset, system_reset_exception) STD_EXCEPTION_COMMON_ASYNC(0x500, hardware_interrupt, do_IRQ) - STD_EXCEPTION_COMMON_ASYNC(0x900, decrementer, .timer_interrupt) - STD_EXCEPTION_COMMON(0x980, hdecrementer, .hdec_interrupt) - STD_EXCEPTION_COMMON(0xa00, trap_0a, .unknown_exception) - STD_EXCEPTION_COMMON(0xb00, trap_0b, .unknown_exception) - STD_EXCEPTION_COMMON(0xd00, single_step, .single_step_exception) - STD_EXCEPTION_COMMON(0xe00, trap_0e, .unknown_exception) - STD_EXCEPTION_COMMON(0xe40, emulation_assist, .program_check_exception) - STD_EXCEPTION_COMMON(0xe60, hmi_exception, .unknown_exception) - STD_EXCEPTION_COMMON_ASYNC(0xf00, performance_monitor, .performance_monitor_exception) - STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, .instruction_breakpoint_exception) - STD_EXCEPTION_COMMON(0x1502, denorm, .unknown_exception) + STD_EXCEPTION_COMMON_ASYNC(0x900, decrementer, timer_interrupt) + STD_EXCEPTION_COMMON(0x980, hdecrementer, hdec_interrupt) +#ifdef CONFIG_PPC_DOORBELL + STD_EXCEPTION_COMMON_ASYNC(0xa00, doorbell_super, doorbell_exception) +#else + STD_EXCEPTION_COMMON_ASYNC(0xa00, doorbell_super, unknown_exception) +#endif + STD_EXCEPTION_COMMON(0xb00, trap_0b, unknown_exception) + STD_EXCEPTION_COMMON(0xd00, single_step, single_step_exception) + STD_EXCEPTION_COMMON(0xe00, trap_0e, unknown_exception) + STD_EXCEPTION_COMMON(0xe40, emulation_assist, emulation_assist_interrupt) + STD_EXCEPTION_COMMON(0xe60, hmi_exception, unknown_exception) +#ifdef CONFIG_PPC_DOORBELL + STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, doorbell_exception) +#else + STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, unknown_exception) +#endif + STD_EXCEPTION_COMMON_ASYNC(0xf00, performance_monitor, performance_monitor_exception) + STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, instruction_breakpoint_exception) + STD_EXCEPTION_COMMON(0x1502, denorm, unknown_exception) #ifdef CONFIG_ALTIVEC - STD_EXCEPTION_COMMON(0x1700, altivec_assist, .altivec_assist_exception) + STD_EXCEPTION_COMMON(0x1700, altivec_assist, altivec_assist_exception) #else - STD_EXCEPTION_COMMON(0x1700, altivec_assist, .unknown_exception) + STD_EXCEPTION_COMMON(0x1700, altivec_assist, unknown_exception) #endif #ifdef CONFIG_CBE_RAS - STD_EXCEPTION_COMMON(0x1200, cbe_system_error, .cbe_system_error_exception) - STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, .cbe_maintenance_exception) - STD_EXCEPTION_COMMON(0x1800, cbe_thermal, .cbe_thermal_exception) + STD_EXCEPTION_COMMON(0x1200, cbe_system_error, cbe_system_error_exception) + STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, cbe_maintenance_exception) + STD_EXCEPTION_COMMON(0x1800, cbe_thermal, cbe_thermal_exception) #endif /* CONFIG_CBE_RAS */ /* @@ -690,23 +838,23 @@ machine_check_common: . = 0x4380 .globl data_access_slb_relon_pSeries data_access_slb_relon_pSeries: - HMT_MEDIUM SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_DAR mfspr r12,SPRN_SRR1 #ifndef CONFIG_RELOCATABLE - b .slb_miss_realmode + b slb_miss_realmode #else /* - * We can't just use a direct branch to .slb_miss_realmode + * We can't just use a direct branch to slb_miss_realmode * because the distance from here to there depends on where * the kernel ends up being put. */ mfctr r11 ld r10,PACAKBASE(r13) - LOAD_HANDLER(r10, .slb_miss_realmode) + LOAD_HANDLER(r10, slb_miss_realmode) mtctr r10 bctr #endif @@ -715,18 +863,18 @@ data_access_slb_relon_pSeries: . = 0x4480 .globl instruction_access_slb_relon_pSeries instruction_access_slb_relon_pSeries: - HMT_MEDIUM SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480) std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ mfspr r12,SPRN_SRR1 #ifndef CONFIG_RELOCATABLE - b .slb_miss_realmode + b slb_miss_realmode #else mfctr r11 ld r10,PACAKBASE(r13) - LOAD_HANDLER(r10, .slb_miss_realmode) + LOAD_HANDLER(r10, slb_miss_realmode) mtctr r10 bctr #endif @@ -740,12 +888,13 @@ hardware_interrupt_relon_hv: _MASKABLE_RELON_EXCEPTION_PSERIES(0x502, hardware_interrupt, EXC_HV, SOFTEN_TEST_HV) FTR_SECTION_ELSE _MASKABLE_RELON_EXCEPTION_PSERIES(0x500, hardware_interrupt, EXC_STD, SOFTEN_TEST_PR) - ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_206) + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) STD_RELON_EXCEPTION_PSERIES(0x4600, 0x600, alignment) STD_RELON_EXCEPTION_PSERIES(0x4700, 0x700, program_check) STD_RELON_EXCEPTION_PSERIES(0x4800, 0x800, fp_unavailable) MASKABLE_RELON_EXCEPTION_PSERIES(0x4900, 0x900, decrementer) STD_RELON_EXCEPTION_HV(0x4980, 0x982, hdecrementer) + MASKABLE_RELON_EXCEPTION_PSERIES(0x4a00, 0xa00, doorbell_super) STD_RELON_EXCEPTION_PSERIES(0x4b00, 0xb00, trap_0b) . = 0x4c00 @@ -759,56 +908,62 @@ system_call_relon_pSeries: STD_RELON_EXCEPTION_PSERIES(0x4d00, 0xd00, single_step) . = 0x4e00 - b h_data_storage_relon_hv + b . /* Can't happen, see v2.07 Book III-S section 6.5 */ . = 0x4e20 - b h_instr_storage_relon_hv + b . /* Can't happen, see v2.07 Book III-S section 6.5 */ . = 0x4e40 +emulation_assist_relon_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) b emulation_assist_relon_hv - . = 0x4e50 - b hmi_exception_relon_hv - . = 0x4e60 - b hmi_exception_relon_hv + b . /* Can't happen, see v2.07 Book III-S section 6.5 */ - /* For when we support the doorbell interrupt: - STD_RELON_EXCEPTION_HYPERVISOR(0x4e80, 0xe80, doorbell_hyper) - */ + . = 0x4e80 +h_doorbell_relon_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b h_doorbell_relon_hv -performance_monitor_relon_pSeries_1: . = 0x4f00 +performance_monitor_relon_pseries_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) b performance_monitor_relon_pSeries -altivec_unavailable_relon_pSeries_1: . = 0x4f20 +altivec_unavailable_relon_pseries_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) b altivec_unavailable_relon_pSeries -vsx_unavailable_relon_pSeries_1: . = 0x4f40 +vsx_unavailable_relon_pseries_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) b vsx_unavailable_relon_pSeries -#ifdef CONFIG_CBE_RAS - STD_RELON_EXCEPTION_HV(0x5200, 0x1202, cbe_system_error) -#endif /* CONFIG_CBE_RAS */ + . = 0x4f60 +facility_unavailable_relon_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b facility_unavailable_relon_pSeries + + . = 0x4f80 +hv_facility_unavailable_relon_trampoline: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b hv_facility_unavailable_relon_hv + STD_RELON_EXCEPTION_PSERIES(0x5300, 0x1300, instruction_breakpoint) #ifdef CONFIG_PPC_DENORMALISATION . = 0x5500 b denorm_exception_hv #endif -#ifdef CONFIG_CBE_RAS - STD_RELON_EXCEPTION_HV(0x5600, 0x1602, cbe_maintenance) -#else -#ifdef CONFIG_HVC_SCOM - STD_RELON_EXCEPTION_HV(0x5600, 0x1600, maintence_interrupt) - KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1600) -#endif /* CONFIG_HVC_SCOM */ -#endif /* CONFIG_CBE_RAS */ STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) -#ifdef CONFIG_CBE_RAS - STD_RELON_EXCEPTION_HV(0x5800, 0x1802, cbe_thermal) -#endif /* CONFIG_CBE_RAS */ /* Other future vectors */ .align 7 @@ -827,7 +982,7 @@ system_call_entry: b system_call_common ppc64_runlatch_on_trampoline: - b .__ppc64_runlatch_on + b __ppc64_runlatch_on /* * Here we have detected that the kernel stack pointer is bad. @@ -886,7 +1041,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) std r12,RESULT(r1) std r11,STACK_FRAME_OVERHEAD-16(r1) 1: addi r3,r1,STACK_FRAME_OVERHEAD - bl .kernel_bad_stack + bl kernel_bad_stack b 1b /* @@ -907,7 +1062,7 @@ data_access_common: ld r3,PACA_EXGEN+EX_DAR(r13) lwz r4,PACA_EXGEN+EX_DSISR(r13) li r5,0x300 - b .do_hash_page /* Try to handle as hpte fault */ + b do_hash_page /* Try to handle as hpte fault */ .align 7 .globl h_data_storage_common @@ -917,11 +1072,11 @@ h_data_storage_common: mfspr r10,SPRN_HDSISR stw r10,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_COMMON(0xe00, PACA_EXGEN) - bl .save_nvgprs + bl save_nvgprs DISABLE_INTS addi r3,r1,STACK_FRAME_OVERHEAD - bl .unknown_exception - b .ret_from_except + bl unknown_exception + b ret_from_except .align 7 .globl instruction_access_common @@ -932,9 +1087,9 @@ instruction_access_common: ld r3,_NIP(r1) andis. r4,r12,0x5820 li r5,0x400 - b .do_hash_page /* Try to handle as hpte fault */ + b do_hash_page /* Try to handle as hpte fault */ - STD_EXCEPTION_COMMON(0xe20, h_instr_storage, .unknown_exception) + STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception) /* * Here is the common SLB miss user that is used when going to virtual @@ -949,7 +1104,7 @@ slb_miss_user_common: stw r9,PACA_EXGEN+EX_CCR(r13) std r10,PACA_EXGEN+EX_LR(r13) std r11,PACA_EXGEN+EX_SRR0(r13) - bl .slb_allocate_user + bl slb_allocate_user ld r10,PACA_EXGEN+EX_LR(r13) ld r3,PACA_EXGEN+EX_R3(r13) @@ -992,14 +1147,369 @@ slb_miss_fault: unrecov_user_slb: EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN) DISABLE_INTS - bl .save_nvgprs + bl save_nvgprs 1: addi r3,r1,STACK_FRAME_OVERHEAD - bl .unrecoverable_exception + bl unrecoverable_exception b 1b #endif /* __DISABLED__ */ + /* + * Machine check is different because we use a different + * save area: PACA_EXMC instead of PACA_EXGEN. + */ + .align 7 + .globl machine_check_common +machine_check_common: + + mfspr r10,SPRN_DAR + std r10,PACA_EXGEN+EX_DAR(r13) + mfspr r10,SPRN_DSISR + stw r10,PACA_EXGEN+EX_DSISR(r13) + EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC) + FINISH_NAP + DISABLE_INTS + ld r3,PACA_EXGEN+EX_DAR(r13) + lwz r4,PACA_EXGEN+EX_DSISR(r13) + std r3,_DAR(r1) + std r4,_DSISR(r1) + bl save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + bl machine_check_exception + b ret_from_except + + .align 7 + .globl alignment_common +alignment_common: + mfspr r10,SPRN_DAR + std r10,PACA_EXGEN+EX_DAR(r13) + mfspr r10,SPRN_DSISR + stw r10,PACA_EXGEN+EX_DSISR(r13) + EXCEPTION_PROLOG_COMMON(0x600, PACA_EXGEN) + ld r3,PACA_EXGEN+EX_DAR(r13) + lwz r4,PACA_EXGEN+EX_DSISR(r13) + std r3,_DAR(r1) + std r4,_DSISR(r1) + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl alignment_exception + b ret_from_except + + .align 7 + .globl program_check_common +program_check_common: + EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl program_check_exception + b ret_from_except + + .align 7 + .globl fp_unavailable_common +fp_unavailable_common: + EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN) + bne 1f /* if from user, just load it up */ + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl kernel_fp_unavailable_exception + BUG_OPCODE +1: +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + /* Test if 2 TM state bits are zero. If non-zero (ie. userspace was in + * transaction), go do TM stuff + */ + rldicl. r0, r12, (64-MSR_TS_LG), (64-2) + bne- 2f +END_FTR_SECTION_IFSET(CPU_FTR_TM) +#endif + bl load_up_fpu + b fast_exception_return +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +2: /* User process was in a transaction */ + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl fp_unavailable_tm + b ret_from_except +#endif + .align 7 + .globl altivec_unavailable_common +altivec_unavailable_common: + EXCEPTION_PROLOG_COMMON(0xf20, PACA_EXGEN) +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + beq 1f +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + BEGIN_FTR_SECTION_NESTED(69) + /* Test if 2 TM state bits are zero. If non-zero (ie. userspace was in + * transaction), go do TM stuff + */ + rldicl. r0, r12, (64-MSR_TS_LG), (64-2) + bne- 2f + END_FTR_SECTION_NESTED(CPU_FTR_TM, CPU_FTR_TM, 69) +#endif + bl load_up_altivec + b fast_exception_return +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +2: /* User process was in a transaction */ + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl altivec_unavailable_tm + b ret_from_except +#endif +1: +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl altivec_unavailable_exception + b ret_from_except + + .align 7 + .globl vsx_unavailable_common +vsx_unavailable_common: + EXCEPTION_PROLOG_COMMON(0xf40, PACA_EXGEN) +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + beq 1f +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + BEGIN_FTR_SECTION_NESTED(69) + /* Test if 2 TM state bits are zero. If non-zero (ie. userspace was in + * transaction), go do TM stuff + */ + rldicl. r0, r12, (64-MSR_TS_LG), (64-2) + bne- 2f + END_FTR_SECTION_NESTED(CPU_FTR_TM, CPU_FTR_TM, 69) +#endif + b load_up_vsx +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +2: /* User process was in a transaction */ + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl vsx_unavailable_tm + b ret_from_except +#endif +1: +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif + bl save_nvgprs + DISABLE_INTS + addi r3,r1,STACK_FRAME_OVERHEAD + bl vsx_unavailable_exception + b ret_from_except + + STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) + STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) + + .align 7 + .globl __end_handlers +__end_handlers: + + /* Equivalents to the above handlers for relocation-on interrupt vectors */ + STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) + MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) + + STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) + STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) + +#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) +/* + * Data area reserved for FWNMI option. + * This address (0x7000) is fixed by the RPA. + */ + .= 0x7000 + .globl fwnmi_data_area +fwnmi_data_area: + + /* pseries and powernv need to keep the whole page from + * 0x7000 to 0x8000 free for use by the firmware + */ + . = 0x8000 +#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ + +/* Space for CPU0's segment table */ + .balign 4096 + .globl initial_stab +initial_stab: + .space 4096 + +#ifdef CONFIG_PPC_POWERNV +_GLOBAL(opal_mc_secondary_handler) + HMT_MEDIUM_PPR_DISCARD + SET_SCRATCH0(r13) + GET_PACA(r13) + clrldi r3,r3,2 + tovirt(r3,r3) + std r3,PACA_OPAL_MC_EVT(r13) + ld r13,OPAL_MC_SRR0(r3) + mtspr SPRN_SRR0,r13 + ld r13,OPAL_MC_SRR1(r3) + mtspr SPRN_SRR1,r13 + ld r3,OPAL_MC_GPR3(r3) + GET_SCRATCH0(r13) + b machine_check_pSeries +#endif /* CONFIG_PPC_POWERNV */ + + +#define MACHINE_CHECK_HANDLER_WINDUP \ + /* Clear MSR_RI before setting SRR0 and SRR1. */\ + li r0,MSR_RI; \ + mfmsr r9; /* get MSR value */ \ + andc r9,r9,r0; \ + mtmsrd r9,1; /* Clear MSR_RI */ \ + /* Move original SRR0 and SRR1 into the respective regs */ \ + ld r9,_MSR(r1); \ + mtspr SPRN_SRR1,r9; \ + ld r3,_NIP(r1); \ + mtspr SPRN_SRR0,r3; \ + ld r9,_CTR(r1); \ + mtctr r9; \ + ld r9,_XER(r1); \ + mtxer r9; \ + ld r9,_LINK(r1); \ + mtlr r9; \ + REST_GPR(0, r1); \ + REST_8GPRS(2, r1); \ + REST_GPR(10, r1); \ + ld r11,_CCR(r1); \ + mtcr r11; \ + /* Decrement paca->in_mce. */ \ + lhz r12,PACA_IN_MCE(r13); \ + subi r12,r12,1; \ + sth r12,PACA_IN_MCE(r13); \ + REST_GPR(11, r1); \ + REST_2GPRS(12, r1); \ + /* restore original r1. */ \ + ld r1,GPR1(r1) + + /* + * Handle machine check early in real mode. We come here with + * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack. + */ + .align 7 + .globl machine_check_handle_early +machine_check_handle_early: + std r0,GPR0(r1) /* Save r0 */ + EXCEPTION_PROLOG_COMMON_3(0x200) + bl save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + bl machine_check_early + std r3,RESULT(r1) /* Save result */ + ld r12,_MSR(r1) +#ifdef CONFIG_PPC_P7_NAP + /* + * Check if thread was in power saving mode. We come here when any + * of the following is true: + * a. thread wasn't in power saving mode + * b. thread was in power saving mode with no state loss or + * supervisor state loss + * + * Go back to nap again if (b) is true. + */ + rlwinm. r11,r12,47-31,30,31 /* Was it in power saving mode? */ + beq 4f /* No, it wasn;t */ + /* Thread was in power saving mode. Go back to nap again. */ + cmpwi r11,2 + bne 3f + /* Supervisor state loss */ + li r0,1 + stb r0,PACA_NAPSTATELOST(r13) +3: bl machine_check_queue_event + MACHINE_CHECK_HANDLER_WINDUP + GET_PACA(r13) + ld r1,PACAR1(r13) + b power7_enter_nap_mode +4: +#endif + /* + * Check if we are coming from hypervisor userspace. If yes then we + * continue in host kernel in V mode to deliver the MC event. + */ + rldicl. r11,r12,4,63 /* See if MC hit while in HV mode. */ + beq 5f + andi. r11,r12,MSR_PR /* See if coming from user. */ + bne 9f /* continue in V mode if we are. */ + +5: +#ifdef CONFIG_KVM_BOOK3S_64_HV + /* + * We are coming from kernel context. Check if we are coming from + * guest. if yes, then we can continue. We will fall through + * do_kvm_200->kvmppc_interrupt to deliver the MC event to guest. + */ + lbz r11,HSTATE_IN_GUEST(r13) + cmpwi r11,0 /* Check if coming from guest */ + bne 9f /* continue if we are. */ +#endif + /* + * At this point we are not sure about what context we come from. + * Queue up the MCE event and return from the interrupt. + * But before that, check if this is an un-recoverable exception. + * If yes, then stay on emergency stack and panic. + */ + andi. r11,r12,MSR_RI + bne 2f +1: mfspr r11,SPRN_SRR0 + ld r10,PACAKBASE(r13) + LOAD_HANDLER(r10,unrecover_mce) + mtspr SPRN_SRR0,r10 + ld r10,PACAKMSR(r13) + /* + * We are going down. But there are chances that we might get hit by + * another MCE during panic path and we may run into unstable state + * with no way out. Hence, turn ME bit off while going down, so that + * when another MCE is hit during panic path, system will checkstop + * and hypervisor will get restarted cleanly by SP. + */ + li r3,MSR_ME + andc r10,r10,r3 /* Turn off MSR_ME */ + mtspr SPRN_SRR1,r10 + rfid + b . +2: + /* + * Check if we have successfully handled/recovered from error, if not + * then stay on emergency stack and panic. + */ + ld r3,RESULT(r1) /* Load result */ + cmpdi r3,0 /* see if we handled MCE successfully */ + + beq 1b /* if !handled then panic */ + /* + * Return from MC interrupt. + * Queue up the MCE event so that we can log it later, while + * returning from kernel or opal call. + */ + bl machine_check_queue_event + MACHINE_CHECK_HANDLER_WINDUP + rfid +9: + /* Deliver the machine check to host kernel in V mode. */ + MACHINE_CHECK_HANDLER_WINDUP + b machine_check_pSeries + +unrecover_mce: + /* Invoke machine_check_exception to print MCE event and panic. */ + addi r3,r1,STACK_FRAME_OVERHEAD + bl machine_check_exception + /* + * We will not reach here. Even if we did, there is no way out. Call + * unrecoverable_exception and die. + */ +1: addi r3,r1,STACK_FRAME_OVERHEAD + bl unrecoverable_exception + b 1b /* * r13 points to the PACA, r9 contains the saved CR, * r12 contain the saved SRR1, SRR0 is still ready for return @@ -1008,7 +1518,7 @@ unrecov_user_slb: * r3 is saved in paca->slb_r3 * We assume we aren't going to take any exceptions during this procedure. */ -_GLOBAL(slb_miss_realmode) +slb_miss_realmode: mflr r10 #ifdef CONFIG_RELOCATABLE mtctr r11 @@ -1017,7 +1527,7 @@ _GLOBAL(slb_miss_realmode) stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ - bl .slb_allocate_realmode + bl slb_allocate_realmode /* All done -- return from exception. */ @@ -1036,6 +1546,7 @@ _GLOBAL(slb_miss_realmode) mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ .machine pop + RESTORE_PPR_PACA(PACA_EXSLB, r9) ld r9,PACA_EXSLB+EX_R9(r13) ld r10,PACA_EXSLB+EX_R10(r13) ld r11,PACA_EXSLB+EX_R11(r13) @@ -1056,9 +1567,9 @@ _GLOBAL(slb_miss_realmode) unrecov_slb: EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB) DISABLE_INTS - bl .save_nvgprs + bl save_nvgprs 1: addi r3,r1,STACK_FRAME_OVERHEAD - bl .unrecoverable_exception + bl unrecoverable_exception b 1b @@ -1071,91 +1582,11 @@ power4_fixup_nap: blr #endif - .align 7 - .globl alignment_common -alignment_common: - mfspr r10,SPRN_DAR - std r10,PACA_EXGEN+EX_DAR(r13) - mfspr r10,SPRN_DSISR - stw r10,PACA_EXGEN+EX_DSISR(r13) - EXCEPTION_PROLOG_COMMON(0x600, PACA_EXGEN) - ld r3,PACA_EXGEN+EX_DAR(r13) - lwz r4,PACA_EXGEN+EX_DSISR(r13) - std r3,_DAR(r1) - std r4,_DSISR(r1) - bl .save_nvgprs - DISABLE_INTS - addi r3,r1,STACK_FRAME_OVERHEAD - bl .alignment_exception - b .ret_from_except - - .align 7 - .globl program_check_common -program_check_common: - EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) - bl .save_nvgprs - DISABLE_INTS - addi r3,r1,STACK_FRAME_OVERHEAD - bl .program_check_exception - b .ret_from_except - - .align 7 - .globl fp_unavailable_common -fp_unavailable_common: - EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN) - bne 1f /* if from user, just load it up */ - bl .save_nvgprs - DISABLE_INTS - addi r3,r1,STACK_FRAME_OVERHEAD - bl .kernel_fp_unavailable_exception - BUG_OPCODE -1: bl .load_up_fpu - b fast_exception_return - - .align 7 - .globl altivec_unavailable_common -altivec_unavailable_common: - EXCEPTION_PROLOG_COMMON(0xf20, PACA_EXGEN) -#ifdef CONFIG_ALTIVEC -BEGIN_FTR_SECTION - beq 1f - bl .load_up_altivec - b fast_exception_return -1: -END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) -#endif - bl .save_nvgprs - DISABLE_INTS - addi r3,r1,STACK_FRAME_OVERHEAD - bl .altivec_unavailable_exception - b .ret_from_except - - .align 7 - .globl vsx_unavailable_common -vsx_unavailable_common: - EXCEPTION_PROLOG_COMMON(0xf40, PACA_EXGEN) -#ifdef CONFIG_VSX -BEGIN_FTR_SECTION - beq 1f - b .load_up_vsx -1: -END_FTR_SECTION_IFSET(CPU_FTR_VSX) -#endif - bl .save_nvgprs - DISABLE_INTS - addi r3,r1,STACK_FRAME_OVERHEAD - bl .vsx_unavailable_exception - b .ret_from_except - - .align 7 - .globl __end_handlers -__end_handlers: - /* * Hash table stuff */ .align 7 -_STATIC(do_hash_page) +do_hash_page: std r3,_DAR(r1) std r4,_DSISR(r1) @@ -1192,7 +1623,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB) * * at return r3 = 0 for success, 1 for page fault, negative for error */ - bl .hash_page /* build HPTE if possible */ + bl hash_page /* build HPTE if possible */ cmpdi r3,0 /* see if hash_page succeeded */ /* Success */ @@ -1206,35 +1637,35 @@ handle_page_fault: 11: ld r4,_DAR(r1) ld r5,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_page_fault + bl do_page_fault cmpdi r3,0 beq+ 12f - bl .save_nvgprs + bl save_nvgprs mr r5,r3 addi r3,r1,STACK_FRAME_OVERHEAD lwz r4,_DAR(r1) - bl .bad_page_fault - b .ret_from_except + bl bad_page_fault + b ret_from_except /* We have a data breakpoint exception - handle it */ handle_dabr_fault: - bl .save_nvgprs + bl save_nvgprs ld r4,_DAR(r1) ld r5,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD - bl .do_dabr -12: b .ret_from_except_lite + bl do_break +12: b ret_from_except_lite /* We have a page fault that hash_page could handle but HV refused * the PTE insertion */ -13: bl .save_nvgprs +13: bl save_nvgprs mr r5,r3 addi r3,r1,STACK_FRAME_OVERHEAD ld r4,_DAR(r1) - bl .low_hash_fault - b .ret_from_except + bl low_hash_fault + b ret_from_except /* * We come here as a result of a DSI at a point where we don't want @@ -1243,16 +1674,16 @@ handle_dabr_fault: * were soft-disabled. We want to invoke the exception handler for * the access, or panic if there isn't a handler. */ -77: bl .save_nvgprs +77: bl save_nvgprs mr r4,r3 addi r3,r1,STACK_FRAME_OVERHEAD li r5,SIGSEGV - bl .bad_page_fault - b .ret_from_except + bl bad_page_fault + b ret_from_except /* here we have a segment miss */ do_ste_alloc: - bl .ste_allocate /* try to insert stab entry */ + bl ste_allocate /* try to insert stab entry */ cmpdi r3,0 bne- handle_page_fault b fast_exception_return @@ -1265,23 +1696,39 @@ do_ste_alloc: * We assume (DAR >> 60) == 0xc. */ .align 7 -_GLOBAL(do_stab_bolted) +do_stab_bolted: stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r11,PACA_EXSLB+EX_SRR0(r13) /* save SRR0 in exc. frame */ + mfspr r11,SPRN_DAR /* ea */ + + /* + * check for bad kernel/user address + * (ea & ~REGION_MASK) >= PGTABLE_RANGE + */ + rldicr. r9,r11,4,(63 - 46 - 4) + li r9,0 /* VSID = 0 for bad address */ + bne- 0f + + /* + * Calculate VSID: + * This is the kernel vsid, we take the top for context from + * the range. context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 + * Here we know that (ea >> 60) == 0xc + */ + lis r9,(MAX_USER_CONTEXT + 1)@ha + addi r9,r9,(MAX_USER_CONTEXT + 1)@l + srdi r10,r11,SID_SHIFT + rldimi r10,r9,ESID_BITS,0 /* proto vsid */ + ASM_VSID_SCRAMBLE(r10, r9, 256M) + rldic r9,r10,12,16 /* r9 = vsid << 12 */ + +0: /* Hash to the primary group */ ld r10,PACASTABVIRT(r13) - mfspr r11,SPRN_DAR - srdi r11,r11,28 + srdi r11,r11,SID_SHIFT rldimi r10,r11,7,52 /* r10 = first ste of the group */ - /* Calculate VSID */ - /* This is a kernel address, so protovsid = ESID | 1 << 37 */ - li r9,0x1 - rldimi r11,r9,(CONTEXT_BITS + USER_ESID_BITS),0 - ASM_VSID_SCRAMBLE(r11, r9, 256M) - rldic r9,r11,12,16 /* r9 = vsid << 12 */ - /* Search the primary group for a free entry */ 1: ld r11,0(r10) /* Test valid bit of the current ste */ andi. r11,r11,0x80 @@ -1344,56 +1791,3 @@ _GLOBAL(do_stab_bolted) ld r13,PACA_EXSLB+EX_R13(r13) rfid b . /* prevent speculative execution */ - - - /* Equivalents to the above handlers for relocation-on interrupt vectors */ - STD_RELON_EXCEPTION_HV(., 0xe00, h_data_storage) - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe00) - STD_RELON_EXCEPTION_HV(., 0xe20, h_instr_storage) - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe20) - STD_RELON_EXCEPTION_HV(., 0xe40, emulation_assist) - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe40) - STD_RELON_EXCEPTION_HV(., 0xe60, hmi_exception) - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe60) - - STD_RELON_EXCEPTION_PSERIES(., 0xf00, performance_monitor) - STD_RELON_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable) - STD_RELON_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable) - -#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) -/* - * Data area reserved for FWNMI option. - * This address (0x7000) is fixed by the RPA. - */ - .= 0x7000 - .globl fwnmi_data_area -fwnmi_data_area: - - /* pseries and powernv need to keep the whole page from - * 0x7000 to 0x8000 free for use by the firmware - */ - . = 0x8000 -#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ - -/* Space for CPU0's segment table */ - .balign 4096 - .globl initial_stab -initial_stab: - .space 4096 - -#ifdef CONFIG_PPC_POWERNV -_GLOBAL(opal_mc_secondary_handler) - HMT_MEDIUM - SET_SCRATCH0(r13) - GET_PACA(r13) - clrldi r3,r3,2 - tovirt(r3,r3) - std r3,PACA_OPAL_MC_EVT(r13) - ld r13,OPAL_MC_SRR0(r3) - mtspr SPRN_SRR0,r13 - ld r13,OPAL_MC_SRR1(r3) - mtspr SPRN_SRR1,r13 - ld r3,OPAL_MC_GPR3(r3) - GET_SCRATCH0(r13) - b machine_check_pSeries -#endif /* CONFIG_PPC_POWERNV */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 06c8202a69c..742694c1d85 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -55,9 +55,9 @@ int crash_mem_ranges; int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data) { - __be32 *sections; + const __be32 *sections; int i, num_sections; - unsigned long size; + int size; const int *token; if (depth != 1 || strcmp(uname, "rtas") != 0) @@ -69,7 +69,7 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, */ token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL); if (!token) - return 0; + return 1; fw_dump.fadump_supported = 1; fw_dump.ibm_configure_kernel_dump = *token; @@ -92,7 +92,7 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, &size); if (!sections) - return 0; + return 1; num_sections = size / (3 * sizeof(u32)); @@ -110,6 +110,7 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, break; } } + return 1; } @@ -645,7 +646,7 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) } /* Lower 4 bytes of reg_value contains logical cpu id */ cpu = reg_entry->reg_value & FADUMP_CPU_ID_MASK; - if (!cpumask_test_cpu(cpu, &fdh->cpu_online_mask)) { + if (fdh && !cpumask_test_cpu(cpu, &fdh->cpu_online_mask)) { SKIP_TO_NEXT_CPU(reg_entry); continue; } @@ -662,9 +663,11 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) } fadump_final_note(note_buf); - pr_debug("Updating elfcore header (%llx) with cpu notes\n", + if (fdh) { + pr_debug("Updating elfcore header (%llx) with cpu notes\n", fdh->elfcorehdr_addr); - fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr)); + fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr)); + } return 0; error_out: @@ -1045,10 +1048,7 @@ static void fadump_release_memory(unsigned long begin, unsigned long end) if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start)) continue; - ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); - init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); - free_page((unsigned long)__va(addr)); - totalram_pages++; + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); } } diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index e0ada05f2df..9ad236e5d2c 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -50,12 +50,80 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX); \ #define REST_32FPVSRS(n,c,base) __REST_32FPVSRS(n,__REG_##c,__REG_##base) #define SAVE_32FPVSRS(n,c,base) __SAVE_32FPVSRS(n,__REG_##c,__REG_##base) +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* void do_load_up_transact_fpu(struct thread_struct *thread) + * + * This is similar to load_up_fpu but for the transactional version of the FP + * register set. It doesn't mess with the task MSR or valid flags. + * Furthermore, we don't do lazy FP with TM currently. + */ +_GLOBAL(do_load_up_transact_fpu) + mfmsr r6 + ori r5,r6,MSR_FP +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + oris r5,r5,MSR_VSX@h +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif + SYNC + MTMSRD(r5) + + addi r7,r3,THREAD_TRANSACT_FPSTATE + lfd fr0,FPSTATE_FPSCR(r7) + MTFSF_L(fr0) + REST_32FPVSRS(0, R4, R7) + + /* FP/VSX off again */ + MTMSRD(r6) + SYNC + + blr +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + +/* + * Enable use of the FPU, and VSX if possible, for the caller. + */ +_GLOBAL(fp_enable) + mfmsr r3 + ori r3,r3,MSR_FP +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + oris r3,r3,MSR_VSX@h +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif + SYNC + MTMSRD(r3) + isync /* (not necessary for arch 2.02 and later) */ + blr + +/* + * Load state from memory into FP registers including FPSCR. + * Assumes the caller has enabled FP in the MSR. + */ +_GLOBAL(load_fp_state) + lfd fr0,FPSTATE_FPSCR(r3) + MTFSF_L(fr0) + REST_32FPVSRS(0, R4, R3) + blr + +/* + * Store FP state into memory, including FPSCR + * Assumes the caller has enabled FP in the MSR. + */ +_GLOBAL(store_fp_state) + SAVE_32FPVSRS(0, R4, R3) + mffs fr0 + stfd fr0,FPSTATE_FPSCR(r3) + blr + /* * This task wants to use the FPU now. * On UP, disable FP for the task which had the FPU previously, * and save its floating-point registers in its thread_struct. * Load up this task's FP registers from its thread_struct, * enable the FPU for the current task and return to the task. + * Note that on 32-bit this can only use registers that will be + * restored by fast_exception_return, i.e. r3 - r6, r10 and r11. */ _GLOBAL(load_up_fpu) mfmsr r5 @@ -81,9 +149,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) beq 1f toreal(r4) addi r4,r4,THREAD /* want last_task_used_math->thread */ - SAVE_32FPVSRS(0, R5, R4) + addi r10,r4,THREAD_FPSTATE + SAVE_32FPVSRS(0, R5, R10) mffs fr0 - stfd fr0,THREAD_FPSCR(r4) + stfd fr0,FPSTATE_FPSCR(r10) PPC_LL r5,PT_REGS(r4) toreal(r5) PPC_LL r4,_MSR-STACK_FRAME_OVERHEAD(r5) @@ -94,7 +163,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif /* CONFIG_SMP */ /* enable use of FP after return */ #ifdef CONFIG_PPC32 - mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ + mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ lwz r4,THREAD_FPEXC_MODE(r5) ori r9,r9,MSR_FP /* enable FP for current */ or r9,r9,r4 @@ -106,9 +175,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) or r12,r12,r4 std r12,_MSR(r1) #endif - lfd fr0,THREAD_FPSCR(r5) + addi r10,r5,THREAD_FPSTATE + lfd fr0,FPSTATE_FPSCR(r10) MTFSF_L(fr0) - REST_32FPVSRS(0, R4, R5) + REST_32FPVSRS(0, R4, R10) #ifndef CONFIG_SMP subi r4,r5,THREAD fromreal(r4) @@ -140,11 +210,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) PPC_LCMPI 0,r3,0 beqlr- /* if no previous owner, done */ addi r3,r3,THREAD /* want THREAD of task */ + PPC_LL r6,THREAD_FPSAVEAREA(r3) PPC_LL r5,PT_REGS(r3) - PPC_LCMPI 0,r5,0 - SAVE_32FPVSRS(0, R4 ,R3) + PPC_LCMPI 0,r6,0 + bne 2f + addi r6,r3,THREAD_FPSTATE +2: PPC_LCMPI 0,r5,0 + SAVE_32FPVSRS(0, R4, R6) mffs fr0 - stfd fr0,THREAD_FPSCR(r3) + stfd fr0,FPSTATE_FPSCR(r6) beq 1f PPC_LL r4,_MSR-STACK_FRAME_OVERHEAD(r5) li r3,MSR_FP|MSR_FE0|MSR_FE1 diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/fsl_booke_entry_mapping.S index a92c79be272..f22e7e44fbf 100644 --- a/arch/powerpc/kernel/fsl_booke_entry_mapping.S +++ b/arch/powerpc/kernel/fsl_booke_entry_mapping.S @@ -176,6 +176,8 @@ skpinv: addi r6,r6,1 /* Increment */ /* 7. Jump to KERNELBASE mapping */ lis r6,(KERNELBASE & ~0xfff)@h ori r6,r6,(KERNELBASE & ~0xfff)@l + rlwinm r7,r25,0,0x03ffffff + add r6,r7,r6 #elif defined(ENTRY_MAPPING_KEXEC_SETUP) /* diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 1fb78561096..d178834fe50 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -10,6 +10,8 @@ * */ +#define pr_fmt(fmt) "ftrace-powerpc: " fmt + #include <linux/spinlock.h> #include <linux/hardirq.h> #include <linux/uaccess.h> @@ -74,6 +76,7 @@ ftrace_modify_code(unsigned long ip, unsigned int old, unsigned int new) */ static int test_24bit_addr(unsigned long ip, unsigned long addr) { + addr = ppc_function_entry((void *)addr); /* use the create_branch to verify that this offset can be branched */ return create_branch((unsigned int *)ip, addr, 0); @@ -104,11 +107,9 @@ __ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { unsigned int op; - unsigned int jmp[5]; - unsigned long ptr; + unsigned long entry, ptr; unsigned long ip = rec->ip; - unsigned long tramp; - int offset; + void *tramp; /* read where this goes */ if (probe_kernel_read(&op, (void *)ip, sizeof(int))) @@ -116,97 +117,46 @@ __ftrace_make_nop(struct module *mod, /* Make sure that that this is still a 24bit jump */ if (!is_bl_op(op)) { - printk(KERN_ERR "Not expected bl: opcode is %x\n", op); + pr_err("Not expected bl: opcode is %x\n", op); return -EINVAL; } /* lets find where the pointer goes */ - tramp = find_bl_target(ip, op); - - /* - * On PPC64 the trampoline looks like: - * 0x3d, 0x82, 0x00, 0x00, addis r12,r2, <high> - * 0x39, 0x8c, 0x00, 0x00, addi r12,r12, <low> - * Where the bytes 2,3,6 and 7 make up the 32bit offset - * to the TOC that holds the pointer. - * to jump to. - * 0xf8, 0x41, 0x00, 0x28, std r2,40(r1) - * 0xe9, 0x6c, 0x00, 0x20, ld r11,32(r12) - * The actually address is 32 bytes from the offset - * into the TOC. - * 0xe8, 0x4c, 0x00, 0x28, ld r2,40(r12) - */ - - pr_devel("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc); + tramp = (void *)find_bl_target(ip, op); - /* Find where the trampoline jumps to */ - if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) { - printk(KERN_ERR "Failed to read %lx\n", tramp); - return -EFAULT; - } + pr_devel("ip:%lx jumps to %p", ip, tramp); - pr_devel(" %08x %08x", jmp[0], jmp[1]); - - /* verify that this is what we expect it to be */ - if (((jmp[0] & 0xffff0000) != 0x3d820000) || - ((jmp[1] & 0xffff0000) != 0x398c0000) || - (jmp[2] != 0xf8410028) || - (jmp[3] != 0xe96c0020) || - (jmp[4] != 0xe84c0028)) { - printk(KERN_ERR "Not a trampoline\n"); + if (!is_module_trampoline(tramp)) { + pr_err("Not a trampoline\n"); return -EINVAL; } - /* The bottom half is signed extended */ - offset = ((unsigned)((unsigned short)jmp[0]) << 16) + - (int)((short)jmp[1]); - - pr_devel(" %x ", offset); - - /* get the address this jumps too */ - tramp = mod->arch.toc + offset + 32; - pr_devel("toc: %lx", tramp); - - if (probe_kernel_read(jmp, (void *)tramp, 8)) { - printk(KERN_ERR "Failed to read %lx\n", tramp); + if (module_trampoline_target(mod, tramp, &ptr)) { + pr_err("Failed to get trampoline target\n"); return -EFAULT; } - pr_devel(" %08x %08x\n", jmp[0], jmp[1]); - - ptr = ((unsigned long)jmp[0] << 32) + jmp[1]; + pr_devel("trampoline target %lx", ptr); + entry = ppc_global_function_entry((void *)addr); /* This should match what was called */ - if (ptr != ppc_function_entry((void *)addr)) { - printk(KERN_ERR "addr does not match %lx\n", ptr); + if (ptr != entry) { + pr_err("addr %lx does not match expected %lx\n", ptr, entry); return -EINVAL; } /* - * We want to nop the line, but the next line is - * 0xe8, 0x41, 0x00, 0x28 ld r2,40(r1) - * This needs to be turned to a nop too. - */ - if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE)) - return -EFAULT; - - if (op != 0xe8410028) { - printk(KERN_ERR "Next line is not ld! (%08x)\n", op); - return -EINVAL; - } - - /* - * Milton Miller pointed out that we can not blindly do nops. - * If a task was preempted when calling a trace function, - * the nops will remove the way to restore the TOC in r2 - * and the r2 TOC will get corrupted. - */ - - /* - * Replace: - * bl <tramp> <==== will be replaced with "b 1f" - * ld r2,40(r1) - * 1: + * Our original call site looks like: + * + * bl <tramp> + * ld r2,XX(r1) + * + * Milton Miller pointed out that we can not simply nop the branch. + * If a task was preempted when calling a trace function, the nops + * will remove the way to restore the TOC in r2 and the r2 TOC will + * get corrupted. + * + * Use a b +8 to jump over the load. */ op = 0x48000008; /* b +8 */ @@ -231,7 +181,7 @@ __ftrace_make_nop(struct module *mod, /* Make sure that that this is still a 24bit jump */ if (!is_bl_op(op)) { - printk(KERN_ERR "Not expected bl: opcode is %x\n", op); + pr_err("Not expected bl: opcode is %x\n", op); return -EINVAL; } @@ -250,7 +200,7 @@ __ftrace_make_nop(struct module *mod, /* Find where the trampoline jumps to */ if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) { - printk(KERN_ERR "Failed to read %lx\n", tramp); + pr_err("Failed to read %lx\n", tramp); return -EFAULT; } @@ -261,7 +211,7 @@ __ftrace_make_nop(struct module *mod, ((jmp[1] & 0xffff0000) != 0x398c0000) || (jmp[2] != 0x7d8903a6) || (jmp[3] != 0x4e800420)) { - printk(KERN_ERR "Not a trampoline\n"); + pr_err("Not a trampoline\n"); return -EINVAL; } @@ -273,8 +223,7 @@ __ftrace_make_nop(struct module *mod, pr_devel(" %lx ", tramp); if (tramp != addr) { - printk(KERN_ERR - "Trampoline location %08lx does not match addr\n", + pr_err("Trampoline location %08lx does not match addr\n", tramp); return -EINVAL; } @@ -315,15 +264,13 @@ int ftrace_make_nop(struct module *mod, */ if (!rec->arch.mod) { if (!mod) { - printk(KERN_ERR "No module loaded addr=%lx\n", - addr); + pr_err("No module loaded addr=%lx\n", addr); return -EFAULT; } rec->arch.mod = mod; } else if (mod) { if (mod != rec->arch.mod) { - printk(KERN_ERR - "Record mod %p not equal to passed in mod %p\n", + pr_err("Record mod %p not equal to passed in mod %p\n", rec->arch.mod, mod); return -EINVAL; } @@ -344,45 +291,42 @@ static int __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { unsigned int op[2]; - unsigned long ip = rec->ip; + void *ip = (void *)rec->ip; /* read where this goes */ - if (probe_kernel_read(op, (void *)ip, MCOUNT_INSN_SIZE * 2)) + if (probe_kernel_read(op, ip, sizeof(op))) return -EFAULT; /* - * It should be pointing to two nops or - * b +8; ld r2,40(r1) + * We expect to see: + * + * b +8 + * ld r2,XX(r1) + * + * The load offset is different depending on the ABI. For simplicity + * just mask it out when doing the compare. */ - if (((op[0] != 0x48000008) || (op[1] != 0xe8410028)) && - ((op[0] != PPC_INST_NOP) || (op[1] != PPC_INST_NOP))) { - printk(KERN_ERR "Expected NOPs but have %x %x\n", op[0], op[1]); + if ((op[0] != 0x48000008) || ((op[1] & 0xffff0000) != 0xe8410000)) { + pr_err("Unexpected call sequence: %x %x\n", op[0], op[1]); return -EINVAL; } /* If we never set up a trampoline to ftrace_caller, then bail */ if (!rec->arch.mod->arch.tramp) { - printk(KERN_ERR "No ftrace trampoline\n"); + pr_err("No ftrace trampoline\n"); return -EINVAL; } - /* create the branch to the trampoline */ - op[0] = create_branch((unsigned int *)ip, - rec->arch.mod->arch.tramp, BRANCH_SET_LINK); - if (!op[0]) { - printk(KERN_ERR "REL24 out of range!\n"); + /* Ensure branch is within 24 bits */ + if (!create_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) { + pr_err("Branch out of range\n"); return -EINVAL; } - /* ld r2,40(r1) */ - op[1] = 0xe8410028; - - pr_devel("write to %lx\n", rec->ip); - - if (probe_kernel_write((void *)ip, op, MCOUNT_INSN_SIZE * 2)) - return -EPERM; - - flush_icache_range(ip, ip + 8); + if (patch_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) { + pr_err("REL24 out of range!\n"); + return -EINVAL; + } return 0; } @@ -399,13 +343,13 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) /* It should be pointing to a nop */ if (op != PPC_INST_NOP) { - printk(KERN_ERR "Expected NOP but have %x\n", op); + pr_err("Expected NOP but have %x\n", op); return -EINVAL; } /* If we never set up a trampoline to ftrace_caller, then bail */ if (!rec->arch.mod->arch.tramp) { - printk(KERN_ERR "No ftrace trampoline\n"); + pr_err("No ftrace trampoline\n"); return -EINVAL; } @@ -413,7 +357,7 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) op = create_branch((unsigned int *)ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK); if (!op) { - printk(KERN_ERR "REL24 out of range!\n"); + pr_err("REL24 out of range!\n"); return -EINVAL; } @@ -451,7 +395,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) * already have a module defined. */ if (!rec->arch.mod) { - printk(KERN_ERR "No module loaded\n"); + pr_err("No module loaded\n"); return -EINVAL; } @@ -527,13 +471,8 @@ void arch_ftrace_update_code(int command) ftrace_disable_ftrace_graph_caller(); } -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { - /* caller expects data to be zero */ - unsigned long *p = data; - - *p = 0; - return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 4989661b710..7d7d8635227 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -430,30 +430,18 @@ label: EXCEPTION(0x0F00, Trap_0F, unknown_exception, EXC_XFER_EE) /* 0x1000 - Programmable Interval Timer (PIT) Exception */ - START_EXCEPTION(0x1000, Decrementer) - NORMAL_EXCEPTION_PROLOG - lis r0,TSR_PIS@h - mtspr SPRN_TSR,r0 /* Clear the PIT exception */ - addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_LITE(0x1000, timer_interrupt) - -#if 0 -/* NOTE: - * FIT and WDT handlers are not implemented yet. - */ + . = 0x1000 + b Decrementer /* 0x1010 - Fixed Interval Timer (FIT) Exception */ - STND_EXCEPTION(0x1010, FITException, unknown_exception) + . = 0x1010 + b FITException /* 0x1020 - Watchdog Timer (WDT) Exception */ -#ifdef CONFIG_BOOKE_WDT - CRITICAL_EXCEPTION(0x1020, WDTException, WatchdogException) -#else - CRITICAL_EXCEPTION(0x1020, WDTException, unknown_exception) -#endif -#endif + . = 0x1020 + b WDTException /* 0x1100 - Data TLB Miss Exception * As the name implies, translation is not in the MMU, so search the @@ -738,6 +726,29 @@ label: (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ NOCOPY, crit_transfer_to_handler, ret_from_crit_exc) + /* Programmable Interval Timer (PIT) Exception. (from 0x1000) */ +Decrementer: + NORMAL_EXCEPTION_PROLOG + lis r0,TSR_PIS@h + mtspr SPRN_TSR,r0 /* Clear the PIT exception */ + addi r3,r1,STACK_FRAME_OVERHEAD + EXC_XFER_LITE(0x1000, timer_interrupt) + + /* Fixed Interval Timer (FIT) Exception. (from 0x1010) */ +FITException: + NORMAL_EXCEPTION_PROLOG + addi r3,r1,STACK_FRAME_OVERHEAD; + EXC_XFER_EE(0x1010, unknown_exception) + + /* Watchdog Timer (WDT) Exception. (from 0x1020) */ +WDTException: + CRITICAL_EXCEPTION_PROLOG; + addi r3,r1,STACK_FRAME_OVERHEAD; + EXC_XFER_TEMPLATE(WatchdogException, 0x1020+2, + (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), + NOCOPY, crit_transfer_to_handler, + ret_from_crit_exc) + /* * The other Data TLB exceptions bail out to this point * if they can't resolve the lightweight TLB fault. @@ -811,14 +822,6 @@ finish_tlb_load: rfi /* Should sync shadow TLBs */ b . /* prevent prefetch past rfi */ -/* extern void giveup_fpu(struct task_struct *prev) - * - * The PowerPC 4xx family of processors do not have an FPU, so this just - * returns. - */ -_ENTRY(giveup_fpu) - blr - /* This is where the main kernel code starts. */ start_here: @@ -927,25 +930,6 @@ initial_mmu: tlbwe r4,r0,TLB_DATA /* Load the data portion of the entry */ tlbwe r3,r0,TLB_TAG /* Load the tag portion of the entry */ -#if defined(CONFIG_SERIAL_TEXT_DEBUG) && defined(SERIAL_DEBUG_IO_BASE) - - /* Load a TLB entry for the UART, so that ppc4xx_progress() can use - * the UARTs nice and early. We use a 4k real==virtual mapping. */ - - lis r3,SERIAL_DEBUG_IO_BASE@h - ori r3,r3,SERIAL_DEBUG_IO_BASE@l - mr r4,r3 - clrrwi r4,r4,12 - ori r4,r4,(TLB_WR|TLB_I|TLB_M|TLB_G) - - clrrwi r3,r3,12 - ori r3,r3,(TLB_VALID | TLB_PAGESZ(PAGESZ_4K)) - - li r0,0 /* TLB slot 0 */ - tlbwe r4,r0,TLB_DATA - tlbwe r3,r0,TLB_TAG -#endif /* CONFIG_SERIAL_DEBUG_TEXT && SERIAL_DEBUG_IO_BASE */ - isync /* Establish the exception vector base diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 7a2e5e421ab..c334f53453f 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -769,6 +769,8 @@ finish_tlb_load_47x: */ DEBUG_CRIT_EXCEPTION +interrupt_end: + /* * Global functions */ @@ -782,16 +784,6 @@ _GLOBAL(__fixup_440A_mcheck) sync blr -/* - * extern void giveup_fpu(struct task_struct *prev) - * - * The 44x core does not have an FPU. - */ -#ifndef CONFIG_PPC_FPU -_GLOBAL(giveup_fpu) - blr -#endif - _GLOBAL(set_context) #ifdef CONFIG_BDI_SWITCH diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 116f0868695..a95145d7f61 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -23,6 +23,7 @@ */ #include <linux/threads.h> +#include <linux/init.h> #include <asm/reg.h> #include <asm/page.h> #include <asm/mmu.h> @@ -68,17 +69,18 @@ _stext: _GLOBAL(__start) /* NOP this out unconditionally */ BEGIN_FTR_SECTION - b .__start_initialization_multiplatform + FIXUP_ENDIAN + b __start_initialization_multiplatform END_FTR_SECTION(0, 1) /* Catch branch to 0 in real mode */ trap - /* Secondary processors spin on this value until it becomes nonzero. - * When it does it contains the real address of the descriptor - * of the function that the cpu should jump to to continue - * initialization. + /* Secondary processors spin on this value until it becomes non-zero. + * When non-zero, it contains the real address of the function the cpu + * should jump to. */ + .balign 8 .globl __secondary_hold_spinloop __secondary_hold_spinloop: .llong 0x0 @@ -115,6 +117,7 @@ __run_at_load: */ .globl __secondary_hold __secondary_hold: + FIXUP_ENDIAN #ifndef CONFIG_PPC_BOOK3E mfmsr r24 ori r24,r24,MSR_RI @@ -122,6 +125,8 @@ __secondary_hold: #endif /* Grab our physical cpu number */ mr r24,r3 + /* stash r4 for book3e */ + mr r25,r4 /* Tell the master cpu we're here */ /* Relocation is off & we are located at an address less */ @@ -129,16 +134,30 @@ __secondary_hold: std r24,__secondary_hold_acknowledge-_stext(0) sync + li r26,0 +#ifdef CONFIG_PPC_BOOK3E + tovirt(r26,r26) +#endif /* All secondary cpus wait here until told to start. */ -100: ld r4,__secondary_hold_spinloop-_stext(0) - cmpdi 0,r4,0 +100: ld r12,__secondary_hold_spinloop-_stext(r26) + cmpdi 0,r12,0 beq 100b #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC) - ld r4,0(r4) /* deref function descriptor */ - mtctr r4 +#ifdef CONFIG_PPC_BOOK3E + tovirt(r12,r12) +#endif + mtctr r12 mr r3,r24 + /* + * it may be the case that other platforms have r4 right to + * begin with, this gives us some safety in case it is not + */ +#ifdef CONFIG_PPC_BOOK3E + mr r4,r25 +#else li r4,0 +#endif /* Make sure that patched code is visible */ isync bctr @@ -165,15 +184,16 @@ _GLOBAL(generic_secondary_thread_init) mr r24,r3 /* turn on 64-bit mode */ - bl .enable_64b_mode + bl enable_64b_mode /* get a valid TOC pointer, wherever we're mapped at */ - bl .relative_toc + bl relative_toc + tovirt(r2,r2) #ifdef CONFIG_PPC_BOOK3E /* Book3E initialization */ mr r3,r24 - bl .book3e_secondary_thread_init + bl book3e_secondary_thread_init #endif b generic_secondary_common_init @@ -187,20 +207,22 @@ _GLOBAL(generic_secondary_thread_init) * as SCOM before entry). */ _GLOBAL(generic_secondary_smp_init) + FIXUP_ENDIAN mr r24,r3 mr r25,r4 /* turn on 64-bit mode */ - bl .enable_64b_mode + bl enable_64b_mode /* get a valid TOC pointer, wherever we're mapped at */ - bl .relative_toc + bl relative_toc + tovirt(r2,r2) #ifdef CONFIG_PPC_BOOK3E /* Book3E initialization */ mr r3,r24 mr r4,r25 - bl .book3e_secondary_core_init + bl book3e_secondary_core_init #endif generic_secondary_common_init: @@ -212,7 +234,7 @@ generic_secondary_common_init: ld r13,0(r13) /* Get base vaddr of paca array */ #ifndef CONFIG_SMP addi r13,r13,PACA_SIZE /* know r13 if used accidentally */ - b .kexec_wait /* wait for next kernel if !SMP */ + b kexec_wait /* wait for next kernel if !SMP */ #else LOAD_REG_ADDR(r7, nr_cpu_ids) /* Load nr_cpu_ids address */ lwz r7,0(r7) /* also the max paca allocated */ @@ -226,7 +248,7 @@ generic_secondary_common_init: blt 1b mr r3,r24 /* not found, copy phys to r3 */ - b .kexec_wait /* next kernel might do better */ + b kexec_wait /* next kernel might do better */ 2: SET_PACA(r13) #ifdef CONFIG_PPC_BOOK3E @@ -240,11 +262,13 @@ generic_secondary_common_init: /* See if we need to call a cpu state restore handler */ LOAD_REG_ADDR(r23, cur_cpu_spec) ld r23,0(r23) - ld r23,CPU_SPEC_RESTORE(r23) - cmpdi 0,r23,0 + ld r12,CPU_SPEC_RESTORE(r23) + cmpdi 0,r12,0 beq 3f - ld r23,0(r23) - mtctr r23 +#if !defined(_CALL_ELF) || _CALL_ELF != 2 + ld r12,0(r12) +#endif + mtctr r12 bctrl 3: LOAD_REG_ADDR(r3, spinning_secondaries) /* Decrement spinning_secondaries */ @@ -275,7 +299,7 @@ generic_secondary_common_init: * Assumes we're mapped EA == RA if the MMU is on. */ #ifdef CONFIG_PPC_BOOK3S -_STATIC(__mmu_off) +__mmu_off: mfmsr r3 andi. r0,r3,MSR_IR|MSR_DR beqlr @@ -300,12 +324,12 @@ _STATIC(__mmu_off) * DT block, r4 is a physical pointer to the kernel itself * */ -_GLOBAL(__start_initialization_multiplatform) +__start_initialization_multiplatform: /* Make sure we are running in 64 bits mode */ - bl .enable_64b_mode + bl enable_64b_mode /* Get TOC pointer (current runtime address) */ - bl .relative_toc + bl relative_toc /* find out where we are now */ bcl 20,31,$+4 @@ -318,7 +342,7 @@ _GLOBAL(__start_initialization_multiplatform) */ cmpldi cr0,r5,0 beq 1f - b .__boot_from_prom /* yes -> prom */ + b __boot_from_prom /* yes -> prom */ 1: /* Save parameters */ mr r31,r3 @@ -330,8 +354,8 @@ _GLOBAL(__start_initialization_multiplatform) #endif #ifdef CONFIG_PPC_BOOK3E - bl .start_initialization_book3e - b .__after_prom_start + bl start_initialization_book3e + b __after_prom_start #else /* Setup some critical 970 SPRs before switching MMU off */ mfspr r0,SPRN_PVR @@ -344,15 +368,15 @@ _GLOBAL(__start_initialization_multiplatform) beq 1f cmpwi r0,0x45 /* 970GX */ bne 2f -1: bl .__cpu_preinit_ppc970 +1: bl __cpu_preinit_ppc970 2: /* Switch off MMU if not already off */ - bl .__mmu_off - b .__after_prom_start + bl __mmu_off + b __after_prom_start #endif /* CONFIG_PPC_BOOK3E */ -_INIT_STATIC(__boot_from_prom) +__boot_from_prom: #ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE /* Save parameters */ mr r31,r3 @@ -371,7 +395,7 @@ _INIT_STATIC(__boot_from_prom) #ifdef CONFIG_RELOCATABLE /* Relocate code for where we are now */ mr r3,r26 - bl .relocate + bl relocate #endif /* Restore parameters */ @@ -383,14 +407,14 @@ _INIT_STATIC(__boot_from_prom) /* Do all of the interaction with OF client interface */ mr r8,r26 - bl .prom_init + bl prom_init #endif /* #CONFIG_PPC_OF_BOOT_TRAMPOLINE */ /* We never return. We also hit that trap if trying to boot * from OF while CONFIG_PPC_OF_BOOT_TRAMPOLINE isn't selected */ trap -_STATIC(__after_prom_start) +__after_prom_start: #ifdef CONFIG_RELOCATABLE /* process relocations for the final address of the kernel */ lis r25,PAGE_OFFSET@highest /* compute virtual base of kernel */ @@ -400,7 +424,7 @@ _STATIC(__after_prom_start) bne 1f add r25,r25,r26 1: mr r3,r25 - bl .relocate + bl relocate #endif /* @@ -440,22 +464,23 @@ _STATIC(__after_prom_start) lis r5,(copy_to_here - _stext)@ha addi r5,r5,(copy_to_here - _stext)@l /* # bytes of memory to copy */ - bl .copy_and_flush /* copy the first n bytes */ + bl copy_and_flush /* copy the first n bytes */ /* this includes the code being */ /* executed here. */ addis r8,r3,(4f - _stext)@ha /* Jump to the copy of this code */ - addi r8,r8,(4f - _stext)@l /* that we just made */ - mtctr r8 + addi r12,r8,(4f - _stext)@l /* that we just made */ + mtctr r12 bctr +.balign 8 p_end: .llong _end - _stext 4: /* Now copy the rest of the kernel up to _end */ addis r5,r26,(p_end - _stext)@ha ld r5,(p_end - _stext)@l(r5) /* get _end */ -5: bl .copy_and_flush /* copy the rest */ +5: bl copy_and_flush /* copy the rest */ -9: b .start_here_multiplatform +9: b start_here_multiplatform /* * Copy routine used to copy the kernel to start at physical address 0 @@ -490,6 +515,7 @@ _GLOBAL(copy_and_flush) sync addi r5,r5,8 addi r6,r6,8 + isync blr .align 8 @@ -518,7 +544,7 @@ __secondary_start_pmac_0: _GLOBAL(pmac_secondary_start) /* turn on 64-bit mode */ - bl .enable_64b_mode + bl enable_64b_mode li r0,0 mfspr r3,SPRN_HID4 @@ -530,10 +556,11 @@ _GLOBAL(pmac_secondary_start) slbia /* get TOC pointer (real address) */ - bl .relative_toc + bl relative_toc + tovirt(r2,r2) /* Copy some CPU settings from CPU 0 */ - bl .__restore_cpu_ppc970 + bl __restore_cpu_ppc970 /* pSeries do that early though I don't think we really need it */ mfmsr r3 @@ -592,7 +619,7 @@ __secondary_start: std r14,PACAKSAVE(r13) /* Do early setup for that CPU (stab, slb, hash table pointer) */ - bl .early_setup_secondary + bl early_setup_secondary /* * setup the new stack pointer, but *don't* use this until @@ -612,7 +639,7 @@ __secondary_start: stb r0,PACAIRQHAPPENED(r13) /* enable MMU and jump to start_secondary */ - LOAD_REG_ADDR(r3, .start_secondary_prolog) + LOAD_REG_ADDR(r3, start_secondary_prolog) LOAD_REG_IMMEDIATE(r4, MSR_KERNEL) mtspr SPRN_SRR0,r3 @@ -625,11 +652,11 @@ __secondary_start: * zero the stack back-chain pointer and get the TOC virtual address * before going into C code. */ -_GLOBAL(start_secondary_prolog) +start_secondary_prolog: ld r2,PACATOC(r13) li r3,0 std r3,0(r1) /* Zero the stack frame pointer */ - bl .start_secondary + bl start_secondary b . /* * Reset stack pointer and call start_secondary @@ -640,14 +667,14 @@ _GLOBAL(start_secondary_resume) ld r1,PACAKSAVE(r13) /* Reload kernel stack pointer */ li r3,0 std r3,0(r1) /* Zero the stack frame pointer */ - bl .start_secondary + bl start_secondary b . #endif /* * This subroutine clobbers r11 and r12 */ -_GLOBAL(enable_64b_mode) +enable_64b_mode: mfmsr r11 /* grab the current MSR */ #ifdef CONFIG_PPC_BOOK3E oris r11,r11,0x8000 /* CM bit set, we'll set ICM later */ @@ -665,6 +692,13 @@ _GLOBAL(enable_64b_mode) * This puts the TOC pointer into r2, offset by 0x8000 (as expected * by the toolchain). It computes the correct value for wherever we * are running at the moment, using position-independent code. + * + * Note: The compiler constructs pointers using offsets from the + * TOC in -mcmodel=medium mode. After we relocate to 0 but before + * the MMU is on we need our TOC to be a virtual address otherwise + * these pointers will be real addresses which may get stored and + * accessed later with the MMU on. We use tovirt() at the call + * sites to handle this. */ _GLOBAL(relative_toc) mflr r0 @@ -675,14 +709,16 @@ _GLOBAL(relative_toc) mtlr r0 blr +.balign 8 p_toc: .llong __toc_start + 0x8000 - 0b /* * This is where the main kernel code starts. */ -_INIT_STATIC(start_here_multiplatform) - /* set up the TOC (real address) */ - bl .relative_toc +start_here_multiplatform: + /* set up the TOC */ + bl relative_toc + tovirt(r2,r2) /* Clear out the BSS. It may have been done in prom_init, * already but that's irrelevant since prom_init will soon @@ -740,9 +776,9 @@ _INIT_STATIC(start_here_multiplatform) /* Restore parameters passed from prom_init/kexec */ mr r3,r31 - bl .early_setup /* also sets r13 and SPRG_PACA */ + bl early_setup /* also sets r13 and SPRG_PACA */ - LOAD_REG_ADDR(r3, .start_here_common) + LOAD_REG_ADDR(r3, start_here_common) ld r4,PACAKMSR(r13) mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 @@ -750,7 +786,8 @@ _INIT_STATIC(start_here_multiplatform) b . /* prevent speculative execution */ /* This is where all platforms converge execution */ -_INIT_GLOBAL(start_here_common) + +start_here_common: /* relocation is on at this point */ std r1,PACAKSAVE(r13) @@ -758,7 +795,7 @@ _INIT_GLOBAL(start_here_common) ld r2,PACATOC(r13) /* Do more system initializations in virtual mode */ - bl .setup_system + bl setup_system /* Mark interrupts soft and hard disabled (they might be enabled * in the PACA when doing hotplug) @@ -769,7 +806,7 @@ _INIT_GLOBAL(start_here_common) stb r0,PACAIRQHAPPENED(r13) /* Generic kernel entry */ - bl .start_kernel + bl start_kernel /* Not reached */ BUG_OPCODE diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index b2a5860accf..7ee876d2adb 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -691,10 +691,6 @@ modified_instr: b 151b #endif - .globl giveup_fpu -giveup_fpu: - blr - /* * This is where the main kernel code starts. */ @@ -862,6 +858,9 @@ initial_mmu: addis r11, r11, 0x0080 /* Add 8M */ mtspr SPRN_MD_RPN, r11 + addi r10, r10, 0x0100 + mtspr SPRN_MD_CTR, r10 + addis r8, r8, 0x0080 /* Add 8M */ mtspr SPRN_MD_EPN, r8 mtspr SPRN_MD_TWC, r9 diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 5f051eeb93a..a620203f7de 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -199,11 +199,6 @@ .align 5; \ label: -#define FINISH_EXCEPTION(func) \ - bl transfer_to_handler_full; \ - .long func; \ - .long ret_from_except_full - #define EXCEPTION(n, intno, label, hdlr, xfer) \ START_EXCEPTION(label); \ NORMAL_EXCEPTION_PROLOG(intno); \ @@ -286,13 +281,13 @@ label: andis. r10,r10,(DBSR_IC|DBSR_BT)@h; \ beq+ 2f; \ \ - lis r10,KERNELBASE@h; /* check if exception in vectors */ \ - ori r10,r10,KERNELBASE@l; \ + lis r10,interrupt_base@h; /* check if exception in vectors */ \ + ori r10,r10,interrupt_base@l; \ cmplw r12,r10; \ blt+ 2f; /* addr below exception vectors */ \ \ - lis r10,DebugDebug@h; \ - ori r10,r10,DebugDebug@l; \ + lis r10,interrupt_end@h; \ + ori r10,r10,interrupt_end@l; \ cmplw r12,r10; \ bgt+ 2f; /* addr above exception vectors */ \ \ @@ -339,13 +334,13 @@ label: andis. r10,r10,(DBSR_IC|DBSR_BT)@h; \ beq+ 2f; \ \ - lis r10,KERNELBASE@h; /* check if exception in vectors */ \ - ori r10,r10,KERNELBASE@l; \ + lis r10,interrupt_base@h; /* check if exception in vectors */ \ + ori r10,r10,interrupt_base@l; \ cmplw r12,r10; \ blt+ 2f; /* addr below exception vectors */ \ \ - lis r10,DebugCrit@h; \ - ori r10,r10,DebugCrit@l; \ + lis r10,interrupt_end@h; \ + ori r10,r10,interrupt_end@l; \ cmplw r12,r10; \ bgt+ 2f; /* addr above exception vectors */ \ \ diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 6f62a737f60..b497188a94a 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -65,29 +65,78 @@ _ENTRY(_start); nop /* Translate device tree address to physical, save in r30/r31 */ - mfmsr r16 - mfspr r17,SPRN_PID - rlwinm r17,r17,16,0x3fff0000 /* turn PID into MAS6[SPID] */ - rlwimi r17,r16,28,0x00000001 /* turn MSR[DS] into MAS6[SAS] */ - mtspr SPRN_MAS6,r17 - - tlbsx 0,r3 /* must succeed */ - - mfspr r16,SPRN_MAS1 - mfspr r20,SPRN_MAS3 - rlwinm r17,r16,25,0x1f /* r17 = log2(page size) */ - li r18,1024 - slw r18,r18,r17 /* r18 = page size */ - addi r18,r18,-1 - and r19,r3,r18 /* r19 = page offset */ - andc r31,r20,r18 /* r31 = page base */ - or r31,r31,r19 /* r31 = devtree phys addr */ - mfspr r30,SPRN_MAS7 + bl get_phys_addr + mr r30,r3 + mr r31,r4 li r25,0 /* phys kernel start (low) */ li r24,0 /* CPU number */ li r23,0 /* phys kernel start (high) */ +#ifdef CONFIG_RELOCATABLE + LOAD_REG_ADDR_PIC(r3, _stext) /* Get our current runtime base */ + + /* Translate _stext address to physical, save in r23/r25 */ + bl get_phys_addr + mr r23,r3 + mr r25,r4 + + bl 0f +0: mflr r8 + addis r3,r8,(is_second_reloc - 0b)@ha + lwz r19,(is_second_reloc - 0b)@l(r3) + + /* Check if this is the second relocation. */ + cmpwi r19,1 + bne 1f + + /* + * For the second relocation, we already get the real memstart_addr + * from device tree. So we will map PAGE_OFFSET to memstart_addr, + * then the virtual address of start kernel should be: + * PAGE_OFFSET + (kernstart_addr - memstart_addr) + * Since the offset between kernstart_addr and memstart_addr should + * never be beyond 1G, so we can just use the lower 32bit of them + * for the calculation. + */ + lis r3,PAGE_OFFSET@h + + addis r4,r8,(kernstart_addr - 0b)@ha + addi r4,r4,(kernstart_addr - 0b)@l + lwz r5,4(r4) + + addis r6,r8,(memstart_addr - 0b)@ha + addi r6,r6,(memstart_addr - 0b)@l + lwz r7,4(r6) + + subf r5,r7,r5 + add r3,r3,r5 + b 2f + +1: + /* + * We have the runtime (virutal) address of our base. + * We calculate our shift of offset from a 64M page. + * We could map the 64M page we belong to at PAGE_OFFSET and + * get going from there. + */ + lis r4,KERNELBASE@h + ori r4,r4,KERNELBASE@l + rlwinm r6,r25,0,0x3ffffff /* r6 = PHYS_START % 64M */ + rlwinm r5,r4,0,0x3ffffff /* r5 = KERNELBASE % 64M */ + subf r3,r5,r6 /* r3 = r6 - r5 */ + add r3,r4,r3 /* Required Virtual Address */ + +2: bl relocate + + /* + * For the second relocation, we already set the right tlb entries + * for the kernel space, so skip the code in fsl_booke_entry_mapping.S + */ + cmpwi r19,1 + beq set_ivor +#endif + /* We try to not make any assumptions about how the boot loader * setup or used the TLBs. We invalidate all mappings from the * boot loader and load a single entry in TLB1[0] to map the @@ -113,6 +162,7 @@ _ENTRY(__early_start) #include "fsl_booke_entry_mapping.S" #undef ENTRY_MAPPING_BOOT_SETUP +set_ivor: /* Establish the interrupt vector offsets */ SET_IVOR(0, CriticalInput); SET_IVOR(1, MachineCheck); @@ -166,8 +216,7 @@ _ENTRY(__early_start) /* Check to see if we're the second processor, and jump * to the secondary_start code if so */ - lis r24, boot_cpuid@h - ori r24, r24, boot_cpuid@l + LOAD_REG_ADDR_PIC(r24, boot_cpuid) lwz r24, 0(r24) cmpwi r24, -1 mfspr r24,SPRN_PIR @@ -197,6 +246,18 @@ _ENTRY(__early_start) bl early_init +#ifdef CONFIG_RELOCATABLE + mr r3,r30 + mr r4,r31 +#ifdef CONFIG_PHYS_64BIT + mr r5,r23 + mr r6,r25 +#else + mr r5,r25 +#endif + bl relocate_init +#endif + #ifdef CONFIG_DYNAMIC_MEMSTART lis r3,kernstart_addr@ha la r3,kernstart_addr@l(r3) @@ -555,27 +616,27 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) #ifdef CONFIG_SPE /* SPE Unavailable */ START_EXCEPTION(SPEUnavailable) - NORMAL_EXCEPTION_PROLOG(SPE_UNAVAIL) + NORMAL_EXCEPTION_PROLOG(SPE_ALTIVEC_UNAVAIL) beq 1f bl load_up_spe b fast_exception_return 1: addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_EE_LITE(0x2010, KernelSPE) #else - EXCEPTION(0x2020, SPE_UNAVAIL, SPEUnavailable, \ + EXCEPTION(0x2020, SPE_ALTIVEC_UNAVAIL, SPEUnavailable, \ unknown_exception, EXC_XFER_EE) #endif /* CONFIG_SPE */ /* SPE Floating Point Data */ #ifdef CONFIG_SPE - EXCEPTION(0x2030, SPE_FP_DATA, SPEFloatingPointData, \ - SPEFloatingPointException, EXC_XFER_EE); + EXCEPTION(0x2030, SPE_FP_DATA_ALTIVEC_ASSIST, SPEFloatingPointData, + SPEFloatingPointException, EXC_XFER_EE) /* SPE Floating Point Round */ EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ SPEFloatingPointRoundException, EXC_XFER_EE) #else - EXCEPTION(0x2040, SPE_FP_DATA, SPEFloatingPointData, \ + EXCEPTION(0x2040, SPE_FP_DATA_ALTIVEC_ASSIST, SPEFloatingPointData, unknown_exception, EXC_XFER_EE) EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ unknown_exception, EXC_XFER_EE) @@ -605,6 +666,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) /* Embedded Hypervisor Privilege */ EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception, EXC_XFER_EE) +interrupt_end: + /* * Local functions */ @@ -854,6 +917,33 @@ KernelSPE: #endif /* CONFIG_SPE */ /* + * Translate the effec addr in r3 to phys addr. The phys addr will be put + * into r3(higher 32bit) and r4(lower 32bit) + */ +get_phys_addr: + mfmsr r8 + mfspr r9,SPRN_PID + rlwinm r9,r9,16,0x3fff0000 /* turn PID into MAS6[SPID] */ + rlwimi r9,r8,28,0x00000001 /* turn MSR[DS] into MAS6[SAS] */ + mtspr SPRN_MAS6,r9 + + tlbsx 0,r3 /* must succeed */ + + mfspr r8,SPRN_MAS1 + mfspr r12,SPRN_MAS3 + rlwinm r9,r8,25,0x1f /* r9 = log2(page size) */ + li r10,1024 + slw r10,r10,r9 /* r10 = page size */ + addi r10,r10,-1 + and r11,r3,r10 /* r11 = page offset */ + andc r4,r12,r10 /* r4 = page base */ + or r4,r4,r11 /* r4 = devtree phys addr */ +#ifdef CONFIG_PHYS_64BIT + mfspr r3,SPRN_MAS7 +#endif + blr + +/* * Global functions */ @@ -946,16 +1036,6 @@ _GLOBAL(giveup_spe) #endif /* CONFIG_SPE */ /* - * extern void giveup_fpu(struct task_struct *prev) - * - * Not all FSL Book-E cores have an FPU - */ -#ifndef CONFIG_PPC_FPU -_GLOBAL(giveup_fpu) - blr -#endif - -/* * extern void abort(void) * * At present, this routine just applies a system reset. @@ -1065,24 +1145,36 @@ _GLOBAL(__flush_disable_L1) /* When we get here, r24 needs to hold the CPU # */ .globl __secondary_start __secondary_start: - lis r3,__secondary_hold_acknowledge@h - ori r3,r3,__secondary_hold_acknowledge@l - stw r24,0(r3) - - li r3,0 - mr r4,r24 /* Why? */ - bl call_setup_cpu - - lis r3,tlbcam_index@ha - lwz r3,tlbcam_index@l(r3) + LOAD_REG_ADDR_PIC(r3, tlbcam_index) + lwz r3,0(r3) mtctr r3 li r26,0 /* r26 safe? */ + bl switch_to_as1 + mr r27,r3 /* tlb entry */ /* Load each CAM entry */ 1: mr r3,r26 bl loadcam_entry addi r26,r26,1 bdnz 1b + mr r3,r27 /* tlb entry */ + LOAD_REG_ADDR_PIC(r4, memstart_addr) + lwz r4,0(r4) + mr r5,r25 /* phys kernel start */ + rlwinm r5,r5,0,~0x3ffffff /* aligned 64M */ + subf r4,r5,r4 /* memstart_addr - phys kernel start */ + li r5,0 /* no device tree */ + li r6,0 /* not boot cpu */ + bl restore_to_as0 + + + lis r3,__secondary_hold_acknowledge@h + ori r3,r3,__secondary_hold_acknowledge@l + stw r24,0(r3) + + li r3,0 + mr r4,r24 /* Why? */ + bl call_setup_cpu /* get current_thread_info and current */ lis r1,secondary_ti@ha @@ -1119,6 +1211,112 @@ __secondary_hold_acknowledge: #endif /* + * Create a tlb entry with the same effective and physical address as + * the tlb entry used by the current running code. But set the TS to 1. + * Then switch to the address space 1. It will return with the r3 set to + * the ESEL of the new created tlb. + */ +_GLOBAL(switch_to_as1) + mflr r5 + + /* Find a entry not used */ + mfspr r3,SPRN_TLB1CFG + andi. r3,r3,0xfff + mfspr r4,SPRN_PID + rlwinm r4,r4,16,0x3fff0000 /* turn PID into MAS6[SPID] */ + mtspr SPRN_MAS6,r4 +1: lis r4,0x1000 /* Set MAS0(TLBSEL) = 1 */ + addi r3,r3,-1 + rlwimi r4,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ + mtspr SPRN_MAS0,r4 + tlbre + mfspr r4,SPRN_MAS1 + andis. r4,r4,MAS1_VALID@h + bne 1b + + /* Get the tlb entry used by the current running code */ + bl 0f +0: mflr r4 + tlbsx 0,r4 + + mfspr r4,SPRN_MAS1 + ori r4,r4,MAS1_TS /* Set the TS = 1 */ + mtspr SPRN_MAS1,r4 + + mfspr r4,SPRN_MAS0 + rlwinm r4,r4,0,~MAS0_ESEL_MASK + rlwimi r4,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ + mtspr SPRN_MAS0,r4 + tlbwe + isync + sync + + mfmsr r4 + ori r4,r4,MSR_IS | MSR_DS + mtspr SPRN_SRR0,r5 + mtspr SPRN_SRR1,r4 + sync + rfi + +/* + * Restore to the address space 0 and also invalidate the tlb entry created + * by switch_to_as1. + * r3 - the tlb entry which should be invalidated + * r4 - __pa(PAGE_OFFSET in AS1) - __pa(PAGE_OFFSET in AS0) + * r5 - device tree virtual address. If r4 is 0, r5 is ignored. + * r6 - boot cpu +*/ +_GLOBAL(restore_to_as0) + mflr r0 + + bl 0f +0: mflr r9 + addi r9,r9,1f - 0b + + /* + * We may map the PAGE_OFFSET in AS0 to a different physical address, + * so we need calculate the right jump and device tree address based + * on the offset passed by r4. + */ + add r9,r9,r4 + add r5,r5,r4 + add r0,r0,r4 + +2: mfmsr r7 + li r8,(MSR_IS | MSR_DS) + andc r7,r7,r8 + + mtspr SPRN_SRR0,r9 + mtspr SPRN_SRR1,r7 + sync + rfi + + /* Invalidate the temporary tlb entry for AS1 */ +1: lis r9,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r9,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */ + mtspr SPRN_MAS0,r9 + tlbre + mfspr r9,SPRN_MAS1 + rlwinm r9,r9,0,2,31 /* Clear MAS1 Valid and IPPROT */ + mtspr SPRN_MAS1,r9 + tlbwe + isync + + cmpwi r4,0 + cmpwi cr1,r6,0 + cror eq,4*cr1+eq,eq + bne 3f /* offset != 0 && is_boot_cpu */ + mtlr r0 + blr + + /* + * The PAGE_OFFSET will map to a different physical address, + * jump to _start to do another relocation again. + */ +3: mr r3,r5 + bl _start + +/* * We put a few things here that have to be page-aligned. This stuff * goes at the beginning of the data segment, which is page-aligned. */ diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index a89cae481b0..0bb5918faaa 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -28,7 +28,6 @@ #include <linux/percpu.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/init.h> #include <linux/smp.h> #include <asm/hw_breakpoint.h> @@ -73,7 +72,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp) * If so, DABR will be populated in single_step_dabr_instruction(). */ if (current->thread.last_hit_ubp != bp) - set_dabr(info->address | info->type | DABR_TRANSLATION, info->dabrx); + __set_breakpoint(info); return 0; } @@ -97,7 +96,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) } *slot = NULL; - set_dabr(0, 0); + hw_breakpoint_disable(); } /* @@ -127,19 +126,13 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) int arch_bp_generic_fields(int type, int *gen_bp_type) { - switch (type) { - case DABR_DATA_READ: - *gen_bp_type = HW_BREAKPOINT_R; - break; - case DABR_DATA_WRITE: - *gen_bp_type = HW_BREAKPOINT_W; - break; - case (DABR_DATA_WRITE | DABR_DATA_READ): - *gen_bp_type = (HW_BREAKPOINT_W | HW_BREAKPOINT_R); - break; - default: + *gen_bp_type = 0; + if (type & HW_BRK_TYPE_READ) + *gen_bp_type |= HW_BREAKPOINT_R; + if (type & HW_BRK_TYPE_WRITE) + *gen_bp_type |= HW_BREAKPOINT_W; + if (*gen_bp_type == 0) return -EINVAL; - } return 0; } @@ -148,35 +141,28 @@ int arch_bp_generic_fields(int type, int *gen_bp_type) */ int arch_validate_hwbkpt_settings(struct perf_event *bp) { - int ret = -EINVAL; + int ret = -EINVAL, length_max; struct arch_hw_breakpoint *info = counter_arch_bp(bp); if (!bp) return ret; - switch (bp->attr.bp_type) { - case HW_BREAKPOINT_R: - info->type = DABR_DATA_READ; - break; - case HW_BREAKPOINT_W: - info->type = DABR_DATA_WRITE; - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - info->type = (DABR_DATA_READ | DABR_DATA_WRITE); - break; - default: + info->type = HW_BRK_TYPE_TRANSLATE; + if (bp->attr.bp_type & HW_BREAKPOINT_R) + info->type |= HW_BRK_TYPE_READ; + if (bp->attr.bp_type & HW_BREAKPOINT_W) + info->type |= HW_BRK_TYPE_WRITE; + if (info->type == HW_BRK_TYPE_TRANSLATE) + /* must set alteast read or write */ return ret; - } - + if (!(bp->attr.exclude_user)) + info->type |= HW_BRK_TYPE_USER; + if (!(bp->attr.exclude_kernel)) + info->type |= HW_BRK_TYPE_KERNEL; + if (!(bp->attr.exclude_hv)) + info->type |= HW_BRK_TYPE_HYP; info->address = bp->attr.bp_addr; info->len = bp->attr.bp_len; - info->dabrx = DABRX_ALL; - if (bp->attr.exclude_user) - info->dabrx &= ~DABRX_USER; - if (bp->attr.exclude_kernel) - info->dabrx &= ~DABRX_KERNEL; - if (bp->attr.exclude_hv) - info->dabrx &= ~DABRX_HYP; /* * Since breakpoint length can be a maximum of HW_BREAKPOINT_LEN(8) @@ -184,8 +170,16 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * HW_BREAKPOINT_ALIGN by rounding off to the lower address, the * 'symbolsize' should satisfy the check below. */ + length_max = 8; /* DABR */ + if (cpu_has_feature(CPU_FTR_DAWR)) { + length_max = 512 ; /* 64 doublewords */ + /* DAWR region can't cross 512 boundary */ + if ((bp->attr.bp_addr >> 10) != + ((bp->attr.bp_addr + bp->attr.bp_len - 1) >> 10)) + return -EINVAL; + } if (info->len > - (HW_BREAKPOINT_LEN - (info->address & HW_BREAKPOINT_ALIGN))) + (length_max - (info->address & HW_BREAKPOINT_ALIGN))) return -EINVAL; return 0; } @@ -204,7 +198,7 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs) info = counter_arch_bp(tsk->thread.last_hit_ubp); regs->msr &= ~MSR_SE; - set_dabr(info->address | info->type | DABR_TRANSLATION, info->dabrx); + __set_breakpoint(info); tsk->thread.last_hit_ubp = NULL; } @@ -222,7 +216,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) unsigned long dar = regs->dar; /* Disable breakpoints during exception handling */ - set_dabr(0, 0); + hw_breakpoint_disable(); /* * The counter may be concurrently released but that can only @@ -255,8 +249,10 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) * we still need to single-step the instruction, but we don't * generate an event. */ - info->extraneous_interrupt = !((bp->attr.bp_addr <= dar) && - (dar - bp->attr.bp_addr < bp->attr.bp_len)); + info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ; + if (!((bp->attr.bp_addr <= dar) && + (dar - bp->attr.bp_addr < bp->attr.bp_len))) + info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; /* Do not emulate user-space instructions, instead single-step them */ if (user_mode(regs)) { @@ -285,10 +281,10 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) * As a policy, the callback is invoked in a 'trigger-after-execute' * fashion */ - if (!info->extraneous_interrupt) + if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ)) perf_bp_event(bp, regs); - set_dabr(info->address | info->type | DABR_TRANSLATION, info->dabrx); + __set_breakpoint(info); out: rcu_read_unlock(); return rc; @@ -317,10 +313,10 @@ int __kprobes single_step_dabr_instruction(struct die_args *args) * We shall invoke the user-defined callback function in the single * stepping handler to confirm to 'trigger-after-execute' semantics */ - if (!info->extraneous_interrupt) + if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ)) perf_bp_event(bp, regs); - set_dabr(info->address | info->type | DABR_TRANSLATION, info->dabrx); + __set_breakpoint(info); current->thread.last_hit_ubp = NULL; /* diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c index 8220baa46fa..1114d13ac19 100644 --- a/arch/powerpc/kernel/ibmebus.c +++ b/arch/powerpc/kernel/ibmebus.c @@ -205,7 +205,7 @@ static int ibmebus_create_devices(const struct of_device_id *matches) return ret; } -int ibmebus_register_driver(struct of_platform_driver *drv) +int ibmebus_register_driver(struct platform_driver *drv) { /* If the driver uses devices that ibmebus doesn't know, add them */ ibmebus_create_devices(drv->driver.of_match_table); @@ -215,7 +215,7 @@ int ibmebus_register_driver(struct of_platform_driver *drv) } EXPORT_SYMBOL(ibmebus_register_driver); -void ibmebus_unregister_driver(struct of_platform_driver *drv) +void ibmebus_unregister_driver(struct platform_driver *drv) { driver_unregister(&drv->driver); } @@ -292,6 +292,7 @@ out: return rc; return count; } +static BUS_ATTR(probe, S_IWUSR, NULL, ibmebus_store_probe); static ssize_t ibmebus_store_remove(struct bus_type *bus, const char *buf, size_t count) @@ -317,13 +318,14 @@ static ssize_t ibmebus_store_remove(struct bus_type *bus, return -ENODEV; } } +static BUS_ATTR(remove, S_IWUSR, NULL, ibmebus_store_remove); - -static struct bus_attribute ibmebus_bus_attrs[] = { - __ATTR(probe, S_IWUSR, NULL, ibmebus_store_probe), - __ATTR(remove, S_IWUSR, NULL, ibmebus_store_remove), - __ATTR_NULL +static struct attribute *ibmbus_bus_attrs[] = { + &bus_attr_probe.attr, + &bus_attr_remove.attr, + NULL, }; +ATTRIBUTE_GROUPS(ibmbus_bus); static int ibmebus_bus_bus_match(struct device *dev, struct device_driver *drv) { @@ -338,11 +340,10 @@ static int ibmebus_bus_bus_match(struct device *dev, struct device_driver *drv) static int ibmebus_bus_device_probe(struct device *dev) { int error = -ENODEV; - struct of_platform_driver *drv; + struct platform_driver *drv; struct platform_device *of_dev; - const struct of_device_id *match; - drv = to_of_platform_driver(dev->driver); + drv = to_platform_driver(dev->driver); of_dev = to_platform_device(dev); if (!drv->probe) @@ -350,9 +351,8 @@ static int ibmebus_bus_device_probe(struct device *dev) of_dev_get(of_dev); - match = of_match_device(drv->driver.of_match_table, dev); - if (match) - error = drv->probe(of_dev, match); + if (of_driver_match_device(dev, dev->driver)) + error = drv->probe(of_dev); if (error) of_dev_put(of_dev); @@ -362,7 +362,7 @@ static int ibmebus_bus_device_probe(struct device *dev) static int ibmebus_bus_device_remove(struct device *dev) { struct platform_device *of_dev = to_platform_device(dev); - struct of_platform_driver *drv = to_of_platform_driver(dev->driver); + struct platform_driver *drv = to_platform_driver(dev->driver); if (dev->driver && drv->remove) drv->remove(of_dev); @@ -372,7 +372,7 @@ static int ibmebus_bus_device_remove(struct device *dev) static void ibmebus_bus_device_shutdown(struct device *dev) { struct platform_device *of_dev = to_platform_device(dev); - struct of_platform_driver *drv = to_of_platform_driver(dev->driver); + struct platform_driver *drv = to_platform_driver(dev->driver); if (dev->driver && drv->shutdown) drv->shutdown(of_dev); @@ -419,7 +419,7 @@ struct device_attribute ibmebus_bus_device_attrs[] = { static int ibmebus_bus_legacy_suspend(struct device *dev, pm_message_t mesg) { struct platform_device *of_dev = to_platform_device(dev); - struct of_platform_driver *drv = to_of_platform_driver(dev->driver); + struct platform_driver *drv = to_platform_driver(dev->driver); int ret = 0; if (dev->driver && drv->suspend) @@ -430,7 +430,7 @@ static int ibmebus_bus_legacy_suspend(struct device *dev, pm_message_t mesg) static int ibmebus_bus_legacy_resume(struct device *dev) { struct platform_device *of_dev = to_platform_device(dev); - struct of_platform_driver *drv = to_of_platform_driver(dev->driver); + struct platform_driver *drv = to_platform_driver(dev->driver); int ret = 0; if (dev->driver && drv->resume) @@ -715,7 +715,7 @@ static struct dev_pm_ops ibmebus_bus_dev_pm_ops = { struct bus_type ibmebus_bus_type = { .name = "ibmebus", .uevent = of_device_uevent_modalias, - .bus_attrs = ibmebus_bus_attrs, + .bus_groups = ibmbus_bus_groups, .match = ibmebus_bus_bus_match, .probe = ibmebus_bus_device_probe, .remove = ibmebus_bus_device_remove, diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index ea78761aa16..d7216c9abda 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -33,11 +33,6 @@ #include <asm/runlatch.h> #include <asm/smp.h> -#ifdef CONFIG_HOTPLUG_CPU -#define cpu_should_die() cpu_is_offline(smp_processor_id()) -#else -#define cpu_should_die() 0 -#endif unsigned long cpuidle_disable = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(cpuidle_disable); @@ -50,64 +45,38 @@ static int __init powersave_off(char *arg) } __setup("powersave=off", powersave_off); -/* - * The body of the idle task. - */ -void cpu_idle(void) +#ifdef CONFIG_HOTPLUG_CPU +void arch_cpu_idle_dead(void) { - set_thread_flag(TIF_POLLING_NRFLAG); - while (1) { - tick_nohz_idle_enter(); - rcu_idle_enter(); - - while (!need_resched() && !cpu_should_die()) { - ppc64_runlatch_off(); - - if (ppc_md.power_save) { - clear_thread_flag(TIF_POLLING_NRFLAG); - /* - * smp_mb is so clearing of TIF_POLLING_NRFLAG - * is ordered w.r.t. need_resched() test. - */ - smp_mb(); - local_irq_disable(); - - /* Don't trace irqs off for idle */ - stop_critical_timings(); - - /* check again after disabling irqs */ - if (!need_resched() && !cpu_should_die()) - ppc_md.power_save(); - - start_critical_timings(); - - /* Some power_save functions return with - * interrupts enabled, some don't. - */ - if (irqs_disabled()) - local_irq_enable(); - set_thread_flag(TIF_POLLING_NRFLAG); - - } else { - /* - * Go into low thread priority and possibly - * low power mode. - */ - HMT_low(); - HMT_very_low(); - } - } + sched_preempt_enable_no_resched(); + cpu_die(); +} +#endif - HMT_medium(); - ppc64_runlatch_on(); - rcu_idle_exit(); - tick_nohz_idle_exit(); - if (cpu_should_die()) { - sched_preempt_enable_no_resched(); - cpu_die(); - } - schedule_preempt_disabled(); +void arch_cpu_idle(void) +{ + ppc64_runlatch_off(); + + if (ppc_md.power_save) { + ppc_md.power_save(); + /* + * Some power_save functions return with + * interrupts enabled, some don't. + */ + if (irqs_disabled()) + local_irq_enable(); + } else { + local_irq_enable(); + /* + * Go into low thread priority and possibly + * low power mode. + */ + HMT_low(); + HMT_very_low(); } + + HMT_medium(); + ppc64_runlatch_on(); } int powersave_nap; @@ -116,7 +85,7 @@ int powersave_nap; /* * Register the sysctl to set/clear powersave_nap. */ -static ctl_table powersave_nap_ctl_table[]={ +static struct ctl_table powersave_nap_ctl_table[] = { { .procname = "powersave-nap", .data = &powersave_nap, @@ -126,7 +95,7 @@ static ctl_table powersave_nap_ctl_table[]={ }, {} }; -static ctl_table powersave_nap_sysctl_root[] = { +static struct ctl_table powersave_nap_sysctl_root[] = { { .procname = "kernel", .mode = 0555, diff --git a/arch/powerpc/kernel/idle_book3e.S b/arch/powerpc/kernel/idle_book3e.S index 4c7cb400858..48c21acef91 100644 --- a/arch/powerpc/kernel/idle_book3e.S +++ b/arch/powerpc/kernel/idle_book3e.S @@ -16,11 +16,13 @@ #include <asm/ppc-opcode.h> #include <asm/processor.h> #include <asm/thread_info.h> +#include <asm/epapr_hcalls.h> /* 64-bit version only for now */ #ifdef CONFIG_PPC64 -_GLOBAL(book3e_idle) +.macro BOOK3E_IDLE name loop +_GLOBAL(\name) /* Save LR for later */ mflr r0 std r0,16(r1) @@ -41,7 +43,7 @@ _GLOBAL(book3e_idle) */ #ifdef CONFIG_TRACE_IRQFLAGS stdu r1,-128(r1) - bl .trace_hardirqs_on + bl trace_hardirqs_on addi r1,r1,128 #endif li r0,1 @@ -67,7 +69,33 @@ _GLOBAL(book3e_idle) /* We can now re-enable hard interrupts and go to sleep */ wrteei 1 -1: PPC_WAIT(0) + \loop + +.endm + +.macro BOOK3E_IDLE_LOOP +1: + PPC_WAIT(0) b 1b +.endm + +/* epapr_ev_idle_start below is patched with the proper hcall + opcodes during kernel initialization */ +.macro EPAPR_EV_IDLE_LOOP +idle_loop: + LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE)) + +.global epapr_ev_idle_start +epapr_ev_idle_start: + li r3, -1 + nop + nop + nop + b idle_loop +.endm + +BOOK3E_IDLE epapr_ev_idle EPAPR_EV_IDLE_LOOP + +BOOK3E_IDLE book3e_idle BOOK3E_IDLE_LOOP #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S index e3edaa18991..f57a19348bd 100644 --- a/arch/powerpc/kernel/idle_power4.S +++ b/arch/powerpc/kernel/idle_power4.S @@ -46,7 +46,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP) mflr r0 std r0,16(r1) stdu r1,-128(r1) - bl .trace_hardirqs_on + bl trace_hardirqs_on addi r1,r1,128 ld r0,16(r1) mtlr r0 diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S index e11863f4e59..5cf3d367190 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S @@ -17,20 +17,35 @@ #include <asm/ppc-opcode.h> #include <asm/hw_irq.h> #include <asm/kvm_book3s_asm.h> +#include <asm/opal.h> #undef DEBUG - .text +/* Idle state entry routines */ -_GLOBAL(power7_idle) - /* Now check if user or arch enabled NAP mode */ - LOAD_REG_ADDRBASE(r3,powersave_nap) - lwz r4,ADDROFF(powersave_nap)(r3) - cmpwi 0,r4,0 - beqlr - /* fall through */ +#define IDLE_STATE_ENTER_SEQ(IDLE_INST) \ + /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \ + std r0,0(r1); \ + ptesync; \ + ld r0,0(r1); \ +1: cmp cr0,r0,r0; \ + bne 1b; \ + IDLE_INST; \ + b . -_GLOBAL(power7_nap) + .text + +/* + * Pass requested state in r3: + * 0 - nap + * 1 - sleep + * + * To check IRQ_HAPPENED in r4 + * 0 - don't check + * 1 - check + */ +_GLOBAL(power7_powersave_common) + /* Use r3 to pass state nap/sleep/winkle */ /* NAP is a state loss, we create a regs frame on the * stack, fill it up with the state we care about and * stick a pointer to it in PACAR1. We really only @@ -47,7 +62,7 @@ _GLOBAL(power7_nap) /* Make sure FPU, VSX etc... are flushed as we may lose * state when going to nap mode */ - bl .discard_lazy_cpu_state + bl discard_lazy_cpu_state #endif /* CONFIG_SMP */ /* Hard disable interrupts */ @@ -60,6 +75,8 @@ _GLOBAL(power7_nap) lbz r0,PACAIRQHAPPENED(r13) cmpwi cr0,r0,0 beq 1f + cmpwi cr0,r4,0 + beq 1f addi r1,r1,INT_FRAME_SIZE ld r0,16(r1) mtlr r0 @@ -79,25 +96,70 @@ _GLOBAL(power7_nap) /* Continue saving state */ SAVE_GPR(2, r1) SAVE_NVGPRS(r1) - mfcr r3 - std r3,_CCR(r1) + mfcr r4 + std r4,_CCR(r1) std r9,_MSR(r1) std r1,PACAR1(r13) -#ifdef CONFIG_KVM_BOOK3S_64_HV +_GLOBAL(power7_enter_nap_mode) +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* Tell KVM we're napping */ li r4,KVM_HWTHREAD_IN_NAP stb r4,HSTATE_HWTHREAD_STATE(r13) #endif + cmpwi cr0,r3,1 + beq 2f + IDLE_STATE_ENTER_SEQ(PPC_NAP) + /* No return */ +2: IDLE_STATE_ENTER_SEQ(PPC_SLEEP) + /* No return */ - /* Magic NAP mode enter sequence */ - std r0,0(r1) - ptesync - ld r0,0(r1) -1: cmp cr0,r0,r0 - bne 1b - PPC_NAP - b . +_GLOBAL(power7_idle) + /* Now check if user or arch enabled NAP mode */ + LOAD_REG_ADDRBASE(r3,powersave_nap) + lwz r4,ADDROFF(powersave_nap)(r3) + cmpwi 0,r4,0 + beqlr + li r3, 1 + /* fall through */ + +_GLOBAL(power7_nap) + mr r4,r3 + li r3,0 + b power7_powersave_common + /* No return */ + +_GLOBAL(power7_sleep) + li r3,1 + li r4,1 + b power7_powersave_common + /* No return */ + +_GLOBAL(power7_wakeup_tb_loss) + ld r2,PACATOC(r13); + ld r1,PACAR1(r13) + + /* Time base re-sync */ + li r0,OPAL_RESYNC_TIMEBASE + LOAD_REG_ADDR(r11,opal); + ld r12,8(r11); + ld r2,0(r11); + mtctr r12 + bctrl + + /* TODO: Check r3 for failure */ + + REST_NVGPRS(r1) + REST_GPR(2, r1) + ld r3,_CCR(r1) + ld r4,_MSR(r1) + ld r5,_NIP(r1) + addi r1,r1,INT_FRAME_SIZE + mtcr r3 + mfspr r3,SPRN_SRR1 /* Return SRR1 */ + mtspr SPRN_SRR1,r4 + mtspr SPRN_SRR0,r5 + rfid _GLOBAL(power7_wakeup_loss) ld r1,PACAR1(r13) @@ -115,7 +177,7 @@ _GLOBAL(power7_wakeup_loss) _GLOBAL(power7_wakeup_noloss) lbz r0,PACA_NAPSTATELOST(r13) cmpwi r0,0 - bne .power7_wakeup_loss + bne power7_wakeup_loss ld r1,PACAR1(r13) ld r4,_MSR(r1) ld r5,_NIP(r1) diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index 50e90b7e713..24b968f8e4d 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -53,8 +53,10 @@ static struct iowa_bus *iowa_pci_find(unsigned long vaddr, unsigned long paddr) return NULL; } +#ifdef CONFIG_PPC_INDIRECT_MMIO struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) { + unsigned hugepage_shift; struct iowa_bus *bus; int token; @@ -70,11 +72,17 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END) return NULL; - ptep = find_linux_pte(init_mm.pgd, vaddr); + ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr, + &hugepage_shift); if (ptep == NULL) paddr = 0; - else + else { + /* + * we don't have hugepages backing iomem + */ + WARN_ON(hugepage_shift); paddr = pte_pfn(*ptep) << PAGE_SHIFT; + } bus = iowa_pci_find(vaddr, paddr); if (bus == NULL) @@ -83,13 +91,25 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) return bus; } +#else /* CONFIG_PPC_INDIRECT_MMIO */ +struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) +{ + return NULL; +} +#endif /* !CONFIG_PPC_INDIRECT_MMIO */ +#ifdef CONFIG_PPC_INDIRECT_PIO struct iowa_bus *iowa_pio_find_bus(unsigned long port) { unsigned long vaddr = (unsigned long)pci_io_base + port; return iowa_pci_find(vaddr, 0); } - +#else +struct iowa_bus *iowa_pio_find_bus(unsigned long port) +{ + return NULL; +} +#endif #define DEF_PCI_AC_RET(name, ret, at, al, space, aa) \ static ret iowa_##name at \ @@ -130,6 +150,7 @@ static const struct ppc_pci_io iowa_pci_io = { }; +#ifdef CONFIG_PPC_INDIRECT_MMIO static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, unsigned long flags, void *caller) { @@ -144,6 +165,9 @@ static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, } return res; } +#else /* CONFIG_PPC_INDIRECT_MMIO */ +#define iowa_ioremap NULL +#endif /* !CONFIG_PPC_INDIRECT_MMIO */ /* Enable IO workaround */ static void io_workaround_init(void) diff --git a/arch/powerpc/kernel/io.c b/arch/powerpc/kernel/io.c index 886381f32c3..2a2b4aeab80 100644 --- a/arch/powerpc/kernel/io.c +++ b/arch/powerpc/kernel/io.c @@ -25,6 +25,9 @@ #include <asm/firmware.h> #include <asm/bug.h> +/* See definition in io.h */ +bool isa_io_special; + void _insb(const volatile u8 __iomem *port, void *buf, long count) { u8 *tbuf = buf; diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c index 97a3715ac8b..12e48d56f77 100644 --- a/arch/powerpc/kernel/iomap.c +++ b/arch/powerpc/kernel/iomap.c @@ -3,7 +3,6 @@ * * (C) Copyright 2004 Linus Torvalds */ -#include <linux/init.h> #include <linux/pci.h> #include <linux/mm.h> #include <linux/export.h> @@ -24,7 +23,7 @@ unsigned int ioread16(void __iomem *addr) } unsigned int ioread16be(void __iomem *addr) { - return in_be16(addr); + return readw_be(addr); } unsigned int ioread32(void __iomem *addr) { @@ -32,7 +31,7 @@ unsigned int ioread32(void __iomem *addr) } unsigned int ioread32be(void __iomem *addr) { - return in_be32(addr); + return readl_be(addr); } EXPORT_SYMBOL(ioread8); EXPORT_SYMBOL(ioread16); @@ -50,7 +49,7 @@ void iowrite16(u16 val, void __iomem *addr) } void iowrite16be(u16 val, void __iomem *addr) { - out_be16(addr, val); + writew_be(val, addr); } void iowrite32(u32 val, void __iomem *addr) { @@ -58,7 +57,7 @@ void iowrite32(u32 val, void __iomem *addr) } void iowrite32be(u32 val, void __iomem *addr) { - out_be32(addr, val); + writel_be(val, addr); } EXPORT_SYMBOL(iowrite8); EXPORT_SYMBOL(iowrite16); @@ -76,15 +75,15 @@ EXPORT_SYMBOL(iowrite32be); */ void ioread8_rep(void __iomem *addr, void *dst, unsigned long count) { - _insb((u8 __iomem *) addr, dst, count); + readsb(addr, dst, count); } void ioread16_rep(void __iomem *addr, void *dst, unsigned long count) { - _insw_ns((u16 __iomem *) addr, dst, count); + readsw(addr, dst, count); } void ioread32_rep(void __iomem *addr, void *dst, unsigned long count) { - _insl_ns((u32 __iomem *) addr, dst, count); + readsl(addr, dst, count); } EXPORT_SYMBOL(ioread8_rep); EXPORT_SYMBOL(ioread16_rep); @@ -92,15 +91,15 @@ EXPORT_SYMBOL(ioread32_rep); void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count) { - _outsb((u8 __iomem *) addr, src, count); + writesb(addr, src, count); } void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count) { - _outsw_ns((u16 __iomem *) addr, src, count); + writesw(addr, src, count); } void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count) { - _outsl_ns((u32 __iomem *) addr, src, count); + writesl(addr, src, count); } EXPORT_SYMBOL(iowrite8_rep); EXPORT_SYMBOL(iowrite16_rep); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index c862fd716fe..88e3ec6e1d9 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -36,6 +36,8 @@ #include <linux/hash.h> #include <linux/fault-inject.h> #include <linux/pci.h> +#include <linux/iommu.h> +#include <linux/sched.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/iommu.h> @@ -44,6 +46,7 @@ #include <asm/kdump.h> #include <asm/fadump.h> #include <asm/vio.h> +#include <asm/tce.h> #define DBG(...) @@ -102,7 +105,7 @@ static int __init fail_iommu_debugfs(void) struct dentry *dir = fault_create_debugfs_attr("fail_iommu", NULL, &fail_iommu); - return IS_ERR(dir) ? PTR_ERR(dir) : 0; + return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_iommu_debugfs); @@ -248,14 +251,13 @@ again: if (dev) boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - 1 << IOMMU_PAGE_SHIFT); + 1 << tbl->it_page_shift); else - boundary_size = ALIGN(1UL << 32, 1 << IOMMU_PAGE_SHIFT); + boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift); /* 4GB boundary for iseries_hv_alloc and iseries_hv_map */ - n = iommu_area_alloc(tbl->it_map, limit, start, npages, - tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT, - align_mask); + n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset, + boundary_size >> tbl->it_page_shift, align_mask); if (n == -1) { if (likely(pass == 0)) { /* First try the pool from the start */ @@ -317,12 +319,12 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, return DMA_ERROR_CODE; entry += tbl->it_offset; /* Offset into real TCE table */ - ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */ + ret = entry << tbl->it_page_shift; /* Set the return dma address */ /* Put the TCEs in the HW table */ build_fail = ppc_md.tce_build(tbl, entry, npages, - (unsigned long)page & IOMMU_PAGE_MASK, - direction, attrs); + (unsigned long)page & + IOMMU_PAGE_MASK(tbl), direction, attrs); /* ppc_md.tce_build() only returns non-zero for transient errors. * Clean up the table bitmap in this case and return @@ -349,7 +351,7 @@ static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr, { unsigned long entry, free_entry; - entry = dma_addr >> IOMMU_PAGE_SHIFT; + entry = dma_addr >> tbl->it_page_shift; free_entry = entry - tbl->it_offset; if (((free_entry + npages) > tbl->it_size) || @@ -398,7 +400,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned long flags; struct iommu_pool *pool; - entry = dma_addr >> IOMMU_PAGE_SHIFT; + entry = dma_addr >> tbl->it_page_shift; free_entry = entry - tbl->it_offset; pool = get_pool(tbl, free_entry); @@ -465,13 +467,13 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl, } /* Allocate iommu entries for that segment */ vaddr = (unsigned long) sg_virt(s); - npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE); + npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl)); align = 0; - if (IOMMU_PAGE_SHIFT < PAGE_SHIFT && slen >= PAGE_SIZE && + if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE && (vaddr & ~PAGE_MASK) == 0) - align = PAGE_SHIFT - IOMMU_PAGE_SHIFT; + align = PAGE_SHIFT - tbl->it_page_shift; entry = iommu_range_alloc(dev, tbl, npages, &handle, - mask >> IOMMU_PAGE_SHIFT, align); + mask >> tbl->it_page_shift, align); DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen); @@ -486,16 +488,16 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl, /* Convert entry to a dma_addr_t */ entry += tbl->it_offset; - dma_addr = entry << IOMMU_PAGE_SHIFT; - dma_addr |= (s->offset & ~IOMMU_PAGE_MASK); + dma_addr = entry << tbl->it_page_shift; + dma_addr |= (s->offset & ~IOMMU_PAGE_MASK(tbl)); DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n", npages, entry, dma_addr); /* Insert into HW table */ build_fail = ppc_md.tce_build(tbl, entry, npages, - vaddr & IOMMU_PAGE_MASK, - direction, attrs); + vaddr & IOMMU_PAGE_MASK(tbl), + direction, attrs); if(unlikely(build_fail)) goto failure; @@ -556,9 +558,9 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl, if (s->dma_length != 0) { unsigned long vaddr, npages; - vaddr = s->dma_address & IOMMU_PAGE_MASK; + vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl); npages = iommu_num_pages(s->dma_address, s->dma_length, - IOMMU_PAGE_SIZE); + IOMMU_PAGE_SIZE(tbl)); __iommu_free(tbl, vaddr, npages); s->dma_address = DMA_ERROR_CODE; s->dma_length = 0; @@ -589,7 +591,7 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, if (sg->dma_length == 0) break; npages = iommu_num_pages(dma_handle, sg->dma_length, - IOMMU_PAGE_SIZE); + IOMMU_PAGE_SIZE(tbl)); __iommu_free(tbl, dma_handle, npages); sg = sg_next(sg); } @@ -658,7 +660,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) /* number of bytes needed for the bitmap */ sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); - page = alloc_pages_node(nid, GFP_ATOMIC, get_order(sz)); + page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz)); if (!page) panic("iommu_init_table: Can't allocate %ld bytes\n", sz); tbl->it_map = page_address(page); @@ -673,7 +675,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) set_bit(0, tbl->it_map); /* We only split the IOMMU table if we have 1GB or more of space */ - if ((tbl->it_size << IOMMU_PAGE_SHIFT) >= (1UL * 1024 * 1024 * 1024)) + if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024)) tbl->nr_pools = IOMMU_NR_POOLS; else tbl->nr_pools = 1; @@ -717,6 +719,20 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) return; } + /* + * In case we have reserved the first bit, we should not emit + * the warning below. + */ + if (tbl->it_offset == 0) + clear_bit(0, tbl->it_map); + +#ifdef CONFIG_IOMMU_API + if (tbl->it_group) { + iommu_group_put(tbl->it_group); + BUG_ON(tbl->it_group); + } +#endif + /* verify that table contains no entries */ if (!bitmap_empty(tbl->it_map, tbl->it_size)) pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name); @@ -751,16 +767,16 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, vaddr = page_address(page) + offset; uaddr = (unsigned long)vaddr; - npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE); + npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl)); if (tbl) { align = 0; - if (IOMMU_PAGE_SHIFT < PAGE_SHIFT && size >= PAGE_SIZE && + if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0) - align = PAGE_SHIFT - IOMMU_PAGE_SHIFT; + align = PAGE_SHIFT - tbl->it_page_shift; dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction, - mask >> IOMMU_PAGE_SHIFT, align, + mask >> tbl->it_page_shift, align, attrs); if (dma_handle == DMA_ERROR_CODE) { if (printk_ratelimit()) { @@ -769,7 +785,7 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, npages); } } else - dma_handle |= (uaddr & ~IOMMU_PAGE_MASK); + dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl)); } return dma_handle; @@ -784,7 +800,8 @@ void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle, BUG_ON(direction == DMA_NONE); if (tbl) { - npages = iommu_num_pages(dma_handle, size, IOMMU_PAGE_SIZE); + npages = iommu_num_pages(dma_handle, size, + IOMMU_PAGE_SIZE(tbl)); iommu_free(tbl, dma_handle, npages); } } @@ -828,10 +845,10 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, memset(ret, 0, size); /* Set up tces to cover the allocated range */ - nio_pages = size >> IOMMU_PAGE_SHIFT; - io_order = get_iommu_order(size); + nio_pages = size >> tbl->it_page_shift; + io_order = get_iommu_order(size, tbl); mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL, - mask >> IOMMU_PAGE_SHIFT, io_order, NULL); + mask >> tbl->it_page_shift, io_order, NULL); if (mapping == DMA_ERROR_CODE) { free_pages((unsigned long)ret, order); return NULL; @@ -847,9 +864,311 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, unsigned int nio_pages; size = PAGE_ALIGN(size); - nio_pages = size >> IOMMU_PAGE_SHIFT; + nio_pages = size >> tbl->it_page_shift; iommu_free(tbl, dma_handle, nio_pages); size = PAGE_ALIGN(size); free_pages((unsigned long)vaddr, get_order(size)); } } + +#ifdef CONFIG_IOMMU_API +/* + * SPAPR TCE API + */ +static void group_release(void *iommu_data) +{ + struct iommu_table *tbl = iommu_data; + tbl->it_group = NULL; +} + +void iommu_register_group(struct iommu_table *tbl, + int pci_domain_number, unsigned long pe_num) +{ + struct iommu_group *grp; + char *name; + + grp = iommu_group_alloc(); + if (IS_ERR(grp)) { + pr_warn("powerpc iommu api: cannot create new group, err=%ld\n", + PTR_ERR(grp)); + return; + } + tbl->it_group = grp; + iommu_group_set_iommudata(grp, tbl, group_release); + name = kasprintf(GFP_KERNEL, "domain%d-pe%lx", + pci_domain_number, pe_num); + if (!name) + return; + iommu_group_set_name(grp, name); + kfree(name); +} + +enum dma_data_direction iommu_tce_direction(unsigned long tce) +{ + if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE)) + return DMA_BIDIRECTIONAL; + else if (tce & TCE_PCI_READ) + return DMA_TO_DEVICE; + else if (tce & TCE_PCI_WRITE) + return DMA_FROM_DEVICE; + else + return DMA_NONE; +} +EXPORT_SYMBOL_GPL(iommu_tce_direction); + +void iommu_flush_tce(struct iommu_table *tbl) +{ + /* Flush/invalidate TLB caches if necessary */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + /* Make sure updates are seen by hardware */ + mb(); +} +EXPORT_SYMBOL_GPL(iommu_flush_tce); + +int iommu_tce_clear_param_check(struct iommu_table *tbl, + unsigned long ioba, unsigned long tce_value, + unsigned long npages) +{ + /* ppc_md.tce_free() does not support any value but 0 */ + if (tce_value) + return -EINVAL; + + if (ioba & ~IOMMU_PAGE_MASK(tbl)) + return -EINVAL; + + ioba >>= tbl->it_page_shift; + if (ioba < tbl->it_offset) + return -EINVAL; + + if ((ioba + npages) > (tbl->it_offset + tbl->it_size)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check); + +int iommu_tce_put_param_check(struct iommu_table *tbl, + unsigned long ioba, unsigned long tce) +{ + if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ))) + return -EINVAL; + + if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ)) + return -EINVAL; + + if (ioba & ~IOMMU_PAGE_MASK(tbl)) + return -EINVAL; + + ioba >>= tbl->it_page_shift; + if (ioba < tbl->it_offset) + return -EINVAL; + + if ((ioba + 1) > (tbl->it_offset + tbl->it_size)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); + +unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) +{ + unsigned long oldtce; + struct iommu_pool *pool = get_pool(tbl, entry); + + spin_lock(&(pool->lock)); + + oldtce = ppc_md.tce_get(tbl, entry); + if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) + ppc_md.tce_free(tbl, entry, 1); + else + oldtce = 0; + + spin_unlock(&(pool->lock)); + + return oldtce; +} +EXPORT_SYMBOL_GPL(iommu_clear_tce); + +int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, + unsigned long entry, unsigned long pages) +{ + unsigned long oldtce; + struct page *page; + + for ( ; pages; --pages, ++entry) { + oldtce = iommu_clear_tce(tbl, entry); + if (!oldtce) + continue; + + page = pfn_to_page(oldtce >> PAGE_SHIFT); + WARN_ON(!page); + if (page) { + if (oldtce & TCE_PCI_WRITE) + SetPageDirty(page); + put_page(page); + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); + +/* + * hwaddr is a kernel virtual address here (0xc... bazillion), + * tce_build converts it to a physical address. + */ +int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, + unsigned long hwaddr, enum dma_data_direction direction) +{ + int ret = -EBUSY; + unsigned long oldtce; + struct iommu_pool *pool = get_pool(tbl, entry); + + spin_lock(&(pool->lock)); + + oldtce = ppc_md.tce_get(tbl, entry); + /* Add new entry if it is not busy */ + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))) + ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL); + + spin_unlock(&(pool->lock)); + + /* if (unlikely(ret)) + pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n", + __func__, hwaddr, entry << IOMMU_PAGE_SHIFT(tbl), + hwaddr, ret); */ + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_tce_build); + +int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, + unsigned long tce) +{ + int ret; + struct page *page = NULL; + unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; + enum dma_data_direction direction = iommu_tce_direction(tce); + + ret = get_user_pages_fast(tce & PAGE_MASK, 1, + direction != DMA_TO_DEVICE, &page); + if (unlikely(ret != 1)) { + /* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n", + tce, entry << IOMMU_PAGE_SHIFT(tbl), ret); */ + return -EFAULT; + } + hwaddr = (unsigned long) page_address(page) + offset; + + ret = iommu_tce_build(tbl, entry, hwaddr, direction); + if (ret) + put_page(page); + + if (ret < 0) + pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n", + __func__, entry << tbl->it_page_shift, tce, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); + +int iommu_take_ownership(struct iommu_table *tbl) +{ + unsigned long sz = (tbl->it_size + 7) >> 3; + + if (tbl->it_offset == 0) + clear_bit(0, tbl->it_map); + + if (!bitmap_empty(tbl->it_map, tbl->it_size)) { + pr_err("iommu_tce: it_map is not empty"); + return -EBUSY; + } + + memset(tbl->it_map, 0xff, sz); + iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); + + /* + * Disable iommu bypass, otherwise the user can DMA to all of + * our physical memory via the bypass window instead of just + * the pages that has been explicitly mapped into the iommu + */ + if (tbl->set_bypass) + tbl->set_bypass(tbl, false); + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_take_ownership); + +void iommu_release_ownership(struct iommu_table *tbl) +{ + unsigned long sz = (tbl->it_size + 7) >> 3; + + iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); + memset(tbl->it_map, 0, sz); + + /* Restore bit#0 set by iommu_init_table() */ + if (tbl->it_offset == 0) + set_bit(0, tbl->it_map); + + /* The kernel owns the device now, we can restore the iommu bypass */ + if (tbl->set_bypass) + tbl->set_bypass(tbl, true); +} +EXPORT_SYMBOL_GPL(iommu_release_ownership); + +int iommu_add_device(struct device *dev) +{ + struct iommu_table *tbl; + int ret = 0; + + if (WARN_ON(dev->iommu_group)) { + pr_warn("iommu_tce: device %s is already in iommu group %d, skipping\n", + dev_name(dev), + iommu_group_id(dev->iommu_group)); + return -EBUSY; + } + + tbl = get_iommu_table_base(dev); + if (!tbl || !tbl->it_group) { + pr_debug("iommu_tce: skipping device %s with no tbl\n", + dev_name(dev)); + return 0; + } + + pr_debug("iommu_tce: adding %s to iommu group %d\n", + dev_name(dev), iommu_group_id(tbl->it_group)); + + if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) { + pr_err("iommu_tce: unsupported iommu page size."); + pr_err("%s has not been added\n", dev_name(dev)); + return -EINVAL; + } + + ret = iommu_group_add_device(tbl->it_group, dev); + if (ret < 0) + pr_err("iommu_tce: %s has not been added, ret=%d\n", + dev_name(dev), ret); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_add_device); + +void iommu_del_device(struct device *dev) +{ + /* + * Some devices might not have IOMMU table and group + * and we needn't detach them from the associated + * IOMMU groups + */ + if (!dev->iommu_group) { + pr_debug("iommu_tce: skipping device %s with no tbl\n", + dev_name(dev)); + return; + } + + iommu_group_remove_device(dev); +} +EXPORT_SYMBOL_GPL(iommu_del_device); + +#endif /* CONFIG_IOMMU_API */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 71413f41278..248ee7e5beb 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -116,14 +116,12 @@ static inline notrace int decrementer_check_overflow(void) u64 now = get_tb_or_rtc(); u64 *next_tb = &__get_cpu_var(decrementers_next_tb); - if (now >= *next_tb) - set_dec(1); return now >= *next_tb; } /* This is called whenever we are re-enabling interrupts - * and returns either 0 (nothing to do) or 500/900 if there's - * either an EE or a DEC to generate. + * and returns either 0 (nothing to do) or 500/900/280/a00/e80 if + * there's an EE, DEC or DBELL to generate. * * This is called in two contexts: From arch_local_irq_restore() * before soft-enabling interrupts, and from the exception exit @@ -162,7 +160,7 @@ notrace unsigned int __check_irq_replay(void) * in case we also had a rollover while hard disabled */ local_paca->irq_happened &= ~PACA_IRQ_DEC; - if (decrementer_check_overflow()) + if ((happened & PACA_IRQ_DEC) || decrementer_check_overflow()) return 0x900; /* Finally check if an external interrupt happened */ @@ -182,6 +180,13 @@ notrace unsigned int __check_irq_replay(void) local_paca->irq_happened &= ~PACA_IRQ_DBELL; if (happened & PACA_IRQ_DBELL) return 0x280; +#else + local_paca->irq_happened &= ~PACA_IRQ_DBELL; + if (happened & PACA_IRQ_DBELL) { + if (cpu_has_feature(CPU_FTR_HVMODE)) + return 0xe80; + return 0xa00; + } #endif /* CONFIG_PPC_BOOK3E */ /* There should be nothing left ! */ @@ -299,7 +304,7 @@ void notrace restore_interrupts(void) * being re-enabled and generally sanitized the lazy irq state, * and in the latter case it will leave with interrupts hard * disabled and marked as such, so the local_irq_enable() call - * in cpu_idle() will properly re-enable everything. + * in arch_cpu_idle() will properly re-enable everything. */ bool prep_irq_for_idle(void) { @@ -349,15 +354,20 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%*s: ", prec, "LOC"); for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs); - seq_printf(p, " Local timer interrupts\n"); + seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_event); + seq_printf(p, " Local timer interrupts for timer event device\n"); + + seq_printf(p, "%*s: ", prec, "LOC"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_others); + seq_printf(p, " Local timer interrupts for others\n"); seq_printf(p, "%*s: ", prec, "SPU"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(irq_stat, j).spurious_irqs); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "%*s: ", prec, "CNT"); + seq_printf(p, "%*s: ", prec, "PMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(irq_stat, j).pmu_irqs); seq_printf(p, " Performance monitoring interrupts\n"); @@ -367,6 +377,15 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", per_cpu(irq_stat, j).mce_exceptions); seq_printf(p, " Machine check exceptions\n"); +#ifdef CONFIG_PPC_DOORBELL + if (cpu_has_feature(CPU_FTR_DBELL)) { + seq_printf(p, "%*s: ", prec, "DBL"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).doorbell_irqs); + seq_printf(p, " Doorbell interrupts\n"); + } +#endif + return 0; } @@ -375,11 +394,15 @@ int arch_show_interrupts(struct seq_file *p, int prec) */ u64 arch_irq_stat_cpu(unsigned int cpu) { - u64 sum = per_cpu(irq_stat, cpu).timer_irqs; + u64 sum = per_cpu(irq_stat, cpu).timer_irqs_event; sum += per_cpu(irq_stat, cpu).pmu_irqs; sum += per_cpu(irq_stat, cpu).mce_exceptions; sum += per_cpu(irq_stat, cpu).spurious_irqs; + sum += per_cpu(irq_stat, cpu).timer_irqs_others; +#ifdef CONFIG_PPC_DOORBELL + sum += per_cpu(irq_stat, cpu).doorbell_irqs; +#endif return sum; } @@ -424,50 +447,6 @@ void migrate_irqs(void) } #endif -static inline void handle_one_irq(unsigned int irq) -{ - struct thread_info *curtp, *irqtp; - unsigned long saved_sp_limit; - struct irq_desc *desc; - - desc = irq_to_desc(irq); - if (!desc) - return; - - /* Switch to the irq stack to handle this */ - curtp = current_thread_info(); - irqtp = hardirq_ctx[smp_processor_id()]; - - if (curtp == irqtp) { - /* We're already on the irq stack, just handle it */ - desc->handle_irq(irq, desc); - return; - } - - saved_sp_limit = current->thread.ksp_limit; - - irqtp->task = curtp->task; - irqtp->flags = 0; - - /* Copy the softirq bits in preempt_count so that the - * softirq checks work in the hardirq context. */ - irqtp->preempt_count = (irqtp->preempt_count & ~SOFTIRQ_MASK) | - (curtp->preempt_count & SOFTIRQ_MASK); - - current->thread.ksp_limit = (unsigned long)irqtp + - _ALIGN_UP(sizeof(struct thread_info), 16); - - call_handle_irq(irq, desc, irqtp, desc->handle_irq); - current->thread.ksp_limit = saved_sp_limit; - irqtp->task = NULL; - - /* Set any flag that may have been set on the - * alternate stack - */ - if (irqtp->flags) - set_bits(irqtp->flags, &curtp->flags); -} - static inline void check_stack_overflow(void) { #ifdef CONFIG_DEBUG_STACKOVERFLOW @@ -484,9 +463,8 @@ static inline void check_stack_overflow(void) #endif } -void do_IRQ(struct pt_regs *regs) +void __do_irq(struct pt_regs *regs) { - struct pt_regs *old_regs = set_irq_regs(regs); unsigned int irq; irq_enter(); @@ -502,18 +480,54 @@ void do_IRQ(struct pt_regs *regs) */ irq = ppc_md.get_irq(); - /* We can hard enable interrupts now */ + /* We can hard enable interrupts now to allow perf interrupts */ may_hard_irq_enable(); /* And finally process it */ - if (irq != NO_IRQ) - handle_one_irq(irq); - else + if (unlikely(irq == NO_IRQ)) __get_cpu_var(irq_stat).spurious_irqs++; + else + generic_handle_irq(irq); trace_irq_exit(regs); irq_exit(); +} + +void do_IRQ(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + struct thread_info *curtp, *irqtp, *sirqtp; + + /* Switch to the irq stack to handle this */ + curtp = current_thread_info(); + irqtp = hardirq_ctx[raw_smp_processor_id()]; + sirqtp = softirq_ctx[raw_smp_processor_id()]; + + /* Already there ? */ + if (unlikely(curtp == irqtp || curtp == sirqtp)) { + __do_irq(regs); + set_irq_regs(old_regs); + return; + } + + /* Prepare the thread_info in the irq stack */ + irqtp->task = curtp->task; + irqtp->flags = 0; + + /* Copy the preempt_count so that the [soft]irq checks work. */ + irqtp->preempt_count = curtp->preempt_count; + + /* Switch stack and call */ + call_do_irq(regs, irqtp); + + /* Restore stack limit */ + irqtp->task = NULL; + + /* Copy back updates to the thread_info */ + if (irqtp->flags) + set_bits(irqtp->flags, &curtp->flags); + set_irq_regs(old_regs); } @@ -541,8 +555,13 @@ void exc_lvl_ctx_init(void) #ifdef CONFIG_PPC64 cpu_nr = i; #else +#ifdef CONFIG_SMP cpu_nr = get_hard_smp_processor_id(i); +#else + cpu_nr = 0; +#endif #endif + memset((void *)critirq_ctx[cpu_nr], 0, THREAD_SIZE); tp = critirq_ctx[cpu_nr]; tp->cpu = cpu_nr; @@ -575,28 +594,22 @@ void irq_ctx_init(void) memset((void *)softirq_ctx[i], 0, THREAD_SIZE); tp = softirq_ctx[i]; tp->cpu = i; - tp->preempt_count = 0; memset((void *)hardirq_ctx[i], 0, THREAD_SIZE); tp = hardirq_ctx[i]; tp->cpu = i; - tp->preempt_count = HARDIRQ_OFFSET; } } -static inline void do_softirq_onstack(void) +void do_softirq_own_stack(void) { struct thread_info *curtp, *irqtp; - unsigned long saved_sp_limit = current->thread.ksp_limit; curtp = current_thread_info(); irqtp = softirq_ctx[smp_processor_id()]; irqtp->task = curtp->task; irqtp->flags = 0; - current->thread.ksp_limit = (unsigned long)irqtp + - _ALIGN_UP(sizeof(struct thread_info), 16); call_do_softirq(irqtp); - current->thread.ksp_limit = saved_sp_limit; irqtp->task = NULL; /* Set any flag that may have been set on the @@ -606,21 +619,6 @@ static inline void do_softirq_onstack(void) set_bits(irqtp->flags, &curtp->flags); } -void do_softirq(void) -{ - unsigned long flags; - - if (in_interrupt()) - return; - - local_irq_save(flags); - - if (local_softirq_pending()) - do_softirq_onstack(); - - local_irq_restore(flags); -} - irq_hw_number_t virq_to_hw(unsigned int virq) { struct irq_data *irq_data = irq_get_irq_data(virq); diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index c470a40b29f..8504657379f 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -15,7 +15,6 @@ */ #include <linux/kernel.h> -#include <linux/init.h> #include <linux/kgdb.h> #include <linux/smp.h> #include <linux/signal.h> @@ -151,11 +150,12 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs) return 1; } +static DEFINE_PER_CPU(struct thread_info, kgdb_thread_info); static int kgdb_singlestep(struct pt_regs *regs) { struct thread_info *thread_info, *exception_thread_info; - struct thread_info *backup_current_thread_info = \ - (struct thread_info *)kmalloc(sizeof(struct thread_info), GFP_KERNEL); + struct thread_info *backup_current_thread_info = + &__get_cpu_var(kgdb_thread_info); if (user_mode(regs)) return 0; @@ -198,7 +198,7 @@ static int kgdb_iabr_match(struct pt_regs *regs) return 1; } -static int kgdb_dabr_match(struct pt_regs *regs) +static int kgdb_break_match(struct pt_regs *regs) { if (user_mode(regs)) return 0; @@ -458,7 +458,7 @@ static void *old__debugger; static void *old__debugger_bpt; static void *old__debugger_sstep; static void *old__debugger_iabr_match; -static void *old__debugger_dabr_match; +static void *old__debugger_break_match; static void *old__debugger_fault_handler; int kgdb_arch_init(void) @@ -468,7 +468,7 @@ int kgdb_arch_init(void) old__debugger_bpt = __debugger_bpt; old__debugger_sstep = __debugger_sstep; old__debugger_iabr_match = __debugger_iabr_match; - old__debugger_dabr_match = __debugger_dabr_match; + old__debugger_break_match = __debugger_break_match; old__debugger_fault_handler = __debugger_fault_handler; __debugger_ipi = kgdb_call_nmi_hook; @@ -476,7 +476,7 @@ int kgdb_arch_init(void) __debugger_bpt = kgdb_handle_breakpoint; __debugger_sstep = kgdb_singlestep; __debugger_iabr_match = kgdb_iabr_match; - __debugger_dabr_match = kgdb_dabr_match; + __debugger_break_match = kgdb_break_match; __debugger_fault_handler = kgdb_not_implemented; return 0; @@ -489,6 +489,6 @@ void kgdb_arch_exit(void) __debugger_bpt = old__debugger_bpt; __debugger_sstep = old__debugger_sstep; __debugger_iabr_match = old__debugger_iabr_match; - __debugger_dabr_match = old__debugger_dabr_match; + __debugger_break_match = old__debugger_break_match; __debugger_fault_handler = old__debugger_fault_handler; } diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index e88c6433181..2f72af82513 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -32,16 +32,11 @@ #include <linux/module.h> #include <linux/kdebug.h> #include <linux/slab.h> +#include <asm/code-patching.h> #include <asm/cacheflush.h> #include <asm/sstep.h> #include <asm/uaccess.h> -#ifdef CONFIG_PPC_ADV_DEBUG_REGS -#define MSR_SINGLESTEP (MSR_DE) -#else -#define MSR_SINGLESTEP (MSR_SE) -#endif - DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); @@ -104,19 +99,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p) static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) { - /* We turn off async exceptions to ensure that the single step will - * be for the instruction we have the kprobe on, if we dont its - * possible we'd get the single step reported for an exception handler - * like Decrementer or External Interrupt */ - regs->msr &= ~MSR_EE; - regs->msr |= MSR_SINGLESTEP; -#ifdef CONFIG_PPC_ADV_DEBUG_REGS - regs->msr &= ~MSR_CE; - mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM); -#ifdef CONFIG_PPC_47x - isync(); -#endif -#endif + enable_single_step(regs); /* * On powerpc we should single step on the original @@ -310,7 +293,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, { struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; @@ -330,7 +313,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, * real return address, and all the rest will point to * kretprobe_trampoline */ - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + hlist_for_each_entry_safe(ri, tmp, head, hlist) { if (ri->task != current) /* another task is sharing our hash bucket */ continue; @@ -357,7 +340,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, kretprobe_hash_unlock(current, &flags); preempt_enable_no_resched(); - hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); } @@ -447,7 +430,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) case KPROBE_HIT_SSDONE: /* * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accouting + * we can also use npre/npostfault count for accounting * these specific fault cases. */ kprobes_inc_nmissed_count(cur); @@ -509,12 +492,10 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, return ret; } -#ifdef CONFIG_PPC64 unsigned long arch_deref_entry_point(void *entry) { - return ((func_descr_t *)entry)->entry; + return ppc_global_function_entry(entry); } -#endif int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) { @@ -526,8 +507,12 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) /* setup return addr to the jprobe handler routine */ regs->nip = arch_deref_entry_point(jp->entry); #ifdef CONFIG_PPC64 +#if defined(_CALL_ELF) && _CALL_ELF == 2 + regs->gpr[12] = (unsigned long)jp->entry; +#else regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc); #endif +#endif return 1; } diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index a61b133c4f9..33aa4ddf597 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -74,7 +74,7 @@ #define KVM_INST_MTSRIN 0x7c0001e4 static bool kvm_patching_worked = true; -static char kvm_tmp[1024 * 1024]; +char kvm_tmp[1024 * 1024]; static int kvm_tmp_index; static inline void kvm_patch_ins(u32 *inst, u32 new_inst) @@ -413,13 +413,13 @@ static void kvm_map_magic_page(void *data) { u32 *features = data; - ulong in[8]; + ulong in[8] = {0}; ulong out[8]; in[0] = KVM_MAGIC_PAGE; - in[1] = KVM_MAGIC_PAGE; + in[1] = KVM_MAGIC_PAGE | MAGIC_PAGE_FLAG_NOT_MAPPED_NX; - kvm_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE)); + epapr_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE)); *features = out[0]; } @@ -711,57 +711,10 @@ static void kvm_use_magic_page(void) kvm_patching_worked ? "worked" : "failed"); } -unsigned long kvm_hypercall(unsigned long *in, - unsigned long *out, - unsigned long nr) -{ - unsigned long register r0 asm("r0"); - unsigned long register r3 asm("r3") = in[0]; - unsigned long register r4 asm("r4") = in[1]; - unsigned long register r5 asm("r5") = in[2]; - unsigned long register r6 asm("r6") = in[3]; - unsigned long register r7 asm("r7") = in[4]; - unsigned long register r8 asm("r8") = in[5]; - unsigned long register r9 asm("r9") = in[6]; - unsigned long register r10 asm("r10") = in[7]; - unsigned long register r11 asm("r11") = nr; - unsigned long register r12 asm("r12"); - - asm volatile("bl epapr_hypercall_start" - : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6), - "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11), - "=r"(r12) - : "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7), "r"(r8), - "r"(r9), "r"(r10), "r"(r11) - : "memory", "cc", "xer", "ctr", "lr"); - - out[0] = r4; - out[1] = r5; - out[2] = r6; - out[3] = r7; - out[4] = r8; - out[5] = r9; - out[6] = r10; - out[7] = r11; - - return r3; -} -EXPORT_SYMBOL_GPL(kvm_hypercall); - static __init void kvm_free_tmp(void) { - unsigned long start, end; - - start = (ulong)&kvm_tmp[kvm_tmp_index + (PAGE_SIZE - 1)] & PAGE_MASK; - end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK; - - /* Free the tmp space we don't need */ - for (; start < end; start += PAGE_SIZE) { - ClearPageReserved(virt_to_page(start)); - init_page_count(virt_to_page(start)); - free_page(start); - totalram_pages++; - } + free_reserved_area(&kvm_tmp[kvm_tmp_index], + &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL); } static int __init kvm_guest_init(void) diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c index 0733b05eb85..936258881c9 100644 --- a/arch/powerpc/kernel/legacy_serial.c +++ b/arch/powerpc/kernel/legacy_serial.c @@ -35,7 +35,7 @@ static struct legacy_serial_info { phys_addr_t taddr; } legacy_serial_infos[MAX_LEGACY_SERIAL_PORTS]; -static struct __initdata of_device_id legacy_serial_parents[] = { +static struct of_device_id legacy_serial_parents[] __initdata = { {.type = "soc",}, {.type = "tsi-bridge",}, {.type = "opb", }, @@ -48,6 +48,9 @@ static struct __initdata of_device_id legacy_serial_parents[] = { static unsigned int legacy_serial_count; static int legacy_serial_console = -1; +static const upf_t legacy_port_flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | + UPF_SHARE_IRQ | UPF_FIXED_PORT; + static unsigned int tsi_serial_in(struct uart_port *p, int offset) { unsigned int tmp; @@ -71,8 +74,9 @@ static int __init add_legacy_port(struct device_node *np, int want_index, phys_addr_t taddr, unsigned long irq, upf_t flags, int irq_check_parent) { - const __be32 *clk, *spd; + const __be32 *clk, *spd, *rs; u32 clock = BASE_BAUD * 16; + u32 shift = 0; int index; /* get clock freq. if present */ @@ -83,6 +87,11 @@ static int __init add_legacy_port(struct device_node *np, int want_index, /* get default speed if present */ spd = of_get_property(np, "current-speed", NULL); + /* get register shift if present */ + rs = of_get_property(np, "reg-shift", NULL); + if (rs && *rs) + shift = be32_to_cpup(rs); + /* If we have a location index, then try to use it */ if (want_index >= 0 && want_index < MAX_LEGACY_SERIAL_PORTS) index = want_index; @@ -99,7 +108,7 @@ static int __init add_legacy_port(struct device_node *np, int want_index, legacy_serial_count = index + 1; /* Check if there is a port who already claimed our slot */ - if (legacy_serial_infos[index].np != 0) { + if (legacy_serial_infos[index].np != NULL) { /* if we still have some room, move it, else override */ if (legacy_serial_count < MAX_LEGACY_SERIAL_PORTS) { printk(KERN_DEBUG "Moved legacy port %d -> %d\n", @@ -126,6 +135,7 @@ static int __init add_legacy_port(struct device_node *np, int want_index, legacy_serial_ports[index].uartclk = clock; legacy_serial_ports[index].irq = irq; legacy_serial_ports[index].flags = flags; + legacy_serial_ports[index].regshift = shift; legacy_serial_infos[index].taddr = taddr; legacy_serial_infos[index].np = of_node_get(np); legacy_serial_infos[index].clock = clock; @@ -152,9 +162,7 @@ static int __init add_legacy_soc_port(struct device_node *np, struct device_node *soc_dev) { u64 addr; - const u32 *addrp; - upf_t flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_SHARE_IRQ - | UPF_FIXED_PORT; + const __be32 *addrp; struct device_node *tsi = of_get_parent(np); /* We only support ports that have a clock frequency properly @@ -163,9 +171,8 @@ static int __init add_legacy_soc_port(struct device_node *np, if (of_get_property(np, "clock-frequency", NULL) == NULL) return -1; - /* if reg-shift or offset, don't try to use it */ - if ((of_get_property(np, "reg-shift", NULL) != NULL) || - (of_get_property(np, "reg-offset", NULL) != NULL)) + /* if reg-offset don't try to use it */ + if ((of_get_property(np, "reg-offset", NULL) != NULL)) return -1; /* if rtas uses this device, don't try to use it as well */ @@ -185,9 +192,11 @@ static int __init add_legacy_soc_port(struct device_node *np, * IO port value. It will be fixed up later along with the irq */ if (tsi && !strcmp(tsi->type, "tsi-bridge")) - return add_legacy_port(np, -1, UPIO_TSI, addr, addr, NO_IRQ, flags, 0); + return add_legacy_port(np, -1, UPIO_TSI, addr, addr, + NO_IRQ, legacy_port_flags, 0); else - return add_legacy_port(np, -1, UPIO_MEM, addr, addr, NO_IRQ, flags, 0); + return add_legacy_port(np, -1, UPIO_MEM, addr, addr, + NO_IRQ, legacy_port_flags, 0); } static int __init add_legacy_isa_port(struct device_node *np, @@ -221,14 +230,19 @@ static int __init add_legacy_isa_port(struct device_node *np, /* Translate ISA address. If it fails, we still register the port * with no translated address so that it can be picked up as an IO * port later by the serial driver + * + * Note: Don't even try on P8 lpc, we know it's not directly mapped */ - taddr = of_translate_address(np, reg); - if (taddr == OF_BAD_ADDR) + if (!of_device_is_compatible(isa_brg, "ibm,power8-lpc")) { + taddr = of_translate_address(np, reg); + if (taddr == OF_BAD_ADDR) + taddr = 0; + } else taddr = 0; /* Add port, irq will be dealt with later */ - return add_legacy_port(np, index, UPIO_PORT, be32_to_cpu(reg[1]), taddr, - NO_IRQ, UPF_BOOT_AUTOCONF, 0); + return add_legacy_port(np, index, UPIO_PORT, be32_to_cpu(reg[1]), + taddr, NO_IRQ, legacy_port_flags, 0); } @@ -237,7 +251,7 @@ static int __init add_legacy_pci_port(struct device_node *np, struct device_node *pci_dev) { u64 addr, base; - const u32 *addrp; + const __be32 *addrp; unsigned int flags; int iotype, index = -1, lindex = 0; @@ -270,7 +284,7 @@ static int __init add_legacy_pci_port(struct device_node *np, if (iotype == UPIO_MEM) base = addr; else - base = addrp[2]; + base = of_read_number(&addrp[2], 1); /* Try to guess an index... If we have subdevices of the pci dev, * we get to their "reg" property @@ -301,25 +315,40 @@ static int __init add_legacy_pci_port(struct device_node *np, * IO port value. It will be fixed up later along with the irq */ return add_legacy_port(np, index, iotype, base, addr, NO_IRQ, - UPF_BOOT_AUTOCONF, np != pci_dev); + legacy_port_flags, np != pci_dev); } #endif static void __init setup_legacy_serial_console(int console) { - struct legacy_serial_info *info = - &legacy_serial_infos[console]; + struct legacy_serial_info *info = &legacy_serial_infos[console]; + struct plat_serial8250_port *port = &legacy_serial_ports[console]; void __iomem *addr; + unsigned int stride; - if (info->taddr == 0) - return; - addr = ioremap(info->taddr, 0x1000); - if (addr == NULL) - return; + stride = 1 << port->regshift; + + /* Check if a translated MMIO address has been found */ + if (info->taddr) { + addr = ioremap(info->taddr, 0x1000); + if (addr == NULL) + return; + udbg_uart_init_mmio(addr, stride); + } else { + /* Check if it's PIO and we support untranslated PIO */ + if (port->iotype == UPIO_PORT && isa_io_special) + udbg_uart_init_pio(port->iobase, stride); + else + return; + } + + /* Try to query the current speed */ if (info->speed == 0) - info->speed = udbg_probe_uart_speed(addr, info->clock); + info->speed = udbg_probe_uart_speed(info->clock); + + /* Set it up */ DBG("default console speed = %d\n", info->speed); - udbg_init_uart(addr, info->speed, info->clock); + udbg_uart_setup(info->speed, info->clock); } /* @@ -367,10 +396,13 @@ void __init find_legacy_serial_ports(void) /* Next, fill our array with ISA ports */ for_each_node_by_type(np, "serial") { struct device_node *isa = of_get_parent(np); - if (isa && !strcmp(isa->name, "isa")) { - index = add_legacy_isa_port(np, isa); - if (index >= 0 && np == stdout) - legacy_serial_console = index; + if (isa && (!strcmp(isa->name, "isa") || + !strcmp(isa->name, "lpc"))) { + if (of_device_is_available(np)) { + index = add_legacy_isa_port(np, isa); + if (index >= 0 && np == stdout) + legacy_serial_console = index; + } } of_node_put(isa); } diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c deleted file mode 100644 index f5725bce9ed..00000000000 --- a/arch/powerpc/kernel/lparcfg.c +++ /dev/null @@ -1,718 +0,0 @@ -/* - * PowerPC64 LPAR Configuration Information Driver - * - * Dave Engebretsen engebret@us.ibm.com - * Copyright (c) 2003 Dave Engebretsen - * Will Schmidt willschm@us.ibm.com - * SPLPAR updates, Copyright (c) 2003 Will Schmidt IBM Corporation. - * seq_file updates, Copyright (c) 2004 Will Schmidt IBM Corporation. - * Nathan Lynch nathanl@austin.ibm.com - * Added lparcfg_write, Copyright (C) 2004 Nathan Lynch IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * This driver creates a proc file at /proc/ppc64/lparcfg which contains - * keyword - value pairs that specify the configuration of the partition. - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/proc_fs.h> -#include <linux/init.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <asm/uaccess.h> -#include <asm/lppaca.h> -#include <asm/hvcall.h> -#include <asm/firmware.h> -#include <asm/rtas.h> -#include <asm/time.h> -#include <asm/prom.h> -#include <asm/vdso_datapage.h> -#include <asm/vio.h> -#include <asm/mmu.h> - -#define MODULE_VERS "1.9" -#define MODULE_NAME "lparcfg" - -/* #define LPARCFG_DEBUG */ - -static struct proc_dir_entry *proc_ppc64_lparcfg; - -/* - * Track sum of all purrs across all processors. This is used to further - * calculate usage values by different applications - */ -static unsigned long get_purr(void) -{ - unsigned long sum_purr = 0; - int cpu; - - for_each_possible_cpu(cpu) { - struct cpu_usage *cu; - - cu = &per_cpu(cpu_usage_array, cpu); - sum_purr += cu->current_tb; - } - return sum_purr; -} - -/* - * Methods used to fetch LPAR data when running on a pSeries platform. - */ - -struct hvcall_ppp_data { - u64 entitlement; - u64 unallocated_entitlement; - u16 group_num; - u16 pool_num; - u8 capped; - u8 weight; - u8 unallocated_weight; - u16 active_procs_in_pool; - u16 active_system_procs; - u16 phys_platform_procs; - u32 max_proc_cap_avail; - u32 entitled_proc_cap_avail; -}; - -/* - * H_GET_PPP hcall returns info in 4 parms. - * entitled_capacity,unallocated_capacity, - * aggregation, resource_capability). - * - * R4 = Entitled Processor Capacity Percentage. - * R5 = Unallocated Processor Capacity Percentage. - * R6 (AABBCCDDEEFFGGHH). - * XXXX - reserved (0) - * XXXX - reserved (0) - * XXXX - Group Number - * XXXX - Pool Number. - * R7 (IIJJKKLLMMNNOOPP). - * XX - reserved. (0) - * XX - bit 0-6 reserved (0). bit 7 is Capped indicator. - * XX - variable processor Capacity Weight - * XX - Unallocated Variable Processor Capacity Weight. - * XXXX - Active processors in Physical Processor Pool. - * XXXX - Processors active on platform. - * R8 (QQQQRRRRRRSSSSSS). if ibm,partition-performance-parameters-level >= 1 - * XXXX - Physical platform procs allocated to virtualization. - * XXXXXX - Max procs capacity % available to the partitions pool. - * XXXXXX - Entitled procs capacity % available to the - * partitions pool. - */ -static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data) -{ - unsigned long rc; - unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; - - rc = plpar_hcall9(H_GET_PPP, retbuf); - - ppp_data->entitlement = retbuf[0]; - ppp_data->unallocated_entitlement = retbuf[1]; - - ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff; - ppp_data->pool_num = retbuf[2] & 0xffff; - - ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01; - ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff; - ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff; - ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff; - ppp_data->active_system_procs = retbuf[3] & 0xffff; - - ppp_data->phys_platform_procs = retbuf[4] >> 6 * 8; - ppp_data->max_proc_cap_avail = (retbuf[4] >> 3 * 8) & 0xffffff; - ppp_data->entitled_proc_cap_avail = retbuf[4] & 0xffffff; - - return rc; -} - -static unsigned h_pic(unsigned long *pool_idle_time, - unsigned long *num_procs) -{ - unsigned long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - rc = plpar_hcall(H_PIC, retbuf); - - *pool_idle_time = retbuf[0]; - *num_procs = retbuf[1]; - - return rc; -} - -/* - * parse_ppp_data - * Parse out the data returned from h_get_ppp and h_pic - */ -static void parse_ppp_data(struct seq_file *m) -{ - struct hvcall_ppp_data ppp_data; - struct device_node *root; - const int *perf_level; - int rc; - - rc = h_get_ppp(&ppp_data); - if (rc) - return; - - seq_printf(m, "partition_entitled_capacity=%lld\n", - ppp_data.entitlement); - seq_printf(m, "group=%d\n", ppp_data.group_num); - seq_printf(m, "system_active_processors=%d\n", - ppp_data.active_system_procs); - - /* pool related entries are appropriate for shared configs */ - if (lppaca_of(0).shared_proc) { - unsigned long pool_idle_time, pool_procs; - - seq_printf(m, "pool=%d\n", ppp_data.pool_num); - - /* report pool_capacity in percentage */ - seq_printf(m, "pool_capacity=%d\n", - ppp_data.active_procs_in_pool * 100); - - h_pic(&pool_idle_time, &pool_procs); - seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time); - seq_printf(m, "pool_num_procs=%ld\n", pool_procs); - } - - seq_printf(m, "unallocated_capacity_weight=%d\n", - ppp_data.unallocated_weight); - seq_printf(m, "capacity_weight=%d\n", ppp_data.weight); - seq_printf(m, "capped=%d\n", ppp_data.capped); - seq_printf(m, "unallocated_capacity=%lld\n", - ppp_data.unallocated_entitlement); - - /* The last bits of information returned from h_get_ppp are only - * valid if the ibm,partition-performance-parameters-level - * property is >= 1. - */ - root = of_find_node_by_path("/"); - if (root) { - perf_level = of_get_property(root, - "ibm,partition-performance-parameters-level", - NULL); - if (perf_level && (*perf_level >= 1)) { - seq_printf(m, - "physical_procs_allocated_to_virtualization=%d\n", - ppp_data.phys_platform_procs); - seq_printf(m, "max_proc_capacity_available=%d\n", - ppp_data.max_proc_cap_avail); - seq_printf(m, "entitled_proc_capacity_available=%d\n", - ppp_data.entitled_proc_cap_avail); - } - - of_node_put(root); - } -} - -/** - * parse_mpp_data - * Parse out data returned from h_get_mpp - */ -static void parse_mpp_data(struct seq_file *m) -{ - struct hvcall_mpp_data mpp_data; - int rc; - - rc = h_get_mpp(&mpp_data); - if (rc) - return; - - seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem); - - if (mpp_data.mapped_mem != -1) - seq_printf(m, "mapped_entitled_memory=%ld\n", - mpp_data.mapped_mem); - - seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num); - seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num); - - seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight); - seq_printf(m, "unallocated_entitled_memory_weight=%d\n", - mpp_data.unallocated_mem_weight); - seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n", - mpp_data.unallocated_entitlement); - - if (mpp_data.pool_size != -1) - seq_printf(m, "entitled_memory_pool_size=%ld bytes\n", - mpp_data.pool_size); - - seq_printf(m, "entitled_memory_loan_request=%ld\n", - mpp_data.loan_request); - - seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem); -} - -/** - * parse_mpp_x_data - * Parse out data returned from h_get_mpp_x - */ -static void parse_mpp_x_data(struct seq_file *m) -{ - struct hvcall_mpp_x_data mpp_x_data; - - if (!firmware_has_feature(FW_FEATURE_XCMO)) - return; - if (h_get_mpp_x(&mpp_x_data)) - return; - - seq_printf(m, "coalesced_bytes=%ld\n", mpp_x_data.coalesced_bytes); - - if (mpp_x_data.pool_coalesced_bytes) - seq_printf(m, "pool_coalesced_bytes=%ld\n", - mpp_x_data.pool_coalesced_bytes); - if (mpp_x_data.pool_purr_cycles) - seq_printf(m, "coalesce_pool_purr=%ld\n", mpp_x_data.pool_purr_cycles); - if (mpp_x_data.pool_spurr_cycles) - seq_printf(m, "coalesce_pool_spurr=%ld\n", mpp_x_data.pool_spurr_cycles); -} - -#define SPLPAR_CHARACTERISTICS_TOKEN 20 -#define SPLPAR_MAXLENGTH 1026*(sizeof(char)) - -/* - * parse_system_parameter_string() - * Retrieve the potential_processors, max_entitled_capacity and friends - * through the get-system-parameter rtas call. Replace keyword strings as - * necessary. - */ -static void parse_system_parameter_string(struct seq_file *m) -{ - int call_status; - - unsigned char *local_buffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL); - if (!local_buffer) { - printk(KERN_ERR "%s %s kmalloc failure at line %d\n", - __FILE__, __func__, __LINE__); - return; - } - - spin_lock(&rtas_data_buf_lock); - memset(rtas_data_buf, 0, SPLPAR_MAXLENGTH); - call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, - NULL, - SPLPAR_CHARACTERISTICS_TOKEN, - __pa(rtas_data_buf), - RTAS_DATA_BUF_SIZE); - memcpy(local_buffer, rtas_data_buf, SPLPAR_MAXLENGTH); - spin_unlock(&rtas_data_buf_lock); - - if (call_status != 0) { - printk(KERN_INFO - "%s %s Error calling get-system-parameter (0x%x)\n", - __FILE__, __func__, call_status); - } else { - int splpar_strlen; - int idx, w_idx; - char *workbuffer = kzalloc(SPLPAR_MAXLENGTH, GFP_KERNEL); - if (!workbuffer) { - printk(KERN_ERR "%s %s kmalloc failure at line %d\n", - __FILE__, __func__, __LINE__); - kfree(local_buffer); - return; - } -#ifdef LPARCFG_DEBUG - printk(KERN_INFO "success calling get-system-parameter\n"); -#endif - splpar_strlen = local_buffer[0] * 256 + local_buffer[1]; - local_buffer += 2; /* step over strlen value */ - - w_idx = 0; - idx = 0; - while ((*local_buffer) && (idx < splpar_strlen)) { - workbuffer[w_idx++] = local_buffer[idx++]; - if ((local_buffer[idx] == ',') - || (local_buffer[idx] == '\0')) { - workbuffer[w_idx] = '\0'; - if (w_idx) { - /* avoid the empty string */ - seq_printf(m, "%s\n", workbuffer); - } - memset(workbuffer, 0, SPLPAR_MAXLENGTH); - idx++; /* skip the comma */ - w_idx = 0; - } else if (local_buffer[idx] == '=') { - /* code here to replace workbuffer contents - with different keyword strings */ - if (0 == strcmp(workbuffer, "MaxEntCap")) { - strcpy(workbuffer, - "partition_max_entitled_capacity"); - w_idx = strlen(workbuffer); - } - if (0 == strcmp(workbuffer, "MaxPlatProcs")) { - strcpy(workbuffer, - "system_potential_processors"); - w_idx = strlen(workbuffer); - } - } - } - kfree(workbuffer); - local_buffer -= 2; /* back up over strlen value */ - } - kfree(local_buffer); -} - -/* Return the number of processors in the system. - * This function reads through the device tree and counts - * the virtual processors, this does not include threads. - */ -static int lparcfg_count_active_processors(void) -{ - struct device_node *cpus_dn = NULL; - int count = 0; - - while ((cpus_dn = of_find_node_by_type(cpus_dn, "cpu"))) { -#ifdef LPARCFG_DEBUG - printk(KERN_ERR "cpus_dn %p\n", cpus_dn); -#endif - count++; - } - return count; -} - -static void pseries_cmo_data(struct seq_file *m) -{ - int cpu; - unsigned long cmo_faults = 0; - unsigned long cmo_fault_time = 0; - - seq_printf(m, "cmo_enabled=%d\n", firmware_has_feature(FW_FEATURE_CMO)); - - if (!firmware_has_feature(FW_FEATURE_CMO)) - return; - - for_each_possible_cpu(cpu) { - cmo_faults += lppaca_of(cpu).cmo_faults; - cmo_fault_time += lppaca_of(cpu).cmo_fault_time; - } - - seq_printf(m, "cmo_faults=%lu\n", cmo_faults); - seq_printf(m, "cmo_fault_time_usec=%lu\n", - cmo_fault_time / tb_ticks_per_usec); - seq_printf(m, "cmo_primary_psp=%d\n", cmo_get_primary_psp()); - seq_printf(m, "cmo_secondary_psp=%d\n", cmo_get_secondary_psp()); - seq_printf(m, "cmo_page_size=%lu\n", cmo_get_page_size()); -} - -static void splpar_dispatch_data(struct seq_file *m) -{ - int cpu; - unsigned long dispatches = 0; - unsigned long dispatch_dispersions = 0; - - for_each_possible_cpu(cpu) { - dispatches += lppaca_of(cpu).yield_count; - dispatch_dispersions += lppaca_of(cpu).dispersion_count; - } - - seq_printf(m, "dispatches=%lu\n", dispatches); - seq_printf(m, "dispatch_dispersions=%lu\n", dispatch_dispersions); -} - -static void parse_em_data(struct seq_file *m) -{ - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; - - if (plpar_hcall(H_GET_EM_PARMS, retbuf) == H_SUCCESS) - seq_printf(m, "power_mode_data=%016lx\n", retbuf[0]); -} - -static int pseries_lparcfg_data(struct seq_file *m, void *v) -{ - int partition_potential_processors; - int partition_active_processors; - struct device_node *rtas_node; - const int *lrdrp = NULL; - - rtas_node = of_find_node_by_path("/rtas"); - if (rtas_node) - lrdrp = of_get_property(rtas_node, "ibm,lrdr-capacity", NULL); - - if (lrdrp == NULL) { - partition_potential_processors = vdso_data->processorCount; - } else { - partition_potential_processors = *(lrdrp + 4); - } - of_node_put(rtas_node); - - partition_active_processors = lparcfg_count_active_processors(); - - if (firmware_has_feature(FW_FEATURE_SPLPAR)) { - /* this call handles the ibm,get-system-parameter contents */ - parse_system_parameter_string(m); - parse_ppp_data(m); - parse_mpp_data(m); - parse_mpp_x_data(m); - pseries_cmo_data(m); - splpar_dispatch_data(m); - - seq_printf(m, "purr=%ld\n", get_purr()); - } else { /* non SPLPAR case */ - - seq_printf(m, "system_active_processors=%d\n", - partition_potential_processors); - - seq_printf(m, "system_potential_processors=%d\n", - partition_potential_processors); - - seq_printf(m, "partition_max_entitled_capacity=%d\n", - partition_potential_processors * 100); - - seq_printf(m, "partition_entitled_capacity=%d\n", - partition_active_processors * 100); - } - - seq_printf(m, "partition_active_processors=%d\n", - partition_active_processors); - - seq_printf(m, "partition_potential_processors=%d\n", - partition_potential_processors); - - seq_printf(m, "shared_processor_mode=%d\n", lppaca_of(0).shared_proc); - - seq_printf(m, "slb_size=%d\n", mmu_slb_size); - - parse_em_data(m); - - return 0; -} - -static ssize_t update_ppp(u64 *entitlement, u8 *weight) -{ - struct hvcall_ppp_data ppp_data; - u8 new_weight; - u64 new_entitled; - ssize_t retval; - - /* Get our current parameters */ - retval = h_get_ppp(&ppp_data); - if (retval) - return retval; - - if (entitlement) { - new_weight = ppp_data.weight; - new_entitled = *entitlement; - } else if (weight) { - new_weight = *weight; - new_entitled = ppp_data.entitlement; - } else - return -EINVAL; - - pr_debug("%s: current_entitled = %llu, current_weight = %u\n", - __func__, ppp_data.entitlement, ppp_data.weight); - - pr_debug("%s: new_entitled = %llu, new_weight = %u\n", - __func__, new_entitled, new_weight); - - retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight); - return retval; -} - -/** - * update_mpp - * - * Update the memory entitlement and weight for the partition. Caller must - * specify either a new entitlement or weight, not both, to be updated - * since the h_set_mpp call takes both entitlement and weight as parameters. - */ -static ssize_t update_mpp(u64 *entitlement, u8 *weight) -{ - struct hvcall_mpp_data mpp_data; - u64 new_entitled; - u8 new_weight; - ssize_t rc; - - if (entitlement) { - /* Check with vio to ensure the new memory entitlement - * can be handled. - */ - rc = vio_cmo_entitlement_update(*entitlement); - if (rc) - return rc; - } - - rc = h_get_mpp(&mpp_data); - if (rc) - return rc; - - if (entitlement) { - new_weight = mpp_data.mem_weight; - new_entitled = *entitlement; - } else if (weight) { - new_weight = *weight; - new_entitled = mpp_data.entitled_mem; - } else - return -EINVAL; - - pr_debug("%s: current_entitled = %lu, current_weight = %u\n", - __func__, mpp_data.entitled_mem, mpp_data.mem_weight); - - pr_debug("%s: new_entitled = %llu, new_weight = %u\n", - __func__, new_entitled, new_weight); - - rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight); - return rc; -} - -/* - * Interface for changing system parameters (variable capacity weight - * and entitled capacity). Format of input is "param_name=value"; - * anything after value is ignored. Valid parameters at this time are - * "partition_entitled_capacity" and "capacity_weight". We use - * H_SET_PPP to alter parameters. - * - * This function should be invoked only on systems with - * FW_FEATURE_SPLPAR. - */ -static ssize_t lparcfg_write(struct file *file, const char __user * buf, - size_t count, loff_t * off) -{ - int kbuf_sz = 64; - char kbuf[kbuf_sz]; - char *tmp; - u64 new_entitled, *new_entitled_ptr = &new_entitled; - u8 new_weight, *new_weight_ptr = &new_weight; - ssize_t retval; - - if (!firmware_has_feature(FW_FEATURE_SPLPAR)) - return -EINVAL; - - if (count > kbuf_sz) - return -EINVAL; - - if (copy_from_user(kbuf, buf, count)) - return -EFAULT; - - kbuf[count - 1] = '\0'; - tmp = strchr(kbuf, '='); - if (!tmp) - return -EINVAL; - - *tmp++ = '\0'; - - if (!strcmp(kbuf, "partition_entitled_capacity")) { - char *endp; - *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10); - if (endp == tmp) - return -EINVAL; - - retval = update_ppp(new_entitled_ptr, NULL); - } else if (!strcmp(kbuf, "capacity_weight")) { - char *endp; - *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); - if (endp == tmp) - return -EINVAL; - - retval = update_ppp(NULL, new_weight_ptr); - } else if (!strcmp(kbuf, "entitled_memory")) { - char *endp; - *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10); - if (endp == tmp) - return -EINVAL; - - retval = update_mpp(new_entitled_ptr, NULL); - } else if (!strcmp(kbuf, "entitled_memory_weight")) { - char *endp; - *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); - if (endp == tmp) - return -EINVAL; - - retval = update_mpp(NULL, new_weight_ptr); - } else - return -EINVAL; - - if (retval == H_SUCCESS || retval == H_CONSTRAINED) { - retval = count; - } else if (retval == H_BUSY) { - retval = -EBUSY; - } else if (retval == H_HARDWARE) { - retval = -EIO; - } else if (retval == H_PARAMETER) { - retval = -EINVAL; - } - - return retval; -} - -static int lparcfg_data(struct seq_file *m, void *v) -{ - struct device_node *rootdn; - const char *model = ""; - const char *system_id = ""; - const char *tmp; - const unsigned int *lp_index_ptr; - unsigned int lp_index = 0; - - seq_printf(m, "%s %s\n", MODULE_NAME, MODULE_VERS); - - rootdn = of_find_node_by_path("/"); - if (rootdn) { - tmp = of_get_property(rootdn, "model", NULL); - if (tmp) - model = tmp; - tmp = of_get_property(rootdn, "system-id", NULL); - if (tmp) - system_id = tmp; - lp_index_ptr = of_get_property(rootdn, "ibm,partition-no", - NULL); - if (lp_index_ptr) - lp_index = *lp_index_ptr; - of_node_put(rootdn); - } - seq_printf(m, "serial_number=%s\n", system_id); - seq_printf(m, "system_type=%s\n", model); - seq_printf(m, "partition_id=%d\n", (int)lp_index); - - return pseries_lparcfg_data(m, v); -} - -static int lparcfg_open(struct inode *inode, struct file *file) -{ - return single_open(file, lparcfg_data, NULL); -} - -static const struct file_operations lparcfg_fops = { - .owner = THIS_MODULE, - .read = seq_read, - .write = lparcfg_write, - .open = lparcfg_open, - .release = single_release, - .llseek = seq_lseek, -}; - -static int __init lparcfg_init(void) -{ - struct proc_dir_entry *ent; - umode_t mode = S_IRUSR | S_IRGRP | S_IROTH; - - /* Allow writing if we have FW_FEATURE_SPLPAR */ - if (firmware_has_feature(FW_FEATURE_SPLPAR)) - mode |= S_IWUSR; - - ent = proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops); - if (!ent) { - printk(KERN_ERR "Failed to create powerpc/lparcfg\n"); - return -EIO; - } - - proc_ppc64_lparcfg = ent; - return 0; -} - -static void __exit lparcfg_cleanup(void) -{ - if (proc_ppc64_lparcfg) - remove_proc_entry("lparcfg", proc_ppc64_lparcfg->parent); -} - -module_init(lparcfg_init); -module_exit(lparcfg_cleanup); -MODULE_DESCRIPTION("Interface for LPAR configuration data"); -MODULE_AUTHOR("Dave Engebretsen"); -MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index e1ec57e87b3..015ae55c186 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -18,6 +18,7 @@ #include <linux/ftrace.h> #include <asm/machdep.h> +#include <asm/pgalloc.h> #include <asm/prom.h> #include <asm/sections.h> @@ -75,6 +76,17 @@ void arch_crash_save_vmcoreinfo(void) #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(contig_page_data); #endif +#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) + VMCOREINFO_SYMBOL(vmemmap_list); + VMCOREINFO_SYMBOL(mmu_vmemmap_psize); + VMCOREINFO_SYMBOL(mmu_psize_defs); + VMCOREINFO_STRUCT_SIZE(vmemmap_backing); + VMCOREINFO_OFFSET(vmemmap_backing, list); + VMCOREINFO_OFFSET(vmemmap_backing, phys); + VMCOREINFO_OFFSET(vmemmap_backing, virt_addr); + VMCOREINFO_STRUCT_SIZE(mmu_psize_def); + VMCOREINFO_OFFSET(mmu_psize_def, shift); +#endif } /* @@ -136,7 +148,7 @@ void __init reserve_crashkernel(void) * a small SLB (128MB) since the crash kernel needs to place * itself and some stacks to be in the first segment. */ - crashk_res.start = min(0x80000000ULL, (ppc64_rma_size / 2)); + crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2)); #else crashk_res.start = KDUMP_KERNELBASE; #endif @@ -184,7 +196,9 @@ int overlaps_crashkernel(unsigned long start, unsigned long size) /* Values we need to export to the second kernel via the device tree. */ static phys_addr_t kernel_end; +static phys_addr_t crashk_base; static phys_addr_t crashk_size; +static unsigned long long mem_limit; static struct property kernel_end_prop = { .name = "linux,kernel-end", @@ -195,7 +209,7 @@ static struct property kernel_end_prop = { static struct property crashk_base_prop = { .name = "linux,crashkernel-base", .length = sizeof(phys_addr_t), - .value = &crashk_res.start, + .value = &crashk_base }; static struct property crashk_size_prop = { @@ -207,9 +221,11 @@ static struct property crashk_size_prop = { static struct property memory_limit_prop = { .name = "linux,memory-limit", .length = sizeof(unsigned long long), - .value = &memory_limit, + .value = &mem_limit, }; +#define cpu_to_be_ulong __PASTE(cpu_to_be, BITS_PER_LONG) + static void __init export_crashk_values(struct device_node *node) { struct property *prop; @@ -225,8 +241,9 @@ static void __init export_crashk_values(struct device_node *node) of_remove_property(node, prop); if (crashk_res.start != 0) { + crashk_base = cpu_to_be_ulong(crashk_res.start), of_add_property(node, &crashk_base_prop); - crashk_size = resource_size(&crashk_res); + crashk_size = cpu_to_be_ulong(resource_size(&crashk_res)); of_add_property(node, &crashk_size_prop); } @@ -234,6 +251,7 @@ static void __init export_crashk_values(struct device_node *node) * memory_limit is required by the kexec-tools to limit the * crash regions to the actual memory used. */ + mem_limit = cpu_to_be_ulong(memory_limit); of_update_property(node, &memory_limit_prop); } @@ -252,7 +270,7 @@ static int __init kexec_setup(void) of_remove_property(node, prop); /* information needed by userspace when using default_machine_kexec */ - kernel_end = __pa(_end); + kernel_end = cpu_to_be_ulong(__pa(_end)); of_add_property(node, &kernel_end_prop); export_crashk_values(node); diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 7206701b1ff..879b3aacac3 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -17,6 +17,7 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/cpu.h> +#include <linux/hardirq.h> #include <asm/page.h> #include <asm/current.h> @@ -162,6 +163,8 @@ static int kexec_all_irq_disabled = 0; static void kexec_smp_down(void *arg) { local_irq_disable(); + hard_irq_disable(); + mb(); /* make sure our irqs are disabled before we say they are */ get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF; while(kexec_all_irq_disabled == 0) @@ -234,7 +237,7 @@ static void wake_offline_cpus(void) if (!cpu_online(cpu)) { printk(KERN_INFO "kexec: Waking offline cpu %d.\n", cpu); - cpu_up(cpu); + WARN_ON(cpu_up(cpu)); } } } @@ -244,6 +247,8 @@ static void kexec_prepare_cpus(void) wake_offline_cpus(); smp_call_function(kexec_smp_down, NULL, /* wait */0); local_irq_disable(); + hard_irq_disable(); + mb(); /* make sure IRQs are disabled before we say they are */ get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF; @@ -281,6 +286,7 @@ static void kexec_prepare_cpus(void) if (ppc_md.kexec_cpu_down) ppc_md.kexec_cpu_down(0, 0); local_irq_disable(); + hard_irq_disable(); } #endif /* SMP */ @@ -306,7 +312,7 @@ static union thread_union kexec_stack __init_task_data = */ struct paca_struct kexec_paca; -/* Our assembly helper, in kexec_stub.S */ +/* Our assembly helper, in misc_64.S */ extern void kexec_sequence(void *newstack, unsigned long start, void *image, void *control, void (*clear_all)(void)) __noreturn; @@ -330,10 +336,13 @@ void default_machine_kexec(struct kimage *image) pr_debug("kexec: Starting switchover sequence.\n"); /* switch to a staticly allocated stack. Based on irq stack code. + * We setup preempt_count to avoid using VMX in memcpy. * XXX: the task struct will likely be invalid once we do the copy! */ kexec_stack.thread_info.task = current_thread_info()->task; kexec_stack.thread_info.flags = 0; + kexec_stack.thread_info.preempt_count = HARDIRQ_OFFSET; + kexec_stack.thread_info.cpu = current_thread_info()->cpu; /* We need a static PACA, too; copy this CPU's PACA over and switch to * it. Also poison per_cpu_offset to catch anyone using non-static @@ -360,6 +369,7 @@ void default_machine_kexec(struct kimage *image) /* Values we need to export to the second kernel via the device tree. */ static unsigned long htab_base; +static unsigned long htab_size; static struct property htab_base_prop = { .name = "linux,htab-base", @@ -370,7 +380,7 @@ static struct property htab_base_prop = { static struct property htab_size_prop = { .name = "linux,htab-size", .length = sizeof(unsigned long), - .value = &htab_size_bytes, + .value = &htab_size, }; static int __init export_htab_values(void) @@ -394,8 +404,9 @@ static int __init export_htab_values(void) if (prop) of_remove_property(node, prop); - htab_base = __pa(htab_address); + htab_base = cpu_to_be64(__pa(htab_address)); of_add_property(node, &htab_base_prop); + htab_size = cpu_to_be64(htab_size_bytes); of_add_property(node, &htab_size_prop); of_node_put(node); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c new file mode 100644 index 00000000000..a7fd4cb78b7 --- /dev/null +++ b/arch/powerpc/kernel/mce.c @@ -0,0 +1,352 @@ +/* + * Machine check exception handling. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright 2013 IBM Corporation + * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> + */ + +#undef DEBUG +#define pr_fmt(fmt) "mce: " fmt + +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/percpu.h> +#include <linux/export.h> +#include <linux/irq_work.h> +#include <asm/mce.h> + +static DEFINE_PER_CPU(int, mce_nest_count); +static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); + +/* Queue for delayed MCE events. */ +static DEFINE_PER_CPU(int, mce_queue_count); +static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); + +static void machine_check_process_queued_event(struct irq_work *work); +struct irq_work mce_event_process_work = { + .func = machine_check_process_queued_event, +}; + +static void mce_set_error_info(struct machine_check_event *mce, + struct mce_error_info *mce_err) +{ + mce->error_type = mce_err->error_type; + switch (mce_err->error_type) { + case MCE_ERROR_TYPE_UE: + mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type; + break; + case MCE_ERROR_TYPE_SLB: + mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type; + break; + case MCE_ERROR_TYPE_ERAT: + mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type; + break; + case MCE_ERROR_TYPE_TLB: + mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type; + break; + case MCE_ERROR_TYPE_UNKNOWN: + default: + break; + } +} + +/* + * Decode and save high level MCE information into per cpu buffer which + * is an array of machine_check_event structure. + */ +void save_mce_event(struct pt_regs *regs, long handled, + struct mce_error_info *mce_err, + uint64_t nip, uint64_t addr) +{ + uint64_t srr1; + int index = __get_cpu_var(mce_nest_count)++; + struct machine_check_event *mce = &__get_cpu_var(mce_event[index]); + + /* + * Return if we don't have enough space to log mce event. + * mce_nest_count may go beyond MAX_MC_EVT but that's ok, + * the check below will stop buffer overrun. + */ + if (index >= MAX_MC_EVT) + return; + + /* Populate generic machine check info */ + mce->version = MCE_V1; + mce->srr0 = nip; + mce->srr1 = regs->msr; + mce->gpr3 = regs->gpr[3]; + mce->in_use = 1; + + mce->initiator = MCE_INITIATOR_CPU; + if (handled) + mce->disposition = MCE_DISPOSITION_RECOVERED; + else + mce->disposition = MCE_DISPOSITION_NOT_RECOVERED; + mce->severity = MCE_SEV_ERROR_SYNC; + + srr1 = regs->msr; + + /* + * Populate the mce error_type and type-specific error_type. + */ + mce_set_error_info(mce, mce_err); + + if (!addr) + return; + + if (mce->error_type == MCE_ERROR_TYPE_TLB) { + mce->u.tlb_error.effective_address_provided = true; + mce->u.tlb_error.effective_address = addr; + } else if (mce->error_type == MCE_ERROR_TYPE_SLB) { + mce->u.slb_error.effective_address_provided = true; + mce->u.slb_error.effective_address = addr; + } else if (mce->error_type == MCE_ERROR_TYPE_ERAT) { + mce->u.erat_error.effective_address_provided = true; + mce->u.erat_error.effective_address = addr; + } else if (mce->error_type == MCE_ERROR_TYPE_UE) { + mce->u.ue_error.effective_address_provided = true; + mce->u.ue_error.effective_address = addr; + } + return; +} + +/* + * get_mce_event: + * mce Pointer to machine_check_event structure to be filled. + * release Flag to indicate whether to free the event slot or not. + * 0 <= do not release the mce event. Caller will invoke + * release_mce_event() once event has been consumed. + * 1 <= release the slot. + * + * return 1 = success + * 0 = failure + * + * get_mce_event() will be called by platform specific machine check + * handle routine and in KVM. + * When we call get_mce_event(), we are still in interrupt context and + * preemption will not be scheduled until ret_from_expect() routine + * is called. + */ +int get_mce_event(struct machine_check_event *mce, bool release) +{ + int index = __get_cpu_var(mce_nest_count) - 1; + struct machine_check_event *mc_evt; + int ret = 0; + + /* Sanity check */ + if (index < 0) + return ret; + + /* Check if we have MCE info to process. */ + if (index < MAX_MC_EVT) { + mc_evt = &__get_cpu_var(mce_event[index]); + /* Copy the event structure and release the original */ + if (mce) + *mce = *mc_evt; + if (release) + mc_evt->in_use = 0; + ret = 1; + } + /* Decrement the count to free the slot. */ + if (release) + __get_cpu_var(mce_nest_count)--; + + return ret; +} + +void release_mce_event(void) +{ + get_mce_event(NULL, true); +} + +/* + * Queue up the MCE event which then can be handled later. + */ +void machine_check_queue_event(void) +{ + int index; + struct machine_check_event evt; + + if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) + return; + + index = __get_cpu_var(mce_queue_count)++; + /* If queue is full, just return for now. */ + if (index >= MAX_MC_EVT) { + __get_cpu_var(mce_queue_count)--; + return; + } + __get_cpu_var(mce_event_queue[index]) = evt; + + /* Queue irq work to process this event later. */ + irq_work_queue(&mce_event_process_work); +} + +/* + * process pending MCE event from the mce event queue. This function will be + * called during syscall exit. + */ +static void machine_check_process_queued_event(struct irq_work *work) +{ + int index; + + /* + * For now just print it to console. + * TODO: log this error event to FSP or nvram. + */ + while (__get_cpu_var(mce_queue_count) > 0) { + index = __get_cpu_var(mce_queue_count) - 1; + machine_check_print_event_info( + &__get_cpu_var(mce_event_queue[index])); + __get_cpu_var(mce_queue_count)--; + } +} + +void machine_check_print_event_info(struct machine_check_event *evt) +{ + const char *level, *sevstr, *subtype; + static const char *mc_ue_types[] = { + "Indeterminate", + "Instruction fetch", + "Page table walk ifetch", + "Load/Store", + "Page table walk Load/Store", + }; + static const char *mc_slb_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + static const char *mc_erat_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + static const char *mc_tlb_types[] = { + "Indeterminate", + "Parity", + "Multihit", + }; + + /* Print things out */ + if (evt->version != MCE_V1) { + pr_err("Machine Check Exception, Unknown event version %d !\n", + evt->version); + return; + } + switch (evt->severity) { + case MCE_SEV_NO_ERROR: + level = KERN_INFO; + sevstr = "Harmless"; + break; + case MCE_SEV_WARNING: + level = KERN_WARNING; + sevstr = ""; + break; + case MCE_SEV_ERROR_SYNC: + level = KERN_ERR; + sevstr = "Severe"; + break; + case MCE_SEV_FATAL: + default: + level = KERN_ERR; + sevstr = "Fatal"; + break; + } + + printk("%s%s Machine check interrupt [%s]\n", level, sevstr, + evt->disposition == MCE_DISPOSITION_RECOVERED ? + "Recovered" : "[Not recovered"); + printk("%s Initiator: %s\n", level, + evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown"); + switch (evt->error_type) { + case MCE_ERROR_TYPE_UE: + subtype = evt->u.ue_error.ue_error_type < + ARRAY_SIZE(mc_ue_types) ? + mc_ue_types[evt->u.ue_error.ue_error_type] + : "Unknown"; + printk("%s Error type: UE [%s]\n", level, subtype); + if (evt->u.ue_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt->u.ue_error.effective_address); + if (evt->u.ue_error.physical_address_provided) + printk("%s Physial address: %016llx\n", + level, evt->u.ue_error.physical_address); + break; + case MCE_ERROR_TYPE_SLB: + subtype = evt->u.slb_error.slb_error_type < + ARRAY_SIZE(mc_slb_types) ? + mc_slb_types[evt->u.slb_error.slb_error_type] + : "Unknown"; + printk("%s Error type: SLB [%s]\n", level, subtype); + if (evt->u.slb_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt->u.slb_error.effective_address); + break; + case MCE_ERROR_TYPE_ERAT: + subtype = evt->u.erat_error.erat_error_type < + ARRAY_SIZE(mc_erat_types) ? + mc_erat_types[evt->u.erat_error.erat_error_type] + : "Unknown"; + printk("%s Error type: ERAT [%s]\n", level, subtype); + if (evt->u.erat_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt->u.erat_error.effective_address); + break; + case MCE_ERROR_TYPE_TLB: + subtype = evt->u.tlb_error.tlb_error_type < + ARRAY_SIZE(mc_tlb_types) ? + mc_tlb_types[evt->u.tlb_error.tlb_error_type] + : "Unknown"; + printk("%s Error type: TLB [%s]\n", level, subtype); + if (evt->u.tlb_error.effective_address_provided) + printk("%s Effective address: %016llx\n", + level, evt->u.tlb_error.effective_address); + break; + default: + case MCE_ERROR_TYPE_UNKNOWN: + printk("%s Error type: Unknown\n", level); + break; + } +} + +uint64_t get_mce_fault_addr(struct machine_check_event *evt) +{ + switch (evt->error_type) { + case MCE_ERROR_TYPE_UE: + if (evt->u.ue_error.effective_address_provided) + return evt->u.ue_error.effective_address; + break; + case MCE_ERROR_TYPE_SLB: + if (evt->u.slb_error.effective_address_provided) + return evt->u.slb_error.effective_address; + break; + case MCE_ERROR_TYPE_ERAT: + if (evt->u.erat_error.effective_address_provided) + return evt->u.erat_error.effective_address; + break; + case MCE_ERROR_TYPE_TLB: + if (evt->u.tlb_error.effective_address_provided) + return evt->u.tlb_error.effective_address; + break; + default: + case MCE_ERROR_TYPE_UNKNOWN: + break; + } + return 0; +} +EXPORT_SYMBOL(get_mce_fault_addr); diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c new file mode 100644 index 00000000000..aa9aff3d6ad --- /dev/null +++ b/arch/powerpc/kernel/mce_power.c @@ -0,0 +1,313 @@ +/* + * Machine check exception handling CPU-side for power7 and power8 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright 2013 IBM Corporation + * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> + */ + +#undef DEBUG +#define pr_fmt(fmt) "mce_power: " fmt + +#include <linux/types.h> +#include <linux/ptrace.h> +#include <asm/mmu.h> +#include <asm/mce.h> +#include <asm/machdep.h> + +/* flush SLBs and reload */ +static void flush_and_reload_slb(void) +{ + struct slb_shadow *slb; + unsigned long i, n; + + /* Invalidate all SLBs */ + asm volatile("slbmte %0,%0; slbia" : : "r" (0)); + +#ifdef CONFIG_KVM_BOOK3S_HANDLER + /* + * If machine check is hit when in guest or in transition, we will + * only flush the SLBs and continue. + */ + if (get_paca()->kvm_hstate.in_guest) + return; +#endif + + /* For host kernel, reload the SLBs from shadow SLB buffer. */ + slb = get_slb_shadow(); + if (!slb) + return; + + n = min_t(u32, be32_to_cpu(slb->persistent), SLB_MIN_SIZE); + + /* Load up the SLB entries from shadow SLB */ + for (i = 0; i < n; i++) { + unsigned long rb = be64_to_cpu(slb->save_area[i].esid); + unsigned long rs = be64_to_cpu(slb->save_area[i].vsid); + + rb = (rb & ~0xFFFul) | i; + asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb)); + } +} + +static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) +{ + long handled = 1; + + /* + * flush and reload SLBs for SLB errors and flush TLBs for TLB errors. + * reset the error bits whenever we handle them so that at the end + * we can check whether we handled all of them or not. + * */ + if (dsisr & slb_error_bits) { + flush_and_reload_slb(); + /* reset error bits */ + dsisr &= ~(slb_error_bits); + } + if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) { + if (cur_cpu_spec && cur_cpu_spec->flush_tlb) + cur_cpu_spec->flush_tlb(TLBIEL_INVAL_PAGE); + /* reset error bits */ + dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB; + } + /* Any other errors we don't understand? */ + if (dsisr & 0xffffffffUL) + handled = 0; + + return handled; +} + +static long mce_handle_derror_p7(uint64_t dsisr) +{ + return mce_handle_derror(dsisr, P7_DSISR_MC_SLB_ERRORS); +} + +static long mce_handle_common_ierror(uint64_t srr1) +{ + long handled = 0; + + switch (P7_SRR1_MC_IFETCH(srr1)) { + case 0: + break; + case P7_SRR1_MC_IFETCH_SLB_PARITY: + case P7_SRR1_MC_IFETCH_SLB_MULTIHIT: + /* flush and reload SLBs for SLB errors. */ + flush_and_reload_slb(); + handled = 1; + break; + case P7_SRR1_MC_IFETCH_TLB_MULTIHIT: + if (cur_cpu_spec && cur_cpu_spec->flush_tlb) { + cur_cpu_spec->flush_tlb(TLBIEL_INVAL_PAGE); + handled = 1; + } + break; + default: + break; + } + + return handled; +} + +static long mce_handle_ierror_p7(uint64_t srr1) +{ + long handled = 0; + + handled = mce_handle_common_ierror(srr1); + + if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) { + flush_and_reload_slb(); + handled = 1; + } + return handled; +} + +static void mce_get_common_ierror(struct mce_error_info *mce_err, uint64_t srr1) +{ + switch (P7_SRR1_MC_IFETCH(srr1)) { + case P7_SRR1_MC_IFETCH_SLB_PARITY: + mce_err->error_type = MCE_ERROR_TYPE_SLB; + mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY; + break; + case P7_SRR1_MC_IFETCH_SLB_MULTIHIT: + mce_err->error_type = MCE_ERROR_TYPE_SLB; + mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; + break; + case P7_SRR1_MC_IFETCH_TLB_MULTIHIT: + mce_err->error_type = MCE_ERROR_TYPE_TLB; + mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; + break; + case P7_SRR1_MC_IFETCH_UE: + case P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL: + mce_err->error_type = MCE_ERROR_TYPE_UE; + mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH; + break; + case P7_SRR1_MC_IFETCH_UE_TLB_RELOAD: + mce_err->error_type = MCE_ERROR_TYPE_UE; + mce_err->u.ue_error_type = + MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH; + break; + } +} + +static void mce_get_ierror_p7(struct mce_error_info *mce_err, uint64_t srr1) +{ + mce_get_common_ierror(mce_err, srr1); + if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) { + mce_err->error_type = MCE_ERROR_TYPE_SLB; + mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; + } +} + +static void mce_get_derror_p7(struct mce_error_info *mce_err, uint64_t dsisr) +{ + if (dsisr & P7_DSISR_MC_UE) { + mce_err->error_type = MCE_ERROR_TYPE_UE; + mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE; + } else if (dsisr & P7_DSISR_MC_UE_TABLEWALK) { + mce_err->error_type = MCE_ERROR_TYPE_UE; + mce_err->u.ue_error_type = + MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE; + } else if (dsisr & P7_DSISR_MC_ERAT_MULTIHIT) { + mce_err->error_type = MCE_ERROR_TYPE_ERAT; + mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; + } else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT) { + mce_err->error_type = MCE_ERROR_TYPE_SLB; + mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; + } else if (dsisr & P7_DSISR_MC_SLB_PARITY_MFSLB) { + mce_err->error_type = MCE_ERROR_TYPE_SLB; + mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY; + } else if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) { + mce_err->error_type = MCE_ERROR_TYPE_TLB; + mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; + } else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT_PARITY) { + mce_err->error_type = MCE_ERROR_TYPE_SLB; + mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; + } +} + +static long mce_handle_ue_error(struct pt_regs *regs) +{ + long handled = 0; + + /* + * On specific SCOM read via MMIO we may get a machine check + * exception with SRR0 pointing inside opal. If that is the + * case OPAL may have recovery address to re-read SCOM data in + * different way and hence we can recover from this MC. + */ + + if (ppc_md.mce_check_early_recovery) { + if (ppc_md.mce_check_early_recovery(regs)) + handled = 1; + } + return handled; +} + +long __machine_check_early_realmode_p7(struct pt_regs *regs) +{ + uint64_t srr1, nip, addr; + long handled = 1; + struct mce_error_info mce_error_info = { 0 }; + + srr1 = regs->msr; + nip = regs->nip; + + /* + * Handle memory errors depending whether this was a load/store or + * ifetch exception. Also, populate the mce error_type and + * type-specific error_type from either SRR1 or DSISR, depending + * whether this was a load/store or ifetch exception + */ + if (P7_SRR1_MC_LOADSTORE(srr1)) { + handled = mce_handle_derror_p7(regs->dsisr); + mce_get_derror_p7(&mce_error_info, regs->dsisr); + addr = regs->dar; + } else { + handled = mce_handle_ierror_p7(srr1); + mce_get_ierror_p7(&mce_error_info, srr1); + addr = regs->nip; + } + + /* Handle UE error. */ + if (mce_error_info.error_type == MCE_ERROR_TYPE_UE) + handled = mce_handle_ue_error(regs); + + save_mce_event(regs, handled, &mce_error_info, nip, addr); + return handled; +} + +static void mce_get_ierror_p8(struct mce_error_info *mce_err, uint64_t srr1) +{ + mce_get_common_ierror(mce_err, srr1); + if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) { + mce_err->error_type = MCE_ERROR_TYPE_ERAT; + mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; + } +} + +static void mce_get_derror_p8(struct mce_error_info *mce_err, uint64_t dsisr) +{ + mce_get_derror_p7(mce_err, dsisr); + if (dsisr & P8_DSISR_MC_ERAT_MULTIHIT_SEC) { + mce_err->error_type = MCE_ERROR_TYPE_ERAT; + mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; + } +} + +static long mce_handle_ierror_p8(uint64_t srr1) +{ + long handled = 0; + + handled = mce_handle_common_ierror(srr1); + + if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) { + flush_and_reload_slb(); + handled = 1; + } + return handled; +} + +static long mce_handle_derror_p8(uint64_t dsisr) +{ + return mce_handle_derror(dsisr, P8_DSISR_MC_SLB_ERRORS); +} + +long __machine_check_early_realmode_p8(struct pt_regs *regs) +{ + uint64_t srr1, nip, addr; + long handled = 1; + struct mce_error_info mce_error_info = { 0 }; + + srr1 = regs->msr; + nip = regs->nip; + + if (P7_SRR1_MC_LOADSTORE(srr1)) { + handled = mce_handle_derror_p8(regs->dsisr); + mce_get_derror_p8(&mce_error_info, regs->dsisr); + addr = regs->dar; + } else { + handled = mce_handle_ierror_p8(srr1); + mce_get_ierror_p8(&mce_error_info, srr1); + addr = regs->nip; + } + + /* Handle UE error. */ + if (mce_error_info.error_type == MCE_ERROR_TYPE_UE) + handled = mce_handle_ue_error(regs); + + save_mce_event(regs, handled, &mce_error_info, nip, addr); + return handled; +} diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 19e096bd0e7..7c6bb4b17b4 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -36,26 +36,44 @@ .text +/* + * We store the saved ksp_limit in the unused part + * of the STACK_FRAME_OVERHEAD + */ _GLOBAL(call_do_softirq) mflr r0 stw r0,4(r1) + lwz r10,THREAD+KSP_LIMIT(r2) + addi r11,r3,THREAD_INFO_GAP stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3) mr r1,r3 + stw r10,8(r1) + stw r11,THREAD+KSP_LIMIT(r2) bl __do_softirq + lwz r10,8(r1) lwz r1,0(r1) lwz r0,4(r1) + stw r10,THREAD+KSP_LIMIT(r2) mtlr r0 blr -_GLOBAL(call_handle_irq) +/* + * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp); + */ +_GLOBAL(call_do_irq) mflr r0 stw r0,4(r1) - mtctr r6 - stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r5) - mr r1,r5 - bctrl + lwz r10,THREAD+KSP_LIMIT(r2) + addi r11,r4,THREAD_INFO_GAP + stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4) + mr r1,r4 + stw r10,8(r1) + stw r11,THREAD+KSP_LIMIT(r2) + bl __do_irq + lwz r10,8(r1) lwz r1,0(r1) lwz r0,4(r1) + stw r10,THREAD+KSP_LIMIT(r2) mtlr r0 blr @@ -327,8 +345,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE) * * flush_icache_range(unsigned long start, unsigned long stop) */ -_KPROBE(__flush_icache_range) +_KPROBE(flush_icache_range) BEGIN_FTR_SECTION + PURGE_PREFETCHED_INS blr /* for 601, do nothing */ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) li r5,L1_CACHE_BYTES-1 @@ -432,6 +451,7 @@ _GLOBAL(invalidate_dcache_range) */ _GLOBAL(__flush_dcache_icache) BEGIN_FTR_SECTION + PURGE_PREFETCHED_INS blr END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) rlwinm r3,r3,0,0,31-PAGE_SHIFT /* Get page base address */ @@ -473,6 +493,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x) */ _GLOBAL(__flush_dcache_icache_phys) BEGIN_FTR_SECTION + PURGE_PREFETCHED_INS blr /* for 601, do nothing */ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) mfmsr r10 @@ -643,6 +664,20 @@ _GLOBAL(__lshrdi3) blr /* + * 64-bit comparison: __cmpdi2(s64 a, s64 b) + * Returns 0 if a < b, 1 if a == b, 2 if a > b. + */ +_GLOBAL(__cmpdi2) + cmpw r3,r5 + li r3,1 + bne 1f + cmplw r4,r6 + beqlr +1: li r3,0 + bltlr + li r3,2 + blr +/* * 64-bit comparison: __ucmpdi2(u64 a, u64 b) * Returns 0 if a < b, 1 if a == b, 2 if a > b. */ @@ -657,6 +692,17 @@ _GLOBAL(__ucmpdi2) li r3,2 blr +_GLOBAL(__bswapdi2) + rotlwi r9,r4,8 + rotlwi r10,r3,8 + rlwimi r9,r4,24,0,7 + rlwimi r10,r3,24,0,7 + rlwimi r9,r4,24,16,23 + rlwimi r10,r3,24,16,23 + mr r3,r9 + mr r4,r10 + blr + _GLOBAL(abs) srawi r4,r3,31 xor r3,r3,r4 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 5cfa8008693..4e314b90c75 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -34,20 +34,18 @@ _GLOBAL(call_do_softirq) std r0,16(r1) stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3) mr r1,r3 - bl .__do_softirq + bl __do_softirq ld r1,0(r1) ld r0,16(r1) mtlr r0 blr -_GLOBAL(call_handle_irq) - ld r8,0(r6) +_GLOBAL(call_do_irq) mflr r0 std r0,16(r1) - mtctr r8 - stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r5) - mr r1,r5 - bctrl + stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4) + mr r1,r4 + bl __do_irq ld r1,0(r1) ld r0,16(r1) mtlr r0 @@ -67,8 +65,11 @@ PPC64_CACHES: * flush all bytes from start through stop-1 inclusive */ -_KPROBE(__flush_icache_range) - +_KPROBE(flush_icache_range) +BEGIN_FTR_SECTION + PURGE_PREFETCHED_INS + blr +END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) /* * Flush the data cache to memory * @@ -211,6 +212,11 @@ _GLOBAL(__flush_dcache_icache) * Different systems have different cache line sizes */ +BEGIN_FTR_SECTION + PURGE_PREFETCHED_INS + blr +END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) + /* Flush the dcache */ ld r7,PPC64_CACHES@toc(r2) clrrdi r3,r3,PAGE_SHIFT /* Page align */ @@ -234,8 +240,53 @@ _GLOBAL(__flush_dcache_icache) isync blr +_GLOBAL(__bswapdi2) + srdi r8,r3,32 + rlwinm r7,r3,8,0xffffffff + rlwimi r7,r3,24,0,7 + rlwinm r9,r8,8,0xffffffff + rlwimi r7,r3,24,16,23 + rlwimi r9,r8,24,0,7 + rlwimi r9,r8,24,16,23 + sldi r7,r7,32 + or r3,r7,r9 + blr + + +#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX +_GLOBAL(rmci_on) + sync + isync + li r3,0x100 + rldicl r3,r3,32,0 + mfspr r5,SPRN_HID4 + or r5,r5,r3 + sync + mtspr SPRN_HID4,r5 + isync + slbia + isync + sync + blr + +_GLOBAL(rmci_off) + sync + isync + li r3,0x100 + rldicl r3,r3,32,0 + mfspr r5,SPRN_HID4 + andc r5,r5,r3 + sync + mtspr SPRN_HID4,r5 + isync + slbia + isync + sync + blr +#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */ #if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE) + /* * Do an IO access in real mode */ @@ -405,19 +456,6 @@ _GLOBAL(scom970_write) blr #endif /* CONFIG_CPU_FREQ_PMAC64 || CONFIG_CPU_FREQ_MAPLE */ - -/* - * disable_kernel_fp() - * Disable the FPU. - */ -_GLOBAL(disable_kernel_fp) - mfmsr r3 - rldicl r0,r3,(63-MSR_FP_LG),1 - rldicl r3,r0,(MSR_FP_LG+1),0 - mtmsrd r3 /* disable use of fpu now */ - isync - blr - /* kexec_wait(phys_cpu) * * wait for the flag to change, indicating this kernel is going away but @@ -468,7 +506,7 @@ _GLOBAL(kexec_smp_wait) stb r4,PACAKEXECSTATE(r13) SYNC - b .kexec_wait + b kexec_wait /* * switch to real mode (turn mmu off) @@ -538,7 +576,7 @@ _GLOBAL(kexec_sequence) /* copy dest pages, flush whole dest image */ mr r3,r29 - bl .kexec_copy_flush /* (image) */ + bl kexec_copy_flush /* (image) */ /* turn off mmu */ bl real_mode @@ -548,7 +586,7 @@ _GLOBAL(kexec_sequence) mr r4,r30 /* start, aka phys mem offset */ li r5,0x100 li r6,0 - bl .copy_and_flush /* (dest, src, copy limit, start offset) */ + bl copy_and_flush /* (dest, src, copy limit, start offset) */ 1: /* assume normal blr return */ /* release other cpus to the new kernel secondary start at 0x60 */ @@ -557,8 +595,12 @@ _GLOBAL(kexec_sequence) stw r6,kexec_flag-1b(5) /* clear out hardware hash page table and tlb */ - ld r5,0(r27) /* deref function descriptor */ - mtctr r5 +#if !defined(_CALL_ELF) || _CALL_ELF != 2 + ld r12,0(r27) /* deref function descriptor */ +#else + mr r12,r27 +#endif + mtctr r12 bctrl /* ppc_md.hpte_clear_all(void); */ /* @@ -592,3 +634,31 @@ _GLOBAL(kexec_sequence) li r5,0 blr /* image->start(physid, image->start, 0); */ #endif /* CONFIG_KEXEC */ + +#ifdef CONFIG_MODULES +#if defined(_CALL_ELF) && _CALL_ELF == 2 + +#ifdef CONFIG_MODVERSIONS +.weak __crc_TOC. +.section "___kcrctab+TOC.","a" +.globl __kcrctab_TOC. +__kcrctab_TOC.: + .llong __crc_TOC. +#endif + +/* + * Export a fake .TOC. since both modpost and depmod will complain otherwise. + * Both modpost and depmod strip the leading . so we do the same here. + */ +.section "__ksymtab_strings","a" +__kstrtab_TOC.: + .asciz "TOC." + +.section "___ksymtab+TOC.","a" +/* This symbol name is important: it's used by modpost to find exported syms */ +.globl __ksymtab_TOC. +__ksymtab_TOC.: + .llong 0 /* .value */ + .llong __kstrtab_TOC. +#endif /* ELFv2 */ +#endif /* MODULES */ diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index 2d275707f41..9547381b631 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -25,8 +25,7 @@ #include <asm/uaccess.h> #include <asm/firmware.h> #include <linux/sort.h> - -#include "setup.h" +#include <asm/setup.h> LIST_HEAD(module_bug_list); diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c index 2e3200ca485..6cff040bf45 100644 --- a/arch/powerpc/kernel/module_32.c +++ b/arch/powerpc/kernel/module_32.c @@ -26,8 +26,7 @@ #include <linux/cache.h> #include <linux/bug.h> #include <linux/sort.h> - -#include "setup.h" +#include <asm/setup.h> #if 0 #define DEBUGP printk diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 9f44a775a10..d807ee626af 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -22,12 +22,12 @@ #include <linux/vmalloc.h> #include <linux/ftrace.h> #include <linux/bug.h> +#include <linux/uaccess.h> #include <asm/module.h> #include <asm/firmware.h> #include <asm/code-patching.h> #include <linux/sort.h> - -#include "setup.h" +#include <asm/setup.h> /* FIXME: We don't do .init separately. To do this, we'd need to have a separate r2 value in the init and core section, and stub between @@ -42,35 +42,170 @@ #define DEBUGP(fmt , ...) #endif +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define R2_STACK_OFFSET 24 + +/* An address is simply the address of the function. */ +typedef unsigned long func_desc_t; + +static func_desc_t func_desc(unsigned long addr) +{ + return addr; +} +static unsigned long func_addr(unsigned long addr) +{ + return addr; +} +static unsigned long stub_func_addr(func_desc_t func) +{ + return func; +} + +/* PowerPC64 specific values for the Elf64_Sym st_other field. */ +#define STO_PPC64_LOCAL_BIT 5 +#define STO_PPC64_LOCAL_MASK (7 << STO_PPC64_LOCAL_BIT) +#define PPC64_LOCAL_ENTRY_OFFSET(other) \ + (((1 << (((other) & STO_PPC64_LOCAL_MASK) >> STO_PPC64_LOCAL_BIT)) >> 2) << 2) + +static unsigned int local_entry_offset(const Elf64_Sym *sym) +{ + /* sym->st_other indicates offset to local entry point + * (otherwise it will assume r12 is the address of the start + * of function and try to derive r2 from it). */ + return PPC64_LOCAL_ENTRY_OFFSET(sym->st_other); +} +#else +#define R2_STACK_OFFSET 40 + +/* An address is address of the OPD entry, which contains address of fn. */ +typedef struct ppc64_opd_entry func_desc_t; + +static func_desc_t func_desc(unsigned long addr) +{ + return *(struct ppc64_opd_entry *)addr; +} +static unsigned long func_addr(unsigned long addr) +{ + return func_desc(addr).funcaddr; +} +static unsigned long stub_func_addr(func_desc_t func) +{ + return func.funcaddr; +} +static unsigned int local_entry_offset(const Elf64_Sym *sym) +{ + return 0; +} +#endif + /* Like PPC32, we need little trampolines to do > 24-bit jumps (into the kernel itself). But on PPC64, these need to be used for every jump, actually, to reset r2 (TOC+0x8000). */ struct ppc64_stub_entry { - /* 28 byte jump instruction sequence (7 instructions) */ - unsigned char jump[28]; - unsigned char unused[4]; + /* 28 byte jump instruction sequence (7 instructions). We only + * need 6 instructions on ABIv2 but we always allocate 7 so + * so we don't have to modify the trampoline load instruction. */ + u32 jump[7]; + u32 unused; /* Data for the above code */ - struct ppc64_opd_entry opd; + func_desc_t funcdata; }; -/* We use a stub to fix up r2 (TOC ptr) and to jump to the (external) - function which may be more than 24-bits away. We could simply - patch the new r2 value and function pointer into the stub, but it's - significantly shorter to put these values at the end of the stub - code, and patch the stub address (32-bits relative to the TOC ptr, - r2) into the stub. */ -static struct ppc64_stub_entry ppc64_stub = -{ .jump = { - 0x3d, 0x82, 0x00, 0x00, /* addis r12,r2, <high> */ - 0x39, 0x8c, 0x00, 0x00, /* addi r12,r12, <low> */ +/* + * PPC64 uses 24 bit jumps, but we need to jump into other modules or + * the kernel which may be further. So we jump to a stub. + * + * For ELFv1 we need to use this to set up the new r2 value (aka TOC + * pointer). For ELFv2 it's the callee's responsibility to set up the + * new r2, but for both we need to save the old r2. + * + * We could simply patch the new r2 value and function pointer into + * the stub, but it's significantly shorter to put these values at the + * end of the stub code, and patch the stub address (32-bits relative + * to the TOC ptr, r2) into the stub. + */ + +static u32 ppc64_stub_insns[] = { + 0x3d620000, /* addis r11,r2, <high> */ + 0x396b0000, /* addi r11,r11, <low> */ /* Save current r2 value in magic place on the stack. */ - 0xf8, 0x41, 0x00, 0x28, /* std r2,40(r1) */ - 0xe9, 0x6c, 0x00, 0x20, /* ld r11,32(r12) */ - 0xe8, 0x4c, 0x00, 0x28, /* ld r2,40(r12) */ - 0x7d, 0x69, 0x03, 0xa6, /* mtctr r11 */ - 0x4e, 0x80, 0x04, 0x20 /* bctr */ -} }; + 0xf8410000|R2_STACK_OFFSET, /* std r2,R2_STACK_OFFSET(r1) */ + 0xe98b0020, /* ld r12,32(r11) */ +#if !defined(_CALL_ELF) || _CALL_ELF != 2 + /* Set up new r2 from function descriptor */ + 0xe84b0028, /* ld r2,40(r11) */ +#endif + 0x7d8903a6, /* mtctr r12 */ + 0x4e800420 /* bctr */ +}; + +#ifdef CONFIG_DYNAMIC_FTRACE + +static u32 ppc64_stub_mask[] = { + 0xffff0000, + 0xffff0000, + 0xffffffff, + 0xffffffff, +#if !defined(_CALL_ELF) || _CALL_ELF != 2 + 0xffffffff, +#endif + 0xffffffff, + 0xffffffff +}; + +bool is_module_trampoline(u32 *p) +{ + unsigned int i; + u32 insns[ARRAY_SIZE(ppc64_stub_insns)]; + + BUILD_BUG_ON(sizeof(ppc64_stub_insns) != sizeof(ppc64_stub_mask)); + + if (probe_kernel_read(insns, p, sizeof(insns))) + return -EFAULT; + + for (i = 0; i < ARRAY_SIZE(ppc64_stub_insns); i++) { + u32 insna = insns[i]; + u32 insnb = ppc64_stub_insns[i]; + u32 mask = ppc64_stub_mask[i]; + + if ((insna & mask) != (insnb & mask)) + return false; + } + + return true; +} + +int module_trampoline_target(struct module *mod, u32 *trampoline, + unsigned long *target) +{ + u32 buf[2]; + u16 upper, lower; + long offset; + void *toc_entry; + + if (probe_kernel_read(buf, trampoline, sizeof(buf))) + return -EFAULT; + + upper = buf[0] & 0xffff; + lower = buf[1] & 0xffff; + + /* perform the addis/addi, both signed */ + offset = ((short)upper << 16) + (short)lower; + + /* + * Now get the address this trampoline jumps to. This + * is always 32 bytes into our trampoline stub. + */ + toc_entry = (void *)mod->arch.toc + offset + 32; + + if (probe_kernel_read(target, toc_entry, sizeof(*target))) + return -EFAULT; + + return 0; +} + +#endif /* Count how many different 24-bit relocations (different symbol, different addend) */ @@ -173,17 +308,27 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, return relocs * sizeof(struct ppc64_stub_entry); } +/* Still needed for ELFv2, for .TOC. */ static void dedotify_versions(struct modversion_info *vers, unsigned long size) { struct modversion_info *end; for (end = (void *)vers + size; vers < end; vers++) - if (vers->name[0] == '.') + if (vers->name[0] == '.') { memmove(vers->name, vers->name+1, strlen(vers->name)); +#ifdef ARCH_RELOCATES_KCRCTAB + /* The TOC symbol has no CRC computed. To avoid CRC + * check failing, we must force it to the expected + * value (see CRC check in module.c). + */ + if (!strcmp(vers->name, "TOC.")) + vers->crc = -(unsigned long)reloc_start; +#endif + } } -/* Undefined symbols which refer to .funcname, hack to funcname */ +/* Undefined symbols which refer to .funcname, hack to funcname (or .TOC.) */ static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab) { unsigned int i; @@ -197,6 +342,24 @@ static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab) } } +static Elf64_Sym *find_dot_toc(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex) +{ + unsigned int i, numsyms; + Elf64_Sym *syms; + + syms = (Elf64_Sym *)sechdrs[symindex].sh_addr; + numsyms = sechdrs[symindex].sh_size / sizeof(Elf64_Sym); + + for (i = 1; i < numsyms; i++) { + if (syms[i].st_shndx == SHN_UNDEF + && strcmp(strtab + syms[i].st_name, "TOC.") == 0) + return &syms[i]; + } + return NULL; +} + int module_frob_arch_sections(Elf64_Ehdr *hdr, Elf64_Shdr *sechdrs, char *secstrings, @@ -261,16 +424,12 @@ static inline unsigned long my_r2(Elf64_Shdr *sechdrs, struct module *me) /* Patch stub to reference function and correct r2 value. */ static inline int create_stub(Elf64_Shdr *sechdrs, struct ppc64_stub_entry *entry, - struct ppc64_opd_entry *opd, + unsigned long addr, struct module *me) { - Elf64_Half *loc1, *loc2; long reladdr; - *entry = ppc64_stub; - - loc1 = (Elf64_Half *)&entry->jump[2]; - loc2 = (Elf64_Half *)&entry->jump[6]; + memcpy(entry->jump, ppc64_stub_insns, sizeof(ppc64_stub_insns)); /* Stub uses address relative to r2. */ reladdr = (unsigned long)entry - my_r2(sechdrs, me); @@ -281,35 +440,33 @@ static inline int create_stub(Elf64_Shdr *sechdrs, } DEBUGP("Stub %p get data from reladdr %li\n", entry, reladdr); - *loc1 = PPC_HA(reladdr); - *loc2 = PPC_LO(reladdr); - entry->opd.funcaddr = opd->funcaddr; - entry->opd.r2 = opd->r2; + entry->jump[0] |= PPC_HA(reladdr); + entry->jump[1] |= PPC_LO(reladdr); + entry->funcdata = func_desc(addr); return 1; } -/* Create stub to jump to function described in this OPD: we need the +/* Create stub to jump to function described in this OPD/ptr: we need the stub to set up the TOC ptr (r2) for the function. */ static unsigned long stub_for_addr(Elf64_Shdr *sechdrs, - unsigned long opdaddr, + unsigned long addr, struct module *me) { struct ppc64_stub_entry *stubs; - struct ppc64_opd_entry *opd = (void *)opdaddr; unsigned int i, num_stubs; num_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*stubs); /* Find this stub, or if that fails, the next avail. entry */ stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr; - for (i = 0; stubs[i].opd.funcaddr; i++) { + for (i = 0; stub_func_addr(stubs[i].funcdata); i++) { BUG_ON(i >= num_stubs); - if (stubs[i].opd.funcaddr == opd->funcaddr) + if (stub_func_addr(stubs[i].funcdata) == func_addr(addr)) return (unsigned long)&stubs[i]; } - if (!create_stub(sechdrs, &stubs[i], opd, me)) + if (!create_stub(sechdrs, &stubs[i], addr, me)) return 0; return (unsigned long)&stubs[i]; @@ -324,7 +481,8 @@ static int restore_r2(u32 *instruction, struct module *me) me->name, *instruction); return 0; } - *instruction = 0xe8410028; /* ld r2,40(r1) */ + /* ld r2,R2_STACK_OFFSET(r1) */ + *instruction = 0xe8410000 | R2_STACK_OFFSET; return 1; } @@ -342,6 +500,17 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, DEBUGP("Applying ADD relocate section %u to %u\n", relsec, sechdrs[relsec].sh_info); + + /* First time we're called, we can fix up .TOC. */ + if (!me->arch.toc_fixed) { + sym = find_dot_toc(sechdrs, strtab, symindex); + /* It's theoretically possible that a module doesn't want a + * .TOC. so don't fail it just for that. */ + if (sym) + sym->st_value = my_r2(sechdrs, me); + me->arch.toc_fixed = true; + } + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rela); i++) { /* This is where to make the change */ location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr @@ -386,6 +555,14 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, | (value & 0xffff); break; + case R_PPC64_TOC16_LO: + /* Subtract TOC pointer */ + value -= my_r2(sechdrs, me); + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xffff) + | (value & 0xffff); + break; + case R_PPC64_TOC16_DS: /* Subtract TOC pointer */ value -= my_r2(sechdrs, me); @@ -399,6 +576,28 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, | (value & 0xfffc); break; + case R_PPC64_TOC16_LO_DS: + /* Subtract TOC pointer */ + value -= my_r2(sechdrs, me); + if ((value & 3) != 0) { + printk("%s: bad TOC16_LO_DS relocation (%lu)\n", + me->name, value); + return -ENOEXEC; + } + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xfffc) + | (value & 0xfffc); + break; + + case R_PPC64_TOC16_HA: + /* Subtract TOC pointer */ + value -= my_r2(sechdrs, me); + value = ((value + 0x8000) >> 16); + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xffff) + | (value & 0xffff); + break; + case R_PPC_REL24: /* FIXME: Handle weak symbols here --RR */ if (sym->st_shndx == SHN_UNDEF) { @@ -408,7 +607,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, return -ENOENT; if (!restore_r2((u32 *)location + 1, me)) return -ENOEXEC; - } + } else + value += local_entry_offset(sym); /* Convert value to relative */ value -= (unsigned long)location; @@ -429,6 +629,31 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, *location = value - (unsigned long)location; break; + case R_PPC64_TOCSAVE: + /* + * Marker reloc indicates we don't have to save r2. + * That would only save us one instruction, so ignore + * it. + */ + break; + + case R_PPC64_REL16_HA: + /* Subtract location pointer */ + value -= (unsigned long)location; + value = ((value + 0x8000) >> 16); + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xffff) + | (value & 0xffff); + break; + + case R_PPC64_REL16_LO: + /* Subtract location pointer */ + value -= (unsigned long)location; + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xffff) + | (value & 0xffff); + break; + default: printk("%s: Unknown ADD relocation: %lu\n", me->name, diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index bec1e930ed7..28b898e6818 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -84,22 +84,30 @@ static ssize_t dev_nvram_read(struct file *file, char __user *buf, char *tmp = NULL; ssize_t size; - ret = -ENODEV; - if (!ppc_md.nvram_size) + if (!ppc_md.nvram_size) { + ret = -ENODEV; goto out; + } - ret = 0; size = ppc_md.nvram_size(); - if (*ppos >= size || size < 0) + if (size < 0) { + ret = size; goto out; + } + + if (*ppos >= size) { + ret = 0; + goto out; + } count = min_t(size_t, count, size - *ppos); count = min(count, PAGE_SIZE); - ret = -ENOMEM; tmp = kmalloc(count, GFP_KERNEL); - if (!tmp) + if (!tmp) { + ret = -ENOMEM; goto out; + } ret = ppc_md.nvram_read(tmp, count, ppos); if (ret <= 0) @@ -202,7 +210,7 @@ static void __init nvram_print_partitions(char * label) printk(KERN_WARNING "--------%s---------\n", label); printk(KERN_WARNING "indx\t\tsig\tchks\tlen\tname\n"); list_for_each_entry(tmp_part, &nvram_partitions, partition) { - printk(KERN_WARNING "%4d \t%02x\t%02x\t%d\t%12s\n", + printk(KERN_WARNING "%4d \t%02x\t%02x\t%d\t%12.12s\n", tmp_part->index, tmp_part->header.signature, tmp_part->header.checksum, tmp_part->header.length, tmp_part->header.name); @@ -215,9 +223,13 @@ static int __init nvram_write_header(struct nvram_partition * part) { loff_t tmp_index; int rc; - + struct nvram_header phead; + + memcpy(&phead, &part->header, NVRAM_HEADER_LEN); + phead.length = cpu_to_be16(phead.length); + tmp_index = part->index; - rc = ppc_md.nvram_write((char *)&part->header, NVRAM_HEADER_LEN, &tmp_index); + rc = ppc_md.nvram_write((char *)&phead, NVRAM_HEADER_LEN, &tmp_index); return rc; } @@ -497,6 +509,8 @@ int __init nvram_scan_partitions(void) memcpy(&phead, header, NVRAM_HEADER_LEN); + phead.length = be16_to_cpu(phead.length); + err = 0; c_sum = nvram_checksum(&phead); if (c_sum != phead.checksum) { @@ -511,8 +525,7 @@ int __init nvram_scan_partitions(void) "detected: 0-length partition\n"); goto out; } - tmp_part = (struct nvram_partition *) - kmalloc(sizeof(struct nvram_partition), GFP_KERNEL); + tmp_part = kmalloc(sizeof(struct nvram_partition), GFP_KERNEL); err = -ENOMEM; if (!tmp_part) { printk(KERN_ERR "nvram_scan_partitions: kmalloc failed\n"); diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 07c12697d70..a7b74307672 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -71,10 +71,8 @@ static int of_pci_phb_probe(struct platform_device *dev) eeh_dev_phb_init_dynamic(phb); /* Register devices with EEH */ -#ifdef CONFIG_EEH if (dev->dev.of_node->child) eeh_add_device_tree_early(dev->dev.of_node); -#endif /* CONFIG_EEH */ /* Scan the bus */ pcibios_scan_phb(phb); @@ -88,13 +86,14 @@ static int of_pci_phb_probe(struct platform_device *dev) pcibios_claim_one_bus(phb->bus); /* Finish EEH setup */ -#ifdef CONFIG_EEH eeh_add_device_tree_late(phb->bus); -#endif /* Add probed PCI devices to the device model */ pci_bus_add_devices(phb->bus); + /* sysfs files should only be added after devices are added */ + eeh_add_sysfs_files(phb->bus); + return 0; } diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index cd6da855090..d6e195e8cd4 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -34,10 +34,10 @@ extern unsigned long __toc_start; */ struct lppaca lppaca[] = { [0 ... (NR_LPPACAS-1)] = { - .desc = 0xd397d781, /* "LpPa" */ - .size = sizeof(struct lppaca), + .desc = cpu_to_be32(0xd397d781), /* "LpPa" */ + .size = cpu_to_be16(sizeof(struct lppaca)), .fpregs_in_use = 1, - .slb_count = 64, + .slb_count = cpu_to_be16(64), .vmxregs_in_use = 0, .page_ins = 0, }, @@ -46,7 +46,7 @@ struct lppaca lppaca[] = { static struct lppaca *extra_lppacas; static long __initdata lppaca_size; -static void allocate_lppacas(int nr_cpus, unsigned long limit) +static void __init allocate_lppacas(int nr_cpus, unsigned long limit) { if (nr_cpus <= NR_LPPACAS) return; @@ -57,7 +57,7 @@ static void allocate_lppacas(int nr_cpus, unsigned long limit) PAGE_SIZE, limit)); } -static struct lppaca *new_lppaca(int cpu) +static struct lppaca * __init new_lppaca(int cpu) { struct lppaca *lp; @@ -70,7 +70,7 @@ static struct lppaca *new_lppaca(int cpu) return lp; } -static void free_lppacas(void) +static void __init free_lppacas(void) { long new_size = 0, nr; @@ -98,13 +98,32 @@ static inline void free_lppacas(void) { } /* * 3 persistent SLBs are registered here. The buffer will be zero * initially, hence will all be invaild until we actually write them. + * + * If you make the number of persistent SLB entries dynamic, please also + * update PR KVM to flush and restore them accordingly. */ -struct slb_shadow slb_shadow[] __cacheline_aligned = { - [0 ... (NR_CPUS-1)] = { - .persistent = SLB_NUM_BOLTED, - .buffer_length = sizeof(struct slb_shadow), - }, -}; +static struct slb_shadow *slb_shadow; + +static void __init allocate_slb_shadows(int nr_cpus, int limit) +{ + int size = PAGE_ALIGN(sizeof(struct slb_shadow) * nr_cpus); + slb_shadow = __va(memblock_alloc_base(size, PAGE_SIZE, limit)); + memset(slb_shadow, 0, size); +} + +static struct slb_shadow * __init init_slb_shadow(int cpu) +{ + struct slb_shadow *s = &slb_shadow[cpu]; + + s->persistent = cpu_to_be32(SLB_NUM_BOLTED); + s->buffer_length = cpu_to_be32(sizeof(*s)); + + return s; +} + +#else /* CONFIG_PPC_STD_MMU_64 */ + +static void __init allocate_slb_shadows(int nr_cpus, int limit) { } #endif /* CONFIG_PPC_STD_MMU_64 */ @@ -120,8 +139,6 @@ struct slb_shadow slb_shadow[] __cacheline_aligned = { struct paca_struct *paca; EXPORT_SYMBOL(paca); -struct paca_struct boot_paca; - void __init initialise_paca(struct paca_struct *new_paca, int cpu) { /* The TOC register (GPR2) points 32kB into the TOC, so that 64kB @@ -138,14 +155,20 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu) new_paca->paca_index = cpu; new_paca->kernel_toc = kernel_toc; new_paca->kernelbase = (unsigned long) _stext; - new_paca->kernel_msr = MSR_KERNEL; + /* Only set MSR:IR/DR when MMU is initialized */ + new_paca->kernel_msr = MSR_KERNEL & ~(MSR_IR | MSR_DR); new_paca->hw_cpu_id = 0xffff; new_paca->kexec_state = KEXEC_STATE_NONE; new_paca->__current = &init_task; new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL; #ifdef CONFIG_PPC_STD_MMU_64 - new_paca->slb_shadow_ptr = &slb_shadow[cpu]; + new_paca->slb_shadow_ptr = init_slb_shadow(cpu); #endif /* CONFIG_PPC_STD_MMU_64 */ + +#ifdef CONFIG_PPC_BOOK3E + /* For now -- if we have threads this will be adjusted later */ + new_paca->tcd_ptr = &new_paca->tcd; +#endif } /* Put the paca pointer into r13 and SPRG_PACA */ @@ -192,6 +215,8 @@ void __init allocate_pacas(void) allocate_lppacas(nr_cpu_ids, limit); + allocate_slb_shadows(nr_cpu_ids, limit); + /* Can't use for_each_*_cpu, as they aren't functional yet */ for (cpu = 0; cpu < nr_cpu_ids; cpu++) initialise_paca(&paca[cpu], cpu); diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 7c37379ea9b..b49c72fd7f1 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -21,6 +21,7 @@ #include <linux/string.h> #include <linux/init.h> #include <linux/bootmem.h> +#include <linux/delay.h> #include <linux/export.h> #include <linux/of_address.h> #include <linux/of_pci.h> @@ -30,6 +31,7 @@ #include <linux/irq.h> #include <linux/vmalloc.h> #include <linux/slab.h> +#include <linux/vgaarb.h> #include <asm/processor.h> #include <asm/io.h> @@ -119,6 +121,25 @@ resource_size_t pcibios_window_alignment(struct pci_bus *bus, return 1; } +void pcibios_reset_secondary_bus(struct pci_dev *dev) +{ + u16 ctrl; + + if (ppc_md.pcibios_reset_secondary_bus) { + ppc_md.pcibios_reset_secondary_bus(dev); + return; + } + + pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl); + ctrl |= PCI_BRIDGE_CTL_BUS_RESET; + pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl); + msleep(2); + + ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; + pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl); + ssleep(1); +} + static resource_size_t pcibios_io_size(const struct pci_controller *hose) { #ifdef CONFIG_PPC64 @@ -200,26 +221,6 @@ struct pci_controller* pci_find_hose_for_OF_device(struct device_node* node) return NULL; } -static ssize_t pci_show_devspec(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pci_dev *pdev; - struct device_node *np; - - pdev = to_pci_dev (dev); - np = pci_device_to_OF_node(pdev); - if (np == NULL || np->full_name == NULL) - return 0; - return sprintf(buf, "%s", np->full_name); -} -static DEVICE_ATTR(devspec, S_IRUGO, pci_show_devspec, NULL); - -/* Add sysfs properties */ -int pcibios_add_platform_entries(struct pci_dev *pdev) -{ - return device_create_file(&pdev->dev, &dev_attr_devspec); -} - /* * Reads the interrupt pin to determine if interrupt is use by card. * If the interrupt is used, then gets the interrupt line from the @@ -227,7 +228,7 @@ int pcibios_add_platform_entries(struct pci_dev *pdev) */ static int pci_read_irq_line(struct pci_dev *pci_dev) { - struct of_irq oirq; + struct of_phandle_args oirq; unsigned int virq; pr_debug("PCI: Try to map irq for %s...\n", pci_name(pci_dev)); @@ -236,7 +237,7 @@ static int pci_read_irq_line(struct pci_dev *pci_dev) memset(&oirq, 0xff, sizeof(oirq)); #endif /* Try to get a mapping from the device-tree */ - if (of_irq_map_pci(pci_dev, &oirq)) { + if (of_irq_parse_pci(pci_dev, &oirq)) { u8 line, pin; /* If that fails, lets fallback to what is in the config @@ -262,11 +263,10 @@ static int pci_read_irq_line(struct pci_dev *pci_dev) irq_set_irq_type(virq, IRQ_TYPE_LEVEL_LOW); } else { pr_debug(" Got one, spec %d cells (0x%08x 0x%08x...) on %s\n", - oirq.size, oirq.specifier[0], oirq.specifier[1], - of_node_full_name(oirq.controller)); + oirq.args_count, oirq.args[0], oirq.args[1], + of_node_full_name(oirq.np)); - virq = irq_create_of_mapping(oirq.controller, oirq.specifier, - oirq.size); + virq = irq_create_of_mapping(&oirq); } if(virq == NO_IRQ) { pr_debug(" Failed to map !\n"); @@ -305,7 +305,7 @@ static struct resource *__pci_mmap_make_offset(struct pci_dev *dev, unsigned long io_offset = 0; int i, res_bit; - if (hose == 0) + if (hose == NULL) return NULL; /* should never happen */ /* If memory, add on the PCI bridge address offset */ @@ -358,7 +358,6 @@ static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp, enum pci_mmap_state mmap_state, int write_combine) { - unsigned long prot = pgprot_val(protection); /* Write combine is always 0 on non-memory space mappings. On * memory space, if the user didn't pass 1, we check for a @@ -375,9 +374,9 @@ static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp, /* XXX would be nice to have a way to ask for write-through */ if (write_combine) - return pgprot_noncached_wc(prot); + return pgprot_noncached_wc(protection); else - return pgprot_noncached(prot); + return pgprot_noncached(protection); } /* @@ -657,15 +656,6 @@ void pci_resource_to_user(const struct pci_dev *dev, int bar, * ranges. However, some machines (thanks Apple !) tend to split their * space into lots of small contiguous ranges. So we have to coalesce. * - * - We can only cope with all memory ranges having the same offset - * between CPU addresses and PCI addresses. Unfortunately, some bridges - * are setup for a large 1:1 mapping along with a small "window" which - * maps PCI address 0 to some arbitrary high address of the CPU space in - * order to give access to the ISA memory hole. - * The way out of here that I've chosen for now is to always set the - * offset based on the first resource found, then override it if we - * have a different offset and the previous was set by an ISA hole. - * * - Some busses have IO space not starting at 0, which causes trouble with * the way we do our IO resource renumbering. The code somewhat deals with * it for 64 bits but I would expect problems on 32 bits. @@ -676,61 +666,36 @@ void pci_resource_to_user(const struct pci_dev *dev, int bar, void pci_process_bridge_OF_ranges(struct pci_controller *hose, struct device_node *dev, int primary) { - const u32 *ranges; - int rlen; - int pna = of_n_addr_cells(dev); - int np = pna + 5; - int memno = 0, isa_hole = -1; - u32 pci_space; - unsigned long long pci_addr, cpu_addr, pci_next, cpu_next, size; - unsigned long long isa_mb = 0; + int memno = 0; struct resource *res; + struct of_pci_range range; + struct of_pci_range_parser parser; printk(KERN_INFO "PCI host bridge %s %s ranges:\n", dev->full_name, primary ? "(primary)" : ""); - /* Get ranges property */ - ranges = of_get_property(dev, "ranges", &rlen); - if (ranges == NULL) + /* Check for ranges property */ + if (of_pci_range_parser_init(&parser, dev)) return; /* Parse it */ - while ((rlen -= np * 4) >= 0) { - /* Read next ranges element */ - pci_space = ranges[0]; - pci_addr = of_read_number(ranges + 1, 2); - cpu_addr = of_translate_address(dev, ranges + 3); - size = of_read_number(ranges + pna + 3, 2); - ranges += np; - + for_each_of_pci_range(&parser, &range) { /* If we failed translation or got a zero-sized region * (some FW try to feed us with non sensical zero sized regions * such as power3 which look like some kind of attempt at exposing * the VGA memory hole) */ - if (cpu_addr == OF_BAD_ADDR || size == 0) + if (range.cpu_addr == OF_BAD_ADDR || range.size == 0) continue; - /* Now consume following elements while they are contiguous */ - for (; rlen >= np * sizeof(u32); - ranges += np, rlen -= np * 4) { - if (ranges[0] != pci_space) - break; - pci_next = of_read_number(ranges + 1, 2); - cpu_next = of_translate_address(dev, ranges + 3); - if (pci_next != pci_addr + size || - cpu_next != cpu_addr + size) - break; - size += of_read_number(ranges + pna + 3, 2); - } - /* Act based on address space type */ res = NULL; - switch ((pci_space >> 24) & 0x3) { - case 1: /* PCI IO space */ + switch (range.flags & IORESOURCE_TYPE_BITS) { + case IORESOURCE_IO: printk(KERN_INFO " IO 0x%016llx..0x%016llx -> 0x%016llx\n", - cpu_addr, cpu_addr + size - 1, pci_addr); + range.cpu_addr, range.cpu_addr + range.size - 1, + range.pci_addr); /* We support only one IO range */ if (hose->pci_io_size) { @@ -740,11 +705,12 @@ void pci_process_bridge_OF_ranges(struct pci_controller *hose, } #ifdef CONFIG_PPC32 /* On 32 bits, limit I/O space to 16MB */ - if (size > 0x01000000) - size = 0x01000000; + if (range.size > 0x01000000) + range.size = 0x01000000; /* 32 bits needs to map IOs here */ - hose->io_base_virt = ioremap(cpu_addr, size); + hose->io_base_virt = ioremap(range.cpu_addr, + range.size); /* Expect trouble if pci_addr is not 0 */ if (primary) @@ -754,20 +720,20 @@ void pci_process_bridge_OF_ranges(struct pci_controller *hose, /* pci_io_size and io_base_phys always represent IO * space starting at 0 so we factor in pci_addr */ - hose->pci_io_size = pci_addr + size; - hose->io_base_phys = cpu_addr - pci_addr; + hose->pci_io_size = range.pci_addr + range.size; + hose->io_base_phys = range.cpu_addr - range.pci_addr; /* Build resource */ res = &hose->io_resource; - res->flags = IORESOURCE_IO; - res->start = pci_addr; + range.cpu_addr = range.pci_addr; break; - case 2: /* PCI Memory space */ - case 3: /* PCI 64 bits Memory space */ + case IORESOURCE_MEM: printk(KERN_INFO " MEM 0x%016llx..0x%016llx -> 0x%016llx %s\n", - cpu_addr, cpu_addr + size - 1, pci_addr, - (pci_space & 0x40000000) ? "Prefetch" : ""); + range.cpu_addr, range.cpu_addr + range.size - 1, + range.pci_addr, + (range.pci_space & 0x40000000) ? + "Prefetch" : ""); /* We support only 3 memory ranges */ if (memno >= 3) { @@ -776,60 +742,23 @@ void pci_process_bridge_OF_ranges(struct pci_controller *hose, continue; } /* Handles ISA memory hole space here */ - if (pci_addr == 0) { - isa_mb = cpu_addr; - isa_hole = memno; + if (range.pci_addr == 0) { if (primary || isa_mem_base == 0) - isa_mem_base = cpu_addr; - hose->isa_mem_phys = cpu_addr; - hose->isa_mem_size = size; - } - - /* We get the PCI/Mem offset from the first range or - * the, current one if the offset came from an ISA - * hole. If they don't match, bugger. - */ - if (memno == 0 || - (isa_hole >= 0 && pci_addr != 0 && - hose->pci_mem_offset == isa_mb)) - hose->pci_mem_offset = cpu_addr - pci_addr; - else if (pci_addr != 0 && - hose->pci_mem_offset != cpu_addr - pci_addr) { - printk(KERN_INFO - " \\--> Skipped (offset mismatch) !\n"); - continue; + isa_mem_base = range.cpu_addr; + hose->isa_mem_phys = range.cpu_addr; + hose->isa_mem_size = range.size; } /* Build resource */ + hose->mem_offset[memno] = range.cpu_addr - + range.pci_addr; res = &hose->mem_resources[memno++]; - res->flags = IORESOURCE_MEM; - if (pci_space & 0x40000000) - res->flags |= IORESOURCE_PREFETCH; - res->start = cpu_addr; break; } if (res != NULL) { - res->name = dev->full_name; - res->end = res->start + size - 1; - res->parent = NULL; - res->sibling = NULL; - res->child = NULL; + of_pci_range_to_resource(&range, dev, res); } } - - /* If there's an ISA hole and the pci_mem_offset is -not- matching - * the ISA hole offset, then we need to remove the ISA hole from - * the resource list for that brige - */ - if (isa_hole >= 0 && hose->pci_mem_offset != isa_mb) { - unsigned int next = isa_hole + 1; - printk(KERN_INFO " Removing ISA hole at 0x%016llx\n", isa_mb); - if (next < memno) - memmove(&hose->mem_resources[isa_hole], - &hose->mem_resources[next], - sizeof(struct resource) * (memno - next)); - hose->mem_resources[--memno].flags = 0; - } } /* Decide whether to display the domain number in /proc */ @@ -844,6 +773,14 @@ int pci_proc_domain(struct pci_bus *bus) return 1; } +int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) +{ + if (ppc_md.pcibios_root_bridge_prepare) + return ppc_md.pcibios_root_bridge_prepare(bridge); + + return 0; +} + /* This header fixup will do the resource fixup for all devices as they are * probed, but not for bridge ranges */ @@ -859,6 +796,7 @@ static void pcibios_fixup_resources(struct pci_dev *dev) } for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { struct resource *res = dev->resource + i; + struct pci_bus_region reg; if (!res->flags) continue; @@ -867,8 +805,9 @@ static void pcibios_fixup_resources(struct pci_dev *dev) * at 0 as unset as well, except if PCI_PROBE_ONLY is also set * since in that case, we don't want to re-assign anything */ + pcibios_resource_to_bus(dev->bus, ®, res); if (pci_has_flag(PCI_REASSIGN_ALL_RSRC) || - (res->start == 0 && !pci_has_flag(PCI_PROBE_ONLY))) { + (reg.start == 0 && !pci_has_flag(PCI_PROBE_ONLY))) { /* Only print message if not re-assigning */ if (!pci_has_flag(PCI_REASSIGN_ALL_RSRC)) pr_debug("PCI:%s Resource %d %016llx-%016llx [%x] " @@ -907,6 +846,7 @@ static int pcibios_uninitialized_bridge_resource(struct pci_bus *bus, struct pci_controller *hose = pci_bus_to_host(bus); struct pci_dev *dev = bus->self; resource_size_t offset; + struct pci_bus_region region; u16 command; int i; @@ -916,10 +856,10 @@ static int pcibios_uninitialized_bridge_resource(struct pci_bus *bus, /* Job is a bit different between memory and IO */ if (res->flags & IORESOURCE_MEM) { - /* If the BAR is non-0 (res != pci_mem_offset) then it's probably been - * initialized by somebody - */ - if (res->start != hose->pci_mem_offset) + pcibios_resource_to_bus(dev->bus, ®ion, res); + + /* If the BAR is non-0 then it's probably been initialized */ + if (region.start != 0) return 0; /* The BAR is 0, let's check if memory decoding is enabled on @@ -931,11 +871,11 @@ static int pcibios_uninitialized_bridge_resource(struct pci_bus *bus, /* Memory decoding is enabled and the BAR is 0. If any of the bridge * resources covers that starting address (0 then it's good enough for - * us for memory + * us for memory space) */ for (i = 0; i < 3; i++) { if ((hose->mem_resources[i].flags & IORESOURCE_MEM) && - hose->mem_resources[i].start == hose->pci_mem_offset) + hose->mem_resources[i].start == hose->mem_offset[i]) return 0; } @@ -1023,6 +963,38 @@ void pcibios_setup_bus_self(struct pci_bus *bus) ppc_md.pci_dma_bus_setup(bus); } +static void pcibios_setup_device(struct pci_dev *dev) +{ + /* Fixup NUMA node as it may not be setup yet by the generic + * code and is needed by the DMA init + */ + set_dev_node(&dev->dev, pcibus_to_node(dev->bus)); + + /* Hook up default DMA ops */ + set_dma_ops(&dev->dev, pci_dma_ops); + set_dma_offset(&dev->dev, PCI_DRAM_OFFSET); + + /* Additional platform DMA/iommu setup */ + if (ppc_md.pci_dma_dev_setup) + ppc_md.pci_dma_dev_setup(dev); + + /* Read default IRQs and fixup if necessary */ + pci_read_irq_line(dev); + if (ppc_md.pci_irq_fixup) + ppc_md.pci_irq_fixup(dev); +} + +int pcibios_add_device(struct pci_dev *dev) +{ + /* + * We can only call pcibios_setup_device() after bus setup is complete, + * since some of the platform specific DMA setup code depends on it. + */ + if (dev->bus->is_added) + pcibios_setup_device(dev); + return 0; +} + void pcibios_setup_bus_devices(struct pci_bus *bus) { struct pci_dev *dev; @@ -1037,23 +1009,7 @@ void pcibios_setup_bus_devices(struct pci_bus *bus) if (dev->is_added) continue; - /* Fixup NUMA node as it may not be setup yet by the generic - * code and is needed by the DMA init - */ - set_dev_node(&dev->dev, pcibus_to_node(dev->bus)); - - /* Hook up default DMA ops */ - set_dma_ops(&dev->dev, pci_dma_ops); - set_dma_offset(&dev->dev, PCI_DRAM_OFFSET); - - /* Additional platform DMA/iommu setup */ - if (ppc_md.pci_dma_dev_setup) - ppc_md.pci_dma_dev_setup(dev); - - /* Read default IRQs and fixup if necessary */ - pci_read_irq_line(dev); - if (ppc_md.pci_irq_fixup) - ppc_md.pci_irq_fixup(dev); + pcibios_setup_device(dev); } } @@ -1068,8 +1024,7 @@ void pcibios_fixup_bus(struct pci_bus *bus) * bases. This is -not- called when generating the PCI tree from * the OF device-tree. */ - if (bus->self != NULL) - pci_read_bridge_bases(bus); + pci_read_bridge_bases(bus); /* Now fixup the bus bus */ pcibios_setup_bus_self(bus); @@ -1367,10 +1322,9 @@ static void __init pcibios_reserve_legacy_regions(struct pci_bus *bus) no_io: /* Check for memory */ - offset = hose->pci_mem_offset; - pr_debug("hose mem offset: %016llx\n", (unsigned long long)offset); for (i = 0; i < 3; i++) { pres = &hose->mem_resources[i]; + offset = hose->mem_offset[i]; if (!(pres->flags & IORESOURCE_MEM)) continue; pr_debug("hose mem res: %pR\n", pres); @@ -1476,12 +1430,17 @@ void pcibios_finish_adding_to_bus(struct pci_bus *bus) /* Allocate bus and devices resources */ pcibios_allocate_bus_resources(bus); pcibios_claim_one_bus(bus); + if (!pci_has_flag(PCI_PROBE_ONLY)) + pci_assign_unassigned_bus_resources(bus); + + /* Fixup EEH */ + eeh_add_device_tree_late(bus); /* Add new devices to global lists. Register in proc, sysfs. */ pci_bus_add_devices(bus); - /* Fixup EEH */ - eeh_add_device_tree_late(bus); + /* sysfs files should only be added after devices are added */ + eeh_add_sysfs_files(bus); } EXPORT_SYMBOL_GPL(pcibios_finish_adding_to_bus); @@ -1503,6 +1462,7 @@ static void pcibios_setup_phb_resources(struct pci_controller *hose, struct list_head *resources) { struct resource *res; + resource_size_t offset; int i; /* Hookup PHB IO resource */ @@ -1512,49 +1472,38 @@ static void pcibios_setup_phb_resources(struct pci_controller *hose, printk(KERN_WARNING "PCI: I/O resource not set for host" " bridge %s (domain %d)\n", hose->dn->full_name, hose->global_number); -#ifdef CONFIG_PPC32 - /* Workaround for lack of IO resource only on 32-bit */ - res->start = (unsigned long)hose->io_base_virt - isa_io_base; - res->end = res->start + IO_SPACE_LIMIT; - res->flags = IORESOURCE_IO; -#endif /* CONFIG_PPC32 */ - } + } else { + offset = pcibios_io_space_offset(hose); - pr_debug("PCI: PHB IO resource = %016llx-%016llx [%lx]\n", - (unsigned long long)res->start, - (unsigned long long)res->end, - (unsigned long)res->flags); - pci_add_resource_offset(resources, res, pcibios_io_space_offset(hose)); + pr_debug("PCI: PHB IO resource = %08llx-%08llx [%lx] off 0x%08llx\n", + (unsigned long long)res->start, + (unsigned long long)res->end, + (unsigned long)res->flags, + (unsigned long long)offset); + pci_add_resource_offset(resources, res, offset); + } /* Hookup PHB Memory resources */ for (i = 0; i < 3; ++i) { res = &hose->mem_resources[i]; if (!res->flags) { - if (i > 0) - continue; - printk(KERN_ERR "PCI: Memory resource 0 not set for " - "host bridge %s (domain %d)\n", - hose->dn->full_name, hose->global_number); -#ifdef CONFIG_PPC32 - /* Workaround for lack of MEM resource only on 32-bit */ - res->start = hose->pci_mem_offset; - res->end = (resource_size_t)-1LL; - res->flags = IORESOURCE_MEM; -#endif /* CONFIG_PPC32 */ + if (i == 0) + printk(KERN_ERR "PCI: Memory resource 0 not set for " + "host bridge %s (domain %d)\n", + hose->dn->full_name, hose->global_number); + continue; } + offset = hose->mem_offset[i]; + - pr_debug("PCI: PHB MEM resource %d = %016llx-%016llx [%lx]\n", i, + pr_debug("PCI: PHB MEM resource %d = %08llx-%08llx [%lx] off 0x%08llx\n", i, (unsigned long long)res->start, (unsigned long long)res->end, - (unsigned long)res->flags); - pci_add_resource_offset(resources, res, hose->pci_mem_offset); - } - - pr_debug("PCI: PHB MEM offset = %016llx\n", - (unsigned long long)hose->pci_mem_offset); - pr_debug("PCI: PHB IO offset = %08lx\n", - (unsigned long)hose->io_base_virt - _IO_BASE); + (unsigned long)res->flags, + (unsigned long long)offset); + pci_add_resource_offset(resources, res, offset); + } } /* @@ -1597,7 +1546,7 @@ fake_pci_bus(struct pci_controller *hose, int busnr) { static struct pci_bus bus; - if (hose == 0) { + if (hose == NULL) { printk(KERN_ERR "Can't find hose for PCI bus %d!\n", busnr); } bus.number = busnr; @@ -1693,12 +1642,8 @@ void pcibios_scan_phb(struct pci_controller *hose) /* Configure PCI Express settings */ if (bus && !pci_has_flag(PCI_PROBE_ONLY)) { struct pci_bus *child; - list_for_each_entry(child, &bus->children, node) { - struct pci_dev *self = child->self; - if (!self) - continue; - pcie_bus_configure_settings(child, self->pcie_mpss); - } + list_for_each_entry(child, &bus->children, node) + pcie_bus_configure_settings(child); } } @@ -1722,3 +1667,15 @@ static void fixup_hide_host_resource_fsl(struct pci_dev *dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MOTOROLA, PCI_ANY_ID, fixup_hide_host_resource_fsl); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, PCI_ANY_ID, fixup_hide_host_resource_fsl); + +static void fixup_vga(struct pci_dev *pdev) +{ + u16 cmd; + + pci_read_config_word(pdev, PCI_COMMAND, &cmd); + if ((cmd & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) || !vga_default_device()) + vga_set_default_device(pdev); + +} +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID, + PCI_CLASS_DISPLAY_VGA, 8, fixup_vga); diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c new file mode 100644 index 00000000000..5b789177aa2 --- /dev/null +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -0,0 +1,109 @@ +/* + * Derived from "arch/powerpc/platforms/pseries/pci_dlpar.c" + * + * Copyright (C) 2003 Linda Xie <lxie@us.ibm.com> + * Copyright (C) 2005 International Business Machines + * + * Updates, 2005, John Rose <johnrose@austin.ibm.com> + * Updates, 2005, Linas Vepstas <linas@austin.ibm.com> + * Updates, 2013, Gavin Shan <shangw@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/pci.h> +#include <linux/export.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> +#include <asm/firmware.h> +#include <asm/eeh.h> + +/** + * pcibios_release_device - release PCI device + * @dev: PCI device + * + * The function is called before releasing the indicated PCI device. + */ +void pcibios_release_device(struct pci_dev *dev) +{ + eeh_remove_device(dev); +} + +/** + * pcibios_remove_pci_devices - remove all devices under this bus + * @bus: the indicated PCI bus + * + * Remove all of the PCI devices under this bus both from the + * linux pci device tree, and from the powerpc EEH address cache. + */ +void pcibios_remove_pci_devices(struct pci_bus *bus) +{ + struct pci_dev *dev, *tmp; + struct pci_bus *child_bus; + + /* First go down child busses */ + list_for_each_entry(child_bus, &bus->children, node) + pcibios_remove_pci_devices(child_bus); + + pr_debug("PCI: Removing devices on bus %04x:%02x\n", + pci_domain_nr(bus), bus->number); + list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) { + pr_debug(" Removing %s...\n", pci_name(dev)); + pci_stop_and_remove_bus_device(dev); + } +} + +EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); + +/** + * pcibios_add_pci_devices - adds new pci devices to bus + * @bus: the indicated PCI bus + * + * This routine will find and fixup new pci devices under + * the indicated bus. This routine presumes that there + * might already be some devices under this bridge, so + * it carefully tries to add only new devices. (And that + * is how this routine differs from other, similar pcibios + * routines.) + */ +void pcibios_add_pci_devices(struct pci_bus * bus) +{ + int slotno, mode, pass, max; + struct pci_dev *dev; + struct device_node *dn = pci_bus_to_OF_node(bus); + + eeh_add_device_tree_early(dn); + + mode = PCI_PROBE_NORMAL; + if (ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + + if (mode == PCI_PROBE_DEVTREE) { + /* use ofdt-based probe */ + of_rescan_bus(dn, bus); + } else if (mode == PCI_PROBE_NORMAL) { + /* + * Use legacy probe. In the partial hotplug case, we + * probably have grandchildren devices unplugged. So + * we don't check the return value from pci_scan_slot() in + * order for fully rescan all the way down to pick them up. + * They can have been removed during partial hotplug. + */ + slotno = PCI_SLOT(PCI_DN(dn->child)->devfn); + pci_scan_slot(bus, PCI_DEVFN(slotno, 0)); + pcibios_setup_bus_devices(bus); + max = bus->busn_res.start; + for (pass = 0; pass < 2; pass++) { + list_for_each_entry(dev, &bus->devices, bus_list) { + if (pci_is_bridge(dev)) + max = pci_scan_bridge(bus, dev, + max, pass); + } + } + } + pcibios_finish_adding_to_bus(bus); +} +EXPORT_SYMBOL_GPL(pcibios_add_pci_devices); diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index e37c2152acf..432459c817f 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -295,7 +295,7 @@ long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn) case IOBASE_BRIDGE_NUMBER: return (long)hose->first_busno; case IOBASE_MEMORY: - return (long)hose->pci_mem_offset; + return (long)hose->mem_offset[0]; case IOBASE_IO: return (long)hose->io_base_phys; case IOBASE_ISA_IO: diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 51a133a78a0..155013da27e 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -109,7 +109,7 @@ int pcibios_unmap_io_space(struct pci_bus *bus) hose = pci_bus_to_host(bus); /* Check if we have IOs allocated */ - if (hose->io_base_alloc == 0) + if (hose->io_base_alloc == NULL) return 0; pr_debug("IO unmapping for PHB %s\n", hose->dn->full_name); @@ -208,8 +208,7 @@ long sys_pciconfig_iobase(long which, unsigned long in_bus, unsigned long in_devfn) { struct pci_controller* hose; - struct list_head *ln; - struct pci_bus *bus = NULL; + struct pci_bus *tmp_bus, *bus = NULL; struct device_node *hose_node; /* Argh ! Please forgive me for that hack, but that's the @@ -230,11 +229,12 @@ long sys_pciconfig_iobase(long which, unsigned long in_bus, * used on pre-domains setup. We return the first match */ - for (ln = pci_root_buses.next; ln != &pci_root_buses; ln = ln->next) { - bus = pci_bus_b(ln); - if (in_bus >= bus->number && in_bus <= bus->busn_res.end) + list_for_each_entry(tmp_bus, &pci_root_buses, node) { + if (in_bus >= tmp_bus->number && + in_bus <= tmp_bus->busn_res.end) { + bus = tmp_bus; break; - bus = NULL; + } } if (bus == NULL || bus->dev.of_node == NULL) return -ENODEV; @@ -246,7 +246,7 @@ long sys_pciconfig_iobase(long which, unsigned long in_bus, case IOBASE_BRIDGE_NUMBER: return (long)hose->first_busno; case IOBASE_MEMORY: - return (long)hose->pci_mem_offset; + return (long)hose->mem_offset[0]; case IOBASE_IO: return (long)hose->io_base_phys; case IOBASE_ISA_IO: @@ -266,3 +266,13 @@ int pcibus_to_node(struct pci_bus *bus) } EXPORT_SYMBOL(pcibus_to_node); #endif + +static void quirk_radeon_32bit_msi(struct pci_dev *dev) +{ + struct pci_dn *pdn = pci_get_pdn(dev); + + if (pdn) + pdn->force_32bit_msi = true; +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x68f2, quirk_radeon_32bit_msi); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0xaa68, quirk_radeon_32bit_msi); diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index e7af165f8b9..1f61fab59d9 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -32,6 +32,14 @@ #include <asm/ppc-pci.h> #include <asm/firmware.h> +struct pci_dn *pci_get_pdn(struct pci_dev *pdev) +{ + struct device_node *dn = pci_device_to_OF_node(pdev); + if (!dn) + return NULL; + return PCI_DN(dn); +} + /* * Traverse_func that inits the PCI fields of the device node. * NOTE: this *must* be done before read/write config to the device. @@ -39,9 +47,8 @@ void *update_dn_pci_info(struct device_node *dn, void *data) { struct pci_controller *phb = data; - const int *type = - of_get_property(dn, "ibm,pci-config-space-type", NULL); - const u32 *regs; + const __be32 *type = of_get_property(dn, "ibm,pci-config-space-type", NULL); + const __be32 *regs; struct pci_dn *pdn; pdn = zalloc_maybe_bootmem(sizeof(*pdn), GFP_KERNEL); @@ -55,12 +62,14 @@ void *update_dn_pci_info(struct device_node *dn, void *data) #endif regs = of_get_property(dn, "reg", NULL); if (regs) { + u32 addr = of_read_number(regs, 1); + /* First register entry is addr (00BBSS00) */ - pdn->busno = (regs[0] >> 16) & 0xff; - pdn->devfn = (regs[0] >> 8) & 0xff; + pdn->busno = (addr >> 16) & 0xff; + pdn->devfn = (addr >> 8) & 0xff; } - pdn->pci_ext_config_space = (type && *type == 1); + pdn->pci_ext_config_space = (type && of_read_number(type, 1) == 1); return NULL; } @@ -90,12 +99,13 @@ void *traverse_pci_devices(struct device_node *start, traverse_func pre, /* We started with a phb, iterate all childs */ for (dn = start->child; dn; dn = nextdn) { - const u32 *classp; - u32 class; + const __be32 *classp; + u32 class = 0; nextdn = NULL; classp = of_get_property(dn, "class-code", NULL); - class = classp ? *classp : 0; + if (classp) + class = of_read_number(classp, 1); if (pre && ((ret = pre(dn, data)) != NULL)) return ret; diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index 2a67e9baa59..44562aa97f1 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -24,12 +24,12 @@ */ static u32 get_int_prop(struct device_node *np, const char *name, u32 def) { - const u32 *prop; + const __be32 *prop; int len; prop = of_get_property(np, name, &len); if (prop && len >= 4) - return *prop; + return of_read_number(prop, 1); return def; } @@ -77,7 +77,7 @@ static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev) unsigned int flags; struct pci_bus_region region; struct resource *res; - const u32 *addrs; + const __be32 *addrs; u32 i; int proplen; @@ -86,14 +86,14 @@ static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev) return; pr_debug(" parse addresses (%d bytes) @ %p\n", proplen, addrs); for (; proplen >= 20; proplen -= 20, addrs += 5) { - flags = pci_parse_of_flags(addrs[0], 0); + flags = pci_parse_of_flags(of_read_number(addrs, 1), 0); if (!flags) continue; base = of_read_number(&addrs[1], 2); size = of_read_number(&addrs[3], 2); if (!size) continue; - i = addrs[0] & 0xff; + i = of_read_number(addrs, 1) & 0xff; pr_debug(" base: %llx, size: %llx, i: %x\n", (unsigned long long)base, (unsigned long long)size, i); @@ -111,7 +111,7 @@ static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev) res->name = pci_name(dev); region.start = base; region.end = base + size - 1; - pcibios_bus_to_resource(dev, res, ®ion); + pcibios_bus_to_resource(dev->bus, res, ®ion); } } @@ -128,7 +128,7 @@ struct pci_dev *of_create_pci_dev(struct device_node *node, const char *type; struct pci_slot *slot; - dev = alloc_pci_dev(); + dev = pci_alloc_dev(bus); if (!dev) return NULL; type = of_get_property(node, "device_type", NULL); @@ -137,7 +137,6 @@ struct pci_dev *of_create_pci_dev(struct device_node *node, pr_debug(" create device, devfn: %x, type: %s\n", devfn, type); - dev->bus = bus; dev->dev.of_node = of_node_get(node); dev->dev.parent = bus->bridge; dev->dev.bus = &pci_bus_type; @@ -165,7 +164,7 @@ struct pci_dev *of_create_pci_dev(struct device_node *node, pr_debug(" class: 0x%x\n", dev->class); pr_debug(" revision: 0x%x\n", dev->revision); - dev->current_state = 4; /* unknown power state */ + dev->current_state = PCI_UNKNOWN; /* unknown power state */ dev->error_state = pci_channel_io_normal; dev->dma_mask = 0xffffffff; @@ -208,7 +207,7 @@ void of_scan_pci_bridge(struct pci_dev *dev) { struct device_node *node = dev->dev.of_node; struct pci_bus *bus; - const u32 *busrange, *ranges; + const __be32 *busrange, *ranges; int len, i, mode; struct pci_bus_region region; struct resource *res; @@ -231,15 +230,21 @@ void of_scan_pci_bridge(struct pci_dev *dev) return; } - bus = pci_add_new_bus(dev->bus, dev, busrange[0]); + bus = pci_find_bus(pci_domain_nr(dev->bus), + of_read_number(busrange, 1)); if (!bus) { - printk(KERN_ERR "Failed to create pci bus for %s\n", - node->full_name); - return; + bus = pci_add_new_bus(dev->bus, dev, + of_read_number(busrange, 1)); + if (!bus) { + printk(KERN_ERR "Failed to create pci bus for %s\n", + node->full_name); + return; + } } bus->primary = dev->bus->number; - pci_bus_insert_busn_res(bus, busrange[0], busrange[1]); + pci_bus_insert_busn_res(bus, of_read_number(busrange, 1), + of_read_number(busrange+1, 1)); bus->bridge_ctl = 0; /* parse ranges property */ @@ -252,7 +257,7 @@ void of_scan_pci_bridge(struct pci_dev *dev) } i = 1; for (; len >= 32; len -= 32, ranges += 8) { - flags = pci_parse_of_flags(ranges[0], 1); + flags = pci_parse_of_flags(of_read_number(ranges, 1), 1); size = of_read_number(&ranges[6], 2); if (flags == 0 || size == 0) continue; @@ -275,7 +280,7 @@ void of_scan_pci_bridge(struct pci_dev *dev) res->flags = flags; region.start = of_read_number(&ranges[1], 2); region.end = region.start + size - 1; - pcibios_bus_to_resource(dev, res, ®ion); + pcibios_bus_to_resource(dev->bus, res, ®ion); } sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus), bus->number); @@ -293,6 +298,47 @@ void of_scan_pci_bridge(struct pci_dev *dev) } EXPORT_SYMBOL(of_scan_pci_bridge); +static struct pci_dev *of_scan_pci_dev(struct pci_bus *bus, + struct device_node *dn) +{ + struct pci_dev *dev = NULL; + const __be32 *reg; + int reglen, devfn; +#ifdef CONFIG_EEH + struct eeh_dev *edev = of_node_to_eeh_dev(dn); +#endif + + pr_debug(" * %s\n", dn->full_name); + if (!of_device_is_available(dn)) + return NULL; + + reg = of_get_property(dn, "reg", ®len); + if (reg == NULL || reglen < 20) + return NULL; + devfn = (of_read_number(reg, 1) >> 8) & 0xff; + + /* Check if the PCI device is already there */ + dev = pci_get_slot(bus, devfn); + if (dev) { + pci_dev_put(dev); + return dev; + } + + /* Device removed permanently ? */ +#ifdef CONFIG_EEH + if (edev && (edev->mode & EEH_DEV_REMOVED)) + return NULL; +#endif + + /* create a new pci_dev for this device */ + dev = of_create_pci_dev(dn, bus, devfn); + if (!dev) + return NULL; + + pr_debug(" dev header type: %x\n", dev->hdr_type); + return dev; +} + /** * __of_scan_bus - given a PCI bus node, setup bus and scan for child devices * @node: device tree node for the PCI bus @@ -303,8 +349,6 @@ static void __of_scan_bus(struct device_node *node, struct pci_bus *bus, int rescan_existing) { struct device_node *child; - const u32 *reg; - int reglen, devfn; struct pci_dev *dev; pr_debug("of_scan_bus(%s) bus no %d...\n", @@ -312,16 +356,7 @@ static void __of_scan_bus(struct device_node *node, struct pci_bus *bus, /* Scan direct children */ for_each_child_of_node(node, child) { - pr_debug(" * %s\n", child->full_name); - if (!of_device_is_available(child)) - continue; - reg = of_get_property(child, "reg", ®len); - if (reg == NULL || reglen < 20) - continue; - devfn = (reg[0] >> 8) & 0xff; - - /* create a new pci_dev for this device */ - dev = of_create_pci_dev(child, bus, devfn); + dev = of_scan_pci_dev(bus, child); if (!dev) continue; pr_debug(" dev header type: %x\n", dev->hdr_type); @@ -336,8 +371,7 @@ static void __of_scan_bus(struct device_node *node, struct pci_bus *bus, /* Now scan child busses */ list_for_each_entry(dev, &bus->devices, bus_list) { - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || - dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) { + if (pci_is_bridge(dev)) { of_scan_pci_bridge(dev); } } diff --git a/arch/powerpc/kernel/ppc32.h b/arch/powerpc/kernel/ppc32.h index 02fb0ee2609..a27c914d580 100644 --- a/arch/powerpc/kernel/ppc32.h +++ b/arch/powerpc/kernel/ppc32.h @@ -16,30 +16,6 @@ /* These are here to support 32-bit syscalls on a 64-bit kernel. */ -#define __old_sigaction32 old_sigaction32 - -struct __old_sigaction32 { - compat_uptr_t sa_handler; - compat_old_sigset_t sa_mask; - unsigned int sa_flags; - compat_uptr_t sa_restorer; /* not used by Linux/SPARC yet */ -}; - - - -struct sigaction32 { - compat_uptr_t sa_handler; /* Really a pointer, but need to deal with 32 bits */ - unsigned int sa_flags; - compat_uptr_t sa_restorer; /* Another 32 bit pointer */ - compat_sigset_t sa_mask; /* A 32 bit mask */ -}; - -typedef struct sigaltstack_32 { - unsigned int ss_sp; - int ss_flags; - compat_size_t ss_size; -} stack_32_t; - struct pt_regs32 { unsigned int gpr[32]; unsigned int nip; @@ -75,7 +51,7 @@ struct mcontext32 { struct ucontext32 { unsigned int uc_flags; unsigned int uc_link; - stack_32_t uc_stack; + compat_stack_t uc_stack; int uc_pad[7]; compat_uptr_t uc_regs; /* points to uc_mcontext field */ compat_sigset_t uc_sigmask; /* mask last for extensibility */ diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 78b8766fd79..48d17d6fca5 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -79,10 +79,12 @@ EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strncmp); +#ifndef CONFIG_GENERIC_CSUM EXPORT_SYMBOL(csum_partial); EXPORT_SYMBOL(csum_partial_copy_generic); EXPORT_SYMBOL(ip_fast_csum); EXPORT_SYMBOL(csum_tcpudp_magic); +#endif EXPORT_SYMBOL(__copy_tofrom_user); EXPORT_SYMBOL(__clear_user); @@ -96,9 +98,15 @@ EXPORT_SYMBOL(pci_dram_offset); EXPORT_SYMBOL(start_thread); +#ifdef CONFIG_PPC_FPU EXPORT_SYMBOL(giveup_fpu); +EXPORT_SYMBOL(load_fp_state); +EXPORT_SYMBOL(store_fp_state); +#endif #ifdef CONFIG_ALTIVEC EXPORT_SYMBOL(giveup_altivec); +EXPORT_SYMBOL(load_vr_state); +EXPORT_SYMBOL(store_vr_state); #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_VSX EXPORT_SYMBOL(giveup_vsx); @@ -111,8 +119,8 @@ EXPORT_SYMBOL(giveup_spe); #ifndef CONFIG_PPC64 EXPORT_SYMBOL(flush_instruction_cache); #endif -EXPORT_SYMBOL(__flush_icache_range); EXPORT_SYMBOL(flush_dcache_range); +EXPORT_SYMBOL(flush_icache_range); #ifdef CONFIG_SMP #ifdef CONFIG_PPC32 @@ -142,8 +150,11 @@ EXPORT_SYMBOL(__ashldi3); EXPORT_SYMBOL(__lshrdi3); int __ucmpdi2(unsigned long long, unsigned long long); EXPORT_SYMBOL(__ucmpdi2); +int __cmpdi2(long long, long long); +EXPORT_SYMBOL(__cmpdi2); #endif - +long long __bswapdi2(long long); +EXPORT_SYMBOL(__bswapdi2); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memmove); diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c index c8ae3714e79..c30612aad68 100644 --- a/arch/powerpc/kernel/proc_powerpc.c +++ b/arch/powerpc/kernel/proc_powerpc.c @@ -29,45 +29,26 @@ #ifdef CONFIG_PPC64 -static loff_t page_map_seek( struct file *file, loff_t off, int whence) +static loff_t page_map_seek(struct file *file, loff_t off, int whence) { - loff_t new; - struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode); - - switch(whence) { - case 0: - new = off; - break; - case 1: - new = file->f_pos + off; - break; - case 2: - new = dp->size + off; - break; - default: - return -EINVAL; - } - if ( new < 0 || new > dp->size ) - return -EINVAL; - return (file->f_pos = new); + return fixed_size_llseek(file, off, whence, PAGE_SIZE); } static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode); - return simple_read_from_buffer(buf, nbytes, ppos, dp->data, dp->size); + return simple_read_from_buffer(buf, nbytes, ppos, + PDE_DATA(file_inode(file)), PAGE_SIZE); } static int page_map_mmap( struct file *file, struct vm_area_struct *vma ) { - struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode); - - if ((vma->vm_end - vma->vm_start) > dp->size) + if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) return -EINVAL; - remap_pfn_range(vma, vma->vm_start, __pa(dp->data) >> PAGE_SHIFT, - dp->size, vma->vm_page_prot); + remap_pfn_range(vma, vma->vm_start, + __pa(PDE_DATA(file_inode(file))) >> PAGE_SHIFT, + PAGE_SIZE, vma->vm_page_prot); return 0; } @@ -86,7 +67,7 @@ static int __init proc_ppc64_init(void) &page_map_fops, vdso_data); if (!pde) return 1; - pde->size = PAGE_SIZE; + proc_set_size(pde, PAGE_SIZE); return 0; } diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 81430674e71..be99774d3f4 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -25,7 +25,6 @@ #include <linux/slab.h> #include <linux/user.h> #include <linux/elf.h> -#include <linux/init.h> #include <linux/prctl.h> #include <linux/init_task.h> #include <linux/export.h> @@ -50,13 +49,22 @@ #include <asm/runlatch.h> #include <asm/syscalls.h> #include <asm/switch_to.h> +#include <asm/tm.h> #include <asm/debug.h> #ifdef CONFIG_PPC64 #include <asm/firmware.h> #endif +#include <asm/code-patching.h> #include <linux/kprobes.h> #include <linux/kdebug.h> +/* Transactional Memory debug */ +#ifdef TM_DEBUG_SW +#define TM_DEBUG(x...) printk(KERN_INFO x) +#else +#define TM_DEBUG(x...) do { } while(0) +#endif + extern unsigned long _get_SP(void); #ifndef CONFIG_SMP @@ -66,6 +74,49 @@ struct task_struct *last_task_used_vsx = NULL; struct task_struct *last_task_used_spe = NULL; #endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +void giveup_fpu_maybe_transactional(struct task_struct *tsk) +{ + /* + * If we are saving the current thread's registers, and the + * thread is in a transactional state, set the TIF_RESTORE_TM + * bit so that we know to restore the registers before + * returning to userspace. + */ + if (tsk == current && tsk->thread.regs && + MSR_TM_ACTIVE(tsk->thread.regs->msr) && + !test_thread_flag(TIF_RESTORE_TM)) { + tsk->thread.tm_orig_msr = tsk->thread.regs->msr; + set_thread_flag(TIF_RESTORE_TM); + } + + giveup_fpu(tsk); +} + +void giveup_altivec_maybe_transactional(struct task_struct *tsk) +{ + /* + * If we are saving the current thread's registers, and the + * thread is in a transactional state, set the TIF_RESTORE_TM + * bit so that we know to restore the registers before + * returning to userspace. + */ + if (tsk == current && tsk->thread.regs && + MSR_TM_ACTIVE(tsk->thread.regs->msr) && + !test_thread_flag(TIF_RESTORE_TM)) { + tsk->thread.tm_orig_msr = tsk->thread.regs->msr; + set_thread_flag(TIF_RESTORE_TM); + } + + giveup_altivec(tsk); +} + +#else +#define giveup_fpu_maybe_transactional(tsk) giveup_fpu(tsk) +#define giveup_altivec_maybe_transactional(tsk) giveup_altivec(tsk) +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + +#ifdef CONFIG_PPC_FPU /* * Make sure the floating-point register state in the * the thread_struct is up to date for task tsk. @@ -93,12 +144,13 @@ void flush_fp_to_thread(struct task_struct *tsk) */ BUG_ON(tsk != current); #endif - giveup_fpu(tsk); + giveup_fpu_maybe_transactional(tsk); } preempt_enable(); } } EXPORT_SYMBOL_GPL(flush_fp_to_thread); +#endif /* CONFIG_PPC_FPU */ void enable_kernel_fp(void) { @@ -106,11 +158,11 @@ void enable_kernel_fp(void) #ifdef CONFIG_SMP if (current->thread.regs && (current->thread.regs->msr & MSR_FP)) - giveup_fpu(current); + giveup_fpu_maybe_transactional(current); else giveup_fpu(NULL); /* just enables FP for kernel */ #else - giveup_fpu(last_task_used_math); + giveup_fpu_maybe_transactional(last_task_used_math); #endif /* CONFIG_SMP */ } EXPORT_SYMBOL(enable_kernel_fp); @@ -122,11 +174,11 @@ void enable_kernel_altivec(void) #ifdef CONFIG_SMP if (current->thread.regs && (current->thread.regs->msr & MSR_VEC)) - giveup_altivec(current); + giveup_altivec_maybe_transactional(current); else giveup_altivec_notask(); #else - giveup_altivec(last_task_used_altivec); + giveup_altivec_maybe_transactional(last_task_used_altivec); #endif /* CONFIG_SMP */ } EXPORT_SYMBOL(enable_kernel_altivec); @@ -143,7 +195,7 @@ void flush_altivec_to_thread(struct task_struct *tsk) #ifdef CONFIG_SMP BUG_ON(tsk != current); #endif - giveup_altivec(tsk); + giveup_altivec_maybe_transactional(tsk); } preempt_enable(); } @@ -172,8 +224,8 @@ EXPORT_SYMBOL(enable_kernel_vsx); void giveup_vsx(struct task_struct *tsk) { - giveup_fpu(tsk); - giveup_altivec(tsk); + giveup_fpu_maybe_transactional(tsk); + giveup_altivec_maybe_transactional(tsk); __giveup_vsx(tsk); } @@ -271,7 +323,7 @@ void do_send_trap(struct pt_regs *regs, unsigned long address, force_sig_info(SIGTRAP, &info, current); } #else /* !CONFIG_PPC_ADV_DEBUG_REGS */ -void do_dabr(struct pt_regs *regs, unsigned long address, +void do_break (struct pt_regs *regs, unsigned long address, unsigned long error_code) { siginfo_t info; @@ -281,11 +333,11 @@ void do_dabr(struct pt_regs *regs, unsigned long address, 11, SIGSEGV) == NOTIFY_STOP) return; - if (debugger_dabr_match(regs)) + if (debugger_break_match(regs)) return; - /* Clear the DABR */ - set_dabr(0, 0); + /* Clear the breakpoint */ + hw_breakpoint_disable(); /* Deliver the signal to userspace */ info.si_signo = SIGTRAP; @@ -296,7 +348,7 @@ void do_dabr(struct pt_regs *regs, unsigned long address, } #endif /* CONFIG_PPC_ADV_DEBUG_REGS */ -static DEFINE_PER_CPU(unsigned long, current_dabr); +static DEFINE_PER_CPU(struct arch_hw_breakpoint, current_brk); #ifdef CONFIG_PPC_ADV_DEBUG_REGS /* @@ -304,49 +356,56 @@ static DEFINE_PER_CPU(unsigned long, current_dabr); */ static void set_debug_reg_defaults(struct thread_struct *thread) { - thread->iac1 = thread->iac2 = 0; + thread->debug.iac1 = thread->debug.iac2 = 0; #if CONFIG_PPC_ADV_DEBUG_IACS > 2 - thread->iac3 = thread->iac4 = 0; + thread->debug.iac3 = thread->debug.iac4 = 0; #endif - thread->dac1 = thread->dac2 = 0; + thread->debug.dac1 = thread->debug.dac2 = 0; #if CONFIG_PPC_ADV_DEBUG_DVCS > 0 - thread->dvc1 = thread->dvc2 = 0; + thread->debug.dvc1 = thread->debug.dvc2 = 0; #endif - thread->dbcr0 = 0; + thread->debug.dbcr0 = 0; #ifdef CONFIG_BOOKE /* * Force User/Supervisor bits to b11 (user-only MSR[PR]=1) */ - thread->dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US | \ + thread->debug.dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US | DBCR1_IAC3US | DBCR1_IAC4US; /* * Force Data Address Compare User/Supervisor bits to be User-only * (0b11 MSR[PR]=1) and set all other bits in DBCR2 register to be 0. */ - thread->dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US; + thread->debug.dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US; #else - thread->dbcr1 = 0; + thread->debug.dbcr1 = 0; #endif } -static void prime_debug_regs(struct thread_struct *thread) +static void prime_debug_regs(struct debug_reg *debug) { - mtspr(SPRN_IAC1, thread->iac1); - mtspr(SPRN_IAC2, thread->iac2); + /* + * We could have inherited MSR_DE from userspace, since + * it doesn't get cleared on exception entry. Make sure + * MSR_DE is clear before we enable any debug events. + */ + mtmsr(mfmsr() & ~MSR_DE); + + mtspr(SPRN_IAC1, debug->iac1); + mtspr(SPRN_IAC2, debug->iac2); #if CONFIG_PPC_ADV_DEBUG_IACS > 2 - mtspr(SPRN_IAC3, thread->iac3); - mtspr(SPRN_IAC4, thread->iac4); + mtspr(SPRN_IAC3, debug->iac3); + mtspr(SPRN_IAC4, debug->iac4); #endif - mtspr(SPRN_DAC1, thread->dac1); - mtspr(SPRN_DAC2, thread->dac2); + mtspr(SPRN_DAC1, debug->dac1); + mtspr(SPRN_DAC2, debug->dac2); #if CONFIG_PPC_ADV_DEBUG_DVCS > 0 - mtspr(SPRN_DVC1, thread->dvc1); - mtspr(SPRN_DVC2, thread->dvc2); + mtspr(SPRN_DVC1, debug->dvc1); + mtspr(SPRN_DVC2, debug->dvc2); #endif - mtspr(SPRN_DBCR0, thread->dbcr0); - mtspr(SPRN_DBCR1, thread->dbcr1); + mtspr(SPRN_DBCR0, debug->dbcr0); + mtspr(SPRN_DBCR1, debug->dbcr1); #ifdef CONFIG_BOOKE - mtspr(SPRN_DBCR2, thread->dbcr2); + mtspr(SPRN_DBCR2, debug->dbcr2); #endif } /* @@ -354,59 +413,360 @@ static void prime_debug_regs(struct thread_struct *thread) * debug registers, set the debug registers from the values * stored in the new thread. */ -static void switch_booke_debug_regs(struct thread_struct *new_thread) +void switch_booke_debug_regs(struct debug_reg *new_debug) { - if ((current->thread.dbcr0 & DBCR0_IDM) - || (new_thread->dbcr0 & DBCR0_IDM)) - prime_debug_regs(new_thread); + if ((current->thread.debug.dbcr0 & DBCR0_IDM) + || (new_debug->dbcr0 & DBCR0_IDM)) + prime_debug_regs(new_debug); } +EXPORT_SYMBOL_GPL(switch_booke_debug_regs); #else /* !CONFIG_PPC_ADV_DEBUG_REGS */ #ifndef CONFIG_HAVE_HW_BREAKPOINT static void set_debug_reg_defaults(struct thread_struct *thread) { - if (thread->dabr) { - thread->dabr = 0; - thread->dabrx = 0; - set_dabr(0, 0); - } + thread->hw_brk.address = 0; + thread->hw_brk.type = 0; + set_breakpoint(&thread->hw_brk); } #endif /* !CONFIG_HAVE_HW_BREAKPOINT */ #endif /* CONFIG_PPC_ADV_DEBUG_REGS */ -int set_dabr(unsigned long dabr, unsigned long dabrx) -{ - __get_cpu_var(current_dabr) = dabr; - - if (ppc_md.set_dabr) - return ppc_md.set_dabr(dabr, dabrx); - - /* XXX should we have a CPU_FTR_HAS_DABR ? */ #ifdef CONFIG_PPC_ADV_DEBUG_REGS +static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) +{ mtspr(SPRN_DAC1, dabr); #ifdef CONFIG_PPC_47x isync(); #endif + return 0; +} #elif defined(CONFIG_PPC_BOOK3S) +static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) +{ mtspr(SPRN_DABR, dabr); - mtspr(SPRN_DABRX, dabrx); + if (cpu_has_feature(CPU_FTR_DABRX)) + mtspr(SPRN_DABRX, dabrx); + return 0; +} +#else +static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) +{ + return -EINVAL; +} #endif + +static inline int set_dabr(struct arch_hw_breakpoint *brk) +{ + unsigned long dabr, dabrx; + + dabr = brk->address | (brk->type & HW_BRK_TYPE_DABR); + dabrx = ((brk->type >> 3) & 0x7); + + if (ppc_md.set_dabr) + return ppc_md.set_dabr(dabr, dabrx); + + return __set_dabr(dabr, dabrx); +} + +static inline int set_dawr(struct arch_hw_breakpoint *brk) +{ + unsigned long dawr, dawrx, mrd; + + dawr = brk->address; + + dawrx = (brk->type & (HW_BRK_TYPE_READ | HW_BRK_TYPE_WRITE)) \ + << (63 - 58); //* read/write bits */ + dawrx |= ((brk->type & (HW_BRK_TYPE_TRANSLATE)) >> 2) \ + << (63 - 59); //* translate */ + dawrx |= (brk->type & (HW_BRK_TYPE_PRIV_ALL)) \ + >> 3; //* PRIM bits */ + /* dawr length is stored in field MDR bits 48:53. Matches range in + doublewords (64 bits) baised by -1 eg. 0b000000=1DW and + 0b111111=64DW. + brk->len is in bytes. + This aligns up to double word size, shifts and does the bias. + */ + mrd = ((brk->len + 7) >> 3) - 1; + dawrx |= (mrd & 0x3f) << (63 - 53); + + if (ppc_md.set_dawr) + return ppc_md.set_dawr(dawr, dawrx); + mtspr(SPRN_DAWR, dawr); + mtspr(SPRN_DAWRX, dawrx); return 0; } +void __set_breakpoint(struct arch_hw_breakpoint *brk) +{ + __get_cpu_var(current_brk) = *brk; + + if (cpu_has_feature(CPU_FTR_DAWR)) + set_dawr(brk); + else + set_dabr(brk); +} + +void set_breakpoint(struct arch_hw_breakpoint *brk) +{ + preempt_disable(); + __set_breakpoint(brk); + preempt_enable(); +} + #ifdef CONFIG_PPC64 DEFINE_PER_CPU(struct cpu_usage, cpu_usage_array); #endif +static inline bool hw_brk_match(struct arch_hw_breakpoint *a, + struct arch_hw_breakpoint *b) +{ + if (a->address != b->address) + return false; + if (a->type != b->type) + return false; + if (a->len != b->len) + return false; + return true; +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static void tm_reclaim_thread(struct thread_struct *thr, + struct thread_info *ti, uint8_t cause) +{ + unsigned long msr_diff = 0; + + /* + * If FP/VSX registers have been already saved to the + * thread_struct, move them to the transact_fp array. + * We clear the TIF_RESTORE_TM bit since after the reclaim + * the thread will no longer be transactional. + */ + if (test_ti_thread_flag(ti, TIF_RESTORE_TM)) { + msr_diff = thr->tm_orig_msr & ~thr->regs->msr; + if (msr_diff & MSR_FP) + memcpy(&thr->transact_fp, &thr->fp_state, + sizeof(struct thread_fp_state)); + if (msr_diff & MSR_VEC) + memcpy(&thr->transact_vr, &thr->vr_state, + sizeof(struct thread_vr_state)); + clear_ti_thread_flag(ti, TIF_RESTORE_TM); + msr_diff &= MSR_FP | MSR_VEC | MSR_VSX | MSR_FE0 | MSR_FE1; + } + + tm_reclaim(thr, thr->regs->msr, cause); + + /* Having done the reclaim, we now have the checkpointed + * FP/VSX values in the registers. These might be valid + * even if we have previously called enable_kernel_fp() or + * flush_fp_to_thread(), so update thr->regs->msr to + * indicate their current validity. + */ + thr->regs->msr |= msr_diff; +} + +void tm_reclaim_current(uint8_t cause) +{ + tm_enable(); + tm_reclaim_thread(¤t->thread, current_thread_info(), cause); +} + +static inline void tm_reclaim_task(struct task_struct *tsk) +{ + /* We have to work out if we're switching from/to a task that's in the + * middle of a transaction. + * + * In switching we need to maintain a 2nd register state as + * oldtask->thread.ckpt_regs. We tm_reclaim(oldproc); this saves the + * checkpointed (tbegin) state in ckpt_regs and saves the transactional + * (current) FPRs into oldtask->thread.transact_fpr[]. + * + * We also context switch (save) TFHAR/TEXASR/TFIAR in here. + */ + struct thread_struct *thr = &tsk->thread; + + if (!thr->regs) + return; + + if (!MSR_TM_ACTIVE(thr->regs->msr)) + goto out_and_saveregs; + + /* Stash the original thread MSR, as giveup_fpu et al will + * modify it. We hold onto it to see whether the task used + * FP & vector regs. If the TIF_RESTORE_TM flag is set, + * tm_orig_msr is already set. + */ + if (!test_ti_thread_flag(task_thread_info(tsk), TIF_RESTORE_TM)) + thr->tm_orig_msr = thr->regs->msr; + + TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, " + "ccr=%lx, msr=%lx, trap=%lx)\n", + tsk->pid, thr->regs->nip, + thr->regs->ccr, thr->regs->msr, + thr->regs->trap); + + tm_reclaim_thread(thr, task_thread_info(tsk), TM_CAUSE_RESCHED); + + TM_DEBUG("--- tm_reclaim on pid %d complete\n", + tsk->pid); + +out_and_saveregs: + /* Always save the regs here, even if a transaction's not active. + * This context-switches a thread's TM info SPRs. We do it here to + * be consistent with the restore path (in recheckpoint) which + * cannot happen later in _switch(). + */ + tm_save_sprs(thr); +} + +extern void __tm_recheckpoint(struct thread_struct *thread, + unsigned long orig_msr); + +void tm_recheckpoint(struct thread_struct *thread, + unsigned long orig_msr) +{ + unsigned long flags; + + /* We really can't be interrupted here as the TEXASR registers can't + * change and later in the trecheckpoint code, we have a userspace R1. + * So let's hard disable over this region. + */ + local_irq_save(flags); + hard_irq_disable(); + + /* The TM SPRs are restored here, so that TEXASR.FS can be set + * before the trecheckpoint and no explosion occurs. + */ + tm_restore_sprs(thread); + + __tm_recheckpoint(thread, orig_msr); + + local_irq_restore(flags); +} + +static inline void tm_recheckpoint_new_task(struct task_struct *new) +{ + unsigned long msr; + + if (!cpu_has_feature(CPU_FTR_TM)) + return; + + /* Recheckpoint the registers of the thread we're about to switch to. + * + * If the task was using FP, we non-lazily reload both the original and + * the speculative FP register states. This is because the kernel + * doesn't see if/when a TM rollback occurs, so if we take an FP + * unavoidable later, we are unable to determine which set of FP regs + * need to be restored. + */ + if (!new->thread.regs) + return; + + if (!MSR_TM_ACTIVE(new->thread.regs->msr)){ + tm_restore_sprs(&new->thread); + return; + } + msr = new->thread.tm_orig_msr; + /* Recheckpoint to restore original checkpointed register state. */ + TM_DEBUG("*** tm_recheckpoint of pid %d " + "(new->msr 0x%lx, new->origmsr 0x%lx)\n", + new->pid, new->thread.regs->msr, msr); + + /* This loads the checkpointed FP/VEC state, if used */ + tm_recheckpoint(&new->thread, msr); + + /* This loads the speculative FP/VEC state, if used */ + if (msr & MSR_FP) { + do_load_up_transact_fpu(&new->thread); + new->thread.regs->msr |= + (MSR_FP | new->thread.fpexc_mode); + } +#ifdef CONFIG_ALTIVEC + if (msr & MSR_VEC) { + do_load_up_transact_altivec(&new->thread); + new->thread.regs->msr |= MSR_VEC; + } +#endif + /* We may as well turn on VSX too since all the state is restored now */ + if (msr & MSR_VSX) + new->thread.regs->msr |= MSR_VSX; + + TM_DEBUG("*** tm_recheckpoint of pid %d complete " + "(kernel msr 0x%lx)\n", + new->pid, mfmsr()); +} + +static inline void __switch_to_tm(struct task_struct *prev) +{ + if (cpu_has_feature(CPU_FTR_TM)) { + tm_enable(); + tm_reclaim_task(prev); + } +} + +/* + * This is called if we are on the way out to userspace and the + * TIF_RESTORE_TM flag is set. It checks if we need to reload + * FP and/or vector state and does so if necessary. + * If userspace is inside a transaction (whether active or + * suspended) and FP/VMX/VSX instructions have ever been enabled + * inside that transaction, then we have to keep them enabled + * and keep the FP/VMX/VSX state loaded while ever the transaction + * continues. The reason is that if we didn't, and subsequently + * got a FP/VMX/VSX unavailable interrupt inside a transaction, + * we don't know whether it's the same transaction, and thus we + * don't know which of the checkpointed state and the transactional + * state to use. + */ +void restore_tm_state(struct pt_regs *regs) +{ + unsigned long msr_diff; + + clear_thread_flag(TIF_RESTORE_TM); + if (!MSR_TM_ACTIVE(regs->msr)) + return; + + msr_diff = current->thread.tm_orig_msr & ~regs->msr; + msr_diff &= MSR_FP | MSR_VEC | MSR_VSX; + if (msr_diff & MSR_FP) { + fp_enable(); + load_fp_state(¤t->thread.fp_state); + regs->msr |= current->thread.fpexc_mode; + } + if (msr_diff & MSR_VEC) { + vec_enable(); + load_vr_state(¤t->thread.vr_state); + } + regs->msr |= msr_diff; +} + +#else +#define tm_recheckpoint_new_task(new) +#define __switch_to_tm(prev) +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *new) { struct thread_struct *new_thread, *old_thread; - unsigned long flags; struct task_struct *last; #ifdef CONFIG_PPC_BOOK3S_64 struct ppc64_tlb_batch *batch; #endif + WARN_ON(!irqs_disabled()); + + /* Back up the TAR and DSCR across context switches. + * Note that the TAR is not available for use in the kernel. (To + * provide this, the TAR should be backed up/restored on exception + * entry/exit instead, and be in pt_regs. FIXME, this should be in + * pt_regs anyway (for debug).) + * Save the TAR and DSCR here before we do treclaim/trecheckpoint as + * these will change them. + */ + save_early_sprs(&prev->thread); + + __switch_to_tm(prev); + #ifdef CONFIG_SMP /* avoid complexity of lazy save/restore of fpu * by just saving it every time we switch out if @@ -474,15 +834,15 @@ struct task_struct *__switch_to(struct task_struct *prev, #endif /* CONFIG_SMP */ #ifdef CONFIG_PPC_ADV_DEBUG_REGS - switch_booke_debug_regs(&new->thread); + switch_booke_debug_regs(&new->thread.debug); #else /* * For PPC_BOOK3S_64, we use the hw-breakpoint interfaces that would * schedule DABR */ #ifndef CONFIG_HAVE_HW_BREAKPOINT - if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr)) - set_dabr(new->thread.dabr, new->thread.dabrx); + if (unlikely(!hw_brk_match(&__get_cpu_var(current_brk), &new->thread.hw_brk))) + __set_breakpoint(&new->thread.hw_brk); #endif /* CONFIG_HAVE_HW_BREAKPOINT */ #endif @@ -514,14 +874,15 @@ struct task_struct *__switch_to(struct task_struct *prev, } #endif /* CONFIG_PPC_BOOK3S_64 */ - local_irq_save(flags); - /* * We can't take a PMU exception inside _switch() since there is a * window where the kernel stack SLB and the kernel stack are out * of sync. Hard disable here. */ hard_irq_disable(); + + tm_recheckpoint_new_task(new); + last = _switch(old_thread, new_thread); #ifdef CONFIG_PPC_BOOK3S_64 @@ -532,8 +893,6 @@ struct task_struct *__switch_to(struct task_struct *prev, } #endif /* CONFIG_PPC_BOOK3S_64 */ - local_irq_restore(flags); - return last; } @@ -641,6 +1000,8 @@ void show_regs(struct pt_regs * regs) { int i, trap; + show_regs_print_info(KERN_DEFAULT); + printk("NIP: "REG" LR: "REG" CTR: "REG"\n", regs->nip, regs->link, regs->ctr); printk("REGS: %p TRAP: %04lx %s (%s)\n", @@ -648,24 +1009,22 @@ void show_regs(struct pt_regs * regs) printk("MSR: "REG" ", regs->msr); printbits(regs->msr, msr_bits); printk(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer); -#ifdef CONFIG_PPC64 - printk("SOFTE: %ld\n", regs->softe); -#endif trap = TRAP(regs); if ((regs->trap != 0xc00) && cpu_has_feature(CPU_FTR_CFAR)) - printk("CFAR: "REG"\n", regs->orig_gpr3); - if (trap == 0x300 || trap == 0x600) + printk("CFAR: "REG" ", regs->orig_gpr3); + if (trap == 0x200 || trap == 0x300 || trap == 0x600) #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - printk("DEAR: "REG", ESR: "REG"\n", regs->dar, regs->dsisr); + printk("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr); #else - printk("DAR: "REG", DSISR: %08lx\n", regs->dar, regs->dsisr); + printk("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); +#endif +#ifdef CONFIG_PPC64 + printk("SOFTE: %ld ", regs->softe); +#endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (MSR_TM_ACTIVE(regs->msr)) + printk("\nPACATMSCRATCH: %016llx ", get_paca()->tm_scratch); #endif - printk("TASK = %p[%d] '%s' THREAD: %p", - current, task_pid_nr(current), current->comm, task_thread_info(current)); - -#ifdef CONFIG_SMP - printk(" CPU: %d", raw_smp_processor_id()); -#endif /* CONFIG_SMP */ for (i = 0; i < 32; i++) { if ((i % REGS_PER_LINE) == 0) @@ -719,11 +1078,20 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) flush_altivec_to_thread(src); flush_vsx_to_thread(src); flush_spe_to_thread(src); -#ifdef CONFIG_HAVE_HW_BREAKPOINT - flush_ptrace_hw_breakpoint(src); -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ + /* + * Flush TM state out so we can copy it. __switch_to_tm() does this + * flush but it removes the checkpointed state from the current CPU and + * transitions the CPU out of TM mode. Hence we need to call + * tm_recheckpoint_new_task() (on the same task) to restore the + * checkpointed state back and the TM mode. + */ + __switch_to_tm(src); + tm_recheckpoint_new_task(src); *dst = *src; + + clear_task_ebb(dst); + return 0; } @@ -748,7 +1116,9 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, struct thread_info *ti = (void *)task_stack_page(p); memset(childregs, 0, sizeof(struct pt_regs)); childregs->gpr[1] = sp + sizeof(struct pt_regs); - childregs->gpr[14] = usp; /* function */ + /* function */ + if (usp) + childregs->gpr[14] = ppc_function_entry((void *)usp); #ifdef CONFIG_PPC64 clear_tsk_thread_flag(p, TIF_32BIT); childregs->softe = 1; @@ -786,12 +1156,23 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, * do some house keeping and then return from the fork or clone * system call, using the stack frame created above. */ + ((unsigned long *)sp)[0] = 0; sp -= sizeof(struct pt_regs); kregs = (struct pt_regs *) sp; sp -= STACK_FRAME_OVERHEAD; p->thread.ksp = sp; +#ifdef CONFIG_PPC32 p->thread.ksp_limit = (unsigned long)task_stack_page(p) + _ALIGN_UP(sizeof(struct thread_info), 16); +#endif +#ifdef CONFIG_HAVE_HW_BREAKPOINT + p->thread.ptrace_bps[0] = NULL; +#endif + + p->thread.fp_save_area = NULL; +#ifdef CONFIG_ALTIVEC + p->thread.vr_save_area = NULL; +#endif #ifdef CONFIG_PPC_STD_MMU_64 if (mmu_has_feature(MMU_FTR_SLB)) { @@ -813,18 +1194,10 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, p->thread.dscr_inherit = current->thread.dscr_inherit; p->thread.dscr = current->thread.dscr; } + if (cpu_has_feature(CPU_FTR_HAS_PPR)) + p->thread.ppr = INIT_PPR; #endif - /* - * The PPC64 ABI makes use of a TOC to contain function - * pointers. The function (ret_from_except) is actually a pointer - * to the TOC entry. The first entry is a pointer to the actual - * function. - */ -#ifdef CONFIG_PPC64 - kregs->nip = *((unsigned long *)f); -#else - kregs->nip = (unsigned long)f; -#endif + kregs->nip = ppc_function_entry(f); return 0; } @@ -866,25 +1239,45 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) regs->msr = MSR_USER; #else if (!is_32bit_task()) { - unsigned long entry, toc; + unsigned long entry; - /* start is a relocated pointer to the function descriptor for - * the elf _start routine. The first entry in the function - * descriptor is the entry address of _start and the second - * entry is the TOC value we need to use. - */ - __get_user(entry, (unsigned long __user *)start); - __get_user(toc, (unsigned long __user *)start+1); + if (is_elf2_task()) { + /* Look ma, no function descriptors! */ + entry = start; - /* Check whether the e_entry function descriptor entries - * need to be relocated before we can use them. - */ - if (load_addr != 0) { - entry += load_addr; - toc += load_addr; + /* + * Ulrich says: + * The latest iteration of the ABI requires that when + * calling a function (at its global entry point), + * the caller must ensure r12 holds the entry point + * address (so that the function can quickly + * establish addressability). + */ + regs->gpr[12] = start; + /* Make sure that's restored on entry to userspace. */ + set_thread_flag(TIF_RESTOREALL); + } else { + unsigned long toc; + + /* start is a relocated pointer to the function + * descriptor for the elf _start routine. The first + * entry in the function descriptor is the entry + * address of _start and the second entry is the TOC + * value we need to use. + */ + __get_user(entry, (unsigned long __user *)start); + __get_user(toc, (unsigned long __user *)start+1); + + /* Check whether the e_entry function descriptor entries + * need to be relocated before we can use them. + */ + if (load_addr != 0) { + entry += load_addr; + toc += load_addr; + } + regs->gpr[2] = toc; } regs->nip = entry; - regs->gpr[2] = toc; regs->msr = MSR_USER64; } else { regs->nip = start; @@ -892,17 +1285,16 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) regs->msr = MSR_USER32; } #endif - discard_lazy_cpu_state(); #ifdef CONFIG_VSX current->thread.used_vsr = 0; #endif - memset(current->thread.fpr, 0, sizeof(current->thread.fpr)); - current->thread.fpscr.val = 0; + memset(¤t->thread.fp_state, 0, sizeof(current->thread.fp_state)); + current->thread.fp_save_area = NULL; #ifdef CONFIG_ALTIVEC - memset(current->thread.vr, 0, sizeof(current->thread.vr)); - memset(¤t->thread.vscr, 0, sizeof(current->thread.vscr)); - current->thread.vscr.u[3] = 0x00010000; /* Java mode disabled */ + memset(¤t->thread.vr_state, 0, sizeof(current->thread.vr_state)); + current->thread.vr_state.vscr.u[3] = 0x00010000; /* Java mode disabled */ + current->thread.vr_save_area = NULL; current->thread.vrsave = 0; current->thread.used_vr = 0; #endif /* CONFIG_ALTIVEC */ @@ -912,6 +1304,13 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) current->thread.spefscr = 0; current->thread.used_spe = 0; #endif /* CONFIG_SPE */ +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (cpu_has_feature(CPU_FTR_TM)) + regs->msr |= MSR_TM; + current->thread.tm_tfhar = 0; + current->thread.tm_texasr = 0; + current->thread.tm_tfiar = 0; +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ } #define PR_FP_ALL_EXCEPT (PR_FP_EXC_DIV | PR_FP_EXC_OVF | PR_FP_EXC_UND \ @@ -928,6 +1327,19 @@ int set_fpexc_mode(struct task_struct *tsk, unsigned int val) if (val & PR_FP_EXC_SW_ENABLE) { #ifdef CONFIG_SPE if (cpu_has_feature(CPU_FTR_SPE)) { + /* + * When the sticky exception bits are set + * directly by userspace, it must call prctl + * with PR_GET_FPEXC (with PR_FP_EXC_SW_ENABLE + * in the existing prctl settings) or + * PR_SET_FPEXC (with PR_FP_EXC_SW_ENABLE in + * the bits being set). <fenv.h> functions + * saving and restoring the whole + * floating-point environment need to do so + * anyway to restore the prctl settings from + * the saved environment. + */ + tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR); tsk->thread.fpexc_mode = val & (PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT); return 0; @@ -959,9 +1371,22 @@ int get_fpexc_mode(struct task_struct *tsk, unsigned long adr) if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE) #ifdef CONFIG_SPE - if (cpu_has_feature(CPU_FTR_SPE)) + if (cpu_has_feature(CPU_FTR_SPE)) { + /* + * When the sticky exception bits are set + * directly by userspace, it must call prctl + * with PR_GET_FPEXC (with PR_FP_EXC_SW_ENABLE + * in the existing prctl settings) or + * PR_SET_FPEXC (with PR_FP_EXC_SW_ENABLE in + * the bits being set). <fenv.h> functions + * saving and restoring the whole + * floating-point environment need to do so + * anyway to restore the prctl settings from + * the saved environment. + */ + tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR); val = tsk->thread.fpexc_mode; - else + } else return -EINVAL; #else return -EINVAL; @@ -1161,15 +1586,9 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) } while (count++ < kstack_depth_to_print); } -void dump_stack(void) -{ - show_stack(current, NULL); -} -EXPORT_SYMBOL(dump_stack); - #ifdef CONFIG_PPC64 /* Called with hard IRQs off */ -void __ppc64_runlatch_on(void) +void notrace __ppc64_runlatch_on(void) { struct thread_info *ti = current_thread_info(); unsigned long ctrl; @@ -1182,7 +1601,7 @@ void __ppc64_runlatch_on(void) } /* Called with hard IRQs off */ -void __ppc64_runlatch_off(void) +void notrace __ppc64_runlatch_off(void) { struct thread_info *ti = current_thread_info(); unsigned long ctrl; diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 8b6f7a99cce..b694b073097 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -29,10 +29,11 @@ #include <linux/bitops.h> #include <linux/export.h> #include <linux/kexec.h> -#include <linux/debugfs.h> #include <linux/irq.h> #include <linux/memblock.h> #include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/libfdt.h> #include <asm/prom.h> #include <asm/rtas.h> @@ -117,14 +118,14 @@ static void __init move_device_tree(void) DBG("-> move_device_tree\n"); start = __pa(initial_boot_params); - size = be32_to_cpu(initial_boot_params->totalsize); + size = fdt_totalsize(initial_boot_params); if ((memory_limit && (start + size) > PHYSICAL_START + memory_limit) || overlaps_crashkernel(start, size) || overlaps_initrd(start, size)) { p = __va(memblock_alloc(size, PAGE_SIZE)); memcpy(p, initial_boot_params, size); - initial_boot_params = (struct boot_param_header *)p; + initial_boot_params = p; DBG("Moved device tree to 0x%p\n", p); } @@ -162,7 +163,7 @@ static struct ibm_pa_feature { {CPU_FTR_REAL_LE, PPC_FEATURE_TRUE_LE, 5, 0, 0}, }; -static void __init scan_features(unsigned long node, unsigned char *ftrs, +static void __init scan_features(unsigned long node, const unsigned char *ftrs, unsigned long tablelen, struct ibm_pa_feature *fp, unsigned long ft_size) @@ -201,8 +202,8 @@ static void __init scan_features(unsigned long node, unsigned char *ftrs, static void __init check_cpu_pa_features(unsigned long node) { - unsigned char *pa_ftrs; - unsigned long tablelen; + const unsigned char *pa_ftrs; + int tablelen; pa_ftrs = of_get_flat_dt_prop(node, "ibm,pa-features", &tablelen); if (pa_ftrs == NULL) @@ -215,16 +216,16 @@ static void __init check_cpu_pa_features(unsigned long node) #ifdef CONFIG_PPC_STD_MMU_64 static void __init check_cpu_slb_size(unsigned long node) { - u32 *slb_size_ptr; + const __be32 *slb_size_ptr; slb_size_ptr = of_get_flat_dt_prop(node, "slb-size", NULL); if (slb_size_ptr != NULL) { - mmu_slb_size = *slb_size_ptr; + mmu_slb_size = be32_to_cpup(slb_size_ptr); return; } slb_size_ptr = of_get_flat_dt_prop(node, "ibm,slb-size", NULL); if (slb_size_ptr != NULL) { - mmu_slb_size = *slb_size_ptr; + mmu_slb_size = be32_to_cpup(slb_size_ptr); } } #else @@ -256,7 +257,7 @@ static struct feature_property { static inline void identical_pvr_fixup(unsigned long node) { unsigned int pvr; - char *model = of_get_flat_dt_prop(node, "model", NULL); + const char *model = of_get_flat_dt_prop(node, "model", NULL); /* * Since 440GR(x)/440EP(x) processors have the same pvr, @@ -279,11 +280,11 @@ static void __init check_cpu_feature_properties(unsigned long node) { unsigned long i; struct feature_property *fp = feature_properties; - const u32 *prop; + const __be32 *prop; for (i = 0; i < ARRAY_SIZE(feature_properties); ++i, ++fp) { prop = of_get_flat_dt_prop(node, fp->name, NULL); - if (prop && *prop >= fp->min_value) { + if (prop && be32_to_cpup(prop) >= fp->min_value) { cur_cpu_spec->cpu_features |= fp->cpu_feature; cur_cpu_spec->cpu_user_features |= fp->cpu_user_ftr; } @@ -294,11 +295,11 @@ static int __init early_init_dt_scan_cpus(unsigned long node, const char *uname, int depth, void *data) { - char *type = of_get_flat_dt_prop(node, "device_type", NULL); - const u32 *prop; - const u32 *intserv; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + const __be32 *intserv; int i, nthreads; - unsigned long len; + int len; int found = -1; int found_thread = 0; @@ -324,8 +325,9 @@ static int __init early_init_dt_scan_cpus(unsigned long node, * version 2 of the kexec param format adds the phys cpuid of * booted proc. */ - if (initial_boot_params->version >= 2) { - if (intserv[i] == initial_boot_params->boot_cpuid_phys) { + if (fdt_version(initial_boot_params) >= 2) { + if (be32_to_cpu(intserv[i]) == + fdt_boot_cpuid_phys(initial_boot_params)) { found = boot_cpu_count; found_thread = i; } @@ -345,51 +347,52 @@ static int __init early_init_dt_scan_cpus(unsigned long node, #endif } - if (found >= 0) { - DBG("boot cpu: logical %d physical %d\n", found, - intserv[found_thread]); - boot_cpuid = found; - set_hard_smp_processor_id(found, intserv[found_thread]); + /* Not the boot CPU */ + if (found < 0) + return 0; - /* - * PAPR defines "logical" PVR values for cpus that - * meet various levels of the architecture: - * 0x0f000001 Architecture version 2.04 - * 0x0f000002 Architecture version 2.05 - * If the cpu-version property in the cpu node contains - * such a value, we call identify_cpu again with the - * logical PVR value in order to use the cpu feature - * bits appropriate for the architecture level. - * - * A POWER6 partition in "POWER6 architected" mode - * uses the 0x0f000002 PVR value; in POWER5+ mode - * it uses 0x0f000001. - */ - prop = of_get_flat_dt_prop(node, "cpu-version", NULL); - if (prop && (*prop & 0xff000000) == 0x0f000000) - identify_cpu(0, *prop); + DBG("boot cpu: logical %d physical %d\n", found, + be32_to_cpu(intserv[found_thread])); + boot_cpuid = found; + set_hard_smp_processor_id(found, be32_to_cpu(intserv[found_thread])); - identical_pvr_fixup(node); - } + /* + * PAPR defines "logical" PVR values for cpus that + * meet various levels of the architecture: + * 0x0f000001 Architecture version 2.04 + * 0x0f000002 Architecture version 2.05 + * If the cpu-version property in the cpu node contains + * such a value, we call identify_cpu again with the + * logical PVR value in order to use the cpu feature + * bits appropriate for the architecture level. + * + * A POWER6 partition in "POWER6 architected" mode + * uses the 0x0f000002 PVR value; in POWER5+ mode + * it uses 0x0f000001. + */ + prop = of_get_flat_dt_prop(node, "cpu-version", NULL); + if (prop && (be32_to_cpup(prop) & 0xff000000) == 0x0f000000) + identify_cpu(0, be32_to_cpup(prop)); + + identical_pvr_fixup(node); check_cpu_feature_properties(node); check_cpu_pa_features(node); check_cpu_slb_size(node); -#ifdef CONFIG_PPC_PSERIES +#ifdef CONFIG_PPC64 if (nthreads > 1) cur_cpu_spec->cpu_features |= CPU_FTR_SMT; else cur_cpu_spec->cpu_features &= ~CPU_FTR_SMT; #endif - return 0; } int __init early_init_dt_scan_chosen_ppc(unsigned long node, const char *uname, int depth, void *data) { - unsigned long *lprop; + const unsigned long *lprop; /* All these set by kernel, so no need to convert endian */ /* Use common scan routine to determine if this is the chosen node */ if (early_init_dt_scan_chosen(node, uname, depth, data) == 0) @@ -440,8 +443,9 @@ int __init early_init_dt_scan_chosen_ppc(unsigned long node, const char *uname, */ static int __init early_init_dt_scan_drconf_memory(unsigned long node) { - __be32 *dm, *ls, *usm; - unsigned long l, n, flags; + const __be32 *dm, *ls, *usm; + int l; + unsigned long n, flags; u64 base, size, memblock_size; unsigned int is_kexec_kdump = 0, rngs; @@ -454,7 +458,7 @@ static int __init early_init_dt_scan_drconf_memory(unsigned long node) if (dm == NULL || l < sizeof(__be32)) return 0; - n = *dm++; /* number of entries */ + n = of_read_number(dm++, 1); /* number of entries */ if (l < (n * (dt_root_addr_cells + 4) + 1) * sizeof(__be32)) return 0; @@ -466,7 +470,7 @@ static int __init early_init_dt_scan_drconf_memory(unsigned long node) for (; n != 0; --n) { base = dt_mem_next_cell(dt_root_addr_cells, &dm); - flags = dm[3]; + flags = of_read_number(&dm[3], 1); /* skip DRC index, pad, assoc. list index, flags */ dm += 4; /* skip this block if the reserved bit is set in flags (0x80) @@ -521,6 +525,20 @@ static int __init early_init_dt_scan_memory_ppc(unsigned long node, return early_init_dt_scan_memory(node, uname, depth, data); } +/* + * For a relocatable kernel, we need to get the memstart_addr first, + * then use it to calculate the virtual kernel start address. This has + * to happen at a very early stage (before machine_init). In this case, + * we just want to get the memstart_address and would not like to mess the + * memblock at this stage. So introduce a variable to skip the memblock_add() + * for this reason. + */ +#ifdef CONFIG_RELOCATABLE +static int add_mem_to_memblock = 1; +#else +#define add_mem_to_memblock 1 +#endif + void __init early_init_dt_add_memory_arch(u64 base, u64 size) { #ifdef CONFIG_PPC64 @@ -541,45 +559,59 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) } /* Add the chunk to the MEMBLOCK list */ - memblock_add(base, size); + if (add_mem_to_memblock) + memblock_add(base, size); } -void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) +static void __init early_reserve_mem_dt(void) { - return __va(memblock_alloc(size, align)); -} + unsigned long i, dt_root; + int len; + const __be32 *prop; -#ifdef CONFIG_BLK_DEV_INITRD -void __init early_init_dt_setup_initrd_arch(unsigned long start, - unsigned long end) -{ - initrd_start = (unsigned long)__va(start); - initrd_end = (unsigned long)__va(end); - initrd_below_start_ok = 1; + early_init_fdt_scan_reserved_mem(); + + dt_root = of_get_flat_dt_root(); + + prop = of_get_flat_dt_prop(dt_root, "reserved-ranges", &len); + + if (!prop) + return; + + DBG("Found new-style reserved-ranges\n"); + + /* Each reserved range is an (address,size) pair, 2 cells each, + * totalling 4 cells per range. */ + for (i = 0; i < len / (sizeof(*prop) * 4); i++) { + u64 base, size; + + base = of_read_number(prop + (i * 4) + 0, 2); + size = of_read_number(prop + (i * 4) + 2, 2); + + if (size) { + DBG("reserving: %llx -> %llx\n", base, size); + memblock_reserve(base, size); + } + } } -#endif static void __init early_reserve_mem(void) { - u64 base, size; - u64 *reserve_map; - unsigned long self_base; - unsigned long self_size; + __be64 *reserve_map; - reserve_map = (u64 *)(((unsigned long)initial_boot_params) + - initial_boot_params->off_mem_rsvmap); + reserve_map = (__be64 *)(((unsigned long)initial_boot_params) + + fdt_off_mem_rsvmap(initial_boot_params)); - /* before we do anything, lets reserve the dt blob */ - self_base = __pa((unsigned long)initial_boot_params); - self_size = initial_boot_params->totalsize; - memblock_reserve(self_base, self_size); + /* Look for the new "reserved-regions" property in the DT */ + early_reserve_mem_dt(); #ifdef CONFIG_BLK_DEV_INITRD - /* then reserve the initrd, if any */ - if (initrd_start && (initrd_end > initrd_start)) + /* Then reserve the initrd, if any */ + if (initrd_start && (initrd_end > initrd_start)) { memblock_reserve(_ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE), _ALIGN_UP(initrd_end, PAGE_SIZE) - _ALIGN_DOWN(initrd_start, PAGE_SIZE)); + } #endif /* CONFIG_BLK_DEV_INITRD */ #ifdef CONFIG_PPC32 @@ -587,32 +619,23 @@ static void __init early_reserve_mem(void) * Handle the case where we might be booting from an old kexec * image that setup the mem_rsvmap as pairs of 32-bit values */ - if (*reserve_map > 0xffffffffull) { + if (be64_to_cpup(reserve_map) > 0xffffffffull) { u32 base_32, size_32; - u32 *reserve_map_32 = (u32 *)reserve_map; + __be32 *reserve_map_32 = (__be32 *)reserve_map; + + DBG("Found old 32-bit reserve map\n"); while (1) { - base_32 = *(reserve_map_32++); - size_32 = *(reserve_map_32++); + base_32 = be32_to_cpup(reserve_map_32++); + size_32 = be32_to_cpup(reserve_map_32++); if (size_32 == 0) break; - /* skip if the reservation is for the blob */ - if (base_32 == self_base && size_32 == self_size) - continue; DBG("reserving: %x -> %x\n", base_32, size_32); memblock_reserve(base_32, size_32); } return; } #endif - while (1) { - base = *(reserve_map++); - size = *(reserve_map++); - if (size == 0) - break; - DBG("reserving: %llx -> %llx\n", base, size); - memblock_reserve(base, size); - } } void __init early_init_devtree(void *params) @@ -639,13 +662,6 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL); #endif - /* Pre-initialize the cmd_line with the content of boot_commmand_line, - * which will be empty except when the content of the variable has - * been overriden by a bootloading mechanism. This happens typically - * with HAL takeover - */ - strlcpy(cmd_line, boot_command_line, COMMAND_LINE_SIZE); - /* Retrieve various informations from the /chosen node of the * device-tree, including the platform type, initrd location and * size, TCE reserve, and more ... @@ -704,6 +720,10 @@ void __init early_init_devtree(void *params) * (altivec support, boot CPU ID, ...) */ of_scan_flat_dt(early_init_dt_scan_cpus, NULL); + if (boot_cpuid < 0) { + printk("Failed to indentify boot CPU !\n"); + BUG(); + } #if defined(CONFIG_SMP) && defined(CONFIG_PPC64) /* We'll later wait for secondaries to check in; there are @@ -712,9 +732,38 @@ void __init early_init_devtree(void *params) spinning_secondaries = boot_cpu_count - 1; #endif +#ifdef CONFIG_PPC_POWERNV + /* Scan and build the list of machine check recoverable ranges */ + of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL); +#endif + DBG(" <- early_init_devtree()\n"); } +#ifdef CONFIG_RELOCATABLE +/* + * This function run before early_init_devtree, so we have to init + * initial_boot_params. + */ +void __init early_get_first_memblock_info(void *params, phys_addr_t *size) +{ + /* Setup flat device-tree pointer */ + initial_boot_params = params; + + /* + * Scan the memory nodes and set add_mem_to_memblock to 0 to avoid + * mess the memblock. + */ + add_mem_to_memblock = 0; + of_scan_flat_dt(early_init_dt_scan_root, NULL); + of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL); + add_mem_to_memblock = 1; + + if (size) + *size = first_memblock_size; +} +#endif + /******* * * New implementation of the OF "find" APIs, return a refcounted @@ -727,35 +776,50 @@ void __init early_init_devtree(void *params) *******/ /** - * of_find_next_cache_node - Find a node's subsidiary cache - * @np: node of type "cpu" or "cache" + * of_get_ibm_chip_id - Returns the IBM "chip-id" of a device + * @np: device node of the device * - * Returns a node pointer with refcount incremented, use - * of_node_put() on it when done. Caller should hold a reference - * to np. + * This looks for a property "ibm,chip-id" in the node or any + * of its parents and returns its content, or -1 if it cannot + * be found. */ -struct device_node *of_find_next_cache_node(struct device_node *np) +int of_get_ibm_chip_id(struct device_node *np) { - struct device_node *child; - const phandle *handle; - - handle = of_get_property(np, "l2-cache", NULL); - if (!handle) - handle = of_get_property(np, "next-level-cache", NULL); + of_node_get(np); + while(np) { + struct device_node *old = np; + const __be32 *prop; + + prop = of_get_property(np, "ibm,chip-id", NULL); + if (prop) { + of_node_put(np); + return be32_to_cpup(prop); + } + np = of_get_parent(np); + of_node_put(old); + } + return -1; +} - if (handle) - return of_find_node_by_phandle(*handle); +/** + * cpu_to_chip_id - Return the cpus chip-id + * @cpu: The logical cpu number. + * + * Return the value of the ibm,chip-id property corresponding to the given + * logical cpu number. If the chip-id can not be found, returns -1. + */ +int cpu_to_chip_id(int cpu) +{ + struct device_node *np; - /* OF on pmac has nodes instead of properties named "l2-cache" - * beneath CPU nodes. - */ - if (!strcmp(np->type, "cpu")) - for_each_child_of_node(np, child) - if (!strcmp(child->type, "cache")) - return child; + np = of_get_cpu_node(cpu, NULL); + if (!np) + return -1; - return NULL; + of_node_put(np); + return of_get_ibm_chip_id(np); } +EXPORT_SYMBOL(cpu_to_chip_id); #ifdef CONFIG_PPC_PSERIES /* @@ -827,66 +891,7 @@ static int __init prom_reconfig_setup(void) __initcall(prom_reconfig_setup); #endif -/* Find the device node for a given logical cpu number, also returns the cpu - * local thread number (index in ibm,interrupt-server#s) if relevant and - * asked for (non NULL) - */ -struct device_node *of_get_cpu_node(int cpu, unsigned int *thread) +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) { - int hardid; - struct device_node *np; - - hardid = get_hard_smp_processor_id(cpu); - - for_each_node_by_type(np, "cpu") { - const u32 *intserv; - unsigned int plen, t; - - /* Check for ibm,ppc-interrupt-server#s. If it doesn't exist - * fallback to "reg" property and assume no threads - */ - intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", - &plen); - if (intserv == NULL) { - const u32 *reg = of_get_property(np, "reg", NULL); - if (reg == NULL) - continue; - if (*reg == hardid) { - if (thread) - *thread = 0; - return np; - } - } else { - plen /= sizeof(u32); - for (t = 0; t < plen; t++) { - if (hardid == intserv[t]) { - if (thread) - *thread = t; - return np; - } - } - } - } - return NULL; + return (int)phys_id == get_hard_smp_processor_id(cpu); } -EXPORT_SYMBOL(of_get_cpu_node); - -#if defined(CONFIG_DEBUG_FS) && defined(DEBUG) -static struct debugfs_blob_wrapper flat_dt_blob; - -static int __init export_flat_device_tree(void) -{ - struct dentry *d; - - flat_dt_blob.data = initial_boot_params; - flat_dt_blob.size = initial_boot_params->totalsize; - - d = debugfs_create_blob("flat-device-tree", S_IFREG | S_IRUSR, - powerpc_debugfs_root, &flat_dt_blob); - if (!d) - return 1; - - return 0; -} -__initcall(export_flat_device_tree); -#endif diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 779f34049a5..1a85d8f9673 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -66,8 +66,8 @@ * is running at whatever address it has been loaded at. * On ppc32 we compile with -mrelocatable, which means that references * to extern and static variables get relocated automatically. - * On ppc64 we have to relocate the references explicitly with - * RELOC. (Note that strings count as static variables.) + * ppc64 objects are always relocatable, we just need to relocate the + * TOC. * * Because OF may have mapped I/O devices into the area starting at * KERNELBASE, particularly on CHRP machines, we can't safely call @@ -79,13 +79,11 @@ * On ppc64, 64 bit values are truncated to 32 bits (and * fortunately don't get interpreted as two arguments). */ +#define ADDR(x) (u32)(unsigned long)(x) + #ifdef CONFIG_PPC64 -#define RELOC(x) (*PTRRELOC(&(x))) -#define ADDR(x) (u32) add_reloc_offset((unsigned long)(x)) #define OF_WORKAROUNDS 0 #else -#define RELOC(x) (x) -#define ADDR(x) (u32) (x) #define OF_WORKAROUNDS of_workarounds int of_workarounds; #endif @@ -95,7 +93,7 @@ int of_workarounds; #define PROM_BUG() do { \ prom_printf("kernel BUG at %s line 0x%x!\n", \ - RELOC(__FILE__), __LINE__); \ + __FILE__, __LINE__); \ __asm__ __volatile__(".long " BUG_ILLEGAL_INSTR); \ } while (0) @@ -109,10 +107,10 @@ int of_workarounds; typedef u32 prom_arg_t; struct prom_args { - u32 service; - u32 nargs; - u32 nret; - prom_arg_t args[10]; + __be32 service; + __be32 nargs; + __be32 nret; + __be32 args[10]; }; struct prom_t { @@ -125,11 +123,11 @@ struct prom_t { }; struct mem_map_entry { - u64 base; - u64 size; + __be64 base; + __be64 size; }; -typedef u32 cell_t; +typedef __be32 cell_t; extern void __start(unsigned long r3, unsigned long r4, unsigned long r5, unsigned long r6, unsigned long r7, unsigned long r8, @@ -198,6 +196,8 @@ static int __initdata mem_reserve_cnt; static cell_t __initdata regbuf[1024]; +static bool rtas_has_query_cpu_stopped; + /* * Error results ... some OF calls will return "-1" on error, some @@ -221,22 +221,22 @@ static int __init call_prom(const char *service, int nargs, int nret, ...) struct prom_args args; va_list list; - args.service = ADDR(service); - args.nargs = nargs; - args.nret = nret; + args.service = cpu_to_be32(ADDR(service)); + args.nargs = cpu_to_be32(nargs); + args.nret = cpu_to_be32(nret); va_start(list, nret); for (i = 0; i < nargs; i++) - args.args[i] = va_arg(list, prom_arg_t); + args.args[i] = cpu_to_be32(va_arg(list, prom_arg_t)); va_end(list); for (i = 0; i < nret; i++) args.args[nargs+i] = 0; - if (enter_prom(&args, RELOC(prom_entry)) < 0) + if (enter_prom(&args, prom_entry) < 0) return PROM_ERROR; - return (nret > 0) ? args.args[nargs] : 0; + return (nret > 0) ? be32_to_cpu(args.args[nargs]) : 0; } static int __init call_prom_ret(const char *service, int nargs, int nret, @@ -246,46 +246,45 @@ static int __init call_prom_ret(const char *service, int nargs, int nret, struct prom_args args; va_list list; - args.service = ADDR(service); - args.nargs = nargs; - args.nret = nret; + args.service = cpu_to_be32(ADDR(service)); + args.nargs = cpu_to_be32(nargs); + args.nret = cpu_to_be32(nret); va_start(list, rets); for (i = 0; i < nargs; i++) - args.args[i] = va_arg(list, prom_arg_t); + args.args[i] = cpu_to_be32(va_arg(list, prom_arg_t)); va_end(list); for (i = 0; i < nret; i++) args.args[nargs+i] = 0; - if (enter_prom(&args, RELOC(prom_entry)) < 0) + if (enter_prom(&args, prom_entry) < 0) return PROM_ERROR; if (rets != NULL) for (i = 1; i < nret; ++i) - rets[i-1] = args.args[nargs+i]; + rets[i-1] = be32_to_cpu(args.args[nargs+i]); - return (nret > 0) ? args.args[nargs] : 0; + return (nret > 0) ? be32_to_cpu(args.args[nargs]) : 0; } static void __init prom_print(const char *msg) { const char *p, *q; - struct prom_t *_prom = &RELOC(prom); - if (_prom->stdout == 0) + if (prom.stdout == 0) return; for (p = msg; *p != 0; p = q) { for (q = p; *q != 0 && *q != '\n'; ++q) ; if (q > p) - call_prom("write", 3, 1, _prom->stdout, p, q - p); + call_prom("write", 3, 1, prom.stdout, p, q - p); if (*q == 0) break; ++q; - call_prom("write", 3, 1, _prom->stdout, ADDR("\r\n"), 2); + call_prom("write", 3, 1, prom.stdout, ADDR("\r\n"), 2); } } @@ -294,7 +293,6 @@ static void __init prom_print_hex(unsigned long val) { int i, nibbles = sizeof(val)*2; char buf[sizeof(val)*2+1]; - struct prom_t *_prom = &RELOC(prom); for (i = nibbles-1; i >= 0; i--) { buf[i] = (val & 0xf) + '0'; @@ -303,7 +301,7 @@ static void __init prom_print_hex(unsigned long val) val >>= 4; } buf[nibbles] = '\0'; - call_prom("write", 3, 1, _prom->stdout, buf, nibbles); + call_prom("write", 3, 1, prom.stdout, buf, nibbles); } /* max number of decimal digits in an unsigned long */ @@ -312,7 +310,6 @@ static void __init prom_print_dec(unsigned long val) { int i, size; char buf[UL_DIGITS+1]; - struct prom_t *_prom = &RELOC(prom); for (i = UL_DIGITS-1; i >= 0; i--) { buf[i] = (val % 10) + '0'; @@ -322,7 +319,7 @@ static void __init prom_print_dec(unsigned long val) } /* shift stuff down */ size = UL_DIGITS - i; - call_prom("write", 3, 1, _prom->stdout, buf+i, size); + call_prom("write", 3, 1, prom.stdout, buf+i, size); } static void __init prom_printf(const char *format, ...) @@ -331,22 +328,18 @@ static void __init prom_printf(const char *format, ...) va_list args; unsigned long v; long vs; - struct prom_t *_prom = &RELOC(prom); va_start(args, format); -#ifdef CONFIG_PPC64 - format = PTRRELOC(format); -#endif for (p = format; *p != 0; p = q) { for (q = p; *q != 0 && *q != '\n' && *q != '%'; ++q) ; if (q > p) - call_prom("write", 3, 1, _prom->stdout, p, q - p); + call_prom("write", 3, 1, prom.stdout, p, q - p); if (*q == 0) break; if (*q == '\n') { ++q; - call_prom("write", 3, 1, _prom->stdout, + call_prom("write", 3, 1, prom.stdout, ADDR("\r\n"), 2); continue; } @@ -368,7 +361,7 @@ static void __init prom_printf(const char *format, ...) ++q; vs = va_arg(args, int); if (vs < 0) { - prom_print(RELOC("-")); + prom_print("-"); vs = -vs; } prom_print_dec(vs); @@ -389,7 +382,7 @@ static void __init prom_printf(const char *format, ...) ++q; vs = va_arg(args, long); if (vs < 0) { - prom_print(RELOC("-")); + prom_print("-"); vs = -vs; } prom_print_dec(vs); @@ -403,7 +396,6 @@ static void __init prom_printf(const char *format, ...) static unsigned int __init prom_claim(unsigned long virt, unsigned long size, unsigned long align) { - struct prom_t *_prom = &RELOC(prom); if (align == 0 && (OF_WORKAROUNDS & OF_WA_CLAIM)) { /* @@ -414,21 +406,21 @@ static unsigned int __init prom_claim(unsigned long virt, unsigned long size, prom_arg_t result; ret = call_prom_ret("call-method", 5, 2, &result, - ADDR("claim"), _prom->memory, + ADDR("claim"), prom.memory, align, size, virt); if (ret != 0 || result == -1) return -1; ret = call_prom_ret("call-method", 5, 2, &result, - ADDR("claim"), _prom->mmumap, + ADDR("claim"), prom.mmumap, align, size, virt); if (ret != 0) { call_prom("call-method", 4, 1, ADDR("release"), - _prom->memory, size, virt); + prom.memory, size, virt); return -1; } /* the 0x12 is M (coherence) + PP == read/write */ call_prom("call-method", 6, 1, - ADDR("map"), _prom->mmumap, 0x12, size, virt, virt); + ADDR("map"), prom.mmumap, 0x12, size, virt, virt); return virt; } return call_prom("claim", 3, 1, (prom_arg_t)virt, (prom_arg_t)size, @@ -437,13 +429,10 @@ static unsigned int __init prom_claim(unsigned long virt, unsigned long size, static void __init __attribute__((noreturn)) prom_panic(const char *reason) { -#ifdef CONFIG_PPC64 - reason = PTRRELOC(reason); -#endif prom_print(reason); /* Do not call exit because it clears the screen on pmac * it also causes some sort of double-fault on early pmacs */ - if (RELOC(of_platform) == PLATFORM_POWERMAC) + if (of_platform == PLATFORM_POWERMAC) asm("trap\n"); /* ToDo: should put up an SRC here on pSeries */ @@ -525,13 +514,13 @@ static int __init prom_setprop(phandle node, const char *nodename, add_string(&p, tohex((u32)(unsigned long) value)); add_string(&p, tohex(valuelen)); add_string(&p, tohex(ADDR(pname))); - add_string(&p, tohex(strlen(RELOC(pname)))); + add_string(&p, tohex(strlen(pname))); add_string(&p, "property"); *p = 0; return call_prom("interpret", 1, 1, (u32)(unsigned long) cmd); } -/* We can't use the standard versions because of RELOC headaches. */ +/* We can't use the standard versions because of relocation headaches. */ #define isxdigit(c) (('0' <= (c) && (c) <= '9') \ || ('a' <= (c) && (c) <= 'f') \ || ('A' <= (c) && (c) <= 'F')) @@ -540,7 +529,7 @@ static int __init prom_setprop(phandle node, const char *nodename, #define islower(c) ('a' <= (c) && (c) <= 'z') #define toupper(c) (islower(c) ? ((c) - 'a' + 'A') : (c)) -unsigned long prom_strtoul(const char *cp, const char **endp) +static unsigned long prom_strtoul(const char *cp, const char **endp) { unsigned long result = 0, base = 10, value; @@ -565,7 +554,7 @@ unsigned long prom_strtoul(const char *cp, const char **endp) return result; } -unsigned long prom_memparse(const char *ptr, const char **retptr) +static unsigned long prom_memparse(const char *ptr, const char **retptr) { unsigned long ret = prom_strtoul(ptr, retptr); int shift = 0; @@ -598,59 +587,53 @@ unsigned long prom_memparse(const char *ptr, const char **retptr) */ static void __init early_cmdline_parse(void) { - struct prom_t *_prom = &RELOC(prom); const char *opt; char *p; int l = 0; - RELOC(prom_cmd_line[0]) = 0; - p = RELOC(prom_cmd_line); - if ((long)_prom->chosen > 0) - l = prom_getprop(_prom->chosen, "bootargs", p, COMMAND_LINE_SIZE-1); + prom_cmd_line[0] = 0; + p = prom_cmd_line; + if ((long)prom.chosen > 0) + l = prom_getprop(prom.chosen, "bootargs", p, COMMAND_LINE_SIZE-1); #ifdef CONFIG_CMDLINE if (l <= 0 || p[0] == '\0') /* dbl check */ - strlcpy(RELOC(prom_cmd_line), - RELOC(CONFIG_CMDLINE), sizeof(prom_cmd_line)); + strlcpy(prom_cmd_line, + CONFIG_CMDLINE, sizeof(prom_cmd_line)); #endif /* CONFIG_CMDLINE */ - prom_printf("command line: %s\n", RELOC(prom_cmd_line)); + prom_printf("command line: %s\n", prom_cmd_line); #ifdef CONFIG_PPC64 - opt = strstr(RELOC(prom_cmd_line), RELOC("iommu=")); + opt = strstr(prom_cmd_line, "iommu="); if (opt) { prom_printf("iommu opt is: %s\n", opt); opt += 6; while (*opt && *opt == ' ') opt++; - if (!strncmp(opt, RELOC("off"), 3)) - RELOC(prom_iommu_off) = 1; - else if (!strncmp(opt, RELOC("force"), 5)) - RELOC(prom_iommu_force_on) = 1; + if (!strncmp(opt, "off", 3)) + prom_iommu_off = 1; + else if (!strncmp(opt, "force", 5)) + prom_iommu_force_on = 1; } #endif - opt = strstr(RELOC(prom_cmd_line), RELOC("mem=")); + opt = strstr(prom_cmd_line, "mem="); if (opt) { opt += 4; - RELOC(prom_memory_limit) = prom_memparse(opt, (const char **)&opt); + prom_memory_limit = prom_memparse(opt, (const char **)&opt); #ifdef CONFIG_PPC64 /* Align to 16 MB == size of ppc64 large page */ - RELOC(prom_memory_limit) = ALIGN(RELOC(prom_memory_limit), 0x1000000); + prom_memory_limit = ALIGN(prom_memory_limit, 0x1000000); #endif } } #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* - * There are two methods for telling firmware what our capabilities are. - * Newer machines have an "ibm,client-architecture-support" method on the - * root node. For older machines, we have to call the "process-elf-header" - * method in the /packages/elf-loader node, passing it a fake 32-bit - * ELF header containing a couple of PT_NOTE sections that contain - * structures that contain various information. - */ - -/* - * New method - extensible architecture description vector. + * The architecture vector has an array of PVR mask/value pairs, + * followed by # option vectors - 1, followed by the option vectors. + * + * See prom.h for the definition of the bits specified in the + * architecture vector. * * Because the description vector contains a mix of byte and word * values, we declare it as an unsigned char array, and use this @@ -659,69 +642,12 @@ static void __init early_cmdline_parse(void) #define W(x) ((x) >> 24) & 0xff, ((x) >> 16) & 0xff, \ ((x) >> 8) & 0xff, (x) & 0xff -/* Option vector bits - generic bits in byte 1 */ -#define OV_IGNORE 0x80 /* ignore this vector */ -#define OV_CESSATION_POLICY 0x40 /* halt if unsupported option present*/ - -/* Option vector 1: processor architectures supported */ -#define OV1_PPC_2_00 0x80 /* set if we support PowerPC 2.00 */ -#define OV1_PPC_2_01 0x40 /* set if we support PowerPC 2.01 */ -#define OV1_PPC_2_02 0x20 /* set if we support PowerPC 2.02 */ -#define OV1_PPC_2_03 0x10 /* set if we support PowerPC 2.03 */ -#define OV1_PPC_2_04 0x08 /* set if we support PowerPC 2.04 */ -#define OV1_PPC_2_05 0x04 /* set if we support PowerPC 2.05 */ -#define OV1_PPC_2_06 0x02 /* set if we support PowerPC 2.06 */ -#define OV1_PPC_2_07 0x01 /* set if we support PowerPC 2.07 */ - -/* Option vector 2: Open Firmware options supported */ -#define OV2_REAL_MODE 0x20 /* set if we want OF in real mode */ - -/* Option vector 3: processor options supported */ -#define OV3_FP 0x80 /* floating point */ -#define OV3_VMX 0x40 /* VMX/Altivec */ -#define OV3_DFP 0x20 /* decimal FP */ - -/* Option vector 4: IBM PAPR implementation */ -#define OV4_MIN_ENT_CAP 0x01 /* minimum VP entitled capacity */ - -/* Option vector 5: PAPR/OF options supported */ -#define OV5_LPAR 0x80 /* logical partitioning supported */ -#define OV5_SPLPAR 0x40 /* shared-processor LPAR supported */ -/* ibm,dynamic-reconfiguration-memory property supported */ -#define OV5_DRCONF_MEMORY 0x20 -#define OV5_LARGE_PAGES 0x10 /* large pages supported */ -#define OV5_DONATE_DEDICATE_CPU 0x02 /* donate dedicated CPU support */ -/* PCIe/MSI support. Without MSI full PCIe is not supported */ -#ifdef CONFIG_PCI_MSI -#define OV5_MSI 0x01 /* PCIe/MSI support */ -#else -#define OV5_MSI 0x00 -#endif /* CONFIG_PCI_MSI */ -#ifdef CONFIG_PPC_SMLPAR -#define OV5_CMO 0x80 /* Cooperative Memory Overcommitment */ -#define OV5_XCMO 0x40 /* Page Coalescing */ -#else -#define OV5_CMO 0x00 -#define OV5_XCMO 0x00 -#endif -#define OV5_TYPE1_AFFINITY 0x80 /* Type 1 NUMA affinity */ -#define OV5_PFO_HW_RNG 0x80 /* PFO Random Number Generator */ -#define OV5_PFO_HW_842 0x40 /* PFO Compression Accelerator */ -#define OV5_PFO_HW_ENCR 0x20 /* PFO Encryption Accelerator */ -#define OV5_SUB_PROCESSORS 0x01 /* 1,2,or 4 Sub-Processors supported */ - -/* Option Vector 6: IBM PAPR hints */ -#define OV6_LINUX 0x02 /* Linux is our OS */ - -/* - * The architecture vector has an array of PVR mask/value pairs, - * followed by # option vectors - 1, followed by the option vectors. - */ -static unsigned char ibm_architecture_vec[] = { +unsigned char ibm_architecture_vec[] = { W(0xfffe0000), W(0x003a0000), /* POWER5/POWER5+ */ W(0xffff0000), W(0x003e0000), /* POWER6 */ W(0xffff0000), W(0x003f0000), /* POWER7 */ - W(0xffff0000), W(0x004b0000), /* POWER8 */ + W(0xffff0000), W(0x004b0000), /* POWER8E */ + W(0xffff0000), W(0x004d0000), /* POWER8 */ W(0xffffffff), W(0x0f000004), /* all 2.07-compliant */ W(0xffffffff), W(0x0f000003), /* all 2.06-compliant */ W(0xffffffff), W(0x0f000002), /* all 2.05-compliant */ @@ -761,11 +687,21 @@ static unsigned char ibm_architecture_vec[] = { /* option vector 5: PAPR/OF options */ 19 - 2, /* length */ 0, /* don't ignore, don't halt */ - OV5_LPAR | OV5_SPLPAR | OV5_LARGE_PAGES | OV5_DRCONF_MEMORY | - OV5_DONATE_DEDICATE_CPU | OV5_MSI, + OV5_FEAT(OV5_LPAR) | OV5_FEAT(OV5_SPLPAR) | OV5_FEAT(OV5_LARGE_PAGES) | + OV5_FEAT(OV5_DRCONF_MEMORY) | OV5_FEAT(OV5_DONATE_DEDICATE_CPU) | +#ifdef CONFIG_PCI_MSI + /* PCIe/MSI support. Without MSI full PCIe is not supported */ + OV5_FEAT(OV5_MSI), +#else + 0, +#endif 0, - OV5_CMO | OV5_XCMO, - OV5_TYPE1_AFFINITY, +#ifdef CONFIG_PPC_SMLPAR + OV5_FEAT(OV5_CMO) | OV5_FEAT(OV5_XCMO), +#else + 0, +#endif + OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN), 0, 0, 0, @@ -773,14 +709,15 @@ static unsigned char ibm_architecture_vec[] = { * must match by the macro below. Update the definition if * the structure layout changes. */ -#define IBM_ARCH_VEC_NRCORES_OFFSET 117 +#define IBM_ARCH_VEC_NRCORES_OFFSET 125 W(NR_CPUS), /* number of cores supported */ 0, 0, 0, 0, - OV5_PFO_HW_RNG | OV5_PFO_HW_ENCR | OV5_PFO_HW_842, - OV5_SUB_PROCESSORS, + OV5_FEAT(OV5_PFO_HW_RNG) | OV5_FEAT(OV5_PFO_HW_ENCR) | + OV5_FEAT(OV5_PFO_HW_842), + OV5_FEAT(OV5_SUB_PROCESSORS), /* option vector 6: IBM PAPR hints */ 4 - 2, /* length */ 0, @@ -789,7 +726,8 @@ static unsigned char ibm_architecture_vec[] = { }; -/* Old method - ELF header with PT_NOTE sections */ +/* Old method - ELF header with PT_NOTE sections only works on BE */ +#ifdef __BIG_ENDIAN__ static struct fake_elf { Elf32_Ehdr elfhdr; Elf32_Phdr phdr[2]; @@ -875,6 +813,7 @@ static struct fake_elf { } } }; +#endif /* __BIG_ENDIAN__ */ static int __init prom_count_smt_threads(void) { @@ -887,7 +826,7 @@ static int __init prom_count_smt_threads(void) type[0] = 0; prom_getprop(node, "device_type", type, sizeof(type)); - if (strcmp(type, RELOC("cpu"))) + if (strcmp(type, "cpu")) continue; /* * There is an entry for each smt thread, each entry being @@ -917,9 +856,10 @@ static int __init prom_count_smt_threads(void) static void __init prom_send_capabilities(void) { - ihandle elfloader, root; + ihandle root; prom_arg_t ret; - u32 *cores; + u32 cores; + unsigned char *ptcores; root = call_prom("open", 1, 1, ADDR("/")); if (root != 0) { @@ -929,15 +869,30 @@ static void __init prom_send_capabilities(void) * (we assume this is the same for all cores) and use it to * divide NR_CPUS. */ - cores = (u32 *)PTRRELOC(&ibm_architecture_vec[IBM_ARCH_VEC_NRCORES_OFFSET]); - if (*cores != NR_CPUS) { + + /* The core value may start at an odd address. If such a word + * access is made at a cache line boundary, this leads to an + * exception which may not be handled at this time. + * Forcing a per byte access to avoid exception. + */ + ptcores = &ibm_architecture_vec[IBM_ARCH_VEC_NRCORES_OFFSET]; + cores = 0; + cores |= ptcores[0] << 24; + cores |= ptcores[1] << 16; + cores |= ptcores[2] << 8; + cores |= ptcores[3]; + if (cores != NR_CPUS) { prom_printf("WARNING ! " "ibm_architecture_vec structure inconsistent: %lu!\n", - *cores); + cores); } else { - *cores = DIV_ROUND_UP(NR_CPUS, prom_count_smt_threads()); + cores = DIV_ROUND_UP(NR_CPUS, prom_count_smt_threads()); prom_printf("Max number of cores passed to firmware: %lu (NR_CPUS = %lu)\n", - *cores, NR_CPUS); + cores, NR_CPUS); + ptcores[0] = (cores >> 24) & 0xff; + ptcores[1] = (cores >> 16) & 0xff; + ptcores[2] = (cores >> 8) & 0xff; + ptcores[3] = cores & 0xff; } /* try calling the ibm,client-architecture-support method */ @@ -958,17 +913,24 @@ static void __init prom_send_capabilities(void) prom_printf(" not implemented\n"); } - /* no ibm,client-architecture-support call, try the old way */ - elfloader = call_prom("open", 1, 1, ADDR("/packages/elf-loader")); - if (elfloader == 0) { - prom_printf("couldn't open /packages/elf-loader\n"); - return; +#ifdef __BIG_ENDIAN__ + { + ihandle elfloader; + + /* no ibm,client-architecture-support call, try the old way */ + elfloader = call_prom("open", 1, 1, + ADDR("/packages/elf-loader")); + if (elfloader == 0) { + prom_printf("couldn't open /packages/elf-loader\n"); + return; + } + call_prom("call-method", 3, 1, ADDR("process-elf-header"), + elfloader, ADDR(&fake_elf)); + call_prom("close", 1, 0, elfloader); } - call_prom("call-method", 3, 1, ADDR("process-elf-header"), - elfloader, ADDR(&fake_elf)); - call_prom("close", 1, 0, elfloader); +#endif /* __BIG_ENDIAN__ */ } -#endif +#endif /* #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ /* * Memory allocation strategy... our layout is normally: @@ -1005,21 +967,21 @@ static void __init prom_send_capabilities(void) */ static unsigned long __init alloc_up(unsigned long size, unsigned long align) { - unsigned long base = RELOC(alloc_bottom); + unsigned long base = alloc_bottom; unsigned long addr = 0; if (align) base = _ALIGN_UP(base, align); prom_debug("alloc_up(%x, %x)\n", size, align); - if (RELOC(ram_top) == 0) + if (ram_top == 0) prom_panic("alloc_up() called with mem not initialized\n"); if (align) - base = _ALIGN_UP(RELOC(alloc_bottom), align); + base = _ALIGN_UP(alloc_bottom, align); else - base = RELOC(alloc_bottom); + base = alloc_bottom; - for(; (base + size) <= RELOC(alloc_top); + for(; (base + size) <= alloc_top; base = _ALIGN_UP(base + 0x100000, align)) { prom_debug(" trying: 0x%x\n\r", base); addr = (unsigned long)prom_claim(base, size, 0); @@ -1031,14 +993,14 @@ static unsigned long __init alloc_up(unsigned long size, unsigned long align) } if (addr == 0) return 0; - RELOC(alloc_bottom) = addr + size; + alloc_bottom = addr + size; prom_debug(" -> %x\n", addr); - prom_debug(" alloc_bottom : %x\n", RELOC(alloc_bottom)); - prom_debug(" alloc_top : %x\n", RELOC(alloc_top)); - prom_debug(" alloc_top_hi : %x\n", RELOC(alloc_top_high)); - prom_debug(" rmo_top : %x\n", RELOC(rmo_top)); - prom_debug(" ram_top : %x\n", RELOC(ram_top)); + prom_debug(" alloc_bottom : %x\n", alloc_bottom); + prom_debug(" alloc_top : %x\n", alloc_top); + prom_debug(" alloc_top_hi : %x\n", alloc_top_high); + prom_debug(" rmo_top : %x\n", rmo_top); + prom_debug(" ram_top : %x\n", ram_top); return addr; } @@ -1054,32 +1016,32 @@ static unsigned long __init alloc_down(unsigned long size, unsigned long align, unsigned long base, addr = 0; prom_debug("alloc_down(%x, %x, %s)\n", size, align, - highmem ? RELOC("(high)") : RELOC("(low)")); - if (RELOC(ram_top) == 0) + highmem ? "(high)" : "(low)"); + if (ram_top == 0) prom_panic("alloc_down() called with mem not initialized\n"); if (highmem) { /* Carve out storage for the TCE table. */ - addr = _ALIGN_DOWN(RELOC(alloc_top_high) - size, align); - if (addr <= RELOC(alloc_bottom)) + addr = _ALIGN_DOWN(alloc_top_high - size, align); + if (addr <= alloc_bottom) return 0; /* Will we bump into the RMO ? If yes, check out that we * didn't overlap existing allocations there, if we did, * we are dead, we must be the first in town ! */ - if (addr < RELOC(rmo_top)) { + if (addr < rmo_top) { /* Good, we are first */ - if (RELOC(alloc_top) == RELOC(rmo_top)) - RELOC(alloc_top) = RELOC(rmo_top) = addr; + if (alloc_top == rmo_top) + alloc_top = rmo_top = addr; else return 0; } - RELOC(alloc_top_high) = addr; + alloc_top_high = addr; goto bail; } - base = _ALIGN_DOWN(RELOC(alloc_top) - size, align); - for (; base > RELOC(alloc_bottom); + base = _ALIGN_DOWN(alloc_top - size, align); + for (; base > alloc_bottom; base = _ALIGN_DOWN(base - 0x100000, align)) { prom_debug(" trying: 0x%x\n\r", base); addr = (unsigned long)prom_claim(base, size, 0); @@ -1089,15 +1051,15 @@ static unsigned long __init alloc_down(unsigned long size, unsigned long align, } if (addr == 0) return 0; - RELOC(alloc_top) = addr; + alloc_top = addr; bail: prom_debug(" -> %x\n", addr); - prom_debug(" alloc_bottom : %x\n", RELOC(alloc_bottom)); - prom_debug(" alloc_top : %x\n", RELOC(alloc_top)); - prom_debug(" alloc_top_hi : %x\n", RELOC(alloc_top_high)); - prom_debug(" rmo_top : %x\n", RELOC(rmo_top)); - prom_debug(" ram_top : %x\n", RELOC(ram_top)); + prom_debug(" alloc_bottom : %x\n", alloc_bottom); + prom_debug(" alloc_top : %x\n", alloc_top); + prom_debug(" alloc_top_hi : %x\n", alloc_top_high); + prom_debug(" rmo_top : %x\n", rmo_top); + prom_debug(" ram_top : %x\n", ram_top); return addr; } @@ -1115,11 +1077,11 @@ static unsigned long __init prom_next_cell(int s, cell_t **cellp) p++; s--; } - r = *p++; + r = be32_to_cpu(*p++); #ifdef CONFIG_PPC64 if (s > 1) { r <<= 32; - r |= *(p++); + r |= be32_to_cpu(*(p++)); } #endif *cellp = p; @@ -1137,7 +1099,7 @@ static unsigned long __init prom_next_cell(int s, cell_t **cellp) static void __init reserve_mem(u64 base, u64 size) { u64 top = base + size; - unsigned long cnt = RELOC(mem_reserve_cnt); + unsigned long cnt = mem_reserve_cnt; if (size == 0) return; @@ -1152,9 +1114,9 @@ static void __init reserve_mem(u64 base, u64 size) if (cnt >= (MEM_RESERVE_MAP_SIZE - 1)) prom_panic("Memory reserve map exhausted !\n"); - RELOC(mem_reserve_map)[cnt].base = base; - RELOC(mem_reserve_map)[cnt].size = size; - RELOC(mem_reserve_cnt) = cnt + 1; + mem_reserve_map[cnt].base = cpu_to_be64(base); + mem_reserve_map[cnt].size = cpu_to_be64(size); + mem_reserve_cnt = cnt + 1; } /* @@ -1167,7 +1129,7 @@ static void __init prom_init_mem(void) char *path, type[64]; unsigned int plen; cell_t *p, *endp; - struct prom_t *_prom = &RELOC(prom); + __be32 val; u32 rac, rsc; /* @@ -1175,15 +1137,17 @@ static void __init prom_init_mem(void) * 1) top of RMO (first node) * 2) top of memory */ - rac = 2; - prom_getprop(_prom->root, "#address-cells", &rac, sizeof(rac)); - rsc = 1; - prom_getprop(_prom->root, "#size-cells", &rsc, sizeof(rsc)); - prom_debug("root_addr_cells: %x\n", (unsigned long) rac); - prom_debug("root_size_cells: %x\n", (unsigned long) rsc); + val = cpu_to_be32(2); + prom_getprop(prom.root, "#address-cells", &val, sizeof(val)); + rac = be32_to_cpu(val); + val = cpu_to_be32(1); + prom_getprop(prom.root, "#size-cells", &val, sizeof(rsc)); + rsc = be32_to_cpu(val); + prom_debug("root_addr_cells: %x\n", rac); + prom_debug("root_size_cells: %x\n", rsc); prom_debug("scanning memory:\n"); - path = RELOC(prom_scratch); + path = prom_scratch; for (node = 0; prom_next_node(&node); ) { type[0] = 0; @@ -1196,15 +1160,15 @@ static void __init prom_init_mem(void) */ prom_getprop(node, "name", type, sizeof(type)); } - if (strcmp(type, RELOC("memory"))) + if (strcmp(type, "memory")) continue; - plen = prom_getprop(node, "reg", RELOC(regbuf), sizeof(regbuf)); + plen = prom_getprop(node, "reg", regbuf, sizeof(regbuf)); if (plen > sizeof(regbuf)) { prom_printf("memory node too large for buffer !\n"); plen = sizeof(regbuf); } - p = RELOC(regbuf); + p = regbuf; endp = p + (plen / sizeof(cell_t)); #ifdef DEBUG_PROM @@ -1222,14 +1186,14 @@ static void __init prom_init_mem(void) if (size == 0) continue; prom_debug(" %x %x\n", base, size); - if (base == 0 && (RELOC(of_platform) & PLATFORM_LPAR)) - RELOC(rmo_top) = size; - if ((base + size) > RELOC(ram_top)) - RELOC(ram_top) = base + size; + if (base == 0 && (of_platform & PLATFORM_LPAR)) + rmo_top = size; + if ((base + size) > ram_top) + ram_top = base + size; } } - RELOC(alloc_bottom) = PAGE_ALIGN((unsigned long)&RELOC(_end) + 0x4000); + alloc_bottom = PAGE_ALIGN((unsigned long)&_end + 0x4000); /* * If prom_memory_limit is set we reduce the upper limits *except* for @@ -1237,20 +1201,20 @@ static void __init prom_init_mem(void) * TCE's up there. */ - RELOC(alloc_top_high) = RELOC(ram_top); + alloc_top_high = ram_top; - if (RELOC(prom_memory_limit)) { - if (RELOC(prom_memory_limit) <= RELOC(alloc_bottom)) { + if (prom_memory_limit) { + if (prom_memory_limit <= alloc_bottom) { prom_printf("Ignoring mem=%x <= alloc_bottom.\n", - RELOC(prom_memory_limit)); - RELOC(prom_memory_limit) = 0; - } else if (RELOC(prom_memory_limit) >= RELOC(ram_top)) { + prom_memory_limit); + prom_memory_limit = 0; + } else if (prom_memory_limit >= ram_top) { prom_printf("Ignoring mem=%x >= ram_top.\n", - RELOC(prom_memory_limit)); - RELOC(prom_memory_limit) = 0; + prom_memory_limit); + prom_memory_limit = 0; } else { - RELOC(ram_top) = RELOC(prom_memory_limit); - RELOC(rmo_top) = min(RELOC(rmo_top), RELOC(prom_memory_limit)); + ram_top = prom_memory_limit; + rmo_top = min(rmo_top, prom_memory_limit); } } @@ -1262,238 +1226,48 @@ static void __init prom_init_mem(void) * Since 768MB is plenty of room, and we need to cap to something * reasonable on 32-bit, cap at 768MB on all machines. */ - if (!RELOC(rmo_top)) - RELOC(rmo_top) = RELOC(ram_top); - RELOC(rmo_top) = min(0x30000000ul, RELOC(rmo_top)); - RELOC(alloc_top) = RELOC(rmo_top); - RELOC(alloc_top_high) = RELOC(ram_top); + if (!rmo_top) + rmo_top = ram_top; + rmo_top = min(0x30000000ul, rmo_top); + alloc_top = rmo_top; + alloc_top_high = ram_top; /* * Check if we have an initrd after the kernel but still inside * the RMO. If we do move our bottom point to after it. */ - if (RELOC(prom_initrd_start) && - RELOC(prom_initrd_start) < RELOC(rmo_top) && - RELOC(prom_initrd_end) > RELOC(alloc_bottom)) - RELOC(alloc_bottom) = PAGE_ALIGN(RELOC(prom_initrd_end)); + if (prom_initrd_start && + prom_initrd_start < rmo_top && + prom_initrd_end > alloc_bottom) + alloc_bottom = PAGE_ALIGN(prom_initrd_end); prom_printf("memory layout at init:\n"); - prom_printf(" memory_limit : %x (16 MB aligned)\n", RELOC(prom_memory_limit)); - prom_printf(" alloc_bottom : %x\n", RELOC(alloc_bottom)); - prom_printf(" alloc_top : %x\n", RELOC(alloc_top)); - prom_printf(" alloc_top_hi : %x\n", RELOC(alloc_top_high)); - prom_printf(" rmo_top : %x\n", RELOC(rmo_top)); - prom_printf(" ram_top : %x\n", RELOC(ram_top)); + prom_printf(" memory_limit : %x (16 MB aligned)\n", prom_memory_limit); + prom_printf(" alloc_bottom : %x\n", alloc_bottom); + prom_printf(" alloc_top : %x\n", alloc_top); + prom_printf(" alloc_top_hi : %x\n", alloc_top_high); + prom_printf(" rmo_top : %x\n", rmo_top); + prom_printf(" ram_top : %x\n", ram_top); } static void __init prom_close_stdin(void) { - struct prom_t *_prom = &RELOC(prom); - ihandle val; + __be32 val; + ihandle stdin; - if (prom_getprop(_prom->chosen, "stdin", &val, sizeof(val)) > 0) - call_prom("close", 1, 0, val); + if (prom_getprop(prom.chosen, "stdin", &val, sizeof(val)) > 0) { + stdin = be32_to_cpu(val); + call_prom("close", 1, 0, stdin); + } } #ifdef CONFIG_PPC_POWERNV -static u64 __initdata prom_opal_size; -static u64 __initdata prom_opal_align; -static int __initdata prom_rtas_start_cpu; -static u64 __initdata prom_rtas_data; -static u64 __initdata prom_rtas_entry; - #ifdef CONFIG_PPC_EARLY_DEBUG_OPAL static u64 __initdata prom_opal_base; static u64 __initdata prom_opal_entry; #endif -/* XXX Don't change this structure without updating opal-takeover.S */ -static struct opal_secondary_data { - s64 ack; /* 0 */ - u64 go; /* 8 */ - struct opal_takeover_args args; /* 16 */ -} opal_secondary_data; - -extern char opal_secondary_entry; - -static void __init prom_query_opal(void) -{ - long rc; - - /* We must not query for OPAL presence on a machine that - * supports TNK takeover (970 blades), as this uses the same - * h-call with different arguments and will crash - */ - if (PHANDLE_VALID(call_prom("finddevice", 1, 1, - ADDR("/tnk-memory-map")))) { - prom_printf("TNK takeover detected, skipping OPAL check\n"); - return; - } - - prom_printf("Querying for OPAL presence... "); - rc = opal_query_takeover(&RELOC(prom_opal_size), - &RELOC(prom_opal_align)); - prom_debug("(rc = %ld) ", rc); - if (rc != 0) { - prom_printf("not there.\n"); - return; - } - RELOC(of_platform) = PLATFORM_OPAL; - prom_printf(" there !\n"); - prom_debug(" opal_size = 0x%lx\n", RELOC(prom_opal_size)); - prom_debug(" opal_align = 0x%lx\n", RELOC(prom_opal_align)); - if (RELOC(prom_opal_align) < 0x10000) - RELOC(prom_opal_align) = 0x10000; -} - -static int prom_rtas_call(int token, int nargs, int nret, int *outputs, ...) -{ - struct rtas_args rtas_args; - va_list list; - int i; - - rtas_args.token = token; - rtas_args.nargs = nargs; - rtas_args.nret = nret; - rtas_args.rets = (rtas_arg_t *)&(rtas_args.args[nargs]); - va_start(list, outputs); - for (i = 0; i < nargs; ++i) - rtas_args.args[i] = va_arg(list, rtas_arg_t); - va_end(list); - - for (i = 0; i < nret; ++i) - rtas_args.rets[i] = 0; - - opal_enter_rtas(&rtas_args, RELOC(prom_rtas_data), - RELOC(prom_rtas_entry)); - - if (nret > 1 && outputs != NULL) - for (i = 0; i < nret-1; ++i) - outputs[i] = rtas_args.rets[i+1]; - return (nret > 0)? rtas_args.rets[0]: 0; -} - -static void __init prom_opal_hold_cpus(void) -{ - int i, cnt, cpu, rc; - long j; - phandle node; - char type[64]; - u32 servers[8]; - struct prom_t *_prom = &RELOC(prom); - void *entry = (unsigned long *)&RELOC(opal_secondary_entry); - struct opal_secondary_data *data = &RELOC(opal_secondary_data); - - prom_debug("prom_opal_hold_cpus: start...\n"); - prom_debug(" - entry = 0x%x\n", entry); - prom_debug(" - data = 0x%x\n", data); - - data->ack = -1; - data->go = 0; - - /* look for cpus */ - for (node = 0; prom_next_node(&node); ) { - type[0] = 0; - prom_getprop(node, "device_type", type, sizeof(type)); - if (strcmp(type, RELOC("cpu")) != 0) - continue; - - /* Skip non-configured cpus. */ - if (prom_getprop(node, "status", type, sizeof(type)) > 0) - if (strcmp(type, RELOC("okay")) != 0) - continue; - - cnt = prom_getprop(node, "ibm,ppc-interrupt-server#s", servers, - sizeof(servers)); - if (cnt == PROM_ERROR) - break; - cnt >>= 2; - for (i = 0; i < cnt; i++) { - cpu = servers[i]; - prom_debug("CPU %d ... ", cpu); - if (cpu == _prom->cpu) { - prom_debug("booted !\n"); - continue; - } - prom_debug("starting ... "); - - /* Init the acknowledge var which will be reset by - * the secondary cpu when it awakens from its OF - * spinloop. - */ - data->ack = -1; - rc = prom_rtas_call(RELOC(prom_rtas_start_cpu), 3, 1, - NULL, cpu, entry, data); - prom_debug("rtas rc=%d ...", rc); - - for (j = 0; j < 100000000 && data->ack == -1; j++) { - HMT_low(); - mb(); - } - HMT_medium(); - if (data->ack != -1) - prom_debug("done, PIR=0x%x\n", data->ack); - else - prom_debug("timeout !\n"); - } - } - prom_debug("prom_opal_hold_cpus: end...\n"); -} - -static void __init prom_opal_takeover(void) -{ - struct opal_secondary_data *data = &RELOC(opal_secondary_data); - struct opal_takeover_args *args = &data->args; - u64 align = RELOC(prom_opal_align); - u64 top_addr, opal_addr; - - args->k_image = (u64)RELOC(_stext); - args->k_size = _end - _stext; - args->k_entry = 0; - args->k_entry2 = 0x60; - - top_addr = _ALIGN_UP(args->k_size, align); - - if (RELOC(prom_initrd_start) != 0) { - args->rd_image = RELOC(prom_initrd_start); - args->rd_size = RELOC(prom_initrd_end) - args->rd_image; - args->rd_loc = top_addr; - top_addr = _ALIGN_UP(args->rd_loc + args->rd_size, align); - } - - /* Pickup an address for the HAL. We want to go really high - * up to avoid problem with future kexecs. On the other hand - * we don't want to be all over the TCEs on P5IOC2 machines - * which are going to be up there too. We assume the machine - * has plenty of memory, and we ask for the HAL for now to - * be just below the 1G point, or above the initrd - */ - opal_addr = _ALIGN_DOWN(0x40000000 - RELOC(prom_opal_size), align); - if (opal_addr < top_addr) - opal_addr = top_addr; - args->hal_addr = opal_addr; - - /* Copy the command line to the kernel image */ - strlcpy(RELOC(boot_command_line), RELOC(prom_cmd_line), - COMMAND_LINE_SIZE); - - prom_debug(" k_image = 0x%lx\n", args->k_image); - prom_debug(" k_size = 0x%lx\n", args->k_size); - prom_debug(" k_entry = 0x%lx\n", args->k_entry); - prom_debug(" k_entry2 = 0x%lx\n", args->k_entry2); - prom_debug(" hal_addr = 0x%lx\n", args->hal_addr); - prom_debug(" rd_image = 0x%lx\n", args->rd_image); - prom_debug(" rd_size = 0x%lx\n", args->rd_size); - prom_debug(" rd_loc = 0x%lx\n", args->rd_loc); - prom_printf("Performing OPAL takeover,this can take a few minutes..\n"); - prom_close_stdin(); - mb(); - data->go = 1; - for (;;) - opal_do_takeover(args); -} - /* * Allocate room for and instantiate OPAL */ @@ -1503,6 +1277,7 @@ static void __init prom_instantiate_opal(void) ihandle opal_inst; u64 base, entry; u64 size = 0, align = 0x10000; + __be64 val64; u32 rets[2]; prom_debug("prom_instantiate_opal: start...\n"); @@ -1512,11 +1287,14 @@ static void __init prom_instantiate_opal(void) if (!PHANDLE_VALID(opal_node)) return; - prom_getprop(opal_node, "opal-runtime-size", &size, sizeof(size)); + val64 = 0; + prom_getprop(opal_node, "opal-runtime-size", &val64, sizeof(val64)); + size = be64_to_cpu(val64); if (size == 0) return; - prom_getprop(opal_node, "opal-runtime-alignment", &align, - sizeof(align)); + val64 = 0; + prom_getprop(opal_node, "opal-runtime-alignment", &val64,sizeof(val64)); + align = be64_to_cpu(val64); base = alloc_down(size, align, 0); if (base == 0) { @@ -1557,8 +1335,8 @@ static void __init prom_instantiate_opal(void) &entry, sizeof(entry)); #ifdef CONFIG_PPC_EARLY_DEBUG_OPAL - RELOC(prom_opal_base) = base; - RELOC(prom_opal_entry) = entry; + prom_opal_base = base; + prom_opal_entry = entry; #endif prom_debug("prom_instantiate_opal: end...\n"); } @@ -1573,6 +1351,7 @@ static void __init prom_instantiate_rtas(void) phandle rtas_node; ihandle rtas_inst; u32 base, entry = 0; + __be32 val; u32 size = 0; prom_debug("prom_instantiate_rtas: start...\n"); @@ -1582,7 +1361,9 @@ static void __init prom_instantiate_rtas(void) if (!PHANDLE_VALID(rtas_node)) return; - prom_getprop(rtas_node, "rtas-size", &size, sizeof(size)); + val = 0; + prom_getprop(rtas_node, "rtas-size", &val, sizeof(size)); + size = be32_to_cpu(val); if (size == 0) return; @@ -1609,17 +1390,18 @@ static void __init prom_instantiate_rtas(void) reserve_mem(base, size); + val = cpu_to_be32(base); prom_setprop(rtas_node, "/rtas", "linux,rtas-base", - &base, sizeof(base)); + &val, sizeof(val)); + val = cpu_to_be32(entry); prom_setprop(rtas_node, "/rtas", "linux,rtas-entry", - &entry, sizeof(entry)); + &val, sizeof(val)); + + /* Check if it supports "query-cpu-stopped-state" */ + if (prom_getprop(rtas_node, "query-cpu-stopped-state", + &val, sizeof(val)) != PROM_ERROR) + rtas_has_query_cpu_stopped = true; -#ifdef CONFIG_PPC_POWERNV - /* PowerVN takeover hack */ - RELOC(prom_rtas_data) = base; - RELOC(prom_rtas_entry) = entry; - prom_getprop(rtas_node, "start-cpu", &RELOC(prom_rtas_start_cpu), 4); -#endif prom_debug("rtas base = 0x%x\n", base); prom_debug("rtas entry = 0x%x\n", entry); prom_debug("rtas size = 0x%x\n", (long)size); @@ -1688,25 +1470,26 @@ static void __init prom_instantiate_sml(void) /* * Allocate room for and initialize TCE tables */ +#ifdef __BIG_ENDIAN__ static void __init prom_initialize_tce_table(void) { phandle node; ihandle phb_node; char compatible[64], type[64], model[64]; - char *path = RELOC(prom_scratch); + char *path = prom_scratch; u64 base, align; u32 minalign, minsize; u64 tce_entry, *tce_entryp; u64 local_alloc_top, local_alloc_bottom; u64 i; - if (RELOC(prom_iommu_off)) + if (prom_iommu_off) return; prom_debug("starting prom_initialize_tce_table\n"); /* Cache current top of allocs so we reserve a single block */ - local_alloc_top = RELOC(alloc_top_high); + local_alloc_top = alloc_top_high; local_alloc_bottom = local_alloc_top; /* Search all nodes looking for PHBs. */ @@ -1719,19 +1502,19 @@ static void __init prom_initialize_tce_table(void) prom_getprop(node, "device_type", type, sizeof(type)); prom_getprop(node, "model", model, sizeof(model)); - if ((type[0] == 0) || (strstr(type, RELOC("pci")) == NULL)) + if ((type[0] == 0) || (strstr(type, "pci") == NULL)) continue; /* Keep the old logic intact to avoid regression. */ if (compatible[0] != 0) { - if ((strstr(compatible, RELOC("python")) == NULL) && - (strstr(compatible, RELOC("Speedwagon")) == NULL) && - (strstr(compatible, RELOC("Winnipeg")) == NULL)) + if ((strstr(compatible, "python") == NULL) && + (strstr(compatible, "Speedwagon") == NULL) && + (strstr(compatible, "Winnipeg") == NULL)) continue; } else if (model[0] != 0) { - if ((strstr(model, RELOC("ython")) == NULL) && - (strstr(model, RELOC("peedwagon")) == NULL) && - (strstr(model, RELOC("innipeg")) == NULL)) + if ((strstr(model, "ython") == NULL) && + (strstr(model, "peedwagon") == NULL) && + (strstr(model, "innipeg") == NULL)) continue; } @@ -1810,13 +1593,14 @@ static void __init prom_initialize_tce_table(void) /* These are only really needed if there is a memory limit in * effect, but we don't know so export them always. */ - RELOC(prom_tce_alloc_start) = local_alloc_bottom; - RELOC(prom_tce_alloc_end) = local_alloc_top; + prom_tce_alloc_start = local_alloc_bottom; + prom_tce_alloc_end = local_alloc_top; /* Flag the first invalid entry */ prom_debug("ending prom_initialize_tce_table\n"); } -#endif +#endif /* __BIG_ENDIAN__ */ +#endif /* CONFIG_PPC64 */ /* * With CHRP SMP we need to use the OF to start the other processors. @@ -1845,16 +1629,26 @@ static void __init prom_initialize_tce_table(void) static void __init prom_hold_cpus(void) { unsigned long i; - unsigned int reg; phandle node; char type[64]; - struct prom_t *_prom = &RELOC(prom); unsigned long *spinloop = (void *) LOW_ADDR(__secondary_hold_spinloop); unsigned long *acknowledge = (void *) LOW_ADDR(__secondary_hold_acknowledge); unsigned long secondary_hold = LOW_ADDR(__secondary_hold); + /* + * On pseries, if RTAS supports "query-cpu-stopped-state", + * we skip this stage, the CPUs will be started by the + * kernel using RTAS. + */ + if ((of_platform == PLATFORM_PSERIES || + of_platform == PLATFORM_PSERIES_LPAR) && + rtas_has_query_cpu_stopped) { + prom_printf("prom_hold_cpus: skipped\n"); + return; + } + prom_debug("prom_hold_cpus: start...\n"); prom_debug(" 1) spinloop = 0x%x\n", (unsigned long)spinloop); prom_debug(" 1) *spinloop = 0x%x\n", *spinloop); @@ -1872,20 +1666,24 @@ static void __init prom_hold_cpus(void) /* look for cpus */ for (node = 0; prom_next_node(&node); ) { + unsigned int cpu_no; + __be32 reg; + type[0] = 0; prom_getprop(node, "device_type", type, sizeof(type)); - if (strcmp(type, RELOC("cpu")) != 0) + if (strcmp(type, "cpu") != 0) continue; /* Skip non-configured cpus. */ if (prom_getprop(node, "status", type, sizeof(type)) > 0) - if (strcmp(type, RELOC("okay")) != 0) + if (strcmp(type, "okay") != 0) continue; - reg = -1; + reg = cpu_to_be32(-1); /* make sparse happy */ prom_getprop(node, "reg", ®, sizeof(reg)); + cpu_no = be32_to_cpu(reg); - prom_debug("cpu hw idx = %lu\n", reg); + prom_debug("cpu hw idx = %lu\n", cpu_no); /* Init the acknowledge var which will be reset by * the secondary cpu when it awakens from its OF @@ -1893,24 +1691,24 @@ static void __init prom_hold_cpus(void) */ *acknowledge = (unsigned long)-1; - if (reg != _prom->cpu) { + if (cpu_no != prom.cpu) { /* Primary Thread of non-boot cpu or any thread */ - prom_printf("starting cpu hw idx %lu... ", reg); + prom_printf("starting cpu hw idx %lu... ", cpu_no); call_prom("start-cpu", 3, 0, node, - secondary_hold, reg); + secondary_hold, cpu_no); for (i = 0; (i < 100000000) && (*acknowledge == ((unsigned long)-1)); i++ ) mb(); - if (*acknowledge == reg) + if (*acknowledge == cpu_no) prom_printf("done\n"); else prom_printf("failed: %x\n", *acknowledge); } #ifdef CONFIG_SMP else - prom_printf("boot cpu hw idx %lu\n", reg); + prom_printf("boot cpu hw idx %lu\n", cpu_no); #endif /* CONFIG_SMP */ } @@ -1920,22 +1718,20 @@ static void __init prom_hold_cpus(void) static void __init prom_init_client_services(unsigned long pp) { - struct prom_t *_prom = &RELOC(prom); - /* Get a handle to the prom entry point before anything else */ - RELOC(prom_entry) = pp; + prom_entry = pp; /* get a handle for the stdout device */ - _prom->chosen = call_prom("finddevice", 1, 1, ADDR("/chosen")); - if (!PHANDLE_VALID(_prom->chosen)) + prom.chosen = call_prom("finddevice", 1, 1, ADDR("/chosen")); + if (!PHANDLE_VALID(prom.chosen)) prom_panic("cannot find chosen"); /* msg won't be printed :( */ /* get device tree root */ - _prom->root = call_prom("finddevice", 1, 1, ADDR("/")); - if (!PHANDLE_VALID(_prom->root)) + prom.root = call_prom("finddevice", 1, 1, ADDR("/")); + if (!PHANDLE_VALID(prom.root)) prom_panic("cannot find device tree root"); /* msg won't be printed :( */ - _prom->mmumap = 0; + prom.mmumap = 0; } #ifdef CONFIG_PPC32 @@ -1946,7 +1742,6 @@ static void __init prom_init_client_services(unsigned long pp) */ static void __init prom_find_mmu(void) { - struct prom_t *_prom = &RELOC(prom); phandle oprom; char version[64]; @@ -1964,10 +1759,11 @@ static void __init prom_find_mmu(void) call_prom("interpret", 1, 1, "dev /memory 0 to allow-reclaim"); } else return; - _prom->memory = call_prom("open", 1, 1, ADDR("/memory")); - prom_getprop(_prom->chosen, "mmu", &_prom->mmumap, - sizeof(_prom->mmumap)); - if (!IHANDLE_VALID(_prom->memory) || !IHANDLE_VALID(_prom->mmumap)) + prom.memory = call_prom("open", 1, 1, ADDR("/memory")); + prom_getprop(prom.chosen, "mmu", &prom.mmumap, + sizeof(prom.mmumap)); + prom.mmumap = be32_to_cpu(prom.mmumap); + if (!IHANDLE_VALID(prom.memory) || !IHANDLE_VALID(prom.mmumap)) of_workarounds &= ~OF_WA_CLAIM; /* hmmm */ } #else @@ -1976,36 +1772,40 @@ static void __init prom_find_mmu(void) static void __init prom_init_stdout(void) { - struct prom_t *_prom = &RELOC(prom); - char *path = RELOC(of_stdout_device); + char *path = of_stdout_device; char type[16]; - u32 val; + phandle stdout_node; + __be32 val; - if (prom_getprop(_prom->chosen, "stdout", &val, sizeof(val)) <= 0) + if (prom_getprop(prom.chosen, "stdout", &val, sizeof(val)) <= 0) prom_panic("cannot find stdout"); - _prom->stdout = val; + prom.stdout = be32_to_cpu(val); /* Get the full OF pathname of the stdout device */ memset(path, 0, 256); - call_prom("instance-to-path", 3, 1, _prom->stdout, path, 255); - val = call_prom("instance-to-package", 1, 1, _prom->stdout); - prom_setprop(_prom->chosen, "/chosen", "linux,stdout-package", - &val, sizeof(val)); - prom_printf("OF stdout device is: %s\n", RELOC(of_stdout_device)); - prom_setprop(_prom->chosen, "/chosen", "linux,stdout-path", + call_prom("instance-to-path", 3, 1, prom.stdout, path, 255); + prom_printf("OF stdout device is: %s\n", of_stdout_device); + prom_setprop(prom.chosen, "/chosen", "linux,stdout-path", path, strlen(path) + 1); - /* If it's a display, note it */ - memset(type, 0, sizeof(type)); - prom_getprop(val, "device_type", type, sizeof(type)); - if (strcmp(type, RELOC("display")) == 0) - prom_setprop(val, path, "linux,boot-display", NULL, 0); + /* instance-to-package fails on PA-Semi */ + stdout_node = call_prom("instance-to-package", 1, 1, prom.stdout); + if (stdout_node != PROM_ERROR) { + val = cpu_to_be32(stdout_node); + prom_setprop(prom.chosen, "/chosen", "linux,stdout-package", + &val, sizeof(val)); + + /* If it's a display, note it */ + memset(type, 0, sizeof(type)); + prom_getprop(stdout_node, "device_type", type, sizeof(type)); + if (strcmp(type, "display") == 0) + prom_setprop(stdout_node, path, "linux,boot-display", NULL, 0); + } } static int __init prom_find_machine_type(void) { - struct prom_t *_prom = &RELOC(prom); char compat[256]; int len, i = 0; #ifdef CONFIG_PPC64 @@ -2014,7 +1814,7 @@ static int __init prom_find_machine_type(void) #endif /* Look for a PowerMac or a Cell */ - len = prom_getprop(_prom->root, "compatible", + len = prom_getprop(prom.root, "compatible", compat, sizeof(compat)-1); if (len > 0) { compat[len] = 0; @@ -2023,16 +1823,16 @@ static int __init prom_find_machine_type(void) int sl = strlen(p); if (sl == 0) break; - if (strstr(p, RELOC("Power Macintosh")) || - strstr(p, RELOC("MacRISC"))) + if (strstr(p, "Power Macintosh") || + strstr(p, "MacRISC")) return PLATFORM_POWERMAC; #ifdef CONFIG_PPC64 /* We must make sure we don't detect the IBM Cell * blades as pSeries due to some firmware issues, * so we do it here. */ - if (strstr(p, RELOC("IBM,CBEA")) || - strstr(p, RELOC("IBM,CPBW-1.0"))) + if (strstr(p, "IBM,CBEA") || + strstr(p, "IBM,CPBW-1.0")) return PLATFORM_GENERIC; #endif /* CONFIG_PPC64 */ i += sl + 1; @@ -2049,11 +1849,11 @@ static int __init prom_find_machine_type(void) * non-IBM designs ! * - it has /rtas */ - len = prom_getprop(_prom->root, "device_type", + len = prom_getprop(prom.root, "device_type", compat, sizeof(compat)-1); if (len <= 0) return PLATFORM_GENERIC; - if (strcmp(compat, RELOC("chrp"))) + if (strcmp(compat, "chrp")) return PLATFORM_GENERIC; /* Default to pSeries. We need to know if we are running LPAR */ @@ -2115,11 +1915,11 @@ static void __init prom_check_displays(void) for (node = 0; prom_next_node(&node); ) { memset(type, 0, sizeof(type)); prom_getprop(node, "device_type", type, sizeof(type)); - if (strcmp(type, RELOC("display")) != 0) + if (strcmp(type, "display") != 0) continue; /* It seems OF doesn't null-terminate the path :-( */ - path = RELOC(prom_scratch); + path = prom_scratch; memset(path, 0, PROM_SCRATCH_SIZE); /* @@ -2143,19 +1943,35 @@ static void __init prom_check_displays(void) /* Setup a usable color table when the appropriate * method is available. Should update this to set-colors */ - clut = RELOC(default_colors); + clut = default_colors; for (i = 0; i < 16; i++, clut += 3) if (prom_set_color(ih, i, clut[0], clut[1], clut[2]) != 0) break; #ifdef CONFIG_LOGO_LINUX_CLUT224 - clut = PTRRELOC(RELOC(logo_linux_clut224.clut)); - for (i = 0; i < RELOC(logo_linux_clut224.clutsize); i++, clut += 3) + clut = PTRRELOC(logo_linux_clut224.clut); + for (i = 0; i < logo_linux_clut224.clutsize; i++, clut += 3) if (prom_set_color(ih, i + 32, clut[0], clut[1], clut[2]) != 0) break; #endif /* CONFIG_LOGO_LINUX_CLUT224 */ + +#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX + if (prom_getprop(node, "linux,boot-display", NULL, 0) != + PROM_ERROR) { + u32 width, height, pitch, addr; + + prom_printf("Setting btext !\n"); + prom_getprop(node, "width", &width, 4); + prom_getprop(node, "height", &height, 4); + prom_getprop(node, "linebytes", &pitch, 4); + prom_getprop(node, "address", &addr, 4); + prom_printf("W=%d H=%d LB=%d addr=0x%x\n", + width, height, pitch, addr); + btext_setup_display(width, height, 8, pitch, addr); + } +#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */ } } @@ -2171,8 +1987,8 @@ static void __init *make_room(unsigned long *mem_start, unsigned long *mem_end, unsigned long room, chunk; prom_debug("Chunk exhausted, claiming more at %x...\n", - RELOC(alloc_bottom)); - room = RELOC(alloc_top) - RELOC(alloc_bottom); + alloc_bottom); + room = alloc_top - alloc_bottom; if (room > DEVTREE_CHUNK_SIZE) room = DEVTREE_CHUNK_SIZE; if (room < PAGE_SIZE) @@ -2191,16 +2007,18 @@ static void __init *make_room(unsigned long *mem_start, unsigned long *mem_end, return ret; } -#define dt_push_token(token, mem_start, mem_end) \ - do { *((u32 *)make_room(mem_start, mem_end, 4, 4)) = token; } while(0) +#define dt_push_token(token, mem_start, mem_end) do { \ + void *room = make_room(mem_start, mem_end, 4, 4); \ + *(__be32 *)room = cpu_to_be32(token); \ + } while(0) static unsigned long __init dt_find_string(char *str) { char *s, *os; - s = os = (char *)RELOC(dt_string_start); + s = os = (char *)dt_string_start; s += 4; - while (s < (char *)RELOC(dt_string_end)) { + while (s < (char *)dt_string_end) { if (strcmp(s, str) == 0) return s - os; s += strlen(s) + 1; @@ -2222,10 +2040,10 @@ static void __init scan_dt_build_strings(phandle node, unsigned long soff; phandle child; - sstart = (char *)RELOC(dt_string_start); + sstart = (char *)dt_string_start; /* get and store all property names */ - prev_name = RELOC(""); + prev_name = ""; for (;;) { /* 64 is max len of name including nul. */ namep = make_room(mem_start, mem_end, MAX_PROPERTY_NAME, 1); @@ -2236,9 +2054,9 @@ static void __init scan_dt_build_strings(phandle node, } /* skip "name" */ - if (strcmp(namep, RELOC("name")) == 0) { + if (strcmp(namep, "name") == 0) { *mem_start = (unsigned long)namep; - prev_name = RELOC("name"); + prev_name = "name"; continue; } /* get/create string entry */ @@ -2249,7 +2067,7 @@ static void __init scan_dt_build_strings(phandle node, } else { /* Trim off some if we can */ *mem_start = (unsigned long)namep + strlen(namep) + 1; - RELOC(dt_string_end) = *mem_start; + dt_string_end = *mem_start; } prev_name = namep; } @@ -2304,35 +2122,35 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, } /* get it again for debugging */ - path = RELOC(prom_scratch); + path = prom_scratch; memset(path, 0, PROM_SCRATCH_SIZE); call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1); /* get and store all properties */ - prev_name = RELOC(""); - sstart = (char *)RELOC(dt_string_start); + prev_name = ""; + sstart = (char *)dt_string_start; for (;;) { if (call_prom("nextprop", 3, 1, node, prev_name, - RELOC(pname)) != 1) + pname) != 1) break; /* skip "name" */ - if (strcmp(RELOC(pname), RELOC("name")) == 0) { - prev_name = RELOC("name"); + if (strcmp(pname, "name") == 0) { + prev_name = "name"; continue; } /* find string offset */ - soff = dt_find_string(RELOC(pname)); + soff = dt_find_string(pname); if (soff == 0) { prom_printf("WARNING: Can't find string index for" - " <%s>, node %s\n", RELOC(pname), path); + " <%s>, node %s\n", pname, path); break; } prev_name = sstart + soff; /* get length */ - l = call_prom("getproplen", 2, 1, node, RELOC(pname)); + l = call_prom("getproplen", 2, 1, node, pname); /* sanity checks */ if (l == PROM_ERROR) @@ -2345,10 +2163,10 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, /* push property content */ valp = make_room(mem_start, mem_end, l, 4); - call_prom("getprop", 4, 1, node, RELOC(pname), valp, l); + call_prom("getprop", 4, 1, node, pname, valp, l); *mem_start = _ALIGN(*mem_start, 4); - if (!strcmp(RELOC(pname), RELOC("phandle"))) + if (!strcmp(pname, "phandle")) has_phandle = 1; } @@ -2356,7 +2174,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, * existed (can happen with OPAL) */ if (!has_phandle) { - soff = dt_find_string(RELOC("linux,phandle")); + soff = dt_find_string("linux,phandle"); if (soff == 0) prom_printf("WARNING: Can't find string index for" " <linux-phandle> node %s\n", path); @@ -2365,7 +2183,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, dt_push_token(4, mem_start, mem_end); dt_push_token(soff, mem_start, mem_end); valp = make_room(mem_start, mem_end, 4, 4); - *(u32 *)valp = node; + *(__be32 *)valp = cpu_to_be32(node); } } @@ -2384,7 +2202,6 @@ static void __init flatten_device_tree(void) phandle root; unsigned long mem_start, mem_end, room; struct boot_param_header *hdr; - struct prom_t *_prom = &RELOC(prom); char *namep; u64 *rsvmap; @@ -2392,10 +2209,10 @@ static void __init flatten_device_tree(void) * Check how much room we have between alloc top & bottom (+/- a * few pages), crop to 1MB, as this is our "chunk" size */ - room = RELOC(alloc_top) - RELOC(alloc_bottom) - 0x4000; + room = alloc_top - alloc_bottom - 0x4000; if (room > DEVTREE_CHUNK_SIZE) room = DEVTREE_CHUNK_SIZE; - prom_debug("starting device tree allocs at %x\n", RELOC(alloc_bottom)); + prom_debug("starting device tree allocs at %x\n", alloc_bottom); /* Now try to claim that */ mem_start = (unsigned long)alloc_up(room, PAGE_SIZE); @@ -2412,67 +2229,66 @@ static void __init flatten_device_tree(void) mem_start = _ALIGN(mem_start, 4); hdr = make_room(&mem_start, &mem_end, sizeof(struct boot_param_header), 4); - RELOC(dt_header_start) = (unsigned long)hdr; + dt_header_start = (unsigned long)hdr; rsvmap = make_room(&mem_start, &mem_end, sizeof(mem_reserve_map), 8); /* Start of strings */ mem_start = PAGE_ALIGN(mem_start); - RELOC(dt_string_start) = mem_start; + dt_string_start = mem_start; mem_start += 4; /* hole */ /* Add "linux,phandle" in there, we'll need it */ namep = make_room(&mem_start, &mem_end, 16, 1); - strcpy(namep, RELOC("linux,phandle")); + strcpy(namep, "linux,phandle"); mem_start = (unsigned long)namep + strlen(namep) + 1; /* Build string array */ prom_printf("Building dt strings...\n"); scan_dt_build_strings(root, &mem_start, &mem_end); - RELOC(dt_string_end) = mem_start; + dt_string_end = mem_start; /* Build structure */ mem_start = PAGE_ALIGN(mem_start); - RELOC(dt_struct_start) = mem_start; + dt_struct_start = mem_start; prom_printf("Building dt structure...\n"); scan_dt_build_struct(root, &mem_start, &mem_end); dt_push_token(OF_DT_END, &mem_start, &mem_end); - RELOC(dt_struct_end) = PAGE_ALIGN(mem_start); + dt_struct_end = PAGE_ALIGN(mem_start); /* Finish header */ - hdr->boot_cpuid_phys = _prom->cpu; - hdr->magic = OF_DT_HEADER; - hdr->totalsize = RELOC(dt_struct_end) - RELOC(dt_header_start); - hdr->off_dt_struct = RELOC(dt_struct_start) - RELOC(dt_header_start); - hdr->off_dt_strings = RELOC(dt_string_start) - RELOC(dt_header_start); - hdr->dt_strings_size = RELOC(dt_string_end) - RELOC(dt_string_start); - hdr->off_mem_rsvmap = ((unsigned long)rsvmap) - RELOC(dt_header_start); - hdr->version = OF_DT_VERSION; + hdr->boot_cpuid_phys = cpu_to_be32(prom.cpu); + hdr->magic = cpu_to_be32(OF_DT_HEADER); + hdr->totalsize = cpu_to_be32(dt_struct_end - dt_header_start); + hdr->off_dt_struct = cpu_to_be32(dt_struct_start - dt_header_start); + hdr->off_dt_strings = cpu_to_be32(dt_string_start - dt_header_start); + hdr->dt_strings_size = cpu_to_be32(dt_string_end - dt_string_start); + hdr->off_mem_rsvmap = cpu_to_be32(((unsigned long)rsvmap) - dt_header_start); + hdr->version = cpu_to_be32(OF_DT_VERSION); /* Version 16 is not backward compatible */ - hdr->last_comp_version = 0x10; + hdr->last_comp_version = cpu_to_be32(0x10); /* Copy the reserve map in */ - memcpy(rsvmap, RELOC(mem_reserve_map), sizeof(mem_reserve_map)); + memcpy(rsvmap, mem_reserve_map, sizeof(mem_reserve_map)); #ifdef DEBUG_PROM { int i; prom_printf("reserved memory map:\n"); - for (i = 0; i < RELOC(mem_reserve_cnt); i++) + for (i = 0; i < mem_reserve_cnt; i++) prom_printf(" %x - %x\n", - RELOC(mem_reserve_map)[i].base, - RELOC(mem_reserve_map)[i].size); + be64_to_cpu(mem_reserve_map[i].base), + be64_to_cpu(mem_reserve_map[i].size)); } #endif /* Bump mem_reserve_cnt to cause further reservations to fail * since it's too late. */ - RELOC(mem_reserve_cnt) = MEM_RESERVE_MAP_SIZE; + mem_reserve_cnt = MEM_RESERVE_MAP_SIZE; prom_printf("Device tree strings 0x%x -> 0x%x\n", - RELOC(dt_string_start), RELOC(dt_string_end)); + dt_string_start, dt_string_end); prom_printf("Device tree struct 0x%x -> 0x%x\n", - RELOC(dt_struct_start), RELOC(dt_struct_end)); - + dt_struct_start, dt_struct_end); } #ifdef CONFIG_PPC_MAPLE @@ -2526,7 +2342,6 @@ static void __init fixup_device_tree_maple_memory_controller(void) phandle mc; u32 mc_reg[4]; char *name = "/hostbridge@f8000000"; - struct prom_t *_prom = &RELOC(prom); u32 ac, sc; mc = call_prom("finddevice", 1, 1, ADDR(name)); @@ -2536,8 +2351,8 @@ static void __init fixup_device_tree_maple_memory_controller(void) if (prom_getproplen(mc, "reg") != 8) return; - prom_getprop(_prom->root, "#address-cells", &ac, sizeof(ac)); - prom_getprop(_prom->root, "#size-cells", &sc, sizeof(sc)); + prom_getprop(prom.root, "#address-cells", &ac, sizeof(ac)); + prom_getprop(prom.root, "#size-cells", &sc, sizeof(sc)); if ((ac != 2) || (sc != 2)) return; @@ -2806,50 +2621,95 @@ static void __init fixup_device_tree(void) static void __init prom_find_boot_cpu(void) { - struct prom_t *_prom = &RELOC(prom); - u32 getprop_rval; + __be32 rval; ihandle prom_cpu; phandle cpu_pkg; - _prom->cpu = 0; - if (prom_getprop(_prom->chosen, "cpu", &prom_cpu, sizeof(prom_cpu)) <= 0) + rval = 0; + if (prom_getprop(prom.chosen, "cpu", &rval, sizeof(rval)) <= 0) return; + prom_cpu = be32_to_cpu(rval); cpu_pkg = call_prom("instance-to-package", 1, 1, prom_cpu); - prom_getprop(cpu_pkg, "reg", &getprop_rval, sizeof(getprop_rval)); - _prom->cpu = getprop_rval; + prom_getprop(cpu_pkg, "reg", &rval, sizeof(rval)); + prom.cpu = be32_to_cpu(rval); - prom_debug("Booting CPU hw index = %lu\n", _prom->cpu); + prom_debug("Booting CPU hw index = %lu\n", prom.cpu); } static void __init prom_check_initrd(unsigned long r3, unsigned long r4) { #ifdef CONFIG_BLK_DEV_INITRD - struct prom_t *_prom = &RELOC(prom); - if (r3 && r4 && r4 != 0xdeadbeef) { - unsigned long val; + __be64 val; - RELOC(prom_initrd_start) = is_kernel_addr(r3) ? __pa(r3) : r3; - RELOC(prom_initrd_end) = RELOC(prom_initrd_start) + r4; + prom_initrd_start = is_kernel_addr(r3) ? __pa(r3) : r3; + prom_initrd_end = prom_initrd_start + r4; - val = RELOC(prom_initrd_start); - prom_setprop(_prom->chosen, "/chosen", "linux,initrd-start", + val = cpu_to_be64(prom_initrd_start); + prom_setprop(prom.chosen, "/chosen", "linux,initrd-start", &val, sizeof(val)); - val = RELOC(prom_initrd_end); - prom_setprop(_prom->chosen, "/chosen", "linux,initrd-end", + val = cpu_to_be64(prom_initrd_end); + prom_setprop(prom.chosen, "/chosen", "linux,initrd-end", &val, sizeof(val)); - reserve_mem(RELOC(prom_initrd_start), - RELOC(prom_initrd_end) - RELOC(prom_initrd_start)); + reserve_mem(prom_initrd_start, + prom_initrd_end - prom_initrd_start); - prom_debug("initrd_start=0x%x\n", RELOC(prom_initrd_start)); - prom_debug("initrd_end=0x%x\n", RELOC(prom_initrd_end)); + prom_debug("initrd_start=0x%x\n", prom_initrd_start); + prom_debug("initrd_end=0x%x\n", prom_initrd_end); } #endif /* CONFIG_BLK_DEV_INITRD */ } +#ifdef CONFIG_PPC64 +#ifdef CONFIG_RELOCATABLE +static void reloc_toc(void) +{ +} + +static void unreloc_toc(void) +{ +} +#else +static void __reloc_toc(unsigned long offset, unsigned long nr_entries) +{ + unsigned long i; + unsigned long *toc_entry; + + /* Get the start of the TOC by using r2 directly. */ + asm volatile("addi %0,2,-0x8000" : "=b" (toc_entry)); + + for (i = 0; i < nr_entries; i++) { + *toc_entry = *toc_entry + offset; + toc_entry++; + } +} + +static void reloc_toc(void) +{ + unsigned long offset = reloc_offset(); + unsigned long nr_entries = + (__prom_init_toc_end - __prom_init_toc_start) / sizeof(long); + + __reloc_toc(offset, nr_entries); + + mb(); +} + +static void unreloc_toc(void) +{ + unsigned long offset = reloc_offset(); + unsigned long nr_entries = + (__prom_init_toc_end - __prom_init_toc_start) / sizeof(long); + + mb(); + + __reloc_toc(-offset, nr_entries); +} +#endif +#endif /* * We enter here early on, when the Open Firmware prom is still @@ -2861,20 +2721,19 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, unsigned long r6, unsigned long r7, unsigned long kbase) { - struct prom_t *_prom; unsigned long hdr; #ifdef CONFIG_PPC32 unsigned long offset = reloc_offset(); reloc_got2(offset); +#else + reloc_toc(); #endif - _prom = &RELOC(prom); - /* * First zero the BSS */ - memset(&RELOC(__bss_start), 0, __bss_stop - __bss_start); + memset(&__bss_start, 0, __bss_stop - __bss_start); /* * Init interface to Open Firmware, get some node references, @@ -2893,14 +2752,14 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, */ prom_init_stdout(); - prom_printf("Preparing to boot %s", RELOC(linux_banner)); + prom_printf("Preparing to boot %s", linux_banner); /* * Get default machine type. At this point, we do not differentiate * between pSeries SMP and pSeries LPAR */ - RELOC(of_platform) = prom_find_machine_type(); - prom_printf("Detected machine type: %x\n", RELOC(of_platform)); + of_platform = prom_find_machine_type(); + prom_printf("Detected machine type: %x\n", of_platform); #ifndef CONFIG_NONSTATIC_KERNEL /* Bail if this is a kdump kernel. */ @@ -2917,15 +2776,15 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, /* * On pSeries, inform the firmware about our capabilities */ - if (RELOC(of_platform) == PLATFORM_PSERIES || - RELOC(of_platform) == PLATFORM_PSERIES_LPAR) + if (of_platform == PLATFORM_PSERIES || + of_platform == PLATFORM_PSERIES_LPAR) prom_send_capabilities(); #endif /* * Copy the CPU hold code */ - if (RELOC(of_platform) != PLATFORM_POWERMAC) + if (of_platform != PLATFORM_POWERMAC) copy_and_flush(0, kbase, 0x100, 0); /* @@ -2948,13 +2807,13 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, */ prom_check_displays(); -#ifdef CONFIG_PPC64 +#if defined(CONFIG_PPC64) && defined(__BIG_ENDIAN__) /* * Initialize IOMMU (TCE tables) on pSeries. Do that before anything else * that uses the allocator, we need to make sure we get the top of memory * available for us here... */ - if (RELOC(of_platform) == PLATFORM_PSERIES) + if (of_platform == PLATFORM_PSERIES) prom_initialize_tce_table(); #endif @@ -2962,21 +2821,14 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * On non-powermacs, try to instantiate RTAS. PowerMacs don't * have a usable RTAS implementation. */ - if (RELOC(of_platform) != PLATFORM_POWERMAC && - RELOC(of_platform) != PLATFORM_OPAL) + if (of_platform != PLATFORM_POWERMAC && + of_platform != PLATFORM_OPAL) prom_instantiate_rtas(); #ifdef CONFIG_PPC_POWERNV - /* Detect HAL and try instanciating it & doing takeover */ - if (RELOC(of_platform) == PLATFORM_PSERIES_LPAR) { - prom_query_opal(); - if (RELOC(of_platform) == PLATFORM_OPAL) { - prom_opal_hold_cpus(); - prom_opal_takeover(); - } - } else if (RELOC(of_platform) == PLATFORM_OPAL) + if (of_platform == PLATFORM_OPAL) prom_instantiate_opal(); -#endif +#endif /* CONFIG_PPC_POWERNV */ #ifdef CONFIG_PPC64 /* instantiate sml */ @@ -2987,33 +2839,36 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * On non-powermacs, put all CPUs in spin-loops. * * PowerMacs use a different mechanism to spin CPUs + * + * (This must be done after instanciating RTAS) */ - if (RELOC(of_platform) != PLATFORM_POWERMAC && - RELOC(of_platform) != PLATFORM_OPAL) + if (of_platform != PLATFORM_POWERMAC && + of_platform != PLATFORM_OPAL) prom_hold_cpus(); /* * Fill in some infos for use by the kernel later on */ - if (RELOC(prom_memory_limit)) - prom_setprop(_prom->chosen, "/chosen", "linux,memory-limit", - &RELOC(prom_memory_limit), - sizeof(prom_memory_limit)); + if (prom_memory_limit) { + __be64 val = cpu_to_be64(prom_memory_limit); + prom_setprop(prom.chosen, "/chosen", "linux,memory-limit", + &val, sizeof(val)); + } #ifdef CONFIG_PPC64 - if (RELOC(prom_iommu_off)) - prom_setprop(_prom->chosen, "/chosen", "linux,iommu-off", + if (prom_iommu_off) + prom_setprop(prom.chosen, "/chosen", "linux,iommu-off", NULL, 0); - if (RELOC(prom_iommu_force_on)) - prom_setprop(_prom->chosen, "/chosen", "linux,iommu-force-on", + if (prom_iommu_force_on) + prom_setprop(prom.chosen, "/chosen", "linux,iommu-force-on", NULL, 0); - if (RELOC(prom_tce_alloc_start)) { - prom_setprop(_prom->chosen, "/chosen", "linux,tce-alloc-start", - &RELOC(prom_tce_alloc_start), + if (prom_tce_alloc_start) { + prom_setprop(prom.chosen, "/chosen", "linux,tce-alloc-start", + &prom_tce_alloc_start, sizeof(prom_tce_alloc_start)); - prom_setprop(_prom->chosen, "/chosen", "linux,tce-alloc-end", - &RELOC(prom_tce_alloc_end), + prom_setprop(prom.chosen, "/chosen", "linux,tce-alloc-end", + &prom_tce_alloc_end, sizeof(prom_tce_alloc_end)); } #endif @@ -3035,8 +2890,8 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * closed stdin already (in particular the powerbook 101). It * appears that the OPAL version of OFW doesn't like it either. */ - if (RELOC(of_platform) != PLATFORM_POWERMAC && - RELOC(of_platform) != PLATFORM_OPAL) + if (of_platform != PLATFORM_POWERMAC && + of_platform != PLATFORM_OPAL) prom_close_stdin(); /* @@ -3051,22 +2906,24 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * tree and NULL as r5, thus triggering the new entry point which * is common to us and kexec */ - hdr = RELOC(dt_header_start); + hdr = dt_header_start; /* Don't print anything after quiesce under OPAL, it crashes OFW */ - if (RELOC(of_platform) != PLATFORM_OPAL) { + if (of_platform != PLATFORM_OPAL) { prom_printf("returning from prom_init\n"); prom_debug("->dt_header_start=0x%x\n", hdr); } #ifdef CONFIG_PPC32 reloc_got2(-offset); +#else + unreloc_toc(); #endif #ifdef CONFIG_PPC_EARLY_DEBUG_OPAL /* OPAL early debug gets the OPAL base & entry in r8 and r9 */ __start(hdr, kbase, 0, 0, 0, - RELOC(prom_opal_base), RELOC(prom_opal_entry)); + prom_opal_base, prom_opal_entry); #else __start(hdr, kbase, 0, 0, 0, 0, 0); #endif diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index 70f4286eaa7..fe8e54b9ef7 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -21,8 +21,7 @@ _end enter_prom memcpy memset reloc_offset __secondary_hold __secondary_hold_acknowledge __secondary_hold_spinloop __start strcmp strcpy strlcpy strlen strncmp strstr logo_linux_clut224 reloc_got2 kernstart_addr memstart_addr linux_banner _stext -opal_query_takeover opal_do_takeover opal_enter_rtas opal_secondary_entry -boot_command_line" +__prom_init_toc_start __prom_init_toc_end btext_setup_display TOC." NM="$1" OBJ="$2" diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c index 4e1331b8eb3..6295e646f78 100644 --- a/arch/powerpc/kernel/prom_parse.c +++ b/arch/powerpc/kernel/prom_parse.c @@ -7,28 +7,27 @@ #include <linux/of_address.h> #include <asm/prom.h> -void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop, - unsigned long *busno, unsigned long *phys, unsigned long *size) +void of_parse_dma_window(struct device_node *dn, const __be32 *dma_window, + unsigned long *busno, unsigned long *phys, + unsigned long *size) { - const u32 *dma_window; u32 cells; - const unsigned char *prop; - - dma_window = dma_window_prop; + const __be32 *prop; /* busno is always one cell */ - *busno = *(dma_window++); + *busno = of_read_number(dma_window, 1); + dma_window++; prop = of_get_property(dn, "ibm,#dma-address-cells", NULL); if (!prop) prop = of_get_property(dn, "#address-cells", NULL); - cells = prop ? *(u32 *)prop : of_n_addr_cells(dn); + cells = prop ? of_read_number(prop, 1) : of_n_addr_cells(dn); *phys = of_read_number(dma_window, cells); dma_window += cells; prop = of_get_property(dn, "ibm,#dma-size-cells", NULL); - cells = prop ? *(u32 *)prop : of_n_size_cells(dn); + cells = prop ? of_read_number(prop, 1) : of_n_size_cells(dn); *size = of_read_number(dma_window, cells); } diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index c4970004d44..2e3d2bf536c 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -32,6 +32,7 @@ #include <trace/syscall.h> #include <linux/hw_breakpoint.h> #include <linux/perf_event.h> +#include <linux/context_tracking.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -179,6 +180,31 @@ static int set_user_msr(struct task_struct *task, unsigned long msr) return 0; } +#ifdef CONFIG_PPC64 +static int get_user_dscr(struct task_struct *task, unsigned long *data) +{ + *data = task->thread.dscr; + return 0; +} + +static int set_user_dscr(struct task_struct *task, unsigned long dscr) +{ + task->thread.dscr = dscr; + task->thread.dscr_inherit = 1; + return 0; +} +#else +static int get_user_dscr(struct task_struct *task, unsigned long *data) +{ + return -EIO; +} + +static int set_user_dscr(struct task_struct *task, unsigned long dscr) +{ + return -EIO; +} +#endif + /* * We prevent mucking around with the reserved area of trap * which are used internally by the kernel. @@ -192,16 +218,23 @@ static int set_user_trap(struct task_struct *task, unsigned long trap) /* * Get contents of register REGNO in task TASK. */ -unsigned long ptrace_get_reg(struct task_struct *task, int regno) +int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) { - if (task->thread.regs == NULL) + if ((task->thread.regs == NULL) || !data) return -EIO; - if (regno == PT_MSR) - return get_user_msr(task); + if (regno == PT_MSR) { + *data = get_user_msr(task); + return 0; + } - if (regno < (sizeof(struct pt_regs) / sizeof(unsigned long))) - return ((unsigned long *)task->thread.regs)[regno]; + if (regno == PT_DSCR) + return get_user_dscr(task, data); + + if (regno < (sizeof(struct pt_regs) / sizeof(unsigned long))) { + *data = ((unsigned long *)task->thread.regs)[regno]; + return 0; + } return -EIO; } @@ -218,6 +251,8 @@ int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data) return set_user_msr(task, data); if (regno == PT_TRAP) return set_user_trap(task, data); + if (regno == PT_DSCR) + return set_user_dscr(task, data); if (regno <= PT_MAX_PUT_REG) { ((unsigned long *)task->thread.regs)[regno] = data; @@ -327,7 +362,7 @@ static int fpr_get(struct task_struct *target, const struct user_regset *regset, void *kbuf, void __user *ubuf) { #ifdef CONFIG_VSX - double buf[33]; + u64 buf[33]; int i; #endif flush_fp_to_thread(target); @@ -336,15 +371,15 @@ static int fpr_get(struct task_struct *target, const struct user_regset *regset, /* copy to local buffer then write that out */ for (i = 0; i < 32 ; i++) buf[i] = target->thread.TS_FPR(i); - memcpy(&buf[32], &target->thread.fpscr, sizeof(double)); + buf[32] = target->thread.fp_state.fpscr; return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1); #else - BUILD_BUG_ON(offsetof(struct thread_struct, fpscr) != - offsetof(struct thread_struct, TS_FPR(32))); + BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != + offsetof(struct thread_fp_state, fpr[32][0])); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpr, 0, -1); + &target->thread.fp_state, 0, -1); #endif } @@ -353,7 +388,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { #ifdef CONFIG_VSX - double buf[33]; + u64 buf[33]; int i; #endif flush_fp_to_thread(target); @@ -365,14 +400,14 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, return i; for (i = 0; i < 32 ; i++) target->thread.TS_FPR(i) = buf[i]; - memcpy(&target->thread.fpscr, &buf[32], sizeof(double)); + target->thread.fp_state.fpscr = buf[32]; return 0; #else - BUILD_BUG_ON(offsetof(struct thread_struct, fpscr) != - offsetof(struct thread_struct, TS_FPR(32))); + BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != + offsetof(struct thread_fp_state, fpr[32][0])); return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpr, 0, -1); + &target->thread.fp_state, 0, -1); #endif } @@ -405,11 +440,11 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset, flush_altivec_to_thread(target); - BUILD_BUG_ON(offsetof(struct thread_struct, vscr) != - offsetof(struct thread_struct, vr[32])); + BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) != + offsetof(struct thread_vr_state, vr[32])); ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.vr, 0, + &target->thread.vr_state, 0, 33 * sizeof(vector128)); if (!ret) { /* @@ -436,11 +471,12 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset, flush_altivec_to_thread(target); - BUILD_BUG_ON(offsetof(struct thread_struct, vscr) != - offsetof(struct thread_struct, vr[32])); + BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) != + offsetof(struct thread_vr_state, vr[32])); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.vr, 0, 33 * sizeof(vector128)); + &target->thread.vr_state, 0, + 33 * sizeof(vector128)); if (!ret && count > 0) { /* * We use only the first word of vrsave. @@ -479,13 +515,13 @@ static int vsr_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - double buf[32]; + u64 buf[32]; int ret, i; flush_vsx_to_thread(target); for (i = 0; i < 32 ; i++) - buf[i] = target->thread.fpr[i][TS_VSRLOWOFFSET]; + buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, 32 * sizeof(double)); @@ -496,7 +532,7 @@ static int vsr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - double buf[32]; + u64 buf[32]; int ret,i; flush_vsx_to_thread(target); @@ -504,7 +540,7 @@ static int vsr_set(struct task_struct *target, const struct user_regset *regset, ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, 32 * sizeof(double)); for (i = 0; i < 32 ; i++) - target->thread.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; return ret; @@ -622,7 +658,7 @@ static const struct user_regset native_regsets[] = { #endif #ifdef CONFIG_SPE [REGSET_SPE] = { - .n = 35, + .core_note_type = NT_PPC_SPE, .n = 35, .size = sizeof(u32), .align = sizeof(u32), .active = evr_active, .get = evr_get, .set = evr_set }, @@ -819,8 +855,8 @@ void user_enable_single_step(struct task_struct *task) if (regs != NULL) { #ifdef CONFIG_PPC_ADV_DEBUG_REGS - task->thread.dbcr0 &= ~DBCR0_BT; - task->thread.dbcr0 |= DBCR0_IDM | DBCR0_IC; + task->thread.debug.dbcr0 &= ~DBCR0_BT; + task->thread.debug.dbcr0 |= DBCR0_IDM | DBCR0_IC; regs->msr |= MSR_DE; #else regs->msr &= ~MSR_BE; @@ -836,8 +872,8 @@ void user_enable_block_step(struct task_struct *task) if (regs != NULL) { #ifdef CONFIG_PPC_ADV_DEBUG_REGS - task->thread.dbcr0 &= ~DBCR0_IC; - task->thread.dbcr0 = DBCR0_IDM | DBCR0_BT; + task->thread.debug.dbcr0 &= ~DBCR0_IC; + task->thread.debug.dbcr0 = DBCR0_IDM | DBCR0_BT; regs->msr |= MSR_DE; #else regs->msr &= ~MSR_SE; @@ -859,16 +895,16 @@ void user_disable_single_step(struct task_struct *task) * And, after doing so, if all debug flags are off, turn * off DBCR0(IDM) and MSR(DE) .... Torez */ - task->thread.dbcr0 &= ~DBCR0_IC; + task->thread.debug.dbcr0 &= ~(DBCR0_IC|DBCR0_BT); /* * Test to see if any of the DBCR_ACTIVE_EVENTS bits are set. */ - if (!DBCR_ACTIVE_EVENTS(task->thread.dbcr0, - task->thread.dbcr1)) { + if (!DBCR_ACTIVE_EVENTS(task->thread.debug.dbcr0, + task->thread.debug.dbcr1)) { /* * All debug events were off..... */ - task->thread.dbcr0 &= ~DBCR0_IDM; + task->thread.debug.dbcr0 &= ~DBCR0_IDM; regs->msr &= ~MSR_DE; } #else @@ -905,6 +941,9 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, struct perf_event *bp; struct perf_event_attr attr; #endif /* CONFIG_HAVE_HW_BREAKPOINT */ +#ifndef CONFIG_PPC_ADV_DEBUG_REGS + struct arch_hw_breakpoint hw_brk; +#endif /* For ppc64 we support one DABR and no IABR's at the moment (ppc64). * For embedded processors we support one DAC and no IAC's at the @@ -931,64 +970,52 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, */ /* Ensure breakpoint translation bit is set */ - if (data && !(data & DABR_TRANSLATION)) + if (data && !(data & HW_BRK_TYPE_TRANSLATE)) return -EIO; + hw_brk.address = data & (~HW_BRK_TYPE_DABR); + hw_brk.type = (data & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL; + hw_brk.len = 8; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(task) < 0) - return -ESRCH; - bp = thread->ptrace_bps[0]; - if ((!data) || !(data & (DABR_DATA_WRITE | DABR_DATA_READ))) { + if ((!data) || !(hw_brk.type & HW_BRK_TYPE_RDWR)) { if (bp) { unregister_hw_breakpoint(bp); thread->ptrace_bps[0] = NULL; } - ptrace_put_breakpoints(task); return 0; } if (bp) { attr = bp->attr; - attr.bp_addr = data & ~HW_BREAKPOINT_ALIGN; - arch_bp_generic_fields(data & - (DABR_DATA_WRITE | DABR_DATA_READ), - &attr.bp_type); + attr.bp_addr = hw_brk.address; + arch_bp_generic_fields(hw_brk.type, &attr.bp_type); /* Enable breakpoint */ attr.disabled = false; ret = modify_user_hw_breakpoint(bp, &attr); if (ret) { - ptrace_put_breakpoints(task); return ret; } thread->ptrace_bps[0] = bp; - ptrace_put_breakpoints(task); - thread->dabr = data; - thread->dabrx = DABRX_ALL; + thread->hw_brk = hw_brk; return 0; } /* Create a new breakpoint request if one doesn't exist already */ hw_breakpoint_init(&attr); - attr.bp_addr = data & ~HW_BREAKPOINT_ALIGN; - arch_bp_generic_fields(data & (DABR_DATA_WRITE | DABR_DATA_READ), - &attr.bp_type); + attr.bp_addr = hw_brk.address; + arch_bp_generic_fields(hw_brk.type, + &attr.bp_type); thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr, ptrace_triggered, NULL, task); if (IS_ERR(bp)) { thread->ptrace_bps[0] = NULL; - ptrace_put_breakpoints(task); return PTR_ERR(bp); } - ptrace_put_breakpoints(task); - #endif /* CONFIG_HAVE_HW_BREAKPOINT */ - - /* Move contents to the DABR register */ - task->thread.dabr = data; - task->thread.dabrx = DABRX_ALL; + task->thread.hw_brk = hw_brk; #else /* CONFIG_PPC_ADV_DEBUG_REGS */ /* As described above, it was assumed 3 bits were passed with the data * address, but we will assume only the mode bits will be passed @@ -996,14 +1023,14 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, */ /* DAC's hold the whole address without any mode flags */ - task->thread.dac1 = data & ~0x3UL; + task->thread.debug.dac1 = data & ~0x3UL; - if (task->thread.dac1 == 0) { + if (task->thread.debug.dac1 == 0) { dbcr_dac(task) &= ~(DBCR_DAC1R | DBCR_DAC1W); - if (!DBCR_ACTIVE_EVENTS(task->thread.dbcr0, - task->thread.dbcr1)) { + if (!DBCR_ACTIVE_EVENTS(task->thread.debug.dbcr0, + task->thread.debug.dbcr1)) { task->thread.regs->msr &= ~MSR_DE; - task->thread.dbcr0 &= ~DBCR0_IDM; + task->thread.debug.dbcr0 &= ~DBCR0_IDM; } return 0; } @@ -1015,7 +1042,7 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, /* Set the Internal Debugging flag (IDM bit 1) for the DBCR0 register */ - task->thread.dbcr0 |= DBCR0_IDM; + task->thread.debug.dbcr0 |= DBCR0_IDM; /* Check for write and read flags and set DBCR0 accordingly */ @@ -1045,10 +1072,10 @@ static long set_instruction_bp(struct task_struct *child, struct ppc_hw_breakpoint *bp_info) { int slot; - int slot1_in_use = ((child->thread.dbcr0 & DBCR0_IAC1) != 0); - int slot2_in_use = ((child->thread.dbcr0 & DBCR0_IAC2) != 0); - int slot3_in_use = ((child->thread.dbcr0 & DBCR0_IAC3) != 0); - int slot4_in_use = ((child->thread.dbcr0 & DBCR0_IAC4) != 0); + int slot1_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC1) != 0); + int slot2_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC2) != 0); + int slot3_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC3) != 0); + int slot4_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC4) != 0); if (dbcr_iac_range(child) & DBCR_IAC12MODE) slot2_in_use = 1; @@ -1067,9 +1094,9 @@ static long set_instruction_bp(struct task_struct *child, /* We need a pair of IAC regsisters */ if ((!slot1_in_use) && (!slot2_in_use)) { slot = 1; - child->thread.iac1 = bp_info->addr; - child->thread.iac2 = bp_info->addr2; - child->thread.dbcr0 |= DBCR0_IAC1; + child->thread.debug.iac1 = bp_info->addr; + child->thread.debug.iac2 = bp_info->addr2; + child->thread.debug.dbcr0 |= DBCR0_IAC1; if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE) dbcr_iac_range(child) |= DBCR_IAC12X; @@ -1078,9 +1105,9 @@ static long set_instruction_bp(struct task_struct *child, #if CONFIG_PPC_ADV_DEBUG_IACS > 2 } else if ((!slot3_in_use) && (!slot4_in_use)) { slot = 3; - child->thread.iac3 = bp_info->addr; - child->thread.iac4 = bp_info->addr2; - child->thread.dbcr0 |= DBCR0_IAC3; + child->thread.debug.iac3 = bp_info->addr; + child->thread.debug.iac4 = bp_info->addr2; + child->thread.debug.dbcr0 |= DBCR0_IAC3; if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE) dbcr_iac_range(child) |= DBCR_IAC34X; @@ -1100,30 +1127,30 @@ static long set_instruction_bp(struct task_struct *child, */ if (slot2_in_use || (slot3_in_use == slot4_in_use)) { slot = 1; - child->thread.iac1 = bp_info->addr; - child->thread.dbcr0 |= DBCR0_IAC1; + child->thread.debug.iac1 = bp_info->addr; + child->thread.debug.dbcr0 |= DBCR0_IAC1; goto out; } } if (!slot2_in_use) { slot = 2; - child->thread.iac2 = bp_info->addr; - child->thread.dbcr0 |= DBCR0_IAC2; + child->thread.debug.iac2 = bp_info->addr; + child->thread.debug.dbcr0 |= DBCR0_IAC2; #if CONFIG_PPC_ADV_DEBUG_IACS > 2 } else if (!slot3_in_use) { slot = 3; - child->thread.iac3 = bp_info->addr; - child->thread.dbcr0 |= DBCR0_IAC3; + child->thread.debug.iac3 = bp_info->addr; + child->thread.debug.dbcr0 |= DBCR0_IAC3; } else if (!slot4_in_use) { slot = 4; - child->thread.iac4 = bp_info->addr; - child->thread.dbcr0 |= DBCR0_IAC4; + child->thread.debug.iac4 = bp_info->addr; + child->thread.debug.dbcr0 |= DBCR0_IAC4; #endif } else return -ENOSPC; } out: - child->thread.dbcr0 |= DBCR0_IDM; + child->thread.debug.dbcr0 |= DBCR0_IDM; child->thread.regs->msr |= MSR_DE; return slot; @@ -1133,49 +1160,49 @@ static int del_instruction_bp(struct task_struct *child, int slot) { switch (slot) { case 1: - if ((child->thread.dbcr0 & DBCR0_IAC1) == 0) + if ((child->thread.debug.dbcr0 & DBCR0_IAC1) == 0) return -ENOENT; if (dbcr_iac_range(child) & DBCR_IAC12MODE) { /* address range - clear slots 1 & 2 */ - child->thread.iac2 = 0; + child->thread.debug.iac2 = 0; dbcr_iac_range(child) &= ~DBCR_IAC12MODE; } - child->thread.iac1 = 0; - child->thread.dbcr0 &= ~DBCR0_IAC1; + child->thread.debug.iac1 = 0; + child->thread.debug.dbcr0 &= ~DBCR0_IAC1; break; case 2: - if ((child->thread.dbcr0 & DBCR0_IAC2) == 0) + if ((child->thread.debug.dbcr0 & DBCR0_IAC2) == 0) return -ENOENT; if (dbcr_iac_range(child) & DBCR_IAC12MODE) /* used in a range */ return -EINVAL; - child->thread.iac2 = 0; - child->thread.dbcr0 &= ~DBCR0_IAC2; + child->thread.debug.iac2 = 0; + child->thread.debug.dbcr0 &= ~DBCR0_IAC2; break; #if CONFIG_PPC_ADV_DEBUG_IACS > 2 case 3: - if ((child->thread.dbcr0 & DBCR0_IAC3) == 0) + if ((child->thread.debug.dbcr0 & DBCR0_IAC3) == 0) return -ENOENT; if (dbcr_iac_range(child) & DBCR_IAC34MODE) { /* address range - clear slots 3 & 4 */ - child->thread.iac4 = 0; + child->thread.debug.iac4 = 0; dbcr_iac_range(child) &= ~DBCR_IAC34MODE; } - child->thread.iac3 = 0; - child->thread.dbcr0 &= ~DBCR0_IAC3; + child->thread.debug.iac3 = 0; + child->thread.debug.dbcr0 &= ~DBCR0_IAC3; break; case 4: - if ((child->thread.dbcr0 & DBCR0_IAC4) == 0) + if ((child->thread.debug.dbcr0 & DBCR0_IAC4) == 0) return -ENOENT; if (dbcr_iac_range(child) & DBCR_IAC34MODE) /* Used in a range */ return -EINVAL; - child->thread.iac4 = 0; - child->thread.dbcr0 &= ~DBCR0_IAC4; + child->thread.debug.iac4 = 0; + child->thread.debug.dbcr0 &= ~DBCR0_IAC4; break; #endif default: @@ -1205,18 +1232,18 @@ static int set_dac(struct task_struct *child, struct ppc_hw_breakpoint *bp_info) dbcr_dac(child) |= DBCR_DAC1R; if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) dbcr_dac(child) |= DBCR_DAC1W; - child->thread.dac1 = (unsigned long)bp_info->addr; + child->thread.debug.dac1 = (unsigned long)bp_info->addr; #if CONFIG_PPC_ADV_DEBUG_DVCS > 0 if (byte_enable) { - child->thread.dvc1 = + child->thread.debug.dvc1 = (unsigned long)bp_info->condition_value; - child->thread.dbcr2 |= + child->thread.debug.dbcr2 |= ((byte_enable << DBCR2_DVC1BE_SHIFT) | (condition_mode << DBCR2_DVC1M_SHIFT)); } #endif #ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE - } else if (child->thread.dbcr2 & DBCR2_DAC12MODE) { + } else if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) { /* Both dac1 and dac2 are part of a range */ return -ENOSPC; #endif @@ -1226,19 +1253,19 @@ static int set_dac(struct task_struct *child, struct ppc_hw_breakpoint *bp_info) dbcr_dac(child) |= DBCR_DAC2R; if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) dbcr_dac(child) |= DBCR_DAC2W; - child->thread.dac2 = (unsigned long)bp_info->addr; + child->thread.debug.dac2 = (unsigned long)bp_info->addr; #if CONFIG_PPC_ADV_DEBUG_DVCS > 0 if (byte_enable) { - child->thread.dvc2 = + child->thread.debug.dvc2 = (unsigned long)bp_info->condition_value; - child->thread.dbcr2 |= + child->thread.debug.dbcr2 |= ((byte_enable << DBCR2_DVC2BE_SHIFT) | (condition_mode << DBCR2_DVC2M_SHIFT)); } #endif } else return -ENOSPC; - child->thread.dbcr0 |= DBCR0_IDM; + child->thread.debug.dbcr0 |= DBCR0_IDM; child->thread.regs->msr |= MSR_DE; return slot + 4; @@ -1250,32 +1277,32 @@ static int del_dac(struct task_struct *child, int slot) if ((dbcr_dac(child) & (DBCR_DAC1R | DBCR_DAC1W)) == 0) return -ENOENT; - child->thread.dac1 = 0; + child->thread.debug.dac1 = 0; dbcr_dac(child) &= ~(DBCR_DAC1R | DBCR_DAC1W); #ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE - if (child->thread.dbcr2 & DBCR2_DAC12MODE) { - child->thread.dac2 = 0; - child->thread.dbcr2 &= ~DBCR2_DAC12MODE; + if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) { + child->thread.debug.dac2 = 0; + child->thread.debug.dbcr2 &= ~DBCR2_DAC12MODE; } - child->thread.dbcr2 &= ~(DBCR2_DVC1M | DBCR2_DVC1BE); + child->thread.debug.dbcr2 &= ~(DBCR2_DVC1M | DBCR2_DVC1BE); #endif #if CONFIG_PPC_ADV_DEBUG_DVCS > 0 - child->thread.dvc1 = 0; + child->thread.debug.dvc1 = 0; #endif } else if (slot == 2) { if ((dbcr_dac(child) & (DBCR_DAC2R | DBCR_DAC2W)) == 0) return -ENOENT; #ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE - if (child->thread.dbcr2 & DBCR2_DAC12MODE) + if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) /* Part of a range */ return -EINVAL; - child->thread.dbcr2 &= ~(DBCR2_DVC2M | DBCR2_DVC2BE); + child->thread.debug.dbcr2 &= ~(DBCR2_DVC2M | DBCR2_DVC2BE); #endif #if CONFIG_PPC_ADV_DEBUG_DVCS > 0 - child->thread.dvc2 = 0; + child->thread.debug.dvc2 = 0; #endif - child->thread.dac2 = 0; + child->thread.debug.dac2 = 0; dbcr_dac(child) &= ~(DBCR_DAC2R | DBCR_DAC2W); } else return -EINVAL; @@ -1317,22 +1344,22 @@ static int set_dac_range(struct task_struct *child, return -EIO; } - if (child->thread.dbcr0 & + if (child->thread.debug.dbcr0 & (DBCR0_DAC1R | DBCR0_DAC1W | DBCR0_DAC2R | DBCR0_DAC2W)) return -ENOSPC; if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) - child->thread.dbcr0 |= (DBCR0_DAC1R | DBCR0_IDM); + child->thread.debug.dbcr0 |= (DBCR0_DAC1R | DBCR0_IDM); if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) - child->thread.dbcr0 |= (DBCR0_DAC1W | DBCR0_IDM); - child->thread.dac1 = bp_info->addr; - child->thread.dac2 = bp_info->addr2; + child->thread.debug.dbcr0 |= (DBCR0_DAC1W | DBCR0_IDM); + child->thread.debug.dac1 = bp_info->addr; + child->thread.debug.dac2 = bp_info->addr2; if (mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) - child->thread.dbcr2 |= DBCR2_DAC12M; + child->thread.debug.dbcr2 |= DBCR2_DAC12M; else if (mode == PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE) - child->thread.dbcr2 |= DBCR2_DAC12MX; + child->thread.debug.dbcr2 |= DBCR2_DAC12MX; else /* PPC_BREAKPOINT_MODE_MASK */ - child->thread.dbcr2 |= DBCR2_DAC12MM; + child->thread.debug.dbcr2 |= DBCR2_DAC12MM; child->thread.regs->msr |= MSR_DE; return 5; @@ -1349,7 +1376,7 @@ static long ppc_set_hwdebug(struct task_struct *child, struct perf_event_attr attr; #endif /* CONFIG_HAVE_HW_BREAKPOINT */ #ifndef CONFIG_PPC_ADV_DEBUG_REGS - unsigned long dabr; + struct arch_hw_breakpoint brk; #endif if (bp_info->version != 1) @@ -1397,59 +1424,51 @@ static long ppc_set_hwdebug(struct task_struct *child, if ((unsigned long)bp_info->addr >= TASK_SIZE) return -EIO; - dabr = (unsigned long)bp_info->addr & ~7UL; - dabr |= DABR_TRANSLATION; + brk.address = bp_info->addr & ~7UL; + brk.type = HW_BRK_TYPE_TRANSLATE; + brk.len = 8; if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) - dabr |= DABR_DATA_READ; + brk.type |= HW_BRK_TYPE_READ; if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) - dabr |= DABR_DATA_WRITE; + brk.type |= HW_BRK_TYPE_WRITE; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - /* * Check if the request is for 'range' breakpoints. We can * support it if range < 8 bytes. */ - if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) { + if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) len = bp_info->addr2 - bp_info->addr; - } else if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) { - ptrace_put_breakpoints(child); + else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT) + len = 1; + else return -EINVAL; - } bp = thread->ptrace_bps[0]; - if (bp) { - ptrace_put_breakpoints(child); + if (bp) return -ENOSPC; - } /* Create a new breakpoint request if one doesn't exist already */ hw_breakpoint_init(&attr); attr.bp_addr = (unsigned long)bp_info->addr & ~HW_BREAKPOINT_ALIGN; attr.bp_len = len; - arch_bp_generic_fields(dabr & (DABR_DATA_WRITE | DABR_DATA_READ), - &attr.bp_type); + arch_bp_generic_fields(brk.type, &attr.bp_type); thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr, ptrace_triggered, NULL, child); if (IS_ERR(bp)) { thread->ptrace_bps[0] = NULL; - ptrace_put_breakpoints(child); return PTR_ERR(bp); } - ptrace_put_breakpoints(child); return 1; #endif /* CONFIG_HAVE_HW_BREAKPOINT */ if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) return -EINVAL; - if (child->thread.dabr) + if (child->thread.hw_brk.address) return -ENOSPC; - child->thread.dabr = dabr; - child->thread.dabrx = DABRX_ALL; + child->thread.hw_brk = brk; return 1; #endif /* !CONFIG_PPC_ADV_DEBUG_DVCS */ @@ -1471,9 +1490,9 @@ static long ppc_del_hwdebug(struct task_struct *child, long data) rc = del_dac(child, (int)data - 4); if (!rc) { - if (!DBCR_ACTIVE_EVENTS(child->thread.dbcr0, - child->thread.dbcr1)) { - child->thread.dbcr0 &= ~DBCR0_IDM; + if (!DBCR_ACTIVE_EVENTS(child->thread.debug.dbcr0, + child->thread.debug.dbcr1)) { + child->thread.debug.dbcr0 &= ~DBCR0_IDM; child->thread.regs->msr &= ~MSR_DE; } } @@ -1483,22 +1502,19 @@ static long ppc_del_hwdebug(struct task_struct *child, long data) return -EINVAL; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - bp = thread->ptrace_bps[0]; if (bp) { unregister_hw_breakpoint(bp); thread->ptrace_bps[0] = NULL; } else ret = -ENOENT; - ptrace_put_breakpoints(child); return ret; #else /* CONFIG_HAVE_HW_BREAKPOINT */ - if (child->thread.dabr == 0) + if (child->thread.hw_brk.address == 0) return -ENOENT; - child->thread.dabr = 0; + child->thread.hw_brk.address = 0; + child->thread.hw_brk.type = 0; #endif /* CONFIG_HAVE_HW_BREAKPOINT */ return 0; @@ -1531,16 +1547,18 @@ long arch_ptrace(struct task_struct *child, long request, CHECK_FULL_REGS(child->thread.regs); if (index < PT_FPR0) { - tmp = ptrace_get_reg(child, (int) index); + ret = ptrace_get_reg(child, (int) index, &tmp); + if (ret) + break; } else { unsigned int fpidx = index - PT_FPR0; flush_fp_to_thread(child); if (fpidx < (PT_FPSCR - PT_FPR0)) - tmp = ((unsigned long *)child->thread.fpr) - [fpidx * TS_FPRWIDTH]; + memcpy(&tmp, &child->thread.TS_FPR(fpidx), + sizeof(long)); else - tmp = child->thread.fpscr.val; + tmp = child->thread.fp_state.fpscr; } ret = put_user(tmp, datalp); break; @@ -1570,10 +1588,10 @@ long arch_ptrace(struct task_struct *child, long request, flush_fp_to_thread(child); if (fpidx < (PT_FPSCR - PT_FPR0)) - ((unsigned long *)child->thread.fpr) - [fpidx * TS_FPRWIDTH] = data; + memcpy(&child->thread.TS_FPR(fpidx), &data, + sizeof(long)); else - child->thread.fpscr.val = data; + child->thread.fp_state.fpscr = data; ret = 0; } break; @@ -1608,6 +1626,8 @@ long arch_ptrace(struct task_struct *child, long request, dbginfo.sizeof_condition = 0; #ifdef CONFIG_HAVE_HW_BREAKPOINT dbginfo.features = PPC_DEBUG_FEATURE_DATA_BP_RANGE; + if (cpu_has_feature(CPU_FTR_DAWR)) + dbginfo.features |= PPC_DEBUG_FEATURE_DATA_BP_DAWR; #else dbginfo.features = 0; #endif /* CONFIG_HAVE_HW_BREAKPOINT */ @@ -1642,14 +1662,19 @@ long arch_ptrace(struct task_struct *child, long request, } case PTRACE_GET_DEBUGREG: { +#ifndef CONFIG_PPC_ADV_DEBUG_REGS + unsigned long dabr_fake; +#endif ret = -EINVAL; /* We only support one DABR and no IABRS at the moment */ if (addr > 0) break; #ifdef CONFIG_PPC_ADV_DEBUG_REGS - ret = put_user(child->thread.dac1, datalp); + ret = put_user(child->thread.debug.dac1, datalp); #else - ret = put_user(child->thread.dabr, datalp); + dabr_fake = ((child->thread.hw_brk.address & (~HW_BRK_TYPE_DABR)) | + (child->thread.hw_brk.type & HW_BRK_TYPE_DABR)); + ret = put_user(dabr_fake, datalp); #endif break; } @@ -1745,6 +1770,8 @@ long do_syscall_trace_enter(struct pt_regs *regs) { long ret = 0; + user_exit(); + secure_computing_strict(regs->gpr[0]); if (test_thread_flag(TIF_SYSCALL_TRACE) && @@ -1789,4 +1816,6 @@ void do_syscall_trace_leave(struct pt_regs *regs) step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, step); + + user_enter(); } diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c index 8c21658719d..f52b7db327c 100644 --- a/arch/powerpc/kernel/ptrace32.c +++ b/arch/powerpc/kernel/ptrace32.c @@ -43,7 +43,6 @@ #define FPRNUMBER(i) (((i) - PT_FPR0) >> 1) #define FPRHALF(i) (((i) - PT_FPR0) & 1) #define FPRINDEX(i) TS_FPRWIDTH * FPRNUMBER(i) * 2 + FPRHALF(i) -#define FPRINDEX_3264(i) (TS_FPRWIDTH * ((i) - PT_FPR0)) long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) @@ -95,7 +94,9 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, CHECK_FULL_REGS(child->thread.regs); if (index < PT_FPR0) { - tmp = ptrace_get_reg(child, index); + ret = ptrace_get_reg(child, index, &tmp); + if (ret) + break; } else { flush_fp_to_thread(child); /* @@ -103,7 +104,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, * to be an array of unsigned int (32 bits) - the * index passed in is based on this assumption. */ - tmp = ((unsigned int *)child->thread.fpr) + tmp = ((unsigned int *)child->thread.fp_state.fpr) [FPRINDEX(index)]; } ret = put_user((unsigned int)tmp, (u32 __user *)data); @@ -145,10 +146,13 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if (numReg >= PT_FPR0) { flush_fp_to_thread(child); /* get 64 bit FPR */ - tmp = ((u64 *)child->thread.fpr) - [FPRINDEX_3264(numReg)]; + tmp = child->thread.fp_state.fpr[numReg - PT_FPR0][0]; } else { /* register within PT_REGS struct */ - tmp = ptrace_get_reg(child, numReg); + unsigned long tmp2; + ret = ptrace_get_reg(child, numReg, &tmp2); + if (ret) + break; + tmp = tmp2; } reg32bits = ((u32*)&tmp)[part]; ret = put_user(reg32bits, (u32 __user *)data); @@ -201,7 +205,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, * to be an array of unsigned int (32 bits) - the * index passed in is based on this assumption. */ - ((unsigned int *)child->thread.fpr) + ((unsigned int *)child->thread.fp_state.fpr) [FPRINDEX(index)] = data; ret = 0; } @@ -232,7 +236,10 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; CHECK_FULL_REGS(child->thread.regs); if (numReg < PT_FPR0) { - unsigned long freg = ptrace_get_reg(child, numReg); + unsigned long freg; + ret = ptrace_get_reg(child, numReg, &freg); + if (ret) + break; if (index % 2) freg = (freg & ~0xfffffffful) | (data & 0xfffffffful); else @@ -242,8 +249,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, u64 *tmp; flush_fp_to_thread(child); /* get 64 bit FPR ... */ - tmp = &(((u64 *)child->thread.fpr) - [FPRINDEX_3264(numReg)]); + tmp = &child->thread.fp_state.fpr[numReg - PT_FPR0][0]; /* ... write the 32 bit part we want */ ((u32 *)tmp)[index % 2] = data; ret = 0; @@ -252,14 +258,20 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, } case PTRACE_GET_DEBUGREG: { +#ifndef CONFIG_PPC_ADV_DEBUG_REGS + unsigned long dabr_fake; +#endif ret = -EINVAL; /* We only support one DABR and no IABRS at the moment */ if (addr > 0) break; #ifdef CONFIG_PPC_ADV_DEBUG_REGS - ret = put_user(child->thread.dac1, (u32 __user *)data); + ret = put_user(child->thread.debug.dac1, (u32 __user *)data); #else - ret = put_user(child->thread.dabr, (u32 __user *)data); + dabr_fake = ( + (child->thread.hw_brk.address & (~HW_BRK_TYPE_DABR)) | + (child->thread.hw_brk.type & HW_BRK_TYPE_DABR)); + ret = put_user(dabr_fake, (u32 __user *)data); #endif break; } diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S index ef46ba6e094..f366fedb087 100644 --- a/arch/powerpc/kernel/reloc_32.S +++ b/arch/powerpc/kernel/reloc_32.S @@ -166,7 +166,7 @@ ha16: /* R_PPC_ADDR16_LO */ lo16: cmpwi r4, R_PPC_ADDR16_LO - bne nxtrela + bne unknown_type lwz r4, 0(r9) /* r_offset */ lwz r0, 8(r9) /* r_addend */ add r0, r0, r3 @@ -191,6 +191,7 @@ nxtrela: dcbst r4,r7 sync /* Ensure the data is flushed before icbi */ icbi r4,r7 +unknown_type: cmpwi r8, 0 /* relasz = 0 ? */ ble done add r9, r9, r6 /* move to next entry in the .rela table */ diff --git a/arch/powerpc/kernel/reloc_64.S b/arch/powerpc/kernel/reloc_64.S index b47a0e1ab00..d88736fbece 100644 --- a/arch/powerpc/kernel/reloc_64.S +++ b/arch/powerpc/kernel/reloc_64.S @@ -69,8 +69,8 @@ _GLOBAL(relocate) * R_PPC64_RELATIVE ones. */ mtctr r8 -5: lwz r0,12(9) /* ELF64_R_TYPE(reloc->r_info) */ - cmpwi r0,R_PPC64_RELATIVE +5: ld r0,8(9) /* ELF64_R_TYPE(reloc->r_info) */ + cmpdi r0,R_PPC64_RELATIVE bne 6f ld r6,0(r9) /* reloc->r_offset */ ld r0,16(r9) /* reloc->r_addend */ @@ -81,6 +81,7 @@ _GLOBAL(relocate) 6: blr +.balign 8 p_dyn: .llong __dynamic_start - 0b p_rela: .llong __rela_dyn_start - 0b p_st: .llong _stext - 0b diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 1fd6e7b2f39..8b4c857c142 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -19,6 +19,7 @@ #include <linux/init.h> #include <linux/capability.h> #include <linux/delay.h> +#include <linux/cpu.h> #include <linux/smp.h> #include <linux/completion.h> #include <linux/cpumask.h> @@ -90,7 +91,7 @@ static void unlock_rtas(unsigned long flags) * are designed only for very early low-level debugging, which * is why the token is hard-coded to 10. */ -static void call_rtas_display_status(char c) +static void call_rtas_display_status(unsigned char c) { struct rtas_args *args = &rtas.args; unsigned long s; @@ -99,11 +100,11 @@ static void call_rtas_display_status(char c) return; s = lock_rtas(); - args->token = 10; - args->nargs = 1; - args->nret = 1; - args->rets = (rtas_arg_t *)&(args->args[1]); - args->args[0] = (unsigned char)c; + args->token = cpu_to_be32(10); + args->nargs = cpu_to_be32(1); + args->nret = cpu_to_be32(1); + args->rets = &(args->args[1]); + args->args[0] = cpu_to_be32(c); enter_rtas(__pa(args)); @@ -203,7 +204,7 @@ void rtas_progress(char *s, unsigned short hex) { struct device_node *root; int width; - const int *p; + const __be32 *p; char *os; static int display_character, set_indicator; static int display_width, display_lines, form_feed; @@ -220,13 +221,13 @@ void rtas_progress(char *s, unsigned short hex) if ((root = of_find_node_by_path("/rtas"))) { if ((p = of_get_property(root, "ibm,display-line-length", NULL))) - display_width = *p; + display_width = be32_to_cpu(*p); if ((p = of_get_property(root, "ibm,form-feed", NULL))) - form_feed = *p; + form_feed = be32_to_cpu(*p); if ((p = of_get_property(root, "ibm,display-number-of-lines", NULL))) - display_lines = *p; + display_lines = be32_to_cpu(*p); row_width = of_get_property(root, "ibm,display-truncation-length", NULL); of_node_put(root); @@ -321,11 +322,11 @@ EXPORT_SYMBOL(rtas_progress); /* needed by rtas_flash module */ int rtas_token(const char *service) { - const int *tokp; + const __be32 *tokp; if (rtas.dev == NULL) return RTAS_UNKNOWN_SERVICE; tokp = of_get_property(rtas.dev, service, NULL); - return tokp ? *tokp : RTAS_UNKNOWN_SERVICE; + return tokp ? be32_to_cpu(*tokp) : RTAS_UNKNOWN_SERVICE; } EXPORT_SYMBOL(rtas_token); @@ -379,11 +380,11 @@ static char *__fetch_rtas_last_error(char *altbuf) bufsz = rtas_get_error_log_max(); - err_args.token = rtas_last_error_token; - err_args.nargs = 2; - err_args.nret = 1; - err_args.args[0] = (rtas_arg_t)__pa(rtas_err_buf); - err_args.args[1] = bufsz; + err_args.token = cpu_to_be32(rtas_last_error_token); + err_args.nargs = cpu_to_be32(2); + err_args.nret = cpu_to_be32(1); + err_args.args[0] = cpu_to_be32(__pa(rtas_err_buf)); + err_args.args[1] = cpu_to_be32(bufsz); err_args.args[2] = 0; save_args = rtas.args; @@ -432,13 +433,13 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...) s = lock_rtas(); rtas_args = &rtas.args; - rtas_args->token = token; - rtas_args->nargs = nargs; - rtas_args->nret = nret; - rtas_args->rets = (rtas_arg_t *)&(rtas_args->args[nargs]); + rtas_args->token = cpu_to_be32(token); + rtas_args->nargs = cpu_to_be32(nargs); + rtas_args->nret = cpu_to_be32(nret); + rtas_args->rets = &(rtas_args->args[nargs]); va_start(list, outputs); for (i = 0; i < nargs; ++i) - rtas_args->args[i] = va_arg(list, rtas_arg_t); + rtas_args->args[i] = cpu_to_be32(va_arg(list, __u32)); va_end(list); for (i = 0; i < nret; ++i) @@ -448,13 +449,13 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...) /* A -1 return code indicates that the last command couldn't be completed due to a hardware error. */ - if (rtas_args->rets[0] == -1) + if (be32_to_cpu(rtas_args->rets[0]) == -1) buff_copy = __fetch_rtas_last_error(NULL); if (nret > 1 && outputs != NULL) for (i = 0; i < nret-1; ++i) - outputs[i] = rtas_args->rets[i+1]; - ret = (nret > 0)? rtas_args->rets[0]: 0; + outputs[i] = be32_to_cpu(rtas_args->rets[i+1]); + ret = (nret > 0)? be32_to_cpu(rtas_args->rets[0]): 0; unlock_rtas(s); @@ -587,8 +588,8 @@ bool rtas_indicator_present(int token, int *maxindex) { int proplen, count, i; const struct indicator_elem { - u32 token; - u32 maxindex; + __be32 token; + __be32 maxindex; } *indicators; indicators = of_get_property(rtas.dev, "rtas-indicators", &proplen); @@ -598,10 +599,10 @@ bool rtas_indicator_present(int token, int *maxindex) count = proplen / sizeof(struct indicator_elem); for (i = 0; i < count; i++) { - if (indicators[i].token != token) + if (__be32_to_cpu(indicators[i].token) != token) continue; if (maxindex) - *maxindex = indicators[i].maxindex; + *maxindex = __be32_to_cpu(indicators[i].maxindex); return true; } @@ -807,6 +808,95 @@ static void rtas_percpu_suspend_me(void *info) __rtas_suspend_cpu((struct rtas_suspend_me_data *)info, 1); } +enum rtas_cpu_state { + DOWN, + UP, +}; + +#ifndef CONFIG_SMP +static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, + cpumask_var_t cpus) +{ + if (!cpumask_empty(cpus)) { + cpumask_clear(cpus); + return -EINVAL; + } else + return 0; +} +#else +/* On return cpumask will be altered to indicate CPUs changed. + * CPUs with states changed will be set in the mask, + * CPUs with status unchanged will be unset in the mask. */ +static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, + cpumask_var_t cpus) +{ + int cpu; + int cpuret = 0; + int ret = 0; + + if (cpumask_empty(cpus)) + return 0; + + for_each_cpu(cpu, cpus) { + switch (state) { + case DOWN: + cpuret = cpu_down(cpu); + break; + case UP: + cpuret = cpu_up(cpu); + break; + } + if (cpuret) { + pr_debug("%s: cpu_%s for cpu#%d returned %d.\n", + __func__, + ((state == UP) ? "up" : "down"), + cpu, cpuret); + if (!ret) + ret = cpuret; + if (state == UP) { + /* clear bits for unchanged cpus, return */ + cpumask_shift_right(cpus, cpus, cpu); + cpumask_shift_left(cpus, cpus, cpu); + break; + } else { + /* clear bit for unchanged cpu, continue */ + cpumask_clear_cpu(cpu, cpus); + } + } + } + + return ret; +} +#endif + +int rtas_online_cpus_mask(cpumask_var_t cpus) +{ + int ret; + + ret = rtas_cpu_state_change_mask(UP, cpus); + + if (ret) { + cpumask_var_t tmp_mask; + + if (!alloc_cpumask_var(&tmp_mask, GFP_TEMPORARY)) + return ret; + + /* Use tmp_mask to preserve cpus mask from first failure */ + cpumask_copy(tmp_mask, cpus); + rtas_offline_cpus_mask(tmp_mask); + free_cpumask_var(tmp_mask); + } + + return ret; +} +EXPORT_SYMBOL(rtas_online_cpus_mask); + +int rtas_offline_cpus_mask(cpumask_var_t cpus) +{ + return rtas_cpu_state_change_mask(DOWN, cpus); +} +EXPORT_SYMBOL(rtas_offline_cpus_mask); + int rtas_ibm_suspend_me(struct rtas_args *args) { long state; @@ -814,6 +904,8 @@ int rtas_ibm_suspend_me(struct rtas_args *args) unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; struct rtas_suspend_me_data data; DECLARE_COMPLETION_ONSTACK(done); + cpumask_var_t offline_mask; + int cpuret; if (!rtas_service_present("ibm,suspend-me")) return -ENOSYS; @@ -837,11 +929,24 @@ int rtas_ibm_suspend_me(struct rtas_args *args) return 0; } + if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY)) + return -ENOMEM; + atomic_set(&data.working, 0); atomic_set(&data.done, 0); atomic_set(&data.error, 0); data.token = rtas_token("ibm,suspend-me"); data.complete = &done; + + /* All present CPUs must be online */ + cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask); + cpuret = rtas_online_cpus_mask(offline_mask); + if (cpuret) { + pr_err("%s: Could not bring present CPUs online.\n", __func__); + atomic_set(&data.error, cpuret); + goto out; + } + stop_topology_update(); /* Call function on all CPUs. One of us will make the @@ -857,6 +962,14 @@ int rtas_ibm_suspend_me(struct rtas_args *args) start_topology_update(); + /* Take down CPUs not online prior to suspend */ + cpuret = rtas_offline_cpus_mask(offline_mask); + if (cpuret) + pr_warn("%s: Could not restore CPUs to offline state.\n", + __func__); + +out: + free_cpumask_var(offline_mask); return atomic_read(&data.error); } #else /* CONFIG_PPC_PSERIES */ @@ -880,32 +993,36 @@ struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, (struct rtas_ext_event_log_v6 *)log->buffer; struct pseries_errorlog *sect; unsigned char *p, *log_end; + uint32_t ext_log_length = rtas_error_extended_log_length(log); + uint8_t log_format = rtas_ext_event_log_format(ext_log); + uint32_t company_id = rtas_ext_event_company_id(ext_log); /* Check that we understand the format */ - if (log->extended_log_length < sizeof(struct rtas_ext_event_log_v6) || - ext_log->log_format != RTAS_V6EXT_LOG_FORMAT_EVENT_LOG || - ext_log->company_id != RTAS_V6EXT_COMPANY_ID_IBM) + if (ext_log_length < sizeof(struct rtas_ext_event_log_v6) || + log_format != RTAS_V6EXT_LOG_FORMAT_EVENT_LOG || + company_id != RTAS_V6EXT_COMPANY_ID_IBM) return NULL; - log_end = log->buffer + log->extended_log_length; + log_end = log->buffer + ext_log_length; p = ext_log->vendor_log; while (p < log_end) { sect = (struct pseries_errorlog *)p; - if (sect->id == section_id) + if (pseries_errorlog_id(sect) == section_id) return sect; - p += sect->length; + p += pseries_errorlog_length(sect); } return NULL; } +/* We assume to be passed big endian arguments */ asmlinkage int ppc_rtas(struct rtas_args __user *uargs) { struct rtas_args args; unsigned long flags; char *buff_copy, *errbuf = NULL; - int nargs; + int nargs, nret, token; int rc; if (!capable(CAP_SYS_ADMIN)) @@ -914,10 +1031,13 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs) if (copy_from_user(&args, uargs, 3 * sizeof(u32)) != 0) return -EFAULT; - nargs = args.nargs; + nargs = be32_to_cpu(args.nargs); + nret = be32_to_cpu(args.nret); + token = be32_to_cpu(args.token); + if (nargs > ARRAY_SIZE(args.args) - || args.nret > ARRAY_SIZE(args.args) - || nargs + args.nret > ARRAY_SIZE(args.args)) + || nret > ARRAY_SIZE(args.args) + || nargs + nret > ARRAY_SIZE(args.args)) return -EINVAL; /* Copy in args. */ @@ -925,14 +1045,14 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs) nargs * sizeof(rtas_arg_t)) != 0) return -EFAULT; - if (args.token == RTAS_UNKNOWN_SERVICE) + if (token == RTAS_UNKNOWN_SERVICE) return -EINVAL; args.rets = &args.args[nargs]; - memset(args.rets, 0, args.nret * sizeof(rtas_arg_t)); + memset(args.rets, 0, nret * sizeof(rtas_arg_t)); /* Need to handle ibm,suspend_me call specially */ - if (args.token == ibm_suspend_me_token) { + if (token == ibm_suspend_me_token) { rc = rtas_ibm_suspend_me(&args); if (rc) return rc; @@ -949,7 +1069,7 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs) /* A -1 return code indicates that the last command couldn't be completed due to a hardware error. */ - if (args.rets[0] == -1) + if (be32_to_cpu(args.rets[0]) == -1) errbuf = __fetch_rtas_last_error(buff_copy); unlock_rtas(flags); @@ -964,7 +1084,7 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs) /* Copy out args. */ if (copy_to_user(uargs->args + nargs, args.args + nargs, - args.nret * sizeof(rtas_arg_t)) != 0) + nret * sizeof(rtas_arg_t)) != 0) return -EFAULT; return 0; @@ -984,19 +1104,19 @@ void __init rtas_initialize(void) */ rtas.dev = of_find_node_by_name(NULL, "rtas"); if (rtas.dev) { - const u32 *basep, *entryp, *sizep; + const __be32 *basep, *entryp, *sizep; basep = of_get_property(rtas.dev, "linux,rtas-base", NULL); sizep = of_get_property(rtas.dev, "rtas-size", NULL); if (basep != NULL && sizep != NULL) { - rtas.base = *basep; - rtas.size = *sizep; + rtas.base = __be32_to_cpu(*basep); + rtas.size = __be32_to_cpu(*sizep); entryp = of_get_property(rtas.dev, "linux,rtas-entry", NULL); if (entryp == NULL) /* Ugh */ rtas.entry = rtas.base; else - rtas.entry = *entryp; + rtas.entry = __be32_to_cpu(*entryp); } else rtas.dev = NULL; } @@ -1022,7 +1142,7 @@ void __init rtas_initialize(void) int __init early_init_dt_scan_rtas(unsigned long node, const char *uname, int depth, void *data) { - u32 *basep, *entryp, *sizep; + const u32 *basep, *entryp, *sizep; if (depth != 1 || strcmp(uname, "rtas") != 0) return 0; @@ -1059,7 +1179,7 @@ int __init early_init_dt_scan_rtas(unsigned long node, static arch_spinlock_t timebase_lock; static u64 timebase = 0; -void __cpuinit rtas_give_timebase(void) +void rtas_give_timebase(void) { unsigned long flags; @@ -1076,7 +1196,7 @@ void __cpuinit rtas_give_timebase(void) local_irq_restore(flags); } -void __cpuinit rtas_take_timebase(void) +void rtas_take_timebase(void) { while (!timebase) barrier(); diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index 8329190312c..db2b482af65 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -57,13 +57,31 @@ #define VALIDATE_READY -1001 /* Firmware image ready for validation */ #define VALIDATE_PARAM_ERR -3 /* RTAS Parameter Error */ #define VALIDATE_HW_ERR -1 /* RTAS Hardware Error */ -#define VALIDATE_TMP_UPDATE 0 /* Validate Return Status */ -#define VALIDATE_FLASH_AUTH 1 /* Validate Return Status */ -#define VALIDATE_INVALID_IMG 2 /* Validate Return Status */ -#define VALIDATE_CUR_UNKNOWN 3 /* Validate Return Status */ -#define VALIDATE_TMP_COMMIT_DL 4 /* Validate Return Status */ -#define VALIDATE_TMP_COMMIT 5 /* Validate Return Status */ -#define VALIDATE_TMP_UPDATE_DL 6 /* Validate Return Status */ + +/* ibm,validate-flash-image update result tokens */ +#define VALIDATE_TMP_UPDATE 0 /* T side will be updated */ +#define VALIDATE_FLASH_AUTH 1 /* Partition does not have authority */ +#define VALIDATE_INVALID_IMG 2 /* Candidate image is not valid */ +#define VALIDATE_CUR_UNKNOWN 3 /* Current fixpack level is unknown */ +/* + * Current T side will be committed to P side before being replace with new + * image, and the new image is downlevel from current image + */ +#define VALIDATE_TMP_COMMIT_DL 4 +/* + * Current T side will be committed to P side before being replaced with new + * image + */ +#define VALIDATE_TMP_COMMIT 5 +/* + * T side will be updated with a downlevel image + */ +#define VALIDATE_TMP_UPDATE_DL 6 +/* + * The candidate image's release date is later than the system's firmware + * service entitlement date - service warranty period has expired + */ +#define VALIDATE_OUT_OF_WRNTY 7 /* ibm,manage-flash-image operation tokens */ #define RTAS_REJECT_TMP_IMG 0 @@ -71,6 +89,7 @@ /* Array sizes */ #define VALIDATE_BUF_SIZE 4096 +#define VALIDATE_MSG_LEN 256 #define RTAS_MSG_MAXLEN 64 /* Quirk - RTAS requires 4k list length and block size */ @@ -102,9 +121,10 @@ static struct kmem_cache *flash_block_cache = NULL; #define FLASH_BLOCK_LIST_VERSION (1UL) -/* Local copy of the flash block list. - * We only allow one open of the flash proc file and create this - * list as we go. The rtas_firmware_flash_list varable will be +/* + * Local copy of the flash block list. + * + * The rtas_firmware_flash_list varable will be * set once the data is fully read. * * For convenience as we build the list we use virtual addrs, @@ -125,23 +145,23 @@ struct rtas_update_flash_t struct rtas_manage_flash_t { int status; /* Returned status */ - unsigned int op; /* Reject or commit image */ }; /* Status int must be first member of struct */ struct rtas_validate_flash_t { int status; /* Returned status */ - char buf[VALIDATE_BUF_SIZE]; /* Candidate image buffer */ + char *buf; /* Candidate image buffer */ unsigned int buf_size; /* Size of image buf */ unsigned int update_results; /* Update results token */ }; -static DEFINE_SPINLOCK(flash_file_open_lock); -static struct proc_dir_entry *firmware_flash_pde; -static struct proc_dir_entry *firmware_update_pde; -static struct proc_dir_entry *validate_pde; -static struct proc_dir_entry *manage_pde; +static struct rtas_update_flash_t rtas_update_flash_data; +static struct rtas_manage_flash_t rtas_manage_flash_data; +static struct rtas_validate_flash_t rtas_validate_flash_data; +static DEFINE_MUTEX(rtas_update_flash_mutex); +static DEFINE_MUTEX(rtas_manage_flash_mutex); +static DEFINE_MUTEX(rtas_validate_flash_mutex); /* Do simple sanity checks on the flash image. */ static int flash_list_valid(struct flash_block_list *flist) @@ -191,10 +211,10 @@ static void free_flash_list(struct flash_block_list *f) static int rtas_flash_release(struct inode *inode, struct file *file) { - struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode); - struct rtas_update_flash_t *uf; - - uf = (struct rtas_update_flash_t *) dp->data; + struct rtas_update_flash_t *const uf = &rtas_update_flash_data; + + mutex_lock(&rtas_update_flash_mutex); + if (uf->flist) { /* File was opened in write mode for a new flash attempt */ /* Clear saved list */ @@ -214,13 +234,14 @@ static int rtas_flash_release(struct inode *inode, struct file *file) uf->flist = NULL; } - atomic_dec(&dp->count); + mutex_unlock(&rtas_update_flash_mutex); return 0; } -static void get_flash_status_msg(int status, char *buf) +static size_t get_flash_status_msg(int status, char *buf) { - char *msg; + const char *msg; + size_t len; switch (status) { case FLASH_AUTH: @@ -242,36 +263,47 @@ static void get_flash_status_msg(int status, char *buf) msg = "ready: firmware image ready for flash on reboot\n"; break; default: - sprintf(buf, "error: unexpected status value %d\n", status); - return; + return sprintf(buf, "error: unexpected status value %d\n", + status); } - strcpy(buf, msg); + len = strlen(msg); + memcpy(buf, msg, len + 1); + return len; } /* Reading the proc file will show status (not the firmware contents) */ -static ssize_t rtas_flash_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t rtas_flash_read_msg(struct file *file, char __user *buf, + size_t count, loff_t *ppos) { - struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode); - struct rtas_update_flash_t *uf; + struct rtas_update_flash_t *const uf = &rtas_update_flash_data; char msg[RTAS_MSG_MAXLEN]; + size_t len; + int status; - uf = dp->data; - - if (!strcmp(dp->name, FIRMWARE_FLASH_NAME)) { - get_flash_status_msg(uf->status, msg); - } else { /* FIRMWARE_UPDATE_NAME */ - sprintf(msg, "%d\n", uf->status); - } + mutex_lock(&rtas_update_flash_mutex); + status = uf->status; + mutex_unlock(&rtas_update_flash_mutex); - return simple_read_from_buffer(buf, count, ppos, msg, strlen(msg)); + /* Read as text message */ + len = get_flash_status_msg(status, msg); + return simple_read_from_buffer(buf, count, ppos, msg, len); } -/* constructor for flash_block_cache */ -void rtas_block_ctor(void *ptr) +static ssize_t rtas_flash_read_num(struct file *file, char __user *buf, + size_t count, loff_t *ppos) { - memset(ptr, 0, RTAS_BLK_SIZE); + struct rtas_update_flash_t *const uf = &rtas_update_flash_data; + char msg[RTAS_MSG_MAXLEN]; + int status; + + mutex_lock(&rtas_update_flash_mutex); + status = uf->status; + mutex_unlock(&rtas_update_flash_mutex); + + /* Read as number */ + sprintf(msg, "%d\n", status); + return simple_read_from_buffer(buf, count, ppos, msg, strlen(msg)); } /* We could be much more efficient here. But to keep this function @@ -282,25 +314,24 @@ void rtas_block_ctor(void *ptr) static ssize_t rtas_flash_write(struct file *file, const char __user *buffer, size_t count, loff_t *off) { - struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode); - struct rtas_update_flash_t *uf; + struct rtas_update_flash_t *const uf = &rtas_update_flash_data; char *p; - int next_free; + int next_free, rc; struct flash_block_list *fl; - uf = (struct rtas_update_flash_t *) dp->data; + mutex_lock(&rtas_update_flash_mutex); if (uf->status == FLASH_AUTH || count == 0) - return count; /* discard data */ + goto out; /* discard data */ /* In the case that the image is not ready for flashing, the memory * allocated for the block list will be freed upon the release of the * proc file */ if (uf->flist == NULL) { - uf->flist = kmem_cache_alloc(flash_block_cache, GFP_KERNEL); + uf->flist = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL); if (!uf->flist) - return -ENOMEM; + goto nomem; } fl = uf->flist; @@ -309,63 +340,48 @@ static ssize_t rtas_flash_write(struct file *file, const char __user *buffer, next_free = fl->num_blocks; if (next_free == FLASH_BLOCKS_PER_NODE) { /* Need to allocate another block_list */ - fl->next = kmem_cache_alloc(flash_block_cache, GFP_KERNEL); + fl->next = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL); if (!fl->next) - return -ENOMEM; + goto nomem; fl = fl->next; next_free = 0; } if (count > RTAS_BLK_SIZE) count = RTAS_BLK_SIZE; - p = kmem_cache_alloc(flash_block_cache, GFP_KERNEL); + p = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL); if (!p) - return -ENOMEM; + goto nomem; if(copy_from_user(p, buffer, count)) { kmem_cache_free(flash_block_cache, p); - return -EFAULT; + rc = -EFAULT; + goto error; } fl->blocks[next_free].data = p; fl->blocks[next_free].length = count; fl->num_blocks++; - +out: + mutex_unlock(&rtas_update_flash_mutex); return count; -} - -static int rtas_excl_open(struct inode *inode, struct file *file) -{ - struct proc_dir_entry *dp = PDE(inode); - - /* Enforce exclusive open with use count of PDE */ - spin_lock(&flash_file_open_lock); - if (atomic_read(&dp->count) > 2) { - spin_unlock(&flash_file_open_lock); - return -EBUSY; - } - - atomic_inc(&dp->count); - spin_unlock(&flash_file_open_lock); - - return 0; -} -static int rtas_excl_release(struct inode *inode, struct file *file) -{ - struct proc_dir_entry *dp = PDE(inode); - - atomic_dec(&dp->count); - - return 0; +nomem: + rc = -ENOMEM; +error: + mutex_unlock(&rtas_update_flash_mutex); + return rc; } -static void manage_flash(struct rtas_manage_flash_t *args_buf) +/* + * Flash management routines. + */ +static void manage_flash(struct rtas_manage_flash_t *args_buf, unsigned int op) { s32 rc; do { - rc = rtas_call(rtas_token("ibm,manage-flash-image"), 1, - 1, NULL, args_buf->op); + rc = rtas_call(rtas_token("ibm,manage-flash-image"), 1, 1, + NULL, op); } while (rtas_busy_delay(rc)); args_buf->status = rc; @@ -374,55 +390,62 @@ static void manage_flash(struct rtas_manage_flash_t *args_buf) static ssize_t manage_flash_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode); - struct rtas_manage_flash_t *args_buf; + struct rtas_manage_flash_t *const args_buf = &rtas_manage_flash_data; char msg[RTAS_MSG_MAXLEN]; - int msglen; - - args_buf = dp->data; - if (args_buf == NULL) - return 0; + int msglen, status; - msglen = sprintf(msg, "%d\n", args_buf->status); + mutex_lock(&rtas_manage_flash_mutex); + status = args_buf->status; + mutex_unlock(&rtas_manage_flash_mutex); + msglen = sprintf(msg, "%d\n", status); return simple_read_from_buffer(buf, count, ppos, msg, msglen); } static ssize_t manage_flash_write(struct file *file, const char __user *buf, size_t count, loff_t *off) { - struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode); - struct rtas_manage_flash_t *args_buf; - const char reject_str[] = "0"; - const char commit_str[] = "1"; + struct rtas_manage_flash_t *const args_buf = &rtas_manage_flash_data; + static const char reject_str[] = "0"; + static const char commit_str[] = "1"; char stkbuf[10]; - int op; + int op, rc; + + mutex_lock(&rtas_manage_flash_mutex); - args_buf = (struct rtas_manage_flash_t *) dp->data; if ((args_buf->status == MANAGE_AUTH) || (count == 0)) - return count; + goto out; op = -1; if (buf) { if (count > 9) count = 9; - if (copy_from_user (stkbuf, buf, count)) { - return -EFAULT; - } + rc = -EFAULT; + if (copy_from_user (stkbuf, buf, count)) + goto error; if (strncmp(stkbuf, reject_str, strlen(reject_str)) == 0) op = RTAS_REJECT_TMP_IMG; else if (strncmp(stkbuf, commit_str, strlen(commit_str)) == 0) op = RTAS_COMMIT_TMP_IMG; } - if (op == -1) /* buf is empty, or contains invalid string */ - return -EINVAL; - - args_buf->op = op; - manage_flash(args_buf); + if (op == -1) { /* buf is empty, or contains invalid string */ + rc = -EINVAL; + goto error; + } + manage_flash(args_buf, op); +out: + mutex_unlock(&rtas_manage_flash_mutex); return count; + +error: + mutex_unlock(&rtas_manage_flash_mutex); + return rc; } +/* + * Validation routines. + */ static void validate_flash(struct rtas_validate_flash_t *args_buf) { int token = rtas_token("ibm,validate-flash-image"); @@ -444,7 +467,7 @@ static void validate_flash(struct rtas_validate_flash_t *args_buf) } static int get_validate_flash_msg(struct rtas_validate_flash_t *args_buf, - char *msg) + char *msg, int msglen) { int n; @@ -452,7 +475,8 @@ static int get_validate_flash_msg(struct rtas_validate_flash_t *args_buf, n = sprintf(msg, "%d\n", args_buf->update_results); if ((args_buf->update_results >= VALIDATE_CUR_UNKNOWN) || (args_buf->update_results == VALIDATE_TMP_UPDATE)) - n += sprintf(msg + n, "%s\n", args_buf->buf); + n += snprintf(msg + n, msglen - n, "%s\n", + args_buf->buf); } else { n = sprintf(msg, "%d\n", args_buf->status); } @@ -462,14 +486,14 @@ static int get_validate_flash_msg(struct rtas_validate_flash_t *args_buf, static ssize_t validate_flash_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode); - struct rtas_validate_flash_t *args_buf; - char msg[RTAS_MSG_MAXLEN]; + struct rtas_validate_flash_t *const args_buf = + &rtas_validate_flash_data; + char msg[VALIDATE_MSG_LEN]; int msglen; - args_buf = dp->data; - - msglen = get_validate_flash_msg(args_buf, msg); + mutex_lock(&rtas_validate_flash_mutex); + msglen = get_validate_flash_msg(args_buf, msg, VALIDATE_MSG_LEN); + mutex_unlock(&rtas_validate_flash_mutex); return simple_read_from_buffer(buf, count, ppos, msg, msglen); } @@ -477,24 +501,18 @@ static ssize_t validate_flash_read(struct file *file, char __user *buf, static ssize_t validate_flash_write(struct file *file, const char __user *buf, size_t count, loff_t *off) { - struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode); - struct rtas_validate_flash_t *args_buf; + struct rtas_validate_flash_t *const args_buf = + &rtas_validate_flash_data; int rc; - args_buf = (struct rtas_validate_flash_t *) dp->data; - - if (dp->data == NULL) { - dp->data = kmalloc(sizeof(struct rtas_validate_flash_t), - GFP_KERNEL); - if (dp->data == NULL) - return -ENOMEM; - } + mutex_lock(&rtas_validate_flash_mutex); /* We are only interested in the first 4K of the * candidate image */ if ((*off >= VALIDATE_BUF_SIZE) || (args_buf->status == VALIDATE_AUTH)) { *off += count; + mutex_unlock(&rtas_validate_flash_mutex); return count; } @@ -517,31 +535,29 @@ static ssize_t validate_flash_write(struct file *file, const char __user *buf, *off += count; rc = count; done: - if (rc < 0) { - kfree(dp->data); - dp->data = NULL; - } + mutex_unlock(&rtas_validate_flash_mutex); return rc; } static int validate_flash_release(struct inode *inode, struct file *file) { - struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode); - struct rtas_validate_flash_t *args_buf; + struct rtas_validate_flash_t *const args_buf = + &rtas_validate_flash_data; - args_buf = (struct rtas_validate_flash_t *) dp->data; + mutex_lock(&rtas_validate_flash_mutex); if (args_buf->status == VALIDATE_READY) { args_buf->buf_size = VALIDATE_BUF_SIZE; validate_flash(args_buf); } - /* The matching atomic_inc was in rtas_excl_open() */ - atomic_dec(&dp->count); - + mutex_unlock(&rtas_validate_flash_mutex); return 0; } +/* + * On-reboot flash update applicator. + */ static void rtas_flash_firmware(int reboot_type) { unsigned long image_size; @@ -595,17 +611,19 @@ static void rtas_flash_firmware(int reboot_type) for (f = flist; f; f = next) { /* Translate data addrs to absolute */ for (i = 0; i < f->num_blocks; i++) { - f->blocks[i].data = (char *)__pa(f->blocks[i].data); + f->blocks[i].data = (char *)cpu_to_be64(__pa(f->blocks[i].data)); image_size += f->blocks[i].length; + f->blocks[i].length = cpu_to_be64(f->blocks[i].length); } next = f->next; /* Don't translate NULL pointer for last entry */ if (f->next) - f->next = (struct flash_block_list *)__pa(f->next); + f->next = (struct flash_block_list *)cpu_to_be64(__pa(f->next)); else f->next = NULL; /* make num_blocks into the version/length field */ f->num_blocks = (FLASH_BLOCK_LIST_VERSION << 56) | ((f->num_blocks+1)*16); + f->num_blocks = cpu_to_be64(f->num_blocks); } printk(KERN_ALERT "FLASH: flash image is %ld bytes\n", image_size); @@ -634,169 +652,128 @@ static void rtas_flash_firmware(int reboot_type) spin_unlock(&rtas_data_buf_lock); } -static void remove_flash_pde(struct proc_dir_entry *dp) -{ - if (dp) { - kfree(dp->data); - remove_proc_entry(dp->name, dp->parent); - } -} - -static int initialize_flash_pde_data(const char *rtas_call_name, - size_t buf_size, - struct proc_dir_entry *dp) -{ +/* + * Manifest of proc files to create + */ +struct rtas_flash_file { + const char *filename; + const char *rtas_call_name; int *status; - int token; - - dp->data = kzalloc(buf_size, GFP_KERNEL); - if (dp->data == NULL) - return -ENOMEM; - - /* - * This code assumes that the status int is the first member of the - * struct - */ - status = (int *) dp->data; - token = rtas_token(rtas_call_name); - if (token == RTAS_UNKNOWN_SERVICE) - *status = FLASH_AUTH; - else - *status = FLASH_NO_OP; - - return 0; -} - -static struct proc_dir_entry *create_flash_pde(const char *filename, - const struct file_operations *fops) -{ - return proc_create(filename, S_IRUSR | S_IWUSR, NULL, fops); -} - -static const struct file_operations rtas_flash_operations = { - .owner = THIS_MODULE, - .read = rtas_flash_read, - .write = rtas_flash_write, - .open = rtas_excl_open, - .release = rtas_flash_release, - .llseek = default_llseek, -}; - -static const struct file_operations manage_flash_operations = { - .owner = THIS_MODULE, - .read = manage_flash_read, - .write = manage_flash_write, - .open = rtas_excl_open, - .release = rtas_excl_release, - .llseek = default_llseek, + const struct file_operations fops; }; -static const struct file_operations validate_flash_operations = { - .owner = THIS_MODULE, - .read = validate_flash_read, - .write = validate_flash_write, - .open = rtas_excl_open, - .release = validate_flash_release, - .llseek = default_llseek, +static const struct rtas_flash_file rtas_flash_files[] = { + { + .filename = "powerpc/rtas/" FIRMWARE_FLASH_NAME, + .rtas_call_name = "ibm,update-flash-64-and-reboot", + .status = &rtas_update_flash_data.status, + .fops.read = rtas_flash_read_msg, + .fops.write = rtas_flash_write, + .fops.release = rtas_flash_release, + .fops.llseek = default_llseek, + }, + { + .filename = "powerpc/rtas/" FIRMWARE_UPDATE_NAME, + .rtas_call_name = "ibm,update-flash-64-and-reboot", + .status = &rtas_update_flash_data.status, + .fops.read = rtas_flash_read_num, + .fops.write = rtas_flash_write, + .fops.release = rtas_flash_release, + .fops.llseek = default_llseek, + }, + { + .filename = "powerpc/rtas/" VALIDATE_FLASH_NAME, + .rtas_call_name = "ibm,validate-flash-image", + .status = &rtas_validate_flash_data.status, + .fops.read = validate_flash_read, + .fops.write = validate_flash_write, + .fops.release = validate_flash_release, + .fops.llseek = default_llseek, + }, + { + .filename = "powerpc/rtas/" MANAGE_FLASH_NAME, + .rtas_call_name = "ibm,manage-flash-image", + .status = &rtas_manage_flash_data.status, + .fops.read = manage_flash_read, + .fops.write = manage_flash_write, + .fops.llseek = default_llseek, + } }; static int __init rtas_flash_init(void) { - int rc; + int i; if (rtas_token("ibm,update-flash-64-and-reboot") == RTAS_UNKNOWN_SERVICE) { pr_info("rtas_flash: no firmware flash support\n"); - return 1; + return -EINVAL; } - firmware_flash_pde = create_flash_pde("powerpc/rtas/" - FIRMWARE_FLASH_NAME, - &rtas_flash_operations); - if (firmware_flash_pde == NULL) { - rc = -ENOMEM; - goto cleanup; - } + rtas_validate_flash_data.buf = kzalloc(VALIDATE_BUF_SIZE, GFP_KERNEL); + if (!rtas_validate_flash_data.buf) + return -ENOMEM; - rc = initialize_flash_pde_data("ibm,update-flash-64-and-reboot", - sizeof(struct rtas_update_flash_t), - firmware_flash_pde); - if (rc != 0) - goto cleanup; - - firmware_update_pde = create_flash_pde("powerpc/rtas/" - FIRMWARE_UPDATE_NAME, - &rtas_flash_operations); - if (firmware_update_pde == NULL) { - rc = -ENOMEM; - goto cleanup; + flash_block_cache = kmem_cache_create("rtas_flash_cache", + RTAS_BLK_SIZE, RTAS_BLK_SIZE, 0, + NULL); + if (!flash_block_cache) { + printk(KERN_ERR "%s: failed to create block cache\n", + __func__); + goto enomem_buf; } - rc = initialize_flash_pde_data("ibm,update-flash-64-and-reboot", - sizeof(struct rtas_update_flash_t), - firmware_update_pde); - if (rc != 0) - goto cleanup; - - validate_pde = create_flash_pde("powerpc/rtas/" VALIDATE_FLASH_NAME, - &validate_flash_operations); - if (validate_pde == NULL) { - rc = -ENOMEM; - goto cleanup; - } + for (i = 0; i < ARRAY_SIZE(rtas_flash_files); i++) { + const struct rtas_flash_file *f = &rtas_flash_files[i]; + int token; - rc = initialize_flash_pde_data("ibm,validate-flash-image", - sizeof(struct rtas_validate_flash_t), - validate_pde); - if (rc != 0) - goto cleanup; - - manage_pde = create_flash_pde("powerpc/rtas/" MANAGE_FLASH_NAME, - &manage_flash_operations); - if (manage_pde == NULL) { - rc = -ENOMEM; - goto cleanup; - } + if (!proc_create(f->filename, S_IRUSR | S_IWUSR, NULL, &f->fops)) + goto enomem; - rc = initialize_flash_pde_data("ibm,manage-flash-image", - sizeof(struct rtas_manage_flash_t), - manage_pde); - if (rc != 0) - goto cleanup; + /* + * This code assumes that the status int is the first member of the + * struct + */ + token = rtas_token(f->rtas_call_name); + if (token == RTAS_UNKNOWN_SERVICE) + *f->status = FLASH_AUTH; + else + *f->status = FLASH_NO_OP; + } rtas_flash_term_hook = rtas_flash_firmware; - - flash_block_cache = kmem_cache_create("rtas_flash_cache", - RTAS_BLK_SIZE, RTAS_BLK_SIZE, 0, - rtas_block_ctor); - if (!flash_block_cache) { - printk(KERN_ERR "%s: failed to create block cache\n", - __func__); - rc = -ENOMEM; - goto cleanup; - } return 0; -cleanup: - remove_flash_pde(firmware_flash_pde); - remove_flash_pde(firmware_update_pde); - remove_flash_pde(validate_pde); - remove_flash_pde(manage_pde); +enomem: + while (--i >= 0) { + const struct rtas_flash_file *f = &rtas_flash_files[i]; + remove_proc_entry(f->filename, NULL); + } - return rc; + kmem_cache_destroy(flash_block_cache); +enomem_buf: + kfree(rtas_validate_flash_data.buf); + return -ENOMEM; } static void __exit rtas_flash_cleanup(void) { + int i; + rtas_flash_term_hook = NULL; - if (flash_block_cache) - kmem_cache_destroy(flash_block_cache); + if (rtas_firmware_flash_list) { + free_flash_list(rtas_firmware_flash_list); + rtas_firmware_flash_list = NULL; + } + + for (i = 0; i < ARRAY_SIZE(rtas_flash_files); i++) { + const struct rtas_flash_file *f = &rtas_flash_files[i]; + remove_proc_entry(f->filename, NULL); + } - remove_flash_pde(firmware_flash_pde); - remove_flash_pde(firmware_update_pde); - remove_flash_pde(validate_pde); - remove_flash_pde(manage_pde); + kmem_cache_destroy(flash_block_cache); + kfree(rtas_validate_flash_data.buf); } module_init(rtas_flash_init); diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index 71cb20d6ec6..c168337aef9 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -80,10 +80,6 @@ int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val) if (ret) return PCIBIOS_DEVICE_NOT_FOUND; - if (returnval == EEH_IO_ERROR_VALUE(size) && - eeh_dev_check_failure(of_node_to_eeh_dev(pdn->node))) - return PCIBIOS_DEVICE_NOT_FOUND; - return PCIBIOS_SUCCESSFUL; } @@ -92,18 +88,39 @@ static int rtas_pci_read_config(struct pci_bus *bus, int where, int size, u32 *val) { struct device_node *busdn, *dn; - - busdn = pci_bus_to_OF_node(bus); + struct pci_dn *pdn; + bool found = false; +#ifdef CONFIG_EEH + struct eeh_dev *edev; +#endif + int ret; /* Search only direct children of the bus */ + *val = 0xFFFFFFFF; + busdn = pci_bus_to_OF_node(bus); for (dn = busdn->child; dn; dn = dn->sibling) { - struct pci_dn *pdn = PCI_DN(dn); + pdn = PCI_DN(dn); if (pdn && pdn->devfn == devfn - && of_device_is_available(dn)) - return rtas_read_config(pdn, where, size, val); + && of_device_is_available(dn)) { + found = true; + break; + } } - return PCIBIOS_DEVICE_NOT_FOUND; + if (!found) + return PCIBIOS_DEVICE_NOT_FOUND; +#ifdef CONFIG_EEH + edev = of_node_to_eeh_dev(dn); + if (edev && edev->pe && edev->pe->state & EEH_PE_RESET) + return PCIBIOS_DEVICE_NOT_FOUND; +#endif + + ret = rtas_read_config(pdn, where, size, val); + if (*val == EEH_IO_ERROR_VALUE(size) && + eeh_dev_check_failure(of_node_to_eeh_dev(dn))) + return PCIBIOS_DEVICE_NOT_FOUND; + + return ret; } int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val) @@ -136,17 +153,34 @@ static int rtas_pci_write_config(struct pci_bus *bus, int where, int size, u32 val) { struct device_node *busdn, *dn; - - busdn = pci_bus_to_OF_node(bus); + struct pci_dn *pdn; + bool found = false; +#ifdef CONFIG_EEH + struct eeh_dev *edev; +#endif + int ret; /* Search only direct children of the bus */ + busdn = pci_bus_to_OF_node(bus); for (dn = busdn->child; dn; dn = dn->sibling) { - struct pci_dn *pdn = PCI_DN(dn); + pdn = PCI_DN(dn); if (pdn && pdn->devfn == devfn - && of_device_is_available(dn)) - return rtas_write_config(pdn, where, size, val); + && of_device_is_available(dn)) { + found = true; + break; + } } - return PCIBIOS_DEVICE_NOT_FOUND; + + if (!found) + return PCIBIOS_DEVICE_NOT_FOUND; +#ifdef CONFIG_EEH + edev = of_node_to_eeh_dev(dn); + if (edev && edev->pe && (edev->pe->state & EEH_PE_RESET)) + return PCIBIOS_DEVICE_NOT_FOUND; +#endif + ret = rtas_write_config(pdn, where, size, val); + + return ret; } static struct pci_ops rtas_pci_ops = { @@ -201,7 +235,7 @@ static void python_countermeasures(struct device_node *dev) iounmap(chip_regs); } -void __init init_pci_config_tokens (void) +void __init init_pci_config_tokens(void) { read_pci_config = rtas_token("read-pci-config"); write_pci_config = rtas_token("write-pci-config"); @@ -209,7 +243,7 @@ void __init init_pci_config_tokens (void) ibm_write_pci_config = rtas_token("ibm,write-pci-config"); } -unsigned long get_phb_buid (struct device_node *phb) +unsigned long get_phb_buid(struct device_node *phb) { struct resource r; @@ -223,7 +257,7 @@ unsigned long get_phb_buid (struct device_node *phb) static int phb_set_bus_ranges(struct device_node *dev, struct pci_controller *phb) { - const int *bus_range; + const __be32 *bus_range; unsigned int len; bus_range = of_get_property(dev, "bus-range", &len); @@ -231,8 +265,8 @@ static int phb_set_bus_ranges(struct device_node *dev, return 1; } - phb->first_busno = bus_range[0]; - phb->last_busno = bus_range[1]; + phb->first_busno = be32_to_cpu(bus_range[0]); + phb->last_busno = be32_to_cpu(bus_range[1]); return 0; } diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 1045ff49cc6..e736387fee6 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -29,6 +29,7 @@ #include <asm/nvram.h> #include <linux/atomic.h> #include <asm/machdep.h> +#include <asm/topology.h> static DEFINE_SPINLOCK(rtasd_log_lock); @@ -87,6 +88,8 @@ static char *rtas_event_type(int type) return "Resource Deallocation Event"; case RTAS_TYPE_DUMP: return "Dump Notification Event"; + case RTAS_TYPE_PRRN: + return "Platform Resource Reassignment Event"; } return rtas_type[0]; @@ -147,8 +150,8 @@ static void printk_log_rtas(char *buf, int len) struct rtas_error_log *errlog = (struct rtas_error_log *)buf; printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n", - error_log_cnt, rtas_event_type(errlog->type), - errlog->severity); + error_log_cnt, rtas_event_type(rtas_error_type(errlog)), + rtas_error_severity(errlog)); } } @@ -156,14 +159,16 @@ static int log_rtas_len(char * buf) { int len; struct rtas_error_log *err; + uint32_t extended_log_length; /* rtas fixed header */ len = 8; err = (struct rtas_error_log *)buf; - if (err->extended && err->extended_log_length) { + extended_log_length = rtas_error_extended_log_length(err); + if (rtas_error_extended(err) && extended_log_length) { /* extended header */ - len += err->extended_log_length; + len += extended_log_length; } if (rtas_error_log_max == 0) @@ -265,9 +270,49 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) spin_unlock_irqrestore(&rtasd_log_lock, s); return; } +} + +#ifdef CONFIG_PPC_PSERIES +static s32 prrn_update_scope; +static void prrn_work_fn(struct work_struct *work) +{ + /* + * For PRRN, we must pass the negative of the scope value in + * the RTAS event. + */ + pseries_devicetree_update(-prrn_update_scope); } +static DECLARE_WORK(prrn_work, prrn_work_fn); + +void prrn_schedule_update(u32 scope) +{ + flush_work(&prrn_work); + prrn_update_scope = scope; + schedule_work(&prrn_work); +} + +static void handle_rtas_event(const struct rtas_error_log *log) +{ + if (rtas_error_type(log) != RTAS_TYPE_PRRN || !prrn_is_enabled()) + return; + + /* For PRRN Events the extended log length is used to denote + * the scope for calling rtas update-nodes. + */ + prrn_schedule_update(rtas_error_extended_log_length(log)); +} + +#else + +static void handle_rtas_event(const struct rtas_error_log *log) +{ + return; +} + +#endif + static int rtas_log_open(struct inode * inode, struct file * file) { return 0; @@ -388,8 +433,10 @@ static void do_event_scan(void) break; } - if (error == 0) + if (error == 0) { pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0); + handle_rtas_event((struct rtas_error_log *)logdata); + } } while(error == 0); } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index bdc499c1787..e5b022c55cc 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -62,8 +62,6 @@ #include <mm/mmu_decl.h> #include <asm/fadump.h> -#include "setup.h" - #ifdef DEBUG #include <asm/udbg.h> #define DBG(fmt...) udbg_printf(fmt) @@ -78,6 +76,9 @@ EXPORT_SYMBOL(ppc_md); struct machdep_calls *machine_id; EXPORT_SYMBOL(machine_id); +int boot_cpuid = -1; +EXPORT_SYMBOL_GPL(boot_cpuid); + unsigned long klimit = (unsigned long) _end; char cmd_line[COMMAND_LINE_SIZE]; @@ -211,6 +212,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) { unsigned long cpu_id = (unsigned long)v - 1; unsigned int pvr; + unsigned long proc_freq; unsigned short maj; unsigned short min; @@ -262,12 +264,19 @@ static int show_cpuinfo(struct seq_file *m, void *v) #endif /* CONFIG_TAU */ /* - * Assume here that all clock rates are the same in a - * smp system. -- Cort + * Platforms that have variable clock rates, should implement + * the method ppc_md.get_proc_freq() that reports the clock + * rate of a given cpu. The rest can use ppc_proc_freq to + * report the clock rate that is same across all cpus. */ - if (ppc_proc_freq) + if (ppc_md.get_proc_freq) + proc_freq = ppc_md.get_proc_freq(cpu_id); + else + proc_freq = ppc_proc_freq; + + if (proc_freq) seq_printf(m, "clock\t\t: %lu.%06luMHz\n", - ppc_proc_freq / 1000000, ppc_proc_freq % 1000000); + proc_freq / 1000000, proc_freq % 1000000); if (ppc_md.show_percpuinfo != NULL) ppc_md.show_percpuinfo(m, cpu_id); @@ -381,9 +390,10 @@ void __init check_for_initrd(void) #ifdef CONFIG_SMP -int threads_per_core, threads_shift; +int threads_per_core, threads_per_subcore, threads_shift; cpumask_t threads_core_mask; EXPORT_SYMBOL_GPL(threads_per_core); +EXPORT_SYMBOL_GPL(threads_per_subcore); EXPORT_SYMBOL_GPL(threads_shift); EXPORT_SYMBOL_GPL(threads_core_mask); @@ -392,6 +402,7 @@ static void __init cpu_init_thread_core_maps(int tpc) int i; threads_per_core = tpc; + threads_per_subcore = tpc; cpumask_clear(&threads_core_mask); /* This implementation only supports power of 2 number of threads @@ -436,7 +447,8 @@ void __init smp_setup_cpu_maps(void) DBG("smp_setup_cpu_maps()\n"); while ((dn = of_find_node_by_type(dn, "cpu")) && cpu < nr_cpu_ids) { - const int *intserv; + const __be32 *intserv; + __be32 cpu_be; int j, len; DBG(" * %s...\n", dn->full_name); @@ -450,15 +462,25 @@ void __init smp_setup_cpu_maps(void) } else { DBG(" no ibm,ppc-interrupt-server#s -> 1 thread\n"); intserv = of_get_property(dn, "reg", NULL); - if (!intserv) - intserv = &cpu; /* assume logical == phys */ + if (!intserv) { + cpu_be = cpu_to_be32(cpu); + intserv = &cpu_be; /* assume logical == phys */ + } } for (j = 0; j < nthreads && cpu < nr_cpu_ids; j++) { + bool avail; + DBG(" thread %d -> cpu %d (hard id %d)\n", - j, cpu, intserv[j]); - set_cpu_present(cpu, true); - set_hard_smp_processor_id(cpu, intserv[j]); + j, cpu, be32_to_cpu(intserv[j])); + + avail = of_device_is_available(dn); + if (!avail) + avail = !of_property_match_string(dn, + "enable-method", "spin-table"); + + set_cpu_present(cpu, avail); + set_hard_smp_processor_id(cpu, be32_to_cpu(intserv[j])); set_cpu_possible(cpu, true); cpu++; } @@ -478,7 +500,7 @@ void __init smp_setup_cpu_maps(void) if (machine_is(pseries) && firmware_has_feature(FW_FEATURE_LPAR) && (dn = of_find_node_by_path("/rtas"))) { int num_addr_cell, num_size_cell, maxcpus; - const unsigned int *ireg; + const __be32 *ireg; num_addr_cell = of_n_addr_cells(dn); num_size_cell = of_n_size_cells(dn); @@ -488,7 +510,7 @@ void __init smp_setup_cpu_maps(void) if (!ireg) goto out; - maxcpus = ireg[num_addr_cell + num_size_cell]; + maxcpus = be32_to_cpup(ireg + num_addr_cell + num_size_cell); /* Double maxcpus for processors which have SMT capability */ if (cpu_has_feature(CPU_FTR_SMT)) @@ -621,12 +643,6 @@ int check_legacy_ioport(unsigned long base_port) case FDC_BASE: /* FDC1 */ np = of_find_node_by_type(NULL, "fdc"); break; -#ifdef CONFIG_PPC_PREP - case _PIDXR: - case _PNPWRP: - case PNPBIOS_BASE: - /* implement me */ -#endif default: /* ipmi is supposed to fail here */ break; @@ -720,33 +736,6 @@ static int powerpc_debugfs_init(void) arch_initcall(powerpc_debugfs_init); #endif -#ifdef CONFIG_BOOKE_WDT -extern u32 booke_wdt_enabled; -extern u32 booke_wdt_period; - -/* Checks wdt=x and wdt_period=xx command-line option */ -notrace int __init early_parse_wdt(char *p) -{ - if (p && strncmp(p, "0", 1) != 0) - booke_wdt_enabled = 1; - - return 0; -} -early_param("wdt", early_parse_wdt); - -int __init early_parse_wdt_period(char *p) -{ - unsigned long ret; - if (p) { - if (!kstrtol(p, 0, &ret)) - booke_wdt_period = ret; - } - - return 0; -} -early_param("wdt_period", early_parse_wdt_period); -#endif /* CONFIG_BOOKE_WDT */ - void ppc_printk_progress(char *s, unsigned short hex) { pr_info("%s\n", s); diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h deleted file mode 100644 index 4c67ad7fae0..00000000000 --- a/arch/powerpc/kernel/setup.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _POWERPC_KERNEL_SETUP_H -#define _POWERPC_KERNEL_SETUP_H - -void check_for_initrd(void); -void do_init_bootmem(void); -void setup_panic(void); -extern int do_early_xmon; - -#endif /* _POWERPC_KERNEL_SETUP_H */ diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index a8f54ecb091..ea4fda60e57 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -38,15 +38,12 @@ #include <asm/serial.h> #include <asm/udbg.h> #include <asm/mmu_context.h> - -#include "setup.h" +#include <asm/epapr_hcalls.h> #define DBG(fmt...) extern void bootx_init(unsigned long r4, unsigned long phys); -int boot_cpuid = -1; -EXPORT_SYMBOL_GPL(boot_cpuid); int boot_cpuid_phys; EXPORT_SYMBOL_GPL(boot_cpuid_phys); @@ -128,6 +125,8 @@ notrace void __init machine_init(u64 dt_ptr) /* Do some early initialization based on the flat device tree */ early_init_devtree(__va(dt_ptr)); + epapr_paravirt_early_init(); + early_init_mmu(); probe_machine(); @@ -246,7 +245,12 @@ static void __init exc_lvl_early_init(void) /* interrupt stacks must be in lowmem, we get that for free on ppc32 * as the memblock is limited to lowmem by MEMBLOCK_REAL_LIMIT */ for_each_possible_cpu(i) { +#ifdef CONFIG_SMP hw_cpu = get_hard_smp_processor_id(i); +#else + hw_cpu = 0; +#endif + critirq_ctx[hw_cpu] = (struct thread_info *) __va(memblock_alloc(THREAD_SIZE, THREAD_SIZE)); #ifdef CONFIG_BOOKE @@ -295,9 +299,6 @@ void __init setup_arch(char **cmdline_p) if (cpu_has_feature(CPU_FTR_UNIFIED_ID_CACHE)) ucache_bsize = icache_bsize = dcache_bsize; - /* reboot on panic */ - panic_timeout = 180; - if (ppc_md.panic) setup_panic(); @@ -326,5 +327,4 @@ void __init setup_arch(char **cmdline_p) /* Initialize the MMU context management stuff */ mmu_context_init(); - } diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6da881b35da..ee082d77117 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -10,7 +10,7 @@ * 2 of the License, or (at your option) any later version. */ -#undef DEBUG +#define DEBUG #include <linux/export.h> #include <linux/string.h> @@ -36,6 +36,7 @@ #include <linux/lockdep.h> #include <linux/memblock.h> #include <linux/hugetlb.h> +#include <linux/memory.h> #include <asm/io.h> #include <asm/kdump.h> @@ -66,8 +67,7 @@ #include <asm/code-patching.h> #include <asm/kvm_ppc.h> #include <asm/hugetlb.h> - -#include "setup.h" +#include <asm/epapr_hcalls.h> #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) @@ -75,8 +75,7 @@ #define DBG(fmt...) #endif -int boot_cpuid = 0; -int __initdata spinning_secondaries; +int spinning_secondaries; u64 ppc64_pft_size; /* Pick defaults since we might want to patch instructions @@ -98,6 +97,38 @@ int dcache_bsize; int icache_bsize; int ucache_bsize; +#if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP) +static void setup_tlb_core_data(void) +{ + int cpu; + + BUILD_BUG_ON(offsetof(struct tlb_core_data, lock) != 0); + + for_each_possible_cpu(cpu) { + int first = cpu_first_thread_sibling(cpu); + + paca[cpu].tcd_ptr = &paca[first].tcd; + + /* + * If we have threads, we need either tlbsrx. + * or e6500 tablewalk mode, or else TLB handlers + * will be racy and could produce duplicate entries. + */ + if (smt_enabled_at_boot >= 2 && + !mmu_has_feature(MMU_FTR_USE_TLBRSRV) && + book3e_htw_mode != PPC_HTW_E6500) { + /* Should we panic instead? */ + WARN_ONCE("%s: unsupported MMU configuration -- expect problems\n", + __func__); + } + } +} +#else +static void setup_tlb_core_data(void) +{ +} +#endif + #ifdef CONFIG_SMP static char *smt_enabled_cmdline; @@ -156,6 +187,28 @@ early_param("smt-enabled", early_smt_enabled); #define check_smt_enabled() #endif /* CONFIG_SMP */ +/** Fix up paca fields required for the boot cpu */ +static void fixup_boot_paca(void) +{ + /* The boot cpu is started */ + get_paca()->cpu_start = 1; + /* Allow percpu accesses to work until we setup percpu data */ + get_paca()->data_offset = 0; +} + +static void cpu_ready_for_interrupts(void) +{ + /* Set IR and DR in PACA MSR */ + get_paca()->kernel_msr = MSR_KERNEL; + + /* Enable AIL if supported */ + if (cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_ARCH_207S)) { + unsigned long lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3); + } +} + /* * Early initialization entry point. This is called by head.S * with MMU translation disabled. We rely on the "feature" of @@ -177,6 +230,8 @@ early_param("smt-enabled", early_smt_enabled); void __init early_setup(unsigned long dt_ptr) { + static __initdata struct paca_struct boot_paca; + /* -------- printk is _NOT_ safe to use here ! ------- */ /* Identify CPU type */ @@ -185,6 +240,7 @@ void __init early_setup(unsigned long dt_ptr) /* Assume we're on cpu 0 for now. Don't write to the paca yet! */ initialise_paca(&boot_paca, 0); setup_paca(&boot_paca); + fixup_boot_paca(); /* Initialize lockdep early or else spinlocks will blow */ lockdep_init(); @@ -203,13 +259,11 @@ void __init early_setup(unsigned long dt_ptr) */ early_init_devtree(__va(dt_ptr)); + epapr_paravirt_early_init(); + /* Now we know the logical id of our boot cpu, setup the paca. */ setup_paca(&paca[boot_cpuid]); - - /* Fix up paca fields required for the boot cpu */ - get_paca()->cpu_start = 1; - /* Allow percpu accesses to "work" until we setup percpu data */ - get_paca()->data_offset = 0; + fixup_boot_paca(); /* Probe the machine type */ probe_machine(); @@ -222,6 +276,16 @@ void __init early_setup(unsigned long dt_ptr) early_init_mmu(); /* + * At this point, we can let interrupts switch to virtual mode + * (the MMU has been setup), so adjust the MSR in the PACA to + * have IR and DR set and enable AIL if it exists + */ + cpu_ready_for_interrupts(); + + /* Reserve large chunks of memory for use by CMA for KVM */ + kvm_cma_reserve(); + + /* * Reserve any gigantic pages requested on the command line. * memblock needs to have been initialized by the time this is * called since this will reserve memory. @@ -229,6 +293,18 @@ void __init early_setup(unsigned long dt_ptr) reserve_hugetlb_gpages(); DBG(" <- early_setup()\n"); + +#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX + /* + * This needs to be done *last* (after the above DBG() even) + * + * Right after we return from this function, we turn on the MMU + * which means the real-mode access trick that btext does will + * no longer work, it needs to switch to using a real MMU + * mapping. This call will ensure that it does + */ + btext_map(); +#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */ } #ifdef CONFIG_SMP @@ -239,6 +315,13 @@ void early_setup_secondary(void) /* Initialize the hash table or TLB handling */ early_init_mmu_secondary(); + + /* + * At this point, we can let interrupts switch to virtual mode + * (the MMU has been setup), so adjust the MSR in the PACA to + * have IR and DR set. + */ + cpu_ready_for_interrupts(); } #endif /* CONFIG_SMP */ @@ -259,7 +342,7 @@ void smp_release_cpus(void) ptr = (unsigned long *)((unsigned long)&__secondary_hold_spinloop - PHYSICAL_START); - *ptr = __pa(generic_secondary_smp_init); + *ptr = ppc_function_entry(generic_secondary_smp_init); /* And wait a bit for them to catch up */ for (i = 0; i < 100000; i++) { @@ -297,14 +380,14 @@ static void __init initialize_cache_info(void) * d-cache and i-cache sizes... -Peter */ if (num_cpus == 1) { - const u32 *sizep, *lsizep; + const __be32 *sizep, *lsizep; u32 size, lsize; size = 0; lsize = cur_cpu_spec->dcache_bsize; sizep = of_get_property(np, "d-cache-size", NULL); if (sizep != NULL) - size = *sizep; + size = be32_to_cpu(*sizep); lsizep = of_get_property(np, "d-cache-block-size", NULL); /* fallback if block size missing */ @@ -313,8 +396,8 @@ static void __init initialize_cache_info(void) "d-cache-line-size", NULL); if (lsizep != NULL) - lsize = *lsizep; - if (sizep == 0 || lsizep == 0) + lsize = be32_to_cpu(*lsizep); + if (sizep == NULL || lsizep == NULL) DBG("Argh, can't find dcache properties ! " "sizep: %p, lsizep: %p\n", sizep, lsizep); @@ -327,7 +410,7 @@ static void __init initialize_cache_info(void) lsize = cur_cpu_spec->icache_bsize; sizep = of_get_property(np, "i-cache-size", NULL); if (sizep != NULL) - size = *sizep; + size = be32_to_cpu(*sizep); lsizep = of_get_property(np, "i-cache-block-size", NULL); if (lsizep == NULL) @@ -335,8 +418,8 @@ static void __init initialize_cache_info(void) "i-cache-line-size", NULL); if (lsizep != NULL) - lsize = *lsizep; - if (sizep == 0 || lsizep == 0) + lsize = be32_to_cpu(*lsizep); + if (sizep == NULL || lsizep == NULL) DBG("Argh, can't find icache properties ! " "sizep: %p, lsizep: %p\n", sizep, lsizep); @@ -422,6 +505,7 @@ void __init setup_system(void) smp_setup_cpu_maps(); check_smt_enabled(); + setup_tlb_core_data(); #ifdef CONFIG_SMP /* Release secondary cpus out of their spinloops at 0x60 now that @@ -497,23 +581,25 @@ static void __init irqstack_early_init(void) #ifdef CONFIG_PPC_BOOK3E static void __init exc_lvl_early_init(void) { - extern unsigned int interrupt_base_book3e; - extern unsigned int exc_debug_debug_book3e; - unsigned int i; + unsigned long sp; for_each_possible_cpu(i) { - critirq_ctx[i] = (struct thread_info *) - __va(memblock_alloc(THREAD_SIZE, THREAD_SIZE)); - dbgirq_ctx[i] = (struct thread_info *) - __va(memblock_alloc(THREAD_SIZE, THREAD_SIZE)); - mcheckirq_ctx[i] = (struct thread_info *) - __va(memblock_alloc(THREAD_SIZE, THREAD_SIZE)); + sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE); + critirq_ctx[i] = (struct thread_info *)__va(sp); + paca[i].crit_kstack = __va(sp + THREAD_SIZE); + + sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE); + dbgirq_ctx[i] = (struct thread_info *)__va(sp); + paca[i].dbg_kstack = __va(sp + THREAD_SIZE); + + sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE); + mcheckirq_ctx[i] = (struct thread_info *)__va(sp); + paca[i].mc_kstack = __va(sp + THREAD_SIZE); } if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC)) - patch_branch(&interrupt_base_book3e + (0x040 / 4) + 1, - (unsigned long)&exc_debug_debug_book3e, 0); + patch_exception(0x040, exc_debug_debug_book3e); } #else #define exc_lvl_early_init() @@ -521,7 +607,8 @@ static void __init exc_lvl_early_init(void) /* * Stack space used when we detect a bad kernel stack pointer, and - * early in SMP boots before relocation is enabled. + * early in SMP boots before relocation is enabled. Exclusive emergency + * stack for machine checks. */ static void __init emergency_stack_init(void) { @@ -544,6 +631,13 @@ static void __init emergency_stack_init(void) sp = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit); sp += THREAD_SIZE; paca[i].emergency_sp = __va(sp); + +#ifdef CONFIG_PPC_BOOK3S_64 + /* emergency stack for machine check exception handling. */ + sp = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit); + sp += THREAD_SIZE; + paca[i].mc_emergency_sp = __va(sp); +#endif } } @@ -565,9 +659,6 @@ void __init setup_arch(char **cmdline_p) dcache_bsize = ppc64_caches.dline_size; icache_bsize = ppc64_caches.iline_size; - /* reboot on panic */ - panic_timeout = 180; - if (ppc_md.panic) setup_panic(); @@ -575,7 +666,9 @@ void __init setup_arch(char **cmdline_p) init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; - +#ifdef CONFIG_PPC_64K_PAGES + init_mm.context.pte_frag = NULL; +#endif irqstack_early_init(); exc_lvl_early_init(); emergency_stack_init(); @@ -599,8 +692,6 @@ void __init setup_arch(char **cmdline_p) /* Initialize the MMU context management stuff */ mmu_context_init(); - kvm_linear_init(); - /* Interrupt code needs to be 64K-aligned */ if ((unsigned long)_stext & 0xffff) panic("Kernelbase not 64K-aligned (0x%lx)!\n", @@ -690,9 +781,17 @@ void __init setup_per_cpu_areas(void) } #endif +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +unsigned long memory_block_size_bytes(void) +{ + if (ppc_md.memory_block_size) + return ppc_md.memory_block_size(); -#ifdef CONFIG_PPC_INDIRECT_IO + return MIN_MEMORY_BLOCK_SIZE; +} +#endif + +#if defined(CONFIG_PPC_INDIRECT_PIO) || defined(CONFIG_PPC_INDIRECT_MMIO) struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); -#endif /* CONFIG_PPC_INDIRECT_IO */ - +#endif diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index 3b997118df5..1c794cef288 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -13,10 +13,12 @@ #include <linux/signal.h> #include <linux/uprobes.h> #include <linux/key.h> +#include <linux/context_tracking.h> #include <asm/hw_breakpoint.h> #include <asm/uaccess.h> #include <asm/unistd.h> #include <asm/debug.h> +#include <asm/tm.h> #include "signal.h" @@ -24,18 +26,18 @@ * through debug.exception-trace sysctl. */ -int show_unhandled_signals = 0; +int show_unhandled_signals = 1; /* * Allocate space for the signal frame */ -void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, +void __user * get_sigframe(struct k_sigaction *ka, unsigned long sp, size_t frame_size, int is_32) { unsigned long oldsp, newsp; /* Default to using normal stack */ - oldsp = get_clean_sp(regs, is_32); + oldsp = get_clean_sp(sp, is_32); /* Check for alt stack */ if ((ka->sa.sa_flags & SA_ONSTACK) && @@ -130,8 +132,9 @@ static int do_signal(struct pt_regs *regs) * user space. The DABR will have been cleared if it * triggered inside the kernel. */ - if (current->thread.dabr) - set_dabr(current->thread.dabr, current->thread.dabrx); + if (current->thread.hw_brk.address && + current->thread.hw_brk.type) + __set_breakpoint(¤t->thread.hw_brk); #endif /* Re-enable the breakpoints for the signal stack */ thread_change_pc(current, regs); @@ -158,6 +161,8 @@ static int do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) { + user_exit(); + if (thread_info_flags & _TIF_UPROBE) uprobe_notify_resume(regs); @@ -168,11 +173,40 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); } + + user_enter(); } -long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, - unsigned long r5, unsigned long r6, unsigned long r7, - unsigned long r8, struct pt_regs *regs) +unsigned long get_tm_stackpointer(struct pt_regs *regs) { - return do_sigaltstack(uss, uoss, regs->gpr[1]); + /* When in an active transaction that takes a signal, we need to be + * careful with the stack. It's possible that the stack has moved back + * up after the tbegin. The obvious case here is when the tbegin is + * called inside a function that returns before a tend. In this case, + * the stack is part of the checkpointed transactional memory state. + * If we write over this non transactionally or in suspend, we are in + * trouble because if we get a tm abort, the program counter and stack + * pointer will be back at the tbegin but our in memory stack won't be + * valid anymore. + * + * To avoid this, when taking a signal in an active transaction, we + * need to use the stack pointer from the checkpointed state, rather + * than the speculated state. This ensures that the signal context + * (written tm suspended) will be written below the stack required for + * the rollback. The transaction is aborted becuase of the treclaim, + * so any memory written between the tbegin and the signal will be + * rolled back anyway. + * + * For signals taken in non-TM or suspended mode, we use the + * normal/non-checkpointed stack pointer. + */ + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (MSR_TM_ACTIVE(regs->msr)) { + tm_reclaim_current(TM_CAUSE_SIGNAL); + if (MSR_TM_TRANSACTIONAL(regs->msr)) + return current->thread.ckpt_regs.gpr[1]; + } +#endif + return regs->gpr[1]; } diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h index e00acb41393..c69b9aeb9f2 100644 --- a/arch/powerpc/kernel/signal.h +++ b/arch/powerpc/kernel/signal.h @@ -12,7 +12,7 @@ extern void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags); -extern void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, +extern void __user * get_sigframe(struct k_sigaction *ka, unsigned long sp, size_t frame_size, int is_32); extern int handle_signal32(unsigned long sig, struct k_sigaction *ka, @@ -25,13 +25,21 @@ extern int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, extern unsigned long copy_fpr_to_user(void __user *to, struct task_struct *task); +extern unsigned long copy_transact_fpr_to_user(void __user *to, + struct task_struct *task); extern unsigned long copy_fpr_from_user(struct task_struct *task, void __user *from); +extern unsigned long copy_transact_fpr_from_user(struct task_struct *task, + void __user *from); #ifdef CONFIG_VSX extern unsigned long copy_vsx_to_user(void __user *to, struct task_struct *task); +extern unsigned long copy_transact_vsx_to_user(void __user *to, + struct task_struct *task); extern unsigned long copy_vsx_from_user(struct task_struct *task, void __user *from); +extern unsigned long copy_transact_vsx_from_user(struct task_struct *task, + void __user *from); #endif #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 804e323c139..1bc5a1755ed 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -43,6 +43,7 @@ #include <asm/sigcontext.h> #include <asm/vdso.h> #include <asm/switch_to.h> +#include <asm/tm.h> #ifdef CONFIG_PPC64 #include "ppc32.h" #include <asm/unistd.h> @@ -53,13 +54,9 @@ #include "signal.h" -#undef DEBUG_SIG #ifdef CONFIG_PPC64 -#define sys_sigsuspend compat_sys_sigsuspend -#define sys_rt_sigsuspend compat_sys_rt_sigsuspend #define sys_rt_sigreturn compat_sys_rt_sigreturn -#define sys_sigaction compat_sys_sigaction #define sys_swapcontext compat_sys_swapcontext #define sys_sigreturn compat_sys_sigreturn @@ -68,6 +65,8 @@ #define mcontext mcontext32 #define ucontext ucontext32 +#define __save_altstack __compat_save_altstack + /* * Userspace code may pass a ucontext which doesn't include VSX added * at the end. We need to check for this case. @@ -130,23 +129,6 @@ static inline int get_sigset_t(sigset_t *set, return 0; } -static inline int get_old_sigaction(struct k_sigaction *new_ka, - struct old_sigaction __user *act) -{ - compat_old_sigset_t mask; - compat_uptr_t handler, restorer; - - if (get_user(handler, &act->sa_handler) || - __get_user(restorer, &act->sa_restorer) || - __get_user(new_ka->sa.sa_flags, &act->sa_flags) || - __get_user(mask, &act->sa_mask)) - return -EFAULT; - new_ka->sa.sa_handler = compat_ptr(handler); - new_ka->sa.sa_restorer = compat_ptr(restorer); - siginitset(&new_ka->sa.sa_mask, mask); - return 0; -} - #define to_user_ptr(p) ptr_to_compat(p) #define from_user_ptr(p) compat_ptr(p) @@ -196,21 +178,6 @@ static inline int get_sigset_t(sigset_t *set, const sigset_t __user *uset) return copy_from_user(set, uset, sizeof(*uset)); } -static inline int get_old_sigaction(struct k_sigaction *new_ka, - struct old_sigaction __user *act) -{ - old_sigset_t mask; - - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || - __get_user(new_ka->sa.sa_handler, &act->sa_handler) || - __get_user(new_ka->sa.sa_restorer, &act->sa_restorer) || - __get_user(new_ka->sa.sa_flags, &act->sa_flags) || - __get_user(mask, &act->sa_mask)) - return -EFAULT; - siginitset(&new_ka->sa.sa_mask, mask); - return 0; -} - #define to_user_ptr(p) ((unsigned long)(p)) #define from_user_ptr(p) ((void __user *)(p)) @@ -234,50 +201,8 @@ static inline int restore_general_regs(struct pt_regs *regs, return -EFAULT; return 0; } - -#endif /* CONFIG_PPC64 */ - -/* - * Atomically swap in the new signal mask, and wait for a signal. - */ -long sys_sigsuspend(old_sigset_t mask) -{ - sigset_t blocked; - siginitset(&blocked, mask); - return sigsuspend(&blocked); -} - -long sys_sigaction(int sig, struct old_sigaction __user *act, - struct old_sigaction __user *oact) -{ - struct k_sigaction new_ka, old_ka; - int ret; - -#ifdef CONFIG_PPC64 - if (sig < 0) - sig = -sig; #endif - if (act) { - if (get_old_sigaction(&new_ka, act)) - return -EFAULT; - } - - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(to_user_ptr(old_ka.sa.sa_handler), - &oact->sa_handler) || - __put_user(to_user_ptr(old_ka.sa.sa_restorer), - &oact->sa_restorer) || - __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || - __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) - return -EFAULT; - } - - return ret; -} - /* * When we have signals to deliver, we set up on the * user stack, going down from the original stack pointer: @@ -293,6 +218,10 @@ long sys_sigaction(int sig, struct old_sigaction __user *act, struct sigframe { struct sigcontext sctx; /* the sigcontext */ struct mcontext mctx; /* all the register values */ +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + struct sigcontext sctx_transact; + struct mcontext mctx_transact; +#endif /* * Programs using the rs6000/xcoff abi can save up to 19 gp * regs and 18 fp regs below sp before decrementing it. @@ -321,6 +250,9 @@ struct rt_sigframe { struct siginfo info; #endif struct ucontext uc; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + struct ucontext uc_transact; +#endif /* * Programs using the rs6000/xcoff abi can save up to 19 gp * regs and 18 fp regs below sp before decrementing it. @@ -332,27 +264,27 @@ struct rt_sigframe { unsigned long copy_fpr_to_user(void __user *to, struct task_struct *task) { - double buf[ELF_NFPREG]; + u64 buf[ELF_NFPREG]; int i; /* save FPR copy to local buffer then write to the thread_struct */ for (i = 0; i < (ELF_NFPREG - 1) ; i++) buf[i] = task->thread.TS_FPR(i); - memcpy(&buf[i], &task->thread.fpscr, sizeof(double)); + buf[i] = task->thread.fp_state.fpscr; return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double)); } unsigned long copy_fpr_from_user(struct task_struct *task, void __user *from) { - double buf[ELF_NFPREG]; + u64 buf[ELF_NFPREG]; int i; if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double))) return 1; for (i = 0; i < (ELF_NFPREG - 1) ; i++) task->thread.TS_FPR(i) = buf[i]; - memcpy(&task->thread.fpscr, &buf[i], sizeof(double)); + task->thread.fp_state.fpscr = buf[i]; return 0; } @@ -360,41 +292,112 @@ unsigned long copy_fpr_from_user(struct task_struct *task, unsigned long copy_vsx_to_user(void __user *to, struct task_struct *task) { - double buf[ELF_NVSRHALFREG]; + u64 buf[ELF_NVSRHALFREG]; int i; /* save FPR copy to local buffer then write to the thread_struct */ for (i = 0; i < ELF_NVSRHALFREG; i++) - buf[i] = task->thread.fpr[i][TS_VSRLOWOFFSET]; + buf[i] = task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET]; return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double)); } unsigned long copy_vsx_from_user(struct task_struct *task, void __user *from) { - double buf[ELF_NVSRHALFREG]; + u64 buf[ELF_NVSRHALFREG]; int i; if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double))) return 1; for (i = 0; i < ELF_NVSRHALFREG ; i++) - task->thread.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i]; return 0; } + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +unsigned long copy_transact_fpr_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NFPREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + buf[i] = task->thread.TS_TRANS_FPR(i); + buf[i] = task->thread.transact_fp.fpscr; + return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double)); +} + +unsigned long copy_transact_fpr_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NFPREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double))) + return 1; + for (i = 0; i < (ELF_NFPREG - 1) ; i++) + task->thread.TS_TRANS_FPR(i) = buf[i]; + task->thread.transact_fp.fpscr = buf[i]; + + return 0; +} + +unsigned long copy_transact_vsx_to_user(void __user *to, + struct task_struct *task) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + /* save FPR copy to local buffer then write to the thread_struct */ + for (i = 0; i < ELF_NVSRHALFREG; i++) + buf[i] = task->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET]; + return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double)); +} + +unsigned long copy_transact_vsx_from_user(struct task_struct *task, + void __user *from) +{ + u64 buf[ELF_NVSRHALFREG]; + int i; + + if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double))) + return 1; + for (i = 0; i < ELF_NVSRHALFREG ; i++) + task->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET] = buf[i]; + return 0; +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ #else inline unsigned long copy_fpr_to_user(void __user *to, struct task_struct *task) { - return __copy_to_user(to, task->thread.fpr, + return __copy_to_user(to, task->thread.fp_state.fpr, ELF_NFPREG * sizeof(double)); } inline unsigned long copy_fpr_from_user(struct task_struct *task, void __user *from) { - return __copy_from_user(task->thread.fpr, from, + return __copy_from_user(task->thread.fp_state.fpr, from, ELF_NFPREG * sizeof(double)); } + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +inline unsigned long copy_transact_fpr_to_user(void __user *to, + struct task_struct *task) +{ + return __copy_to_user(to, task->thread.transact_fp.fpr, + ELF_NFPREG * sizeof(double)); +} + +inline unsigned long copy_transact_fpr_from_user(struct task_struct *task, + void __user *from) +{ + return __copy_from_user(task->thread.transact_fp.fpr, from, + ELF_NFPREG * sizeof(double)); +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ #endif /* @@ -403,7 +406,8 @@ inline unsigned long copy_fpr_from_user(struct task_struct *task, * altivec/spe instructions at some point. */ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, - int sigret, int ctx_has_vsx_region) + struct mcontext __user *tm_frame, int sigret, + int ctx_has_vsx_region) { unsigned long msr = regs->msr; @@ -418,7 +422,7 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, /* save altivec registers */ if (current->thread.used_vr) { flush_altivec_to_thread(current); - if (__copy_to_user(&frame->mc_vregs, current->thread.vr, + if (__copy_to_user(&frame->mc_vregs, ¤t->thread.vr_state, ELF_NVRREG * sizeof(vector128))) return 1; /* set MSR_VEC in the saved MSR value to indicate that @@ -431,12 +435,21 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, * use altivec. Since VSCR only contains 32 bits saved in the least * significant bits of a vector, we "cheat" and stuff VRSAVE in the * most significant bits of that same vector. --BenH + * Note that the current VRSAVE value is in the SPR at this point. */ + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + current->thread.vrsave = mfspr(SPRN_VRSAVE); if (__put_user(current->thread.vrsave, (u32 __user *)&frame->mc_vregs[32])) return 1; #endif /* CONFIG_ALTIVEC */ if (copy_fpr_to_user(&frame->mc_fregs, current)) return 1; + + /* + * Clear the MSR VSX bit to indicate there is no valid state attached + * to this context, except in the specific case below where we set it. + */ + msr &= ~MSR_VSX; #ifdef CONFIG_VSX /* * Copy VSR 0-31 upper half from thread_struct to local @@ -471,6 +484,12 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, if (__put_user(msr, &frame->mc_gregs[PT_MSR])) return 1; + /* We need to write 0 the MSR top 32 bits in the tm frame so that we + * can check it on the restore to see if TM is active + */ + if (tm_frame && __put_user(0, &tm_frame->mc_gregs[PT_MSR])) + return 1; + if (sigret) { /* Set up the sigreturn trampoline: li r0,sigret; sc */ if (__put_user(0x38000000UL + sigret, &frame->tramp[0]) @@ -483,6 +502,159 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, return 0; } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * Save the current user registers on the user stack. + * We only save the altivec/spe registers if the process has used + * altivec/spe instructions at some point. + * We also save the transactional registers to a second ucontext in the + * frame. + * + * See save_user_regs() and signal_64.c:setup_tm_sigcontexts(). + */ +static int save_tm_user_regs(struct pt_regs *regs, + struct mcontext __user *frame, + struct mcontext __user *tm_frame, int sigret) +{ + unsigned long msr = regs->msr; + + /* Remove TM bits from thread's MSR. The MSR in the sigcontext + * just indicates to userland that we were doing a transaction, but we + * don't want to return in transactional state. This also ensures + * that flush_fp_to_thread won't set TIF_RESTORE_TM again. + */ + regs->msr &= ~MSR_TS_MASK; + + /* Make sure floating point registers are stored in regs */ + flush_fp_to_thread(current); + + /* Save both sets of general registers */ + if (save_general_regs(¤t->thread.ckpt_regs, frame) + || save_general_regs(regs, tm_frame)) + return 1; + + /* Stash the top half of the 64bit MSR into the 32bit MSR word + * of the transactional mcontext. This way we have a backward-compatible + * MSR in the 'normal' (checkpointed) mcontext and additionally one can + * also look at what type of transaction (T or S) was active at the + * time of the signal. + */ + if (__put_user((msr >> 32), &tm_frame->mc_gregs[PT_MSR])) + return 1; + +#ifdef CONFIG_ALTIVEC + /* save altivec registers */ + if (current->thread.used_vr) { + flush_altivec_to_thread(current); + if (__copy_to_user(&frame->mc_vregs, ¤t->thread.vr_state, + ELF_NVRREG * sizeof(vector128))) + return 1; + if (msr & MSR_VEC) { + if (__copy_to_user(&tm_frame->mc_vregs, + ¤t->thread.transact_vr, + ELF_NVRREG * sizeof(vector128))) + return 1; + } else { + if (__copy_to_user(&tm_frame->mc_vregs, + ¤t->thread.vr_state, + ELF_NVRREG * sizeof(vector128))) + return 1; + } + + /* set MSR_VEC in the saved MSR value to indicate that + * frame->mc_vregs contains valid data + */ + msr |= MSR_VEC; + } + + /* We always copy to/from vrsave, it's 0 if we don't have or don't + * use altivec. Since VSCR only contains 32 bits saved in the least + * significant bits of a vector, we "cheat" and stuff VRSAVE in the + * most significant bits of that same vector. --BenH + */ + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + current->thread.vrsave = mfspr(SPRN_VRSAVE); + if (__put_user(current->thread.vrsave, + (u32 __user *)&frame->mc_vregs[32])) + return 1; + if (msr & MSR_VEC) { + if (__put_user(current->thread.transact_vrsave, + (u32 __user *)&tm_frame->mc_vregs[32])) + return 1; + } else { + if (__put_user(current->thread.vrsave, + (u32 __user *)&tm_frame->mc_vregs[32])) + return 1; + } +#endif /* CONFIG_ALTIVEC */ + + if (copy_fpr_to_user(&frame->mc_fregs, current)) + return 1; + if (msr & MSR_FP) { + if (copy_transact_fpr_to_user(&tm_frame->mc_fregs, current)) + return 1; + } else { + if (copy_fpr_to_user(&tm_frame->mc_fregs, current)) + return 1; + } + +#ifdef CONFIG_VSX + /* + * Copy VSR 0-31 upper half from thread_struct to local + * buffer, then write that to userspace. Also set MSR_VSX in + * the saved MSR value to indicate that frame->mc_vregs + * contains valid data + */ + if (current->thread.used_vsr) { + __giveup_vsx(current); + if (copy_vsx_to_user(&frame->mc_vsregs, current)) + return 1; + if (msr & MSR_VSX) { + if (copy_transact_vsx_to_user(&tm_frame->mc_vsregs, + current)) + return 1; + } else { + if (copy_vsx_to_user(&tm_frame->mc_vsregs, current)) + return 1; + } + + msr |= MSR_VSX; + } +#endif /* CONFIG_VSX */ +#ifdef CONFIG_SPE + /* SPE regs are not checkpointed with TM, so this section is + * simply the same as in save_user_regs(). + */ + if (current->thread.used_spe) { + flush_spe_to_thread(current); + if (__copy_to_user(&frame->mc_vregs, current->thread.evr, + ELF_NEVRREG * sizeof(u32))) + return 1; + /* set MSR_SPE in the saved MSR value to indicate that + * frame->mc_vregs contains valid data */ + msr |= MSR_SPE; + } + + /* We always copy to/from spefscr */ + if (__put_user(current->thread.spefscr, (u32 __user *)&frame->mc_vregs + ELF_NEVRREG)) + return 1; +#endif /* CONFIG_SPE */ + + if (__put_user(msr, &frame->mc_gregs[PT_MSR])) + return 1; + if (sigret) { + /* Set up the sigreturn trampoline: li r0,sigret; sc */ + if (__put_user(0x38000000UL + sigret, &frame->tramp[0]) + || __put_user(0x44000002UL, &frame->tramp[1])) + return 1; + flush_icache_range((unsigned long) &frame->tramp[0], + (unsigned long) &frame->tramp[2]); + } + + return 0; +} +#endif + /* * Restore the current user register values from the user stack, * (except for MSR). @@ -532,15 +704,18 @@ static long restore_user_regs(struct pt_regs *regs, regs->msr &= ~MSR_VEC; if (msr & MSR_VEC) { /* restore altivec registers from the stack */ - if (__copy_from_user(current->thread.vr, &sr->mc_vregs, + if (__copy_from_user(¤t->thread.vr_state, &sr->mc_vregs, sizeof(sr->mc_vregs))) return 1; } else if (current->thread.used_vr) - memset(current->thread.vr, 0, ELF_NVRREG * sizeof(vector128)); + memset(¤t->thread.vr_state, 0, + ELF_NVRREG * sizeof(vector128)); /* Always get VRSAVE back */ if (__get_user(current->thread.vrsave, (u32 __user *)&sr->mc_vregs[32])) return 1; + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + mtspr(SPRN_VRSAVE, current->thread.vrsave); #endif /* CONFIG_ALTIVEC */ if (copy_fpr_from_user(current, &sr->mc_fregs)) return 1; @@ -560,7 +735,7 @@ static long restore_user_regs(struct pt_regs *regs, return 1; } else if (current->thread.used_vsr) for (i = 0; i < 32 ; i++) - current->thread.fpr[i][TS_VSRLOWOFFSET] = 0; + current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0; #endif /* CONFIG_VSX */ /* * force the process to reload the FP registers from @@ -588,91 +763,151 @@ static long restore_user_regs(struct pt_regs *regs, return 0; } -#ifdef CONFIG_PPC64 -long compat_sys_rt_sigaction(int sig, const struct sigaction32 __user *act, - struct sigaction32 __user *oact, size_t sigsetsize) +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * Restore the current user register values from the user stack, except for + * MSR, and recheckpoint the original checkpointed register state for processes + * in transactions. + */ +static long restore_tm_user_regs(struct pt_regs *regs, + struct mcontext __user *sr, + struct mcontext __user *tm_sr) { - struct k_sigaction new_ka, old_ka; - int ret; + long err; + unsigned long msr, msr_hi; +#ifdef CONFIG_VSX + int i; +#endif - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; + /* + * restore general registers but not including MSR or SOFTE. Also + * take care of keeping r2 (TLS) intact if not a signal. + * See comment in signal_64.c:restore_tm_sigcontexts(); + * TFHAR is restored from the checkpointed NIP; TEXASR and TFIAR + * were set by the signal delivery. + */ + err = restore_general_regs(regs, tm_sr); + err |= restore_general_regs(¤t->thread.ckpt_regs, sr); - if (act) { - compat_uptr_t handler; + err |= __get_user(current->thread.tm_tfhar, &sr->mc_gregs[PT_NIP]); - ret = get_user(handler, &act->sa_handler); - new_ka.sa.sa_handler = compat_ptr(handler); - ret |= get_sigset_t(&new_ka.sa.sa_mask, &act->sa_mask); - ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); - if (ret) - return -EFAULT; - } + err |= __get_user(msr, &sr->mc_gregs[PT_MSR]); + if (err) + return 1; - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - if (!ret && oact) { - ret = put_user(to_user_ptr(old_ka.sa.sa_handler), &oact->sa_handler); - ret |= put_sigset_t(&oact->sa_mask, &old_ka.sa.sa_mask); - ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); - } - return ret; -} + /* Restore the previous little-endian mode */ + regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); -/* - * Note: it is necessary to treat how as an unsigned int, with the - * corresponding cast to a signed int to insure that the proper - * conversion (sign extension) between the register representation - * of a signed int (msr in 32-bit mode) and the register representation - * of a signed int (msr in 64-bit mode) is performed. - */ -long compat_sys_rt_sigprocmask(u32 how, compat_sigset_t __user *set, - compat_sigset_t __user *oset, size_t sigsetsize) -{ - sigset_t s; - sigset_t __user *up; - int ret; - mm_segment_t old_fs = get_fs(); + /* + * Do this before updating the thread state in + * current->thread.fpr/vr/evr. That way, if we get preempted + * and another task grabs the FPU/Altivec/SPE, it won't be + * tempted to save the current CPU state into the thread_struct + * and corrupt what we are writing there. + */ + discard_lazy_cpu_state(); - if (set) { - if (get_sigset_t(&s, set)) - return -EFAULT; +#ifdef CONFIG_ALTIVEC + regs->msr &= ~MSR_VEC; + if (msr & MSR_VEC) { + /* restore altivec registers from the stack */ + if (__copy_from_user(¤t->thread.vr_state, &sr->mc_vregs, + sizeof(sr->mc_vregs)) || + __copy_from_user(¤t->thread.transact_vr, + &tm_sr->mc_vregs, + sizeof(sr->mc_vregs))) + return 1; + } else if (current->thread.used_vr) { + memset(¤t->thread.vr_state, 0, + ELF_NVRREG * sizeof(vector128)); + memset(¤t->thread.transact_vr, 0, + ELF_NVRREG * sizeof(vector128)); } - set_fs(KERNEL_DS); - /* This is valid because of the set_fs() */ - up = (sigset_t __user *) &s; - ret = sys_rt_sigprocmask((int)how, set ? up : NULL, oset ? up : NULL, - sigsetsize); - set_fs(old_fs); - if (ret) - return ret; - if (oset) { - if (put_sigset_t(oset, &s)) - return -EFAULT; - } - return 0; -} + /* Always get VRSAVE back */ + if (__get_user(current->thread.vrsave, + (u32 __user *)&sr->mc_vregs[32]) || + __get_user(current->thread.transact_vrsave, + (u32 __user *)&tm_sr->mc_vregs[32])) + return 1; + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + mtspr(SPRN_VRSAVE, current->thread.vrsave); +#endif /* CONFIG_ALTIVEC */ -long compat_sys_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) -{ - sigset_t s; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - /* The __user pointer cast is valid because of the set_fs() */ - ret = sys_rt_sigpending((sigset_t __user *) &s, sigsetsize); - set_fs(old_fs); - if (!ret) { - if (put_sigset_t(set, &s)) - return -EFAULT; + regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1); + + if (copy_fpr_from_user(current, &sr->mc_fregs) || + copy_transact_fpr_from_user(current, &tm_sr->mc_fregs)) + return 1; + +#ifdef CONFIG_VSX + regs->msr &= ~MSR_VSX; + if (msr & MSR_VSX) { + /* + * Restore altivec registers from the stack to a local + * buffer, then write this out to the thread_struct + */ + if (copy_vsx_from_user(current, &sr->mc_vsregs) || + copy_transact_vsx_from_user(current, &tm_sr->mc_vsregs)) + return 1; + } else if (current->thread.used_vsr) + for (i = 0; i < 32 ; i++) { + current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0; + current->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET] = 0; + } +#endif /* CONFIG_VSX */ + +#ifdef CONFIG_SPE + /* SPE regs are not checkpointed with TM, so this section is + * simply the same as in restore_user_regs(). + */ + regs->msr &= ~MSR_SPE; + if (msr & MSR_SPE) { + if (__copy_from_user(current->thread.evr, &sr->mc_vregs, + ELF_NEVRREG * sizeof(u32))) + return 1; + } else if (current->thread.used_spe) + memset(current->thread.evr, 0, ELF_NEVRREG * sizeof(u32)); + + /* Always get SPEFSCR back */ + if (__get_user(current->thread.spefscr, (u32 __user *)&sr->mc_vregs + + ELF_NEVRREG)) + return 1; +#endif /* CONFIG_SPE */ + + /* Now, recheckpoint. This loads up all of the checkpointed (older) + * registers, including FP and V[S]Rs. After recheckpointing, the + * transactional versions should be loaded. + */ + tm_enable(); + /* Make sure the transaction is marked as failed */ + current->thread.tm_texasr |= TEXASR_FS; + /* This loads the checkpointed FP/VEC state, if used */ + tm_recheckpoint(¤t->thread, msr); + /* Get the top half of the MSR */ + if (__get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR])) + return 1; + /* Pull in MSR TM from user context */ + regs->msr = (regs->msr & ~MSR_TS_MASK) | ((msr_hi<<32) & MSR_TS_MASK); + + /* This loads the speculative FP/VEC state, if used */ + if (msr & MSR_FP) { + do_load_up_transact_fpu(¤t->thread); + regs->msr |= (MSR_FP | current->thread.fpexc_mode); } - return ret; -} +#ifdef CONFIG_ALTIVEC + if (msr & MSR_VEC) { + do_load_up_transact_altivec(¤t->thread); + regs->msr |= MSR_VEC; + } +#endif + return 0; +} +#endif -int copy_siginfo_to_user32(struct compat_siginfo __user *d, siginfo_t *s) +#ifdef CONFIG_PPC64 +int copy_siginfo_to_user32(struct compat_siginfo __user *d, const siginfo_t *s) { int err; @@ -740,79 +975,6 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from) return 0; } - -/* - * Note: it is necessary to treat pid and sig as unsigned ints, with the - * corresponding cast to a signed int to insure that the proper conversion - * (sign extension) between the register representation of a signed int - * (msr in 32-bit mode) and the register representation of a signed int - * (msr in 64-bit mode) is performed. - */ -long compat_sys_rt_sigqueueinfo(u32 pid, u32 sig, compat_siginfo_t __user *uinfo) -{ - siginfo_t info; - int ret; - mm_segment_t old_fs = get_fs(); - - ret = copy_siginfo_from_user32(&info, uinfo); - if (unlikely(ret)) - return ret; - - set_fs (KERNEL_DS); - /* The __user pointer cast is valid becasuse of the set_fs() */ - ret = sys_rt_sigqueueinfo((int)pid, (int)sig, (siginfo_t __user *) &info); - set_fs (old_fs); - return ret; -} -/* - * Start Alternate signal stack support - * - * System Calls - * sigaltatck compat_sys_sigaltstack - */ - -int compat_sys_sigaltstack(u32 __new, u32 __old, int r5, - int r6, int r7, int r8, struct pt_regs *regs) -{ - stack_32_t __user * newstack = compat_ptr(__new); - stack_32_t __user * oldstack = compat_ptr(__old); - stack_t uss, uoss; - int ret; - mm_segment_t old_fs; - unsigned long sp; - compat_uptr_t ss_sp; - - /* - * set sp to the user stack on entry to the system call - * the system call router sets R9 to the saved registers - */ - sp = regs->gpr[1]; - - /* Put new stack info in local 64 bit stack struct */ - if (newstack) { - if (get_user(ss_sp, &newstack->ss_sp) || - __get_user(uss.ss_flags, &newstack->ss_flags) || - __get_user(uss.ss_size, &newstack->ss_size)) - return -EFAULT; - uss.ss_sp = compat_ptr(ss_sp); - } - - old_fs = get_fs(); - set_fs(KERNEL_DS); - /* The __user pointer casts are valid because of the set_fs() */ - ret = do_sigaltstack( - newstack ? (stack_t __user *) &uss : NULL, - oldstack ? (stack_t __user *) &uoss : NULL, - sp); - set_fs(old_fs); - /* Copy the stack information to the user output buffer */ - if (!ret && oldstack && - (put_user(ptr_to_compat(uoss.ss_sp), &oldstack->ss_sp) || - __put_user(uoss.ss_flags, &oldstack->ss_flags) || - __put_user(uoss.ss_size, &oldstack->ss_size))) - return -EFAULT; - return ret; -} #endif /* CONFIG_PPC64 */ /* @@ -825,12 +987,15 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, { struct rt_sigframe __user *rt_sf; struct mcontext __user *frame; + struct mcontext __user *tm_frame = NULL; void __user *addr; unsigned long newsp = 0; + int sigret; + unsigned long tramp; /* Set up Signal Frame */ /* Put a Real Time Context onto stack */ - rt_sf = get_sigframe(ka, regs, sizeof(*rt_sf), 1); + rt_sf = get_sigframe(ka, get_tm_stackpointer(regs), sizeof(*rt_sf), 1); addr = rt_sf; if (unlikely(rt_sf == NULL)) goto badframe; @@ -838,11 +1003,7 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, /* Put the siginfo & fill in most of the ucontext */ if (copy_siginfo_to_user(&rt_sf->info, info) || __put_user(0, &rt_sf->uc.uc_flags) - || __put_user(0, &rt_sf->uc.uc_link) - || __put_user(current->sas_ss_sp, &rt_sf->uc.uc_stack.ss_sp) - || __put_user(sas_ss_flags(regs->gpr[1]), - &rt_sf->uc.uc_stack.ss_flags) - || __put_user(current->sas_ss_size, &rt_sf->uc.uc_stack.ss_size) + || __save_altstack(&rt_sf->uc.uc_stack, regs->gpr[1]) || __put_user(to_user_ptr(&rt_sf->uc.uc_mcontext), &rt_sf->uc.uc_regs) || put_sigset_t(&rt_sf->uc.uc_sigmask, oldset)) @@ -852,16 +1013,35 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, frame = &rt_sf->uc.uc_mcontext; addr = frame; if (vdso32_rt_sigtramp && current->mm->context.vdso_base) { - if (save_user_regs(regs, frame, 0, 1)) - goto badframe; - regs->link = current->mm->context.vdso_base + vdso32_rt_sigtramp; + sigret = 0; + tramp = current->mm->context.vdso_base + vdso32_rt_sigtramp; } else { - if (save_user_regs(regs, frame, __NR_rt_sigreturn, 1)) + sigret = __NR_rt_sigreturn; + tramp = (unsigned long) frame->tramp; + } + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + tm_frame = &rt_sf->uc_transact.uc_mcontext; + if (MSR_TM_ACTIVE(regs->msr)) { + if (__put_user((unsigned long)&rt_sf->uc_transact, + &rt_sf->uc.uc_link) || + __put_user((unsigned long)tm_frame, + &rt_sf->uc_transact.uc_regs)) + goto badframe; + if (save_tm_user_regs(regs, frame, tm_frame, sigret)) goto badframe; - regs->link = (unsigned long) frame->tramp; } + else +#endif + { + if (__put_user(0, &rt_sf->uc.uc_link)) + goto badframe; + if (save_user_regs(regs, frame, tm_frame, sigret, 1)) + goto badframe; + } + regs->link = tramp; - current->thread.fpscr.val = 0; /* turn off all fp exceptions */ + current->thread.fp_state.fpscr = 0; /* turn off all fp exceptions */ /* create a stack frame for the caller of the handler */ newsp = ((unsigned long)rt_sf) - (__SIGNAL_FRAMESIZE + 16); @@ -876,15 +1056,12 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, regs->gpr[5] = (unsigned long) &rt_sf->uc; regs->gpr[6] = (unsigned long) rt_sf; regs->nip = (unsigned long) ka->sa.sa_handler; - /* enter the signal handler in big-endian mode */ + /* enter the signal handler in native-endian mode */ regs->msr &= ~MSR_LE; + regs->msr |= (MSR_KERNEL & MSR_LE); return 1; badframe: -#ifdef DEBUG_SIG - printk("badframe in handle_rt_signal, regs=%p frame=%p newsp=%lx\n", - regs, frame, newsp); -#endif if (show_unhandled_signals) printk_ratelimited(KERN_INFO "%s[%d]: bad frame in handle_rt_signal32: " @@ -925,6 +1102,35 @@ static int do_setcontext(struct ucontext __user *ucp, struct pt_regs *regs, int return 0; } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static int do_setcontext_tm(struct ucontext __user *ucp, + struct ucontext __user *tm_ucp, + struct pt_regs *regs) +{ + sigset_t set; + struct mcontext __user *mcp; + struct mcontext __user *tm_mcp; + u32 cmcp; + u32 tm_cmcp; + + if (get_sigset_t(&set, &ucp->uc_sigmask)) + return -EFAULT; + + if (__get_user(cmcp, &ucp->uc_regs) || + __get_user(tm_cmcp, &tm_ucp->uc_regs)) + return -EFAULT; + mcp = (struct mcontext __user *)(u64)cmcp; + tm_mcp = (struct mcontext __user *)(u64)tm_cmcp; + /* no need to check access_ok(mcp), since mcp < 4GB */ + + set_current_blocked(&set); + if (restore_tm_user_regs(regs, mcp, tm_mcp)) + return -EFAULT; + + return 0; +} +#endif + long sys_swapcontext(struct ucontext __user *old_ctx, struct ucontext __user *new_ctx, int ctx_size, int r6, int r7, int r8, struct pt_regs *regs) @@ -986,7 +1192,7 @@ long sys_swapcontext(struct ucontext __user *old_ctx, mctx = (struct mcontext __user *) ((unsigned long) &old_ctx->uc_mcontext & ~0xfUL); if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size) - || save_user_regs(regs, mctx, 0, ctx_has_vsx_region) + || save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region) || put_sigset_t(&old_ctx->uc_sigmask, ¤t->blocked) || __put_user(to_user_ptr(mctx), &old_ctx->uc_regs)) return -EFAULT; @@ -1020,7 +1226,12 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, struct pt_regs *regs) { struct rt_sigframe __user *rt_sf; - +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + struct ucontext __user *uc_transact; + unsigned long msr_hi; + unsigned long tmp; + int tm_restore = 0; +#endif /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; @@ -1028,6 +1239,34 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, (regs->gpr[1] + __SIGNAL_FRAMESIZE + 16); if (!access_ok(VERIFY_READ, rt_sf, sizeof(*rt_sf))) goto bad; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (__get_user(tmp, &rt_sf->uc.uc_link)) + goto bad; + uc_transact = (struct ucontext __user *)(uintptr_t)tmp; + if (uc_transact) { + u32 cmcp; + struct mcontext __user *mcp; + + if (__get_user(cmcp, &uc_transact->uc_regs)) + return -EFAULT; + mcp = (struct mcontext __user *)(u64)cmcp; + /* The top 32 bits of the MSR are stashed in the transactional + * ucontext. */ + if (__get_user(msr_hi, &mcp->mc_gregs[PT_MSR])) + goto bad; + + if (MSR_TM_ACTIVE(msr_hi<<32)) { + /* We only recheckpoint on return if we're + * transaction. + */ + tm_restore = 1; + if (do_setcontext_tm(&rt_sf->uc, uc_transact, regs)) + goto bad; + } + } + if (!tm_restore) + /* Fall through, for non-TM restore */ +#endif if (do_setcontext(&rt_sf->uc, regs, 1)) goto bad; @@ -1039,14 +1278,11 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, * change it. -- paulus */ #ifdef CONFIG_PPC64 - /* - * We use the compat_sys_ version that does the 32/64 bits conversion - * and takes userland pointer directly. What about error checking ? - * nobody does any... - */ - compat_sys_sigaltstack((u32)(u64)&rt_sf->uc.uc_stack, 0, 0, 0, 0, 0, regs); + if (compat_restore_altstack(&rt_sf->uc.uc_stack)) + goto bad; #else - do_sigaltstack(&rt_sf->uc.uc_stack, NULL, regs->gpr[1]); + if (restore_altstack(&rt_sf->uc.uc_stack)) + goto bad; #endif set_thread_flag(TIF_RESTOREALL); return 0; @@ -1074,7 +1310,7 @@ int sys_debug_setcontext(struct ucontext __user *ctx, unsigned char tmp; unsigned long new_msr = regs->msr; #ifdef CONFIG_PPC_ADV_DEBUG_REGS - unsigned long new_dbcr0 = current->thread.dbcr0; + unsigned long new_dbcr0 = current->thread.debug.dbcr0; #endif for (i=0; i<ndbg; i++) { @@ -1089,7 +1325,7 @@ int sys_debug_setcontext(struct ucontext __user *ctx, } else { new_dbcr0 &= ~DBCR0_IC; if (!DBCR_ACTIVE_EVENTS(new_dbcr0, - current->thread.dbcr1)) { + current->thread.debug.dbcr1)) { new_msr &= ~MSR_DE; new_dbcr0 &= ~DBCR0_IDM; } @@ -1124,7 +1360,7 @@ int sys_debug_setcontext(struct ucontext __user *ctx, the user is really doing something wrong. */ regs->msr = new_msr; #ifdef CONFIG_PPC_ADV_DEBUG_REGS - current->thread.dbcr0 = new_dbcr0; + current->thread.debug.dbcr0 = new_dbcr0; #endif if (!access_ok(VERIFY_READ, ctx, sizeof(*ctx)) @@ -1162,7 +1398,7 @@ int sys_debug_setcontext(struct ucontext __user *ctx, * always done it up until now so it is probably better not to * change it. -- paulus */ - do_sigaltstack(&ctx->uc_stack, NULL, regs->gpr[1]); + restore_altstack(&ctx->uc_stack); set_thread_flag(TIF_RESTOREALL); out: @@ -1178,10 +1414,13 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka, { struct sigcontext __user *sc; struct sigframe __user *frame; + struct mcontext __user *tm_mctx = NULL; unsigned long newsp = 0; + int sigret; + unsigned long tramp; /* Set up Signal Frame */ - frame = get_sigframe(ka, regs, sizeof(*frame), 1); + frame = get_sigframe(ka, get_tm_stackpointer(regs), sizeof(*frame), 1); if (unlikely(frame == NULL)) goto badframe; sc = (struct sigcontext __user *) &frame->sctx; @@ -1201,16 +1440,30 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka, goto badframe; if (vdso32_sigtramp && current->mm->context.vdso_base) { - if (save_user_regs(regs, &frame->mctx, 0, 1)) - goto badframe; - regs->link = current->mm->context.vdso_base + vdso32_sigtramp; + sigret = 0; + tramp = current->mm->context.vdso_base + vdso32_sigtramp; } else { - if (save_user_regs(regs, &frame->mctx, __NR_sigreturn, 1)) + sigret = __NR_sigreturn; + tramp = (unsigned long) frame->mctx.tramp; + } + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + tm_mctx = &frame->mctx_transact; + if (MSR_TM_ACTIVE(regs->msr)) { + if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact, + sigret)) + goto badframe; + } + else +#endif + { + if (save_user_regs(regs, &frame->mctx, tm_mctx, sigret, 1)) goto badframe; - regs->link = (unsigned long) frame->mctx.tramp; } - current->thread.fpscr.val = 0; /* turn off all fp exceptions */ + regs->link = tramp; + + current->thread.fp_state.fpscr = 0; /* turn off all fp exceptions */ /* create a stack frame for the caller of the handler */ newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE; @@ -1223,14 +1476,9 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka, regs->nip = (unsigned long) ka->sa.sa_handler; /* enter the signal handler in big-endian mode */ regs->msr &= ~MSR_LE; - return 1; badframe: -#ifdef DEBUG_SIG - printk("badframe in handle_signal, regs=%p frame=%p newsp=%lx\n", - regs, frame, newsp); -#endif if (show_unhandled_signals) printk_ratelimited(KERN_INFO "%s[%d]: bad frame in handle_signal32: " @@ -1248,16 +1496,22 @@ badframe: long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, struct pt_regs *regs) { + struct sigframe __user *sf; struct sigcontext __user *sc; struct sigcontext sigctx; struct mcontext __user *sr; void __user *addr; sigset_t set; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + struct mcontext __user *mcp, *tm_mcp; + unsigned long msr_hi; +#endif /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; - sc = (struct sigcontext __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); + sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); + sc = &sf->sctx; addr = sc; if (copy_from_user(&sigctx, sc, sizeof(sigctx))) goto badframe; @@ -1274,11 +1528,25 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, #endif set_current_blocked(&set); - sr = (struct mcontext __user *)from_user_ptr(sigctx.regs); - addr = sr; - if (!access_ok(VERIFY_READ, sr, sizeof(*sr)) - || restore_user_regs(regs, sr, 1)) +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + mcp = (struct mcontext __user *)&sf->mctx; + tm_mcp = (struct mcontext __user *)&sf->mctx_transact; + if (__get_user(msr_hi, &tm_mcp->mc_gregs[PT_MSR])) goto badframe; + if (MSR_TM_ACTIVE(msr_hi<<32)) { + if (!cpu_has_feature(CPU_FTR_TM)) + goto badframe; + if (restore_tm_user_regs(regs, mcp, tm_mcp)) + goto badframe; + } else +#endif + { + sr = (struct mcontext __user *)from_user_ptr(sigctx.regs); + addr = sr; + if (!access_ok(VERIFY_READ, sr, sizeof(*sr)) + || restore_user_regs(regs, sr, 1)) + goto badframe; + } set_thread_flag(TIF_RESTOREALL); return 0; diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 1ca045d4432..97c1e4b683f 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -34,10 +34,10 @@ #include <asm/syscalls.h> #include <asm/vdso.h> #include <asm/switch_to.h> +#include <asm/tm.h> #include "signal.h" -#define DEBUG_SIG 0 #define GP_REGS_SIZE min(sizeof(elf_gregset_t), sizeof(struct pt_regs)) #define FP_REGS_SIZE sizeof(elf_fpregset_t) @@ -56,13 +56,16 @@ struct rt_sigframe { /* sys_rt_sigreturn requires the ucontext be the first field */ struct ucontext uc; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + struct ucontext uc_transact; +#endif unsigned long _unused[2]; unsigned int tramp[TRAMP_SIZE]; struct siginfo __user *pinfo; void __user *puc; struct siginfo info; - /* 64 bit ABI allows for 288 bytes below sp before decrementing it. */ - char abigap[288]; + /* New 64 bit little-endian ABI allows redzone of 512 bytes below sp */ + char abigap[USER_REDZONE_SIZE]; } __attribute__ ((aligned (16))); static const char fmt32[] = KERN_INFO \ @@ -92,8 +95,6 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long msr = regs->msr; long err = 0; - flush_fp_to_thread(current); - #ifdef CONFIG_ALTIVEC err |= __put_user(v_regs, &sc->v_regs); @@ -101,7 +102,8 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, if (current->thread.used_vr) { flush_altivec_to_thread(current); /* Copy 33 vec registers (vr0..31 and vscr) to the stack */ - err |= __copy_to_user(v_regs, current->thread.vr, 33 * sizeof(vector128)); + err |= __copy_to_user(v_regs, ¤t->thread.vr_state, + 33 * sizeof(vector128)); /* set MSR_VEC in the MSR value in the frame to indicate that sc->v_reg) * contains valid data. */ @@ -110,6 +112,8 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, /* We always copy to/from vrsave, it's 0 if we don't have or don't * use altivec. */ + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + current->thread.vrsave = mfspr(SPRN_VRSAVE); err |= __put_user(current->thread.vrsave, (u32 __user *)&v_regs[33]); #else /* CONFIG_ALTIVEC */ err |= __put_user(0, &sc->v_regs); @@ -117,6 +121,12 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, flush_fp_to_thread(current); /* copy fpr regs and fpscr */ err |= copy_fpr_to_user(&sc->fp_regs, current); + + /* + * Clear the MSR VSX bit to indicate there is no valid state attached + * to this context, except in the specific case below where we set it. + */ + msr &= ~MSR_VSX; #ifdef CONFIG_VSX /* * Copy VSX low doubleword to local buffer for formatting, @@ -145,6 +155,145 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, return err; } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * As above, but Transactional Memory is in use, so deliver sigcontexts + * containing checkpointed and transactional register states. + * + * To do this, we treclaim (done before entering here) to gather both sets of + * registers and set up the 'normal' sigcontext registers with rolled-back + * register values such that a simple signal handler sees a correct + * checkpointed register state. If interested, a TM-aware sighandler can + * examine the transactional registers in the 2nd sigcontext to determine the + * real origin of the signal. + */ +static long setup_tm_sigcontexts(struct sigcontext __user *sc, + struct sigcontext __user *tm_sc, + struct pt_regs *regs, + int signr, sigset_t *set, unsigned long handler) +{ + /* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the + * process never used altivec yet (MSR_VEC is zero in pt_regs of + * the context). This is very important because we must ensure we + * don't lose the VRSAVE content that may have been set prior to + * the process doing its first vector operation + * Userland shall check AT_HWCAP to know wether it can rely on the + * v_regs pointer or not. + */ +#ifdef CONFIG_ALTIVEC + elf_vrreg_t __user *v_regs = (elf_vrreg_t __user *) + (((unsigned long)sc->vmx_reserve + 15) & ~0xful); + elf_vrreg_t __user *tm_v_regs = (elf_vrreg_t __user *) + (((unsigned long)tm_sc->vmx_reserve + 15) & ~0xful); +#endif + unsigned long msr = regs->msr; + long err = 0; + + BUG_ON(!MSR_TM_ACTIVE(regs->msr)); + + /* Remove TM bits from thread's MSR. The MSR in the sigcontext + * just indicates to userland that we were doing a transaction, but we + * don't want to return in transactional state. This also ensures + * that flush_fp_to_thread won't set TIF_RESTORE_TM again. + */ + regs->msr &= ~MSR_TS_MASK; + + flush_fp_to_thread(current); + +#ifdef CONFIG_ALTIVEC + err |= __put_user(v_regs, &sc->v_regs); + err |= __put_user(tm_v_regs, &tm_sc->v_regs); + + /* save altivec registers */ + if (current->thread.used_vr) { + flush_altivec_to_thread(current); + /* Copy 33 vec registers (vr0..31 and vscr) to the stack */ + err |= __copy_to_user(v_regs, ¤t->thread.vr_state, + 33 * sizeof(vector128)); + /* If VEC was enabled there are transactional VRs valid too, + * else they're a copy of the checkpointed VRs. + */ + if (msr & MSR_VEC) + err |= __copy_to_user(tm_v_regs, + ¤t->thread.transact_vr, + 33 * sizeof(vector128)); + else + err |= __copy_to_user(tm_v_regs, + ¤t->thread.vr_state, + 33 * sizeof(vector128)); + + /* set MSR_VEC in the MSR value in the frame to indicate + * that sc->v_reg contains valid data. + */ + msr |= MSR_VEC; + } + /* We always copy to/from vrsave, it's 0 if we don't have or don't + * use altivec. + */ + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + current->thread.vrsave = mfspr(SPRN_VRSAVE); + err |= __put_user(current->thread.vrsave, (u32 __user *)&v_regs[33]); + if (msr & MSR_VEC) + err |= __put_user(current->thread.transact_vrsave, + (u32 __user *)&tm_v_regs[33]); + else + err |= __put_user(current->thread.vrsave, + (u32 __user *)&tm_v_regs[33]); + +#else /* CONFIG_ALTIVEC */ + err |= __put_user(0, &sc->v_regs); + err |= __put_user(0, &tm_sc->v_regs); +#endif /* CONFIG_ALTIVEC */ + + /* copy fpr regs and fpscr */ + err |= copy_fpr_to_user(&sc->fp_regs, current); + if (msr & MSR_FP) + err |= copy_transact_fpr_to_user(&tm_sc->fp_regs, current); + else + err |= copy_fpr_to_user(&tm_sc->fp_regs, current); + +#ifdef CONFIG_VSX + /* + * Copy VSX low doubleword to local buffer for formatting, + * then out to userspace. Update v_regs to point after the + * VMX data. + */ + if (current->thread.used_vsr) { + __giveup_vsx(current); + v_regs += ELF_NVRREG; + tm_v_regs += ELF_NVRREG; + + err |= copy_vsx_to_user(v_regs, current); + + if (msr & MSR_VSX) + err |= copy_transact_vsx_to_user(tm_v_regs, current); + else + err |= copy_vsx_to_user(tm_v_regs, current); + + /* set MSR_VSX in the MSR value in the frame to + * indicate that sc->vs_reg) contains valid data. + */ + msr |= MSR_VSX; + } +#endif /* CONFIG_VSX */ + + err |= __put_user(&sc->gp_regs, &sc->regs); + err |= __put_user(&tm_sc->gp_regs, &tm_sc->regs); + WARN_ON(!FULL_REGS(regs)); + err |= __copy_to_user(&tm_sc->gp_regs, regs, GP_REGS_SIZE); + err |= __copy_to_user(&sc->gp_regs, + ¤t->thread.ckpt_regs, GP_REGS_SIZE); + err |= __put_user(msr, &tm_sc->gp_regs[PT_MSR]); + err |= __put_user(msr, &sc->gp_regs[PT_MSR]); + err |= __put_user(signr, &sc->signal); + err |= __put_user(handler, &sc->handler); + if (set != NULL) + err |= __put_user(set->sig[0], &sc->oldmask); + + return err; +} +#endif + /* * Restore the sigcontext from the signal frame. */ @@ -212,16 +361,18 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig, if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128))) return -EFAULT; /* Copy 33 vec registers (vr0..31 and vscr) from the stack */ - if (v_regs != 0 && (msr & MSR_VEC) != 0) - err |= __copy_from_user(current->thread.vr, v_regs, + if (v_regs != NULL && (msr & MSR_VEC) != 0) + err |= __copy_from_user(¤t->thread.vr_state, v_regs, 33 * sizeof(vector128)); else if (current->thread.used_vr) - memset(current->thread.vr, 0, 33 * sizeof(vector128)); + memset(¤t->thread.vr_state, 0, 33 * sizeof(vector128)); /* Always get VRSAVE back */ - if (v_regs != 0) + if (v_regs != NULL) err |= __get_user(current->thread.vrsave, (u32 __user *)&v_regs[33]); else current->thread.vrsave = 0; + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + mtspr(SPRN_VRSAVE, current->thread.vrsave); #endif /* CONFIG_ALTIVEC */ /* restore floating point */ err |= copy_fpr_from_user(current, &sc->fp_regs); @@ -236,10 +387,165 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig, err |= copy_vsx_from_user(current, v_regs); else for (i = 0; i < 32 ; i++) - current->thread.fpr[i][TS_VSRLOWOFFSET] = 0; + current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0; +#endif + return err; +} + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * Restore the two sigcontexts from the frame of a transactional processes. + */ + +static long restore_tm_sigcontexts(struct pt_regs *regs, + struct sigcontext __user *sc, + struct sigcontext __user *tm_sc) +{ +#ifdef CONFIG_ALTIVEC + elf_vrreg_t __user *v_regs, *tm_v_regs; +#endif + unsigned long err = 0; + unsigned long msr; +#ifdef CONFIG_VSX + int i; +#endif + /* copy the GPRs */ + err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr)); + err |= __copy_from_user(¤t->thread.ckpt_regs, sc->gp_regs, + sizeof(regs->gpr)); + + /* + * TFHAR is restored from the checkpointed 'wound-back' ucontext's NIP. + * TEXASR was set by the signal delivery reclaim, as was TFIAR. + * Users doing anything abhorrent like thread-switching w/ signals for + * TM-Suspended code will have to back TEXASR/TFIAR up themselves. + * For the case of getting a signal and simply returning from it, + * we don't need to re-copy them here. + */ + err |= __get_user(regs->nip, &tm_sc->gp_regs[PT_NIP]); + err |= __get_user(current->thread.tm_tfhar, &sc->gp_regs[PT_NIP]); + + /* get MSR separately, transfer the LE bit if doing signal return */ + err |= __get_user(msr, &sc->gp_regs[PT_MSR]); + /* pull in MSR TM from user context */ + regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK); + + /* pull in MSR LE from user context */ + regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); + + /* The following non-GPR non-FPR non-VR state is also checkpointed: */ + err |= __get_user(regs->ctr, &tm_sc->gp_regs[PT_CTR]); + err |= __get_user(regs->link, &tm_sc->gp_regs[PT_LNK]); + err |= __get_user(regs->xer, &tm_sc->gp_regs[PT_XER]); + err |= __get_user(regs->ccr, &tm_sc->gp_regs[PT_CCR]); + err |= __get_user(current->thread.ckpt_regs.ctr, + &sc->gp_regs[PT_CTR]); + err |= __get_user(current->thread.ckpt_regs.link, + &sc->gp_regs[PT_LNK]); + err |= __get_user(current->thread.ckpt_regs.xer, + &sc->gp_regs[PT_XER]); + err |= __get_user(current->thread.ckpt_regs.ccr, + &sc->gp_regs[PT_CCR]); + + /* These regs are not checkpointed; they can go in 'regs'. */ + err |= __get_user(regs->trap, &sc->gp_regs[PT_TRAP]); + err |= __get_user(regs->dar, &sc->gp_regs[PT_DAR]); + err |= __get_user(regs->dsisr, &sc->gp_regs[PT_DSISR]); + err |= __get_user(regs->result, &sc->gp_regs[PT_RESULT]); + + /* + * Do this before updating the thread state in + * current->thread.fpr/vr. That way, if we get preempted + * and another task grabs the FPU/Altivec, it won't be + * tempted to save the current CPU state into the thread_struct + * and corrupt what we are writing there. + */ + discard_lazy_cpu_state(); + + /* + * Force reload of FP/VEC. + * This has to be done before copying stuff into current->thread.fpr/vr + * for the reasons explained in the previous comment. + */ + regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX); + +#ifdef CONFIG_ALTIVEC + err |= __get_user(v_regs, &sc->v_regs); + err |= __get_user(tm_v_regs, &tm_sc->v_regs); + if (err) + return err; + if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128))) + return -EFAULT; + if (tm_v_regs && !access_ok(VERIFY_READ, + tm_v_regs, 34 * sizeof(vector128))) + return -EFAULT; + /* Copy 33 vec registers (vr0..31 and vscr) from the stack */ + if (v_regs != NULL && tm_v_regs != NULL && (msr & MSR_VEC) != 0) { + err |= __copy_from_user(¤t->thread.vr_state, v_regs, + 33 * sizeof(vector128)); + err |= __copy_from_user(¤t->thread.transact_vr, tm_v_regs, + 33 * sizeof(vector128)); + } + else if (current->thread.used_vr) { + memset(¤t->thread.vr_state, 0, 33 * sizeof(vector128)); + memset(¤t->thread.transact_vr, 0, 33 * sizeof(vector128)); + } + /* Always get VRSAVE back */ + if (v_regs != NULL && tm_v_regs != NULL) { + err |= __get_user(current->thread.vrsave, + (u32 __user *)&v_regs[33]); + err |= __get_user(current->thread.transact_vrsave, + (u32 __user *)&tm_v_regs[33]); + } + else { + current->thread.vrsave = 0; + current->thread.transact_vrsave = 0; + } + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + mtspr(SPRN_VRSAVE, current->thread.vrsave); +#endif /* CONFIG_ALTIVEC */ + /* restore floating point */ + err |= copy_fpr_from_user(current, &sc->fp_regs); + err |= copy_transact_fpr_from_user(current, &tm_sc->fp_regs); +#ifdef CONFIG_VSX + /* + * Get additional VSX data. Update v_regs to point after the + * VMX data. Copy VSX low doubleword from userspace to local + * buffer for formatting, then into the taskstruct. + */ + if (v_regs && ((msr & MSR_VSX) != 0)) { + v_regs += ELF_NVRREG; + tm_v_regs += ELF_NVRREG; + err |= copy_vsx_from_user(current, v_regs); + err |= copy_transact_vsx_from_user(current, tm_v_regs); + } else { + for (i = 0; i < 32 ; i++) { + current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0; + current->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET] = 0; + } + } #endif + tm_enable(); + /* Make sure the transaction is marked as failed */ + current->thread.tm_texasr |= TEXASR_FS; + /* This loads the checkpointed FP/VEC state, if used */ + tm_recheckpoint(¤t->thread, msr); + + /* This loads the speculative FP/VEC state, if used */ + if (msr & MSR_FP) { + do_load_up_transact_fpu(¤t->thread); + regs->msr |= (MSR_FP | current->thread.fpexc_mode); + } +#ifdef CONFIG_ALTIVEC + if (msr & MSR_VEC) { + do_load_up_transact_altivec(¤t->thread); + regs->msr |= MSR_VEC; + } +#endif + return err; } +#endif /* * Setup the trampoline code on the stack @@ -355,6 +661,9 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5, { struct ucontext __user *uc = (struct ucontext __user *)regs->gpr[1]; sigset_t set; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + unsigned long msr; +#endif /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; @@ -365,22 +674,31 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5, if (__copy_from_user(&set, &uc->uc_sigmask, sizeof(set))) goto badframe; set_current_blocked(&set); +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR])) + goto badframe; + if (MSR_TM_ACTIVE(msr)) { + /* We recheckpoint on return. */ + struct ucontext __user *uc_transact; + if (__get_user(uc_transact, &uc->uc_link)) + goto badframe; + if (restore_tm_sigcontexts(regs, &uc->uc_mcontext, + &uc_transact->uc_mcontext)) + goto badframe; + } + else + /* Fall through, for non-TM restore */ +#endif if (restore_sigcontext(regs, NULL, 1, &uc->uc_mcontext)) goto badframe; - /* do_sigaltstack expects a __user pointer and won't modify - * what's in there anyway - */ - do_sigaltstack(&uc->uc_stack, NULL, regs->gpr[1]); + if (restore_altstack(&uc->uc_stack)) + goto badframe; set_thread_flag(TIF_RESTOREALL); return 0; badframe: -#if DEBUG_SIG - printk("badframe in sys_rt_sigreturn, regs=%p uc=%p &uc->uc_mcontext=%p\n", - regs, uc, &uc->uc_mcontext); -#endif if (show_unhandled_signals) printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32, current->comm, current->pid, "rt_sigreturn", @@ -393,17 +711,11 @@ badframe: int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) { - /* Handler is *really* a pointer to the function descriptor for - * the signal routine. The first entry in the function - * descriptor is the entry address of signal and the second - * entry is the TOC value we need to use. - */ - func_descr_t __user *funct_desc_ptr; struct rt_sigframe __user *frame; unsigned long newsp = 0; long err = 0; - frame = get_sigframe(ka, regs, sizeof(*frame), 0); + frame = get_sigframe(ka, get_tm_stackpointer(regs), sizeof(*frame), 0); if (unlikely(frame == NULL)) goto badframe; @@ -415,19 +727,32 @@ int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info, /* Create the ucontext. */ err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->gpr[1]), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, signr, NULL, - (unsigned long)ka->sa.sa_handler, 1); + err |= __save_altstack(&frame->uc.uc_stack, regs->gpr[1]); +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (MSR_TM_ACTIVE(regs->msr)) { + /* The ucontext_t passed to userland points to the second + * ucontext_t (for transactional state) with its uc_link ptr. + */ + err |= __put_user(&frame->uc_transact, &frame->uc.uc_link); + err |= setup_tm_sigcontexts(&frame->uc.uc_mcontext, + &frame->uc_transact.uc_mcontext, + regs, signr, + NULL, + (unsigned long)ka->sa.sa_handler); + } else +#endif + { + err |= __put_user(0, &frame->uc.uc_link); + err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, signr, + NULL, (unsigned long)ka->sa.sa_handler, + 1); + } err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) goto badframe; /* Make sure signal handler doesn't get spurious FP exceptions */ - current->thread.fpscr.val = 0; + current->thread.fp_state.fpscr = 0; /* Set up to return from userspace. */ if (vdso64_rt_sigtramp && current->mm->context.vdso_base) { @@ -438,18 +763,32 @@ int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info, goto badframe; regs->link = (unsigned long) &frame->tramp[0]; } - funct_desc_ptr = (func_descr_t __user *) ka->sa.sa_handler; /* Allocate a dummy caller frame for the signal handler. */ newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE; err |= put_user(regs->gpr[1], (unsigned long __user *)newsp); /* Set up "regs" so we "return" to the signal handler. */ - err |= get_user(regs->nip, &funct_desc_ptr->entry); - /* enter the signal handler in big-endian mode */ + if (is_elf2_task()) { + regs->nip = (unsigned long) ka->sa.sa_handler; + regs->gpr[12] = regs->nip; + } else { + /* Handler is *really* a pointer to the function descriptor for + * the signal routine. The first entry in the function + * descriptor is the entry address of signal and the second + * entry is the TOC value we need to use. + */ + func_descr_t __user *funct_desc_ptr = + (func_descr_t __user *) ka->sa.sa_handler; + + err |= get_user(regs->nip, &funct_desc_ptr->entry); + err |= get_user(regs->gpr[2], &funct_desc_ptr->toc); + } + + /* enter the signal handler in native-endian mode */ regs->msr &= ~MSR_LE; + regs->msr |= (MSR_KERNEL & MSR_LE); regs->gpr[1] = newsp; - err |= get_user(regs->gpr[2], &funct_desc_ptr->toc); regs->gpr[3] = signr; regs->result = 0; if (ka->sa.sa_flags & SA_SIGINFO) { @@ -465,10 +804,6 @@ int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info, return 1; badframe: -#if DEBUG_SIG - printk("badframe in setup_rt_frame, regs=%p frame=%p newsp=%lx\n", - regs, frame, newsp); -#endif if (show_unhandled_signals) printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32, current->comm, current->pid, "setup_rt_frame", diff --git a/arch/powerpc/kernel/smp-tbsync.c b/arch/powerpc/kernel/smp-tbsync.c index e68fd1ae727..7a37ecd3afa 100644 --- a/arch/powerpc/kernel/smp-tbsync.c +++ b/arch/powerpc/kernel/smp-tbsync.c @@ -9,7 +9,6 @@ #include <linux/sched.h> #include <linux/smp.h> #include <linux/unistd.h> -#include <linux/init.h> #include <linux/slab.h> #include <linux/atomic.h> #include <asm/smp.h> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 793401e6508..1007fb802e6 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -35,6 +35,8 @@ #include <asm/ptrace.h> #include <linux/atomic.h> #include <asm/irq.h> +#include <asm/hw_irq.h> +#include <asm/kvm_ppc.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/prom.h> @@ -81,6 +83,28 @@ int smt_enabled_at_boot = 1; static void (*crash_ipi_function_ptr)(struct pt_regs *) = NULL; +/* + * Returns 1 if the specified cpu should be brought up during boot. + * Used to inhibit booting threads if they've been disabled or + * limited on the command line + */ +int smp_generic_cpu_bootable(unsigned int nr) +{ + /* Special case - we inhibit secondary thread startup + * during boot if the user requests it. + */ + if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) { + if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) + return 0; + if (smt_enabled_at_boot + && cpu_thread_in_core(nr) >= smt_enabled_at_boot) + return 0; + } + + return 1; +} + + #ifdef CONFIG_PPC64 int smp_generic_kick_cpu(int nr) { @@ -123,9 +147,9 @@ static irqreturn_t reschedule_action(int irq, void *data) return IRQ_HANDLED; } -static irqreturn_t call_function_single_action(int irq, void *data) +static irqreturn_t tick_broadcast_ipi_action(int irq, void *data) { - generic_smp_call_function_single_interrupt(); + tick_broadcast_ipi_handler(); return IRQ_HANDLED; } @@ -146,14 +170,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data) static irq_handler_t smp_ipi_action[] = { [PPC_MSG_CALL_FUNCTION] = call_function_action, [PPC_MSG_RESCHEDULE] = reschedule_action, - [PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action, + [PPC_MSG_TICK_BROADCAST] = tick_broadcast_ipi_action, [PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action, }; const char *smp_ipi_name[] = { [PPC_MSG_CALL_FUNCTION] = "ipi call function", [PPC_MSG_RESCHEDULE] = "ipi reschedule", - [PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single", + [PPC_MSG_TICK_BROADCAST] = "ipi tick-broadcast", [PPC_MSG_DEBUGGER_BREAK] = "ipi debugger", }; @@ -172,7 +196,7 @@ int smp_request_message_ipi(int virq, int msg) #endif err = request_irq(virq, smp_ipi_action[msg], IRQF_PERCPU | IRQF_NO_THREAD | IRQF_NO_SUSPEND, - smp_ipi_name[msg], 0); + smp_ipi_name[msg], NULL); WARN(err < 0, "unable to request_irq %d for %s (rc %d)\n", virq, smp_ipi_name[msg], err); @@ -210,6 +234,12 @@ void smp_muxed_ipi_message_pass(int cpu, int msg) smp_ops->cause_ipi(cpu, info->data); } +#ifdef __BIG_ENDIAN__ +#define IPI_MESSAGE(A) (1 << (24 - 8 * (A))) +#else +#define IPI_MESSAGE(A) (1 << (8 * (A))) +#endif + irqreturn_t smp_ipi_demux(void) { struct cpu_messages *info = &__get_cpu_var(ipi_message); @@ -219,19 +249,14 @@ irqreturn_t smp_ipi_demux(void) do { all = xchg(&info->messages, 0); - -#ifdef __BIG_ENDIAN - if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNCTION))) + if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION)) generic_smp_call_function_interrupt(); - if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE))) + if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE)) scheduler_ipi(); - if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE))) - generic_smp_call_function_single_interrupt(); - if (all & (1 << (24 - 8 * PPC_MSG_DEBUGGER_BREAK))) + if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST)) + tick_broadcast_ipi_handler(); + if (all & IPI_MESSAGE(PPC_MSG_DEBUGGER_BREAK)) debug_ipi_action(0, NULL); -#else -#error Unsupported ENDIAN -#endif } while (info->messages); return IRQ_HANDLED; @@ -257,7 +282,7 @@ EXPORT_SYMBOL_GPL(smp_send_reschedule); void arch_send_call_function_single_ipi(int cpu) { - do_message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE); + do_message_pass(cpu, PPC_MSG_CALL_FUNCTION); } void arch_send_call_function_ipi_mask(const struct cpumask *mask) @@ -268,6 +293,16 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask) do_message_pass(cpu, PPC_MSG_CALL_FUNCTION); } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +void tick_broadcast(const struct cpumask *mask) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) + do_message_pass(cpu, PPC_MSG_TICK_BROADCAST); +} +#endif + #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC) void smp_send_debugger_break(void) { @@ -346,13 +381,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid)); cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid)); - if (smp_ops) - if (smp_ops->probe) - max_cpus = smp_ops->probe(); - else - max_cpus = NR_CPUS; - else - max_cpus = 1; + if (smp_ops && smp_ops->probe) + smp_ops->probe(); } void smp_prepare_boot_cpu(void) @@ -361,6 +391,7 @@ void smp_prepare_boot_cpu(void) #ifdef CONFIG_PPC64 paca[boot_cpuid].__current = current; #endif + set_numa_node(numa_cpu_lookup_table[boot_cpuid]); current_set[boot_cpuid] = task_thread_info(current); } @@ -428,38 +459,9 @@ int generic_check_cpu_restart(unsigned int cpu) return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE; } -static atomic_t secondary_inhibit_count; - -/* - * Don't allow secondary CPU threads to come online - */ -void inhibit_secondary_onlining(void) -{ - /* - * This makes secondary_inhibit_count stable during cpu - * online/offline operations. - */ - get_online_cpus(); - - atomic_inc(&secondary_inhibit_count); - put_online_cpus(); -} -EXPORT_SYMBOL_GPL(inhibit_secondary_onlining); - -/* - * Allow secondary CPU threads to come online again - */ -void uninhibit_secondary_onlining(void) -{ - get_online_cpus(); - atomic_dec(&secondary_inhibit_count); - put_online_cpus(); -} -EXPORT_SYMBOL_GPL(uninhibit_secondary_onlining); - -static int secondaries_inhibited(void) +static bool secondaries_inhibited(void) { - return atomic_read(&secondary_inhibit_count); + return kvm_hv_mode_active(); } #else /* HOTPLUG_CPU */ @@ -480,7 +482,7 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) secondary_ti = current_set[cpu] = ti; } -int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) +int __cpu_up(unsigned int cpu, struct task_struct *tidle) { int rc, c; @@ -488,7 +490,7 @@ int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) * Don't allow secondary threads to come online if inhibited */ if (threads_per_core > 1 && secondaries_inhibited() && - cpu % threads_per_core != 0) + cpu_thread_in_subcore(cpu)) return -EBUSY; if (smp_ops == NULL || @@ -557,7 +559,7 @@ int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) int cpu_to_core_id(int cpu) { struct device_node *np; - const int *reg; + const __be32 *reg; int id = -1; np = of_get_cpu_node(cpu, NULL); @@ -568,7 +570,7 @@ int cpu_to_core_id(int cpu) if (!reg) goto out; - id = *reg; + id = be32_to_cpup(reg); out: of_node_put(np); return id; @@ -587,6 +589,33 @@ int cpu_first_thread_of_core(int core) } EXPORT_SYMBOL_GPL(cpu_first_thread_of_core); +static void traverse_siblings_chip_id(int cpu, bool add, int chipid) +{ + const struct cpumask *mask; + struct device_node *np; + int i, plen; + const __be32 *prop; + + mask = add ? cpu_online_mask : cpu_present_mask; + for_each_cpu(i, mask) { + np = of_get_cpu_node(i, NULL); + if (!np) + continue; + prop = of_get_property(np, "ibm,chip-id", &plen); + if (prop && plen == sizeof(int) && + of_read_number(prop, 1) == chipid) { + if (add) { + cpumask_set_cpu(cpu, cpu_core_mask(i)); + cpumask_set_cpu(i, cpu_core_mask(cpu)); + } else { + cpumask_clear_cpu(cpu, cpu_core_mask(i)); + cpumask_clear_cpu(i, cpu_core_mask(cpu)); + } + } + of_node_put(np); + } +} + /* Must be called when no change can occur to cpu_present_mask, * i.e. during cpu online or offline. */ @@ -609,11 +638,51 @@ static struct device_node *cpu_to_l2cache(int cpu) return cache; } +static void traverse_core_siblings(int cpu, bool add) +{ + struct device_node *l2_cache, *np; + const struct cpumask *mask; + int i, chip, plen; + const __be32 *prop; + + /* First see if we have ibm,chip-id properties in cpu nodes */ + np = of_get_cpu_node(cpu, NULL); + if (np) { + chip = -1; + prop = of_get_property(np, "ibm,chip-id", &plen); + if (prop && plen == sizeof(int)) + chip = of_read_number(prop, 1); + of_node_put(np); + if (chip >= 0) { + traverse_siblings_chip_id(cpu, add, chip); + return; + } + } + + l2_cache = cpu_to_l2cache(cpu); + mask = add ? cpu_online_mask : cpu_present_mask; + for_each_cpu(i, mask) { + np = cpu_to_l2cache(i); + if (!np) + continue; + if (np == l2_cache) { + if (add) { + cpumask_set_cpu(cpu, cpu_core_mask(i)); + cpumask_set_cpu(i, cpu_core_mask(cpu)); + } else { + cpumask_clear_cpu(cpu, cpu_core_mask(i)); + cpumask_clear_cpu(i, cpu_core_mask(cpu)); + } + } + of_node_put(np); + } + of_node_put(l2_cache); +} + /* Activate a secondary processor. */ void start_secondary(void *unused) { unsigned int cpu = smp_processor_id(); - struct device_node *l2_cache; int i, base; atomic_inc(&init_mm.mm_count); @@ -637,12 +706,10 @@ void start_secondary(void *unused) vdso_getcpu_init(); #endif - notify_cpu_starting(cpu); - set_cpu_online(cpu, true); /* Update sibling maps */ base = cpu_first_thread_sibling(cpu); for (i = 0; i < threads_per_core; i++) { - if (cpu_is_offline(base + i)) + if (cpu_is_offline(base + i) && (cpu != base + i)) continue; cpumask_set_cpu(cpu, cpu_sibling_mask(base + i)); cpumask_set_cpu(base + i, cpu_sibling_mask(cpu)); @@ -654,22 +721,21 @@ void start_secondary(void *unused) cpumask_set_cpu(cpu, cpu_core_mask(base + i)); cpumask_set_cpu(base + i, cpu_core_mask(cpu)); } - l2_cache = cpu_to_l2cache(cpu); - for_each_online_cpu(i) { - struct device_node *np = cpu_to_l2cache(i); - if (!np) - continue; - if (np == l2_cache) { - cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, cpu_core_mask(cpu)); - } - of_node_put(np); - } - of_node_put(l2_cache); + traverse_core_siblings(cpu, true); + + /* + * numa_node_id() works after this. + */ + set_numa_node(numa_cpu_lookup_table[cpu]); + set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); + + smp_wmb(); + notify_cpu_starting(cpu); + set_cpu_online(cpu, true); local_irq_enable(); - cpu_idle(); + cpu_startup_entry(CPUHP_ONLINE); BUG(); } @@ -679,6 +745,28 @@ int setup_profiling_timer(unsigned int multiplier) return 0; } +#ifdef CONFIG_SCHED_SMT +/* cpumask of CPUs with asymetric SMT dependancy */ +static int powerpc_smt_flags(void) +{ + int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; + + if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { + printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); + flags |= SD_ASYM_PACKING; + } + return flags; +} +#endif + +static struct sched_domain_topology_level powerpc_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + void __init smp_cpus_done(unsigned int max_cpus) { cpumask_var_t old_mask; @@ -703,21 +791,13 @@ void __init smp_cpus_done(unsigned int max_cpus) dump_numa_cpu_topology(); -} + set_sched_topology(powerpc_topology); -int arch_sd_sibling_asym_packing(void) -{ - if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { - printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); - return SD_ASYM_PACKING; - } - return 0; } #ifdef CONFIG_HOTPLUG_CPU int __cpu_disable(void) { - struct device_node *l2_cache; int cpu = smp_processor_id(); int base, i; int err; @@ -737,20 +817,7 @@ int __cpu_disable(void) cpumask_clear_cpu(cpu, cpu_core_mask(base + i)); cpumask_clear_cpu(base + i, cpu_core_mask(cpu)); } - - l2_cache = cpu_to_l2cache(cpu); - for_each_present_cpu(i) { - struct device_node *np = cpu_to_l2cache(i); - if (!np) - continue; - if (np == l2_cache) { - cpumask_clear_cpu(cpu, cpu_core_mask(i)); - cpumask_clear_cpu(i, cpu_core_mask(cpu)); - } - of_node_put(np); - } - of_node_put(l2_cache); - + traverse_core_siblings(cpu, false); return 0; } @@ -761,18 +828,6 @@ void __cpu_die(unsigned int cpu) smp_ops->cpu_die(cpu); } -static DEFINE_MUTEX(powerpc_cpu_hotplug_driver_mutex); - -void cpu_hotplug_driver_lock() -{ - mutex_lock(&powerpc_cpu_hotplug_driver_mutex); -} - -void cpu_hotplug_driver_unlock() -{ - mutex_unlock(&powerpc_cpu_hotplug_driver_mutex); -} - void cpu_die(void) { if (ppc_md.cpu_die) diff --git a/arch/powerpc/kernel/softemu8xx.c b/arch/powerpc/kernel/softemu8xx.c deleted file mode 100644 index 29b2f81dd70..00000000000 --- a/arch/powerpc/kernel/softemu8xx.c +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Software emulation of some PPC instructions for the 8xx core. - * - * Copyright (C) 1998 Dan Malek (dmalek@jlc.net) - * - * Software floating emuation for the MPC8xx processor. I did this mostly - * because it was easier than trying to get the libraries compiled for - * software floating point. The goal is still to get the libraries done, - * but I lost patience and needed some hacks to at least get init and - * shells running. The first problem is the setjmp/longjmp that save - * and restore the floating point registers. - * - * For this emulation, our working registers are found on the register - * save area. - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/stddef.h> -#include <linux/unistd.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/interrupt.h> - -#include <asm/pgtable.h> -#include <asm/uaccess.h> -#include <asm/io.h> - -/* Eventually we may need a look-up table, but this works for now. -*/ -#define LFS 48 -#define LFD 50 -#define LFDU 51 -#define STFD 54 -#define STFDU 55 -#define FMR 63 - -void print_8xx_pte(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - - printk(" pte @ 0x%8lx: ", addr); - pgd = pgd_offset(mm, addr & PAGE_MASK); - if (pgd) { - pmd = pmd_offset(pud_offset(pgd, addr & PAGE_MASK), - addr & PAGE_MASK); - if (pmd && pmd_present(*pmd)) { - pte = pte_offset_kernel(pmd, addr & PAGE_MASK); - if (pte) { - printk(" (0x%08lx)->(0x%08lx)->0x%08lx\n", - (long)pgd, (long)pte, (long)pte_val(*pte)); -#define pp ((long)pte_val(*pte)) - printk(" RPN: %05lx PP: %lx SPS: %lx SH: %lx " - "CI: %lx v: %lx\n", - pp>>12, /* rpn */ - (pp>>10)&3, /* pp */ - (pp>>3)&1, /* small */ - (pp>>2)&1, /* shared */ - (pp>>1)&1, /* cache inhibit */ - pp&1 /* valid */ - ); -#undef pp - } - else { - printk("no pte\n"); - } - } - else { - printk("no pmd\n"); - } - } - else { - printk("no pgd\n"); - } -} - -int get_8xx_pte(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - int retval = 0; - - pgd = pgd_offset(mm, addr & PAGE_MASK); - if (pgd) { - pmd = pmd_offset(pud_offset(pgd, addr & PAGE_MASK), - addr & PAGE_MASK); - if (pmd && pmd_present(*pmd)) { - pte = pte_offset_kernel(pmd, addr & PAGE_MASK); - if (pte) { - retval = (int)pte_val(*pte); - } - } - } - return retval; -} - -/* - * We return 0 on success, 1 on unimplemented instruction, and EFAULT - * if a load/store faulted. - */ -int Soft_emulate_8xx(struct pt_regs *regs) -{ - u32 inst, instword; - u32 flreg, idxreg, disp; - int retval; - s16 sdisp; - u32 *ea, *ip; - - retval = 0; - - instword = *((u32 *)regs->nip); - inst = instword >> 26; - - flreg = (instword >> 21) & 0x1f; - idxreg = (instword >> 16) & 0x1f; - disp = instword & 0xffff; - - ea = (u32 *)(regs->gpr[idxreg] + disp); - ip = (u32 *)¤t->thread.TS_FPR(flreg); - - switch ( inst ) - { - case LFD: - /* this is a 16 bit quantity that is sign extended - * so use a signed short here -- Cort - */ - sdisp = (instword & 0xffff); - ea = (u32 *)(regs->gpr[idxreg] + sdisp); - if (copy_from_user(ip, ea, sizeof(double))) - retval = -EFAULT; - break; - - case LFDU: - if (copy_from_user(ip, ea, sizeof(double))) - retval = -EFAULT; - else - regs->gpr[idxreg] = (u32)ea; - break; - case LFS: - sdisp = (instword & 0xffff); - ea = (u32 *)(regs->gpr[idxreg] + sdisp); - if (copy_from_user(ip, ea, sizeof(float))) - retval = -EFAULT; - break; - case STFD: - /* this is a 16 bit quantity that is sign extended - * so use a signed short here -- Cort - */ - sdisp = (instword & 0xffff); - ea = (u32 *)(regs->gpr[idxreg] + sdisp); - if (copy_to_user(ea, ip, sizeof(double))) - retval = -EFAULT; - break; - - case STFDU: - if (copy_to_user(ea, ip, sizeof(double))) - retval = -EFAULT; - else - regs->gpr[idxreg] = (u32)ea; - break; - case FMR: - /* assume this is a fp move -- Cort */ - memcpy(ip, ¤t->thread.TS_FPR((instword>>11)&0x1f), - sizeof(double)); - break; - default: - retval = 1; - printk("Bad emulation %s/%d\n" - " NIP: %08lx instruction: %08x opcode: %x " - "A: %x B: %x C: %x code: %x rc: %x\n", - current->comm,current->pid, - regs->nip, - instword,inst, - (instword>>16)&0x1f, - (instword>>11)&0x1f, - (instword>>6)&0x1f, - (instword>>1)&0x3ff, - instword&1); - { - int pa; - print_8xx_pte(current->mm,regs->nip); - pa = get_8xx_pte(current->mm,regs->nip) & PAGE_MASK; - pa |= (regs->nip & ~PAGE_MASK); - pa = (unsigned long)__va(pa); - printk("Kernel VA for NIP %x ", pa); - print_8xx_pte(current->mm,pa); - } - } - - if (retval == 0) - regs->nip += 4; - - return retval; -} diff --git a/arch/powerpc/kernel/swsusp_asm64.S b/arch/powerpc/kernel/swsusp_asm64.S index 86ac1d90d02..988f38dced0 100644 --- a/arch/powerpc/kernel/swsusp_asm64.S +++ b/arch/powerpc/kernel/swsusp_asm64.S @@ -46,10 +46,19 @@ #define SL_r29 0xe8 #define SL_r30 0xf0 #define SL_r31 0xf8 -#define SL_SIZE SL_r31+8 +#define SL_SPRG1 0x100 +#define SL_TCR 0x108 +#define SL_SIZE SL_TCR+8 /* these macros rely on the save area being * pointed to by r11 */ + +#define SAVE_SPR(register) \ + mfspr r0, SPRN_##register ;\ + std r0, SL_##register(r11) +#define RESTORE_SPR(register) \ + ld r0, SL_##register(r11) ;\ + mtspr SPRN_##register, r0 #define SAVE_SPECIAL(special) \ mf##special r0 ;\ std r0, SL_##special(r11) @@ -103,8 +112,17 @@ _GLOBAL(swsusp_arch_suspend) SAVE_REGISTER(r30) SAVE_REGISTER(r31) SAVE_SPECIAL(MSR) - SAVE_SPECIAL(SDR1) SAVE_SPECIAL(XER) +#ifdef CONFIG_PPC_BOOK3S_64 +BEGIN_FW_FTR_SECTION + SAVE_SPECIAL(SDR1) +END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR) +#else + SAVE_SPR(TCR) + + /* Save SPRG1, SPRG1 be used save paca */ + SAVE_SPR(SPRG1) +#endif /* we push the stack up 128 bytes but don't store the * stack pointer on the stack like a real stackframe */ @@ -151,6 +169,7 @@ copy_page_loop: bne+ copyloop nothing_to_copy: +#ifdef CONFIG_PPC_BOOK3S_64 /* flush caches */ lis r3, 0x10 mtctr r3 @@ -167,6 +186,7 @@ nothing_to_copy: sync tlbia +#endif ld r11,swsusp_save_area_ptr@toc(r2) @@ -208,16 +228,41 @@ nothing_to_copy: RESTORE_REGISTER(r29) RESTORE_REGISTER(r30) RESTORE_REGISTER(r31) + +#ifdef CONFIG_PPC_BOOK3S_64 /* can't use RESTORE_SPECIAL(MSR) */ ld r0, SL_MSR(r11) mtmsrd r0, 0 +BEGIN_FW_FTR_SECTION RESTORE_SPECIAL(SDR1) +END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR) +#else + /* Restore SPRG1, be used to save paca */ + ld r0, SL_SPRG1(r11) + mtsprg 1, r0 + + RESTORE_SPECIAL(MSR) + + /* Restore TCR and clear any pending bits in TSR. */ + RESTORE_SPR(TCR) + lis r0, (TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS)@h + mtspr SPRN_TSR, r0 + + /* Kick decrementer */ + li r0, 1 + mtdec r0 + + /* Invalidate all tlbs */ + bl _tlbil_all +#endif RESTORE_SPECIAL(XER) sync addi r1,r1,-128 +#ifdef CONFIG_PPC_BOOK3S_64 bl slb_flush_and_rebolt +#endif bl do_after_copyback addi r1,r1,128 diff --git a/arch/powerpc/kernel/swsusp_booke.S b/arch/powerpc/kernel/swsusp_booke.S index 11a39307dd7..553c1405ee0 100644 --- a/arch/powerpc/kernel/swsusp_booke.S +++ b/arch/powerpc/kernel/swsusp_booke.S @@ -74,21 +74,21 @@ _GLOBAL(swsusp_arch_suspend) bne 1b /* Save SPRGs */ - mfsprg r4,0 + mfspr r4,SPRN_SPRG0 stw r4,SL_SPRG0(r11) - mfsprg r4,1 + mfspr r4,SPRN_SPRG1 stw r4,SL_SPRG1(r11) - mfsprg r4,2 + mfspr r4,SPRN_SPRG2 stw r4,SL_SPRG2(r11) - mfsprg r4,3 + mfspr r4,SPRN_SPRG3 stw r4,SL_SPRG3(r11) - mfsprg r4,4 + mfspr r4,SPRN_SPRG4 stw r4,SL_SPRG4(r11) - mfsprg r4,5 + mfspr r4,SPRN_SPRG5 stw r4,SL_SPRG5(r11) - mfsprg r4,6 + mfspr r4,SPRN_SPRG6 stw r4,SL_SPRG6(r11) - mfsprg r4,7 + mfspr r4,SPRN_SPRG7 stw r4,SL_SPRG7(r11) /* Call the low level suspend stuff (we should probably have made @@ -141,22 +141,30 @@ _GLOBAL(swsusp_arch_resume) lis r11,swsusp_save_area@h ori r11,r11,swsusp_save_area@l + /* + * Mappings from virtual addresses to physical addresses may be + * different than they were prior to restoring hibernation state. + * Invalidate the TLB so that the boot CPU is using the new + * mappings. + */ + bl _tlbil_all + lwz r4,SL_SPRG0(r11) - mtsprg 0,r4 + mtspr SPRN_SPRG0,r4 lwz r4,SL_SPRG1(r11) - mtsprg 1,r4 + mtspr SPRN_SPRG1,r4 lwz r4,SL_SPRG2(r11) - mtsprg 2,r4 + mtspr SPRN_SPRG2,r4 lwz r4,SL_SPRG3(r11) - mtsprg 3,r4 + mtspr SPRN_SPRG3,r4 lwz r4,SL_SPRG4(r11) - mtsprg 4,r4 + mtspr SPRN_SPRG4,r4 lwz r4,SL_SPRG5(r11) - mtsprg 5,r4 + mtspr SPRN_SPRG5,r4 lwz r4,SL_SPRG6(r11) - mtsprg 6,r4 + mtspr SPRN_SPRG6,r4 lwz r4,SL_SPRG7(r11) - mtsprg 7,r4 + mtspr SPRN_SPRG7,r4 /* restore the MSR */ lwz r3,SL_MSR(r11) diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 8a93778ed9f..8a285876aef 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -61,404 +61,6 @@ asmlinkage long ppc32_select(u32 n, compat_ulong_t __user *inp, return compat_sys_select((int)n, inp, outp, exp, compat_ptr(tvp_x)); } -/* Note: it is necessary to treat option as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sysfs(u32 option, u32 arg1, u32 arg2) -{ - return sys_sysfs((int)option, arg1, arg2); -} - -#ifdef CONFIG_SYSVIPC -long compat_sys_ipc(u32 call, u32 first, u32 second, u32 third, compat_uptr_t ptr, - u32 fifth) -{ - int version; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - - case SEMTIMEDOP: - if (fifth) - /* sign extend semid */ - return compat_sys_semtimedop((int)first, - compat_ptr(ptr), second, - compat_ptr(fifth)); - /* else fall through for normal semop() */ - case SEMOP: - /* struct sembuf is the same on 32 and 64bit :)) */ - /* sign extend semid */ - return sys_semtimedop((int)first, compat_ptr(ptr), second, - NULL); - case SEMGET: - /* sign extend key, nsems */ - return sys_semget((int)first, (int)second, third); - case SEMCTL: - /* sign extend semid, semnum */ - return compat_sys_semctl((int)first, (int)second, third, - compat_ptr(ptr)); - - case MSGSND: - /* sign extend msqid */ - return compat_sys_msgsnd((int)first, (int)second, third, - compat_ptr(ptr)); - case MSGRCV: - /* sign extend msqid, msgtyp */ - return compat_sys_msgrcv((int)first, second, (int)fifth, - third, version, compat_ptr(ptr)); - case MSGGET: - /* sign extend key */ - return sys_msgget((int)first, second); - case MSGCTL: - /* sign extend msqid */ - return compat_sys_msgctl((int)first, second, compat_ptr(ptr)); - - case SHMAT: - /* sign extend shmid */ - return compat_sys_shmat((int)first, second, third, version, - compat_ptr(ptr)); - case SHMDT: - return sys_shmdt(compat_ptr(ptr)); - case SHMGET: - /* sign extend key_t */ - return sys_shmget((int)first, second, third); - case SHMCTL: - /* sign extend shmid */ - return compat_sys_shmctl((int)first, second, compat_ptr(ptr)); - - default: - return -ENOSYS; - } - - return -ENOSYS; -} -#endif - -/* Note: it is necessary to treat out_fd and in_fd as unsigned ints, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sendfile_wrapper(u32 out_fd, u32 in_fd, - compat_off_t __user *offset, u32 count) -{ - return compat_sys_sendfile((int)out_fd, (int)in_fd, offset, count); -} - -asmlinkage long compat_sys_sendfile64_wrapper(u32 out_fd, u32 in_fd, - compat_loff_t __user *offset, u32 count) -{ - return sys_sendfile((int)out_fd, (int)in_fd, - (off_t __user *)offset, count); -} - -/* Note: it is necessary to treat option as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_prctl(u32 option, u32 arg2, u32 arg3, u32 arg4, u32 arg5) -{ - return sys_prctl((int)option, - (unsigned long) arg2, - (unsigned long) arg3, - (unsigned long) arg4, - (unsigned long) arg5); -} - -/* Note: it is necessary to treat pid as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sched_rr_get_interval_wrapper(u32 pid, - struct compat_timespec __user *interval) -{ - return compat_sys_sched_rr_get_interval((int)pid, interval); -} - -/* Note: it is necessary to treat mode as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_access(const char __user * filename, u32 mode) -{ - return sys_access(filename, (int)mode); -} - - -/* Note: it is necessary to treat mode as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_creat(const char __user * pathname, u32 mode) -{ - return sys_creat(pathname, (int)mode); -} - - -/* Note: it is necessary to treat pid and options as unsigned ints, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_waitpid(u32 pid, unsigned int __user * stat_addr, u32 options) -{ - return sys_waitpid((int)pid, stat_addr, (int)options); -} - - -/* Note: it is necessary to treat gidsetsize as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_getgroups(u32 gidsetsize, gid_t __user *grouplist) -{ - return sys_getgroups((int)gidsetsize, grouplist); -} - - -/* Note: it is necessary to treat pid as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_getpgid(u32 pid) -{ - return sys_getpgid((int)pid); -} - - - -/* Note: it is necessary to treat pid as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_getsid(u32 pid) -{ - return sys_getsid((int)pid); -} - - -/* Note: it is necessary to treat pid and sig as unsigned ints, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_kill(u32 pid, u32 sig) -{ - return sys_kill((int)pid, (int)sig); -} - - -/* Note: it is necessary to treat mode as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_mkdir(const char __user * pathname, u32 mode) -{ - return sys_mkdir(pathname, (int)mode); -} - -long compat_sys_nice(u32 increment) -{ - /* sign extend increment */ - return sys_nice((int)increment); -} - -off_t ppc32_lseek(unsigned int fd, u32 offset, unsigned int origin) -{ - /* sign extend n */ - return sys_lseek(fd, (int)offset, origin); -} - -long compat_sys_truncate(const char __user * path, u32 length) -{ - /* sign extend length */ - return sys_truncate(path, (int)length); -} - -long compat_sys_ftruncate(int fd, u32 length) -{ - /* sign extend length */ - return sys_ftruncate(fd, (int)length); -} - -/* Note: it is necessary to treat bufsiz as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_readlink(const char __user * path, char __user * buf, u32 bufsiz) -{ - return sys_readlink(path, buf, (int)bufsiz); -} - -/* Note: it is necessary to treat option as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sched_get_priority_max(u32 policy) -{ - return sys_sched_get_priority_max((int)policy); -} - - -/* Note: it is necessary to treat policy as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sched_get_priority_min(u32 policy) -{ - return sys_sched_get_priority_min((int)policy); -} - - -/* Note: it is necessary to treat pid as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sched_getparam(u32 pid, struct sched_param __user *param) -{ - return sys_sched_getparam((int)pid, param); -} - - -/* Note: it is necessary to treat pid as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sched_getscheduler(u32 pid) -{ - return sys_sched_getscheduler((int)pid); -} - - -/* Note: it is necessary to treat pid as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sched_setparam(u32 pid, struct sched_param __user *param) -{ - return sys_sched_setparam((int)pid, param); -} - - -/* Note: it is necessary to treat pid and policy as unsigned ints, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sched_setscheduler(u32 pid, u32 policy, struct sched_param __user *param) -{ - return sys_sched_setscheduler((int)pid, (int)policy, param); -} - - -/* Note: it is necessary to treat len as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_setdomainname(char __user *name, u32 len) -{ - return sys_setdomainname(name, (int)len); -} - - -/* Note: it is necessary to treat gidsetsize as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_setgroups(u32 gidsetsize, gid_t __user *grouplist) -{ - return sys_setgroups((int)gidsetsize, grouplist); -} - - -asmlinkage long compat_sys_sethostname(char __user *name, u32 len) -{ - /* sign extend len */ - return sys_sethostname(name, (int)len); -} - - -/* Note: it is necessary to treat pid and pgid as unsigned ints, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_setpgid(u32 pid, u32 pgid) -{ - return sys_setpgid((int)pid, (int)pgid); -} - -long compat_sys_getpriority(u32 which, u32 who) -{ - /* sign extend which and who */ - return sys_getpriority((int)which, (int)who); -} - -long compat_sys_setpriority(u32 which, u32 who, u32 niceval) -{ - /* sign extend which, who and niceval */ - return sys_setpriority((int)which, (int)who, (int)niceval); -} - -long compat_sys_ioprio_get(u32 which, u32 who) -{ - /* sign extend which and who */ - return sys_ioprio_get((int)which, (int)who); -} - -long compat_sys_ioprio_set(u32 which, u32 who, u32 ioprio) -{ - /* sign extend which, who and ioprio */ - return sys_ioprio_set((int)which, (int)who, (int)ioprio); -} - -/* Note: it is necessary to treat newmask as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_ssetmask(u32 newmask) -{ - return sys_ssetmask((int) newmask); -} - -asmlinkage long compat_sys_syslog(u32 type, char __user * buf, u32 len) -{ - /* sign extend len */ - return sys_syslog(type, buf, (int)len); -} - - -/* Note: it is necessary to treat mask as an unsigned int, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_umask(u32 mask) -{ - return sys_umask((int)mask); -} - unsigned long compat_sys_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) @@ -467,12 +69,6 @@ unsigned long compat_sys_mmap2(unsigned long addr, size_t len, return sys_mmap(addr, len, prot, flags, fd, pgoff << 12); } -long compat_sys_tgkill(u32 tgid, u32 pid, int sig) -{ - /* sign extend tgid, pid */ - return sys_tgkill((int)tgid, (int)pid, sig); -} - /* * long long munging: * The 32 bit ABI passes long longs in an odd even register pair. @@ -514,13 +110,6 @@ asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long h return sys_ftruncate(fd, (high << 32) | low); } -long ppc32_lookup_dcookie(u32 cookie_high, u32 cookie_low, char __user *buf, - size_t len) -{ - return sys_lookup_dcookie((u64)cookie_high << 32 | cookie_low, - buf, len); -} - long ppc32_fadvise64(int fd, u32 unused, u32 offset_high, u32 offset_low, size_t len, int advice) { @@ -528,23 +117,6 @@ long ppc32_fadvise64(int fd, u32 unused, u32 offset_high, u32 offset_low, advice); } -asmlinkage long compat_sys_add_key(const char __user *_type, - const char __user *_description, - const void __user *_payload, - u32 plen, - u32 ringid) -{ - return sys_add_key(_type, _description, _payload, plen, ringid); -} - -asmlinkage long compat_sys_request_key(const char __user *_type, - const char __user *_description, - const char __user *_callout_info, - u32 destringid) -{ - return sys_request_key(_type, _description, _callout_info, destringid); -} - asmlinkage long compat_sys_sync_file_range2(int fd, unsigned int flags, unsigned offset_hi, unsigned offset_lo, unsigned nbytes_hi, unsigned nbytes_lo) @@ -554,11 +126,3 @@ asmlinkage long compat_sys_sync_file_range2(int fd, unsigned int flags, return sys_sync_file_range(fd, offset, nbytes, flags); } - -asmlinkage long compat_sys_fanotify_mark(int fanotify_fd, unsigned int flags, - unsigned mask_hi, unsigned mask_lo, - int dfd, const char __user *pathname) -{ - u64 mask = ((u64)mask_hi << 32) | mask_lo; - return sys_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname); -} diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index 4e3cc47f26b..cd9be9aa016 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -34,7 +34,6 @@ #include <linux/ipc.h> #include <linux/utsname.h> #include <linux/file.h> -#include <linux/init.h> #include <linux/personality.h> #include <asm/uaccess.h> diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 3ce1f864c2d..67fd2fd2620 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -17,6 +17,7 @@ #include <asm/machdep.h> #include <asm/smp.h> #include <asm/pmc.h> +#include <asm/firmware.h> #include "cacheinfo.h" @@ -50,8 +51,6 @@ static ssize_t store_smt_snooze_delay(struct device *dev, return -EINVAL; per_cpu(smt_snooze_delay, cpu->dev.id) = snooze; - update_smt_snooze_delay(cpu->dev.id, snooze); - return count; } @@ -85,6 +84,304 @@ __setup("smt-snooze-delay=", setup_smt_snooze_delay); #endif /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC_FSL_BOOK3E +#define MAX_BIT 63 + +static u64 pw20_wt; +static u64 altivec_idle_wt; + +static unsigned int get_idle_ticks_bit(u64 ns) +{ + u64 cycle; + + if (ns >= 10000) + cycle = div_u64(ns + 500, 1000) * tb_ticks_per_usec; + else + cycle = div_u64(ns * tb_ticks_per_usec, 1000); + + if (!cycle) + return 0; + + return ilog2(cycle); +} + +static void do_show_pwrmgtcr0(void *val) +{ + u32 *value = val; + + *value = mfspr(SPRN_PWRMGTCR0); +} + +static ssize_t show_pw20_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u32 value; + unsigned int cpu = dev->id; + + smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1); + + value &= PWRMGTCR0_PW20_WAIT; + + return sprintf(buf, "%u\n", value ? 1 : 0); +} + +static void do_store_pw20_state(void *val) +{ + u32 *value = val; + u32 pw20_state; + + pw20_state = mfspr(SPRN_PWRMGTCR0); + + if (*value) + pw20_state |= PWRMGTCR0_PW20_WAIT; + else + pw20_state &= ~PWRMGTCR0_PW20_WAIT; + + mtspr(SPRN_PWRMGTCR0, pw20_state); +} + +static ssize_t store_pw20_state(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u32 value; + unsigned int cpu = dev->id; + + if (kstrtou32(buf, 0, &value)) + return -EINVAL; + + if (value > 1) + return -EINVAL; + + smp_call_function_single(cpu, do_store_pw20_state, &value, 1); + + return count; +} + +static ssize_t show_pw20_wait_time(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u32 value; + u64 tb_cycle = 1; + u64 time; + + unsigned int cpu = dev->id; + + if (!pw20_wt) { + smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1); + value = (value & PWRMGTCR0_PW20_ENT) >> + PWRMGTCR0_PW20_ENT_SHIFT; + + tb_cycle = (tb_cycle << (MAX_BIT - value + 1)); + /* convert ms to ns */ + if (tb_ticks_per_usec > 1000) { + time = div_u64(tb_cycle, tb_ticks_per_usec / 1000); + } else { + u32 rem_us; + + time = div_u64_rem(tb_cycle, tb_ticks_per_usec, + &rem_us); + time = time * 1000 + rem_us * 1000 / tb_ticks_per_usec; + } + } else { + time = pw20_wt; + } + + return sprintf(buf, "%llu\n", time > 0 ? time : 0); +} + +static void set_pw20_wait_entry_bit(void *val) +{ + u32 *value = val; + u32 pw20_idle; + + pw20_idle = mfspr(SPRN_PWRMGTCR0); + + /* Set Automatic PW20 Core Idle Count */ + /* clear count */ + pw20_idle &= ~PWRMGTCR0_PW20_ENT; + + /* set count */ + pw20_idle |= ((MAX_BIT - *value) << PWRMGTCR0_PW20_ENT_SHIFT); + + mtspr(SPRN_PWRMGTCR0, pw20_idle); +} + +static ssize_t store_pw20_wait_time(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u32 entry_bit; + u64 value; + + unsigned int cpu = dev->id; + + if (kstrtou64(buf, 0, &value)) + return -EINVAL; + + if (!value) + return -EINVAL; + + entry_bit = get_idle_ticks_bit(value); + if (entry_bit > MAX_BIT) + return -EINVAL; + + pw20_wt = value; + + smp_call_function_single(cpu, set_pw20_wait_entry_bit, + &entry_bit, 1); + + return count; +} + +static ssize_t show_altivec_idle(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u32 value; + unsigned int cpu = dev->id; + + smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1); + + value &= PWRMGTCR0_AV_IDLE_PD_EN; + + return sprintf(buf, "%u\n", value ? 1 : 0); +} + +static void do_store_altivec_idle(void *val) +{ + u32 *value = val; + u32 altivec_idle; + + altivec_idle = mfspr(SPRN_PWRMGTCR0); + + if (*value) + altivec_idle |= PWRMGTCR0_AV_IDLE_PD_EN; + else + altivec_idle &= ~PWRMGTCR0_AV_IDLE_PD_EN; + + mtspr(SPRN_PWRMGTCR0, altivec_idle); +} + +static ssize_t store_altivec_idle(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u32 value; + unsigned int cpu = dev->id; + + if (kstrtou32(buf, 0, &value)) + return -EINVAL; + + if (value > 1) + return -EINVAL; + + smp_call_function_single(cpu, do_store_altivec_idle, &value, 1); + + return count; +} + +static ssize_t show_altivec_idle_wait_time(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u32 value; + u64 tb_cycle = 1; + u64 time; + + unsigned int cpu = dev->id; + + if (!altivec_idle_wt) { + smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1); + value = (value & PWRMGTCR0_AV_IDLE_CNT) >> + PWRMGTCR0_AV_IDLE_CNT_SHIFT; + + tb_cycle = (tb_cycle << (MAX_BIT - value + 1)); + /* convert ms to ns */ + if (tb_ticks_per_usec > 1000) { + time = div_u64(tb_cycle, tb_ticks_per_usec / 1000); + } else { + u32 rem_us; + + time = div_u64_rem(tb_cycle, tb_ticks_per_usec, + &rem_us); + time = time * 1000 + rem_us * 1000 / tb_ticks_per_usec; + } + } else { + time = altivec_idle_wt; + } + + return sprintf(buf, "%llu\n", time > 0 ? time : 0); +} + +static void set_altivec_idle_wait_entry_bit(void *val) +{ + u32 *value = val; + u32 altivec_idle; + + altivec_idle = mfspr(SPRN_PWRMGTCR0); + + /* Set Automatic AltiVec Idle Count */ + /* clear count */ + altivec_idle &= ~PWRMGTCR0_AV_IDLE_CNT; + + /* set count */ + altivec_idle |= ((MAX_BIT - *value) << PWRMGTCR0_AV_IDLE_CNT_SHIFT); + + mtspr(SPRN_PWRMGTCR0, altivec_idle); +} + +static ssize_t store_altivec_idle_wait_time(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u32 entry_bit; + u64 value; + + unsigned int cpu = dev->id; + + if (kstrtou64(buf, 0, &value)) + return -EINVAL; + + if (!value) + return -EINVAL; + + entry_bit = get_idle_ticks_bit(value); + if (entry_bit > MAX_BIT) + return -EINVAL; + + altivec_idle_wt = value; + + smp_call_function_single(cpu, set_altivec_idle_wait_entry_bit, + &entry_bit, 1); + + return count; +} + +/* + * Enable/Disable interface: + * 0, disable. 1, enable. + */ +static DEVICE_ATTR(pw20_state, 0600, show_pw20_state, store_pw20_state); +static DEVICE_ATTR(altivec_idle, 0600, show_altivec_idle, store_altivec_idle); + +/* + * Set wait time interface:(Nanosecond) + * Example: Base on TBfreq is 41MHZ. + * 1~48(ns): TB[63] + * 49~97(ns): TB[62] + * 98~195(ns): TB[61] + * 196~390(ns): TB[60] + * 391~780(ns): TB[59] + * 781~1560(ns): TB[58] + * ... + */ +static DEVICE_ATTR(pw20_wait_time, 0600, + show_pw20_wait_time, + store_pw20_wait_time); +static DEVICE_ATTR(altivec_idle_wait_time, 0600, + show_altivec_idle_wait_time, + store_altivec_idle_wait_time); +#endif + /* * Enabling PMCs will slow partition context switch times so we only do * it the first time we write to the PMCs. @@ -107,16 +404,18 @@ void ppc_enable_pmcs(void) } EXPORT_SYMBOL(ppc_enable_pmcs); -#define SYSFS_PMCSETUP(NAME, ADDRESS) \ +#define __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, EXTRA) \ static void read_##NAME(void *val) \ { \ *(unsigned long *)val = mfspr(ADDRESS); \ } \ static void write_##NAME(void *val) \ { \ - ppc_enable_pmcs(); \ + EXTRA; \ mtspr(ADDRESS, *(unsigned long *)val); \ -} \ +} + +#define __SYSFS_SPRSETUP_SHOW_STORE(NAME) \ static ssize_t show_##NAME(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ @@ -139,6 +438,15 @@ static ssize_t __used \ return count; \ } +#define SYSFS_PMCSETUP(NAME, ADDRESS) \ + __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, ppc_enable_pmcs()) \ + __SYSFS_SPRSETUP_SHOW_STORE(NAME) +#define SYSFS_SPRSETUP(NAME, ADDRESS) \ + __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, ) \ + __SYSFS_SPRSETUP_SHOW_STORE(NAME) + +#define SYSFS_SPRSETUP_SHOW_STORE(NAME) \ + __SYSFS_SPRSETUP_SHOW_STORE(NAME) /* Let's define all possible registers, we'll only hook up the ones * that are implemented on the current processor @@ -174,34 +482,50 @@ SYSFS_PMCSETUP(pmc7, SPRN_PMC7); SYSFS_PMCSETUP(pmc8, SPRN_PMC8); SYSFS_PMCSETUP(mmcra, SPRN_MMCRA); -SYSFS_PMCSETUP(purr, SPRN_PURR); -SYSFS_PMCSETUP(spurr, SPRN_SPURR); -SYSFS_PMCSETUP(dscr, SPRN_DSCR); -SYSFS_PMCSETUP(pir, SPRN_PIR); +SYSFS_SPRSETUP(purr, SPRN_PURR); +SYSFS_SPRSETUP(spurr, SPRN_SPURR); +SYSFS_SPRSETUP(pir, SPRN_PIR); +/* + Lets only enable read for phyp resources and + enable write when needed with a separate function. + Lets be conservative and default to pseries. +*/ static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra); -static DEVICE_ATTR(spurr, 0600, show_spurr, NULL); -static DEVICE_ATTR(dscr, 0600, show_dscr, store_dscr); -static DEVICE_ATTR(purr, 0600, show_purr, store_purr); +static DEVICE_ATTR(spurr, 0400, show_spurr, NULL); +static DEVICE_ATTR(purr, 0400, show_purr, store_purr); static DEVICE_ATTR(pir, 0400, show_pir, NULL); -unsigned long dscr_default = 0; -EXPORT_SYMBOL(dscr_default); +static unsigned long dscr_default; -static ssize_t show_dscr_default(struct device *dev, - struct device_attribute *attr, char *buf) +static void read_dscr(void *val) { - return sprintf(buf, "%lx\n", dscr_default); + *(unsigned long *)val = get_paca()->dscr_default; } -static void update_dscr(void *dummy) +static void write_dscr(void *val) { + get_paca()->dscr_default = *(unsigned long *)val; if (!current->thread.dscr_inherit) { - current->thread.dscr = dscr_default; - mtspr(SPRN_DSCR, dscr_default); + current->thread.dscr = *(unsigned long *)val; + mtspr(SPRN_DSCR, *(unsigned long *)val); } } +SYSFS_SPRSETUP_SHOW_STORE(dscr); +static DEVICE_ATTR(dscr, 0600, show_dscr, store_dscr); + +static void add_write_permission_dev_attr(struct device_attribute *attr) +{ + attr->attr.mode |= 0200; +} + +static ssize_t show_dscr_default(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%lx\n", dscr_default); +} + static ssize_t __used store_dscr_default(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -214,7 +538,7 @@ static ssize_t __used store_dscr_default(struct device *dev, return -EINVAL; dscr_default = val; - on_each_cpu(update_dscr, NULL, 1); + on_each_cpu(write_dscr, &val, 1); return count; } @@ -238,34 +562,34 @@ SYSFS_PMCSETUP(pa6t_pmc3, SPRN_PA6T_PMC3); SYSFS_PMCSETUP(pa6t_pmc4, SPRN_PA6T_PMC4); SYSFS_PMCSETUP(pa6t_pmc5, SPRN_PA6T_PMC5); #ifdef CONFIG_DEBUG_KERNEL -SYSFS_PMCSETUP(hid0, SPRN_HID0); -SYSFS_PMCSETUP(hid1, SPRN_HID1); -SYSFS_PMCSETUP(hid4, SPRN_HID4); -SYSFS_PMCSETUP(hid5, SPRN_HID5); -SYSFS_PMCSETUP(ima0, SPRN_PA6T_IMA0); -SYSFS_PMCSETUP(ima1, SPRN_PA6T_IMA1); -SYSFS_PMCSETUP(ima2, SPRN_PA6T_IMA2); -SYSFS_PMCSETUP(ima3, SPRN_PA6T_IMA3); -SYSFS_PMCSETUP(ima4, SPRN_PA6T_IMA4); -SYSFS_PMCSETUP(ima5, SPRN_PA6T_IMA5); -SYSFS_PMCSETUP(ima6, SPRN_PA6T_IMA6); -SYSFS_PMCSETUP(ima7, SPRN_PA6T_IMA7); -SYSFS_PMCSETUP(ima8, SPRN_PA6T_IMA8); -SYSFS_PMCSETUP(ima9, SPRN_PA6T_IMA9); -SYSFS_PMCSETUP(imaat, SPRN_PA6T_IMAAT); -SYSFS_PMCSETUP(btcr, SPRN_PA6T_BTCR); -SYSFS_PMCSETUP(pccr, SPRN_PA6T_PCCR); -SYSFS_PMCSETUP(rpccr, SPRN_PA6T_RPCCR); -SYSFS_PMCSETUP(der, SPRN_PA6T_DER); -SYSFS_PMCSETUP(mer, SPRN_PA6T_MER); -SYSFS_PMCSETUP(ber, SPRN_PA6T_BER); -SYSFS_PMCSETUP(ier, SPRN_PA6T_IER); -SYSFS_PMCSETUP(sier, SPRN_PA6T_SIER); -SYSFS_PMCSETUP(siar, SPRN_PA6T_SIAR); -SYSFS_PMCSETUP(tsr0, SPRN_PA6T_TSR0); -SYSFS_PMCSETUP(tsr1, SPRN_PA6T_TSR1); -SYSFS_PMCSETUP(tsr2, SPRN_PA6T_TSR2); -SYSFS_PMCSETUP(tsr3, SPRN_PA6T_TSR3); +SYSFS_SPRSETUP(hid0, SPRN_HID0); +SYSFS_SPRSETUP(hid1, SPRN_HID1); +SYSFS_SPRSETUP(hid4, SPRN_HID4); +SYSFS_SPRSETUP(hid5, SPRN_HID5); +SYSFS_SPRSETUP(ima0, SPRN_PA6T_IMA0); +SYSFS_SPRSETUP(ima1, SPRN_PA6T_IMA1); +SYSFS_SPRSETUP(ima2, SPRN_PA6T_IMA2); +SYSFS_SPRSETUP(ima3, SPRN_PA6T_IMA3); +SYSFS_SPRSETUP(ima4, SPRN_PA6T_IMA4); +SYSFS_SPRSETUP(ima5, SPRN_PA6T_IMA5); +SYSFS_SPRSETUP(ima6, SPRN_PA6T_IMA6); +SYSFS_SPRSETUP(ima7, SPRN_PA6T_IMA7); +SYSFS_SPRSETUP(ima8, SPRN_PA6T_IMA8); +SYSFS_SPRSETUP(ima9, SPRN_PA6T_IMA9); +SYSFS_SPRSETUP(imaat, SPRN_PA6T_IMAAT); +SYSFS_SPRSETUP(btcr, SPRN_PA6T_BTCR); +SYSFS_SPRSETUP(pccr, SPRN_PA6T_PCCR); +SYSFS_SPRSETUP(rpccr, SPRN_PA6T_RPCCR); +SYSFS_SPRSETUP(der, SPRN_PA6T_DER); +SYSFS_SPRSETUP(mer, SPRN_PA6T_MER); +SYSFS_SPRSETUP(ber, SPRN_PA6T_BER); +SYSFS_SPRSETUP(ier, SPRN_PA6T_IER); +SYSFS_SPRSETUP(sier, SPRN_PA6T_SIER); +SYSFS_SPRSETUP(siar, SPRN_PA6T_SIAR); +SYSFS_SPRSETUP(tsr0, SPRN_PA6T_TSR0); +SYSFS_SPRSETUP(tsr1, SPRN_PA6T_TSR1); +SYSFS_SPRSETUP(tsr2, SPRN_PA6T_TSR2); +SYSFS_SPRSETUP(tsr3, SPRN_PA6T_TSR3); #endif /* CONFIG_DEBUG_KERNEL */ #endif /* HAS_PPC_PMC_PA6T */ @@ -341,7 +665,7 @@ static struct device_attribute pa6t_attrs[] = { #endif /* HAS_PPC_PMC_PA6T */ #endif /* HAS_PPC_PMC_CLASSIC */ -static void __cpuinit register_cpu_online(unsigned int cpu) +static void register_cpu_online(unsigned int cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); struct device *s = &c->dev; @@ -394,8 +718,11 @@ static void __cpuinit register_cpu_online(unsigned int cpu) if (cpu_has_feature(CPU_FTR_MMCRA)) device_create_file(s, &dev_attr_mmcra); - if (cpu_has_feature(CPU_FTR_PURR)) + if (cpu_has_feature(CPU_FTR_PURR)) { + if (!firmware_has_feature(FW_FEATURE_LPAR)) + add_write_permission_dev_attr(&dev_attr_purr); device_create_file(s, &dev_attr_purr); + } if (cpu_has_feature(CPU_FTR_SPURR)) device_create_file(s, &dev_attr_spurr); @@ -407,6 +734,15 @@ static void __cpuinit register_cpu_online(unsigned int cpu) device_create_file(s, &dev_attr_pir); #endif /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC_FSL_BOOK3E + if (PVR_VER(cur_cpu_spec->pvr_value) == PVR_VER_E6500) { + device_create_file(s, &dev_attr_pw20_state); + device_create_file(s, &dev_attr_pw20_wait_time); + + device_create_file(s, &dev_attr_altivec_idle); + device_create_file(s, &dev_attr_altivec_idle_wait_time); + } +#endif cacheinfo_cpu_online(cpu); } @@ -479,6 +815,15 @@ static void unregister_cpu_online(unsigned int cpu) device_remove_file(s, &dev_attr_pir); #endif /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC_FSL_BOOK3E + if (PVR_VER(cur_cpu_spec->pvr_value) == PVR_VER_E6500) { + device_remove_file(s, &dev_attr_pw20_state); + device_remove_file(s, &dev_attr_pw20_wait_time); + + device_remove_file(s, &dev_attr_altivec_idle); + device_remove_file(s, &dev_attr_altivec_idle_wait_time); + } +#endif cacheinfo_cpu_offline(cpu); } @@ -502,7 +847,7 @@ ssize_t arch_cpu_release(const char *buf, size_t count) #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit sysfs_cpu_notify(struct notifier_block *self, +static int sysfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned int)(long)hcpu; @@ -522,7 +867,7 @@ static int __cpuinit sysfs_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata sysfs_cpu_nb = { +static struct notifier_block sysfs_cpu_nb = { .notifier_call = sysfs_cpu_notify, }; @@ -643,7 +988,8 @@ static int __init topology_init(void) int cpu; register_nodes(); - register_cpu_notifier(&sysfs_cpu_nb); + + cpu_notifier_register_begin(); for_each_possible_cpu(cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); @@ -667,6 +1013,11 @@ static int __init topology_init(void) if (cpu_online(cpu)) register_cpu_online(cpu); } + + __register_cpu_notifier(&sysfs_cpu_nb); + + cpu_notifier_register_done(); + #ifdef CONFIG_PPC64 sysfs_create_dscr_default(); #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S index 93219c34af3..895c50ca943 100644 --- a/arch/powerpc/kernel/systbl.S +++ b/arch/powerpc/kernel/systbl.S @@ -17,12 +17,12 @@ #include <asm/ppc_asm.h> #ifdef CONFIG_PPC64 -#define SYSCALL(func) .llong .sys_##func,.sys_##func -#define COMPAT_SYS(func) .llong .sys_##func,.compat_sys_##func -#define PPC_SYS(func) .llong .ppc_##func,.ppc_##func -#define OLDSYS(func) .llong .sys_ni_syscall,.sys_ni_syscall -#define SYS32ONLY(func) .llong .sys_ni_syscall,.compat_sys_##func -#define SYSX(f, f3264, f32) .llong .f,.f3264 +#define SYSCALL(func) .llong DOTSYM(sys_##func),DOTSYM(sys_##func) +#define COMPAT_SYS(func) .llong DOTSYM(sys_##func),DOTSYM(compat_sys_##func) +#define PPC_SYS(func) .llong DOTSYM(ppc_##func),DOTSYM(ppc_##func) +#define OLDSYS(func) .llong DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall) +#define SYS32ONLY(func) .llong DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func) +#define SYSX(f, f3264, f32) .llong DOTSYM(f),DOTSYM(f3264) #else #define SYSCALL(func) .long sys_##func #define COMPAT_SYS(func) .long sys_##func @@ -36,6 +36,8 @@ #define PPC_SYS_SPU(func) PPC_SYS(func) #define SYSX_SPU(f, f3264, f32) SYSX(f, f3264, f32) +.section .rodata,"a" + #ifdef CONFIG_PPC64 #define sys_sigpending sys_ni_syscall #define sys_old_getrlimit sys_ni_syscall @@ -43,5 +45,7 @@ .p2align 3 #endif -_GLOBAL(sys_call_table) +.globl sys_call_table +sys_call_table: + #include <asm/systbl.h> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 6f6b1cccc91..9fff9cdcc51 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -42,6 +42,7 @@ #include <linux/timex.h> #include <linux/kernel_stat.h> #include <linux/time.h> +#include <linux/clockchips.h> #include <linux/init.h> #include <linux/profile.h> #include <linux/cpu.h> @@ -106,7 +107,7 @@ struct clock_event_device decrementer_clockevent = { .irq = 0, .set_next_event = decrementer_set_next_event, .set_mode = decrementer_set_mode, - .features = CLOCK_EVT_FEAT_ONESHOT, + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP, }; EXPORT_SYMBOL(decrementer_clockevent); @@ -143,7 +144,7 @@ EXPORT_SYMBOL_GPL(ppc_proc_freq); unsigned long ppc_tb_freq; EXPORT_SYMBOL_GPL(ppc_tb_freq); -#ifdef CONFIG_VIRT_CPU_ACCOUNTING +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Factors for converting from cputime_t (timebase ticks) to * jiffies, microseconds, seconds, and clock_t (1/USER_HZ seconds). @@ -210,23 +211,23 @@ static u64 scan_dispatch_log(u64 stop_tb) if (!dtl) return 0; - if (i == vpa->dtl_idx) + if (i == be64_to_cpu(vpa->dtl_idx)) return 0; - while (i < vpa->dtl_idx) { - if (dtl_consumer) - dtl_consumer(dtl, i); - dtb = dtl->timebase; - tb_delta = dtl->enqueue_to_dispatch_time + - dtl->ready_to_enqueue_time; + while (i < be64_to_cpu(vpa->dtl_idx)) { + dtb = be64_to_cpu(dtl->timebase); + tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) + + be32_to_cpu(dtl->ready_to_enqueue_time); barrier(); - if (i + N_DISPATCH_LOG < vpa->dtl_idx) { + if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) { /* buffer has overflowed */ - i = vpa->dtl_idx - N_DISPATCH_LOG; + i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG; dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG); continue; } if (dtb > stop_tb) break; + if (dtl_consumer) + dtl_consumer(dtl, i); stolen += tb_delta; ++i; ++dtl; @@ -269,7 +270,7 @@ static inline u64 calculate_stolen_time(u64 stop_tb) { u64 stolen = 0; - if (get_paca()->dtl_ridx != get_paca()->lppaca_ptr->dtl_idx) { + if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) { stolen = scan_dispatch_log(stop_tb); get_paca()->system_time -= stolen; } @@ -347,6 +348,7 @@ void vtime_account_system(struct task_struct *tsk) if (stolen) account_steal_time(stolen); } +EXPORT_SYMBOL_GPL(vtime_account_system); void vtime_account_idle(struct task_struct *tsk) { @@ -377,7 +379,7 @@ void vtime_account_user(struct task_struct *tsk) account_user_time(tsk, utime, utimescaled); } -#else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ +#else /* ! CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #define calc_cputime_factors() #endif @@ -477,43 +479,13 @@ void arch_irq_work_raise(void) #endif /* CONFIG_IRQ_WORK */ -/* - * timer_interrupt - gets called when the decrementer overflows, - * with interrupts disabled. - */ -void timer_interrupt(struct pt_regs * regs) +void __timer_interrupt(void) { - struct pt_regs *old_regs; + struct pt_regs *regs = get_irq_regs(); u64 *next_tb = &__get_cpu_var(decrementers_next_tb); struct clock_event_device *evt = &__get_cpu_var(decrementers); u64 now; - /* Ensure a positive value is written to the decrementer, or else - * some CPUs will continue to take decrementer exceptions. - */ - set_dec(DECREMENTER_MAX); - - /* Some implementations of hotplug will get timer interrupts while - * offline, just ignore these - */ - if (!cpu_online(smp_processor_id())) - return; - - /* Conditionally hard-enable interrupts now that the DEC has been - * bumped to its maximum value - */ - may_hard_irq_enable(); - - __get_cpu_var(irq_stat).timer_irqs++; - -#if defined(CONFIG_PPC32) && defined(CONFIG_PMAC) - if (atomic_read(&ppc_n_lost_interrupts) != 0) - do_IRQ(regs); -#endif - - old_regs = set_irq_regs(regs); - irq_enter(); - trace_timer_interrupt_entry(regs); if (test_irq_work_pending()) { @@ -526,10 +498,15 @@ void timer_interrupt(struct pt_regs * regs) *next_tb = ~(u64)0; if (evt->event_handler) evt->event_handler(evt); + __get_cpu_var(irq_stat).timer_irqs_event++; } else { now = *next_tb - now; if (now <= DECREMENTER_MAX) set_dec((int)now); + /* We may have raced with new irq work */ + if (test_irq_work_pending()) + set_dec(1); + __get_cpu_var(irq_stat).timer_irqs_others++; } #ifdef CONFIG_PPC64 @@ -541,7 +518,48 @@ void timer_interrupt(struct pt_regs * regs) #endif trace_timer_interrupt_exit(regs); +} +/* + * timer_interrupt - gets called when the decrementer overflows, + * with interrupts disabled. + */ +void timer_interrupt(struct pt_regs * regs) +{ + struct pt_regs *old_regs; + u64 *next_tb = &__get_cpu_var(decrementers_next_tb); + + /* Ensure a positive value is written to the decrementer, or else + * some CPUs will continue to take decrementer exceptions. + */ + set_dec(DECREMENTER_MAX); + + /* Some implementations of hotplug will get timer interrupts while + * offline, just ignore these and we also need to set + * decrementers_next_tb as MAX to make sure __check_irq_replay + * don't replay timer interrupt when return, otherwise we'll trap + * here infinitely :( + */ + if (!cpu_online(smp_processor_id())) { + *next_tb = ~(u64)0; + return; + } + + /* Conditionally hard-enable interrupts now that the DEC has been + * bumped to its maximum value + */ + may_hard_irq_enable(); + + +#if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC) + if (atomic_read(&ppc_n_lost_interrupts) != 0) + do_IRQ(regs); +#endif + + old_regs = set_irq_regs(regs); + irq_enter(); + + __timer_interrupt(); irq_exit(); set_irq_regs(old_regs); } @@ -606,7 +624,7 @@ unsigned long long sched_clock(void) static int __init get_freq(char *name, int cells, unsigned long *val) { struct device_node *cpu; - const unsigned int *fp; + const __be32 *fp; int found = 0; /* The cpu node should have timebase and clock frequency properties */ @@ -625,7 +643,6 @@ static int __init get_freq(char *name, int cells, unsigned long *val) return found; } -/* should become __cpuinit when secondary_cpu_time_init also is */ void start_cpu_decrementer(void) { #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) @@ -663,7 +680,7 @@ int update_persistent_clock(struct timespec now) struct rtc_time tm; if (!ppc_md.set_rtc_time) - return 0; + return -ENODEV; to_tm(now.tv_sec + 1 + timezone_offset, &tm); tm.tm_year -= 1900; @@ -798,6 +815,11 @@ static int decrementer_set_next_event(unsigned long evt, { __get_cpu_var(decrementers_next_tb) = get_tb_or_rtc() + evt; set_dec(evt); + + /* We may have raced with new irq work */ + if (test_irq_work_pending()) + set_dec(1); + return 0; } @@ -808,6 +830,15 @@ static void decrementer_set_mode(enum clock_event_mode mode, decrementer_set_next_event(DECREMENTER_MAX, dev); } +/* Interrupt handler for the timer broadcast IPI */ +void tick_broadcast_ipi_handler(void) +{ + u64 *next_tb = &__get_cpu_var(decrementers_next_tb); + + *next_tb = get_tb_or_rtc(); + __timer_interrupt(); +} + static void register_decrementer_clockevent(int cpu) { struct clock_event_device *dec = &per_cpu(decrementers, cpu); @@ -911,6 +942,7 @@ void __init time_init(void) clocksource_init(); init_decrementer_clockevent(); + tick_setup_hrtimer_broadcast(); } @@ -1043,10 +1075,8 @@ static int __init rtc_init(void) return -ENODEV; pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - return 0; + return PTR_ERR_OR_ZERO(pdev); } module_init(rtc_init); diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S new file mode 100644 index 00000000000..2a324f4cb1b --- /dev/null +++ b/arch/powerpc/kernel/tm.S @@ -0,0 +1,481 @@ +/* + * Transactional memory support routines to reclaim and recheckpoint + * transactional process state. + * + * Copyright 2012 Matt Evans & Michael Neuling, IBM Corporation. + */ + +#include <asm/asm-offsets.h> +#include <asm/ppc_asm.h> +#include <asm/ppc-opcode.h> +#include <asm/ptrace.h> +#include <asm/reg.h> +#include <asm/bug.h> + +#ifdef CONFIG_VSX +/* See fpu.S, this is borrowed from there */ +#define __SAVE_32FPRS_VSRS(n,c,base) \ +BEGIN_FTR_SECTION \ + b 2f; \ +END_FTR_SECTION_IFSET(CPU_FTR_VSX); \ + SAVE_32FPRS(n,base); \ + b 3f; \ +2: SAVE_32VSRS(n,c,base); \ +3: +#define __REST_32FPRS_VSRS(n,c,base) \ +BEGIN_FTR_SECTION \ + b 2f; \ +END_FTR_SECTION_IFSET(CPU_FTR_VSX); \ + REST_32FPRS(n,base); \ + b 3f; \ +2: REST_32VSRS(n,c,base); \ +3: +#else +#define __SAVE_32FPRS_VSRS(n,c,base) SAVE_32FPRS(n, base) +#define __REST_32FPRS_VSRS(n,c,base) REST_32FPRS(n, base) +#endif +#define SAVE_32FPRS_VSRS(n,c,base) \ + __SAVE_32FPRS_VSRS(n,__REG_##c,__REG_##base) +#define REST_32FPRS_VSRS(n,c,base) \ + __REST_32FPRS_VSRS(n,__REG_##c,__REG_##base) + +/* Stack frame offsets for local variables. */ +#define TM_FRAME_L0 TM_FRAME_SIZE-16 +#define TM_FRAME_L1 TM_FRAME_SIZE-8 + + +/* In order to access the TM SPRs, TM must be enabled. So, do so: */ +_GLOBAL(tm_enable) + mfmsr r4 + li r3, MSR_TM >> 32 + sldi r3, r3, 32 + and. r0, r4, r3 + bne 1f + or r4, r4, r3 + mtmsrd r4 +1: blr + +_GLOBAL(tm_save_sprs) + mfspr r0, SPRN_TFHAR + std r0, THREAD_TM_TFHAR(r3) + mfspr r0, SPRN_TEXASR + std r0, THREAD_TM_TEXASR(r3) + mfspr r0, SPRN_TFIAR + std r0, THREAD_TM_TFIAR(r3) + blr + +_GLOBAL(tm_restore_sprs) + ld r0, THREAD_TM_TFHAR(r3) + mtspr SPRN_TFHAR, r0 + ld r0, THREAD_TM_TEXASR(r3) + mtspr SPRN_TEXASR, r0 + ld r0, THREAD_TM_TFIAR(r3) + mtspr SPRN_TFIAR, r0 + blr + + /* Passed an 8-bit failure cause as first argument. */ +_GLOBAL(tm_abort) + TABORT(R3) + blr + +/* void tm_reclaim(struct thread_struct *thread, + * unsigned long orig_msr, + * uint8_t cause) + * + * - Performs a full reclaim. This destroys outstanding + * transactions and updates thread->regs.tm_ckpt_* with the + * original checkpointed state. Note that thread->regs is + * unchanged. + * - FP regs are written back to thread->transact_fpr before + * reclaiming. These are the transactional (current) versions. + * + * Purpose is to both abort transactions of, and preserve the state of, + * a transactions at a context switch. We preserve/restore both sets of process + * state to restore them when the thread's scheduled again. We continue in + * userland as though nothing happened, but when the transaction is resumed + * they will abort back to the checkpointed state we save out here. + * + * Call with IRQs off, stacks get all out of sync for some periods in here! + */ +_GLOBAL(tm_reclaim) + mfcr r6 + mflr r0 + stw r6, 8(r1) + std r0, 16(r1) + std r2, STK_GOT(r1) + stdu r1, -TM_FRAME_SIZE(r1) + + /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. */ + + std r3, STK_PARAM(R3)(r1) + SAVE_NVGPRS(r1) + + /* We need to setup MSR for VSX register save instructions. Here we + * also clear the MSR RI since when we do the treclaim, we won't have a + * valid kernel pointer for a while. We clear RI here as it avoids + * adding another mtmsr closer to the treclaim. This makes the region + * maked as non-recoverable wider than it needs to be but it saves on + * inserting another mtmsrd later. + */ + mfmsr r14 + mr r15, r14 + ori r15, r15, MSR_FP + li r16, MSR_RI + ori r16, r16, MSR_EE /* IRQs hard off */ + andc r15, r15, r16 + oris r15, r15, MSR_VEC@h +#ifdef CONFIG_VSX + BEGIN_FTR_SECTION + oris r15,r15, MSR_VSX@h + END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif + mtmsrd r15 + std r14, TM_FRAME_L0(r1) + + /* Stash the stack pointer away for use after reclaim */ + std r1, PACAR1(r13) + + /* ******************** FPR/VR/VSRs ************ + * Before reclaiming, capture the current/transactional FPR/VR + * versions /if used/. + * + * (If VSX used, FP and VMX are implied. Or, we don't need to look + * at MSR.VSX as copying FP regs if .FP, vector regs if .VMX covers it.) + * + * We're passed the thread's MSR as parameter 2. + * + * We enabled VEC/FP/VSX in the msr above, so we can execute these + * instructions! + */ + andis. r0, r4, MSR_VEC@h + beq dont_backup_vec + + addi r7, r3, THREAD_TRANSACT_VRSTATE + SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 transact vr state */ + mfvscr vr0 + li r6, VRSTATE_VSCR + stvx vr0, r7, r6 +dont_backup_vec: + mfspr r0, SPRN_VRSAVE + std r0, THREAD_TRANSACT_VRSAVE(r3) + + andi. r0, r4, MSR_FP + beq dont_backup_fp + + addi r7, r3, THREAD_TRANSACT_FPSTATE + SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 transact fp state */ + + mffs fr0 + stfd fr0,FPSTATE_FPSCR(r7) + +dont_backup_fp: + /* Do sanity check on MSR to make sure we are suspended */ + li r7, (MSR_TS_S)@higher + srdi r6, r14, 32 + and r6, r6, r7 +1: tdeqi r6, 0 + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0 + + /* The moment we treclaim, ALL of our GPRs will switch + * to user register state. (FPRs, CCR etc. also!) + * Use an sprg and a tm_scratch in the PACA to shuffle. + */ + TRECLAIM(R5) /* Cause in r5 */ + + /* ******************** GPRs ******************** */ + /* Stash the checkpointed r13 away in the scratch SPR and get the real + * paca + */ + SET_SCRATCH0(r13) + GET_PACA(r13) + + /* Stash the checkpointed r1 away in paca tm_scratch and get the real + * stack pointer back + */ + std r1, PACATMSCRATCH(r13) + ld r1, PACAR1(r13) + + /* Store the PPR in r11 and reset to decent value */ + std r11, GPR11(r1) /* Temporary stash */ + mfspr r11, SPRN_PPR + HMT_MEDIUM + + /* Now get some more GPRS free */ + std r7, GPR7(r1) /* Temporary stash */ + std r12, GPR12(r1) /* '' '' '' */ + ld r12, STK_PARAM(R3)(r1) /* Param 0, thread_struct * */ + + std r11, THREAD_TM_PPR(r12) /* Store PPR and free r11 */ + + addi r7, r12, PT_CKPT_REGS /* Thread's ckpt_regs */ + + /* Make r7 look like an exception frame so that we + * can use the neat GPRx(n) macros. r7 is NOT a pt_regs ptr! + */ + subi r7, r7, STACK_FRAME_OVERHEAD + + /* Sync the userland GPRs 2-12, 14-31 to thread->regs: */ + SAVE_GPR(0, r7) /* user r0 */ + SAVE_GPR(2, r7) /* user r2 */ + SAVE_4GPRS(3, r7) /* user r3-r6 */ + SAVE_GPR(8, r7) /* user r8 */ + SAVE_GPR(9, r7) /* user r9 */ + SAVE_GPR(10, r7) /* user r10 */ + ld r3, PACATMSCRATCH(r13) /* user r1 */ + ld r4, GPR7(r1) /* user r7 */ + ld r5, GPR11(r1) /* user r11 */ + ld r6, GPR12(r1) /* user r12 */ + GET_SCRATCH0(8) /* user r13 */ + std r3, GPR1(r7) + std r4, GPR7(r7) + std r5, GPR11(r7) + std r6, GPR12(r7) + std r8, GPR13(r7) + + SAVE_NVGPRS(r7) /* user r14-r31 */ + + /* ******************** NIP ******************** */ + mfspr r3, SPRN_TFHAR + std r3, _NIP(r7) /* Returns to failhandler */ + /* The checkpointed NIP is ignored when rescheduling/rechkpting, + * but is used in signal return to 'wind back' to the abort handler. + */ + + /* ******************** CR,LR,CCR,MSR ********** */ + mfctr r3 + mflr r4 + mfcr r5 + mfxer r6 + + std r3, _CTR(r7) + std r4, _LINK(r7) + std r5, _CCR(r7) + std r6, _XER(r7) + + + /* ******************** TAR, DSCR ********** */ + mfspr r3, SPRN_TAR + mfspr r4, SPRN_DSCR + + std r3, THREAD_TM_TAR(r12) + std r4, THREAD_TM_DSCR(r12) + + /* MSR and flags: We don't change CRs, and we don't need to alter + * MSR. + */ + + /* TM regs, incl TEXASR -- these live in thread_struct. Note they've + * been updated by the treclaim, to explain to userland the failure + * cause (aborted). + */ + mfspr r0, SPRN_TEXASR + mfspr r3, SPRN_TFHAR + mfspr r4, SPRN_TFIAR + std r0, THREAD_TM_TEXASR(r12) + std r3, THREAD_TM_TFHAR(r12) + std r4, THREAD_TM_TFIAR(r12) + + /* AMR is checkpointed too, but is unsupported by Linux. */ + + /* Restore original MSR/IRQ state & clear TM mode */ + ld r14, TM_FRAME_L0(r1) /* Orig MSR */ + li r15, 0 + rldimi r14, r15, MSR_TS_LG, (63-MSR_TS_LG)-1 + mtmsrd r14 + + REST_NVGPRS(r1) + + addi r1, r1, TM_FRAME_SIZE + lwz r4, 8(r1) + ld r0, 16(r1) + mtcr r4 + mtlr r0 + ld r2, STK_GOT(r1) + + /* Load CPU's default DSCR */ + ld r0, PACA_DSCR(r13) + mtspr SPRN_DSCR, r0 + + blr + + + /* void tm_recheckpoint(struct thread_struct *thread, + * unsigned long orig_msr) + * - Restore the checkpointed register state saved by tm_reclaim + * when we switch_to a process. + * + * Call with IRQs off, stacks get all out of sync for + * some periods in here! + */ +_GLOBAL(__tm_recheckpoint) + mfcr r5 + mflr r0 + stw r5, 8(r1) + std r0, 16(r1) + std r2, STK_GOT(r1) + stdu r1, -TM_FRAME_SIZE(r1) + + /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. + * This is used for backing up the NVGPRs: + */ + SAVE_NVGPRS(r1) + + /* Load complete register state from ts_ckpt* registers */ + + addi r7, r3, PT_CKPT_REGS /* Thread's ckpt_regs */ + + /* Make r7 look like an exception frame so that we + * can use the neat GPRx(n) macros. r7 is now NOT a pt_regs ptr! + */ + subi r7, r7, STACK_FRAME_OVERHEAD + + SET_SCRATCH0(r1) + + mfmsr r6 + /* R4 = original MSR to indicate whether thread used FP/Vector etc. */ + + /* Enable FP/vec in MSR if necessary! */ + lis r5, MSR_VEC@h + ori r5, r5, MSR_FP + and. r5, r4, r5 + beq restore_gprs /* if neither, skip both */ + +#ifdef CONFIG_VSX + BEGIN_FTR_SECTION + oris r5, r5, MSR_VSX@h + END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif + or r5, r6, r5 /* Set MSR.FP+.VSX/.VEC */ + mtmsr r5 + +#ifdef CONFIG_ALTIVEC + /* FP and VEC registers: These are recheckpointed from thread.fpr[] + * and thread.vr[] respectively. The thread.transact_fpr[] version + * is more modern, and will be loaded subsequently by any FPUnavailable + * trap. + */ + andis. r0, r4, MSR_VEC@h + beq dont_restore_vec + + addi r8, r3, THREAD_VRSTATE + li r5, VRSTATE_VSCR + lvx vr0, r8, r5 + mtvscr vr0 + REST_32VRS(0, r5, r8) /* r5 scratch, r8 ptr */ +dont_restore_vec: + ld r5, THREAD_VRSAVE(r3) + mtspr SPRN_VRSAVE, r5 +#endif + + andi. r0, r4, MSR_FP + beq dont_restore_fp + + addi r8, r3, THREAD_FPSTATE + lfd fr0, FPSTATE_FPSCR(r8) + MTFSF_L(fr0) + REST_32FPRS_VSRS(0, R4, R8) + +dont_restore_fp: + mtmsr r6 /* FP/Vec off again! */ + +restore_gprs: + + /* ******************** CR,LR,CCR,MSR ********** */ + ld r4, _CTR(r7) + ld r5, _LINK(r7) + ld r8, _XER(r7) + + mtctr r4 + mtlr r5 + mtxer r8 + + /* ******************** TAR ******************** */ + ld r4, THREAD_TM_TAR(r3) + mtspr SPRN_TAR, r4 + + /* Load up the PPR and DSCR in GPRs only at this stage */ + ld r5, THREAD_TM_DSCR(r3) + ld r6, THREAD_TM_PPR(r3) + + /* Clear the MSR RI since we are about to change R1. EE is already off + */ + li r4, 0 + mtmsrd r4, 1 + + REST_GPR(0, r7) /* GPR0 */ + REST_2GPRS(2, r7) /* GPR2-3 */ + REST_GPR(4, r7) /* GPR4 */ + REST_4GPRS(8, r7) /* GPR8-11 */ + REST_2GPRS(12, r7) /* GPR12-13 */ + + REST_NVGPRS(r7) /* GPR14-31 */ + + /* Load up PPR and DSCR here so we don't run with user values for long + */ + mtspr SPRN_DSCR, r5 + mtspr SPRN_PPR, r6 + + /* Do final sanity check on TEXASR to make sure FS is set. Do this + * here before we load up the userspace r1 so any bugs we hit will get + * a call chain */ + mfspr r5, SPRN_TEXASR + srdi r5, r5, 16 + li r6, (TEXASR_FS)@h + and r6, r6, r5 +1: tdeqi r6, 0 + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0 + + /* Do final sanity check on MSR to make sure we are not transactional + * or suspended + */ + mfmsr r6 + li r5, (MSR_TS_MASK)@higher + srdi r6, r6, 32 + and r6, r6, r5 +1: tdnei r6, 0 + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0 + + /* Restore CR */ + ld r6, _CCR(r7) + mtcr r6 + + REST_GPR(1, r7) /* GPR1 */ + REST_GPR(5, r7) /* GPR5-7 */ + REST_GPR(6, r7) + ld r7, GPR7(r7) + + /* Commit register state as checkpointed state: */ + TRECHKPT + + HMT_MEDIUM + + /* Our transactional state has now changed. + * + * Now just get out of here. Transactional (current) state will be + * updated once restore is called on the return path in the _switch-ed + * -to process. + */ + + GET_PACA(r13) + GET_SCRATCH0(r1) + + /* R1 is restored, so we are recoverable again. EE is still off */ + li r4, MSR_RI + mtmsrd r4, 1 + + REST_NVGPRS(r1) + + addi r1, r1, TM_FRAME_SIZE + lwz r4, 8(r1) + ld r0, 16(r1) + mtcr r4 + mtlr r0 + ld r2, STK_GOT(r1) + + /* Load CPU's default DSCR */ + ld r0, PACA_DSCR(r13) + mtspr SPRN_DSCR, r0 + + blr + + /* ****************************************************************** */ diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 32518401af6..239f1cde3ff 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -35,6 +35,7 @@ #include <linux/kdebug.h> #include <linux/debugfs.h> #include <linux/ratelimit.h> +#include <linux/context_tracking.h> #include <asm/emulated_ops.h> #include <asm/pgtable.h> @@ -43,22 +44,23 @@ #include <asm/machdep.h> #include <asm/rtas.h> #include <asm/pmc.h> -#ifdef CONFIG_PPC32 #include <asm/reg.h> -#endif #ifdef CONFIG_PMAC_BACKLIGHT #include <asm/backlight.h> #endif #ifdef CONFIG_PPC64 #include <asm/firmware.h> #include <asm/processor.h> +#include <asm/tm.h> #endif #include <asm/kexec.h> #include <asm/ppc-opcode.h> #include <asm/rio.h> #include <asm/fadump.h> #include <asm/switch_to.h> +#include <asm/tm.h> #include <asm/debug.h> +#include <sysdev/fsl_pci.h> #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC) int (*__debugger)(struct pt_regs *regs) __read_mostly; @@ -66,7 +68,7 @@ int (*__debugger_ipi)(struct pt_regs *regs) __read_mostly; int (*__debugger_bpt)(struct pt_regs *regs) __read_mostly; int (*__debugger_sstep)(struct pt_regs *regs) __read_mostly; int (*__debugger_iabr_match)(struct pt_regs *regs) __read_mostly; -int (*__debugger_dabr_match)(struct pt_regs *regs) __read_mostly; +int (*__debugger_break_match)(struct pt_regs *regs) __read_mostly; int (*__debugger_fault_handler)(struct pt_regs *regs) __read_mostly; EXPORT_SYMBOL(__debugger); @@ -74,10 +76,17 @@ EXPORT_SYMBOL(__debugger_ipi); EXPORT_SYMBOL(__debugger_bpt); EXPORT_SYMBOL(__debugger_sstep); EXPORT_SYMBOL(__debugger_iabr_match); -EXPORT_SYMBOL(__debugger_dabr_match); +EXPORT_SYMBOL(__debugger_break_match); EXPORT_SYMBOL(__debugger_fault_handler); #endif +/* Transactional Memory trap debug */ +#ifdef TM_DEBUG_SW +#define TM_DEBUG(x...) printk(KERN_INFO x) +#else +#define TM_DEBUG(x...) do { } while(0) +#endif + /* * Trap & Exception support */ @@ -138,7 +147,7 @@ static void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, { bust_spinlocks(0); die_owner = -1; - add_taint(TAINT_DIE); + add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); die_nest_count--; oops_exit(); printk("\n"); @@ -276,6 +285,23 @@ void system_reset_exception(struct pt_regs *regs) /* What should we do here? We could issue a shutdown or hard reset. */ } + +/* + * This function is called in real mode. Strictly no printk's please. + * + * regs->nip and regs->msr contains srr0 and ssr1. + */ +long machine_check_early(struct pt_regs *regs) +{ + long handled = 0; + + __get_cpu_var(irq_stat).mce_exceptions++; + + if (cur_cpu_spec && cur_cpu_spec->machine_check_early) + handled = cur_cpu_spec->machine_check_early(regs); + return handled; +} + #endif /* @@ -342,14 +368,15 @@ static inline int check_io_access(struct pt_regs *regs) #define REASON_TRAP ESR_PTR /* single-step stuff */ -#define single_stepping(regs) (current->thread.dbcr0 & DBCR0_IC) -#define clear_single_step(regs) (current->thread.dbcr0 &= ~DBCR0_IC) +#define single_stepping(regs) (current->thread.debug.dbcr0 & DBCR0_IC) +#define clear_single_step(regs) (current->thread.debug.dbcr0 &= ~DBCR0_IC) #else /* On non-4xx, the reason for the machine check or program exception is in the MSR. */ #define get_reason(regs) ((regs)->msr) #define get_mc_reason(regs) ((regs)->msr) +#define REASON_TM 0x200000 #define REASON_FP 0x100000 #define REASON_ILLEGAL 0x80000 #define REASON_PRIVILEGED 0x40000 @@ -556,6 +583,8 @@ int machine_check_e500(struct pt_regs *regs) if (reason & MCSR_BUS_RBERR) { if (fsl_rio_mcheck_exception(regs)) return 1; + if (fsl_pci_mcheck_exception(regs)) + return 1; } printk("Machine check in kernel mode.\n"); @@ -658,6 +687,7 @@ int machine_check_generic(struct pt_regs *regs) void machine_check_exception(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); int recover = 0; __get_cpu_var(irq_stat).mce_exceptions++; @@ -674,7 +704,7 @@ void machine_check_exception(struct pt_regs *regs) recover = cur_cpu_spec->machine_check(regs); if (recover > 0) - return; + goto bail; #if defined(CONFIG_8xx) && defined(CONFIG_PCI) /* the qspan pci read routines can cause machine checks -- Cort @@ -684,20 +714,23 @@ void machine_check_exception(struct pt_regs *regs) * -- BenH */ bad_page_fault(regs, regs->dar, SIGBUS); - return; + goto bail; #endif if (debugger_fault_handler(regs)) - return; + goto bail; if (check_io_access(regs)) - return; + goto bail; die("Machine check", regs, SIGBUS); /* Must die if the interrupt is not recoverable */ if (!(regs->msr & MSR_RI)) panic("Unrecoverable Machine check"); + +bail: + exception_exit(prev_state); } void SMIException(struct pt_regs *regs) @@ -707,20 +740,29 @@ void SMIException(struct pt_regs *regs) void unknown_exception(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); + printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n", regs->nip, regs->msr, regs->trap); _exception(SIGTRAP, regs, 0, 0); + + exception_exit(prev_state); } void instruction_breakpoint_exception(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); + if (notify_die(DIE_IABR_MATCH, "iabr_match", regs, 5, 5, SIGTRAP) == NOTIFY_STOP) - return; + goto bail; if (debugger_iabr_match(regs)) - return; + goto bail; _exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip); + +bail: + exception_exit(prev_state); } void RunModeException(struct pt_regs *regs) @@ -730,15 +772,20 @@ void RunModeException(struct pt_regs *regs) void __kprobes single_step_exception(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); + clear_single_step(regs); if (notify_die(DIE_SSTEP, "single_step", regs, 5, 5, SIGTRAP) == NOTIFY_STOP) - return; + goto bail; if (debugger_sstep(regs)) - return; + goto bail; _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip); + +bail: + exception_exit(prev_state); } /* @@ -786,7 +833,7 @@ static void parse_fpe(struct pt_regs *regs) flush_fp_to_thread(current); - code = __parse_fpscr(current->thread.fpscr.val); + code = __parse_fpscr(current->thread.fp_state.fpscr); _exception(SIGFPE, regs, code, regs->nip); } @@ -837,6 +884,10 @@ static int emulate_string_inst(struct pt_regs *regs, u32 instword) u8 val; u32 shift = 8 * (3 - (pos & 0x3)); + /* if process is 32-bit, clear upper 32 bits of EA */ + if ((regs->msr & MSR_64BIT) == 0) + EA &= 0xFFFFFFFF; + switch ((instword & PPC_INST_STRING_MASK)) { case PPC_INST_LSWX: case PPC_INST_LSWI: @@ -904,12 +955,34 @@ static int emulate_isel(struct pt_regs *regs, u32 instword) return 0; } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static inline bool tm_abort_check(struct pt_regs *regs, int cause) +{ + /* If we're emulating a load/store in an active transaction, we cannot + * emulate it as the kernel operates in transaction suspended context. + * We need to abort the transaction. This creates a persistent TM + * abort so tell the user what caused it with a new code. + */ + if (MSR_TM_TRANSACTIONAL(regs->msr)) { + tm_enable(); + tm_abort(cause); + return true; + } + return false; +} +#else +static inline bool tm_abort_check(struct pt_regs *regs, int reason) +{ + return false; +} +#endif + static int emulate_instruction(struct pt_regs *regs) { u32 instword; u32 rd; - if (!user_mode(regs) || (regs->msr & MSR_LE)) + if (!user_mode(regs)) return -EINVAL; CHECK_FULL_REGS(regs); @@ -943,6 +1016,9 @@ static int emulate_instruction(struct pt_regs *regs) /* Emulate load/store string insn. */ if ((instword & PPC_INST_STRING_GEN_MASK) == PPC_INST_STRING) { + if (tm_abort_check(regs, + TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT)) + return -EINVAL; PPC_WARN_EMULATED(string, regs); return emulate_string_inst(regs, instword); } @@ -959,9 +1035,19 @@ static int emulate_instruction(struct pt_regs *regs) return emulate_isel(regs, instword); } + /* Emulate sync instruction variants */ + if ((instword & PPC_INST_SYNC_MASK) == PPC_INST_SYNC) { + PPC_WARN_EMULATED(sync, regs); + asm volatile("sync"); + return 0; + } + #ifdef CONFIG_PPC64 /* Emulate the mfspr rD, DSCR. */ - if (((instword & PPC_INST_MFSPR_DSCR_MASK) == PPC_INST_MFSPR_DSCR) && + if ((((instword & PPC_INST_MFSPR_DSCR_USER_MASK) == + PPC_INST_MFSPR_DSCR_USER) || + ((instword & PPC_INST_MFSPR_DSCR_MASK) == + PPC_INST_MFSPR_DSCR)) && cpu_has_feature(CPU_FTR_DSCR)) { PPC_WARN_EMULATED(mfdscr, regs); rd = (instword >> 21) & 0x1f; @@ -969,7 +1055,10 @@ static int emulate_instruction(struct pt_regs *regs) return 0; } /* Emulate the mtspr DSCR, rD. */ - if (((instword & PPC_INST_MTSPR_DSCR_MASK) == PPC_INST_MTSPR_DSCR) && + if ((((instword & PPC_INST_MTSPR_DSCR_USER_MASK) == + PPC_INST_MTSPR_DSCR_USER) || + ((instword & PPC_INST_MTSPR_DSCR_MASK) == + PPC_INST_MTSPR_DSCR)) && cpu_has_feature(CPU_FTR_DSCR)) { PPC_WARN_EMULATED(mtdscr, regs); rd = (instword >> 21) & 0x1f; @@ -988,10 +1077,41 @@ int is_valid_bugaddr(unsigned long addr) return is_kernel_addr(addr); } +#ifdef CONFIG_MATH_EMULATION +static int emulate_math(struct pt_regs *regs) +{ + int ret; + extern int do_mathemu(struct pt_regs *regs); + + ret = do_mathemu(regs); + if (ret >= 0) + PPC_WARN_EMULATED(math, regs); + + switch (ret) { + case 0: + emulate_single_step(regs); + return 0; + case 1: { + int code = 0; + code = __parse_fpscr(current->thread.fp_state.fpscr); + _exception(SIGFPE, regs, code, regs->nip); + return 0; + } + case -EFAULT: + _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip); + return 0; + } + + return -1; +} +#else +static inline int emulate_math(struct pt_regs *regs) { return -1; } +#endif + void __kprobes program_check_exception(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); unsigned int reason = get_reason(regs); - extern int do_mathemu(struct pt_regs *regs); /* We can now get here via a FP Unavailable exception if the core * has no FPU, in that case the reason flags will be 0 */ @@ -999,56 +1119,84 @@ void __kprobes program_check_exception(struct pt_regs *regs) if (reason & REASON_FP) { /* IEEE FP exception */ parse_fpe(regs); - return; + goto bail; } if (reason & REASON_TRAP) { /* Debugger is first in line to stop recursive faults in * rcu_lock, notify_die, or atomic_notifier_call_chain */ if (debugger_bpt(regs)) - return; + goto bail; /* trap exception */ if (notify_die(DIE_BPT, "breakpoint", regs, 5, 5, SIGTRAP) == NOTIFY_STOP) - return; + goto bail; if (!(regs->msr & MSR_PR) && /* not user-mode */ report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) { regs->nip += 4; - return; + goto bail; } _exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip); - return; + goto bail; } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (reason & REASON_TM) { + /* This is a TM "Bad Thing Exception" program check. + * This occurs when: + * - An rfid/hrfid/mtmsrd attempts to cause an illegal + * transition in TM states. + * - A trechkpt is attempted when transactional. + * - A treclaim is attempted when non transactional. + * - A tend is illegally attempted. + * - writing a TM SPR when transactional. + */ + if (!user_mode(regs) && + report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) { + regs->nip += 4; + goto bail; + } + /* If usermode caused this, it's done something illegal and + * gets a SIGILL slap on the wrist. We call it an illegal + * operand to distinguish from the instruction just being bad + * (e.g. executing a 'tend' on a CPU without TM!); it's an + * illegal /placement/ of a valid instruction. + */ + if (user_mode(regs)) { + _exception(SIGILL, regs, ILL_ILLOPN, regs->nip); + goto bail; + } else { + printk(KERN_EMERG "Unexpected TM Bad Thing exception " + "at %lx (msr 0x%x)\n", regs->nip, reason); + die("Unrecoverable exception", regs, SIGABRT); + } + } +#endif + + /* + * If we took the program check in the kernel skip down to sending a + * SIGILL. The subsequent cases all relate to emulating instructions + * which we should only do for userspace. We also do not want to enable + * interrupts for kernel faults because that might lead to further + * faults, and loose the context of the original exception. + */ + if (!user_mode(regs)) + goto sigill; /* We restore the interrupt state now */ if (!arch_irq_disabled_regs(regs)) local_irq_enable(); -#ifdef CONFIG_MATH_EMULATION /* (reason & REASON_ILLEGAL) would be the obvious thing here, * but there seems to be a hardware bug on the 405GP (RevD) * that means ESR is sometimes set incorrectly - either to * ESR_DST (!?) or 0. In the process of chasing this with the * hardware people - not sure if it can happen on any illegal * instruction or only on FP instructions, whether there is a - * pattern to occurrences etc. -dgibson 31/Mar/2003 */ - switch (do_mathemu(regs)) { - case 0: - emulate_single_step(regs); - return; - case 1: { - int code = 0; - code = __parse_fpscr(current->thread.fpscr.val); - _exception(SIGFPE, regs, code, regs->nip); - return; - } - case -EFAULT: - _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip); - return; - } - /* fall through on any other errors */ -#endif /* CONFIG_MATH_EMULATION */ + * pattern to occurrences etc. -dgibson 31/Mar/2003 + */ + if (!emulate_math(regs)) + goto bail; /* Try to emulate it if we should. */ if (reason & (REASON_ILLEGAL | REASON_PRIVILEGED)) { @@ -1056,27 +1204,45 @@ void __kprobes program_check_exception(struct pt_regs *regs) case 0: regs->nip += 4; emulate_single_step(regs); - return; + goto bail; case -EFAULT: _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip); - return; + goto bail; } } +sigill: if (reason & REASON_PRIVILEGED) _exception(SIGILL, regs, ILL_PRVOPC, regs->nip); else _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); + +bail: + exception_exit(prev_state); +} + +/* + * This occurs when running in hypervisor mode on POWER6 or later + * and an illegal instruction is encountered. + */ +void __kprobes emulation_assist_interrupt(struct pt_regs *regs) +{ + regs->msr |= REASON_ILLEGAL; + program_check_exception(regs); } void alignment_exception(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); int sig, code, fixed = 0; /* We restore the interrupt state now */ if (!arch_irq_disabled_regs(regs)) local_irq_enable(); + if (tm_abort_check(regs, TM_CAUSE_ALIGNMENT | TM_CAUSE_PERSISTENT)) + goto bail; + /* we don't implement logging of alignment exceptions */ if (!(current->thread.align_ctl & PR_UNALIGN_SIGBUS)) fixed = fix_alignment(regs); @@ -1084,7 +1250,7 @@ void alignment_exception(struct pt_regs *regs) if (fixed == 1) { regs->nip += 4; /* skip over emulated instruction */ emulate_single_step(regs); - return; + goto bail; } /* Operand address was bad */ @@ -1099,6 +1265,9 @@ void alignment_exception(struct pt_regs *regs) _exception(sig, regs, code, regs->dar); else bad_page_fault(regs, regs->dar, sig); + +bail: + exception_exit(prev_state); } void StackOverflow(struct pt_regs *regs) @@ -1127,23 +1296,32 @@ void trace_syscall(struct pt_regs *regs) void kernel_fp_unavailable_exception(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); + printk(KERN_EMERG "Unrecoverable FP Unavailable Exception " "%lx at %lx\n", regs->trap, regs->nip); die("Unrecoverable FP Unavailable Exception", regs, SIGABRT); + + exception_exit(prev_state); } void altivec_unavailable_exception(struct pt_regs *regs) { + enum ctx_state prev_state = exception_enter(); + if (user_mode(regs)) { /* A user program has executed an altivec instruction, but this kernel doesn't support altivec. */ _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); - return; + goto bail; } printk(KERN_EMERG "Unrecoverable VMX/Altivec Unavailable Exception " "%lx at %lx\n", regs->trap, regs->nip); die("Unrecoverable VMX/Altivec Unavailable Exception", regs, SIGABRT); + +bail: + exception_exit(prev_state); } void vsx_unavailable_exception(struct pt_regs *regs) @@ -1160,6 +1338,161 @@ void vsx_unavailable_exception(struct pt_regs *regs) die("Unrecoverable VSX Unavailable Exception", regs, SIGABRT); } +#ifdef CONFIG_PPC64 +void facility_unavailable_exception(struct pt_regs *regs) +{ + static char *facility_strings[] = { + [FSCR_FP_LG] = "FPU", + [FSCR_VECVSX_LG] = "VMX/VSX", + [FSCR_DSCR_LG] = "DSCR", + [FSCR_PM_LG] = "PMU SPRs", + [FSCR_BHRB_LG] = "BHRB", + [FSCR_TM_LG] = "TM", + [FSCR_EBB_LG] = "EBB", + [FSCR_TAR_LG] = "TAR", + }; + char *facility = "unknown"; + u64 value; + u8 status; + bool hv; + + hv = (regs->trap == 0xf80); + if (hv) + value = mfspr(SPRN_HFSCR); + else + value = mfspr(SPRN_FSCR); + + status = value >> 56; + if (status == FSCR_DSCR_LG) { + /* User is acessing the DSCR. Set the inherit bit and allow + * the user to set it directly in future by setting via the + * FSCR DSCR bit. We always leave HFSCR DSCR set. + */ + current->thread.dscr_inherit = 1; + mtspr(SPRN_FSCR, value | FSCR_DSCR); + return; + } + + if ((status < ARRAY_SIZE(facility_strings)) && + facility_strings[status]) + facility = facility_strings[status]; + + /* We restore the interrupt state now */ + if (!arch_irq_disabled_regs(regs)) + local_irq_enable(); + + pr_err_ratelimited( + "%sFacility '%s' unavailable, exception at 0x%lx, MSR=%lx\n", + hv ? "Hypervisor " : "", facility, regs->nip, regs->msr); + + if (user_mode(regs)) { + _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); + return; + } + + die("Unexpected facility unavailable exception", regs, SIGABRT); +} +#endif + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + +void fp_unavailable_tm(struct pt_regs *regs) +{ + /* Note: This does not handle any kind of FP laziness. */ + + TM_DEBUG("FP Unavailable trap whilst transactional at 0x%lx, MSR=%lx\n", + regs->nip, regs->msr); + + /* We can only have got here if the task started using FP after + * beginning the transaction. So, the transactional regs are just a + * copy of the checkpointed ones. But, we still need to recheckpoint + * as we're enabling FP for the process; it will return, abort the + * transaction, and probably retry but now with FP enabled. So the + * checkpointed FP registers need to be loaded. + */ + tm_reclaim_current(TM_CAUSE_FAC_UNAV); + /* Reclaim didn't save out any FPRs to transact_fprs. */ + + /* Enable FP for the task: */ + regs->msr |= (MSR_FP | current->thread.fpexc_mode); + + /* This loads and recheckpoints the FP registers from + * thread.fpr[]. They will remain in registers after the + * checkpoint so we don't need to reload them after. + * If VMX is in use, the VRs now hold checkpointed values, + * so we don't want to load the VRs from the thread_struct. + */ + tm_recheckpoint(¤t->thread, MSR_FP); + + /* If VMX is in use, get the transactional values back */ + if (regs->msr & MSR_VEC) { + do_load_up_transact_altivec(¤t->thread); + /* At this point all the VSX state is loaded, so enable it */ + regs->msr |= MSR_VSX; + } +} + +void altivec_unavailable_tm(struct pt_regs *regs) +{ + /* See the comments in fp_unavailable_tm(). This function operates + * the same way. + */ + + TM_DEBUG("Vector Unavailable trap whilst transactional at 0x%lx," + "MSR=%lx\n", + regs->nip, regs->msr); + tm_reclaim_current(TM_CAUSE_FAC_UNAV); + regs->msr |= MSR_VEC; + tm_recheckpoint(¤t->thread, MSR_VEC); + current->thread.used_vr = 1; + + if (regs->msr & MSR_FP) { + do_load_up_transact_fpu(¤t->thread); + regs->msr |= MSR_VSX; + } +} + +void vsx_unavailable_tm(struct pt_regs *regs) +{ + unsigned long orig_msr = regs->msr; + + /* See the comments in fp_unavailable_tm(). This works similarly, + * though we're loading both FP and VEC registers in here. + * + * If FP isn't in use, load FP regs. If VEC isn't in use, load VEC + * regs. Either way, set MSR_VSX. + */ + + TM_DEBUG("VSX Unavailable trap whilst transactional at 0x%lx," + "MSR=%lx\n", + regs->nip, regs->msr); + + current->thread.used_vsr = 1; + + /* If FP and VMX are already loaded, we have all the state we need */ + if ((orig_msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC)) { + regs->msr |= MSR_VSX; + return; + } + + /* This reclaims FP and/or VR regs if they're already enabled */ + tm_reclaim_current(TM_CAUSE_FAC_UNAV); + + regs->msr |= MSR_VEC | MSR_FP | current->thread.fpexc_mode | + MSR_VSX; + + /* This loads & recheckpoints FP and VRs; but we have + * to be sure not to overwrite previously-valid state. + */ + tm_recheckpoint(¤t->thread, regs->msr & ~orig_msr); + + if (orig_msr & MSR_FP) + do_load_up_transact_fpu(¤t->thread); + if (orig_msr & MSR_VEC) + do_load_up_transact_altivec(¤t->thread); +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + void performance_monitor_exception(struct pt_regs *regs) { __get_cpu_var(irq_stat).pmu_irqs++; @@ -1170,61 +1503,18 @@ void performance_monitor_exception(struct pt_regs *regs) #ifdef CONFIG_8xx void SoftwareEmulation(struct pt_regs *regs) { - extern int do_mathemu(struct pt_regs *); - extern int Soft_emulate_8xx(struct pt_regs *); -#if defined(CONFIG_MATH_EMULATION) || defined(CONFIG_8XX_MINIMAL_FPEMU) - int errcode; -#endif - CHECK_FULL_REGS(regs); if (!user_mode(regs)) { debugger(regs); - die("Kernel Mode Software FPU Emulation", regs, SIGFPE); + die("Kernel Mode Unimplemented Instruction or SW FPU Emulation", + regs, SIGFPE); } -#ifdef CONFIG_MATH_EMULATION - errcode = do_mathemu(regs); - if (errcode >= 0) - PPC_WARN_EMULATED(math, regs); - - switch (errcode) { - case 0: - emulate_single_step(regs); + if (!emulate_math(regs)) return; - case 1: { - int code = 0; - code = __parse_fpscr(current->thread.fpscr.val); - _exception(SIGFPE, regs, code, regs->nip); - return; - } - case -EFAULT: - _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip); - return; - default: - _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); - return; - } -#elif defined(CONFIG_8XX_MINIMAL_FPEMU) - errcode = Soft_emulate_8xx(regs); - if (errcode >= 0) - PPC_WARN_EMULATED(8xx, regs); - - switch (errcode) { - case 0: - emulate_single_step(regs); - return; - case 1: - _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); - return; - case -EFAULT: - _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip); - return; - } -#else _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); -#endif } #endif /* CONFIG_8xx */ @@ -1239,7 +1529,7 @@ static void handle_debug(struct pt_regs *regs, unsigned long debug_status) if (debug_status & (DBSR_DAC1R | DBSR_DAC1W)) { dbcr_dac(current) &= ~(DBCR_DAC1R | DBCR_DAC1W); #ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE - current->thread.dbcr2 &= ~DBCR2_DAC12MODE; + current->thread.debug.dbcr2 &= ~DBCR2_DAC12MODE; #endif do_send_trap(regs, mfspr(SPRN_DAC1), debug_status, TRAP_HWBKPT, 5); @@ -1250,24 +1540,24 @@ static void handle_debug(struct pt_regs *regs, unsigned long debug_status) 6); changed |= 0x01; } else if (debug_status & DBSR_IAC1) { - current->thread.dbcr0 &= ~DBCR0_IAC1; + current->thread.debug.dbcr0 &= ~DBCR0_IAC1; dbcr_iac_range(current) &= ~DBCR_IAC12MODE; do_send_trap(regs, mfspr(SPRN_IAC1), debug_status, TRAP_HWBKPT, 1); changed |= 0x01; } else if (debug_status & DBSR_IAC2) { - current->thread.dbcr0 &= ~DBCR0_IAC2; + current->thread.debug.dbcr0 &= ~DBCR0_IAC2; do_send_trap(regs, mfspr(SPRN_IAC2), debug_status, TRAP_HWBKPT, 2); changed |= 0x01; } else if (debug_status & DBSR_IAC3) { - current->thread.dbcr0 &= ~DBCR0_IAC3; + current->thread.debug.dbcr0 &= ~DBCR0_IAC3; dbcr_iac_range(current) &= ~DBCR_IAC34MODE; do_send_trap(regs, mfspr(SPRN_IAC3), debug_status, TRAP_HWBKPT, 3); changed |= 0x01; } else if (debug_status & DBSR_IAC4) { - current->thread.dbcr0 &= ~DBCR0_IAC4; + current->thread.debug.dbcr0 &= ~DBCR0_IAC4; do_send_trap(regs, mfspr(SPRN_IAC4), debug_status, TRAP_HWBKPT, 4); changed |= 0x01; @@ -1277,19 +1567,20 @@ static void handle_debug(struct pt_regs *regs, unsigned long debug_status) * Check all other debug flags and see if that bit needs to be turned * back on or not. */ - if (DBCR_ACTIVE_EVENTS(current->thread.dbcr0, current->thread.dbcr1)) + if (DBCR_ACTIVE_EVENTS(current->thread.debug.dbcr0, + current->thread.debug.dbcr1)) regs->msr |= MSR_DE; else /* Make sure the IDM flag is off */ - current->thread.dbcr0 &= ~DBCR0_IDM; + current->thread.debug.dbcr0 &= ~DBCR0_IDM; if (changed & 0x01) - mtspr(SPRN_DBCR0, current->thread.dbcr0); + mtspr(SPRN_DBCR0, current->thread.debug.dbcr0); } void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status) { - current->thread.dbsr = debug_status; + current->thread.debug.dbsr = debug_status; /* Hack alert: On BookE, Branch Taken stops on the branch itself, while * on server, it stops on the target of the branch. In order to simulate @@ -1306,8 +1597,8 @@ void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status) /* Do the single step trick only when coming from userspace */ if (user_mode(regs)) { - current->thread.dbcr0 &= ~DBCR0_BT; - current->thread.dbcr0 |= DBCR0_IDM | DBCR0_IC; + current->thread.debug.dbcr0 &= ~DBCR0_BT; + current->thread.debug.dbcr0 |= DBCR0_IDM | DBCR0_IC; regs->msr |= MSR_DE; return; } @@ -1335,13 +1626,13 @@ void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status) return; if (user_mode(regs)) { - current->thread.dbcr0 &= ~DBCR0_IC; - if (DBCR_ACTIVE_EVENTS(current->thread.dbcr0, - current->thread.dbcr1)) + current->thread.debug.dbcr0 &= ~DBCR0_IC; + if (DBCR_ACTIVE_EVENTS(current->thread.debug.dbcr0, + current->thread.debug.dbcr1)) regs->msr |= MSR_DE; else /* Make sure the IDM bit is off */ - current->thread.dbcr0 &= ~DBCR0_IDM; + current->thread.debug.dbcr0 &= ~DBCR0_IDM; } _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip); @@ -1387,7 +1678,7 @@ void altivec_assist_exception(struct pt_regs *regs) /* XXX quick hack for now: set the non-Java bit in the VSCR */ printk_ratelimited(KERN_ERR "Unrecognized altivec instruction " "in %s at %lx\n", current->comm, regs->nip); - current->thread.vscr.u[3] |= 0x10000; + current->thread.vr_state.vscr.u[3] |= 0x10000; } } #endif /* CONFIG_ALTIVEC */ @@ -1515,7 +1806,7 @@ void unrecoverable_exception(struct pt_regs *regs) die("Unrecoverable exception", regs, SIGABRT); } -#ifdef CONFIG_BOOKE_WDT +#if defined(CONFIG_BOOKE_WDT) || defined(CONFIG_40x) /* * Default handler for a Watchdog exception, * spins until a reboot occurs @@ -1568,11 +1859,10 @@ struct ppc_emulated ppc_emulated = { WARN_EMULATED_SETUP(popcntb), WARN_EMULATED_SETUP(spe), WARN_EMULATED_SETUP(string), + WARN_EMULATED_SETUP(sync), WARN_EMULATED_SETUP(unaligned), #ifdef CONFIG_MATH_EMULATION WARN_EMULATED_SETUP(math), -#elif defined(CONFIG_8XX_MINIMAL_FPEMU) - WARN_EMULATED_SETUP(8xx), #endif #ifdef CONFIG_VSX WARN_EMULATED_SETUP(vsx), @@ -1580,6 +1870,7 @@ struct ppc_emulated ppc_emulated = { #ifdef CONFIG_PPC64 WARN_EMULATED_SETUP(mfdscr), WARN_EMULATED_SETUP(mtdscr), + WARN_EMULATED_SETUP(lq_stq), #endif }; diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index f9748498fe5..b7aa07279a6 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -50,7 +50,7 @@ void __init udbg_early_init(void) udbg_init_debug_beat(); #elif defined(CONFIG_PPC_EARLY_DEBUG_PAS_REALMODE) udbg_init_pas_realmode(); -#elif defined(CONFIG_BOOTX_TEXT) +#elif defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) udbg_init_btext(); #elif defined(CONFIG_PPC_EARLY_DEBUG_44x) /* PPC44x debug */ @@ -62,8 +62,9 @@ void __init udbg_early_init(void) udbg_init_cpm(); #elif defined(CONFIG_PPC_EARLY_DEBUG_USBGECKO) udbg_init_usbgecko(); -#elif defined(CONFIG_PPC_EARLY_DEBUG_WSP) - udbg_init_wsp(); +#elif defined(CONFIG_PPC_EARLY_DEBUG_MEMCONS) + /* In memory console */ + udbg_init_memcons(); #elif defined(CONFIG_PPC_EARLY_DEBUG_EHV_BC) udbg_init_ehv_bc(); #elif defined(CONFIG_PPC_EARLY_DEBUG_PS3GELIC) @@ -156,15 +157,13 @@ static struct console udbg_console = { .index = 0, }; -static int early_console_initialized; - /* * Called by setup_system after ppc_md->probe and ppc_md->early_init. * Call it again after setting udbg_putc in ppc_md->setup_arch. */ void __init register_early_udbg_console(void) { - if (early_console_initialized) + if (early_console) return; if (!udbg_putc) @@ -174,7 +173,7 @@ void __init register_early_udbg_console(void) printk(KERN_INFO "early console immortal !\n"); udbg_console.flags &= ~CON_BOOT; } - early_console_initialized = 1; + early_console = &udbg_console; register_console(&udbg_console); } diff --git a/arch/powerpc/kernel/udbg_16550.c b/arch/powerpc/kernel/udbg_16550.c index 6837f839ab7..6e7c4923b5e 100644 --- a/arch/powerpc/kernel/udbg_16550.c +++ b/arch/powerpc/kernel/udbg_16550.c @@ -18,23 +18,19 @@ extern void real_writeb(u8 data, volatile u8 __iomem *addr); extern u8 real_205_readb(volatile u8 __iomem *addr); extern void real_205_writeb(u8 data, volatile u8 __iomem *addr); -struct NS16550 { - /* this struct must be packed */ - unsigned char rbr; /* 0 */ - unsigned char ier; /* 1 */ - unsigned char fcr; /* 2 */ - unsigned char lcr; /* 3 */ - unsigned char mcr; /* 4 */ - unsigned char lsr; /* 5 */ - unsigned char msr; /* 6 */ - unsigned char scr; /* 7 */ -}; - -#define thr rbr -#define iir fcr -#define dll rbr -#define dlm ier -#define dlab lcr +#define UART_RBR 0 +#define UART_IER 1 +#define UART_FCR 2 +#define UART_LCR 3 +#define UART_MCR 4 +#define UART_LSR 5 +#define UART_MSR 6 +#define UART_SCR 7 +#define UART_THR UART_RBR +#define UART_IIR UART_FCR +#define UART_DLL UART_RBR +#define UART_DLM UART_IER +#define UART_DLAB UART_LCR #define LSR_DR 0x01 /* Data ready */ #define LSR_OE 0x02 /* Overrun */ @@ -47,52 +43,62 @@ struct NS16550 { #define LCR_DLAB 0x80 -static struct NS16550 __iomem *udbg_comport; +static u8 (*udbg_uart_in)(unsigned int reg); +static void (*udbg_uart_out)(unsigned int reg, u8 data); -static void udbg_550_flush(void) +static void udbg_uart_flush(void) { - if (udbg_comport) { - while ((in_8(&udbg_comport->lsr) & LSR_THRE) == 0) - /* wait for idle */; - } + if (!udbg_uart_in) + return; + + /* wait for idle */ + while ((udbg_uart_in(UART_LSR) & LSR_THRE) == 0) + cpu_relax(); } -static void udbg_550_putc(char c) +static void udbg_uart_putc(char c) { - if (udbg_comport) { - if (c == '\n') - udbg_550_putc('\r'); - udbg_550_flush(); - out_8(&udbg_comport->thr, c); - } + if (!udbg_uart_out) + return; + + if (c == '\n') + udbg_uart_putc('\r'); + udbg_uart_flush(); + udbg_uart_out(UART_THR, c); } -static int udbg_550_getc_poll(void) +static int udbg_uart_getc_poll(void) { - if (udbg_comport) { - if ((in_8(&udbg_comport->lsr) & LSR_DR) != 0) - return in_8(&udbg_comport->rbr); - else - return -1; - } + if (!udbg_uart_in || !(udbg_uart_in(UART_LSR) & LSR_DR)) + return udbg_uart_in(UART_RBR); return -1; } -static int udbg_550_getc(void) +static int udbg_uart_getc(void) { - if (udbg_comport) { - while ((in_8(&udbg_comport->lsr) & LSR_DR) == 0) - /* wait for char */; - return in_8(&udbg_comport->rbr); - } - return -1; + if (!udbg_uart_in) + return -1; + /* wait for char */ + while (!(udbg_uart_in(UART_LSR) & LSR_DR)) + cpu_relax(); + return udbg_uart_in(UART_RBR); } -void udbg_init_uart(void __iomem *comport, unsigned int speed, - unsigned int clock) +static void udbg_use_uart(void) +{ + udbg_putc = udbg_uart_putc; + udbg_flush = udbg_uart_flush; + udbg_getc = udbg_uart_getc; + udbg_getc_poll = udbg_uart_getc_poll; +} + +void udbg_uart_setup(unsigned int speed, unsigned int clock) { unsigned int dll, base_bauds; + if (!udbg_uart_out) + return; + if (clock == 0) clock = 1843200; if (speed == 0) @@ -101,51 +107,43 @@ void udbg_init_uart(void __iomem *comport, unsigned int speed, base_bauds = clock / 16; dll = base_bauds / speed; - if (comport) { - udbg_comport = (struct NS16550 __iomem *)comport; - out_8(&udbg_comport->lcr, 0x00); - out_8(&udbg_comport->ier, 0xff); - out_8(&udbg_comport->ier, 0x00); - out_8(&udbg_comport->lcr, LCR_DLAB); - out_8(&udbg_comport->dll, dll & 0xff); - out_8(&udbg_comport->dlm, dll >> 8); - /* 8 data, 1 stop, no parity */ - out_8(&udbg_comport->lcr, 0x03); - /* RTS/DTR */ - out_8(&udbg_comport->mcr, 0x03); - /* Clear & enable FIFOs */ - out_8(&udbg_comport->fcr ,0x07); - udbg_putc = udbg_550_putc; - udbg_flush = udbg_550_flush; - udbg_getc = udbg_550_getc; - udbg_getc_poll = udbg_550_getc_poll; - } + udbg_uart_out(UART_LCR, 0x00); + udbg_uart_out(UART_IER, 0xff); + udbg_uart_out(UART_IER, 0x00); + udbg_uart_out(UART_LCR, LCR_DLAB); + udbg_uart_out(UART_DLL, dll & 0xff); + udbg_uart_out(UART_DLM, dll >> 8); + /* 8 data, 1 stop, no parity */ + udbg_uart_out(UART_LCR, 0x3); + /* RTS/DTR */ + udbg_uart_out(UART_MCR, 0x3); + /* Clear & enable FIFOs */ + udbg_uart_out(UART_FCR, 0x7); } -unsigned int udbg_probe_uart_speed(void __iomem *comport, unsigned int clock) +unsigned int udbg_probe_uart_speed(unsigned int clock) { unsigned int dll, dlm, divisor, prescaler, speed; u8 old_lcr; - struct NS16550 __iomem *port = comport; - old_lcr = in_8(&port->lcr); + old_lcr = udbg_uart_in(UART_LCR); /* select divisor latch registers. */ - out_8(&port->lcr, LCR_DLAB); + udbg_uart_out(UART_LCR, old_lcr | LCR_DLAB); /* now, read the divisor */ - dll = in_8(&port->dll); - dlm = in_8(&port->dlm); + dll = udbg_uart_in(UART_DLL); + dlm = udbg_uart_in(UART_DLM); divisor = dlm << 8 | dll; /* check prescaling */ - if (in_8(&port->mcr) & 0x80) + if (udbg_uart_in(UART_MCR) & 0x80) prescaler = 4; else prescaler = 1; /* restore the LCR */ - out_8(&port->lcr, old_lcr); + udbg_uart_out(UART_LCR, old_lcr); /* calculate speed */ speed = (clock / prescaler) / (divisor * 16); @@ -157,195 +155,144 @@ unsigned int udbg_probe_uart_speed(void __iomem *comport, unsigned int clock) return speed; } -#ifdef CONFIG_PPC_MAPLE -void udbg_maple_real_flush(void) +static union { + unsigned char __iomem *mmio_base; + unsigned long pio_base; +} udbg_uart; + +static unsigned int udbg_uart_stride = 1; + +static u8 udbg_uart_in_pio(unsigned int reg) { - if (udbg_comport) { - while ((real_readb(&udbg_comport->lsr) & LSR_THRE) == 0) - /* wait for idle */; - } + return inb(udbg_uart.pio_base + (reg * udbg_uart_stride)); } -void udbg_maple_real_putc(char c) +static void udbg_uart_out_pio(unsigned int reg, u8 data) { - if (udbg_comport) { - if (c == '\n') - udbg_maple_real_putc('\r'); - udbg_maple_real_flush(); - real_writeb(c, &udbg_comport->thr); eieio(); - } + outb(data, udbg_uart.pio_base + (reg * udbg_uart_stride)); } -void __init udbg_init_maple_realmode(void) +void udbg_uart_init_pio(unsigned long port, unsigned int stride) { - udbg_comport = (struct NS16550 __iomem *)0xf40003f8; - - udbg_putc = udbg_maple_real_putc; - udbg_flush = udbg_maple_real_flush; - udbg_getc = NULL; - udbg_getc_poll = NULL; + if (!port) + return; + udbg_uart.pio_base = port; + udbg_uart_stride = stride; + udbg_uart_in = udbg_uart_in_pio; + udbg_uart_out = udbg_uart_out_pio; + udbg_use_uart(); } -#endif /* CONFIG_PPC_MAPLE */ -#ifdef CONFIG_PPC_PASEMI -void udbg_pas_real_flush(void) +static u8 udbg_uart_in_mmio(unsigned int reg) { - if (udbg_comport) { - while ((real_205_readb(&udbg_comport->lsr) & LSR_THRE) == 0) - /* wait for idle */; - } + return in_8(udbg_uart.mmio_base + (reg * udbg_uart_stride)); } -void udbg_pas_real_putc(char c) +static void udbg_uart_out_mmio(unsigned int reg, u8 data) { - if (udbg_comport) { - if (c == '\n') - udbg_pas_real_putc('\r'); - udbg_pas_real_flush(); - real_205_writeb(c, &udbg_comport->thr); eieio(); - } + out_8(udbg_uart.mmio_base + (reg * udbg_uart_stride), data); } -void udbg_init_pas_realmode(void) -{ - udbg_comport = (struct NS16550 __iomem *)0xfcff03f8UL; - udbg_putc = udbg_pas_real_putc; - udbg_flush = udbg_pas_real_flush; - udbg_getc = NULL; - udbg_getc_poll = NULL; +void udbg_uart_init_mmio(void __iomem *addr, unsigned int stride) +{ + if (!addr) + return; + udbg_uart.mmio_base = addr; + udbg_uart_stride = stride; + udbg_uart_in = udbg_uart_in_mmio; + udbg_uart_out = udbg_uart_out_mmio; + udbg_use_uart(); } -#endif /* CONFIG_PPC_MAPLE */ -#ifdef CONFIG_PPC_EARLY_DEBUG_44x -#include <platforms/44x/44x.h> +#ifdef CONFIG_PPC_MAPLE -static void udbg_44x_as1_flush(void) +#define UDBG_UART_MAPLE_ADDR ((void __iomem *)0xf40003f8) + +static u8 udbg_uart_in_maple(unsigned int reg) { - if (udbg_comport) { - while ((as1_readb(&udbg_comport->lsr) & LSR_THRE) == 0) - /* wait for idle */; - } + return real_readb(UDBG_UART_MAPLE_ADDR + reg); } -static void udbg_44x_as1_putc(char c) +static void udbg_uart_out_maple(unsigned int reg, u8 val) { - if (udbg_comport) { - if (c == '\n') - udbg_44x_as1_putc('\r'); - udbg_44x_as1_flush(); - as1_writeb(c, &udbg_comport->thr); eieio(); - } + real_writeb(val, UDBG_UART_MAPLE_ADDR + reg); } -static int udbg_44x_as1_getc(void) +void __init udbg_init_maple_realmode(void) { - if (udbg_comport) { - while ((as1_readb(&udbg_comport->lsr) & LSR_DR) == 0) - ; /* wait for char */ - return as1_readb(&udbg_comport->rbr); - } - return -1; + udbg_uart_in = udbg_uart_in_maple; + udbg_uart_out = udbg_uart_out_maple; + udbg_use_uart(); } -void __init udbg_init_44x_as1(void) -{ - udbg_comport = - (struct NS16550 __iomem *)PPC44x_EARLY_DEBUG_VIRTADDR; +#endif /* CONFIG_PPC_MAPLE */ - udbg_putc = udbg_44x_as1_putc; - udbg_flush = udbg_44x_as1_flush; - udbg_getc = udbg_44x_as1_getc; -} -#endif /* CONFIG_PPC_EARLY_DEBUG_44x */ +#ifdef CONFIG_PPC_PASEMI -#ifdef CONFIG_PPC_EARLY_DEBUG_40x -static void udbg_40x_real_flush(void) +#define UDBG_UART_PAS_ADDR ((void __iomem *)0xfcff03f8UL) + +static u8 udbg_uart_in_pas(unsigned int reg) { - if (udbg_comport) { - while ((real_readb(&udbg_comport->lsr) & LSR_THRE) == 0) - /* wait for idle */; - } + return real_205_readb(UDBG_UART_PAS_ADDR + reg); } -static void udbg_40x_real_putc(char c) +static void udbg_uart_out_pas(unsigned int reg, u8 val) { - if (udbg_comport) { - if (c == '\n') - udbg_40x_real_putc('\r'); - udbg_40x_real_flush(); - real_writeb(c, &udbg_comport->thr); eieio(); - } + real_205_writeb(val, UDBG_UART_PAS_ADDR + reg); } -static int udbg_40x_real_getc(void) +void __init udbg_init_pas_realmode(void) { - if (udbg_comport) { - while ((real_readb(&udbg_comport->lsr) & LSR_DR) == 0) - ; /* wait for char */ - return real_readb(&udbg_comport->rbr); - } - return -1; + udbg_uart_in = udbg_uart_in_pas; + udbg_uart_out = udbg_uart_out_pas; + udbg_use_uart(); } -void __init udbg_init_40x_realmode(void) -{ - udbg_comport = (struct NS16550 __iomem *) - CONFIG_PPC_EARLY_DEBUG_40x_PHYSADDR; +#endif /* CONFIG_PPC_PASEMI */ - udbg_putc = udbg_40x_real_putc; - udbg_flush = udbg_40x_real_flush; - udbg_getc = udbg_40x_real_getc; - udbg_getc_poll = NULL; -} -#endif /* CONFIG_PPC_EARLY_DEBUG_40x */ +#ifdef CONFIG_PPC_EARLY_DEBUG_44x + +#include <platforms/44x/44x.h> -#ifdef CONFIG_PPC_EARLY_DEBUG_WSP -static void udbg_wsp_flush(void) +static u8 udbg_uart_in_44x_as1(unsigned int reg) { - if (udbg_comport) { - while ((readb(&udbg_comport->lsr) & LSR_THRE) == 0) - /* wait for idle */; - } + return as1_readb((void __iomem *)PPC44x_EARLY_DEBUG_VIRTADDR + reg); } -static void udbg_wsp_putc(char c) +static void udbg_uart_out_44x_as1(unsigned int reg, u8 val) { - if (udbg_comport) { - if (c == '\n') - udbg_wsp_putc('\r'); - udbg_wsp_flush(); - writeb(c, &udbg_comport->thr); eieio(); - } + as1_writeb(val, (void __iomem *)PPC44x_EARLY_DEBUG_VIRTADDR + reg); } -static int udbg_wsp_getc(void) +void __init udbg_init_44x_as1(void) { - if (udbg_comport) { - while ((readb(&udbg_comport->lsr) & LSR_DR) == 0) - ; /* wait for char */ - return readb(&udbg_comport->rbr); - } - return -1; + udbg_uart_in = udbg_uart_in_44x_as1; + udbg_uart_out = udbg_uart_out_44x_as1; + udbg_use_uart(); } -static int udbg_wsp_getc_poll(void) +#endif /* CONFIG_PPC_EARLY_DEBUG_44x */ + +#ifdef CONFIG_PPC_EARLY_DEBUG_40x + +static u8 udbg_uart_in_40x(unsigned int reg) { - if (udbg_comport) - if (readb(&udbg_comport->lsr) & LSR_DR) - return readb(&udbg_comport->rbr); - return -1; + return real_readb((void __iomem *)CONFIG_PPC_EARLY_DEBUG_40x_PHYSADDR + + reg); } -void __init udbg_init_wsp(void) +static void udbg_uart_out_40x(unsigned int reg, u8 val) { - udbg_comport = (struct NS16550 __iomem *)WSP_UART_VIRT; - - udbg_init_uart(udbg_comport, 57600, 50000000); + real_writeb(val, (void __iomem *)CONFIG_PPC_EARLY_DEBUG_40x_PHYSADDR + + reg); +} - udbg_putc = udbg_wsp_putc; - udbg_flush = udbg_wsp_flush; - udbg_getc = udbg_wsp_getc; - udbg_getc_poll = udbg_wsp_getc_poll; +void __init udbg_init_40x_realmode(void) +{ + udbg_uart_in = udbg_uart_in_40x; + udbg_uart_out = udbg_uart_out_40x; + udbg_use_uart(); } -#endif /* CONFIG_PPC_EARLY_DEBUG_WSP */ + +#endif /* CONFIG_PPC_EARLY_DEBUG_40x */ diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c index bc77834dbf4..003b20964ea 100644 --- a/arch/powerpc/kernel/uprobes.c +++ b/arch/powerpc/kernel/uprobes.c @@ -31,6 +31,16 @@ #define UPROBE_TRAP_NR UINT_MAX /** + * is_trap_insn - check if the instruction is a trap variant + * @insn: instruction to be checked. + * Returns true if @insn is a trap variant. + */ +bool is_trap_insn(uprobe_opcode_t *insn) +{ + return (is_trap(*insn)); +} + +/** * arch_uprobe_analyze_insn * @mm: the probed address space. * @arch_uprobe: the probepoint information. @@ -43,12 +53,6 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, if (addr & 0x03) return -EINVAL; - /* - * We currently don't support a uprobe on an already - * existing breakpoint instruction underneath - */ - if (is_trap(auprobe->ainsn)) - return -ENOTSUPP; return 0; } @@ -182,9 +186,22 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) * emulate_step() returns 1 if the insn was successfully emulated. * For all other cases, we need to single-step in hardware. */ - ret = emulate_step(regs, auprobe->ainsn); + ret = emulate_step(regs, auprobe->insn); if (ret > 0) return true; return false; } + +unsigned long +arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) +{ + unsigned long orig_ret_vaddr; + + orig_ret_vaddr = regs->link; + + /* Replace the return addr with trampoline addr */ + regs->link = trampoline_vaddr; + + return orig_ret_vaddr; +} diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 1b2076f049c..ce74c335a6a 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -34,8 +34,7 @@ #include <asm/firmware.h> #include <asm/vdso.h> #include <asm/vdso_datapage.h> - -#include "setup.h" +#include <asm/setup.h> #undef DEBUG @@ -113,6 +112,10 @@ static struct vdso_patch_def vdso_patches[] = { CPU_FTR_USE_TB, 0, "__kernel_get_tbfreq", NULL }, + { + CPU_FTR_USE_TB, 0, + "__kernel_time", NULL + }, }; /* @@ -707,13 +710,13 @@ static void __init vdso_setup_syscall_map(void) } #ifdef CONFIG_PPC64 -int __cpuinit vdso_getcpu_init(void) +int vdso_getcpu_init(void) { unsigned long cpu, node, val; /* - * SPRG3 contains the CPU in the bottom 16 bits and the NUMA node in - * the next 16 bits. The VDSO uses this to implement getcpu(). + * SPRG_VDSO contains the CPU in the bottom 16 bits and the NUMA node + * in the next 16 bits. The VDSO uses this to implement getcpu(). */ cpu = get_cpu(); WARN_ON_ONCE(cpu > 0xffff); @@ -722,8 +725,8 @@ int __cpuinit vdso_getcpu_init(void) WARN_ON_ONCE(node > 0xffff); val = (cpu & 0xfff) | ((node & 0xffff) << 16); - mtspr(SPRN_SPRG3, val); - get_paca()->sprg3 = val; + mtspr(SPRN_SPRG_VDSO_WRITE, val); + get_paca()->sprg_vdso = val; put_cpu(); diff --git a/arch/powerpc/kernel/vdso32/getcpu.S b/arch/powerpc/kernel/vdso32/getcpu.S index 47afd08c90f..23eb9a9441b 100644 --- a/arch/powerpc/kernel/vdso32/getcpu.S +++ b/arch/powerpc/kernel/vdso32/getcpu.S @@ -29,7 +29,7 @@ */ V_FUNCTION_BEGIN(__kernel_getcpu) .cfi_startproc - mfspr r5,SPRN_USPRG3 + mfspr r5,SPRN_SPRG_VDSO_READ cmpdi cr0,r3,0 cmpdi cr1,r4,0 clrlwi r6,r5,16 diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index 4ee09ee2e83..6b2b69616e7 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -181,6 +181,32 @@ V_FUNCTION_END(__kernel_clock_getres) /* + * Exact prototype of time() + * + * time_t time(time *t); + * + */ +V_FUNCTION_BEGIN(__kernel_time) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + + mr r11,r3 /* r11 holds t */ + bl __get_datapage@local + mr r9, r3 /* datapage ptr in r9 */ + + lwz r3,STAMP_XTIME+TSPEC_TV_SEC(r9) + + cmplwi r11,0 /* check if t is NULL */ + beq 2f + stw r3,0(r11) /* store result at *t */ +2: mtlr r12 + crclr cr0*4+so + blr + .cfi_endproc +V_FUNCTION_END(__kernel_time) + +/* * This is the core of clock_gettime() and gettimeofday(), * it returns the current time in r3 (seconds) and r4. * On entry, r7 gives the resolution of r4, either USEC_PER_SEC @@ -206,9 +232,15 @@ __do_get_tspec: lwz r6,(CFG_TB_ORIG_STAMP+4)(r9) /* Get a stable TB value */ +#ifdef CONFIG_8xx 2: mftbu r3 mftbl r4 mftbu r0 +#else +2: mfspr r3, SPRN_TBRU + mfspr r4, SPRN_TBRL + mfspr r0, SPRN_TBRU +#endif cmplw cr0,r3,r0 bne- 2b diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S index 43200ba2e57..e58ee10fa5c 100644 --- a/arch/powerpc/kernel/vdso32/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S @@ -4,7 +4,11 @@ */ #include <asm/vdso.h> +#ifdef __LITTLE_ENDIAN__ +OUTPUT_FORMAT("elf32-powerpcle", "elf32-powerpcle", "elf32-powerpcle") +#else OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", "elf32-powerpc") +#endif OUTPUT_ARCH(powerpc:common) ENTRY(_start) @@ -150,6 +154,7 @@ VERSION #ifdef CONFIG_PPC64 __kernel_getcpu; #endif + __kernel_time; local: *; }; diff --git a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S index 6e8f507ed32..6ac107ac402 100644 --- a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S +++ b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S @@ -1,4 +1,3 @@ -#include <linux/init.h> #include <linux/linkage.h> #include <asm/page.h> @@ -7,7 +6,7 @@ .globl vdso32_start, vdso32_end .balign PAGE_SIZE vdso32_start: - .incbin "arch/powerpc/kernel/vdso32/vdso32.so" + .incbin "arch/powerpc/kernel/vdso32/vdso32.so.dbg" .balign PAGE_SIZE vdso32_end: diff --git a/arch/powerpc/kernel/vdso64/getcpu.S b/arch/powerpc/kernel/vdso64/getcpu.S index 47afd08c90f..23eb9a9441b 100644 --- a/arch/powerpc/kernel/vdso64/getcpu.S +++ b/arch/powerpc/kernel/vdso64/getcpu.S @@ -29,7 +29,7 @@ */ V_FUNCTION_BEGIN(__kernel_getcpu) .cfi_startproc - mfspr r5,SPRN_USPRG3 + mfspr r5,SPRN_SPRG_VDSO_READ cmpdi cr0,r3,0 cmpdi cr1,r4,0 clrlwi r6,r5,16 diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S index e97a9a0dc4a..a76b4af37ef 100644 --- a/arch/powerpc/kernel/vdso64/gettimeofday.S +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S @@ -164,6 +164,32 @@ V_FUNCTION_BEGIN(__kernel_clock_getres) .cfi_endproc V_FUNCTION_END(__kernel_clock_getres) +/* + * Exact prototype of time() + * + * time_t time(time *t); + * + */ +V_FUNCTION_BEGIN(__kernel_time) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + + mr r11,r3 /* r11 holds t */ + bl V_LOCAL_FUNC(__get_datapage) + + ld r4,STAMP_XTIME+TSPC64_TV_SEC(r3) + + cmpldi r11,0 /* check if t is NULL */ + beq 2f + std r4,0(r11) /* store result at *t */ +2: mtlr r12 + crclr cr0*4+so + mr r3,r4 + blr + .cfi_endproc +V_FUNCTION_END(__kernel_time) + /* * This is the core of clock_gettime() and gettimeofday(), diff --git a/arch/powerpc/kernel/vdso64/sigtramp.S b/arch/powerpc/kernel/vdso64/sigtramp.S index 45ea281e9a2..542c6f422e4 100644 --- a/arch/powerpc/kernel/vdso64/sigtramp.S +++ b/arch/powerpc/kernel/vdso64/sigtramp.S @@ -142,6 +142,13 @@ V_FUNCTION_END(__kernel_sigtramp_rt64) /* Size of CR reg in DWARF unwind info. */ #define CRSIZE 4 +/* Offset of CR reg within a full word. */ +#ifdef __LITTLE_ENDIAN__ +#define CROFF 0 +#else +#define CROFF (RSIZE - CRSIZE) +#endif + /* This is the offset of the VMX reg pointer. */ #define VREGS 48*RSIZE+33*8 @@ -181,7 +188,14 @@ V_FUNCTION_END(__kernel_sigtramp_rt64) rsave (31, 31*RSIZE); \ rsave (67, 32*RSIZE); /* ap, used as temp for nip */ \ rsave (65, 36*RSIZE); /* lr */ \ - rsave (70, 38*RSIZE + (RSIZE - CRSIZE)) /* cr */ + rsave (68, 38*RSIZE + CROFF); /* cr fields */ \ + rsave (69, 38*RSIZE + CROFF); \ + rsave (70, 38*RSIZE + CROFF); \ + rsave (71, 38*RSIZE + CROFF); \ + rsave (72, 38*RSIZE + CROFF); \ + rsave (73, 38*RSIZE + CROFF); \ + rsave (74, 38*RSIZE + CROFF); \ + rsave (75, 38*RSIZE + CROFF) /* Describe where the FP regs are saved. */ #define EH_FRAME_FP \ diff --git a/arch/powerpc/kernel/vdso64/vdso64.lds.S b/arch/powerpc/kernel/vdso64/vdso64.lds.S index e6c1758f358..64fb183a47c 100644 --- a/arch/powerpc/kernel/vdso64/vdso64.lds.S +++ b/arch/powerpc/kernel/vdso64/vdso64.lds.S @@ -4,7 +4,11 @@ */ #include <asm/vdso.h> +#ifdef __LITTLE_ENDIAN__ +OUTPUT_FORMAT("elf64-powerpcle", "elf64-powerpcle", "elf64-powerpcle") +#else OUTPUT_FORMAT("elf64-powerpc", "elf64-powerpc", "elf64-powerpc") +#endif OUTPUT_ARCH(powerpc:common64) ENTRY(_start) @@ -147,6 +151,7 @@ VERSION __kernel_sync_dicache_p5; __kernel_sigtramp_rt64; __kernel_getcpu; + __kernel_time; local: *; }; diff --git a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S index b8553d62b79..df60fca6a13 100644 --- a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S +++ b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S @@ -1,4 +1,3 @@ -#include <linux/init.h> #include <linux/linkage.h> #include <asm/page.h> @@ -7,7 +6,7 @@ .globl vdso64_start, vdso64_end .balign PAGE_SIZE vdso64_start: - .incbin "arch/powerpc/kernel/vdso64/vdso64.so" + .incbin "arch/powerpc/kernel/vdso64/vdso64.so.dbg" .balign PAGE_SIZE vdso64_end: diff --git a/arch/powerpc/kernel/vecemu.c b/arch/powerpc/kernel/vecemu.c index 604d0947cb2..c4bfadb2606 100644 --- a/arch/powerpc/kernel/vecemu.c +++ b/arch/powerpc/kernel/vecemu.c @@ -271,7 +271,7 @@ int emulate_altivec(struct pt_regs *regs) vb = (instr >> 11) & 0x1f; vc = (instr >> 6) & 0x1f; - vrs = current->thread.vr; + vrs = current->thread.vr_state.vr; switch (instr & 0x3f) { case 10: switch (vc) { @@ -320,12 +320,12 @@ int emulate_altivec(struct pt_regs *regs) case 14: /* vctuxs */ for (i = 0; i < 4; ++i) vrs[vd].u[i] = ctuxs(vrs[vb].u[i], va, - ¤t->thread.vscr.u[3]); + ¤t->thread.vr_state.vscr.u[3]); break; case 15: /* vctsxs */ for (i = 0; i < 4; ++i) vrs[vd].u[i] = ctsxs(vrs[vb].u[i], va, - ¤t->thread.vscr.u[3]); + ¤t->thread.vr_state.vscr.u[3]); break; default: return -EINVAL; diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index e830289d2e4..74f8050518d 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -7,13 +7,76 @@ #include <asm/page.h> #include <asm/ptrace.h> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* void do_load_up_transact_altivec(struct thread_struct *thread) + * + * This is similar to load_up_altivec but for the transactional version of the + * vector regs. It doesn't mess with the task MSR or valid flags. + * Furthermore, VEC laziness is not supported with TM currently. + */ +_GLOBAL(do_load_up_transact_altivec) + mfmsr r6 + oris r5,r6,MSR_VEC@h + MTMSRD(r5) + isync + + li r4,1 + stw r4,THREAD_USED_VR(r3) + + li r10,THREAD_TRANSACT_VRSTATE+VRSTATE_VSCR + lvx vr0,r10,r3 + mtvscr vr0 + addi r10,r3,THREAD_TRANSACT_VRSTATE + REST_32VRS(0,r4,r10) + + /* Disable VEC again. */ + MTMSRD(r6) + isync + + blr +#endif + +/* + * Enable use of VMX/Altivec for the caller. + */ +_GLOBAL(vec_enable) + mfmsr r3 + oris r3,r3,MSR_VEC@h + MTMSRD(r3) + isync + blr + +/* + * Load state from memory into VMX registers including VSCR. + * Assumes the caller has enabled VMX in the MSR. + */ +_GLOBAL(load_vr_state) + li r4,VRSTATE_VSCR + lvx vr0,r4,r3 + mtvscr vr0 + REST_32VRS(0,r4,r3) + blr + +/* + * Store VMX state into memory, including VSCR. + * Assumes the caller has enabled VMX in the MSR. + */ +_GLOBAL(store_vr_state) + SAVE_32VRS(0, r4, r3) + mfvscr vr0 + li r4, VRSTATE_VSCR + stvx vr0, r4, r3 + blr + /* - * load_up_altivec(unused, unused, tsk) * Disable VMX for the task which had it previously, * and save its vector registers in its thread_struct. * Enables the VMX for use in the kernel on return. * On SMP we know the VMX is free, since we give it up every * switch (ie, no lazy save of the vector registers). + * + * Note that on 32-bit this can only use registers that will be + * restored by fast_exception_return, i.e. r3 - r6, r10 and r11. */ _GLOBAL(load_up_altivec) mfmsr r5 /* grab the current MSR */ @@ -39,10 +102,11 @@ _GLOBAL(load_up_altivec) /* Save VMX state to last_task_used_altivec's THREAD struct */ toreal(r4) addi r4,r4,THREAD - SAVE_32VRS(0,r5,r4) + addi r6,r4,THREAD_VRSTATE + SAVE_32VRS(0,r5,r6) mfvscr vr0 - li r10,THREAD_VSCR - stvx vr0,r10,r4 + li r10,VRSTATE_VSCR + stvx vr0,r10,r6 /* Disable VMX for last_task_used_altivec */ PPC_LL r5,PT_REGS(r4) toreal(r5) @@ -74,12 +138,13 @@ _GLOBAL(load_up_altivec) oris r12,r12,MSR_VEC@h std r12,_MSR(r1) #endif + addi r6,r5,THREAD_VRSTATE li r4,1 - li r10,THREAD_VSCR + li r10,VRSTATE_VSCR stw r4,THREAD_USED_VR(r5) - lvx vr0,r10,r5 + lvx vr0,r10,r6 mtvscr vr0 - REST_32VRS(0,r4,r5) + REST_32VRS(0,r4,r6) #ifndef CONFIG_SMP /* Update last_task_used_altivec to 'current' */ subi r4,r5,THREAD /* Back to 'current' */ @@ -114,12 +179,16 @@ _GLOBAL(giveup_altivec) PPC_LCMPI 0,r3,0 beqlr /* if no previous owner, done */ addi r3,r3,THREAD /* want THREAD of task */ + PPC_LL r7,THREAD_VRSAVEAREA(r3) PPC_LL r5,PT_REGS(r3) - PPC_LCMPI 0,r5,0 - SAVE_32VRS(0,r4,r3) + PPC_LCMPI 0,r7,0 + bne 2f + addi r7,r3,THREAD_VRSTATE +2: PPC_LCMPI 0,r5,0 + SAVE_32VRS(0,r4,r7) mfvscr vr0 - li r4,THREAD_VSCR - stvx vr0,r4,r3 + li r4,VRSTATE_VSCR + stvx vr0,r4,r7 beq 1f PPC_LL r4,_MSR-STACK_FRAME_OVERHEAD(r5) #ifdef CONFIG_VSX diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 536016d792b..904c66128fa 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -518,16 +518,18 @@ static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page, struct dma_attrs *attrs) { struct vio_dev *viodev = to_vio_dev(dev); + struct iommu_table *tbl; dma_addr_t ret = DMA_ERROR_CODE; - if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) { + tbl = get_iommu_table_base(dev); + if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) { atomic_inc(&viodev->cmo.allocs_failed); return ret; } ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs); if (unlikely(dma_mapping_error(dev, ret))) { - vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE)); + vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); atomic_inc(&viodev->cmo.allocs_failed); } @@ -540,10 +542,12 @@ static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, struct dma_attrs *attrs) { struct vio_dev *viodev = to_vio_dev(dev); + struct iommu_table *tbl; + tbl = get_iommu_table_base(dev); dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs); - vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE)); + vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); } static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, @@ -551,12 +555,14 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, struct dma_attrs *attrs) { struct vio_dev *viodev = to_vio_dev(dev); + struct iommu_table *tbl; struct scatterlist *sgl; int ret, count = 0; size_t alloc_size = 0; + tbl = get_iommu_table_base(dev); for (sgl = sglist; count < nelems; count++, sgl++) - alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE); + alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl)); if (vio_cmo_alloc(viodev, alloc_size)) { atomic_inc(&viodev->cmo.allocs_failed); @@ -572,7 +578,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, } for (sgl = sglist, count = 0; count < ret; count++, sgl++) - alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE); + alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); if (alloc_size) vio_cmo_dealloc(viodev, alloc_size); @@ -585,12 +591,14 @@ static void vio_dma_iommu_unmap_sg(struct device *dev, struct dma_attrs *attrs) { struct vio_dev *viodev = to_vio_dev(dev); + struct iommu_table *tbl; struct scatterlist *sgl; size_t alloc_size = 0; int count = 0; + tbl = get_iommu_table_base(dev); for (sgl = sglist; count < nelems; count++, sgl++) - alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE); + alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs); @@ -706,11 +714,14 @@ static int vio_cmo_bus_probe(struct vio_dev *viodev) { struct vio_cmo_dev_entry *dev_ent; struct device *dev = &viodev->dev; + struct iommu_table *tbl; struct vio_driver *viodrv = to_vio_driver(dev->driver); unsigned long flags; size_t size; bool dma_capable = false; + tbl = get_iommu_table_base(dev); + /* A device requires entitlement if it has a DMA window property */ switch (viodev->family) { case VDEVICE: @@ -736,7 +747,8 @@ static int vio_cmo_bus_probe(struct vio_dev *viodev) return -EINVAL; } - viodev->cmo.desired = IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev)); + viodev->cmo.desired = + IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev), tbl); if (viodev->cmo.desired < VIO_CMO_MIN_ENT) viodev->cmo.desired = VIO_CMO_MIN_ENT; size = VIO_CMO_MIN_ENT; @@ -997,21 +1009,36 @@ static struct device_attribute vio_cmo_dev_attrs[] = { /* sysfs bus functions and data structures for CMO */ #define viobus_cmo_rd_attr(name) \ -static ssize_t \ -viobus_cmo_##name##_show(struct bus_type *bt, char *buf) \ +static ssize_t cmo_##name##_show(struct bus_type *bt, char *buf) \ { \ return sprintf(buf, "%lu\n", vio_cmo.name); \ -} +} \ +static BUS_ATTR_RO(cmo_##name) #define viobus_cmo_pool_rd_attr(name, var) \ static ssize_t \ -viobus_cmo_##name##_pool_show_##var(struct bus_type *bt, char *buf) \ +cmo_##name##_##var##_show(struct bus_type *bt, char *buf) \ { \ return sprintf(buf, "%lu\n", vio_cmo.name.var); \ +} \ +static BUS_ATTR_RO(cmo_##name##_##var) + +viobus_cmo_rd_attr(entitled); +viobus_cmo_rd_attr(spare); +viobus_cmo_rd_attr(min); +viobus_cmo_rd_attr(desired); +viobus_cmo_rd_attr(curr); +viobus_cmo_pool_rd_attr(reserve, size); +viobus_cmo_pool_rd_attr(excess, size); +viobus_cmo_pool_rd_attr(excess, free); + +static ssize_t cmo_high_show(struct bus_type *bt, char *buf) +{ + return sprintf(buf, "%lu\n", vio_cmo.high); } -static ssize_t viobus_cmo_high_reset(struct bus_type *bt, const char *buf, - size_t count) +static ssize_t cmo_high_store(struct bus_type *bt, const char *buf, + size_t count) { unsigned long flags; @@ -1021,35 +1048,26 @@ static ssize_t viobus_cmo_high_reset(struct bus_type *bt, const char *buf, return count; } - -viobus_cmo_rd_attr(entitled); -viobus_cmo_pool_rd_attr(reserve, size); -viobus_cmo_pool_rd_attr(excess, size); -viobus_cmo_pool_rd_attr(excess, free); -viobus_cmo_rd_attr(spare); -viobus_cmo_rd_attr(min); -viobus_cmo_rd_attr(desired); -viobus_cmo_rd_attr(curr); -viobus_cmo_rd_attr(high); - -static struct bus_attribute vio_cmo_bus_attrs[] = { - __ATTR(cmo_entitled, S_IRUGO, viobus_cmo_entitled_show, NULL), - __ATTR(cmo_reserve_size, S_IRUGO, viobus_cmo_reserve_pool_show_size, NULL), - __ATTR(cmo_excess_size, S_IRUGO, viobus_cmo_excess_pool_show_size, NULL), - __ATTR(cmo_excess_free, S_IRUGO, viobus_cmo_excess_pool_show_free, NULL), - __ATTR(cmo_spare, S_IRUGO, viobus_cmo_spare_show, NULL), - __ATTR(cmo_min, S_IRUGO, viobus_cmo_min_show, NULL), - __ATTR(cmo_desired, S_IRUGO, viobus_cmo_desired_show, NULL), - __ATTR(cmo_curr, S_IRUGO, viobus_cmo_curr_show, NULL), - __ATTR(cmo_high, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH, - viobus_cmo_high_show, viobus_cmo_high_reset), - __ATTR_NULL +static BUS_ATTR_RW(cmo_high); + +static struct attribute *vio_bus_attrs[] = { + &bus_attr_cmo_entitled.attr, + &bus_attr_cmo_spare.attr, + &bus_attr_cmo_min.attr, + &bus_attr_cmo_desired.attr, + &bus_attr_cmo_curr.attr, + &bus_attr_cmo_high.attr, + &bus_attr_cmo_reserve_size.attr, + &bus_attr_cmo_excess_size.attr, + &bus_attr_cmo_excess_free.attr, + NULL, }; +ATTRIBUTE_GROUPS(vio_bus); static void vio_cmo_sysfs_init(void) { vio_bus_type.dev_attrs = vio_cmo_dev_attrs; - vio_bus_type.bus_attrs = vio_cmo_bus_attrs; + vio_bus_type.bus_groups = vio_bus_groups; } #else /* CONFIG_PPC_SMLPAR */ int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; } @@ -1153,7 +1171,7 @@ EXPORT_SYMBOL(vio_h_cop_sync); static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) { - const unsigned char *dma_window; + const __be32 *dma_window; struct iommu_table *tbl; unsigned long offset, size; @@ -1170,9 +1188,10 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) &tbl->it_index, &offset, &size); /* TCE table size - measured in tce entries */ - tbl->it_size = size >> IOMMU_PAGE_SHIFT; + tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; + tbl->it_size = size >> tbl->it_page_shift; /* offset for VIO should always be 0 */ - tbl->it_offset = offset >> IOMMU_PAGE_SHIFT; + tbl->it_offset = offset >> tbl->it_page_shift; tbl->it_busno = 0; tbl->it_type = TCE_VB; tbl->it_blocksize = 16; @@ -1312,8 +1331,7 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) { struct vio_dev *viodev; struct device_node *parent_node; - const unsigned int *unit_address; - const unsigned int *pfo_resid = NULL; + const __be32 *prop; enum vio_dev_family family; const char *of_node_name = of_node->name ? of_node->name : "<unknown>"; @@ -1360,6 +1378,8 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) /* we need the 'device_type' property, in order to match with drivers */ viodev->family = family; if (viodev->family == VDEVICE) { + unsigned int unit_address; + if (of_node->type != NULL) viodev->type = of_node->type; else { @@ -1368,24 +1388,24 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) goto out; } - unit_address = of_get_property(of_node, "reg", NULL); - if (unit_address == NULL) { + prop = of_get_property(of_node, "reg", NULL); + if (prop == NULL) { pr_warn("%s: node %s missing 'reg'\n", __func__, of_node_name); goto out; } - dev_set_name(&viodev->dev, "%x", *unit_address); + unit_address = of_read_number(prop, 1); + dev_set_name(&viodev->dev, "%x", unit_address); viodev->irq = irq_of_parse_and_map(of_node, 0); - viodev->unit_address = *unit_address; + viodev->unit_address = unit_address; } else { /* PFO devices need their resource_id for submitting COP_OPs * This is an optional field for devices, but is required when * performing synchronous ops */ - pfo_resid = of_get_property(of_node, "ibm,resource-id", NULL); - if (pfo_resid != NULL) - viodev->resource_id = *pfo_resid; + prop = of_get_property(of_node, "ibm,resource-id", NULL); + if (prop != NULL) + viodev->resource_id = of_read_number(prop, 1); - unit_address = NULL; dev_set_name(&viodev->dev, "%s", of_node_name); viodev->type = of_node_name; viodev->irq = 0; @@ -1412,8 +1432,8 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) /* needed to ensure proper operation of coherent allocations * later, in case driver doesn't set it explicitly */ - dma_set_mask(&viodev->dev, DMA_BIT_MASK(64)); - dma_set_coherent_mask(&viodev->dev, DMA_BIT_MASK(64)); + viodev->dev.coherent_dma_mask = DMA_BIT_MASK(64); + viodev->dev.dma_mask = &viodev->dev.coherent_dma_mask; } /* register with generic device framework */ @@ -1529,11 +1549,15 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, const char *cp; dn = dev->of_node; - if (!dn) - return -ENODEV; + if (!dn) { + strcpy(buf, "\n"); + return strlen(buf); + } cp = of_get_property(dn, "compatible", NULL); - if (!cp) - return -ENODEV; + if (!cp) { + strcpy(buf, "\n"); + return strlen(buf); + } return sprintf(buf, "vio:T%sS%s\n", vio_dev->type, cp); } @@ -1622,7 +1646,6 @@ static struct vio_dev *vio_find_name(const char *name) */ struct vio_dev *vio_find_node(struct device_node *vnode) { - const uint32_t *unit_address; char kobj_name[20]; struct device_node *vnode_parent; const char *dev_type; @@ -1638,10 +1661,13 @@ struct vio_dev *vio_find_node(struct device_node *vnode) /* construct the kobject name from the device node */ if (!strcmp(dev_type, "vdevice")) { - unit_address = of_get_property(vnode, "reg", NULL); - if (!unit_address) + const __be32 *prop; + + prop = of_get_property(vnode, "reg", NULL); + if (!prop) return NULL; - snprintf(kobj_name, sizeof(kobj_name), "%x", *unit_address); + snprintf(kobj_name, sizeof(kobj_name), "%x", + (uint32_t)of_read_number(prop, 1)); } else if (!strcmp(dev_type, "ibm,platform-facilities")) snprintf(kobj_name, sizeof(kobj_name), "%s", vnode->name); else diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 65d1c08cf09..f096e72262f 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -38,9 +38,6 @@ jiffies = jiffies_64 + 4; #endif SECTIONS { - . = 0; - reloc_start = .; - . = KERNELBASE; /* @@ -218,6 +215,11 @@ SECTIONS .got : AT(ADDR(.got) - LOAD_OFFSET) { __toc_start = .; +#ifndef CONFIG_RELOCATABLE + __prom_init_toc_start = .; + arch/powerpc/kernel/prom_init.o*(.toc .got) + __prom_init_toc_end = .; +#endif *(.got) *(.toc) } |
