aboutsummaryrefslogtreecommitdiff
path: root/arch/powerpc/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/kernel')
-rw-r--r--arch/powerpc/kernel/Makefile54
-rw-r--r--arch/powerpc/kernel/align.c275
-rw-r--r--arch/powerpc/kernel/asm-offsets.c398
-rw-r--r--arch/powerpc/kernel/btext.c258
-rw-r--r--arch/powerpc/kernel/cacheinfo.c69
-rw-r--r--arch/powerpc/kernel/clock.c82
-rw-r--r--arch/powerpc/kernel/cpu_setup_6xx.S40
-rw-r--r--arch/powerpc/kernel/cpu_setup_fsl_booke.S154
-rw-r--r--arch/powerpc/kernel/cpu_setup_power.S182
-rw-r--r--arch/powerpc/kernel/cpu_setup_ppc970.S26
-rw-r--r--arch/powerpc/kernel/cputable.c316
-rw-r--r--arch/powerpc/kernel/crash.c329
-rw-r--r--arch/powerpc/kernel/crash_dump.c69
-rw-r--r--arch/powerpc/kernel/dbell.c69
-rw-r--r--arch/powerpc/kernel/dma-iommu.c47
-rw-r--r--arch/powerpc/kernel/dma-swiotlb.c43
-rw-r--r--arch/powerpc/kernel/dma.c98
-rw-r--r--arch/powerpc/kernel/e500-pmu.c129
-rw-r--r--arch/powerpc/kernel/eeh.c1183
-rw-r--r--arch/powerpc/kernel/eeh_cache.c310
-rw-r--r--arch/powerpc/kernel/eeh_dev.c112
-rw-r--r--arch/powerpc/kernel/eeh_driver.c870
-rw-r--r--arch/powerpc/kernel/eeh_event.c196
-rw-r--r--arch/powerpc/kernel/eeh_pe.c887
-rw-r--r--arch/powerpc/kernel/eeh_sysfs.c98
-rw-r--r--arch/powerpc/kernel/entry_32.S132
-rw-r--r--arch/powerpc/kernel/entry_64.S717
-rw-r--r--arch/powerpc/kernel/epapr_hcalls.S55
-rw-r--r--arch/powerpc/kernel/epapr_paravirt.c85
-rw-r--r--arch/powerpc/kernel/exceptions-64e.S974
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S1745
-rw-r--r--arch/powerpc/kernel/fadump.c1316
-rw-r--r--arch/powerpc/kernel/firmware.c3
-rw-r--r--arch/powerpc/kernel/fpu.S101
-rw-r--r--arch/powerpc/kernel/fsl_booke_entry_mapping.S2
-rw-r--r--arch/powerpc/kernel/ftrace.c271
-rw-r--r--arch/powerpc/kernel/head_32.S24
-rw-r--r--arch/powerpc/kernel/head_40x.S96
-rw-r--r--arch/powerpc/kernel/head_44x.S209
-rw-r--r--arch/powerpc/kernel/head_64.S306
-rw-r--r--arch/powerpc/kernel/head_8xx.S25
-rw-r--r--arch/powerpc/kernel/head_booke.h136
-rw-r--r--arch/powerpc/kernel/head_fsl_booke.S622
-rw-r--r--arch/powerpc/kernel/hw_breakpoint.c88
-rw-r--r--arch/powerpc/kernel/ibmebus.c447
-rw-r--r--arch/powerpc/kernel/idle.c95
-rw-r--r--arch/powerpc/kernel/idle_6xx.S4
-rw-r--r--arch/powerpc/kernel/idle_book3e.S61
-rw-r--r--arch/powerpc/kernel/idle_e500.S16
-rw-r--r--arch/powerpc/kernel/idle_power4.S47
-rw-r--r--arch/powerpc/kernel/idle_power7.S187
-rw-r--r--arch/powerpc/kernel/init_task.c29
-rw-r--r--arch/powerpc/kernel/io-workarounds.c212
-rw-r--r--arch/powerpc/kernel/io.c5
-rw-r--r--arch/powerpc/kernel/iomap.c43
-rw-r--r--arch/powerpc/kernel/iommu.c721
-rw-r--r--arch/powerpc/kernel/irq.c1221
-rw-r--r--arch/powerpc/kernel/isa-bridge.c16
-rw-r--r--arch/powerpc/kernel/jump_label.c25
-rw-r--r--arch/powerpc/kernel/kgdb.c48
-rw-r--r--arch/powerpc/kernel/kprobes.c38
-rw-r--r--arch/powerpc/kernel/kvm.c394
-rw-r--r--arch/powerpc/kernel/kvm_emul.S124
-rw-r--r--arch/powerpc/kernel/l2cr_6xx.S2
-rw-r--r--arch/powerpc/kernel/legacy_serial.c134
-rw-r--r--arch/powerpc/kernel/lparcfg.c816
-rw-r--r--arch/powerpc/kernel/machine_kexec.c103
-rw-r--r--arch/powerpc/kernel/machine_kexec_32.c4
-rw-r--r--arch/powerpc/kernel/machine_kexec_64.c36
-rw-r--r--arch/powerpc/kernel/mce.c352
-rw-r--r--arch/powerpc/kernel/mce_power.c313
-rw-r--r--arch/powerpc/kernel/misc.S13
-rw-r--r--arch/powerpc/kernel/misc_32.S461
-rw-r--r--arch/powerpc/kernel/misc_64.S185
-rw-r--r--arch/powerpc/kernel/module.c22
-rw-r--r--arch/powerpc/kernel/module_32.c25
-rw-r--r--arch/powerpc/kernel/module_64.c319
-rw-r--r--arch/powerpc/kernel/mpc7450-pmu.c417
-rw-r--r--arch/powerpc/kernel/nvram_64.c537
-rw-r--r--arch/powerpc/kernel/of_platform.c26
-rw-r--r--arch/powerpc/kernel/paca.c121
-rw-r--r--arch/powerpc/kernel/pci-common.c681
-rw-r--r--arch/powerpc/kernel/pci-hotplug.c109
-rw-r--r--arch/powerpc/kernel/pci_32.c186
-rw-r--r--arch/powerpc/kernel/pci_64.c91
-rw-r--r--arch/powerpc/kernel/pci_dn.c80
-rw-r--r--arch/powerpc/kernel/pci_of_scan.c127
-rw-r--r--arch/powerpc/kernel/perf_callchain.c478
-rw-r--r--arch/powerpc/kernel/perf_event.c1386
-rw-r--r--arch/powerpc/kernel/perf_event_fsl_emb.c687
-rw-r--r--arch/powerpc/kernel/pmc.c3
-rw-r--r--arch/powerpc/kernel/power4-pmu.c616
-rw-r--r--arch/powerpc/kernel/power5+-pmu.c685
-rw-r--r--arch/powerpc/kernel/power5-pmu.c624
-rw-r--r--arch/powerpc/kernel/power6-pmu.c547
-rw-r--r--arch/powerpc/kernel/power7-pmu.c372
-rw-r--r--arch/powerpc/kernel/ppc32.h77
-rw-r--r--arch/powerpc/kernel/ppc970-pmu.c497
-rw-r--r--arch/powerpc/kernel/ppc_ksyms.c45
-rw-r--r--arch/powerpc/kernel/ppc_save_regs.S3
-rw-r--r--arch/powerpc/kernel/proc_powerpc.c37
-rw-r--r--arch/powerpc/kernel/process.c919
-rw-r--r--arch/powerpc/kernel/prom.c595
-rw-r--r--arch/powerpc/kernel/prom_init.c1199
-rw-r--r--arch/powerpc/kernel/prom_init_check.sh3
-rw-r--r--arch/powerpc/kernel/prom_parse.c140
-rw-r--r--arch/powerpc/kernel/ptrace.c524
-rw-r--r--arch/powerpc/kernel/ptrace32.c75
-rw-r--r--arch/powerpc/kernel/reloc_32.S209
-rw-r--r--arch/powerpc/kernel/reloc_64.S5
-rw-r--r--arch/powerpc/kernel/rtas-rtc.c29
-rw-r--r--arch/powerpc/kernel/rtas.c257
-rw-r--r--arch/powerpc/kernel/rtas_flash.c574
-rw-r--r--arch/powerpc/kernel/rtas_pci.c90
-rw-r--r--arch/powerpc/kernel/rtasd.c71
-rw-r--r--arch/powerpc/kernel/setup-common.c128
-rw-r--r--arch/powerpc/kernel/setup.h9
-rw-r--r--arch/powerpc/kernel/setup_32.c45
-rw-r--r--arch/powerpc/kernel/setup_64.c256
-rw-r--r--arch/powerpc/kernel/signal.c108
-rw-r--r--arch/powerpc/kernel/signal.h15
-rw-r--r--arch/powerpc/kernel/signal_32.c911
-rw-r--r--arch/powerpc/kernel/signal_64.c444
-rw-r--r--arch/powerpc/kernel/smp-tbsync.c11
-rw-r--r--arch/powerpc/kernel/smp.c549
-rw-r--r--arch/powerpc/kernel/softemu8xx.c200
-rw-r--r--arch/powerpc/kernel/stacktrace.c2
-rw-r--r--arch/powerpc/kernel/swsusp.c5
-rw-r--r--arch/powerpc/kernel/swsusp_32.S2
-rw-r--r--arch/powerpc/kernel/swsusp_64.c2
-rw-r--r--arch/powerpc/kernel/swsusp_asm64.S49
-rw-r--r--arch/powerpc/kernel/swsusp_booke.S40
-rw-r--r--arch/powerpc/kernel/sys_ppc32.c500
-rw-r--r--arch/powerpc/kernel/syscalls.c9
-rw-r--r--arch/powerpc/kernel/sysfs.c750
-rw-r--r--arch/powerpc/kernel/systbl.S18
-rw-r--r--arch/powerpc/kernel/time.c428
-rw-r--r--arch/powerpc/kernel/tm.S481
-rw-r--r--arch/powerpc/kernel/traps.c787
-rw-r--r--arch/powerpc/kernel/udbg.c50
-rw-r--r--arch/powerpc/kernel/udbg_16550.c334
-rw-r--r--arch/powerpc/kernel/uprobes.c207
-rw-r--r--arch/powerpc/kernel/vdso.c55
-rw-r--r--arch/powerpc/kernel/vdso32/Makefile4
-rw-r--r--arch/powerpc/kernel/vdso32/getcpu.S45
-rw-r--r--arch/powerpc/kernel/vdso32/gettimeofday.S32
-rw-r--r--arch/powerpc/kernel/vdso32/sigtramp.S2
-rw-r--r--arch/powerpc/kernel/vdso32/vdso32.lds.S8
-rw-r--r--arch/powerpc/kernel/vdso32/vdso32_wrapper.S3
-rw-r--r--arch/powerpc/kernel/vdso64/Makefile2
-rw-r--r--arch/powerpc/kernel/vdso64/getcpu.S45
-rw-r--r--arch/powerpc/kernel/vdso64/gettimeofday.S26
-rw-r--r--arch/powerpc/kernel/vdso64/sigtramp.S18
-rw-r--r--arch/powerpc/kernel/vdso64/vdso64.lds.S6
-rw-r--r--arch/powerpc/kernel/vdso64/vdso64_wrapper.S3
-rw-r--r--arch/powerpc/kernel/vecemu.c6
-rw-r--r--arch/powerpc/kernel/vector.S104
-rw-r--r--arch/powerpc/kernel/vio.c510
-rw-r--r--arch/powerpc/kernel/vmlinux.lds.S23
159 files changed, 23059 insertions, 17014 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 36c30f31ec9..670c312d914 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -2,12 +2,13 @@
# Makefile for the linux kernel.
#
+CFLAGS_prom.o = -I$(src)/../../../scripts/dtc/libfdt
CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"'
subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
ifeq ($(CONFIG_PPC64),y)
-CFLAGS_prom_init.o += -mno-minimal-toc
+CFLAGS_prom_init.o += $(NO_MINIMAL_TOC)
endif
ifeq ($(CONFIG_PPC32),y)
CFLAGS_prom_init.o += -fPIC
@@ -28,21 +29,25 @@ endif
obj-y := cputable.o ptrace.o syscalls.o \
irq.o align.o signal_32.o pmc.o vdso.o \
- init_task.o process.o systbl.o idle.o \
- signal.o sysfs.o cacheinfo.o
-obj-y += vdso32/
+ process.o systbl.o idle.o \
+ signal.o sysfs.o cacheinfo.o time.o \
+ prom.o traps.o setup-common.o \
+ udbg.o misc.o io.o dma.o \
+ misc_$(CONFIG_WORD_SIZE).o vdso32/
obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \
signal_64.o ptrace32.o \
paca.o nvram_64.o firmware.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o
+obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o
+obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o
obj64-$(CONFIG_RELOCATABLE) += reloc_64.o
obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o
obj-$(CONFIG_PPC64) += vdso64/
obj-$(CONFIG_ALTIVEC) += vecemu.o
obj-$(CONFIG_PPC_970_NAP) += idle_power4.o
+obj-$(CONFIG_PPC_P7_NAP) += idle_power7.o
obj-$(CONFIG_PPC_OF) += of_platform.o prom_parse.o
-obj-$(CONFIG_PPC_CLOCK) += clock.o
procfs-y := proc_powerpc.o
obj-$(CONFIG_PROC_FS) += $(procfs-y)
rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI) := rtas_pci.o
@@ -50,11 +55,13 @@ obj-$(CONFIG_PPC_RTAS) += rtas.o rtas-rtc.o $(rtaspci-y-y)
obj-$(CONFIG_PPC_RTAS_DAEMON) += rtasd.o
obj-$(CONFIG_RTAS_FLASH) += rtas_flash.o
obj-$(CONFIG_RTAS_PROC) += rtas-proc.o
-obj-$(CONFIG_LPARCFG) += lparcfg.o
obj-$(CONFIG_IBMVIO) += vio.o
obj-$(CONFIG_IBMEBUS) += ibmebus.o
+obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
+ eeh_driver.o eeh_event.o eeh_sysfs.o
obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
+obj-$(CONFIG_FA_DUMP) += fadump.o
ifeq ($(CONFIG_PPC32),y)
obj-$(CONFIG_E500) += idle_e500.o
endif
@@ -69,33 +76,32 @@ endif
obj64-$(CONFIG_HIBERNATION) += swsusp_asm64.o
obj-$(CONFIG_MODULES) += module.o module_$(CONFIG_WORD_SIZE).o
obj-$(CONFIG_44x) += cpu_setup_44x.o
-obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o dbell.o
-obj-$(CONFIG_PPC_BOOK3E_64) += dbell.o
+obj-$(CONFIG_PPC_FSL_BOOK3E) += cpu_setup_fsl_booke.o
+obj-$(CONFIG_PPC_DOORBELL) += dbell.o
+obj-$(CONFIG_JUMP_LABEL) += jump_label.o
extra-y := head_$(CONFIG_WORD_SIZE).o
-extra-$(CONFIG_PPC_BOOK3E_32) := head_new_booke.o
extra-$(CONFIG_40x) := head_40x.o
extra-$(CONFIG_44x) := head_44x.o
extra-$(CONFIG_FSL_BOOKE) := head_fsl_booke.o
extra-$(CONFIG_8xx) := head_8xx.o
extra-y += vmlinux.lds
-obj-y += time.o prom.o traps.o setup-common.o \
- udbg.o misc.o io.o dma.o \
- misc_$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_RELOCATABLE_PPC32) += reloc_32.o
+
obj-$(CONFIG_PPC32) += entry_32.o setup_32.o
obj-$(CONFIG_PPC64) += dma-iommu.o iommu.o
obj-$(CONFIG_KGDB) += kgdb.o
-obj-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init.o
obj-$(CONFIG_MODULES) += ppc_ksyms.o
obj-$(CONFIG_BOOTX_TEXT) += btext.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_UPROBES) += uprobes.o
obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o
-pci64-$(CONFIG_PPC64) += pci_dn.o isa-bridge.o
+pci64-$(CONFIG_PPC64) += pci_dn.o pci-hotplug.o isa-bridge.o
obj-$(CONFIG_PCI) += pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \
pci-common.o pci_of_scan.o
obj-$(CONFIG_PCI_MSI) += msi.o
@@ -104,24 +110,18 @@ obj-$(CONFIG_KEXEC) += machine_kexec.o crash.o \
obj-$(CONFIG_AUDIT) += audit.o
obj64-$(CONFIG_AUDIT) += compat_audit.o
+obj-$(CONFIG_PPC_IO_WORKAROUNDS) += io-workarounds.o
+
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
-obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o
+obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
-obj-$(CONFIG_PPC_PERF_CTRS) += perf_event.o
-obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \
- power5+-pmu.o power6-pmu.o power7-pmu.o
-obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o
-
-obj-$(CONFIG_FSL_EMB_PERF_EVENT) += perf_event_fsl_emb.o
-obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o
-
-obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
-
-ifneq ($(CONFIG_PPC_INDIRECT_IO),y)
+ifneq ($(CONFIG_PPC_INDIRECT_PIO),y)
obj-y += iomap.o
endif
+obj64-$(CONFIG_PPC_TRANSACTIONAL_MEM) += tm.o
+
obj-$(CONFIG_PPC64) += $(obj64-y)
obj-$(CONFIG_PPC32) += $(obj32-y)
@@ -129,6 +129,7 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),)
obj-y += ppc_save_regs.o
endif
+obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o
obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o
# Disable GCOV in odd or sensitive code
@@ -141,6 +142,7 @@ GCOV_PROFILE_kprobes.o := n
extra-$(CONFIG_PPC_FPU) += fpu.o
extra-$(CONFIG_ALTIVEC) += vector.o
extra-$(CONFIG_PPC64) += entry_64.o
+extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init.o
extra-y += systbl_chk.i
$(obj)/systbl.o: systbl_chk
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index 8184ee97e48..34f55524d45 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -21,18 +21,17 @@
#include <linux/mm.h>
#include <asm/processor.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <asm/cache.h>
#include <asm/cputable.h>
#include <asm/emulated_ops.h>
+#include <asm/switch_to.h>
+#include <asm/disassemble.h>
struct aligninfo {
unsigned char len;
unsigned char flags;
};
-#define IS_XFORM(inst) (((inst) >> 26) == 31)
-#define IS_DSFORM(inst) (((inst) >> 26) >= 56)
#define INVALID { 0, 0 }
@@ -54,8 +53,6 @@ struct aligninfo {
/* DSISR bits reported for a DCBZ instruction: */
#define DCBZ 0x5f /* 8xx/82xx dcbz faults when cache not enabled */
-#define SWAP(a, b) (t = (a), (a) = (b), (b) = t)
-
/*
* The PowerPC stores certain bits of the instruction that caused the
* alignment exception in the DSISR register. This array maps those
@@ -75,7 +72,7 @@ static struct aligninfo aligninfo[128] = {
{ 8, LD+F }, /* 00 0 1001: lfd */
{ 4, ST+F+S }, /* 00 0 1010: stfs */
{ 8, ST+F }, /* 00 0 1011: stfd */
- INVALID, /* 00 0 1100 */
+ { 16, LD }, /* 00 0 1100: lq */
{ 8, LD }, /* 00 0 1101: ld/ldu/lwa */
INVALID, /* 00 0 1110 */
{ 8, ST }, /* 00 0 1111: std/stdu */
@@ -142,7 +139,7 @@ static struct aligninfo aligninfo[128] = {
{ 2, LD+SW }, /* 10 0 1100: lhbrx */
{ 4, LD+SE }, /* 10 0 1101 lwa */
{ 2, ST+SW }, /* 10 0 1110: sthbrx */
- INVALID, /* 10 0 1111 */
+ { 16, ST }, /* 10 0 1111: stq */
INVALID, /* 10 1 0000 */
INVALID, /* 10 1 0001 */
INVALID, /* 10 1 0010 */
@@ -194,37 +191,6 @@ static struct aligninfo aligninfo[128] = {
};
/*
- * Create a DSISR value from the instruction
- */
-static inline unsigned make_dsisr(unsigned instr)
-{
- unsigned dsisr;
-
-
- /* bits 6:15 --> 22:31 */
- dsisr = (instr & 0x03ff0000) >> 16;
-
- if (IS_XFORM(instr)) {
- /* bits 29:30 --> 15:16 */
- dsisr |= (instr & 0x00000006) << 14;
- /* bit 25 --> 17 */
- dsisr |= (instr & 0x00000040) << 8;
- /* bits 21:24 --> 18:21 */
- dsisr |= (instr & 0x00000780) << 3;
- } else {
- /* bit 5 --> 17 */
- dsisr |= (instr & 0x04000000) >> 12;
- /* bits 1: 4 --> 18:21 */
- dsisr |= (instr & 0x78000000) >> 17;
- /* bits 30:31 --> 12:13 */
- if (IS_DSFORM(instr))
- dsisr |= (instr & 0x00000003) << 18;
- }
-
- return dsisr;
-}
-
-/*
* The dcbz (data cache block zero) instruction
* gives an alignment fault if used on non-cacheable
* memory. We handle the fault mainly for the
@@ -256,11 +222,17 @@ static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr)
* bottom 4 bytes of each register, and the loads clear the
* top 4 bytes of the affected register.
*/
+#ifdef __BIG_ENDIAN__
#ifdef CONFIG_PPC64
#define REG_BYTE(rp, i) *((u8 *)((rp) + ((i) >> 2)) + ((i) & 3) + 4)
#else
#define REG_BYTE(rp, i) *((u8 *)(rp) + (i))
#endif
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define REG_BYTE(rp, i) (*(((u8 *)((rp) + ((i)>>2)) + ((i)&3))))
+#endif
#define SWIZ_PTR(p) ((unsigned char __user *)((p) ^ swiz))
@@ -305,6 +277,15 @@ static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr,
nb0 = nb + reg * 4 - 128;
nb = 128 - reg * 4;
}
+#ifdef __LITTLE_ENDIAN__
+ /*
+ * String instructions are endian neutral but the code
+ * below is not. Force byte swapping on so that the
+ * effects of swizzling are undone in the load/store
+ * loops below.
+ */
+ flags ^= SW;
+#endif
} else {
/* lwm, stmw */
nb = (32 - reg) * 4;
@@ -372,8 +353,6 @@ static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg,
char *ptr1 = (char *) &current->thread.TS_FPR(reg+1);
int i, ret, sw = 0;
- if (!(flags & F))
- return 0;
if (reg & 1)
return 0; /* invalid form: FRS/FRT must be even */
if (flags & SW)
@@ -393,6 +372,34 @@ static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg,
return 1; /* exception handled and fixed up */
}
+#ifdef CONFIG_PPC64
+static int emulate_lq_stq(struct pt_regs *regs, unsigned char __user *addr,
+ unsigned int reg, unsigned int flags)
+{
+ char *ptr0 = (char *)&regs->gpr[reg];
+ char *ptr1 = (char *)&regs->gpr[reg+1];
+ int i, ret, sw = 0;
+
+ if (reg & 1)
+ return 0; /* invalid form: GPR must be even */
+ if (flags & SW)
+ sw = 7;
+ ret = 0;
+ for (i = 0; i < 8; ++i) {
+ if (!(flags & ST)) {
+ ret |= __get_user(ptr0[i^sw], addr + i);
+ ret |= __get_user(ptr1[i^sw], addr + i + 8);
+ } else {
+ ret |= __put_user(ptr0[i^sw], addr + i);
+ ret |= __put_user(ptr1[i^sw], addr + i + 8);
+ }
+ }
+ if (ret)
+ return -EFAULT;
+ return 1; /* exception handled and fixed up */
+}
+#endif /* CONFIG_PPC64 */
+
#ifdef CONFIG_SPE
static struct aligninfo spe_aligninfo[32] = {
@@ -458,7 +465,7 @@ static struct aligninfo spe_aligninfo[32] = {
static int emulate_spe(struct pt_regs *regs, unsigned int reg,
unsigned int instr)
{
- int t, ret;
+ int ret;
union {
u64 ll;
u32 w[2];
@@ -581,24 +588,18 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg,
if (flags & SW) {
switch (flags & 0xf0) {
case E8:
- SWAP(data.v[0], data.v[7]);
- SWAP(data.v[1], data.v[6]);
- SWAP(data.v[2], data.v[5]);
- SWAP(data.v[3], data.v[4]);
+ data.ll = swab64(data.ll);
break;
case E4:
-
- SWAP(data.v[0], data.v[3]);
- SWAP(data.v[1], data.v[2]);
- SWAP(data.v[4], data.v[7]);
- SWAP(data.v[5], data.v[6]);
+ data.w[0] = swab32(data.w[0]);
+ data.w[1] = swab32(data.w[1]);
break;
/* Its half word endian */
default:
- SWAP(data.v[0], data.v[1]);
- SWAP(data.v[2], data.v[3]);
- SWAP(data.v[4], data.v[5]);
- SWAP(data.v[6], data.v[7]);
+ data.h[0] = swab16(data.h[0]);
+ data.h[1] = swab16(data.h[1]);
+ data.h[2] = swab16(data.h[2]);
+ data.h[3] = swab16(data.h[3]);
break;
}
}
@@ -651,17 +652,38 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg,
int sw = 0;
int i, j;
+ /* userland only */
+ if (unlikely(!user_mode(regs)))
+ return 0;
+
flush_vsx_to_thread(current);
if (reg < 32)
- ptr = (char *) &current->thread.TS_FPR(reg);
+ ptr = (char *) &current->thread.fp_state.fpr[reg][0];
else
- ptr = (char *) &current->thread.vr[reg - 32];
+ ptr = (char *) &current->thread.vr_state.vr[reg - 32];
lptr = (unsigned long *) ptr;
+#ifdef __LITTLE_ENDIAN__
+ if (flags & SW) {
+ elsize = length;
+ sw = length-1;
+ } else {
+ /*
+ * The elements are BE ordered, even in LE mode, so process
+ * them in reverse order.
+ */
+ addr += length - elsize;
+
+ /* 8 byte memory accesses go in the top 8 bytes of the VR */
+ if (length == 8)
+ ptr += 8;
+ }
+#else
if (flags & SW)
sw = elsize-1;
+#endif
for (j = 0; j < length; j += elsize) {
for (i = 0; i < elsize; ++i) {
@@ -671,19 +693,31 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg,
ret |= __get_user(ptr[i^sw], addr + i);
}
ptr += elsize;
+#ifdef __LITTLE_ENDIAN__
+ addr -= elsize;
+#else
addr += elsize;
+#endif
}
+#ifdef __BIG_ENDIAN__
+#define VSX_HI 0
+#define VSX_LO 1
+#else
+#define VSX_HI 1
+#define VSX_LO 0
+#endif
+
if (!ret) {
if (flags & U)
regs->gpr[areg] = regs->dar;
/* Splat load copies the same data to top and bottom 8 bytes */
if (flags & SPLT)
- lptr[1] = lptr[0];
- /* For 8 byte loads, zero the top 8 bytes */
+ lptr[VSX_LO] = lptr[VSX_HI];
+ /* For 8 byte loads, zero the low 8 bytes */
else if (!(flags & ST) && (8 == length))
- lptr[1] = 0;
+ lptr[VSX_LO] = 0;
} else
return -EFAULT;
@@ -706,18 +740,28 @@ int fix_alignment(struct pt_regs *regs)
unsigned int dsisr;
unsigned char __user *addr;
unsigned long p, swiz;
- int ret, t;
- union {
+ int ret, i;
+ union data {
u64 ll;
double dd;
unsigned char v[8];
struct {
+#ifdef __LITTLE_ENDIAN__
+ int low32;
+ unsigned hi32;
+#else
unsigned hi32;
int low32;
+#endif
} x32;
struct {
+#ifdef __LITTLE_ENDIAN__
+ short low16;
+ unsigned char hi48[6];
+#else
unsigned char hi48[6];
short low16;
+#endif
} x16;
} data;
@@ -764,10 +808,21 @@ int fix_alignment(struct pt_regs *regs)
nb = aligninfo[instr].len;
flags = aligninfo[instr].flags;
+ /* ldbrx/stdbrx overlap lfs/stfs in the DSISR unfortunately */
+ if (IS_XFORM(instruction) && ((instruction >> 1) & 0x3ff) == 532) {
+ nb = 8;
+ flags = LD+SW;
+ } else if (IS_XFORM(instruction) &&
+ ((instruction >> 1) & 0x3ff) == 660) {
+ nb = 8;
+ flags = ST+SW;
+ }
+
/* Byteswap little endian loads and stores */
swiz = 0;
- if (regs->msr & MSR_LE) {
+ if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) {
flags ^= SW;
+#ifdef __BIG_ENDIAN__
/*
* So-called "PowerPC little endian" mode works by
* swizzling addresses rather than by actually doing
@@ -780,6 +835,7 @@ int fix_alignment(struct pt_regs *regs)
*/
if (cpu_has_feature(CPU_FTR_PPC_LE))
swiz = 7;
+#endif
}
/* DAR has the operand effective address */
@@ -804,7 +860,7 @@ int fix_alignment(struct pt_regs *regs)
elsize = 8;
flags = 0;
- if (regs->msr & MSR_LE)
+ if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE))
flags |= SW;
if (instruction & 0x100)
flags |= ST;
@@ -852,10 +908,20 @@ int fix_alignment(struct pt_regs *regs)
flush_fp_to_thread(current);
}
- /* Special case for 16-byte FP loads and stores */
- if (nb == 16) {
- PPC_WARN_ALIGNMENT(fp_pair, regs);
- return emulate_fp_pair(addr, reg, flags);
+ if ((nb == 16)) {
+ if (flags & F) {
+ /* Special case for 16-byte FP loads and stores */
+ PPC_WARN_ALIGNMENT(fp_pair, regs);
+ return emulate_fp_pair(addr, reg, flags);
+ } else {
+#ifdef CONFIG_PPC64
+ /* Special case for 16-byte loads and stores */
+ PPC_WARN_ALIGNMENT(lq_stq, regs);
+ return emulate_lq_stq(regs, addr, reg, flags);
+#else
+ return 0;
+#endif
+ }
}
PPC_WARN_ALIGNMENT(unaligned, regs);
@@ -864,32 +930,36 @@ int fix_alignment(struct pt_regs *regs)
* get it from register values
*/
if (!(flags & ST)) {
- data.ll = 0;
- ret = 0;
- p = (unsigned long) addr;
+ unsigned int start = 0;
+
switch (nb) {
- case 8:
- ret |= __get_user_inatomic(data.v[0], SWIZ_PTR(p++));
- ret |= __get_user_inatomic(data.v[1], SWIZ_PTR(p++));
- ret |= __get_user_inatomic(data.v[2], SWIZ_PTR(p++));
- ret |= __get_user_inatomic(data.v[3], SWIZ_PTR(p++));
case 4:
- ret |= __get_user_inatomic(data.v[4], SWIZ_PTR(p++));
- ret |= __get_user_inatomic(data.v[5], SWIZ_PTR(p++));
+ start = offsetof(union data, x32.low32);
+ break;
case 2:
- ret |= __get_user_inatomic(data.v[6], SWIZ_PTR(p++));
- ret |= __get_user_inatomic(data.v[7], SWIZ_PTR(p++));
- if (unlikely(ret))
- return -EFAULT;
+ start = offsetof(union data, x16.low16);
+ break;
}
+
+ data.ll = 0;
+ ret = 0;
+ p = (unsigned long)addr;
+
+ for (i = 0; i < nb; i++)
+ ret |= __get_user_inatomic(data.v[start + i],
+ SWIZ_PTR(p++));
+
+ if (unlikely(ret))
+ return -EFAULT;
+
} else if (flags & F) {
- data.dd = current->thread.TS_FPR(reg);
+ data.ll = current->thread.TS_FPR(reg);
if (flags & S) {
/* Single-precision FP store requires conversion... */
#ifdef CONFIG_PPC_FPU
preempt_disable();
enable_kernel_fp();
- cvt_df(&data.dd, (float *)&data.v[4]);
+ cvt_df(&data.dd, (float *)&data.x32.low32);
preempt_enable();
#else
return 0;
@@ -901,17 +971,13 @@ int fix_alignment(struct pt_regs *regs)
if (flags & SW) {
switch (nb) {
case 8:
- SWAP(data.v[0], data.v[7]);
- SWAP(data.v[1], data.v[6]);
- SWAP(data.v[2], data.v[5]);
- SWAP(data.v[3], data.v[4]);
+ data.ll = swab64(data.ll);
break;
case 4:
- SWAP(data.v[4], data.v[7]);
- SWAP(data.v[5], data.v[6]);
+ data.x32.low32 = swab32(data.x32.low32);
break;
case 2:
- SWAP(data.v[6], data.v[7]);
+ data.x16.low16 = swab16(data.x16.low16);
break;
}
}
@@ -933,7 +999,7 @@ int fix_alignment(struct pt_regs *regs)
#ifdef CONFIG_PPC_FPU
preempt_disable();
enable_kernel_fp();
- cvt_fd((float *)&data.v[4], &data.dd);
+ cvt_fd((float *)&data.x32.low32, &data.dd);
preempt_enable();
#else
return 0;
@@ -943,25 +1009,28 @@ int fix_alignment(struct pt_regs *regs)
/* Store result to memory or update registers */
if (flags & ST) {
- ret = 0;
- p = (unsigned long) addr;
+ unsigned int start = 0;
+
switch (nb) {
- case 8:
- ret |= __put_user_inatomic(data.v[0], SWIZ_PTR(p++));
- ret |= __put_user_inatomic(data.v[1], SWIZ_PTR(p++));
- ret |= __put_user_inatomic(data.v[2], SWIZ_PTR(p++));
- ret |= __put_user_inatomic(data.v[3], SWIZ_PTR(p++));
case 4:
- ret |= __put_user_inatomic(data.v[4], SWIZ_PTR(p++));
- ret |= __put_user_inatomic(data.v[5], SWIZ_PTR(p++));
+ start = offsetof(union data, x32.low32);
+ break;
case 2:
- ret |= __put_user_inatomic(data.v[6], SWIZ_PTR(p++));
- ret |= __put_user_inatomic(data.v[7], SWIZ_PTR(p++));
+ start = offsetof(union data, x16.low16);
+ break;
}
+
+ ret = 0;
+ p = (unsigned long)addr;
+
+ for (i = 0; i < nb; i++)
+ ret |= __put_user_inatomic(data.v[start + i],
+ SWIZ_PTR(p++));
+
if (unlikely(ret))
return -EFAULT;
} else if (flags & F)
- current->thread.TS_FPR(reg) = data.dd;
+ current->thread.TS_FPR(reg) = data.ll;
else
regs->gpr[reg] = data.ll;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index bd0df2e6aa8..f5995a91221 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -44,15 +44,17 @@
#include <asm/compat.h>
#include <asm/mmu.h>
#include <asm/hvcall.h>
+#include <asm/xics.h>
#endif
-#ifdef CONFIG_PPC_ISERIES
-#include <asm/iseries/alpaca.h>
+#ifdef CONFIG_PPC_POWERNV
+#include <asm/opal.h>
#endif
#if defined(CONFIG_KVM) || defined(CONFIG_KVM_GUEST)
#include <linux/kvm_host.h>
#endif
#if defined(CONFIG_KVM) && defined(CONFIG_PPC_BOOK3S)
#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
#endif
#ifdef CONFIG_PPC32
@@ -74,33 +76,38 @@ int main(void)
DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context));
DEFINE(SIGSEGV, SIGSEGV);
DEFINE(NMI_MASK, NMI_MASK);
+ DEFINE(THREAD_DSCR, offsetof(struct thread_struct, dscr));
+ DEFINE(THREAD_DSCR_INHERIT, offsetof(struct thread_struct, dscr_inherit));
+ DEFINE(TASKTHREADPPR, offsetof(struct task_struct, thread.ppr));
#else
DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
+ DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16));
+ DEFINE(KSP_LIMIT, offsetof(struct thread_struct, ksp_limit));
#endif /* CONFIG_PPC64 */
DEFINE(KSP, offsetof(struct thread_struct, ksp));
- DEFINE(KSP_LIMIT, offsetof(struct thread_struct, ksp_limit));
DEFINE(PT_REGS, offsetof(struct thread_struct, regs));
+#ifdef CONFIG_BOOKE
+ DEFINE(THREAD_NORMSAVES, offsetof(struct thread_struct, normsave[0]));
+#endif
DEFINE(THREAD_FPEXC_MODE, offsetof(struct thread_struct, fpexc_mode));
- DEFINE(THREAD_FPR0, offsetof(struct thread_struct, fpr[0]));
- DEFINE(THREAD_FPSCR, offsetof(struct thread_struct, fpscr));
+ DEFINE(THREAD_FPSTATE, offsetof(struct thread_struct, fp_state));
+ DEFINE(THREAD_FPSAVEAREA, offsetof(struct thread_struct, fp_save_area));
+ DEFINE(FPSTATE_FPSCR, offsetof(struct thread_fp_state, fpscr));
#ifdef CONFIG_ALTIVEC
- DEFINE(THREAD_VR0, offsetof(struct thread_struct, vr[0]));
+ DEFINE(THREAD_VRSTATE, offsetof(struct thread_struct, vr_state));
+ DEFINE(THREAD_VRSAVEAREA, offsetof(struct thread_struct, vr_save_area));
DEFINE(THREAD_VRSAVE, offsetof(struct thread_struct, vrsave));
- DEFINE(THREAD_VSCR, offsetof(struct thread_struct, vscr));
DEFINE(THREAD_USED_VR, offsetof(struct thread_struct, used_vr));
+ DEFINE(VRSTATE_VSCR, offsetof(struct thread_vr_state, vscr));
#endif /* CONFIG_ALTIVEC */
#ifdef CONFIG_VSX
- DEFINE(THREAD_VSR0, offsetof(struct thread_struct, fpr));
DEFINE(THREAD_USED_VSR, offsetof(struct thread_struct, used_vsr));
#endif /* CONFIG_VSX */
#ifdef CONFIG_PPC64
DEFINE(KSP_VSID, offsetof(struct thread_struct, ksp_vsid));
#else /* CONFIG_PPC64 */
DEFINE(PGDIR, offsetof(struct thread_struct, pgdir));
-#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
- DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0));
-#endif
#ifdef CONFIG_SPE
DEFINE(THREAD_EVR0, offsetof(struct thread_struct, evr[0]));
DEFINE(THREAD_ACC, offsetof(struct thread_struct, acc));
@@ -108,9 +115,46 @@ int main(void)
DEFINE(THREAD_USED_SPE, offsetof(struct thread_struct, used_spe));
#endif /* CONFIG_SPE */
#endif /* CONFIG_PPC64 */
+#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+ DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, debug.dbcr0));
+#endif
#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu));
#endif
+#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
+ DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu));
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ DEFINE(THREAD_TAR, offsetof(struct thread_struct, tar));
+ DEFINE(THREAD_BESCR, offsetof(struct thread_struct, bescr));
+ DEFINE(THREAD_EBBHR, offsetof(struct thread_struct, ebbhr));
+ DEFINE(THREAD_EBBRR, offsetof(struct thread_struct, ebbrr));
+ DEFINE(THREAD_SIAR, offsetof(struct thread_struct, siar));
+ DEFINE(THREAD_SDAR, offsetof(struct thread_struct, sdar));
+ DEFINE(THREAD_SIER, offsetof(struct thread_struct, sier));
+ DEFINE(THREAD_MMCR0, offsetof(struct thread_struct, mmcr0));
+ DEFINE(THREAD_MMCR2, offsetof(struct thread_struct, mmcr2));
+#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ DEFINE(PACATMSCRATCH, offsetof(struct paca_struct, tm_scratch));
+ DEFINE(THREAD_TM_TFHAR, offsetof(struct thread_struct, tm_tfhar));
+ DEFINE(THREAD_TM_TEXASR, offsetof(struct thread_struct, tm_texasr));
+ DEFINE(THREAD_TM_TFIAR, offsetof(struct thread_struct, tm_tfiar));
+ DEFINE(THREAD_TM_TAR, offsetof(struct thread_struct, tm_tar));
+ DEFINE(THREAD_TM_PPR, offsetof(struct thread_struct, tm_ppr));
+ DEFINE(THREAD_TM_DSCR, offsetof(struct thread_struct, tm_dscr));
+ DEFINE(PT_CKPT_REGS, offsetof(struct thread_struct, ckpt_regs));
+ DEFINE(THREAD_TRANSACT_VRSTATE, offsetof(struct thread_struct,
+ transact_vr));
+ DEFINE(THREAD_TRANSACT_VRSAVE, offsetof(struct thread_struct,
+ transact_vrsave));
+ DEFINE(THREAD_TRANSACT_FPSTATE, offsetof(struct thread_struct,
+ transact_fp));
+ /* Local pt_regs on stack for Transactional Memory funcs. */
+ DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD +
+ sizeof(struct pt_regs) + 16);
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
@@ -127,6 +171,7 @@ int main(void)
DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page));
/* paca */
DEFINE(PACA_SIZE, sizeof(struct paca_struct));
+ DEFINE(PACA_LOCK_TOKEN, offsetof(struct paca_struct, lock_token));
DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index));
DEFINE(PACAPROCSTART, offsetof(struct paca_struct, cpu_start));
DEFINE(PACAKSAVE, offsetof(struct paca_struct, kstack));
@@ -138,7 +183,7 @@ int main(void)
DEFINE(PACAKBASE, offsetof(struct paca_struct, kernelbase));
DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
- DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
+ DEFINE(PACAIRQHAPPENED, offsetof(struct paca_struct, irq_happened));
DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
#ifdef CONFIG_PPC_MM_SLICES
DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct,
@@ -159,6 +204,15 @@ int main(void)
DEFINE(PACA_MC_STACK, offsetof(struct paca_struct, mc_kstack));
DEFINE(PACA_CRIT_STACK, offsetof(struct paca_struct, crit_kstack));
DEFINE(PACA_DBG_STACK, offsetof(struct paca_struct, dbg_kstack));
+ DEFINE(PACA_TCD_PTR, offsetof(struct paca_struct, tcd_ptr));
+
+ DEFINE(TCD_ESEL_NEXT,
+ offsetof(struct tlb_core_data, esel_next));
+ DEFINE(TCD_ESEL_MAX,
+ offsetof(struct tlb_core_data, esel_max));
+ DEFINE(TCD_ESEL_FIRST,
+ offsetof(struct tlb_core_data, esel_first));
+ DEFINE(TCD_LOCK, offsetof(struct tlb_core_data, lock));
#endif /* CONFIG_PPC_BOOK3E */
#ifdef CONFIG_PPC_STD_MMU_64
@@ -182,26 +236,26 @@ int main(void)
DEFINE(SLBSHADOW_STACKESID,
offsetof(struct slb_shadow, save_area[SLB_NUM_BOLTED - 1].esid));
DEFINE(SLBSHADOW_SAVEAREA, offsetof(struct slb_shadow, save_area));
- DEFINE(LPPACASRR0, offsetof(struct lppaca, saved_srr0));
- DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1));
- DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int));
- DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
+ DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
+ DEFINE(LPPACA_YIELDCOUNT, offsetof(struct lppaca, yield_count));
DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
#endif /* CONFIG_PPC_STD_MMU_64 */
DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp));
+#ifdef CONFIG_PPC_BOOK3S_64
+ DEFINE(PACAMCEMERGSP, offsetof(struct paca_struct, mc_emergency_sp));
+ DEFINE(PACA_IN_MCE, offsetof(struct paca_struct, in_mce));
+#endif
DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id));
DEFINE(PACAKEXECSTATE, offsetof(struct paca_struct, kexec_state));
+ DEFINE(PACA_DSCR, offsetof(struct paca_struct, dscr_default));
DEFINE(PACA_STARTTIME, offsetof(struct paca_struct, starttime));
DEFINE(PACA_STARTTIME_USER, offsetof(struct paca_struct, starttime_user));
DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time));
DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time));
DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
- DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
- DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, slb));
- DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, slb_max));
-#endif
+ DEFINE(PACA_NAPSTATELOST, offsetof(struct paca_struct, nap_state_lost));
+ DEFINE(PACA_SPRG_VDSO, offsetof(struct paca_struct, sprg_vdso));
#endif /* CONFIG_PPC64 */
/* RTAS */
@@ -209,7 +263,6 @@ int main(void)
DEFINE(RTASENTRY, offsetof(struct rtas_t, entry));
/* Interrupt register frame */
- DEFINE(STACK_FRAME_OVERHEAD, STACK_FRAME_OVERHEAD);
DEFINE(INT_FRAME_SIZE, STACK_INT_FRAME_SIZE);
DEFINE(SWITCH_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs));
#ifdef CONFIG_PPC64
@@ -378,85 +431,237 @@ int main(void)
DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry));
#endif
-#ifdef CONFIG_PPC_ISERIES
- /* the assembler miscalculates the VSID values */
- DEFINE(PAGE_OFFSET_ESID, GET_ESID(PAGE_OFFSET));
- DEFINE(PAGE_OFFSET_VSID, KERNEL_VSID(PAGE_OFFSET));
- DEFINE(VMALLOC_START_ESID, GET_ESID(VMALLOC_START));
- DEFINE(VMALLOC_START_VSID, KERNEL_VSID(VMALLOC_START));
-
- /* alpaca */
- DEFINE(ALPACA_SIZE, sizeof(struct alpaca));
-#endif
-
DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE);
DEFINE(PTE_SIZE, sizeof(pte_t));
#ifdef CONFIG_KVM
DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
+ DEFINE(VCPU_GUEST_PID, offsetof(struct kvm_vcpu, arch.pid));
DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
- DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
- DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
- DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
- DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
+ DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
+ DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fp.fpr));
+#ifdef CONFIG_ALTIVEC
+ DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr.vr));
+#endif
+ DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
+ DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
+ DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
+#ifdef CONFIG_PPC_BOOK3S
+ DEFINE(VCPU_TAR, offsetof(struct kvm_vcpu, arch.tar));
+#endif
+ DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
+ DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.shregs.msr));
+ DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.shregs.srr0));
+ DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.shregs.srr1));
+ DEFINE(VCPU_SPRG0, offsetof(struct kvm_vcpu, arch.shregs.sprg0));
+ DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.shregs.sprg1));
+ DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
+ DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
+#endif
+ DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3));
+ DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
+ DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
+ DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6));
+ DEFINE(VCPU_SHARED_SPRG7, offsetof(struct kvm_vcpu_arch_shared, sprg7));
DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
+ DEFINE(VCPU_SHADOW_PID1, offsetof(struct kvm_vcpu, arch.shadow_pid1));
DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
+ DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
+ DEFINE(VCPU_SHAREDBE, offsetof(struct kvm_vcpu, arch.shared_big_endian));
+#endif
+
+ DEFINE(VCPU_SHARED_MAS0, offsetof(struct kvm_vcpu_arch_shared, mas0));
+ DEFINE(VCPU_SHARED_MAS1, offsetof(struct kvm_vcpu_arch_shared, mas1));
+ DEFINE(VCPU_SHARED_MAS2, offsetof(struct kvm_vcpu_arch_shared, mas2));
+ DEFINE(VCPU_SHARED_MAS7_3, offsetof(struct kvm_vcpu_arch_shared, mas7_3));
+ DEFINE(VCPU_SHARED_MAS4, offsetof(struct kvm_vcpu_arch_shared, mas4));
+ DEFINE(VCPU_SHARED_MAS6, offsetof(struct kvm_vcpu_arch_shared, mas6));
+
+ DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
+ DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
/* book3s */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
+ DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
+ DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
+ DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
+ DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
+ DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
+ DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
+ DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
+ DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
+ DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
+ DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
+ DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
+ DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
+#endif
#ifdef CONFIG_PPC_BOOK3S
- DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
- DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
- DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
- DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
- DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
- DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
- DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
+ DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
+ DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
+ DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
+ DEFINE(VCPU_IC, offsetof(struct kvm_vcpu, arch.ic));
+ DEFINE(VCPU_VTB, offsetof(struct kvm_vcpu, arch.vtb));
+ DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
+ DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
+ DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
+ DEFINE(VCPU_IAMR, offsetof(struct kvm_vcpu, arch.iamr));
+ DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
+ DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
+ DEFINE(VCPU_DABRX, offsetof(struct kvm_vcpu, arch.dabrx));
+ DEFINE(VCPU_DAWR, offsetof(struct kvm_vcpu, arch.dawr));
+ DEFINE(VCPU_DAWRX, offsetof(struct kvm_vcpu, arch.dawrx));
+ DEFINE(VCPU_CIABR, offsetof(struct kvm_vcpu, arch.ciabr));
DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
- DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
- offsetof(struct kvmppc_vcpu_book3s, vcpu));
- DEFINE(SVCPU_CR, offsetof(struct kvmppc_book3s_shadow_vcpu, cr));
- DEFINE(SVCPU_XER, offsetof(struct kvmppc_book3s_shadow_vcpu, xer));
- DEFINE(SVCPU_CTR, offsetof(struct kvmppc_book3s_shadow_vcpu, ctr));
- DEFINE(SVCPU_LR, offsetof(struct kvmppc_book3s_shadow_vcpu, lr));
- DEFINE(SVCPU_PC, offsetof(struct kvmppc_book3s_shadow_vcpu, pc));
- DEFINE(SVCPU_R0, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[0]));
- DEFINE(SVCPU_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[1]));
- DEFINE(SVCPU_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[2]));
- DEFINE(SVCPU_R3, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[3]));
- DEFINE(SVCPU_R4, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[4]));
- DEFINE(SVCPU_R5, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[5]));
- DEFINE(SVCPU_R6, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[6]));
- DEFINE(SVCPU_R7, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[7]));
- DEFINE(SVCPU_R8, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[8]));
- DEFINE(SVCPU_R9, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[9]));
- DEFINE(SVCPU_R10, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[10]));
- DEFINE(SVCPU_R11, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[11]));
- DEFINE(SVCPU_R12, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[12]));
- DEFINE(SVCPU_R13, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[13]));
- DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1));
- DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r2));
- DEFINE(SVCPU_VMHANDLER, offsetof(struct kvmppc_book3s_shadow_vcpu,
- vmhandler));
- DEFINE(SVCPU_SCRATCH0, offsetof(struct kvmppc_book3s_shadow_vcpu,
- scratch0));
- DEFINE(SVCPU_SCRATCH1, offsetof(struct kvmppc_book3s_shadow_vcpu,
- scratch1));
- DEFINE(SVCPU_IN_GUEST, offsetof(struct kvmppc_book3s_shadow_vcpu,
- in_guest));
- DEFINE(SVCPU_FAULT_DSISR, offsetof(struct kvmppc_book3s_shadow_vcpu,
- fault_dsisr));
- DEFINE(SVCPU_FAULT_DAR, offsetof(struct kvmppc_book3s_shadow_vcpu,
- fault_dar));
- DEFINE(SVCPU_LAST_INST, offsetof(struct kvmppc_book3s_shadow_vcpu,
- last_inst));
- DEFINE(SVCPU_SHADOW_SRR1, offsetof(struct kvmppc_book3s_shadow_vcpu,
- shadow_srr1));
-#ifdef CONFIG_PPC_BOOK3S_32
- DEFINE(SVCPU_SR, offsetof(struct kvmppc_book3s_shadow_vcpu, sr));
+ DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
+ DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
+ DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
+ DEFINE(VCPU_CEDED, offsetof(struct kvm_vcpu, arch.ceded));
+ DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded));
+ DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
+ DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
+ DEFINE(VCPU_SPMC, offsetof(struct kvm_vcpu, arch.spmc));
+ DEFINE(VCPU_SIAR, offsetof(struct kvm_vcpu, arch.siar));
+ DEFINE(VCPU_SDAR, offsetof(struct kvm_vcpu, arch.sdar));
+ DEFINE(VCPU_SIER, offsetof(struct kvm_vcpu, arch.sier));
+ DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
+ DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
+ DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
+ DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
+ DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+ DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));
+ DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
+ DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
+ DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar));
+ DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr));
+ DEFINE(VCPU_FSCR, offsetof(struct kvm_vcpu, arch.fscr));
+ DEFINE(VCPU_SHADOW_FSCR, offsetof(struct kvm_vcpu, arch.shadow_fscr));
+ DEFINE(VCPU_PSPB, offsetof(struct kvm_vcpu, arch.pspb));
+ DEFINE(VCPU_EBBHR, offsetof(struct kvm_vcpu, arch.ebbhr));
+ DEFINE(VCPU_EBBRR, offsetof(struct kvm_vcpu, arch.ebbrr));
+ DEFINE(VCPU_BESCR, offsetof(struct kvm_vcpu, arch.bescr));
+ DEFINE(VCPU_CSIGR, offsetof(struct kvm_vcpu, arch.csigr));
+ DEFINE(VCPU_TACR, offsetof(struct kvm_vcpu, arch.tacr));
+ DEFINE(VCPU_TCSCR, offsetof(struct kvm_vcpu, arch.tcscr));
+ DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop));
+ DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort));
+ DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1));
+ DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
+ DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
+ DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
+ DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
+ DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm));
+ DEFINE(VCORE_TB_OFFSET, offsetof(struct kvmppc_vcore, tb_offset));
+ DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr));
+ DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr));
+ DEFINE(VCORE_DPDES, offsetof(struct kvmppc_vcore, dpdes));
+ DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
+ DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
+ DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ DEFINE(VCPU_TFHAR, offsetof(struct kvm_vcpu, arch.tfhar));
+ DEFINE(VCPU_TFIAR, offsetof(struct kvm_vcpu, arch.tfiar));
+ DEFINE(VCPU_TEXASR, offsetof(struct kvm_vcpu, arch.texasr));
+ DEFINE(VCPU_GPR_TM, offsetof(struct kvm_vcpu, arch.gpr_tm));
+ DEFINE(VCPU_FPRS_TM, offsetof(struct kvm_vcpu, arch.fp_tm.fpr));
+ DEFINE(VCPU_VRS_TM, offsetof(struct kvm_vcpu, arch.vr_tm.vr));
+ DEFINE(VCPU_VRSAVE_TM, offsetof(struct kvm_vcpu, arch.vrsave_tm));
+ DEFINE(VCPU_CR_TM, offsetof(struct kvm_vcpu, arch.cr_tm));
+ DEFINE(VCPU_LR_TM, offsetof(struct kvm_vcpu, arch.lr_tm));
+ DEFINE(VCPU_CTR_TM, offsetof(struct kvm_vcpu, arch.ctr_tm));
+ DEFINE(VCPU_AMR_TM, offsetof(struct kvm_vcpu, arch.amr_tm));
+ DEFINE(VCPU_PPR_TM, offsetof(struct kvm_vcpu, arch.ppr_tm));
+ DEFINE(VCPU_DSCR_TM, offsetof(struct kvm_vcpu, arch.dscr_tm));
+ DEFINE(VCPU_TAR_TM, offsetof(struct kvm_vcpu, arch.tar_tm));
#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+ DEFINE(PACA_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
+# define SVCPU_FIELD(x, f) DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f))
#else
+# define SVCPU_FIELD(x, f)
+#endif
+# define HSTATE_FIELD(x, f) DEFINE(x, offsetof(struct paca_struct, kvm_hstate.f))
+#else /* 32-bit */
+# define SVCPU_FIELD(x, f) DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, f))
+# define HSTATE_FIELD(x, f) DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, hstate.f))
+#endif
+
+ SVCPU_FIELD(SVCPU_CR, cr);
+ SVCPU_FIELD(SVCPU_XER, xer);
+ SVCPU_FIELD(SVCPU_CTR, ctr);
+ SVCPU_FIELD(SVCPU_LR, lr);
+ SVCPU_FIELD(SVCPU_PC, pc);
+ SVCPU_FIELD(SVCPU_R0, gpr[0]);
+ SVCPU_FIELD(SVCPU_R1, gpr[1]);
+ SVCPU_FIELD(SVCPU_R2, gpr[2]);
+ SVCPU_FIELD(SVCPU_R3, gpr[3]);
+ SVCPU_FIELD(SVCPU_R4, gpr[4]);
+ SVCPU_FIELD(SVCPU_R5, gpr[5]);
+ SVCPU_FIELD(SVCPU_R6, gpr[6]);
+ SVCPU_FIELD(SVCPU_R7, gpr[7]);
+ SVCPU_FIELD(SVCPU_R8, gpr[8]);
+ SVCPU_FIELD(SVCPU_R9, gpr[9]);
+ SVCPU_FIELD(SVCPU_R10, gpr[10]);
+ SVCPU_FIELD(SVCPU_R11, gpr[11]);
+ SVCPU_FIELD(SVCPU_R12, gpr[12]);
+ SVCPU_FIELD(SVCPU_R13, gpr[13]);
+ SVCPU_FIELD(SVCPU_FAULT_DSISR, fault_dsisr);
+ SVCPU_FIELD(SVCPU_FAULT_DAR, fault_dar);
+ SVCPU_FIELD(SVCPU_LAST_INST, last_inst);
+ SVCPU_FIELD(SVCPU_SHADOW_SRR1, shadow_srr1);
+#ifdef CONFIG_PPC_BOOK3S_32
+ SVCPU_FIELD(SVCPU_SR, sr);
+#endif
+#ifdef CONFIG_PPC64
+ SVCPU_FIELD(SVCPU_SLB, slb);
+ SVCPU_FIELD(SVCPU_SLB_MAX, slb_max);
+ SVCPU_FIELD(SVCPU_SHADOW_FSCR, shadow_fscr);
+#endif
+
+ HSTATE_FIELD(HSTATE_HOST_R1, host_r1);
+ HSTATE_FIELD(HSTATE_HOST_R2, host_r2);
+ HSTATE_FIELD(HSTATE_HOST_MSR, host_msr);
+ HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler);
+ HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
+ HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
+ HSTATE_FIELD(HSTATE_SCRATCH2, scratch2);
+ HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
+ HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
+ HSTATE_FIELD(HSTATE_NAPPING, napping);
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ HSTATE_FIELD(HSTATE_HWTHREAD_REQ, hwthread_req);
+ HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state);
+ HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
+ HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
+ HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
+ HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
+ HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
+ HSTATE_FIELD(HSTATE_PTID, ptid);
+ HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
+ HSTATE_FIELD(HSTATE_PMC, host_pmc);
+ HSTATE_FIELD(HSTATE_PURR, host_purr);
+ HSTATE_FIELD(HSTATE_SPURR, host_spurr);
+ HSTATE_FIELD(HSTATE_DSCR, host_dscr);
+ HSTATE_FIELD(HSTATE_DABR, dabr);
+ HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+ DEFINE(IPI_PRIORITY, IPI_PRIORITY);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ HSTATE_FIELD(HSTATE_CFAR, cfar);
+ HSTATE_FIELD(HSTATE_PPR, ppr);
+ HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#else /* CONFIG_PPC_BOOK3S */
DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
@@ -465,8 +670,9 @@ int main(void)
DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
+ DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, arch.crit_save));
#endif /* CONFIG_PPC_BOOK3S */
-#endif
+#endif /* CONFIG_KVM */
#ifdef CONFIG_KVM_GUEST
DEFINE(KVM_MAGIC_SCRATCH1, offsetof(struct kvm_vcpu_arch_shared,
@@ -496,6 +702,19 @@ int main(void)
DEFINE(TLBCAM_MAS7, offsetof(struct tlbcam, MAS7));
#endif
+#if defined(CONFIG_KVM) && defined(CONFIG_SPE)
+ DEFINE(VCPU_EVR, offsetof(struct kvm_vcpu, arch.evr[0]));
+ DEFINE(VCPU_ACC, offsetof(struct kvm_vcpu, arch.acc));
+ DEFINE(VCPU_SPEFSCR, offsetof(struct kvm_vcpu, arch.spefscr));
+ DEFINE(VCPU_HOST_SPEFSCR, offsetof(struct kvm_vcpu, arch.host_spefscr));
+#endif
+
+#ifdef CONFIG_KVM_BOOKE_HV
+ DEFINE(VCPU_HOST_MAS4, offsetof(struct kvm_vcpu, arch.host_mas4));
+ DEFINE(VCPU_HOST_MAS6, offsetof(struct kvm_vcpu, arch.host_mas6));
+ DEFINE(VCPU_EPLC, offsetof(struct kvm_vcpu, arch.eplc));
+#endif
+
#ifdef CONFIG_KVM_EXIT_TIMING
DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu,
arch.timing_exit.tv32.tbu));
@@ -507,5 +726,12 @@ int main(void)
arch.timing_last_enter.tv32.tbl));
#endif
+#ifdef CONFIG_PPC_POWERNV
+ DEFINE(OPAL_MC_GPR3, offsetof(struct opal_machine_check_event, gpr3));
+ DEFINE(OPAL_MC_SRR0, offsetof(struct opal_machine_check_event, srr0));
+ DEFINE(OPAL_MC_SRR1, offsetof(struct opal_machine_check_event, srr1));
+ DEFINE(PACA_OPAL_MC_EVT, offsetof(struct paca_struct, opal_mc_evt));
+#endif
+
return 0;
}
diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c
index 625942ae558..41c011cb607 100644
--- a/arch/powerpc/kernel/btext.c
+++ b/arch/powerpc/kernel/btext.c
@@ -6,7 +6,7 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/init.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/memblock.h>
#include <asm/sections.h>
@@ -25,11 +25,6 @@
static void scrollscreen(void);
#endif
-static void draw_byte(unsigned char c, long locX, long locY);
-static void draw_byte_32(unsigned char *bits, unsigned int *base, int rb);
-static void draw_byte_16(unsigned char *bits, unsigned int *base, int rb);
-static void draw_byte_8(unsigned char *bits, unsigned int *base, int rb);
-
#define __force_data __attribute__((__section__(".data")))
static int g_loc_X __force_data;
@@ -52,6 +47,26 @@ static unsigned char vga_font[cmapsz];
int boot_text_mapped __force_data = 0;
int force_printk_to_btext = 0;
+extern void rmci_on(void);
+extern void rmci_off(void);
+
+static inline void rmci_maybe_on(void)
+{
+#if defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) && defined(CONFIG_PPC64)
+ if (!(mfmsr() & MSR_DR))
+ rmci_on();
+#endif
+}
+
+static inline void rmci_maybe_off(void)
+{
+#if defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) && defined(CONFIG_PPC64)
+ if (!(mfmsr() & MSR_DR))
+ rmci_off();
+#endif
+}
+
+
#ifdef CONFIG_PPC32
/* Calc BAT values for mapping the display and store them
* in disp_BAT. Those values are then used from head.S to map
@@ -99,7 +114,7 @@ void __init btext_prepare_BAT(void)
/* This function can be used to enable the early boot text when doing
* OF booting or within bootx init. It must be followed by a btext_unmap()
- * call before the logical address becomes unuseable
+ * call before the logical address becomes unusable
*/
void __init btext_setup_display(int width, int height, int depth, int pitch,
unsigned long address)
@@ -134,7 +149,7 @@ void __init btext_unmap(void)
* changes.
*/
-static void map_boot_text(void)
+void btext_map(void)
{
unsigned long base, offset, size;
unsigned char *vbase;
@@ -209,7 +224,7 @@ int btext_initialize(struct device_node *np)
dispDeviceRect[2] = width;
dispDeviceRect[3] = height;
- map_boot_text();
+ btext_map();
return 0;
}
@@ -283,7 +298,7 @@ void btext_update_display(unsigned long phys, int width, int height,
iounmap(logicalDisplayBase);
boot_text_mapped = 0;
}
- map_boot_text();
+ btext_map();
g_loc_X = 0;
g_loc_Y = 0;
g_max_loc_X = width / 8;
@@ -298,6 +313,7 @@ void btext_clearscreen(void)
(dispDeviceDepth >> 3)) >> 2;
int i,j;
+ rmci_maybe_on();
for (i=0; i<(dispDeviceRect[3] - dispDeviceRect[1]); i++)
{
unsigned int *ptr = base;
@@ -305,6 +321,7 @@ void btext_clearscreen(void)
*(ptr++) = 0;
base += (dispDeviceRowBytes >> 2);
}
+ rmci_maybe_off();
}
void btext_flushscreen(void)
@@ -355,6 +372,8 @@ static void scrollscreen(void)
(dispDeviceDepth >> 3)) >> 2;
int i,j;
+ rmci_maybe_on();
+
for (i=0; i<(dispDeviceRect[3] - dispDeviceRect[1] - 16); i++)
{
unsigned int *src_ptr = src;
@@ -371,9 +390,116 @@ static void scrollscreen(void)
*(dst_ptr++) = 0;
dst += (dispDeviceRowBytes >> 2);
}
+
+ rmci_maybe_off();
}
#endif /* ndef NO_SCROLL */
+static unsigned int expand_bits_8[16] = {
+ 0x00000000,
+ 0x000000ff,
+ 0x0000ff00,
+ 0x0000ffff,
+ 0x00ff0000,
+ 0x00ff00ff,
+ 0x00ffff00,
+ 0x00ffffff,
+ 0xff000000,
+ 0xff0000ff,
+ 0xff00ff00,
+ 0xff00ffff,
+ 0xffff0000,
+ 0xffff00ff,
+ 0xffffff00,
+ 0xffffffff
+};
+
+static unsigned int expand_bits_16[4] = {
+ 0x00000000,
+ 0x0000ffff,
+ 0xffff0000,
+ 0xffffffff
+};
+
+
+static void draw_byte_32(unsigned char *font, unsigned int *base, int rb)
+{
+ int l, bits;
+ int fg = 0xFFFFFFFFUL;
+ int bg = 0x00000000UL;
+
+ for (l = 0; l < 16; ++l)
+ {
+ bits = *font++;
+ base[0] = (-(bits >> 7) & fg) ^ bg;
+ base[1] = (-((bits >> 6) & 1) & fg) ^ bg;
+ base[2] = (-((bits >> 5) & 1) & fg) ^ bg;
+ base[3] = (-((bits >> 4) & 1) & fg) ^ bg;
+ base[4] = (-((bits >> 3) & 1) & fg) ^ bg;
+ base[5] = (-((bits >> 2) & 1) & fg) ^ bg;
+ base[6] = (-((bits >> 1) & 1) & fg) ^ bg;
+ base[7] = (-(bits & 1) & fg) ^ bg;
+ base = (unsigned int *) ((char *)base + rb);
+ }
+}
+
+static inline void draw_byte_16(unsigned char *font, unsigned int *base, int rb)
+{
+ int l, bits;
+ int fg = 0xFFFFFFFFUL;
+ int bg = 0x00000000UL;
+ unsigned int *eb = (int *)expand_bits_16;
+
+ for (l = 0; l < 16; ++l)
+ {
+ bits = *font++;
+ base[0] = (eb[bits >> 6] & fg) ^ bg;
+ base[1] = (eb[(bits >> 4) & 3] & fg) ^ bg;
+ base[2] = (eb[(bits >> 2) & 3] & fg) ^ bg;
+ base[3] = (eb[bits & 3] & fg) ^ bg;
+ base = (unsigned int *) ((char *)base + rb);
+ }
+}
+
+static inline void draw_byte_8(unsigned char *font, unsigned int *base, int rb)
+{
+ int l, bits;
+ int fg = 0x0F0F0F0FUL;
+ int bg = 0x00000000UL;
+ unsigned int *eb = (int *)expand_bits_8;
+
+ for (l = 0; l < 16; ++l)
+ {
+ bits = *font++;
+ base[0] = (eb[bits >> 4] & fg) ^ bg;
+ base[1] = (eb[bits & 0xf] & fg) ^ bg;
+ base = (unsigned int *) ((char *)base + rb);
+ }
+}
+
+static noinline void draw_byte(unsigned char c, long locX, long locY)
+{
+ unsigned char *base = calc_base(locX << 3, locY << 4);
+ unsigned char *font = &vga_font[((unsigned int)c) * 16];
+ int rb = dispDeviceRowBytes;
+
+ rmci_maybe_on();
+ switch(dispDeviceDepth) {
+ case 24:
+ case 32:
+ draw_byte_32(font, (unsigned int *)base, rb);
+ break;
+ case 15:
+ case 16:
+ draw_byte_16(font, (unsigned int *)base, rb);
+ break;
+ case 8:
+ draw_byte_8(font, (unsigned int *)base, rb);
+ break;
+ }
+ rmci_maybe_off();
+}
+
void btext_drawchar(char c)
{
int cline = 0;
@@ -465,107 +591,12 @@ void btext_drawhex(unsigned long v)
btext_drawchar(' ');
}
-static void draw_byte(unsigned char c, long locX, long locY)
-{
- unsigned char *base = calc_base(locX << 3, locY << 4);
- unsigned char *font = &vga_font[((unsigned int)c) * 16];
- int rb = dispDeviceRowBytes;
-
- switch(dispDeviceDepth) {
- case 24:
- case 32:
- draw_byte_32(font, (unsigned int *)base, rb);
- break;
- case 15:
- case 16:
- draw_byte_16(font, (unsigned int *)base, rb);
- break;
- case 8:
- draw_byte_8(font, (unsigned int *)base, rb);
- break;
- }
-}
-
-static unsigned int expand_bits_8[16] = {
- 0x00000000,
- 0x000000ff,
- 0x0000ff00,
- 0x0000ffff,
- 0x00ff0000,
- 0x00ff00ff,
- 0x00ffff00,
- 0x00ffffff,
- 0xff000000,
- 0xff0000ff,
- 0xff00ff00,
- 0xff00ffff,
- 0xffff0000,
- 0xffff00ff,
- 0xffffff00,
- 0xffffffff
-};
-
-static unsigned int expand_bits_16[4] = {
- 0x00000000,
- 0x0000ffff,
- 0xffff0000,
- 0xffffffff
-};
-
-
-static void draw_byte_32(unsigned char *font, unsigned int *base, int rb)
-{
- int l, bits;
- int fg = 0xFFFFFFFFUL;
- int bg = 0x00000000UL;
-
- for (l = 0; l < 16; ++l)
- {
- bits = *font++;
- base[0] = (-(bits >> 7) & fg) ^ bg;
- base[1] = (-((bits >> 6) & 1) & fg) ^ bg;
- base[2] = (-((bits >> 5) & 1) & fg) ^ bg;
- base[3] = (-((bits >> 4) & 1) & fg) ^ bg;
- base[4] = (-((bits >> 3) & 1) & fg) ^ bg;
- base[5] = (-((bits >> 2) & 1) & fg) ^ bg;
- base[6] = (-((bits >> 1) & 1) & fg) ^ bg;
- base[7] = (-(bits & 1) & fg) ^ bg;
- base = (unsigned int *) ((char *)base + rb);
- }
-}
-
-static void draw_byte_16(unsigned char *font, unsigned int *base, int rb)
-{
- int l, bits;
- int fg = 0xFFFFFFFFUL;
- int bg = 0x00000000UL;
- unsigned int *eb = (int *)expand_bits_16;
-
- for (l = 0; l < 16; ++l)
- {
- bits = *font++;
- base[0] = (eb[bits >> 6] & fg) ^ bg;
- base[1] = (eb[(bits >> 4) & 3] & fg) ^ bg;
- base[2] = (eb[(bits >> 2) & 3] & fg) ^ bg;
- base[3] = (eb[bits & 3] & fg) ^ bg;
- base = (unsigned int *) ((char *)base + rb);
- }
-}
-
-static void draw_byte_8(unsigned char *font, unsigned int *base, int rb)
+void __init udbg_init_btext(void)
{
- int l, bits;
- int fg = 0x0F0F0F0FUL;
- int bg = 0x00000000UL;
- unsigned int *eb = (int *)expand_bits_8;
-
- for (l = 0; l < 16; ++l)
- {
- bits = *font++;
- base[0] = (eb[bits >> 4] & fg) ^ bg;
- base[1] = (eb[bits & 0xf] & fg) ^ bg;
- base = (unsigned int *) ((char *)base + rb);
- }
+ /* If btext is enabled, we might have a BAT setup for early display,
+ * thus we do enable some very basic udbg output
+ */
+ udbg_putc = btext_drawchar;
}
static unsigned char vga_font[cmapsz] = {
@@ -913,10 +944,3 @@ static unsigned char vga_font[cmapsz] = {
0x00, 0x00, 0x00, 0x00,
};
-void __init udbg_init_btext(void)
-{
- /* If btext is enabled, we might have a BAT setup for early display,
- * thus we do enable some very basic udbg output
- */
- udbg_putc = btext_drawchar;
-}
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index a3c684b4c86..40198d50b4c 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -12,7 +12,6 @@
#include <linux/cpu.h>
#include <linux/cpumask.h>
-#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/list.h>
@@ -131,7 +130,8 @@ static const char *cache_type_string(const struct cache *cache)
return cache_type_info[cache->type].name;
}
-static void __cpuinit cache_init(struct cache *cache, int type, int level, struct device_node *ofnode)
+static void cache_init(struct cache *cache, int type, int level,
+ struct device_node *ofnode)
{
cache->type = type;
cache->level = level;
@@ -140,7 +140,7 @@ static void __cpuinit cache_init(struct cache *cache, int type, int level, struc
list_add(&cache->list, &cache_list);
}
-static struct cache *__cpuinit new_cache(int type, int level, struct device_node *ofnode)
+static struct cache *new_cache(int type, int level, struct device_node *ofnode)
{
struct cache *cache;
@@ -195,7 +195,7 @@ static void cache_cpu_set(struct cache *cache, int cpu)
static int cache_size(const struct cache *cache, unsigned int *ret)
{
const char *propname;
- const u32 *cache_size;
+ const __be32 *cache_size;
propname = cache_type_info[cache->type].size_prop;
@@ -203,7 +203,7 @@ static int cache_size(const struct cache *cache, unsigned int *ret)
if (!cache_size)
return -ENODEV;
- *ret = *cache_size;
+ *ret = of_read_number(cache_size, 1);
return 0;
}
@@ -221,7 +221,7 @@ static int cache_size_kb(const struct cache *cache, unsigned int *ret)
/* not cache_line_size() because that's a macro in include/linux/cache.h */
static int cache_get_line_size(const struct cache *cache, unsigned int *ret)
{
- const u32 *line_size;
+ const __be32 *line_size;
int i, lim;
lim = ARRAY_SIZE(cache_type_info[cache->type].line_size_props);
@@ -238,14 +238,14 @@ static int cache_get_line_size(const struct cache *cache, unsigned int *ret)
if (!line_size)
return -ENODEV;
- *ret = *line_size;
+ *ret = of_read_number(line_size, 1);
return 0;
}
static int cache_nr_sets(const struct cache *cache, unsigned int *ret)
{
const char *propname;
- const u32 *nr_sets;
+ const __be32 *nr_sets;
propname = cache_type_info[cache->type].nr_sets_prop;
@@ -253,7 +253,7 @@ static int cache_nr_sets(const struct cache *cache, unsigned int *ret)
if (!nr_sets)
return -ENODEV;
- *ret = *nr_sets;
+ *ret = of_read_number(nr_sets, 1);
return 0;
}
@@ -324,7 +324,8 @@ static bool cache_node_is_unified(const struct device_node *np)
return of_get_property(np, "cache-unified", NULL);
}
-static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode_unified(struct device_node *node,
+ int level)
{
struct cache *cache;
@@ -335,7 +336,8 @@ static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *
return cache;
}
-static struct cache *__cpuinit cache_do_one_devnode_split(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode_split(struct device_node *node,
+ int level)
{
struct cache *dcache, *icache;
@@ -357,7 +359,7 @@ err:
return NULL;
}
-static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode(struct device_node *node, int level)
{
struct cache *cache;
@@ -369,7 +371,8 @@ static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, in
return cache;
}
-static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *node, int level)
+static struct cache *cache_lookup_or_instantiate(struct device_node *node,
+ int level)
{
struct cache *cache;
@@ -385,7 +388,7 @@ static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *n
return cache;
}
-static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigger)
+static void link_cache_lists(struct cache *smaller, struct cache *bigger)
{
while (smaller->next_local) {
if (smaller->next_local == bigger)
@@ -396,13 +399,13 @@ static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigg
smaller->next_local = bigger;
}
-static void __cpuinit do_subsidiary_caches_debugcheck(struct cache *cache)
+static void do_subsidiary_caches_debugcheck(struct cache *cache)
{
WARN_ON_ONCE(cache->level != 1);
WARN_ON_ONCE(strcmp(cache->ofnode->type, "cpu"));
}
-static void __cpuinit do_subsidiary_caches(struct cache *cache)
+static void do_subsidiary_caches(struct cache *cache)
{
struct device_node *subcache_node;
int level = cache->level;
@@ -423,7 +426,7 @@ static void __cpuinit do_subsidiary_caches(struct cache *cache)
}
}
-static struct cache *__cpuinit cache_chain_instantiate(unsigned int cpu_id)
+static struct cache *cache_chain_instantiate(unsigned int cpu_id)
{
struct device_node *cpu_node;
struct cache *cpu_cache = NULL;
@@ -448,18 +451,18 @@ out:
return cpu_cache;
}
-static struct cache_dir *__cpuinit cacheinfo_create_cache_dir(unsigned int cpu_id)
+static struct cache_dir *cacheinfo_create_cache_dir(unsigned int cpu_id)
{
struct cache_dir *cache_dir;
- struct sys_device *sysdev;
+ struct device *dev;
struct kobject *kobj = NULL;
- sysdev = get_cpu_sysdev(cpu_id);
- WARN_ONCE(!sysdev, "no sysdev for CPU %i\n", cpu_id);
- if (!sysdev)
+ dev = get_cpu_device(cpu_id);
+ WARN_ONCE(!dev, "no dev for CPU %i\n", cpu_id);
+ if (!dev)
goto err;
- kobj = kobject_create_and_add("cache", &sysdev->kobj);
+ kobj = kobject_create_and_add("cache", &dev->kobj);
if (!kobj)
goto err;
@@ -653,7 +656,7 @@ static struct kobj_type cache_index_type = {
.default_attrs = cache_index_default_attrs,
};
-static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
+static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
{
const char *cache_name;
const char *cache_type;
@@ -696,7 +699,8 @@ static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *d
kfree(buf);
}
-static void __cpuinit cacheinfo_create_index_dir(struct cache *cache, int index, struct cache_dir *cache_dir)
+static void cacheinfo_create_index_dir(struct cache *cache, int index,
+ struct cache_dir *cache_dir)
{
struct cache_index_dir *index_dir;
int rc;
@@ -722,7 +726,8 @@ err:
kfree(index_dir);
}
-static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache *cache_list)
+static void cacheinfo_sysfs_populate(unsigned int cpu_id,
+ struct cache *cache_list)
{
struct cache_dir *cache_dir;
struct cache *cache;
@@ -740,7 +745,7 @@ static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache
}
}
-void __cpuinit cacheinfo_cpu_online(unsigned int cpu_id)
+void cacheinfo_cpu_online(unsigned int cpu_id)
{
struct cache *cache;
@@ -751,7 +756,10 @@ void __cpuinit cacheinfo_cpu_online(unsigned int cpu_id)
cacheinfo_sysfs_populate(cpu_id, cache);
}
-#ifdef CONFIG_HOTPLUG_CPU /* functions needed for cpu offline */
+/* functions needed to remove cache entry for cpu offline or suspend/resume */
+
+#if (defined(CONFIG_PPC_PSERIES) && defined(CONFIG_SUSPEND)) || \
+ defined(CONFIG_HOTPLUG_CPU)
static struct cache *cache_lookup_by_cpu(unsigned int cpu_id)
{
@@ -788,6 +796,9 @@ static void remove_cache_dir(struct cache_dir *cache_dir)
{
remove_index_dirs(cache_dir);
+ /* Remove cache dir from sysfs */
+ kobject_del(cache_dir->kobj);
+
kobject_put(cache_dir->kobj);
kfree(cache_dir);
@@ -835,4 +846,4 @@ void cacheinfo_cpu_offline(unsigned int cpu_id)
if (cache)
cache_cpu_clear(cache, cpu_id);
}
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* (CONFIG_PPC_PSERIES && CONFIG_SUSPEND) || CONFIG_HOTPLUG_CPU */
diff --git a/arch/powerpc/kernel/clock.c b/arch/powerpc/kernel/clock.c
deleted file mode 100644
index ce668f54575..00000000000
--- a/arch/powerpc/kernel/clock.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Dummy clk implementations for powerpc.
- * These need to be overridden in platform code.
- */
-
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <asm/clk_interface.h>
-
-struct clk_interface clk_functions;
-
-struct clk *clk_get(struct device *dev, const char *id)
-{
- if (clk_functions.clk_get)
- return clk_functions.clk_get(dev, id);
- return ERR_PTR(-ENOSYS);
-}
-EXPORT_SYMBOL(clk_get);
-
-void clk_put(struct clk *clk)
-{
- if (clk_functions.clk_put)
- clk_functions.clk_put(clk);
-}
-EXPORT_SYMBOL(clk_put);
-
-int clk_enable(struct clk *clk)
-{
- if (clk_functions.clk_enable)
- return clk_functions.clk_enable(clk);
- return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_enable);
-
-void clk_disable(struct clk *clk)
-{
- if (clk_functions.clk_disable)
- clk_functions.clk_disable(clk);
-}
-EXPORT_SYMBOL(clk_disable);
-
-unsigned long clk_get_rate(struct clk *clk)
-{
- if (clk_functions.clk_get_rate)
- return clk_functions.clk_get_rate(clk);
- return 0;
-}
-EXPORT_SYMBOL(clk_get_rate);
-
-long clk_round_rate(struct clk *clk, unsigned long rate)
-{
- if (clk_functions.clk_round_rate)
- return clk_functions.clk_round_rate(clk, rate);
- return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_round_rate);
-
-int clk_set_rate(struct clk *clk, unsigned long rate)
-{
- if (clk_functions.clk_set_rate)
- return clk_functions.clk_set_rate(clk, rate);
- return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_set_rate);
-
-struct clk *clk_get_parent(struct clk *clk)
-{
- if (clk_functions.clk_get_parent)
- return clk_functions.clk_get_parent(clk);
- return ERR_PTR(-ENOSYS);
-}
-EXPORT_SYMBOL(clk_get_parent);
-
-int clk_set_parent(struct clk *clk, struct clk *parent)
-{
- if (clk_functions.clk_set_parent)
- return clk_functions.clk_set_parent(clk, parent);
- return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_set_parent);
diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S
index 55cba4a8a95..f8cd9fba4d3 100644
--- a/arch/powerpc/kernel/cpu_setup_6xx.S
+++ b/arch/powerpc/kernel/cpu_setup_6xx.S
@@ -18,7 +18,7 @@
#include <asm/mmu.h>
_GLOBAL(__setup_cpu_603)
- mflr r4
+ mflr r5
BEGIN_MMU_FTR_SECTION
li r10,0
mtspr SPRN_SPRG_603_LRU,r10 /* init SW LRU tracking */
@@ -27,60 +27,60 @@ BEGIN_FTR_SECTION
bl __init_fpu_registers
END_FTR_SECTION_IFCLR(CPU_FTR_FPU_UNAVAILABLE)
bl setup_common_caches
- mtlr r4
+ mtlr r5
blr
_GLOBAL(__setup_cpu_604)
- mflr r4
+ mflr r5
bl setup_common_caches
bl setup_604_hid0
- mtlr r4
+ mtlr r5
blr
_GLOBAL(__setup_cpu_750)
- mflr r4
+ mflr r5
bl __init_fpu_registers
bl setup_common_caches
bl setup_750_7400_hid0
- mtlr r4
+ mtlr r5
blr
_GLOBAL(__setup_cpu_750cx)
- mflr r4
+ mflr r5
bl __init_fpu_registers
bl setup_common_caches
bl setup_750_7400_hid0
bl setup_750cx
- mtlr r4
+ mtlr r5
blr
_GLOBAL(__setup_cpu_750fx)
- mflr r4
+ mflr r5
bl __init_fpu_registers
bl setup_common_caches
bl setup_750_7400_hid0
bl setup_750fx
- mtlr r4
+ mtlr r5
blr
_GLOBAL(__setup_cpu_7400)
- mflr r4
+ mflr r5
bl __init_fpu_registers
bl setup_7400_workarounds
bl setup_common_caches
bl setup_750_7400_hid0
- mtlr r4
+ mtlr r5
blr
_GLOBAL(__setup_cpu_7410)
- mflr r4
+ mflr r5
bl __init_fpu_registers
bl setup_7410_workarounds
bl setup_common_caches
bl setup_750_7400_hid0
li r3,0
mtspr SPRN_L2CR2,r3
- mtlr r4
+ mtlr r5
blr
_GLOBAL(__setup_cpu_745x)
- mflr r4
+ mflr r5
bl setup_common_caches
bl setup_745x_specifics
- mtlr r4
+ mtlr r5
blr
/* Enable caches for 603's, 604, 750 & 7400 */
@@ -194,10 +194,10 @@ setup_750cx:
cror 4*cr0+eq,4*cr0+eq,4*cr1+eq
cror 4*cr0+eq,4*cr0+eq,4*cr2+eq
bnelr
- lwz r6,CPU_SPEC_FEATURES(r5)
+ lwz r6,CPU_SPEC_FEATURES(r4)
li r7,CPU_FTR_CAN_NAP
andc r6,r6,r7
- stw r6,CPU_SPEC_FEATURES(r5)
+ stw r6,CPU_SPEC_FEATURES(r4)
blr
/* 750fx specific
@@ -225,12 +225,12 @@ BEGIN_FTR_SECTION
andis. r11,r11,L3CR_L3E@h
beq 1f
END_FTR_SECTION_IFSET(CPU_FTR_L3CR)
- lwz r6,CPU_SPEC_FEATURES(r5)
+ lwz r6,CPU_SPEC_FEATURES(r4)
andi. r0,r6,CPU_FTR_L3_DISABLE_NAP
beq 1f
li r7,CPU_FTR_CAN_NAP
andc r6,r6,r7
- stw r6,CPU_SPEC_FEATURES(r5)
+ stw r6,CPU_SPEC_FEATURES(r4)
1:
mfspr r11,SPRN_HID0
diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
index 894e64fa481..4f1393d2007 100644
--- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S
+++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
@@ -16,6 +16,8 @@
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/ppc_asm.h>
+#include <asm/mmu-book3e.h>
+#include <asm/asm-offsets.h>
_GLOBAL(__e500_icache_setup)
mfspr r0, SPRN_L1CSR1
@@ -51,6 +53,61 @@ _GLOBAL(__e500_dcache_setup)
isync
blr
+/*
+ * FIXME - we haven't yet done testing to determine a reasonable default
+ * value for PW20_WAIT_IDLE_BIT.
+ */
+#define PW20_WAIT_IDLE_BIT 50 /* 1ms, TB frequency is 41.66MHZ */
+_GLOBAL(setup_pw20_idle)
+ mfspr r3, SPRN_PWRMGTCR0
+
+ /* Set PW20_WAIT bit, enable pw20 state*/
+ ori r3, r3, PWRMGTCR0_PW20_WAIT
+ li r11, PW20_WAIT_IDLE_BIT
+
+ /* Set Automatic PW20 Core Idle Count */
+ rlwimi r3, r11, PWRMGTCR0_PW20_ENT_SHIFT, PWRMGTCR0_PW20_ENT
+
+ mtspr SPRN_PWRMGTCR0, r3
+
+ blr
+
+/*
+ * FIXME - we haven't yet done testing to determine a reasonable default
+ * value for AV_WAIT_IDLE_BIT.
+ */
+#define AV_WAIT_IDLE_BIT 50 /* 1ms, TB frequency is 41.66MHZ */
+_GLOBAL(setup_altivec_idle)
+ mfspr r3, SPRN_PWRMGTCR0
+
+ /* Enable Altivec Idle */
+ oris r3, r3, PWRMGTCR0_AV_IDLE_PD_EN@h
+ li r11, AV_WAIT_IDLE_BIT
+
+ /* Set Automatic AltiVec Idle Count */
+ rlwimi r3, r11, PWRMGTCR0_AV_IDLE_CNT_SHIFT, PWRMGTCR0_AV_IDLE_CNT
+
+ mtspr SPRN_PWRMGTCR0, r3
+
+ blr
+
+_GLOBAL(__setup_cpu_e6500)
+ mflr r6
+#ifdef CONFIG_PPC64
+ bl setup_altivec_ivors
+ /* Touch IVOR42 only if the CPU supports E.HV category */
+ mfspr r10,SPRN_MMUCFG
+ rlwinm. r10,r10,0,MMUCFG_LPIDSIZE
+ beq 1f
+ bl setup_lrat_ivor
+1:
+#endif
+ bl setup_pw20_idle
+ bl setup_altivec_idle
+ bl __setup_cpu_e5500
+ mtlr r6
+ blr
+
#ifdef CONFIG_PPC32
_GLOBAL(__setup_cpu_e200)
/* enable dedicated debug exception handling resources (Debug APU) */
@@ -64,26 +121,105 @@ _GLOBAL(__setup_cpu_e500v2)
bl __e500_icache_setup
bl __e500_dcache_setup
bl __setup_e500_ivors
+#if defined(CONFIG_FSL_RIO) || defined(CONFIG_FSL_PCI)
+ /* Ensure that RFXE is set */
+ mfspr r3,SPRN_HID1
+ oris r3,r3,HID1_RFXE@h
+ mtspr SPRN_HID1,r3
+#endif
mtlr r4
blr
_GLOBAL(__setup_cpu_e500mc)
- mflr r4
+_GLOBAL(__setup_cpu_e5500)
+ mflr r5
bl __e500_icache_setup
bl __e500_dcache_setup
bl __setup_e500mc_ivors
- mtlr r4
+ /*
+ * We only want to touch IVOR38-41 if we're running on hardware
+ * that supports category E.HV. The architectural way to determine
+ * this is MMUCFG[LPIDSIZE].
+ */
+ mfspr r3, SPRN_MMUCFG
+ rlwinm. r3, r3, 0, MMUCFG_LPIDSIZE
+ beq 1f
+ bl __setup_ehv_ivors
+ b 2f
+1:
+ lwz r3, CPU_SPEC_FEATURES(r4)
+ /* We need this check as cpu_setup is also called for
+ * the secondary cores. So, if we have already cleared
+ * the feature on the primary core, avoid doing it on the
+ * secondary core.
+ */
+ andis. r6, r3, CPU_FTR_EMB_HV@h
+ beq 2f
+ rlwinm r3, r3, 0, ~CPU_FTR_EMB_HV
+ stw r3, CPU_SPEC_FEATURES(r4)
+2:
+ mtlr r5
blr
#endif
-/* Right now, restore and setup are the same thing */
+
+#ifdef CONFIG_PPC_BOOK3E_64
+_GLOBAL(__restore_cpu_e6500)
+ mflr r5
+ bl setup_altivec_ivors
+ /* Touch IVOR42 only if the CPU supports E.HV category */
+ mfspr r10,SPRN_MMUCFG
+ rlwinm. r10,r10,0,MMUCFG_LPIDSIZE
+ beq 1f
+ bl setup_lrat_ivor
+1:
+ bl setup_pw20_idle
+ bl setup_altivec_idle
+ bl __restore_cpu_e5500
+ mtlr r5
+ blr
+
_GLOBAL(__restore_cpu_e5500)
-_GLOBAL(__setup_cpu_e5500)
mflr r4
bl __e500_icache_setup
bl __e500_dcache_setup
-#ifdef CONFIG_PPC_BOOK3E_64
- bl .__setup_base_ivors
-#else
- bl __setup_e500mc_ivors
-#endif
+ bl __setup_base_ivors
+ bl setup_perfmon_ivor
+ bl setup_doorbell_ivors
+ /*
+ * We only want to touch IVOR38-41 if we're running on hardware
+ * that supports category E.HV. The architectural way to determine
+ * this is MMUCFG[LPIDSIZE].
+ */
+ mfspr r10,SPRN_MMUCFG
+ rlwinm. r10,r10,0,MMUCFG_LPIDSIZE
+ beq 1f
+ bl setup_ehv_ivors
+1:
mtlr r4
blr
+
+_GLOBAL(__setup_cpu_e5500)
+ mflr r5
+ bl __e500_icache_setup
+ bl __e500_dcache_setup
+ bl __setup_base_ivors
+ bl setup_perfmon_ivor
+ bl setup_doorbell_ivors
+ /*
+ * We only want to touch IVOR38-41 if we're running on hardware
+ * that supports category E.HV. The architectural way to determine
+ * this is MMUCFG[LPIDSIZE].
+ */
+ mfspr r10,SPRN_MMUCFG
+ rlwinm. r10,r10,0,MMUCFG_LPIDSIZE
+ beq 1f
+ bl setup_ehv_ivors
+ b 2f
+1:
+ ld r10,CPU_SPEC_FEATURES(r4)
+ LOAD_REG_IMMEDIATE(r9,CPU_FTR_EMB_HV)
+ andc r10,r10,r9
+ std r10,CPU_SPEC_FEATURES(r4)
+2:
+ mtlr r5
+ blr
+#endif
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
new file mode 100644
index 00000000000..46733535cc0
--- /dev/null
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -0,0 +1,182 @@
+/*
+ * This file contains low level CPU setup functions.
+ * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/cache.h>
+
+/* Entry: r3 = crap, r4 = ptr to cputable entry
+ *
+ * Note that we can be called twice for pseudo-PVRs
+ */
+_GLOBAL(__setup_cpu_power7)
+ mflr r11
+ bl __init_hvmode_206
+ mtlr r11
+ beqlr
+ li r0,0
+ mtspr SPRN_LPID,r0
+ mfspr r3,SPRN_LPCR
+ bl __init_LPCR
+ bl __init_tlb_power7
+ mtlr r11
+ blr
+
+_GLOBAL(__restore_cpu_power7)
+ mflr r11
+ mfmsr r3
+ rldicl. r0,r3,4,63
+ beqlr
+ li r0,0
+ mtspr SPRN_LPID,r0
+ mfspr r3,SPRN_LPCR
+ bl __init_LPCR
+ bl __init_tlb_power7
+ mtlr r11
+ blr
+
+_GLOBAL(__setup_cpu_power8)
+ mflr r11
+ bl __init_FSCR
+ bl __init_PMU
+ bl __init_hvmode_206
+ mtlr r11
+ beqlr
+ li r0,0
+ mtspr SPRN_LPID,r0
+ mfspr r3,SPRN_LPCR
+ ori r3, r3, LPCR_PECEDH
+ bl __init_LPCR
+ bl __init_HFSCR
+ bl __init_tlb_power8
+ bl __init_PMU_HV
+ mtlr r11
+ blr
+
+_GLOBAL(__restore_cpu_power8)
+ mflr r11
+ bl __init_FSCR
+ bl __init_PMU
+ mfmsr r3
+ rldicl. r0,r3,4,63
+ mtlr r11
+ beqlr
+ li r0,0
+ mtspr SPRN_LPID,r0
+ mfspr r3,SPRN_LPCR
+ ori r3, r3, LPCR_PECEDH
+ bl __init_LPCR
+ bl __init_HFSCR
+ bl __init_tlb_power8
+ bl __init_PMU_HV
+ mtlr r11
+ blr
+
+__init_hvmode_206:
+ /* Disable CPU_FTR_HVMODE and exit if MSR:HV is not set */
+ mfmsr r3
+ rldicl. r0,r3,4,63
+ bnelr
+ ld r5,CPU_SPEC_FEATURES(r4)
+ LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
+ xor r5,r5,r6
+ std r5,CPU_SPEC_FEATURES(r4)
+ blr
+
+__init_LPCR:
+ /* Setup a sane LPCR:
+ * Called with initial LPCR in R3
+ *
+ * LPES = 0b01 (HSRR0/1 used for 0x500)
+ * PECE = 0b111
+ * DPFD = 4
+ * HDICE = 0
+ * VC = 0b100 (VPM0=1, VPM1=0, ISL=0)
+ * VRMASD = 0b10000 (L=1, LP=00)
+ *
+ * Other bits untouched for now
+ */
+ li r5,1
+ rldimi r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
+ ori r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2)
+ li r5,4
+ rldimi r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3
+ clrrdi r3,r3,1 /* clear HDICE */
+ li r5,4
+ rldimi r3,r5, LPCR_VC_SH, 0
+ li r5,0x10
+ rldimi r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
+ mtspr SPRN_LPCR,r3
+ isync
+ blr
+
+__init_FSCR:
+ mfspr r3,SPRN_FSCR
+ ori r3,r3,FSCR_TAR|FSCR_DSCR|FSCR_EBB
+ mtspr SPRN_FSCR,r3
+ blr
+
+__init_HFSCR:
+ mfspr r3,SPRN_HFSCR
+ ori r3,r3,HFSCR_TAR|HFSCR_TM|HFSCR_BHRB|HFSCR_PM|\
+ HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB
+ mtspr SPRN_HFSCR,r3
+ blr
+
+/*
+ * Clear the TLB using the specified IS form of tlbiel instruction
+ * (invalidate by congruence class). P7 has 128 CCs., P8 has 512.
+ *
+ * r3 = IS field
+ */
+__init_tlb_power7:
+ li r3,0xc00 /* IS field = 0b11 */
+_GLOBAL(__flush_tlb_power7)
+ li r6,128
+ mtctr r6
+ mr r7,r3 /* IS field */
+ ptesync
+2: tlbiel r7
+ addi r7,r7,0x1000
+ bdnz 2b
+ ptesync
+1: blr
+
+__init_tlb_power8:
+ li r3,0xc00 /* IS field = 0b11 */
+_GLOBAL(__flush_tlb_power8)
+ li r6,512
+ mtctr r6
+ mr r7,r3 /* IS field */
+ ptesync
+2: tlbiel r7
+ addi r7,r7,0x1000
+ bdnz 2b
+ ptesync
+1: blr
+
+__init_PMU_HV:
+ li r5,0
+ mtspr SPRN_MMCRC,r5
+ mtspr SPRN_MMCRH,r5
+ blr
+
+__init_PMU:
+ li r5,0
+ mtspr SPRN_MMCRS,r5
+ mtspr SPRN_MMCRA,r5
+ mtspr SPRN_MMCR0,r5
+ mtspr SPRN_MMCR1,r5
+ mtspr SPRN_MMCR2,r5
+ blr
diff --git a/arch/powerpc/kernel/cpu_setup_ppc970.S b/arch/powerpc/kernel/cpu_setup_ppc970.S
index 27f2507279d..12fac8df01c 100644
--- a/arch/powerpc/kernel/cpu_setup_ppc970.S
+++ b/arch/powerpc/kernel/cpu_setup_ppc970.S
@@ -76,7 +76,7 @@ _GLOBAL(__setup_cpu_ppc970)
/* Do nothing if not running in HV mode */
mfmsr r0
rldicl. r0,r0,4,63
- beqlr
+ beq no_hv_mode
mfspr r0,SPRN_HID0
li r11,5 /* clear DOZE and SLEEP */
@@ -90,7 +90,7 @@ _GLOBAL(__setup_cpu_ppc970MP)
/* Do nothing if not running in HV mode */
mfmsr r0
rldicl. r0,r0,4,63
- beqlr
+ beq no_hv_mode
mfspr r0,SPRN_HID0
li r11,0x15 /* clear DOZE and SLEEP */
@@ -109,6 +109,14 @@ load_hids:
sync
isync
+ /* Try to set LPES = 01 in HID4 */
+ mfspr r0,SPRN_HID4
+ clrldi r0,r0,1 /* clear LPES0 */
+ ori r0,r0,HID4_LPES1 /* set LPES1 */
+ sync
+ mtspr SPRN_HID4,r0
+ isync
+
/* Save away cpu state */
LOAD_REG_ADDR(r5,cpu_state_storage)
@@ -117,11 +125,21 @@ load_hids:
std r3,CS_HID0(r5)
mfspr r3,SPRN_HID1
std r3,CS_HID1(r5)
- mfspr r3,SPRN_HID4
- std r3,CS_HID4(r5)
+ mfspr r4,SPRN_HID4
+ std r4,CS_HID4(r5)
mfspr r3,SPRN_HID5
std r3,CS_HID5(r5)
+ /* See if we successfully set LPES1 to 1; if not we are in Apple mode */
+ andi. r4,r4,HID4_LPES1
+ bnelr
+
+no_hv_mode:
+ /* Disable CPU_FTR_HVMODE and exit, since we don't have HV mode */
+ ld r5,CPU_SPEC_FEATURES(r4)
+ LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
+ andc r5,r5,r6
+ std r5,CPU_SPEC_FEATURES(r4)
blr
/* Called with no MMU context (typically MSR:IR/DR off) to
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 96a908f1cd8..0c157642c2a 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -14,12 +14,13 @@
#include <linux/sched.h>
#include <linux/threads.h>
#include <linux/init.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <asm/oprofile_impl.h>
#include <asm/cputable.h>
#include <asm/prom.h> /* for PTRRELOC on ARCH=ppc */
#include <asm/mmu.h>
+#include <asm/setup.h>
struct cpu_spec* cur_cpu_spec = NULL;
EXPORT_SYMBOL(cur_cpu_spec);
@@ -62,14 +63,24 @@ extern void __setup_cpu_745x(unsigned long offset, struct cpu_spec* spec);
extern void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* spec);
extern void __setup_cpu_ppc970MP(unsigned long offset, struct cpu_spec* spec);
extern void __setup_cpu_pa6t(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_a2(unsigned long offset, struct cpu_spec* spec);
extern void __restore_cpu_pa6t(void);
extern void __restore_cpu_ppc970(void);
extern void __setup_cpu_power7(unsigned long offset, struct cpu_spec* spec);
extern void __restore_cpu_power7(void);
+extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec);
+extern void __restore_cpu_power8(void);
+extern void __restore_cpu_a2(void);
+extern void __flush_tlb_power7(unsigned long inval_selector);
+extern void __flush_tlb_power8(unsigned long inval_selector);
+extern long __machine_check_early_realmode_p7(struct pt_regs *regs);
+extern long __machine_check_early_realmode_p8(struct pt_regs *regs);
#endif /* CONFIG_PPC64 */
#if defined(CONFIG_E500)
extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec);
+extern void __setup_cpu_e6500(unsigned long offset, struct cpu_spec* spec);
extern void __restore_cpu_e5500(void);
+extern void __restore_cpu_e6500(void);
#endif /* CONFIG_E500 */
/* This table only contains "desktop" CPUs, it need to be filled with embedded
@@ -91,6 +102,15 @@ extern void __restore_cpu_e5500(void);
PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \
PPC_FEATURE_TRUE_LE | \
PPC_FEATURE_PSERIES_PERFMON_COMPAT)
+#define COMMON_USER2_POWER7 (PPC_FEATURE2_DSCR)
+#define COMMON_USER_POWER8 (COMMON_USER_PPC64 | PPC_FEATURE_ARCH_2_06 |\
+ PPC_FEATURE_SMT | PPC_FEATURE_ICACHE_SNOOP | \
+ PPC_FEATURE_TRUE_LE | \
+ PPC_FEATURE_PSERIES_PERFMON_COMPAT)
+#define COMMON_USER2_POWER8 (PPC_FEATURE2_ARCH_2_07 | \
+ PPC_FEATURE2_HTM_COMP | PPC_FEATURE2_DSCR | \
+ PPC_FEATURE2_ISEL | PPC_FEATURE2_TAR | \
+ PPC_FEATURE2_VEC_CRYPTO)
#define COMMON_USER_PA6T (COMMON_USER_PPC64 | PPC_FEATURE_PA6T |\
PPC_FEATURE_TRUE_LE | \
PPC_FEATURE_HAS_ALTIVEC_COMP)
@@ -116,7 +136,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.pmc_type = PPC_PMC_IBM,
.oprofile_cpu_type = "ppc64/power3",
.oprofile_type = PPC_OPROFILE_RS64,
- .machine_check = machine_check_generic,
.platform = "power3",
},
{ /* Power3+ */
@@ -132,7 +151,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.pmc_type = PPC_PMC_IBM,
.oprofile_cpu_type = "ppc64/power3",
.oprofile_type = PPC_OPROFILE_RS64,
- .machine_check = machine_check_generic,
.platform = "power3",
},
{ /* Northstar */
@@ -148,7 +166,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.pmc_type = PPC_PMC_IBM,
.oprofile_cpu_type = "ppc64/rs64",
.oprofile_type = PPC_OPROFILE_RS64,
- .machine_check = machine_check_generic,
.platform = "rs64",
},
{ /* Pulsar */
@@ -164,7 +181,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.pmc_type = PPC_PMC_IBM,
.oprofile_cpu_type = "ppc64/rs64",
.oprofile_type = PPC_OPROFILE_RS64,
- .machine_check = machine_check_generic,
.platform = "rs64",
},
{ /* I-star */
@@ -180,7 +196,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.pmc_type = PPC_PMC_IBM,
.oprofile_cpu_type = "ppc64/rs64",
.oprofile_type = PPC_OPROFILE_RS64,
- .machine_check = machine_check_generic,
.platform = "rs64",
},
{ /* S-star */
@@ -196,7 +211,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.pmc_type = PPC_PMC_IBM,
.oprofile_cpu_type = "ppc64/rs64",
.oprofile_type = PPC_OPROFILE_RS64,
- .machine_check = machine_check_generic,
.platform = "rs64",
},
{ /* Power4 */
@@ -205,14 +219,13 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_name = "POWER4 (gp)",
.cpu_features = CPU_FTRS_POWER4,
.cpu_user_features = COMMON_USER_POWER4,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_POWER4,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 8,
.pmc_type = PPC_PMC_IBM,
.oprofile_cpu_type = "ppc64/power4",
.oprofile_type = PPC_OPROFILE_POWER4,
- .machine_check = machine_check_generic,
.platform = "power4",
},
{ /* Power4+ */
@@ -221,14 +234,13 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_name = "POWER4+ (gq)",
.cpu_features = CPU_FTRS_POWER4,
.cpu_user_features = COMMON_USER_POWER4,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_POWER4,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 8,
.pmc_type = PPC_PMC_IBM,
.oprofile_cpu_type = "ppc64/power4",
.oprofile_type = PPC_OPROFILE_POWER4,
- .machine_check = machine_check_generic,
.platform = "power4",
},
{ /* PPC970 */
@@ -238,7 +250,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_features = CPU_FTRS_PPC970,
.cpu_user_features = COMMON_USER_POWER4 |
PPC_FEATURE_HAS_ALTIVEC_COMP,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_PPC970,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 8,
@@ -247,7 +259,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_restore = __restore_cpu_ppc970,
.oprofile_cpu_type = "ppc64/970",
.oprofile_type = PPC_OPROFILE_POWER4,
- .machine_check = machine_check_generic,
.platform = "ppc970",
},
{ /* PPC970FX */
@@ -257,7 +268,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_features = CPU_FTRS_PPC970,
.cpu_user_features = COMMON_USER_POWER4 |
PPC_FEATURE_HAS_ALTIVEC_COMP,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_PPC970,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 8,
@@ -266,7 +277,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_restore = __restore_cpu_ppc970,
.oprofile_cpu_type = "ppc64/970",
.oprofile_type = PPC_OPROFILE_POWER4,
- .machine_check = machine_check_generic,
.platform = "ppc970",
},
{ /* PPC970MP DD1.0 - no DEEPNAP, use regular 970 init */
@@ -276,7 +286,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_features = CPU_FTRS_PPC970,
.cpu_user_features = COMMON_USER_POWER4 |
PPC_FEATURE_HAS_ALTIVEC_COMP,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_PPC970,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 8,
@@ -285,7 +295,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_restore = __restore_cpu_ppc970,
.oprofile_cpu_type = "ppc64/970MP",
.oprofile_type = PPC_OPROFILE_POWER4,
- .machine_check = machine_check_generic,
.platform = "ppc970",
},
{ /* PPC970MP */
@@ -295,7 +304,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_features = CPU_FTRS_PPC970,
.cpu_user_features = COMMON_USER_POWER4 |
PPC_FEATURE_HAS_ALTIVEC_COMP,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_PPC970,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 8,
@@ -304,7 +313,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_restore = __restore_cpu_ppc970,
.oprofile_cpu_type = "ppc64/970MP",
.oprofile_type = PPC_OPROFILE_POWER4,
- .machine_check = machine_check_generic,
.platform = "ppc970",
},
{ /* PPC970GX */
@@ -314,7 +322,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_features = CPU_FTRS_PPC970,
.cpu_user_features = COMMON_USER_POWER4 |
PPC_FEATURE_HAS_ALTIVEC_COMP,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_PPC970,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 8,
@@ -322,7 +330,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_setup = __setup_cpu_ppc970,
.oprofile_cpu_type = "ppc64/970",
.oprofile_type = PPC_OPROFILE_POWER4,
- .machine_check = machine_check_generic,
.platform = "ppc970",
},
{ /* Power5 GR */
@@ -331,7 +338,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_name = "POWER5 (gr)",
.cpu_features = CPU_FTRS_POWER5,
.cpu_user_features = COMMON_USER_POWER5,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_POWER5,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 6,
@@ -343,7 +350,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
*/
.oprofile_mmcra_sihv = MMCRA_SIHV,
.oprofile_mmcra_sipr = MMCRA_SIPR,
- .machine_check = machine_check_generic,
.platform = "power5",
},
{ /* Power5++ */
@@ -352,7 +358,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_name = "POWER5+ (gs)",
.cpu_features = CPU_FTRS_POWER5,
.cpu_user_features = COMMON_USER_POWER5_PLUS,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_POWER5,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 6,
@@ -360,7 +366,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.oprofile_type = PPC_OPROFILE_POWER4,
.oprofile_mmcra_sihv = MMCRA_SIHV,
.oprofile_mmcra_sipr = MMCRA_SIPR,
- .machine_check = machine_check_generic,
.platform = "power5+",
},
{ /* Power5 GS */
@@ -369,7 +374,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_name = "POWER5+ (gs)",
.cpu_features = CPU_FTRS_POWER5,
.cpu_user_features = COMMON_USER_POWER5_PLUS,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_POWER5,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 6,
@@ -378,7 +383,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.oprofile_type = PPC_OPROFILE_POWER4,
.oprofile_mmcra_sihv = MMCRA_SIHV,
.oprofile_mmcra_sipr = MMCRA_SIPR,
- .machine_check = machine_check_generic,
.platform = "power5+",
},
{ /* POWER6 in P5+ mode; 2.04-compliant processor */
@@ -387,10 +391,9 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_name = "POWER5+",
.cpu_features = CPU_FTRS_POWER5,
.cpu_user_features = COMMON_USER_POWER5_PLUS,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_POWER5,
.icache_bsize = 128,
.dcache_bsize = 128,
- .machine_check = machine_check_generic,
.oprofile_cpu_type = "ppc64/ibm-compat-v1",
.oprofile_type = PPC_OPROFILE_POWER4,
.platform = "power5+",
@@ -402,7 +405,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_features = CPU_FTRS_POWER6,
.cpu_user_features = COMMON_USER_POWER6 |
PPC_FEATURE_POWER6_EXT,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_POWER6,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 6,
@@ -413,7 +416,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.oprofile_mmcra_sipr = POWER6_MMCRA_SIPR,
.oprofile_mmcra_clear = POWER6_MMCRA_THRM |
POWER6_MMCRA_OTHER,
- .machine_check = machine_check_generic,
.platform = "power6x",
},
{ /* 2.05-compliant processor, i.e. Power6 "architected" mode */
@@ -422,10 +424,9 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_name = "POWER6 (architected)",
.cpu_features = CPU_FTRS_POWER6,
.cpu_user_features = COMMON_USER_POWER6,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_POWER6,
.icache_bsize = 128,
.dcache_bsize = 128,
- .machine_check = machine_check_generic,
.oprofile_cpu_type = "ppc64/ibm-compat-v1",
.oprofile_type = PPC_OPROFILE_POWER4,
.platform = "power6",
@@ -436,36 +437,135 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_name = "POWER7 (architected)",
.cpu_features = CPU_FTRS_POWER7,
.cpu_user_features = COMMON_USER_POWER7,
- .mmu_features = MMU_FTR_HPTE_TABLE |
- MMU_FTR_TLBIE_206,
+ .cpu_user_features2 = COMMON_USER2_POWER7,
+ .mmu_features = MMU_FTRS_POWER7,
.icache_bsize = 128,
.dcache_bsize = 128,
- .machine_check = machine_check_generic,
.oprofile_type = PPC_OPROFILE_POWER4,
.oprofile_cpu_type = "ppc64/ibm-compat-v1",
+ .cpu_setup = __setup_cpu_power7,
+ .cpu_restore = __restore_cpu_power7,
+ .flush_tlb = __flush_tlb_power7,
+ .machine_check_early = __machine_check_early_realmode_p7,
.platform = "power7",
},
+ { /* 2.07-compliant processor, i.e. Power8 "architected" mode */
+ .pvr_mask = 0xffffffff,
+ .pvr_value = 0x0f000004,
+ .cpu_name = "POWER8 (architected)",
+ .cpu_features = CPU_FTRS_POWER8,
+ .cpu_user_features = COMMON_USER_POWER8,
+ .cpu_user_features2 = COMMON_USER2_POWER8,
+ .mmu_features = MMU_FTRS_POWER8,
+ .icache_bsize = 128,
+ .dcache_bsize = 128,
+ .oprofile_type = PPC_OPROFILE_INVALID,
+ .oprofile_cpu_type = "ppc64/ibm-compat-v1",
+ .cpu_setup = __setup_cpu_power8,
+ .cpu_restore = __restore_cpu_power8,
+ .flush_tlb = __flush_tlb_power8,
+ .machine_check_early = __machine_check_early_realmode_p8,
+ .platform = "power8",
+ },
{ /* Power7 */
.pvr_mask = 0xffff0000,
.pvr_value = 0x003f0000,
.cpu_name = "POWER7 (raw)",
.cpu_features = CPU_FTRS_POWER7,
.cpu_user_features = COMMON_USER_POWER7,
- .mmu_features = MMU_FTR_HPTE_TABLE |
- MMU_FTR_TLBIE_206,
+ .cpu_user_features2 = COMMON_USER2_POWER7,
+ .mmu_features = MMU_FTRS_POWER7,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 6,
.pmc_type = PPC_PMC_IBM,
+ .oprofile_cpu_type = "ppc64/power7",
+ .oprofile_type = PPC_OPROFILE_POWER4,
.cpu_setup = __setup_cpu_power7,
.cpu_restore = __restore_cpu_power7,
+ .flush_tlb = __flush_tlb_power7,
+ .machine_check_early = __machine_check_early_realmode_p7,
+ .platform = "power7",
+ },
+ { /* Power7+ */
+ .pvr_mask = 0xffff0000,
+ .pvr_value = 0x004A0000,
+ .cpu_name = "POWER7+ (raw)",
+ .cpu_features = CPU_FTRS_POWER7,
+ .cpu_user_features = COMMON_USER_POWER7,
+ .cpu_user_features2 = COMMON_USER2_POWER7,
+ .mmu_features = MMU_FTRS_POWER7,
+ .icache_bsize = 128,
+ .dcache_bsize = 128,
+ .num_pmcs = 6,
+ .pmc_type = PPC_PMC_IBM,
.oprofile_cpu_type = "ppc64/power7",
.oprofile_type = PPC_OPROFILE_POWER4,
- .oprofile_mmcra_sihv = POWER6_MMCRA_SIHV,
- .oprofile_mmcra_sipr = POWER6_MMCRA_SIPR,
- .oprofile_mmcra_clear = POWER6_MMCRA_THRM |
- POWER6_MMCRA_OTHER,
- .platform = "power7",
+ .cpu_setup = __setup_cpu_power7,
+ .cpu_restore = __restore_cpu_power7,
+ .flush_tlb = __flush_tlb_power7,
+ .machine_check_early = __machine_check_early_realmode_p7,
+ .platform = "power7+",
+ },
+ { /* Power8E */
+ .pvr_mask = 0xffff0000,
+ .pvr_value = 0x004b0000,
+ .cpu_name = "POWER8E (raw)",
+ .cpu_features = CPU_FTRS_POWER8E,
+ .cpu_user_features = COMMON_USER_POWER8,
+ .cpu_user_features2 = COMMON_USER2_POWER8,
+ .mmu_features = MMU_FTRS_POWER8,
+ .icache_bsize = 128,
+ .dcache_bsize = 128,
+ .num_pmcs = 6,
+ .pmc_type = PPC_PMC_IBM,
+ .oprofile_cpu_type = "ppc64/power8",
+ .oprofile_type = PPC_OPROFILE_INVALID,
+ .cpu_setup = __setup_cpu_power8,
+ .cpu_restore = __restore_cpu_power8,
+ .flush_tlb = __flush_tlb_power8,
+ .machine_check_early = __machine_check_early_realmode_p8,
+ .platform = "power8",
+ },
+ { /* Power8 DD1: Does not support doorbell IPIs */
+ .pvr_mask = 0xffffff00,
+ .pvr_value = 0x004d0100,
+ .cpu_name = "POWER8 (raw)",
+ .cpu_features = CPU_FTRS_POWER8_DD1,
+ .cpu_user_features = COMMON_USER_POWER8,
+ .cpu_user_features2 = COMMON_USER2_POWER8,
+ .mmu_features = MMU_FTRS_POWER8,
+ .icache_bsize = 128,
+ .dcache_bsize = 128,
+ .num_pmcs = 6,
+ .pmc_type = PPC_PMC_IBM,
+ .oprofile_cpu_type = "ppc64/power8",
+ .oprofile_type = PPC_OPROFILE_INVALID,
+ .cpu_setup = __setup_cpu_power8,
+ .cpu_restore = __restore_cpu_power8,
+ .flush_tlb = __flush_tlb_power8,
+ .machine_check_early = __machine_check_early_realmode_p8,
+ .platform = "power8",
+ },
+ { /* Power8 */
+ .pvr_mask = 0xffff0000,
+ .pvr_value = 0x004d0000,
+ .cpu_name = "POWER8 (raw)",
+ .cpu_features = CPU_FTRS_POWER8,
+ .cpu_user_features = COMMON_USER_POWER8,
+ .cpu_user_features2 = COMMON_USER2_POWER8,
+ .mmu_features = MMU_FTRS_POWER8,
+ .icache_bsize = 128,
+ .dcache_bsize = 128,
+ .num_pmcs = 6,
+ .pmc_type = PPC_PMC_IBM,
+ .oprofile_cpu_type = "ppc64/power8",
+ .oprofile_type = PPC_OPROFILE_INVALID,
+ .cpu_setup = __setup_cpu_power8,
+ .cpu_restore = __restore_cpu_power8,
+ .flush_tlb = __flush_tlb_power8,
+ .machine_check_early = __machine_check_early_realmode_p8,
+ .platform = "power8",
},
{ /* Cell Broadband Engine */
.pvr_mask = 0xffff0000,
@@ -475,14 +575,13 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_user_features = COMMON_USER_PPC64 |
PPC_FEATURE_CELL | PPC_FEATURE_HAS_ALTIVEC_COMP |
PPC_FEATURE_SMT,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_CELL,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 4,
.pmc_type = PPC_PMC_IBM,
.oprofile_cpu_type = "ppc64/cell-be",
.oprofile_type = PPC_OPROFILE_CELL,
- .machine_check = machine_check_generic,
.platform = "ppc-cell-be",
},
{ /* PA Semi PA6T */
@@ -491,7 +590,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_name = "PA6T",
.cpu_features = CPU_FTRS_PA6T,
.cpu_user_features = COMMON_USER_PA6T,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_PA6T,
.icache_bsize = 64,
.dcache_bsize = 64,
.num_pmcs = 6,
@@ -500,7 +599,6 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_restore = __restore_cpu_pa6t,
.oprofile_cpu_type = "ppc64/pa6t",
.oprofile_type = PPC_OPROFILE_PA6T,
- .machine_check = machine_check_generic,
.platform = "pa6t",
},
{ /* default match */
@@ -509,12 +607,11 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_name = "POWER4 (compatible)",
.cpu_features = CPU_FTRS_COMPATIBLE,
.cpu_user_features = COMMON_USER_PPC64,
- .mmu_features = MMU_FTR_HPTE_TABLE,
+ .mmu_features = MMU_FTRS_DEFAULT_HPTE_ARCH_V2,
.icache_bsize = 128,
.dcache_bsize = 128,
.num_pmcs = 6,
.pmc_type = PPC_PMC_IBM,
- .machine_check = machine_check_generic,
.platform = "power4",
}
#endif /* CONFIG_PPC_BOOK3S_64 */
@@ -1513,6 +1610,19 @@ static struct cpu_spec __initdata cpu_specs[] = {
.machine_check = machine_check_4xx,
.platform = "ppc405",
},
+ { /* APM8018X */
+ .pvr_mask = 0xffff0000,
+ .pvr_value = 0x7ff11432,
+ .cpu_name = "APM8018X",
+ .cpu_features = CPU_FTRS_40X,
+ .cpu_user_features = PPC_FEATURE_32 |
+ PPC_FEATURE_HAS_MMU | PPC_FEATURE_HAS_4xxMAC,
+ .mmu_features = MMU_FTR_TYPE_40x,
+ .icache_bsize = 32,
+ .dcache_bsize = 32,
+ .machine_check = machine_check_4xx,
+ .platform = "ppc405",
+ },
{ /* default match */
.pvr_mask = 0x00000000,
.pvr_value = 0x00000000,
@@ -1811,7 +1921,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.platform = "ppc440",
},
{ /* 464 in APM821xx */
- .pvr_mask = 0xffffff00,
+ .pvr_mask = 0xfffffff0,
.pvr_value = 0x12C41C80,
.cpu_name = "APM821XX",
.cpu_features = CPU_FTRS_44X,
@@ -1824,11 +1934,25 @@ static struct cpu_spec __initdata cpu_specs[] = {
.machine_check = machine_check_440A,
.platform = "ppc440",
},
- { /* 476 core */
- .pvr_mask = 0xffff0000,
- .pvr_value = 0x11a50000,
+ { /* 476 DD2 core */
+ .pvr_mask = 0xffffffff,
+ .pvr_value = 0x11a52080,
.cpu_name = "476",
- .cpu_features = CPU_FTRS_47X,
+ .cpu_features = CPU_FTRS_47X | CPU_FTR_476_DD2,
+ .cpu_user_features = COMMON_USER_BOOKE |
+ PPC_FEATURE_HAS_FPU,
+ .mmu_features = MMU_FTR_TYPE_47x |
+ MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_LOCK_BCAST_INVAL,
+ .icache_bsize = 32,
+ .dcache_bsize = 128,
+ .machine_check = machine_check_47x,
+ .platform = "ppc470",
+ },
+ { /* 476fpe */
+ .pvr_mask = 0xffff0000,
+ .pvr_value = 0x7ff50000,
+ .cpu_name = "476fpe",
+ .cpu_features = CPU_FTRS_47X | CPU_FTR_476_DD2,
.cpu_user_features = COMMON_USER_BOOKE |
PPC_FEATURE_HAS_FPU,
.mmu_features = MMU_FTR_TYPE_47x |
@@ -1852,6 +1976,20 @@ static struct cpu_spec __initdata cpu_specs[] = {
.machine_check = machine_check_47x,
.platform = "ppc470",
},
+ { /* 476 others */
+ .pvr_mask = 0xffff0000,
+ .pvr_value = 0x11a50000,
+ .cpu_name = "476",
+ .cpu_features = CPU_FTRS_47X,
+ .cpu_user_features = COMMON_USER_BOOKE |
+ PPC_FEATURE_HAS_FPU,
+ .mmu_features = MMU_FTR_TYPE_47x |
+ MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_LOCK_BCAST_INVAL,
+ .icache_bsize = 32,
+ .dcache_bsize = 128,
+ .machine_check = machine_check_47x,
+ .platform = "ppc470",
+ },
{ /* default match */
.pvr_mask = 0x00000000,
.pvr_value = 0x00000000,
@@ -1921,6 +2059,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_user_features = COMMON_USER_BOOKE |
PPC_FEATURE_HAS_SPE_COMP |
PPC_FEATURE_HAS_EFP_SINGLE_COMP,
+ .cpu_user_features2 = PPC_FEATURE2_ISEL,
.mmu_features = MMU_FTR_TYPE_FSL_E,
.icache_bsize = 32,
.dcache_bsize = 32,
@@ -1940,6 +2079,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
PPC_FEATURE_HAS_SPE_COMP |
PPC_FEATURE_HAS_EFP_SINGLE_COMP |
PPC_FEATURE_HAS_EFP_DOUBLE_COMP,
+ .cpu_user_features2 = PPC_FEATURE2_ISEL,
.mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS,
.icache_bsize = 32,
.dcache_bsize = 32,
@@ -1956,6 +2096,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.cpu_name = "e500mc",
.cpu_features = CPU_FTRS_E500MC,
.cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+ .cpu_user_features2 = PPC_FEATURE2_ISEL,
.mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS |
MMU_FTR_USE_TLBILX,
.icache_bsize = 64,
@@ -1972,8 +2113,9 @@ static struct cpu_spec __initdata cpu_specs[] = {
.pvr_mask = 0xffff0000,
.pvr_value = 0x80240000,
.cpu_name = "e5500",
- .cpu_features = CPU_FTRS_E500MC,
- .cpu_user_features = COMMON_USER_BOOKE,
+ .cpu_features = CPU_FTRS_E5500,
+ .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU,
+ .cpu_user_features2 = PPC_FEATURE2_ISEL,
.mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS |
MMU_FTR_USE_TLBILX,
.icache_bsize = 64,
@@ -1982,10 +2124,34 @@ static struct cpu_spec __initdata cpu_specs[] = {
.oprofile_cpu_type = "ppc/e500mc",
.oprofile_type = PPC_OPROFILE_FSL_EMB,
.cpu_setup = __setup_cpu_e5500,
+#ifndef CONFIG_PPC32
.cpu_restore = __restore_cpu_e5500,
+#endif
.machine_check = machine_check_e500mc,
.platform = "ppce5500",
},
+ { /* e6500 */
+ .pvr_mask = 0xffff0000,
+ .pvr_value = 0x80400000,
+ .cpu_name = "e6500",
+ .cpu_features = CPU_FTRS_E6500,
+ .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU |
+ PPC_FEATURE_HAS_ALTIVEC_COMP,
+ .cpu_user_features2 = PPC_FEATURE2_ISEL,
+ .mmu_features = MMU_FTR_TYPE_FSL_E | MMU_FTR_BIG_PHYS |
+ MMU_FTR_USE_TLBILX,
+ .icache_bsize = 64,
+ .dcache_bsize = 64,
+ .num_pmcs = 6,
+ .oprofile_cpu_type = "ppc/e6500",
+ .oprofile_type = PPC_OPROFILE_FSL_EMB,
+ .cpu_setup = __setup_cpu_e6500,
+#ifndef CONFIG_PPC32
+ .cpu_restore = __restore_cpu_e6500,
+#endif
+ .machine_check = machine_check_e500mc,
+ .platform = "ppce6500",
+ },
#ifdef CONFIG_PPC32
{ /* default match */
.pvr_mask = 0x00000000,
@@ -2003,34 +2169,12 @@ static struct cpu_spec __initdata cpu_specs[] = {
}
#endif /* CONFIG_PPC32 */
#endif /* CONFIG_E500 */
-
-#ifdef CONFIG_PPC_BOOK3E_64
- { /* This is a default entry to get going, to be replaced by
- * a real one at some stage
- */
-#define CPU_FTRS_BASE_BOOK3E (CPU_FTR_USE_TB | \
- CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_SMT | \
- CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
- .pvr_mask = 0x00000000,
- .pvr_value = 0x00000000,
- .cpu_name = "Book3E",
- .cpu_features = CPU_FTRS_BASE_BOOK3E,
- .cpu_user_features = COMMON_USER_PPC64,
- .mmu_features = MMU_FTR_TYPE_3E | MMU_FTR_USE_TLBILX |
- MMU_FTR_USE_TLBIVAX_BCAST |
- MMU_FTR_LOCK_BCAST_INVAL,
- .icache_bsize = 64,
- .dcache_bsize = 64,
- .num_pmcs = 0,
- .machine_check = machine_check_generic,
- .platform = "power6",
- },
-#endif
};
static struct cpu_spec the_cpu_spec;
-static void __init setup_cpu_spec(unsigned long offset, struct cpu_spec *s)
+static struct cpu_spec * __init setup_cpu_spec(unsigned long offset,
+ struct cpu_spec *s)
{
struct cpu_spec *t = &the_cpu_spec;
struct cpu_spec old;
@@ -2089,10 +2233,12 @@ static void __init setup_cpu_spec(unsigned long offset, struct cpu_spec *s)
* pointer on ppc64 and booke as we are running at 0 in real mode
* on ppc64 and reloc_offset is always 0 on booke.
*/
- if (s->cpu_setup) {
- s->cpu_setup(offset, s);
+ if (t->cpu_setup) {
+ t->cpu_setup(offset, t);
}
#endif /* CONFIG_PPC64 || CONFIG_BOOKE */
+
+ return t;
}
struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
@@ -2103,10 +2249,8 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
s = PTRRELOC(s);
for (i = 0; i < ARRAY_SIZE(cpu_specs); i++,s++) {
- if ((pvr & s->pvr_mask) == s->pvr_value) {
- setup_cpu_spec(offset, s);
- return s;
- }
+ if ((pvr & s->pvr_mask) == s->pvr_value)
+ return setup_cpu_spec(offset, s);
}
BUG();
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index 832c8c4db25..51dbace3269 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -10,84 +10,84 @@
*
*/
-#undef DEBUG
-
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
-#include <linux/bootmem.h>
+#include <linux/export.h>
#include <linux/crash_dump.h>
#include <linux/delay.h>
-#include <linux/elf.h>
-#include <linux/elfcore.h>
-#include <linux/init.h>
#include <linux/irq.h>
#include <linux/types.h>
-#include <linux/memblock.h>
#include <asm/processor.h>
#include <asm/machdep.h>
#include <asm/kexec.h>
#include <asm/kdump.h>
#include <asm/prom.h>
-#include <asm/firmware.h>
#include <asm/smp.h>
-#include <asm/system.h>
#include <asm/setjmp.h>
+#include <asm/debug.h>
-#ifdef DEBUG
-#include <asm/udbg.h>
-#define DBG(fmt...) udbg_printf(fmt)
-#else
-#define DBG(fmt...)
-#endif
+/*
+ * The primary CPU waits a while for all secondary CPUs to enter. This is to
+ * avoid sending an IPI if the secondary CPUs are entering
+ * crash_kexec_secondary on their own (eg via a system reset).
+ *
+ * The secondary timeout has to be longer than the primary. Both timeouts are
+ * in milliseconds.
+ */
+#define PRIMARY_TIMEOUT 500
+#define SECONDARY_TIMEOUT 1000
+
+#define IPI_TIMEOUT 10000
+#define REAL_MODE_TIMEOUT 10000
-/* This keeps a track of which one is crashing cpu. */
+/* This keeps a track of which one is the crashing cpu. */
int crashing_cpu = -1;
-static cpumask_t cpus_in_crash = CPU_MASK_NONE;
-cpumask_t cpus_in_sr = CPU_MASK_NONE;
+static int time_to_dump;
-#define CRASH_HANDLER_MAX 2
+#define CRASH_HANDLER_MAX 3
/* NULL terminated list of shutdown handles */
static crash_shutdown_t crash_shutdown_handles[CRASH_HANDLER_MAX+1];
static DEFINE_SPINLOCK(crash_handlers_lock);
+static unsigned long crash_shutdown_buf[JMP_BUF_LEN];
+static int crash_shutdown_cpu = -1;
+
+static int handle_fault(struct pt_regs *regs)
+{
+ if (crash_shutdown_cpu == smp_processor_id())
+ longjmp(crash_shutdown_buf, 1);
+ return 0;
+}
+
#ifdef CONFIG_SMP
-static atomic_t enter_on_soft_reset = ATOMIC_INIT(0);
+static atomic_t cpus_in_crash;
void crash_ipi_callback(struct pt_regs *regs)
{
+ static cpumask_t cpus_state_saved = CPU_MASK_NONE;
+
int cpu = smp_processor_id();
if (!cpu_online(cpu))
return;
hard_irq_disable();
- if (!cpu_isset(cpu, cpus_in_crash))
+ if (!cpumask_test_cpu(cpu, &cpus_state_saved)) {
crash_save_cpu(regs, cpu);
- cpu_set(cpu, cpus_in_crash);
-
- /*
- * Entered via soft-reset - could be the kdump
- * process is invoked using soft-reset or user activated
- * it if some CPU did not respond to an IPI.
- * For soft-reset, the secondary CPU can enter this func
- * twice. 1 - using IPI, and 2. soft-reset.
- * Tell the kexec CPU that entered via soft-reset and ready
- * to go down.
- */
- if (cpu_isset(cpu, cpus_in_sr)) {
- cpu_clear(cpu, cpus_in_sr);
- atomic_inc(&enter_on_soft_reset);
+ cpumask_set_cpu(cpu, &cpus_state_saved);
}
+ atomic_inc(&cpus_in_crash);
+ smp_mb__after_atomic();
+
/*
* Starting the kdump boot.
* This barrier is needed to make sure that all CPUs are stopped.
- * If not, soft-reset will be invoked to bring other CPUs.
*/
- while (!cpu_isset(crashing_cpu, cpus_in_crash))
+ while (!time_to_dump)
cpu_relax();
if (ppc_md.kexec_cpu_down)
@@ -102,142 +102,108 @@ void crash_ipi_callback(struct pt_regs *regs)
/* NOTREACHED */
}
-/*
- * Wait until all CPUs are entered via soft-reset.
- */
-static void crash_soft_reset_check(int cpu)
-{
- unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */
-
- cpu_clear(cpu, cpus_in_sr);
- while (atomic_read(&enter_on_soft_reset) != ncpus)
- cpu_relax();
-}
-
-
static void crash_kexec_prepare_cpus(int cpu)
{
unsigned int msecs;
-
unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */
+ int tries = 0;
+ int (*old_handler)(struct pt_regs *regs);
+
+ printk(KERN_EMERG "Sending IPI to other CPUs\n");
crash_send_ipi(crash_ipi_callback);
smp_wmb();
+again:
/*
- * FIXME: Until we will have the way to stop other CPUSs reliabally,
+ * FIXME: Until we will have the way to stop other CPUs reliably,
* the crash CPU will send an IPI and wait for other CPUs to
* respond.
- * Delay of at least 10 seconds.
*/
- printk(KERN_EMERG "Sending IPI to other cpus...\n");
- msecs = 10000;
- while ((cpus_weight(cpus_in_crash) < ncpus) && (--msecs > 0)) {
- cpu_relax();
+ msecs = IPI_TIMEOUT;
+ while ((atomic_read(&cpus_in_crash) < ncpus) && (--msecs > 0))
mdelay(1);
- }
/* Would it be better to replace the trap vector here? */
+ if (atomic_read(&cpus_in_crash) >= ncpus) {
+ printk(KERN_EMERG "IPI complete\n");
+ return;
+ }
+
+ printk(KERN_EMERG "ERROR: %d cpu(s) not responding\n",
+ ncpus - atomic_read(&cpus_in_crash));
+
/*
- * FIXME: In case if we do not get all CPUs, one possibility: ask the
- * user to do soft reset such that we get all.
- * Soft-reset will be used until better mechanism is implemented.
+ * If we have a panic timeout set then we can't wait indefinitely
+ * for someone to activate system reset. We also give up on the
+ * second time through if system reset fail to work.
*/
- if (cpus_weight(cpus_in_crash) < ncpus) {
- printk(KERN_EMERG "done waiting: %d cpu(s) not responding\n",
- ncpus - cpus_weight(cpus_in_crash));
- printk(KERN_EMERG "Activate soft-reset to stop other cpu(s)\n");
- cpus_in_sr = CPU_MASK_NONE;
- atomic_set(&enter_on_soft_reset, 0);
- while (cpus_weight(cpus_in_crash) < ncpus)
- cpu_relax();
- }
+ if ((panic_timeout > 0) || (tries > 0))
+ return;
+
/*
- * Make sure all CPUs are entered via soft-reset if the kdump is
- * invoked using soft-reset.
+ * A system reset will cause all CPUs to take an 0x100 exception.
+ * The primary CPU returns here via setjmp, and the secondary
+ * CPUs reexecute the crash_kexec_secondary path.
*/
- if (cpu_isset(cpu, cpus_in_sr))
- crash_soft_reset_check(cpu);
- /* Leave the IPI callback set */
-}
+ old_handler = __debugger;
+ __debugger = handle_fault;
+ crash_shutdown_cpu = smp_processor_id();
-/* wait for all the CPUs to hit real mode but timeout if they don't come in */
-#ifdef CONFIG_PPC_STD_MMU_64
-static void crash_kexec_wait_realmode(int cpu)
-{
- unsigned int msecs;
- int i;
+ if (setjmp(crash_shutdown_buf) == 0) {
+ printk(KERN_EMERG "Activate system reset (dumprestart) "
+ "to stop other cpu(s)\n");
- msecs = 10000;
- for (i=0; i < NR_CPUS && msecs > 0; i++) {
- if (i == cpu)
- continue;
+ /*
+ * A system reset will force all CPUs to execute the
+ * crash code again. We need to reset cpus_in_crash so we
+ * wait for everyone to do this.
+ */
+ atomic_set(&cpus_in_crash, 0);
+ smp_mb();
- while (paca[i].kexec_state < KEXEC_STATE_REAL_MODE) {
- barrier();
- if (!cpu_possible(i)) {
- break;
- }
- if (!cpu_online(i)) {
- break;
- }
- msecs--;
- mdelay(1);
- }
+ while (atomic_read(&cpus_in_crash) < ncpus)
+ cpu_relax();
}
- mb();
+
+ crash_shutdown_cpu = -1;
+ __debugger = old_handler;
+
+ tries++;
+ goto again;
}
-#endif
/*
- * This function will be called by secondary cpus or by kexec cpu
- * if soft-reset is activated to stop some CPUs.
+ * This function will be called by secondary cpus.
*/
void crash_kexec_secondary(struct pt_regs *regs)
{
- int cpu = smp_processor_id();
unsigned long flags;
- int msecs = 5;
+ int msecs = SECONDARY_TIMEOUT;
local_irq_save(flags);
- /* Wait 5ms if the kexec CPU is not entered yet. */
+
+ /* Wait for the primary crash CPU to signal its progress */
while (crashing_cpu < 0) {
if (--msecs < 0) {
- /*
- * Either kdump image is not loaded or
- * kdump process is not started - Probably xmon
- * exited using 'x'(exit and recover) or
- * kexec_should_crash() failed for all running tasks.
- */
- cpu_clear(cpu, cpus_in_sr);
+ /* No response, kdump image may not have been loaded */
local_irq_restore(flags);
return;
}
+
mdelay(1);
- cpu_relax();
- }
- if (cpu == crashing_cpu) {
- /*
- * Panic CPU will enter this func only via soft-reset.
- * Wait until all secondary CPUs entered and
- * then start kexec boot.
- */
- crash_soft_reset_check(cpu);
- cpu_set(crashing_cpu, cpus_in_crash);
- if (ppc_md.kexec_cpu_down)
- ppc_md.kexec_cpu_down(1, 0);
- machine_kexec(kexec_crash_image);
- /* NOTREACHED */
}
+
crash_ipi_callback(regs);
}
-#else
+#else /* ! CONFIG_SMP */
+
static void crash_kexec_prepare_cpus(int cpu)
{
/*
- * move the secondarys to us so that we can copy
+ * move the secondaries to us so that we can copy
* the new kernel 0-0x100 safely
*
* do this if kexec in setup.c ?
@@ -251,75 +217,34 @@ static void crash_kexec_prepare_cpus(int cpu)
void crash_kexec_secondary(struct pt_regs *regs)
{
- cpus_in_sr = CPU_MASK_NONE;
}
-#endif
-#ifdef CONFIG_SPU_BASE
-
-#include <asm/spu.h>
-#include <asm/spu_priv1.h>
-
-struct crash_spu_info {
- struct spu *spu;
- u32 saved_spu_runcntl_RW;
- u32 saved_spu_status_R;
- u32 saved_spu_npc_RW;
- u64 saved_mfc_sr1_RW;
- u64 saved_mfc_dar;
- u64 saved_mfc_dsisr;
-};
+#endif /* CONFIG_SMP */
-#define CRASH_NUM_SPUS 16 /* Enough for current hardware */
-static struct crash_spu_info crash_spu_info[CRASH_NUM_SPUS];
-
-static void crash_kexec_stop_spus(void)
+/* wait for all the CPUs to hit real mode but timeout if they don't come in */
+#if defined(CONFIG_SMP) && defined(CONFIG_PPC_STD_MMU_64)
+static void crash_kexec_wait_realmode(int cpu)
{
- struct spu *spu;
+ unsigned int msecs;
int i;
- u64 tmp;
-
- for (i = 0; i < CRASH_NUM_SPUS; i++) {
- if (!crash_spu_info[i].spu)
- continue;
-
- spu = crash_spu_info[i].spu;
-
- crash_spu_info[i].saved_spu_runcntl_RW =
- in_be32(&spu->problem->spu_runcntl_RW);
- crash_spu_info[i].saved_spu_status_R =
- in_be32(&spu->problem->spu_status_R);
- crash_spu_info[i].saved_spu_npc_RW =
- in_be32(&spu->problem->spu_npc_RW);
- crash_spu_info[i].saved_mfc_dar = spu_mfc_dar_get(spu);
- crash_spu_info[i].saved_mfc_dsisr = spu_mfc_dsisr_get(spu);
- tmp = spu_mfc_sr1_get(spu);
- crash_spu_info[i].saved_mfc_sr1_RW = tmp;
-
- tmp &= ~MFC_STATE1_MASTER_RUN_CONTROL_MASK;
- spu_mfc_sr1_set(spu, tmp);
-
- __delay(200);
- }
-}
-
-void crash_register_spus(struct list_head *list)
-{
- struct spu *spu;
-
- list_for_each_entry(spu, list, full_list) {
- if (WARN_ON(spu->number >= CRASH_NUM_SPUS))
+ msecs = REAL_MODE_TIMEOUT;
+ for (i=0; i < nr_cpu_ids && msecs > 0; i++) {
+ if (i == cpu)
continue;
- crash_spu_info[spu->number].spu = spu;
+ while (paca[i].kexec_state < KEXEC_STATE_REAL_MODE) {
+ barrier();
+ if (!cpu_possible(i) || !cpu_online(i) || (msecs <= 0))
+ break;
+ msecs--;
+ mdelay(1);
+ }
}
+ mb();
}
-
#else
-static inline void crash_kexec_stop_spus(void)
-{
-}
-#endif /* CONFIG_SPU_BASE */
+static inline void crash_kexec_wait_realmode(int cpu) {}
+#endif /* CONFIG_SMP && CONFIG_PPC_STD_MMU_64 */
/*
* Register a function to be called on shutdown. Only use this if you
@@ -374,22 +299,11 @@ int crash_shutdown_unregister(crash_shutdown_t handler)
}
EXPORT_SYMBOL(crash_shutdown_unregister);
-static unsigned long crash_shutdown_buf[JMP_BUF_LEN];
-static int crash_shutdown_cpu = -1;
-
-static int handle_fault(struct pt_regs *regs)
-{
- if (crash_shutdown_cpu == smp_processor_id())
- longjmp(crash_shutdown_buf, 1);
- return 0;
-}
-
void default_machine_crash_shutdown(struct pt_regs *regs)
{
unsigned int i;
int (*old_handler)(struct pt_regs *regs);
-
/*
* This function is only called after the system
* has panicked or is otherwise in a critical state.
@@ -407,17 +321,26 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
* such that another IPI will not be sent.
*/
crashing_cpu = smp_processor_id();
- crash_save_cpu(regs, crashing_cpu);
+
+ /*
+ * If we came in via system reset, wait a while for the secondary
+ * CPUs to enter.
+ */
+ if (TRAP(regs) == 0x100)
+ mdelay(PRIMARY_TIMEOUT);
+
crash_kexec_prepare_cpus(crashing_cpu);
- cpu_set(crashing_cpu, cpus_in_crash);
-#if defined(CONFIG_PPC_STD_MMU_64) && defined(CONFIG_SMP)
+
+ crash_save_cpu(regs, crashing_cpu);
+
+ time_to_dump = 1;
+
crash_kexec_wait_realmode(crashing_cpu);
-#endif
machine_kexec_mask_interrupts();
/*
- * Call registered shutdown routines savely. Swap out
+ * Call registered shutdown routines safely. Swap out
* __debugger_fault_handler, and replace on exit.
*/
old_handler = __debugger_fault_handler;
@@ -439,8 +362,6 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
crash_shutdown_cpu = -1;
__debugger_fault_handler = old_handler;
- crash_kexec_stop_spus();
-
if (ppc_md.kexec_cpu_down)
ppc_md.kexec_cpu_down(1, 0);
}
diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c
index 8e05c16344e..7a13f378ca2 100644
--- a/arch/powerpc/kernel/crash_dump.c
+++ b/arch/powerpc/kernel/crash_dump.c
@@ -19,6 +19,7 @@
#include <asm/prom.h>
#include <asm/firmware.h>
#include <asm/uaccess.h>
+#include <asm/rtas.h>
#ifdef DEBUG
#include <asm/udbg.h>
@@ -27,10 +28,7 @@
#define DBG(fmt...)
#endif
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
-#ifndef CONFIG_RELOCATABLE
+#ifndef CONFIG_NONSTATIC_KERNEL
void __init reserve_kdump_trampoline(void)
{
memblock_reserve(0, KDUMP_RESERVE_LIMIT);
@@ -69,31 +67,7 @@ void __init setup_kdump_trampoline(void)
DBG(" <- setup_kdump_trampoline()\n");
}
-#endif /* CONFIG_RELOCATABLE */
-
-/*
- * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
- * is_kdump_kernel() to determine if we are booting after a panic. Hence
- * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
- */
-static int __init parse_elfcorehdr(char *p)
-{
- if (p)
- elfcorehdr_addr = memparse(p, &p);
-
- return 1;
-}
-__setup("elfcorehdr=", parse_elfcorehdr);
-
-static int __init parse_savemaxmem(char *p)
-{
- if (p)
- saved_max_pfn = (memparse(p, &p) >> PAGE_SHIFT) - 1;
-
- return 1;
-}
-__setup("savemaxmem=", parse_savemaxmem);
-
+#endif /* CONFIG_NONSTATIC_KERNEL */
static size_t copy_oldmem_vaddr(void *vaddr, char *buf, size_t csize,
unsigned long offset, int userbuf)
@@ -124,20 +98,51 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
size_t csize, unsigned long offset, int userbuf)
{
void *vaddr;
+ phys_addr_t paddr;
if (!csize)
return 0;
csize = min_t(size_t, csize, PAGE_SIZE);
+ paddr = pfn << PAGE_SHIFT;
- if ((min_low_pfn < pfn) && (pfn < max_pfn)) {
- vaddr = __va(pfn << PAGE_SHIFT);
+ if (memblock_is_region_memory(paddr, csize)) {
+ vaddr = __va(paddr);
csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf);
} else {
- vaddr = __ioremap(pfn << PAGE_SHIFT, PAGE_SIZE, 0);
+ vaddr = __ioremap(paddr, PAGE_SIZE, 0);
csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf);
iounmap(vaddr);
}
return csize;
}
+
+#ifdef CONFIG_PPC_RTAS
+/*
+ * The crashkernel region will almost always overlap the RTAS region, so
+ * we have to be careful when shrinking the crashkernel region.
+ */
+void crash_free_reserved_phys_range(unsigned long begin, unsigned long end)
+{
+ unsigned long addr;
+ const __be32 *basep, *sizep;
+ unsigned int rtas_start = 0, rtas_end = 0;
+
+ basep = of_get_property(rtas.dev, "linux,rtas-base", NULL);
+ sizep = of_get_property(rtas.dev, "rtas-size", NULL);
+
+ if (basep && sizep) {
+ rtas_start = be32_to_cpup(basep);
+ rtas_end = rtas_start + be32_to_cpup(sizep);
+ }
+
+ for (addr = begin; addr < end; addr += PAGE_SIZE) {
+ /* Does this page overlap with the RTAS region? */
+ if (addr <= rtas_end && ((addr + PAGE_SIZE) > rtas_start))
+ continue;
+
+ free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+ }
+}
+#endif
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index 3307a52d797..d55c76c571f 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -13,84 +13,41 @@
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/threads.h>
-#include <linux/percpu.h>
+#include <linux/hardirq.h>
#include <asm/dbell.h>
#include <asm/irq_regs.h>
#ifdef CONFIG_SMP
-struct doorbell_cpu_info {
- unsigned long messages; /* current messages bits */
- unsigned int tag; /* tag value */
-};
-
-static DEFINE_PER_CPU(struct doorbell_cpu_info, doorbell_cpu_info);
-
void doorbell_setup_this_cpu(void)
{
- struct doorbell_cpu_info *info = &__get_cpu_var(doorbell_cpu_info);
+ unsigned long tag = mfspr(SPRN_DOORBELL_CPUTAG) & PPC_DBELL_TAG_MASK;
- info->messages = 0;
- info->tag = mfspr(SPRN_PIR) & 0x3fff;
+ smp_muxed_ipi_set_data(smp_processor_id(), tag);
}
-void doorbell_message_pass(int target, int msg)
+void doorbell_cause_ipi(int cpu, unsigned long data)
{
- struct doorbell_cpu_info *info;
- int i;
-
- if (target < NR_CPUS) {
- info = &per_cpu(doorbell_cpu_info, target);
- set_bit(msg, &info->messages);
- ppc_msgsnd(PPC_DBELL, 0, info->tag);
- }
- else if (target == MSG_ALL_BUT_SELF) {
- for_each_online_cpu(i) {
- if (i == smp_processor_id())
- continue;
- info = &per_cpu(doorbell_cpu_info, i);
- set_bit(msg, &info->messages);
- ppc_msgsnd(PPC_DBELL, 0, info->tag);
- }
- }
- else { /* target == MSG_ALL */
- for_each_online_cpu(i) {
- info = &per_cpu(doorbell_cpu_info, i);
- set_bit(msg, &info->messages);
- }
- ppc_msgsnd(PPC_DBELL, PPC_DBELL_MSG_BRDCAST, 0);
- }
+ /* Order previous accesses vs. msgsnd, which is treated as a store */
+ mb();
+ ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, data);
}
void doorbell_exception(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- struct doorbell_cpu_info *info = &__get_cpu_var(doorbell_cpu_info);
- int msg;
-
- /* Warning: regs can be NULL when called from irq enable */
-
- if (!info->messages || (num_online_cpus() < 2))
- goto out;
- for (msg = 0; msg < 4; msg++)
- if (test_and_clear_bit(msg, &info->messages))
- smp_message_recv(msg);
+ irq_enter();
-out:
- set_irq_regs(old_regs);
-}
+ may_hard_irq_enable();
-void doorbell_check_self(void)
-{
- struct doorbell_cpu_info *info = &__get_cpu_var(doorbell_cpu_info);
+ __get_cpu_var(irq_stat).doorbell_irqs++;
- if (!info->messages)
- return;
+ smp_ipi_demux();
- ppc_msgsnd(PPC_DBELL, 0, info->tag);
+ irq_exit();
+ set_irq_regs(old_regs);
}
-
#else /* CONFIG_SMP */
void doorbell_exception(struct pt_regs *regs)
{
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 6e54a0fd31a..54d0116256f 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -5,6 +5,7 @@
* busses using the iommu infrastructure
*/
+#include <linux/export.h>
#include <asm/iommu.h>
/*
@@ -16,15 +17,17 @@
* to the dma address (mapping) of the first page.
*/
static void *dma_iommu_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag)
+ dma_addr_t *dma_handle, gfp_t flag,
+ struct dma_attrs *attrs)
{
return iommu_alloc_coherent(dev, get_iommu_table_base(dev), size,
- dma_handle, device_to_mask(dev), flag,
+ dma_handle, dev->coherent_dma_mask, flag,
dev_to_node(dev));
}
static void dma_iommu_free_coherent(struct device *dev, size_t size,
- void *vaddr, dma_addr_t dma_handle)
+ void *vaddr, dma_addr_t dma_handle,
+ struct dma_attrs *attrs)
{
iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle);
}
@@ -80,23 +83,37 @@ static int dma_iommu_dma_supported(struct device *dev, u64 mask)
return 0;
}
- if ((tbl->it_offset + tbl->it_size) > (mask >> IOMMU_PAGE_SHIFT)) {
- dev_info(dev, "Warning: IOMMU window too big for device mask\n");
- dev_info(dev, "mask: 0x%08llx, table end: 0x%08lx\n",
- mask, (tbl->it_offset + tbl->it_size) <<
- IOMMU_PAGE_SHIFT);
+ if (tbl->it_offset > (mask >> tbl->it_page_shift)) {
+ dev_info(dev, "Warning: IOMMU offset too big for device mask\n");
+ dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n",
+ mask, tbl->it_offset << tbl->it_page_shift);
return 0;
} else
return 1;
}
+static u64 dma_iommu_get_required_mask(struct device *dev)
+{
+ struct iommu_table *tbl = get_iommu_table_base(dev);
+ u64 mask;
+ if (!tbl)
+ return 0;
+
+ mask = 1ULL < (fls_long(tbl->it_offset + tbl->it_size) - 1);
+ mask += mask - 1;
+
+ return mask;
+}
+
struct dma_map_ops dma_iommu_ops = {
- .alloc_coherent = dma_iommu_alloc_coherent,
- .free_coherent = dma_iommu_free_coherent,
- .map_sg = dma_iommu_map_sg,
- .unmap_sg = dma_iommu_unmap_sg,
- .dma_supported = dma_iommu_dma_supported,
- .map_page = dma_iommu_map_page,
- .unmap_page = dma_iommu_unmap_page,
+ .alloc = dma_iommu_alloc_coherent,
+ .free = dma_iommu_free_coherent,
+ .mmap = dma_direct_mmap_coherent,
+ .map_sg = dma_iommu_map_sg,
+ .unmap_sg = dma_iommu_unmap_sg,
+ .dma_supported = dma_iommu_dma_supported,
+ .map_page = dma_iommu_map_page,
+ .unmap_page = dma_iommu_unmap_page,
+ .get_required_mask = dma_iommu_get_required_mask,
};
EXPORT_SYMBOL(dma_iommu_ops);
diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
index 4295e0b94b2..bd1a2aba599 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -12,6 +12,7 @@
*/
#include <linux/dma-mapping.h>
+#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/of_platform.h>
#include <linux/platform_device.h>
@@ -20,10 +21,24 @@
#include <asm/machdep.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
-#include <asm/abs_addr.h>
unsigned int ppc_swiotlb_enable;
+static u64 swiotlb_powerpc_get_required(struct device *dev)
+{
+ u64 end, mask, max_direct_dma_addr = dev->archdata.max_direct_dma_addr;
+
+ end = memblock_end_of_DRAM();
+ if (max_direct_dma_addr && end > max_direct_dma_addr)
+ end = max_direct_dma_addr;
+ end += get_dma_offset(dev);
+
+ mask = 1ULL << (fls64(end) - 1);
+ mask += mask - 1;
+
+ return mask;
+}
+
/*
* At the moment, all platforms that use this code only require
* swiotlb to be used if we're operating on HIGHMEM. Since
@@ -32,8 +47,9 @@ unsigned int ppc_swiotlb_enable;
* for everything else.
*/
struct dma_map_ops swiotlb_dma_ops = {
- .alloc_coherent = dma_direct_alloc_coherent,
- .free_coherent = dma_direct_free_coherent,
+ .alloc = dma_direct_alloc_coherent,
+ .free = dma_direct_free_coherent,
+ .mmap = dma_direct_mmap_coherent,
.map_sg = swiotlb_map_sg_attrs,
.unmap_sg = swiotlb_unmap_sg_attrs,
.dma_supported = swiotlb_dma_supported,
@@ -44,6 +60,7 @@ struct dma_map_ops swiotlb_dma_ops = {
.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
.sync_sg_for_device = swiotlb_sync_sg_for_device,
.mapping_error = swiotlb_dma_mapping_error,
+ .get_required_mask = swiotlb_powerpc_get_required,
};
void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev)
@@ -88,3 +105,23 @@ int __init swiotlb_setup_bus_notifier(void)
&ppc_swiotlb_plat_bus_notifier);
return 0;
}
+
+void swiotlb_detect_4g(void)
+{
+ if ((memblock_end_of_DRAM() - 1) > 0xffffffff)
+ ppc_swiotlb_enable = 1;
+}
+
+static int __init swiotlb_late_init(void)
+{
+ if (ppc_swiotlb_enable) {
+ swiotlb_print_info();
+ set_pci_dma_ops(&swiotlb_dma_ops);
+ ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb;
+ } else {
+ swiotlb_free();
+ }
+
+ return 0;
+}
+subsys_initcall(swiotlb_late_init);
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index cf02cad62d9..ee78f6e49d6 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -10,8 +10,10 @@
#include <linux/dma-debug.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <asm/vio.h>
#include <asm/bug.h>
-#include <asm/abs_addr.h>
#include <asm/machdep.h>
/*
@@ -25,7 +27,8 @@
void *dma_direct_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag)
+ dma_addr_t *dma_handle, gfp_t flag,
+ struct dma_attrs *attrs)
{
void *ret;
#ifdef CONFIG_NOT_COHERENT_CACHE
@@ -46,14 +49,15 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size,
return NULL;
ret = page_address(page);
memset(ret, 0, size);
- *dma_handle = virt_to_abs(ret) + get_dma_offset(dev);
+ *dma_handle = __pa(ret) + get_dma_offset(dev);
return ret;
#endif
}
void dma_direct_free_coherent(struct device *dev, size_t size,
- void *vaddr, dma_addr_t dma_handle)
+ void *vaddr, dma_addr_t dma_handle,
+ struct dma_attrs *attrs)
{
#ifdef CONFIG_NOT_COHERENT_CACHE
__dma_free_coherent(size, vaddr);
@@ -62,6 +66,24 @@ void dma_direct_free_coherent(struct device *dev, size_t size,
#endif
}
+int dma_direct_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t handle, size_t size,
+ struct dma_attrs *attrs)
+{
+ unsigned long pfn;
+
+#ifdef CONFIG_NOT_COHERENT_CACHE
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ pfn = __dma_get_coherent_pfn((unsigned long)cpu_addr);
+#else
+ pfn = page_to_pfn(virt_to_page(cpu_addr));
+#endif
+ return remap_pfn_range(vma, vma->vm_start,
+ pfn + vma->vm_pgoff,
+ vma->vm_end - vma->vm_start,
+ vma->vm_page_prot);
+}
+
static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction direction,
struct dma_attrs *attrs)
@@ -96,6 +118,18 @@ static int dma_direct_dma_supported(struct device *dev, u64 mask)
#endif
}
+static u64 dma_direct_get_required_mask(struct device *dev)
+{
+ u64 end, mask;
+
+ end = memblock_end_of_DRAM() + get_dma_offset(dev);
+
+ mask = 1ULL << (fls64(end) - 1);
+ mask += mask - 1;
+
+ return mask;
+}
+
static inline dma_addr_t dma_direct_map_page(struct device *dev,
struct page *page,
unsigned long offset,
@@ -137,13 +171,15 @@ static inline void dma_direct_sync_single(struct device *dev,
#endif
struct dma_map_ops dma_direct_ops = {
- .alloc_coherent = dma_direct_alloc_coherent,
- .free_coherent = dma_direct_free_coherent,
- .map_sg = dma_direct_map_sg,
- .unmap_sg = dma_direct_unmap_sg,
- .dma_supported = dma_direct_dma_supported,
- .map_page = dma_direct_map_page,
- .unmap_page = dma_direct_unmap_page,
+ .alloc = dma_direct_alloc_coherent,
+ .free = dma_direct_free_coherent,
+ .mmap = dma_direct_mmap_coherent,
+ .map_sg = dma_direct_map_sg,
+ .unmap_sg = dma_direct_unmap_sg,
+ .dma_supported = dma_direct_dma_supported,
+ .map_page = dma_direct_map_page,
+ .unmap_page = dma_direct_unmap_page,
+ .get_required_mask = dma_direct_get_required_mask,
#ifdef CONFIG_NOT_COHERENT_CACHE
.sync_single_for_cpu = dma_direct_sync_single,
.sync_single_for_device = dma_direct_sync_single,
@@ -155,27 +191,53 @@ EXPORT_SYMBOL(dma_direct_ops);
#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-int dma_set_mask(struct device *dev, u64 dma_mask)
+int __dma_set_mask(struct device *dev, u64 dma_mask)
{
struct dma_map_ops *dma_ops = get_dma_ops(dev);
- if (ppc_md.dma_set_mask)
- return ppc_md.dma_set_mask(dev, dma_mask);
- if (unlikely(dma_ops == NULL))
- return -EIO;
- if (dma_ops->set_dma_mask != NULL)
+ if ((dma_ops != NULL) && (dma_ops->set_dma_mask != NULL))
return dma_ops->set_dma_mask(dev, dma_mask);
if (!dev->dma_mask || !dma_supported(dev, dma_mask))
return -EIO;
*dev->dma_mask = dma_mask;
return 0;
}
+int dma_set_mask(struct device *dev, u64 dma_mask)
+{
+ if (ppc_md.dma_set_mask)
+ return ppc_md.dma_set_mask(dev, dma_mask);
+ return __dma_set_mask(dev, dma_mask);
+}
EXPORT_SYMBOL(dma_set_mask);
+u64 dma_get_required_mask(struct device *dev)
+{
+ struct dma_map_ops *dma_ops = get_dma_ops(dev);
+
+ if (ppc_md.dma_get_required_mask)
+ return ppc_md.dma_get_required_mask(dev);
+
+ if (unlikely(dma_ops == NULL))
+ return 0;
+
+ if (dma_ops->get_required_mask)
+ return dma_ops->get_required_mask(dev);
+
+ return DMA_BIT_MASK(8 * sizeof(dma_addr_t));
+}
+EXPORT_SYMBOL_GPL(dma_get_required_mask);
+
static int __init dma_init(void)
{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
+ dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
+#ifdef CONFIG_PCI
+ dma_debug_add_bus(&pci_bus_type);
+#endif
+#ifdef CONFIG_IBMVIO
+ dma_debug_add_bus(&vio_bus_type);
+#endif
return 0;
}
fs_initcall(dma_init);
+
diff --git a/arch/powerpc/kernel/e500-pmu.c b/arch/powerpc/kernel/e500-pmu.c
deleted file mode 100644
index 7c07de0d894..00000000000
--- a/arch/powerpc/kernel/e500-pmu.c
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Performance counter support for e500 family processors.
- *
- * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
- * Copyright 2010 Freescale Semiconductor, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/string.h>
-#include <linux/perf_event.h>
-#include <asm/reg.h>
-#include <asm/cputable.h>
-
-/*
- * Map of generic hardware event types to hardware events
- * Zero if unsupported
- */
-static int e500_generic_events[] = {
- [PERF_COUNT_HW_CPU_CYCLES] = 1,
- [PERF_COUNT_HW_INSTRUCTIONS] = 2,
- [PERF_COUNT_HW_CACHE_MISSES] = 41, /* Data L1 cache reloads */
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 12,
- [PERF_COUNT_HW_BRANCH_MISSES] = 15,
-};
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-/*
- * Table of generalized cache-related events.
- * 0 means not supported, -1 means nonsensical, other values
- * are event codes.
- */
-static int e500_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
- /*
- * D-cache misses are not split into read/write/prefetch;
- * use raw event 41.
- */
- [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 27, 0 },
- [C(OP_WRITE)] = { 28, 0 },
- [C(OP_PREFETCH)] = { 29, 0 },
- },
- [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 2, 60 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { 0, 0 },
- },
- /*
- * Assuming LL means L2, it's not a good match for this model.
- * It allocates only on L1 castout or explicit prefetch, and
- * does not have separate read/write events (but it does have
- * separate instruction/data events).
- */
- [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0 },
- [C(OP_WRITE)] = { 0, 0 },
- [C(OP_PREFETCH)] = { 0, 0 },
- },
- /*
- * There are data/instruction MMU misses, but that's a miss on
- * the chip's internal level-one TLB which is probably not
- * what the user wants. Instead, unified level-two TLB misses
- * are reported here.
- */
- [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 26, 66 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 12, 15 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
-};
-
-static int num_events = 128;
-
-/* Upper half of event id is PMLCb, for threshold events */
-static u64 e500_xlate_event(u64 event_id)
-{
- u32 event_low = (u32)event_id;
- u64 ret;
-
- if (event_low >= num_events)
- return 0;
-
- ret = FSL_EMB_EVENT_VALID;
-
- if (event_low >= 76 && event_low <= 81) {
- ret |= FSL_EMB_EVENT_RESTRICTED;
- ret |= event_id &
- (FSL_EMB_EVENT_THRESHMUL | FSL_EMB_EVENT_THRESH);
- } else if (event_id &
- (FSL_EMB_EVENT_THRESHMUL | FSL_EMB_EVENT_THRESH)) {
- /* Threshold requested on non-threshold event */
- return 0;
- }
-
- return ret;
-}
-
-static struct fsl_emb_pmu e500_pmu = {
- .name = "e500 family",
- .n_counter = 4,
- .n_restricted = 2,
- .xlate_event = e500_xlate_event,
- .n_generic = ARRAY_SIZE(e500_generic_events),
- .generic_events = e500_generic_events,
- .cache_events = &e500_cache_events,
-};
-
-static int init_e500_pmu(void)
-{
- if (!cur_cpu_spec->oprofile_cpu_type)
- return -ENODEV;
-
- if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc/e500mc"))
- num_events = 256;
- else if (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc/e500"))
- return -ENODEV;
-
- return register_fsl_emb_pmu(&e500_pmu);
-}
-
-arch_initcall(init_e500_pmu);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
new file mode 100644
index 00000000000..86e25702aac
--- /dev/null
+++ b/arch/powerpc/kernel/eeh.c
@@ -0,0 +1,1183 @@
+/*
+ * Copyright IBM Corporation 2001, 2005, 2006
+ * Copyright Dave Engebretsen & Todd Inglett 2001
+ * Copyright Linas Vepstas 2005, 2006
+ * Copyright 2001-2012 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/reboot.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/of.h>
+
+#include <linux/atomic.h>
+#include <asm/debug.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/rtas.h>
+
+
+/** Overview:
+ * EEH, or "Extended Error Handling" is a PCI bridge technology for
+ * dealing with PCI bus errors that can't be dealt with within the
+ * usual PCI framework, except by check-stopping the CPU. Systems
+ * that are designed for high-availability/reliability cannot afford
+ * to crash due to a "mere" PCI error, thus the need for EEH.
+ * An EEH-capable bridge operates by converting a detected error
+ * into a "slot freeze", taking the PCI adapter off-line, making
+ * the slot behave, from the OS'es point of view, as if the slot
+ * were "empty": all reads return 0xff's and all writes are silently
+ * ignored. EEH slot isolation events can be triggered by parity
+ * errors on the address or data busses (e.g. during posted writes),
+ * which in turn might be caused by low voltage on the bus, dust,
+ * vibration, humidity, radioactivity or plain-old failed hardware.
+ *
+ * Note, however, that one of the leading causes of EEH slot
+ * freeze events are buggy device drivers, buggy device microcode,
+ * or buggy device hardware. This is because any attempt by the
+ * device to bus-master data to a memory address that is not
+ * assigned to the device will trigger a slot freeze. (The idea
+ * is to prevent devices-gone-wild from corrupting system memory).
+ * Buggy hardware/drivers will have a miserable time co-existing
+ * with EEH.
+ *
+ * Ideally, a PCI device driver, when suspecting that an isolation
+ * event has occurred (e.g. by reading 0xff's), will then ask EEH
+ * whether this is the case, and then take appropriate steps to
+ * reset the PCI slot, the PCI device, and then resume operations.
+ * However, until that day, the checking is done here, with the
+ * eeh_check_failure() routine embedded in the MMIO macros. If
+ * the slot is found to be isolated, an "EEH Event" is synthesized
+ * and sent out for processing.
+ */
+
+/* If a device driver keeps reading an MMIO register in an interrupt
+ * handler after a slot isolation event, it might be broken.
+ * This sets the threshold for how many read attempts we allow
+ * before printing an error message.
+ */
+#define EEH_MAX_FAILS 2100000
+
+/* Time to wait for a PCI slot to report status, in milliseconds */
+#define PCI_BUS_RESET_WAIT_MSEC (5*60*1000)
+
+/*
+ * EEH probe mode support, which is part of the flags,
+ * is to support multiple platforms for EEH. Some platforms
+ * like pSeries do PCI emunation based on device tree.
+ * However, other platforms like powernv probe PCI devices
+ * from hardware. The flag is used to distinguish that.
+ * In addition, struct eeh_ops::probe would be invoked for
+ * particular OF node or PCI device so that the corresponding
+ * PE would be created there.
+ */
+int eeh_subsystem_flags;
+EXPORT_SYMBOL(eeh_subsystem_flags);
+
+/* Platform dependent EEH operations */
+struct eeh_ops *eeh_ops = NULL;
+
+/* Lock to avoid races due to multiple reports of an error */
+DEFINE_RAW_SPINLOCK(confirm_error_lock);
+
+/* Buffer for reporting pci register dumps. Its here in BSS, and
+ * not dynamically alloced, so that it ends up in RMO where RTAS
+ * can access it.
+ */
+#define EEH_PCI_REGS_LOG_LEN 4096
+static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN];
+
+/*
+ * The struct is used to maintain the EEH global statistic
+ * information. Besides, the EEH global statistics will be
+ * exported to user space through procfs
+ */
+struct eeh_stats {
+ u64 no_device; /* PCI device not found */
+ u64 no_dn; /* OF node not found */
+ u64 no_cfg_addr; /* Config address not found */
+ u64 ignored_check; /* EEH check skipped */
+ u64 total_mmio_ffs; /* Total EEH checks */
+ u64 false_positives; /* Unnecessary EEH checks */
+ u64 slot_resets; /* PE reset */
+};
+
+static struct eeh_stats eeh_stats;
+
+#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE)
+
+static int __init eeh_setup(char *str)
+{
+ if (!strcmp(str, "off"))
+ eeh_subsystem_flags |= EEH_FORCE_DISABLED;
+
+ return 1;
+}
+__setup("eeh=", eeh_setup);
+
+/**
+ * eeh_gather_pci_data - Copy assorted PCI config space registers to buff
+ * @edev: device to report data for
+ * @buf: point to buffer in which to log
+ * @len: amount of room in buffer
+ *
+ * This routine captures assorted PCI configuration space data,
+ * and puts them into a buffer for RTAS error logging.
+ */
+static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len)
+{
+ struct device_node *dn = eeh_dev_to_of_node(edev);
+ u32 cfg;
+ int cap, i;
+ int n = 0;
+
+ n += scnprintf(buf+n, len-n, "%s\n", dn->full_name);
+ pr_warn("EEH: of node=%s\n", dn->full_name);
+
+ eeh_ops->read_config(dn, PCI_VENDOR_ID, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
+ pr_warn("EEH: PCI device/vendor: %08x\n", cfg);
+
+ eeh_ops->read_config(dn, PCI_COMMAND, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
+ pr_warn("EEH: PCI cmd/status register: %08x\n", cfg);
+
+ /* Gather bridge-specific registers */
+ if (edev->mode & EEH_DEV_BRIDGE) {
+ eeh_ops->read_config(dn, PCI_SEC_STATUS, 2, &cfg);
+ n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
+ pr_warn("EEH: Bridge secondary status: %04x\n", cfg);
+
+ eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &cfg);
+ n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg);
+ pr_warn("EEH: Bridge control: %04x\n", cfg);
+ }
+
+ /* Dump out the PCI-X command and status regs */
+ cap = edev->pcix_cap;
+ if (cap) {
+ eeh_ops->read_config(dn, cap, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg);
+ pr_warn("EEH: PCI-X cmd: %08x\n", cfg);
+
+ eeh_ops->read_config(dn, cap+4, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg);
+ pr_warn("EEH: PCI-X status: %08x\n", cfg);
+ }
+
+ /* If PCI-E capable, dump PCI-E cap 10 */
+ cap = edev->pcie_cap;
+ if (cap) {
+ n += scnprintf(buf+n, len-n, "pci-e cap10:\n");
+ pr_warn("EEH: PCI-E capabilities and status follow:\n");
+
+ for (i=0; i<=8; i++) {
+ eeh_ops->read_config(dn, cap+4*i, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
+ pr_warn("EEH: PCI-E %02x: %08x\n", i, cfg);
+ }
+ }
+
+ /* If AER capable, dump it */
+ cap = edev->aer_cap;
+ if (cap) {
+ n += scnprintf(buf+n, len-n, "pci-e AER:\n");
+ pr_warn("EEH: PCI-E AER capability register set follows:\n");
+
+ for (i=0; i<14; i++) {
+ eeh_ops->read_config(dn, cap+4*i, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
+ pr_warn("EEH: PCI-E AER %02x: %08x\n", i, cfg);
+ }
+ }
+
+ return n;
+}
+
+/**
+ * eeh_slot_error_detail - Generate combined log including driver log and error log
+ * @pe: EEH PE
+ * @severity: temporary or permanent error log
+ *
+ * This routine should be called to generate the combined log, which
+ * is comprised of driver log and error log. The driver log is figured
+ * out from the config space of the corresponding PCI device, while
+ * the error log is fetched through platform dependent function call.
+ */
+void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
+{
+ size_t loglen = 0;
+ struct eeh_dev *edev, *tmp;
+
+ /*
+ * When the PHB is fenced or dead, it's pointless to collect
+ * the data from PCI config space because it should return
+ * 0xFF's. For ER, we still retrieve the data from the PCI
+ * config space.
+ *
+ * For pHyp, we have to enable IO for log retrieval. Otherwise,
+ * 0xFF's is always returned from PCI config space.
+ */
+ if (!(pe->type & EEH_PE_PHB)) {
+ if (eeh_probe_mode_devtree())
+ eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+ eeh_ops->configure_bridge(pe);
+ eeh_pe_restore_bars(pe);
+
+ pci_regs_buf[0] = 0;
+ eeh_pe_for_each_dev(pe, edev, tmp) {
+ loglen += eeh_gather_pci_data(edev, pci_regs_buf + loglen,
+ EEH_PCI_REGS_LOG_LEN - loglen);
+ }
+ }
+
+ eeh_ops->get_log(pe, severity, pci_regs_buf, loglen);
+}
+
+/**
+ * eeh_token_to_phys - Convert EEH address token to phys address
+ * @token: I/O token, should be address in the form 0xA....
+ *
+ * This routine should be called to convert virtual I/O address
+ * to physical one.
+ */
+static inline unsigned long eeh_token_to_phys(unsigned long token)
+{
+ pte_t *ptep;
+ unsigned long pa;
+ int hugepage_shift;
+
+ /*
+ * We won't find hugepages here, iomem
+ */
+ ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
+ if (!ptep)
+ return token;
+ WARN_ON(hugepage_shift);
+ pa = pte_pfn(*ptep) << PAGE_SHIFT;
+
+ return pa | (token & (PAGE_SIZE-1));
+}
+
+/*
+ * On PowerNV platform, we might already have fenced PHB there.
+ * For that case, it's meaningless to recover frozen PE. Intead,
+ * We have to handle fenced PHB firstly.
+ */
+static int eeh_phb_check_failure(struct eeh_pe *pe)
+{
+ struct eeh_pe *phb_pe;
+ unsigned long flags;
+ int ret;
+
+ if (!eeh_probe_mode_dev())
+ return -EPERM;
+
+ /* Find the PHB PE */
+ phb_pe = eeh_phb_pe_get(pe->phb);
+ if (!phb_pe) {
+ pr_warning("%s Can't find PE for PHB#%d\n",
+ __func__, pe->phb->global_number);
+ return -EEXIST;
+ }
+
+ /* If the PHB has been in problematic state */
+ eeh_serialize_lock(&flags);
+ if (phb_pe->state & EEH_PE_ISOLATED) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Check PHB state */
+ ret = eeh_ops->get_state(phb_pe, NULL);
+ if ((ret < 0) ||
+ (ret == EEH_STATE_NOT_SUPPORT) ||
+ (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
+ (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Isolate the PHB and send event */
+ eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
+ eeh_serialize_unlock(flags);
+
+ pr_err("EEH: PHB#%x failure detected, location: %s\n",
+ phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe));
+ dump_stack();
+ eeh_send_failure_event(phb_pe);
+
+ return 1;
+out:
+ eeh_serialize_unlock(flags);
+ return ret;
+}
+
+/**
+ * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
+ * @edev: eeh device
+ *
+ * Check for an EEH failure for the given device node. Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze. This routine
+ * will query firmware for the EEH status.
+ *
+ * Returns 0 if there has not been an EEH error; otherwise returns
+ * a non-zero value and queues up a slot isolation event notification.
+ *
+ * It is safe to call this routine in an interrupt context.
+ */
+int eeh_dev_check_failure(struct eeh_dev *edev)
+{
+ int ret;
+ int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
+ unsigned long flags;
+ struct device_node *dn;
+ struct pci_dev *dev;
+ struct eeh_pe *pe, *parent_pe, *phb_pe;
+ int rc = 0;
+ const char *location;
+
+ eeh_stats.total_mmio_ffs++;
+
+ if (!eeh_enabled())
+ return 0;
+
+ if (!edev) {
+ eeh_stats.no_dn++;
+ return 0;
+ }
+ dn = eeh_dev_to_of_node(edev);
+ dev = eeh_dev_to_pci_dev(edev);
+ pe = edev->pe;
+
+ /* Access to IO BARs might get this far and still not want checking. */
+ if (!pe) {
+ eeh_stats.ignored_check++;
+ pr_debug("EEH: Ignored check for %s %s\n",
+ eeh_pci_name(dev), dn->full_name);
+ return 0;
+ }
+
+ if (!pe->addr && !pe->config_addr) {
+ eeh_stats.no_cfg_addr++;
+ return 0;
+ }
+
+ /*
+ * On PowerNV platform, we might already have fenced PHB
+ * there and we need take care of that firstly.
+ */
+ ret = eeh_phb_check_failure(pe);
+ if (ret > 0)
+ return ret;
+
+ /* If we already have a pending isolation event for this
+ * slot, we know it's bad already, we don't need to check.
+ * Do this checking under a lock; as multiple PCI devices
+ * in one slot might report errors simultaneously, and we
+ * only want one error recovery routine running.
+ */
+ eeh_serialize_lock(&flags);
+ rc = 1;
+ if (pe->state & EEH_PE_ISOLATED) {
+ pe->check_count++;
+ if (pe->check_count % EEH_MAX_FAILS == 0) {
+ location = of_get_property(dn, "ibm,loc-code", NULL);
+ printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
+ "location=%s driver=%s pci addr=%s\n",
+ pe->check_count, location,
+ eeh_driver_name(dev), eeh_pci_name(dev));
+ printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n",
+ eeh_driver_name(dev));
+ dump_stack();
+ }
+ goto dn_unlock;
+ }
+
+ /*
+ * Now test for an EEH failure. This is VERY expensive.
+ * Note that the eeh_config_addr may be a parent device
+ * in the case of a device behind a bridge, or it may be
+ * function zero of a multi-function device.
+ * In any case they must share a common PHB.
+ */
+ ret = eeh_ops->get_state(pe, NULL);
+
+ /* Note that config-io to empty slots may fail;
+ * they are empty when they don't have children.
+ * We will punt with the following conditions: Failure to get
+ * PE's state, EEH not support and Permanently unavailable
+ * state, PE is in good state.
+ */
+ if ((ret < 0) ||
+ (ret == EEH_STATE_NOT_SUPPORT) ||
+ ((ret & active_flags) == active_flags)) {
+ eeh_stats.false_positives++;
+ pe->false_positives++;
+ rc = 0;
+ goto dn_unlock;
+ }
+
+ /*
+ * It should be corner case that the parent PE has been
+ * put into frozen state as well. We should take care
+ * that at first.
+ */
+ parent_pe = pe->parent;
+ while (parent_pe) {
+ /* Hit the ceiling ? */
+ if (parent_pe->type & EEH_PE_PHB)
+ break;
+
+ /* Frozen parent PE ? */
+ ret = eeh_ops->get_state(parent_pe, NULL);
+ if (ret > 0 &&
+ (ret & active_flags) != active_flags)
+ pe = parent_pe;
+
+ /* Next parent level */
+ parent_pe = parent_pe->parent;
+ }
+
+ eeh_stats.slot_resets++;
+
+ /* Avoid repeated reports of this failure, including problems
+ * with other functions on this device, and functions under
+ * bridges.
+ */
+ eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+ eeh_serialize_unlock(flags);
+
+ /* Most EEH events are due to device driver bugs. Having
+ * a stack trace will help the device-driver authors figure
+ * out what happened. So print that out.
+ */
+ phb_pe = eeh_phb_pe_get(pe->phb);
+ pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
+ pe->phb->global_number, pe->addr);
+ pr_err("EEH: PE location: %s, PHB location: %s\n",
+ eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
+ dump_stack();
+
+ eeh_send_failure_event(pe);
+
+ return 1;
+
+dn_unlock:
+ eeh_serialize_unlock(flags);
+ return rc;
+}
+
+EXPORT_SYMBOL_GPL(eeh_dev_check_failure);
+
+/**
+ * eeh_check_failure - Check if all 1's data is due to EEH slot freeze
+ * @token: I/O token, should be address in the form 0xA....
+ * @val: value, should be all 1's (XXX why do we need this arg??)
+ *
+ * Check for an EEH failure at the given token address. Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze event. This routine
+ * will query firmware for the EEH status.
+ *
+ * Note this routine is safe to call in an interrupt context.
+ */
+unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val)
+{
+ unsigned long addr;
+ struct eeh_dev *edev;
+
+ /* Finding the phys addr + pci device; this is pretty quick. */
+ addr = eeh_token_to_phys((unsigned long __force) token);
+ edev = eeh_addr_cache_get_dev(addr);
+ if (!edev) {
+ eeh_stats.no_device++;
+ return val;
+ }
+
+ eeh_dev_check_failure(edev);
+ return val;
+}
+
+EXPORT_SYMBOL(eeh_check_failure);
+
+
+/**
+ * eeh_pci_enable - Enable MMIO or DMA transfers for this slot
+ * @pe: EEH PE
+ *
+ * This routine should be called to reenable frozen MMIO or DMA
+ * so that it would work correctly again. It's useful while doing
+ * recovery or log collection on the indicated device.
+ */
+int eeh_pci_enable(struct eeh_pe *pe, int function)
+{
+ int rc, flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
+
+ /*
+ * pHyp doesn't allow to enable IO or DMA on unfrozen PE.
+ * Also, it's pointless to enable them on unfrozen PE. So
+ * we have the check here.
+ */
+ if (function == EEH_OPT_THAW_MMIO ||
+ function == EEH_OPT_THAW_DMA) {
+ rc = eeh_ops->get_state(pe, NULL);
+ if (rc < 0)
+ return rc;
+
+ /* Needn't to enable or already enabled */
+ if ((rc == EEH_STATE_NOT_SUPPORT) ||
+ ((rc & flags) == flags))
+ return 0;
+ }
+
+ rc = eeh_ops->set_option(pe, function);
+ if (rc)
+ pr_warn("%s: Unexpected state change %d on "
+ "PHB#%d-PE#%x, err=%d\n",
+ __func__, function, pe->phb->global_number,
+ pe->addr, rc);
+
+ rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
+ if (rc <= 0)
+ return rc;
+
+ if ((function == EEH_OPT_THAW_MMIO) &&
+ (rc & EEH_STATE_MMIO_ENABLED))
+ return 0;
+
+ if ((function == EEH_OPT_THAW_DMA) &&
+ (rc & EEH_STATE_DMA_ENABLED))
+ return 0;
+
+ return rc;
+}
+
+/**
+ * pcibios_set_pcie_slot_reset - Set PCI-E reset state
+ * @dev: pci device struct
+ * @state: reset state to enter
+ *
+ * Return value:
+ * 0 if success
+ */
+int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state)
+{
+ struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+ struct eeh_pe *pe = edev->pe;
+
+ if (!pe) {
+ pr_err("%s: No PE found on PCI device %s\n",
+ __func__, pci_name(dev));
+ return -EINVAL;
+ }
+
+ switch (state) {
+ case pcie_deassert_reset:
+ eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
+ break;
+ case pcie_hot_reset:
+ eeh_ops->reset(pe, EEH_RESET_HOT);
+ break;
+ case pcie_warm_reset:
+ eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
+ break;
+ default:
+ return -EINVAL;
+ };
+
+ return 0;
+}
+
+/**
+ * eeh_set_pe_freset - Check the required reset for the indicated device
+ * @data: EEH device
+ * @flag: return value
+ *
+ * Each device might have its preferred reset type: fundamental or
+ * hot reset. The routine is used to collected the information for
+ * the indicated device and its children so that the bunch of the
+ * devices could be reset properly.
+ */
+static void *eeh_set_dev_freset(void *data, void *flag)
+{
+ struct pci_dev *dev;
+ unsigned int *freset = (unsigned int *)flag;
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+
+ dev = eeh_dev_to_pci_dev(edev);
+ if (dev)
+ *freset |= dev->needs_freset;
+
+ return NULL;
+}
+
+/**
+ * eeh_reset_pe_once - Assert the pci #RST line for 1/4 second
+ * @pe: EEH PE
+ *
+ * Assert the PCI #RST line for 1/4 second.
+ */
+static void eeh_reset_pe_once(struct eeh_pe *pe)
+{
+ unsigned int freset = 0;
+
+ /* Determine type of EEH reset required for
+ * Partitionable Endpoint, a hot-reset (1)
+ * or a fundamental reset (3).
+ * A fundamental reset required by any device under
+ * Partitionable Endpoint trumps hot-reset.
+ */
+ eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset);
+
+ if (freset)
+ eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
+ else
+ eeh_ops->reset(pe, EEH_RESET_HOT);
+
+ eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
+}
+
+/**
+ * eeh_reset_pe - Reset the indicated PE
+ * @pe: EEH PE
+ *
+ * This routine should be called to reset indicated device, including
+ * PE. A PE might include multiple PCI devices and sometimes PCI bridges
+ * might be involved as well.
+ */
+int eeh_reset_pe(struct eeh_pe *pe)
+{
+ int flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
+ int i, rc;
+
+ /* Take three shots at resetting the bus */
+ for (i=0; i<3; i++) {
+ eeh_reset_pe_once(pe);
+
+ /*
+ * EEH_PE_ISOLATED is expected to be removed after
+ * BAR restore.
+ */
+ rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
+ if ((rc & flags) == flags)
+ return 0;
+
+ if (rc < 0) {
+ pr_err("%s: Unrecoverable slot failure on PHB#%d-PE#%x",
+ __func__, pe->phb->global_number, pe->addr);
+ return -1;
+ }
+ pr_err("EEH: bus reset %d failed on PHB#%d-PE#%x, rc=%d\n",
+ i+1, pe->phb->global_number, pe->addr, rc);
+ }
+
+ return -1;
+}
+
+/**
+ * eeh_save_bars - Save device bars
+ * @edev: PCI device associated EEH device
+ *
+ * Save the values of the device bars. Unlike the restore
+ * routine, this routine is *not* recursive. This is because
+ * PCI devices are added individually; but, for the restore,
+ * an entire slot is reset at a time.
+ */
+void eeh_save_bars(struct eeh_dev *edev)
+{
+ int i;
+ struct device_node *dn;
+
+ if (!edev)
+ return;
+ dn = eeh_dev_to_of_node(edev);
+
+ for (i = 0; i < 16; i++)
+ eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]);
+
+ /*
+ * For PCI bridges including root port, we need enable bus
+ * master explicitly. Otherwise, it can't fetch IODA table
+ * entries correctly. So we cache the bit in advance so that
+ * we can restore it after reset, either PHB range or PE range.
+ */
+ if (edev->mode & EEH_DEV_BRIDGE)
+ edev->config_space[1] |= PCI_COMMAND_MASTER;
+}
+
+/**
+ * eeh_ops_register - Register platform dependent EEH operations
+ * @ops: platform dependent EEH operations
+ *
+ * Register the platform dependent EEH operation callback
+ * functions. The platform should call this function before
+ * any other EEH operations.
+ */
+int __init eeh_ops_register(struct eeh_ops *ops)
+{
+ if (!ops->name) {
+ pr_warning("%s: Invalid EEH ops name for %p\n",
+ __func__, ops);
+ return -EINVAL;
+ }
+
+ if (eeh_ops && eeh_ops != ops) {
+ pr_warning("%s: EEH ops of platform %s already existing (%s)\n",
+ __func__, eeh_ops->name, ops->name);
+ return -EEXIST;
+ }
+
+ eeh_ops = ops;
+
+ return 0;
+}
+
+/**
+ * eeh_ops_unregister - Unreigster platform dependent EEH operations
+ * @name: name of EEH platform operations
+ *
+ * Unregister the platform dependent EEH operation callback
+ * functions.
+ */
+int __exit eeh_ops_unregister(const char *name)
+{
+ if (!name || !strlen(name)) {
+ pr_warning("%s: Invalid EEH ops name\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ if (eeh_ops && !strcmp(eeh_ops->name, name)) {
+ eeh_ops = NULL;
+ return 0;
+ }
+
+ return -EEXIST;
+}
+
+static int eeh_reboot_notifier(struct notifier_block *nb,
+ unsigned long action, void *unused)
+{
+ eeh_set_enable(false);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block eeh_reboot_nb = {
+ .notifier_call = eeh_reboot_notifier,
+};
+
+/**
+ * eeh_init - EEH initialization
+ *
+ * Initialize EEH by trying to enable it for all of the adapters in the system.
+ * As a side effect we can determine here if eeh is supported at all.
+ * Note that we leave EEH on so failed config cycles won't cause a machine
+ * check. If a user turns off EEH for a particular adapter they are really
+ * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't
+ * grant access to a slot if EEH isn't enabled, and so we always enable
+ * EEH for all slots/all devices.
+ *
+ * The eeh-force-off option disables EEH checking globally, for all slots.
+ * Even if force-off is set, the EEH hardware is still enabled, so that
+ * newer systems can boot.
+ */
+int eeh_init(void)
+{
+ struct pci_controller *hose, *tmp;
+ struct device_node *phb;
+ static int cnt = 0;
+ int ret = 0;
+
+ /*
+ * We have to delay the initialization on PowerNV after
+ * the PCI hierarchy tree has been built because the PEs
+ * are figured out based on PCI devices instead of device
+ * tree nodes
+ */
+ if (machine_is(powernv) && cnt++ <= 0)
+ return ret;
+
+ /* Register reboot notifier */
+ ret = register_reboot_notifier(&eeh_reboot_nb);
+ if (ret) {
+ pr_warn("%s: Failed to register notifier (%d)\n",
+ __func__, ret);
+ return ret;
+ }
+
+ /* call platform initialization function */
+ if (!eeh_ops) {
+ pr_warning("%s: Platform EEH operation not found\n",
+ __func__);
+ return -EEXIST;
+ } else if ((ret = eeh_ops->init())) {
+ pr_warning("%s: Failed to call platform init function (%d)\n",
+ __func__, ret);
+ return ret;
+ }
+
+ /* Initialize EEH event */
+ ret = eeh_event_init();
+ if (ret)
+ return ret;
+
+ /* Enable EEH for all adapters */
+ if (eeh_probe_mode_devtree()) {
+ list_for_each_entry_safe(hose, tmp,
+ &hose_list, list_node) {
+ phb = hose->dn;
+ traverse_pci_devices(phb, eeh_ops->of_probe, NULL);
+ }
+ } else if (eeh_probe_mode_dev()) {
+ list_for_each_entry_safe(hose, tmp,
+ &hose_list, list_node)
+ pci_walk_bus(hose->bus, eeh_ops->dev_probe, NULL);
+ } else {
+ pr_warn("%s: Invalid probe mode %x",
+ __func__, eeh_subsystem_flags);
+ return -EINVAL;
+ }
+
+ /*
+ * Call platform post-initialization. Actually, It's good chance
+ * to inform platform that EEH is ready to supply service if the
+ * I/O cache stuff has been built up.
+ */
+ if (eeh_ops->post_init) {
+ ret = eeh_ops->post_init();
+ if (ret)
+ return ret;
+ }
+
+ if (eeh_enabled())
+ pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
+ else
+ pr_warning("EEH: No capable adapters found\n");
+
+ return ret;
+}
+
+core_initcall_sync(eeh_init);
+
+/**
+ * eeh_add_device_early - Enable EEH for the indicated device_node
+ * @dn: device node for which to set up EEH
+ *
+ * This routine must be used to perform EEH initialization for PCI
+ * devices that were added after system boot (e.g. hotplug, dlpar).
+ * This routine must be called before any i/o is performed to the
+ * adapter (inluding any config-space i/o).
+ * Whether this actually enables EEH or not for this device depends
+ * on the CEC architecture, type of the device, on earlier boot
+ * command-line arguments & etc.
+ */
+void eeh_add_device_early(struct device_node *dn)
+{
+ struct pci_controller *phb;
+
+ /*
+ * If we're doing EEH probe based on PCI device, we
+ * would delay the probe until late stage because
+ * the PCI device isn't available this moment.
+ */
+ if (!eeh_probe_mode_devtree())
+ return;
+
+ if (!of_node_to_eeh_dev(dn))
+ return;
+ phb = of_node_to_eeh_dev(dn)->phb;
+
+ /* USB Bus children of PCI devices will not have BUID's */
+ if (NULL == phb || 0 == phb->buid)
+ return;
+
+ eeh_ops->of_probe(dn, NULL);
+}
+
+/**
+ * eeh_add_device_tree_early - Enable EEH for the indicated device
+ * @dn: device node
+ *
+ * This routine must be used to perform EEH initialization for the
+ * indicated PCI device that was added after system boot (e.g.
+ * hotplug, dlpar).
+ */
+void eeh_add_device_tree_early(struct device_node *dn)
+{
+ struct device_node *sib;
+
+ for_each_child_of_node(dn, sib)
+ eeh_add_device_tree_early(sib);
+ eeh_add_device_early(dn);
+}
+EXPORT_SYMBOL_GPL(eeh_add_device_tree_early);
+
+/**
+ * eeh_add_device_late - Perform EEH initialization for the indicated pci device
+ * @dev: pci device for which to set up EEH
+ *
+ * This routine must be used to complete EEH initialization for PCI
+ * devices that were added after system boot (e.g. hotplug, dlpar).
+ */
+void eeh_add_device_late(struct pci_dev *dev)
+{
+ struct device_node *dn;
+ struct eeh_dev *edev;
+
+ if (!dev || !eeh_enabled())
+ return;
+
+ pr_debug("EEH: Adding device %s\n", pci_name(dev));
+
+ dn = pci_device_to_OF_node(dev);
+ edev = of_node_to_eeh_dev(dn);
+ if (edev->pdev == dev) {
+ pr_debug("EEH: Already referenced !\n");
+ return;
+ }
+
+ /*
+ * The EEH cache might not be removed correctly because of
+ * unbalanced kref to the device during unplug time, which
+ * relies on pcibios_release_device(). So we have to remove
+ * that here explicitly.
+ */
+ if (edev->pdev) {
+ eeh_rmv_from_parent_pe(edev);
+ eeh_addr_cache_rmv_dev(edev->pdev);
+ eeh_sysfs_remove_device(edev->pdev);
+ edev->mode &= ~EEH_DEV_SYSFS;
+
+ /*
+ * We definitely should have the PCI device removed
+ * though it wasn't correctly. So we needn't call
+ * into error handler afterwards.
+ */
+ edev->mode |= EEH_DEV_NO_HANDLER;
+
+ edev->pdev = NULL;
+ dev->dev.archdata.edev = NULL;
+ }
+
+ edev->pdev = dev;
+ dev->dev.archdata.edev = edev;
+
+ /*
+ * We have to do the EEH probe here because the PCI device
+ * hasn't been created yet in the early stage.
+ */
+ if (eeh_probe_mode_dev())
+ eeh_ops->dev_probe(dev, NULL);
+
+ eeh_addr_cache_insert_dev(dev);
+}
+
+/**
+ * eeh_add_device_tree_late - Perform EEH initialization for the indicated PCI bus
+ * @bus: PCI bus
+ *
+ * This routine must be used to perform EEH initialization for PCI
+ * devices which are attached to the indicated PCI bus. The PCI bus
+ * is added after system boot through hotplug or dlpar.
+ */
+void eeh_add_device_tree_late(struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ eeh_add_device_late(dev);
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+ struct pci_bus *subbus = dev->subordinate;
+ if (subbus)
+ eeh_add_device_tree_late(subbus);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
+
+/**
+ * eeh_add_sysfs_files - Add EEH sysfs files for the indicated PCI bus
+ * @bus: PCI bus
+ *
+ * This routine must be used to add EEH sysfs files for PCI
+ * devices which are attached to the indicated PCI bus. The PCI bus
+ * is added after system boot through hotplug or dlpar.
+ */
+void eeh_add_sysfs_files(struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ eeh_sysfs_add_device(dev);
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+ struct pci_bus *subbus = dev->subordinate;
+ if (subbus)
+ eeh_add_sysfs_files(subbus);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(eeh_add_sysfs_files);
+
+/**
+ * eeh_remove_device - Undo EEH setup for the indicated pci device
+ * @dev: pci device to be removed
+ *
+ * This routine should be called when a device is removed from
+ * a running system (e.g. by hotplug or dlpar). It unregisters
+ * the PCI device from the EEH subsystem. I/O errors affecting
+ * this device will no longer be detected after this call; thus,
+ * i/o errors affecting this slot may leave this device unusable.
+ */
+void eeh_remove_device(struct pci_dev *dev)
+{
+ struct eeh_dev *edev;
+
+ if (!dev || !eeh_enabled())
+ return;
+ edev = pci_dev_to_eeh_dev(dev);
+
+ /* Unregister the device with the EEH/PCI address search system */
+ pr_debug("EEH: Removing device %s\n", pci_name(dev));
+
+ if (!edev || !edev->pdev || !edev->pe) {
+ pr_debug("EEH: Not referenced !\n");
+ return;
+ }
+
+ /*
+ * During the hotplug for EEH error recovery, we need the EEH
+ * device attached to the parent PE in order for BAR restore
+ * a bit later. So we keep it for BAR restore and remove it
+ * from the parent PE during the BAR resotre.
+ */
+ edev->pdev = NULL;
+ dev->dev.archdata.edev = NULL;
+ if (!(edev->pe->state & EEH_PE_KEEP))
+ eeh_rmv_from_parent_pe(edev);
+ else
+ edev->mode |= EEH_DEV_DISCONNECTED;
+
+ /*
+ * We're removing from the PCI subsystem, that means
+ * the PCI device driver can't support EEH or not
+ * well. So we rely on hotplug completely to do recovery
+ * for the specific PCI device.
+ */
+ edev->mode |= EEH_DEV_NO_HANDLER;
+
+ eeh_addr_cache_rmv_dev(dev);
+ eeh_sysfs_remove_device(dev);
+ edev->mode &= ~EEH_DEV_SYSFS;
+}
+
+static int proc_eeh_show(struct seq_file *m, void *v)
+{
+ if (!eeh_enabled()) {
+ seq_printf(m, "EEH Subsystem is globally disabled\n");
+ seq_printf(m, "eeh_total_mmio_ffs=%llu\n", eeh_stats.total_mmio_ffs);
+ } else {
+ seq_printf(m, "EEH Subsystem is enabled\n");
+ seq_printf(m,
+ "no device=%llu\n"
+ "no device node=%llu\n"
+ "no config address=%llu\n"
+ "check not wanted=%llu\n"
+ "eeh_total_mmio_ffs=%llu\n"
+ "eeh_false_positives=%llu\n"
+ "eeh_slot_resets=%llu\n",
+ eeh_stats.no_device,
+ eeh_stats.no_dn,
+ eeh_stats.no_cfg_addr,
+ eeh_stats.ignored_check,
+ eeh_stats.total_mmio_ffs,
+ eeh_stats.false_positives,
+ eeh_stats.slot_resets);
+ }
+
+ return 0;
+}
+
+static int proc_eeh_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_eeh_show, NULL);
+}
+
+static const struct file_operations proc_eeh_operations = {
+ .open = proc_eeh_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#ifdef CONFIG_DEBUG_FS
+static int eeh_enable_dbgfs_set(void *data, u64 val)
+{
+ if (val)
+ eeh_subsystem_flags &= ~EEH_FORCE_DISABLED;
+ else
+ eeh_subsystem_flags |= EEH_FORCE_DISABLED;
+
+ /* Notify the backend */
+ if (eeh_ops->post_init)
+ eeh_ops->post_init();
+
+ return 0;
+}
+
+static int eeh_enable_dbgfs_get(void *data, u64 *val)
+{
+ if (eeh_enabled())
+ *val = 0x1ul;
+ else
+ *val = 0x0ul;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get,
+ eeh_enable_dbgfs_set, "0x%llx\n");
+#endif
+
+static int __init eeh_init_proc(void)
+{
+ if (machine_is(pseries) || machine_is(powernv)) {
+ proc_create("powerpc/eeh", 0, NULL, &proc_eeh_operations);
+#ifdef CONFIG_DEBUG_FS
+ debugfs_create_file("eeh_enable", 0600,
+ powerpc_debugfs_root, NULL,
+ &eeh_enable_dbgfs_ops);
+#endif
+ }
+
+ return 0;
+}
+__initcall(eeh_init_proc);
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
new file mode 100644
index 00000000000..e8c9fd546a5
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -0,0 +1,310 @@
+/*
+ * PCI address cache; allows the lookup of PCI devices based on I/O address
+ *
+ * Copyright IBM Corporation 2004
+ * Copyright Linas Vepstas <linas@austin.ibm.com> 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+
+/**
+ * The pci address cache subsystem. This subsystem places
+ * PCI device address resources into a red-black tree, sorted
+ * according to the address range, so that given only an i/o
+ * address, the corresponding PCI device can be **quickly**
+ * found. It is safe to perform an address lookup in an interrupt
+ * context; this ability is an important feature.
+ *
+ * Currently, the only customer of this code is the EEH subsystem;
+ * thus, this code has been somewhat tailored to suit EEH better.
+ * In particular, the cache does *not* hold the addresses of devices
+ * for which EEH is not enabled.
+ *
+ * (Implementation Note: The RB tree seems to be better/faster
+ * than any hash algo I could think of for this problem, even
+ * with the penalty of slow pointer chases for d-cache misses).
+ */
+struct pci_io_addr_range {
+ struct rb_node rb_node;
+ unsigned long addr_lo;
+ unsigned long addr_hi;
+ struct eeh_dev *edev;
+ struct pci_dev *pcidev;
+ unsigned int flags;
+};
+
+static struct pci_io_addr_cache {
+ struct rb_root rb_root;
+ spinlock_t piar_lock;
+} pci_io_addr_cache_root;
+
+static inline struct eeh_dev *__eeh_addr_cache_get_device(unsigned long addr)
+{
+ struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node;
+
+ while (n) {
+ struct pci_io_addr_range *piar;
+ piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+ if (addr < piar->addr_lo)
+ n = n->rb_left;
+ else if (addr > piar->addr_hi)
+ n = n->rb_right;
+ else
+ return piar->edev;
+ }
+
+ return NULL;
+}
+
+/**
+ * eeh_addr_cache_get_dev - Get device, given only address
+ * @addr: mmio (PIO) phys address or i/o port number
+ *
+ * Given an mmio phys address, or a port number, find a pci device
+ * that implements this address. Be sure to pci_dev_put the device
+ * when finished. I/O port numbers are assumed to be offset
+ * from zero (that is, they do *not* have pci_io_addr added in).
+ * It is safe to call this function within an interrupt.
+ */
+struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr)
+{
+ struct eeh_dev *edev;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+ edev = __eeh_addr_cache_get_device(addr);
+ spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+ return edev;
+}
+
+#ifdef DEBUG
+/*
+ * Handy-dandy debug print routine, does nothing more
+ * than print out the contents of our addr cache.
+ */
+static void eeh_addr_cache_print(struct pci_io_addr_cache *cache)
+{
+ struct rb_node *n;
+ int cnt = 0;
+
+ n = rb_first(&cache->rb_root);
+ while (n) {
+ struct pci_io_addr_range *piar;
+ piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+ pr_debug("PCI: %s addr range %d [%lx-%lx]: %s\n",
+ (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
+ piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev));
+ cnt++;
+ n = rb_next(n);
+ }
+}
+#endif
+
+/* Insert address range into the rb tree. */
+static struct pci_io_addr_range *
+eeh_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
+ unsigned long ahi, unsigned int flags)
+{
+ struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct pci_io_addr_range *piar;
+
+ /* Walk tree, find a place to insert into tree */
+ while (*p) {
+ parent = *p;
+ piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
+ if (ahi < piar->addr_lo) {
+ p = &parent->rb_left;
+ } else if (alo > piar->addr_hi) {
+ p = &parent->rb_right;
+ } else {
+ if (dev != piar->pcidev ||
+ alo != piar->addr_lo || ahi != piar->addr_hi) {
+ pr_warning("PIAR: overlapping address range\n");
+ }
+ return piar;
+ }
+ }
+ piar = kzalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
+ if (!piar)
+ return NULL;
+
+ piar->addr_lo = alo;
+ piar->addr_hi = ahi;
+ piar->edev = pci_dev_to_eeh_dev(dev);
+ piar->pcidev = dev;
+ piar->flags = flags;
+
+#ifdef DEBUG
+ pr_debug("PIAR: insert range=[%lx:%lx] dev=%s\n",
+ alo, ahi, pci_name(dev));
+#endif
+
+ rb_link_node(&piar->rb_node, parent, p);
+ rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
+
+ return piar;
+}
+
+static void __eeh_addr_cache_insert_dev(struct pci_dev *dev)
+{
+ struct device_node *dn;
+ struct eeh_dev *edev;
+ int i;
+
+ dn = pci_device_to_OF_node(dev);
+ if (!dn) {
+ pr_warning("PCI: no pci dn found for dev=%s\n", pci_name(dev));
+ return;
+ }
+
+ edev = of_node_to_eeh_dev(dn);
+ if (!edev) {
+ pr_warning("PCI: no EEH dev found for dn=%s\n",
+ dn->full_name);
+ return;
+ }
+
+ /* Skip any devices for which EEH is not enabled. */
+ if (!eeh_probe_mode_dev() && !edev->pe) {
+#ifdef DEBUG
+ pr_info("PCI: skip building address cache for=%s - %s\n",
+ pci_name(dev), dn->full_name);
+#endif
+ return;
+ }
+
+ /* Walk resources on this device, poke them into the tree */
+ for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+ unsigned long start = pci_resource_start(dev,i);
+ unsigned long end = pci_resource_end(dev,i);
+ unsigned int flags = pci_resource_flags(dev,i);
+
+ /* We are interested only bus addresses, not dma or other stuff */
+ if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM)))
+ continue;
+ if (start == 0 || ~start == 0 || end == 0 || ~end == 0)
+ continue;
+ eeh_addr_cache_insert(dev, start, end, flags);
+ }
+}
+
+/**
+ * eeh_addr_cache_insert_dev - Add a device to the address cache
+ * @dev: PCI device whose I/O addresses we are interested in.
+ *
+ * In order to support the fast lookup of devices based on addresses,
+ * we maintain a cache of devices that can be quickly searched.
+ * This routine adds a device to that cache.
+ */
+void eeh_addr_cache_insert_dev(struct pci_dev *dev)
+{
+ unsigned long flags;
+
+ /* Ignore PCI bridges */
+ if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)
+ return;
+
+ spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+ __eeh_addr_cache_insert_dev(dev);
+ spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+static inline void __eeh_addr_cache_rmv_dev(struct pci_dev *dev)
+{
+ struct rb_node *n;
+
+restart:
+ n = rb_first(&pci_io_addr_cache_root.rb_root);
+ while (n) {
+ struct pci_io_addr_range *piar;
+ piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+ if (piar->pcidev == dev) {
+ rb_erase(n, &pci_io_addr_cache_root.rb_root);
+ kfree(piar);
+ goto restart;
+ }
+ n = rb_next(n);
+ }
+}
+
+/**
+ * eeh_addr_cache_rmv_dev - remove pci device from addr cache
+ * @dev: device to remove
+ *
+ * Remove a device from the addr-cache tree.
+ * This is potentially expensive, since it will walk
+ * the tree multiple times (once per resource).
+ * But so what; device removal doesn't need to be that fast.
+ */
+void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+ __eeh_addr_cache_rmv_dev(dev);
+ spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+/**
+ * eeh_addr_cache_build - Build a cache of I/O addresses
+ *
+ * Build a cache of pci i/o addresses. This cache will be used to
+ * find the pci device that corresponds to a given address.
+ * This routine scans all pci busses to build the cache.
+ * Must be run late in boot process, after the pci controllers
+ * have been scanned for devices (after all device resources are known).
+ */
+void eeh_addr_cache_build(void)
+{
+ struct device_node *dn;
+ struct eeh_dev *edev;
+ struct pci_dev *dev = NULL;
+
+ spin_lock_init(&pci_io_addr_cache_root.piar_lock);
+
+ for_each_pci_dev(dev) {
+ dn = pci_device_to_OF_node(dev);
+ if (!dn)
+ continue;
+
+ edev = of_node_to_eeh_dev(dn);
+ if (!edev)
+ continue;
+
+ dev->dev.archdata.edev = edev;
+ edev->pdev = dev;
+
+ eeh_addr_cache_insert_dev(dev);
+ eeh_sysfs_add_device(dev);
+ }
+
+#ifdef DEBUG
+ /* Verify tree built up above, echo back the list of addrs. */
+ eeh_addr_cache_print(&pci_io_addr_cache_root);
+#endif
+}
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
new file mode 100644
index 00000000000..1efa28f5fc5
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -0,0 +1,112 @@
+/*
+ * The file intends to implement dynamic creation of EEH device, which will
+ * be bound with OF node and PCI device simutaneously. The EEH devices would
+ * be foundamental information for EEH core components to work proerly. Besides,
+ * We have to support multiple situations where dynamic creation of EEH device
+ * is required:
+ *
+ * 1) Before PCI emunation starts, we need create EEH devices according to the
+ * PCI sensitive OF nodes.
+ * 2) When PCI emunation is done, we need do the binding between PCI device and
+ * the associated EEH device.
+ * 3) DR (Dynamic Reconfiguration) would create PCI sensitive OF node. EEH device
+ * will be created while PCI sensitive OF node is detected from DR.
+ * 4) PCI hotplug needs redoing the binding between PCI device and EEH device. If
+ * PHB is newly inserted, we also need create EEH devices accordingly.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+/**
+ * eeh_dev_init - Create EEH device according to OF node
+ * @dn: device node
+ * @data: PHB
+ *
+ * It will create EEH device according to the given OF node. The function
+ * might be called by PCI emunation, DR, PHB hotplug.
+ */
+void *eeh_dev_init(struct device_node *dn, void *data)
+{
+ struct pci_controller *phb = data;
+ struct eeh_dev *edev;
+
+ /* Allocate EEH device */
+ edev = kzalloc(sizeof(*edev), GFP_KERNEL);
+ if (!edev) {
+ pr_warning("%s: out of memory\n", __func__);
+ return NULL;
+ }
+
+ /* Associate EEH device with OF node */
+ PCI_DN(dn)->edev = edev;
+ edev->dn = dn;
+ edev->phb = phb;
+ INIT_LIST_HEAD(&edev->list);
+
+ return NULL;
+}
+
+/**
+ * eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB
+ * @phb: PHB
+ *
+ * Scan the PHB OF node and its child association, then create the
+ * EEH devices accordingly
+ */
+void eeh_dev_phb_init_dynamic(struct pci_controller *phb)
+{
+ struct device_node *dn = phb->dn;
+
+ /* EEH PE for PHB */
+ eeh_phb_pe_create(phb);
+
+ /* EEH device for PHB */
+ eeh_dev_init(dn, phb);
+
+ /* EEH devices for children OF nodes */
+ traverse_pci_devices(dn, eeh_dev_init, phb);
+}
+
+/**
+ * eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs
+ *
+ * Scan all the existing PHBs and create EEH devices for their OF
+ * nodes and their children OF nodes
+ */
+static int __init eeh_dev_phb_init(void)
+{
+ struct pci_controller *phb, *tmp;
+
+ list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
+ eeh_dev_phb_init_dynamic(phb);
+
+ pr_info("EEH: devices created\n");
+
+ return 0;
+}
+
+core_initcall(eeh_dev_phb_init);
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
new file mode 100644
index 00000000000..420da61d4ce
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -0,0 +1,870 @@
+/*
+ * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
+ * Copyright IBM Corp. 2004 2005
+ * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+
+/**
+ * eeh_pcid_name - Retrieve name of PCI device driver
+ * @pdev: PCI device
+ *
+ * This routine is used to retrieve the name of PCI device driver
+ * if that's valid.
+ */
+static inline const char *eeh_pcid_name(struct pci_dev *pdev)
+{
+ if (pdev && pdev->dev.driver)
+ return pdev->dev.driver->name;
+ return "";
+}
+
+/**
+ * eeh_pcid_get - Get the PCI device driver
+ * @pdev: PCI device
+ *
+ * The function is used to retrieve the PCI device driver for
+ * the indicated PCI device. Besides, we will increase the reference
+ * of the PCI device driver to prevent that being unloaded on
+ * the fly. Otherwise, kernel crash would be seen.
+ */
+static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
+{
+ if (!pdev || !pdev->driver)
+ return NULL;
+
+ if (!try_module_get(pdev->driver->driver.owner))
+ return NULL;
+
+ return pdev->driver;
+}
+
+/**
+ * eeh_pcid_put - Dereference on the PCI device driver
+ * @pdev: PCI device
+ *
+ * The function is called to do dereference on the PCI device
+ * driver of the indicated PCI device.
+ */
+static inline void eeh_pcid_put(struct pci_dev *pdev)
+{
+ if (!pdev || !pdev->driver)
+ return;
+
+ module_put(pdev->driver->driver.owner);
+}
+
+#if 0
+static void print_device_node_tree(struct pci_dn *pdn, int dent)
+{
+ int i;
+ struct device_node *pc;
+
+ if (!pdn)
+ return;
+ for (i = 0; i < dent; i++)
+ printk(" ");
+ printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
+ pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,
+ pdn->eeh_pe_config_addr, pdn->node->full_name);
+ dent += 3;
+ pc = pdn->node->child;
+ while (pc) {
+ print_device_node_tree(PCI_DN(pc), dent);
+ pc = pc->sibling;
+ }
+}
+#endif
+
+/**
+ * eeh_disable_irq - Disable interrupt for the recovering device
+ * @dev: PCI device
+ *
+ * This routine must be called when reporting temporary or permanent
+ * error to the particular PCI device to disable interrupt of that
+ * device. If the device has enabled MSI or MSI-X interrupt, we needn't
+ * do real work because EEH should freeze DMA transfers for those PCI
+ * devices encountering EEH errors, which includes MSI or MSI-X.
+ */
+static void eeh_disable_irq(struct pci_dev *dev)
+{
+ struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+
+ /* Don't disable MSI and MSI-X interrupts. They are
+ * effectively disabled by the DMA Stopped state
+ * when an EEH error occurs.
+ */
+ if (dev->msi_enabled || dev->msix_enabled)
+ return;
+
+ if (!irq_has_action(dev->irq))
+ return;
+
+ edev->mode |= EEH_DEV_IRQ_DISABLED;
+ disable_irq_nosync(dev->irq);
+}
+
+/**
+ * eeh_enable_irq - Enable interrupt for the recovering device
+ * @dev: PCI device
+ *
+ * This routine must be called to enable interrupt while failed
+ * device could be resumed.
+ */
+static void eeh_enable_irq(struct pci_dev *dev)
+{
+ struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+
+ if ((edev->mode) & EEH_DEV_IRQ_DISABLED) {
+ edev->mode &= ~EEH_DEV_IRQ_DISABLED;
+ /*
+ * FIXME !!!!!
+ *
+ * This is just ass backwards. This maze has
+ * unbalanced irq_enable/disable calls. So instead of
+ * finding the root cause it works around the warning
+ * in the irq_enable code by conditionally calling
+ * into it.
+ *
+ * That's just wrong.The warning in the core code is
+ * there to tell people to fix their assymetries in
+ * their own code, not by abusing the core information
+ * to avoid it.
+ *
+ * I so wish that the assymetry would be the other way
+ * round and a few more irq_disable calls render that
+ * shit unusable forever.
+ *
+ * tglx
+ */
+ if (irqd_irq_disabled(irq_get_irq_data(dev->irq)))
+ enable_irq(dev->irq);
+ }
+}
+
+static bool eeh_dev_removed(struct eeh_dev *edev)
+{
+ /* EEH device removed ? */
+ if (!edev || (edev->mode & EEH_DEV_REMOVED))
+ return true;
+
+ return false;
+}
+
+/**
+ * eeh_report_error - Report pci error to each device driver
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * Report an EEH error to each device driver, collect up and
+ * merge the device driver responses. Cumulative response
+ * passed back in "userdata".
+ */
+static void *eeh_report_error(void *data, void *userdata)
+{
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+ struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+ enum pci_ers_result rc, *res = userdata;
+ struct pci_driver *driver;
+
+ if (!dev || eeh_dev_removed(edev))
+ return NULL;
+ dev->error_state = pci_channel_io_frozen;
+
+ driver = eeh_pcid_get(dev);
+ if (!driver) return NULL;
+
+ eeh_disable_irq(dev);
+
+ if (!driver->err_handler ||
+ !driver->err_handler->error_detected) {
+ eeh_pcid_put(dev);
+ return NULL;
+ }
+
+ rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
+
+ /* A driver that needs a reset trumps all others */
+ if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+ if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+
+ eeh_pcid_put(dev);
+ return NULL;
+}
+
+/**
+ * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * Tells each device driver that IO ports, MMIO and config space I/O
+ * are now enabled. Collects up and merges the device driver responses.
+ * Cumulative response passed back in "userdata".
+ */
+static void *eeh_report_mmio_enabled(void *data, void *userdata)
+{
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+ struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+ enum pci_ers_result rc, *res = userdata;
+ struct pci_driver *driver;
+
+ if (!dev || eeh_dev_removed(edev))
+ return NULL;
+
+ driver = eeh_pcid_get(dev);
+ if (!driver) return NULL;
+
+ if (!driver->err_handler ||
+ !driver->err_handler->mmio_enabled ||
+ (edev->mode & EEH_DEV_NO_HANDLER)) {
+ eeh_pcid_put(dev);
+ return NULL;
+ }
+
+ rc = driver->err_handler->mmio_enabled(dev);
+
+ /* A driver that needs a reset trumps all others */
+ if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+ if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+
+ eeh_pcid_put(dev);
+ return NULL;
+}
+
+/**
+ * eeh_report_reset - Tell device that slot has been reset
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * This routine must be called while EEH tries to reset particular
+ * PCI device so that the associated PCI device driver could take
+ * some actions, usually to save data the driver needs so that the
+ * driver can work again while the device is recovered.
+ */
+static void *eeh_report_reset(void *data, void *userdata)
+{
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+ struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+ enum pci_ers_result rc, *res = userdata;
+ struct pci_driver *driver;
+
+ if (!dev || eeh_dev_removed(edev))
+ return NULL;
+ dev->error_state = pci_channel_io_normal;
+
+ driver = eeh_pcid_get(dev);
+ if (!driver) return NULL;
+
+ eeh_enable_irq(dev);
+
+ if (!driver->err_handler ||
+ !driver->err_handler->slot_reset ||
+ (edev->mode & EEH_DEV_NO_HANDLER)) {
+ eeh_pcid_put(dev);
+ return NULL;
+ }
+
+ rc = driver->err_handler->slot_reset(dev);
+ if ((*res == PCI_ERS_RESULT_NONE) ||
+ (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
+ if (*res == PCI_ERS_RESULT_DISCONNECT &&
+ rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+
+ eeh_pcid_put(dev);
+ return NULL;
+}
+
+/**
+ * eeh_report_resume - Tell device to resume normal operations
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * This routine must be called to notify the device driver that it
+ * could resume so that the device driver can do some initialization
+ * to make the recovered device work again.
+ */
+static void *eeh_report_resume(void *data, void *userdata)
+{
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+ struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+ struct pci_driver *driver;
+
+ if (!dev || eeh_dev_removed(edev))
+ return NULL;
+ dev->error_state = pci_channel_io_normal;
+
+ driver = eeh_pcid_get(dev);
+ if (!driver) return NULL;
+
+ eeh_enable_irq(dev);
+
+ if (!driver->err_handler ||
+ !driver->err_handler->resume ||
+ (edev->mode & EEH_DEV_NO_HANDLER)) {
+ edev->mode &= ~EEH_DEV_NO_HANDLER;
+ eeh_pcid_put(dev);
+ return NULL;
+ }
+
+ driver->err_handler->resume(dev);
+
+ eeh_pcid_put(dev);
+ return NULL;
+}
+
+/**
+ * eeh_report_failure - Tell device driver that device is dead.
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * This informs the device driver that the device is permanently
+ * dead, and that no further recovery attempts will be made on it.
+ */
+static void *eeh_report_failure(void *data, void *userdata)
+{
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+ struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+ struct pci_driver *driver;
+
+ if (!dev || eeh_dev_removed(edev))
+ return NULL;
+ dev->error_state = pci_channel_io_perm_failure;
+
+ driver = eeh_pcid_get(dev);
+ if (!driver) return NULL;
+
+ eeh_disable_irq(dev);
+
+ if (!driver->err_handler ||
+ !driver->err_handler->error_detected) {
+ eeh_pcid_put(dev);
+ return NULL;
+ }
+
+ driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
+
+ eeh_pcid_put(dev);
+ return NULL;
+}
+
+static void *eeh_rmv_device(void *data, void *userdata)
+{
+ struct pci_driver *driver;
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+ struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+ int *removed = (int *)userdata;
+
+ /*
+ * Actually, we should remove the PCI bridges as well.
+ * However, that's lots of complexity to do that,
+ * particularly some of devices under the bridge might
+ * support EEH. So we just care about PCI devices for
+ * simplicity here.
+ */
+ if (!dev || (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE))
+ return NULL;
+
+ /*
+ * We rely on count-based pcibios_release_device() to
+ * detach permanently offlined PEs. Unfortunately, that's
+ * not reliable enough. We might have the permanently
+ * offlined PEs attached, but we needn't take care of
+ * them and their child devices.
+ */
+ if (eeh_dev_removed(edev))
+ return NULL;
+
+ driver = eeh_pcid_get(dev);
+ if (driver) {
+ eeh_pcid_put(dev);
+ if (driver->err_handler)
+ return NULL;
+ }
+
+ /* Remove it from PCI subsystem */
+ pr_debug("EEH: Removing %s without EEH sensitive driver\n",
+ pci_name(dev));
+ edev->bus = dev->bus;
+ edev->mode |= EEH_DEV_DISCONNECTED;
+ (*removed)++;
+
+ pci_lock_rescan_remove();
+ pci_stop_and_remove_bus_device(dev);
+ pci_unlock_rescan_remove();
+
+ return NULL;
+}
+
+static void *eeh_pe_detach_dev(void *data, void *userdata)
+{
+ struct eeh_pe *pe = (struct eeh_pe *)data;
+ struct eeh_dev *edev, *tmp;
+
+ eeh_pe_for_each_dev(pe, edev, tmp) {
+ if (!(edev->mode & EEH_DEV_DISCONNECTED))
+ continue;
+
+ edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED);
+ eeh_rmv_from_parent_pe(edev);
+ }
+
+ return NULL;
+}
+
+/*
+ * Explicitly clear PE's frozen state for PowerNV where
+ * we have frozen PE until BAR restore is completed. It's
+ * harmless to clear it for pSeries. To be consistent with
+ * PE reset (for 3 times), we try to clear the frozen state
+ * for 3 times as well.
+ */
+static void *__eeh_clear_pe_frozen_state(void *data, void *flag)
+{
+ struct eeh_pe *pe = (struct eeh_pe *)data;
+ int i, rc;
+
+ for (i = 0; i < 3; i++) {
+ rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+ if (rc)
+ continue;
+ rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
+ if (!rc)
+ break;
+ }
+
+ /* The PE has been isolated, clear it */
+ if (rc) {
+ pr_warn("%s: Can't clear frozen PHB#%x-PE#%x (%d)\n",
+ __func__, pe->phb->global_number, pe->addr, rc);
+ return (void *)pe;
+ }
+
+ return NULL;
+}
+
+static int eeh_clear_pe_frozen_state(struct eeh_pe *pe)
+{
+ void *rc;
+
+ rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, NULL);
+ if (!rc)
+ eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+
+ return rc ? -EIO : 0;
+}
+
+/**
+ * eeh_reset_device - Perform actual reset of a pci slot
+ * @pe: EEH PE
+ * @bus: PCI bus corresponding to the isolcated slot
+ *
+ * This routine must be called to do reset on the indicated PE.
+ * During the reset, udev might be invoked because those affected
+ * PCI devices will be removed and then added.
+ */
+static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
+{
+ struct pci_bus *frozen_bus = eeh_pe_bus_get(pe);
+ struct timeval tstamp;
+ int cnt, rc, removed = 0;
+
+ /* pcibios will clear the counter; save the value */
+ cnt = pe->freeze_count;
+ tstamp = pe->tstamp;
+
+ /*
+ * We don't remove the corresponding PE instances because
+ * we need the information afterwords. The attached EEH
+ * devices are expected to be attached soon when calling
+ * into pcibios_add_pci_devices().
+ */
+ eeh_pe_state_mark(pe, EEH_PE_KEEP);
+ if (bus) {
+ pci_lock_rescan_remove();
+ pcibios_remove_pci_devices(bus);
+ pci_unlock_rescan_remove();
+ } else if (frozen_bus) {
+ eeh_pe_dev_traverse(pe, eeh_rmv_device, &removed);
+ }
+
+ /*
+ * Reset the pci controller. (Asserts RST#; resets config space).
+ * Reconfigure bridges and devices. Don't try to bring the system
+ * up if the reset failed for some reason.
+ *
+ * During the reset, it's very dangerous to have uncontrolled PCI
+ * config accesses. So we prefer to block them. However, controlled
+ * PCI config accesses initiated from EEH itself are allowed.
+ */
+ eeh_pe_state_mark(pe, EEH_PE_RESET);
+ rc = eeh_reset_pe(pe);
+ if (rc) {
+ eeh_pe_state_clear(pe, EEH_PE_RESET);
+ return rc;
+ }
+
+ pci_lock_rescan_remove();
+
+ /* Restore PE */
+ eeh_ops->configure_bridge(pe);
+ eeh_pe_restore_bars(pe);
+ eeh_pe_state_clear(pe, EEH_PE_RESET);
+
+ /* Clear frozen state */
+ rc = eeh_clear_pe_frozen_state(pe);
+ if (rc)
+ return rc;
+
+ /* Give the system 5 seconds to finish running the user-space
+ * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes,
+ * this is a hack, but if we don't do this, and try to bring
+ * the device up before the scripts have taken it down,
+ * potentially weird things happen.
+ */
+ if (bus) {
+ pr_info("EEH: Sleep 5s ahead of complete hotplug\n");
+ ssleep(5);
+
+ /*
+ * The EEH device is still connected with its parent
+ * PE. We should disconnect it so the binding can be
+ * rebuilt when adding PCI devices.
+ */
+ eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
+ pcibios_add_pci_devices(bus);
+ } else if (frozen_bus && removed) {
+ pr_info("EEH: Sleep 5s ahead of partial hotplug\n");
+ ssleep(5);
+
+ eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
+ pcibios_add_pci_devices(frozen_bus);
+ }
+ eeh_pe_state_clear(pe, EEH_PE_KEEP);
+
+ pe->tstamp = tstamp;
+ pe->freeze_count = cnt;
+
+ pci_unlock_rescan_remove();
+ return 0;
+}
+
+/* The longest amount of time to wait for a pci device
+ * to come back on line, in seconds.
+ */
+#define MAX_WAIT_FOR_RECOVERY 300
+
+static void eeh_handle_normal_event(struct eeh_pe *pe)
+{
+ struct pci_bus *frozen_bus;
+ int rc = 0;
+ enum pci_ers_result result = PCI_ERS_RESULT_NONE;
+
+ frozen_bus = eeh_pe_bus_get(pe);
+ if (!frozen_bus) {
+ pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n",
+ __func__, pe->phb->global_number, pe->addr);
+ return;
+ }
+
+ eeh_pe_update_time_stamp(pe);
+ pe->freeze_count++;
+ if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES)
+ goto excess_failures;
+ pr_warning("EEH: This PCI device has failed %d times in the last hour\n",
+ pe->freeze_count);
+
+ /* Walk the various device drivers attached to this slot through
+ * a reset sequence, giving each an opportunity to do what it needs
+ * to accomplish the reset. Each child gets a report of the
+ * status ... if any child can't handle the reset, then the entire
+ * slot is dlpar removed and added.
+ */
+ pr_info("EEH: Notify device drivers to shutdown\n");
+ eeh_pe_dev_traverse(pe, eeh_report_error, &result);
+
+ /* Get the current PCI slot state. This can take a long time,
+ * sometimes over 3 seconds for certain systems.
+ */
+ rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
+ if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
+ pr_warning("EEH: Permanent failure\n");
+ goto hard_fail;
+ }
+
+ /* Since rtas may enable MMIO when posting the error log,
+ * don't post the error log until after all dev drivers
+ * have been informed.
+ */
+ pr_info("EEH: Collect temporary log\n");
+ eeh_slot_error_detail(pe, EEH_LOG_TEMP);
+
+ /* If all device drivers were EEH-unaware, then shut
+ * down all of the device drivers, and hope they
+ * go down willingly, without panicing the system.
+ */
+ if (result == PCI_ERS_RESULT_NONE) {
+ pr_info("EEH: Reset with hotplug activity\n");
+ rc = eeh_reset_device(pe, frozen_bus);
+ if (rc) {
+ pr_warning("%s: Unable to reset, err=%d\n",
+ __func__, rc);
+ goto hard_fail;
+ }
+ }
+
+ /* If all devices reported they can proceed, then re-enable MMIO */
+ if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+ pr_info("EEH: Enable I/O for affected devices\n");
+ rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+
+ if (rc < 0)
+ goto hard_fail;
+ if (rc) {
+ result = PCI_ERS_RESULT_NEED_RESET;
+ } else {
+ pr_info("EEH: Notify device drivers to resume I/O\n");
+ eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result);
+ }
+ }
+
+ /* If all devices reported they can proceed, then re-enable DMA */
+ if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+ pr_info("EEH: Enabled DMA for affected devices\n");
+ rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
+
+ if (rc < 0)
+ goto hard_fail;
+ if (rc) {
+ result = PCI_ERS_RESULT_NEED_RESET;
+ } else {
+ /*
+ * We didn't do PE reset for the case. The PE
+ * is still in frozen state. Clear it before
+ * resuming the PE.
+ */
+ eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+ result = PCI_ERS_RESULT_RECOVERED;
+ }
+ }
+
+ /* If any device has a hard failure, then shut off everything. */
+ if (result == PCI_ERS_RESULT_DISCONNECT) {
+ pr_warning("EEH: Device driver gave up\n");
+ goto hard_fail;
+ }
+
+ /* If any device called out for a reset, then reset the slot */
+ if (result == PCI_ERS_RESULT_NEED_RESET) {
+ pr_info("EEH: Reset without hotplug activity\n");
+ rc = eeh_reset_device(pe, NULL);
+ if (rc) {
+ pr_warning("%s: Cannot reset, err=%d\n",
+ __func__, rc);
+ goto hard_fail;
+ }
+
+ pr_info("EEH: Notify device drivers "
+ "the completion of reset\n");
+ result = PCI_ERS_RESULT_NONE;
+ eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
+ }
+
+ /* All devices should claim they have recovered by now. */
+ if ((result != PCI_ERS_RESULT_RECOVERED) &&
+ (result != PCI_ERS_RESULT_NONE)) {
+ pr_warning("EEH: Not recovered\n");
+ goto hard_fail;
+ }
+
+ /* Tell all device drivers that they can resume operations */
+ pr_info("EEH: Notify device driver to resume\n");
+ eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
+
+ return;
+
+excess_failures:
+ /*
+ * About 90% of all real-life EEH failures in the field
+ * are due to poorly seated PCI cards. Only 10% or so are
+ * due to actual, failed cards.
+ */
+ pr_err("EEH: PHB#%d-PE#%x has failed %d times in the\n"
+ "last hour and has been permanently disabled.\n"
+ "Please try reseating or replacing it.\n",
+ pe->phb->global_number, pe->addr,
+ pe->freeze_count);
+ goto perm_error;
+
+hard_fail:
+ pr_err("EEH: Unable to recover from failure from PHB#%d-PE#%x.\n"
+ "Please try reseating or replacing it\n",
+ pe->phb->global_number, pe->addr);
+
+perm_error:
+ eeh_slot_error_detail(pe, EEH_LOG_PERM);
+
+ /* Notify all devices that they're about to go down. */
+ eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
+
+ /* Mark the PE to be removed permanently */
+ pe->freeze_count = EEH_MAX_ALLOWED_FREEZES + 1;
+
+ /*
+ * Shut down the device drivers for good. We mark
+ * all removed devices correctly to avoid access
+ * the their PCI config any more.
+ */
+ if (frozen_bus) {
+ eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
+
+ pci_lock_rescan_remove();
+ pcibios_remove_pci_devices(frozen_bus);
+ pci_unlock_rescan_remove();
+ }
+}
+
+static void eeh_handle_special_event(void)
+{
+ struct eeh_pe *pe, *phb_pe;
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ unsigned long flags;
+ int rc;
+
+
+ do {
+ rc = eeh_ops->next_error(&pe);
+
+ switch (rc) {
+ case EEH_NEXT_ERR_DEAD_IOC:
+ /* Mark all PHBs in dead state */
+ eeh_serialize_lock(&flags);
+
+ /* Purge all events */
+ eeh_remove_event(NULL, true);
+
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb_pe = eeh_phb_pe_get(hose);
+ if (!phb_pe) continue;
+
+ eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
+ }
+
+ eeh_serialize_unlock(flags);
+
+ break;
+ case EEH_NEXT_ERR_FROZEN_PE:
+ case EEH_NEXT_ERR_FENCED_PHB:
+ case EEH_NEXT_ERR_DEAD_PHB:
+ /* Mark the PE in fenced state */
+ eeh_serialize_lock(&flags);
+
+ /* Purge all events of the PHB */
+ eeh_remove_event(pe, true);
+
+ if (rc == EEH_NEXT_ERR_DEAD_PHB)
+ eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+ else
+ eeh_pe_state_mark(pe,
+ EEH_PE_ISOLATED | EEH_PE_RECOVERING);
+
+ eeh_serialize_unlock(flags);
+
+ break;
+ case EEH_NEXT_ERR_NONE:
+ return;
+ default:
+ pr_warn("%s: Invalid value %d from next_error()\n",
+ __func__, rc);
+ return;
+ }
+
+ /*
+ * For fenced PHB and frozen PE, it's handled as normal
+ * event. We have to remove the affected PHBs for dead
+ * PHB and IOC
+ */
+ if (rc == EEH_NEXT_ERR_FROZEN_PE ||
+ rc == EEH_NEXT_ERR_FENCED_PHB) {
+ eeh_handle_normal_event(pe);
+ eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+ } else {
+ pci_lock_rescan_remove();
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb_pe = eeh_phb_pe_get(hose);
+ if (!phb_pe ||
+ !(phb_pe->state & EEH_PE_ISOLATED) ||
+ (phb_pe->state & EEH_PE_RECOVERING))
+ continue;
+
+ /* Notify all devices to be down */
+ bus = eeh_pe_bus_get(phb_pe);
+ eeh_pe_dev_traverse(pe,
+ eeh_report_failure, NULL);
+ pcibios_remove_pci_devices(bus);
+ }
+ pci_unlock_rescan_remove();
+ }
+
+ /*
+ * If we have detected dead IOC, we needn't proceed
+ * any more since all PHBs would have been removed
+ */
+ if (rc == EEH_NEXT_ERR_DEAD_IOC)
+ break;
+ } while (rc != EEH_NEXT_ERR_NONE);
+}
+
+/**
+ * eeh_handle_event - Reset a PCI device after hard lockup.
+ * @pe: EEH PE
+ *
+ * While PHB detects address or data parity errors on particular PCI
+ * slot, the associated PE will be frozen. Besides, DMA's occurring
+ * to wild addresses (which usually happen due to bugs in device
+ * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
+ * #PERR or other misc PCI-related errors also can trigger EEH errors.
+ *
+ * Recovery process consists of unplugging the device driver (which
+ * generated hotplug events to userspace), then issuing a PCI #RST to
+ * the device, then reconfiguring the PCI config space for all bridges
+ * & devices under this slot, and then finally restarting the device
+ * drivers (which cause a second set of hotplug events to go out to
+ * userspace).
+ */
+void eeh_handle_event(struct eeh_pe *pe)
+{
+ if (pe)
+ eeh_handle_normal_event(pe);
+ else
+ eeh_handle_special_event();
+}
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
new file mode 100644
index 00000000000..4eefb6e34db
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -0,0 +1,196 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
+ */
+
+#include <linux/delay.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
+
+/** Overview:
+ * EEH error states may be detected within exception handlers;
+ * however, the recovery processing needs to occur asynchronously
+ * in a normal kernel context and not an interrupt context.
+ * This pair of routines creates an event and queues it onto a
+ * work-queue, where a worker thread can drive recovery.
+ */
+
+static DEFINE_SPINLOCK(eeh_eventlist_lock);
+static struct semaphore eeh_eventlist_sem;
+LIST_HEAD(eeh_eventlist);
+
+/**
+ * eeh_event_handler - Dispatch EEH events.
+ * @dummy - unused
+ *
+ * The detection of a frozen slot can occur inside an interrupt,
+ * where it can be hard to do anything about it. The goal of this
+ * routine is to pull these detection events out of the context
+ * of the interrupt handler, and re-dispatch them for processing
+ * at a later time in a normal context.
+ */
+static int eeh_event_handler(void * dummy)
+{
+ unsigned long flags;
+ struct eeh_event *event;
+ struct eeh_pe *pe;
+
+ while (!kthread_should_stop()) {
+ if (down_interruptible(&eeh_eventlist_sem))
+ break;
+
+ /* Fetch EEH event from the queue */
+ spin_lock_irqsave(&eeh_eventlist_lock, flags);
+ event = NULL;
+ if (!list_empty(&eeh_eventlist)) {
+ event = list_entry(eeh_eventlist.next,
+ struct eeh_event, list);
+ list_del(&event->list);
+ }
+ spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+ if (!event)
+ continue;
+
+ /* We might have event without binding PE */
+ pe = event->pe;
+ if (pe) {
+ eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+ if (pe->type & EEH_PE_PHB)
+ pr_info("EEH: Detected error on PHB#%d\n",
+ pe->phb->global_number);
+ else
+ pr_info("EEH: Detected PCI bus error on "
+ "PHB#%d-PE#%x\n",
+ pe->phb->global_number, pe->addr);
+ eeh_handle_event(pe);
+ eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+ } else {
+ eeh_handle_event(NULL);
+ }
+
+ kfree(event);
+ }
+
+ return 0;
+}
+
+/**
+ * eeh_event_init - Start kernel thread to handle EEH events
+ *
+ * This routine is called to start the kernel thread for processing
+ * EEH event.
+ */
+int eeh_event_init(void)
+{
+ struct task_struct *t;
+ int ret = 0;
+
+ /* Initialize semaphore */
+ sema_init(&eeh_eventlist_sem, 0);
+
+ t = kthread_run(eeh_event_handler, NULL, "eehd");
+ if (IS_ERR(t)) {
+ ret = PTR_ERR(t);
+ pr_err("%s: Failed to start EEH daemon (%d)\n",
+ __func__, ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * eeh_send_failure_event - Generate a PCI error event
+ * @pe: EEH PE
+ *
+ * This routine can be called within an interrupt context;
+ * the actual event will be delivered in a normal context
+ * (from a workqueue).
+ */
+int eeh_send_failure_event(struct eeh_pe *pe)
+{
+ unsigned long flags;
+ struct eeh_event *event;
+
+ event = kzalloc(sizeof(*event), GFP_ATOMIC);
+ if (!event) {
+ pr_err("EEH: out of memory, event not handled\n");
+ return -ENOMEM;
+ }
+ event->pe = pe;
+
+ /* We may or may not be called in an interrupt context */
+ spin_lock_irqsave(&eeh_eventlist_lock, flags);
+ list_add(&event->list, &eeh_eventlist);
+ spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+
+ /* For EEH deamon to knick in */
+ up(&eeh_eventlist_sem);
+
+ return 0;
+}
+
+/**
+ * eeh_remove_event - Remove EEH event from the queue
+ * @pe: Event binding to the PE
+ * @force: Event will be removed unconditionally
+ *
+ * On PowerNV platform, we might have subsequent coming events
+ * is part of the former one. For that case, those subsequent
+ * coming events are totally duplicated and unnecessary, thus
+ * they should be removed.
+ */
+void eeh_remove_event(struct eeh_pe *pe, bool force)
+{
+ unsigned long flags;
+ struct eeh_event *event, *tmp;
+
+ /*
+ * If we have NULL PE passed in, we have dead IOC
+ * or we're sure we can report all existing errors
+ * by the caller.
+ *
+ * With "force", the event with associated PE that
+ * have been isolated, the event won't be removed
+ * to avoid event lost.
+ */
+ spin_lock_irqsave(&eeh_eventlist_lock, flags);
+ list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) {
+ if (!force && event->pe &&
+ (event->pe->state & EEH_PE_ISOLATED))
+ continue;
+
+ if (!pe) {
+ list_del(&event->list);
+ kfree(event);
+ } else if (pe->type & EEH_PE_PHB) {
+ if (event->pe && event->pe->phb == pe->phb) {
+ list_del(&event->list);
+ kfree(event);
+ }
+ } else if (event->pe == pe) {
+ list_del(&event->list);
+ kfree(event);
+ }
+ }
+ spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+}
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
new file mode 100644
index 00000000000..fbd01eba447
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -0,0 +1,887 @@
+/*
+ * The file intends to implement PE based on the information from
+ * platforms. Basically, there have 3 types of PEs: PHB/Bus/Device.
+ * All the PEs should be organized as hierarchy tree. The first level
+ * of the tree will be associated to existing PHBs since the particular
+ * PE is only meaningful in one PHB domain.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+static LIST_HEAD(eeh_phb_pe);
+
+/**
+ * eeh_pe_alloc - Allocate PE
+ * @phb: PCI controller
+ * @type: PE type
+ *
+ * Allocate PE instance dynamically.
+ */
+static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type)
+{
+ struct eeh_pe *pe;
+
+ /* Allocate PHB PE */
+ pe = kzalloc(sizeof(struct eeh_pe), GFP_KERNEL);
+ if (!pe) return NULL;
+
+ /* Initialize PHB PE */
+ pe->type = type;
+ pe->phb = phb;
+ INIT_LIST_HEAD(&pe->child_list);
+ INIT_LIST_HEAD(&pe->child);
+ INIT_LIST_HEAD(&pe->edevs);
+
+ return pe;
+}
+
+/**
+ * eeh_phb_pe_create - Create PHB PE
+ * @phb: PCI controller
+ *
+ * The function should be called while the PHB is detected during
+ * system boot or PCI hotplug in order to create PHB PE.
+ */
+int eeh_phb_pe_create(struct pci_controller *phb)
+{
+ struct eeh_pe *pe;
+
+ /* Allocate PHB PE */
+ pe = eeh_pe_alloc(phb, EEH_PE_PHB);
+ if (!pe) {
+ pr_err("%s: out of memory!\n", __func__);
+ return -ENOMEM;
+ }
+
+ /* Put it into the list */
+ list_add_tail(&pe->child, &eeh_phb_pe);
+
+ pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number);
+
+ return 0;
+}
+
+/**
+ * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB
+ * @phb: PCI controller
+ *
+ * The overall PEs form hierarchy tree. The first layer of the
+ * hierarchy tree is composed of PHB PEs. The function is used
+ * to retrieve the corresponding PHB PE according to the given PHB.
+ */
+struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb)
+{
+ struct eeh_pe *pe;
+
+ list_for_each_entry(pe, &eeh_phb_pe, child) {
+ /*
+ * Actually, we needn't check the type since
+ * the PE for PHB has been determined when that
+ * was created.
+ */
+ if ((pe->type & EEH_PE_PHB) && pe->phb == phb)
+ return pe;
+ }
+
+ return NULL;
+}
+
+/**
+ * eeh_pe_next - Retrieve the next PE in the tree
+ * @pe: current PE
+ * @root: root PE
+ *
+ * The function is used to retrieve the next PE in the
+ * hierarchy PE tree.
+ */
+static struct eeh_pe *eeh_pe_next(struct eeh_pe *pe,
+ struct eeh_pe *root)
+{
+ struct list_head *next = pe->child_list.next;
+
+ if (next == &pe->child_list) {
+ while (1) {
+ if (pe == root)
+ return NULL;
+ next = pe->child.next;
+ if (next != &pe->parent->child_list)
+ break;
+ pe = pe->parent;
+ }
+ }
+
+ return list_entry(next, struct eeh_pe, child);
+}
+
+/**
+ * eeh_pe_traverse - Traverse PEs in the specified PHB
+ * @root: root PE
+ * @fn: callback
+ * @flag: extra parameter to callback
+ *
+ * The function is used to traverse the specified PE and its
+ * child PEs. The traversing is to be terminated once the
+ * callback returns something other than NULL, or no more PEs
+ * to be traversed.
+ */
+void *eeh_pe_traverse(struct eeh_pe *root,
+ eeh_traverse_func fn, void *flag)
+{
+ struct eeh_pe *pe;
+ void *ret;
+
+ for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
+ ret = fn(pe, flag);
+ if (ret) return ret;
+ }
+
+ return NULL;
+}
+
+/**
+ * eeh_pe_dev_traverse - Traverse the devices from the PE
+ * @root: EEH PE
+ * @fn: function callback
+ * @flag: extra parameter to callback
+ *
+ * The function is used to traverse the devices of the specified
+ * PE and its child PEs.
+ */
+void *eeh_pe_dev_traverse(struct eeh_pe *root,
+ eeh_traverse_func fn, void *flag)
+{
+ struct eeh_pe *pe;
+ struct eeh_dev *edev, *tmp;
+ void *ret;
+
+ if (!root) {
+ pr_warning("%s: Invalid PE %p\n", __func__, root);
+ return NULL;
+ }
+
+ /* Traverse root PE */
+ for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
+ eeh_pe_for_each_dev(pe, edev, tmp) {
+ ret = fn(edev, flag);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * __eeh_pe_get - Check the PE address
+ * @data: EEH PE
+ * @flag: EEH device
+ *
+ * For one particular PE, it can be identified by PE address
+ * or tranditional BDF address. BDF address is composed of
+ * Bus/Device/Function number. The extra data referred by flag
+ * indicates which type of address should be used.
+ */
+static void *__eeh_pe_get(void *data, void *flag)
+{
+ struct eeh_pe *pe = (struct eeh_pe *)data;
+ struct eeh_dev *edev = (struct eeh_dev *)flag;
+
+ /* Unexpected PHB PE */
+ if (pe->type & EEH_PE_PHB)
+ return NULL;
+
+ /* We prefer PE address */
+ if (edev->pe_config_addr &&
+ (edev->pe_config_addr == pe->addr))
+ return pe;
+
+ /* Try BDF address */
+ if (edev->config_addr &&
+ (edev->config_addr == pe->config_addr))
+ return pe;
+
+ return NULL;
+}
+
+/**
+ * eeh_pe_get - Search PE based on the given address
+ * @edev: EEH device
+ *
+ * Search the corresponding PE based on the specified address which
+ * is included in the eeh device. The function is used to check if
+ * the associated PE has been created against the PE address. It's
+ * notable that the PE address has 2 format: traditional PE address
+ * which is composed of PCI bus/device/function number, or unified
+ * PE address.
+ */
+struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
+{
+ struct eeh_pe *root = eeh_phb_pe_get(edev->phb);
+ struct eeh_pe *pe;
+
+ pe = eeh_pe_traverse(root, __eeh_pe_get, edev);
+
+ return pe;
+}
+
+/**
+ * eeh_pe_get_parent - Retrieve the parent PE
+ * @edev: EEH device
+ *
+ * The whole PEs existing in the system are organized as hierarchy
+ * tree. The function is used to retrieve the parent PE according
+ * to the parent EEH device.
+ */
+static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
+{
+ struct device_node *dn;
+ struct eeh_dev *parent;
+
+ /*
+ * It might have the case for the indirect parent
+ * EEH device already having associated PE, but
+ * the direct parent EEH device doesn't have yet.
+ */
+ dn = edev->dn->parent;
+ while (dn) {
+ /* We're poking out of PCI territory */
+ if (!PCI_DN(dn)) return NULL;
+
+ parent = of_node_to_eeh_dev(dn);
+ /* We're poking out of PCI territory */
+ if (!parent) return NULL;
+
+ if (parent->pe)
+ return parent->pe;
+
+ dn = dn->parent;
+ }
+
+ return NULL;
+}
+
+/**
+ * eeh_add_to_parent_pe - Add EEH device to parent PE
+ * @edev: EEH device
+ *
+ * Add EEH device to the parent PE. If the parent PE already
+ * exists, the PE type will be changed to EEH_PE_BUS. Otherwise,
+ * we have to create new PE to hold the EEH device and the new
+ * PE will be linked to its parent PE as well.
+ */
+int eeh_add_to_parent_pe(struct eeh_dev *edev)
+{
+ struct eeh_pe *pe, *parent;
+
+ /*
+ * Search the PE has been existing or not according
+ * to the PE address. If that has been existing, the
+ * PE should be composed of PCI bus and its subordinate
+ * components.
+ */
+ pe = eeh_pe_get(edev);
+ if (pe && !(pe->type & EEH_PE_INVALID)) {
+ if (!edev->pe_config_addr) {
+ pr_err("%s: PE with addr 0x%x already exists\n",
+ __func__, edev->config_addr);
+ return -EEXIST;
+ }
+
+ /* Mark the PE as type of PCI bus */
+ pe->type = EEH_PE_BUS;
+ edev->pe = pe;
+
+ /* Put the edev to PE */
+ list_add_tail(&edev->list, &pe->edevs);
+ pr_debug("EEH: Add %s to Bus PE#%x\n",
+ edev->dn->full_name, pe->addr);
+
+ return 0;
+ } else if (pe && (pe->type & EEH_PE_INVALID)) {
+ list_add_tail(&edev->list, &pe->edevs);
+ edev->pe = pe;
+ /*
+ * We're running to here because of PCI hotplug caused by
+ * EEH recovery. We need clear EEH_PE_INVALID until the top.
+ */
+ parent = pe;
+ while (parent) {
+ if (!(parent->type & EEH_PE_INVALID))
+ break;
+ parent->type &= ~(EEH_PE_INVALID | EEH_PE_KEEP);
+ parent = parent->parent;
+ }
+ pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
+ edev->dn->full_name, pe->addr, pe->parent->addr);
+
+ return 0;
+ }
+
+ /* Create a new EEH PE */
+ pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
+ if (!pe) {
+ pr_err("%s: out of memory!\n", __func__);
+ return -ENOMEM;
+ }
+ pe->addr = edev->pe_config_addr;
+ pe->config_addr = edev->config_addr;
+
+ /*
+ * While doing PE reset, we probably hot-reset the
+ * upstream bridge. However, the PCI devices including
+ * the associated EEH devices might be removed when EEH
+ * core is doing recovery. So that won't safe to retrieve
+ * the bridge through downstream EEH device. We have to
+ * trace the parent PCI bus, then the upstream bridge.
+ */
+ if (eeh_probe_mode_dev())
+ pe->bus = eeh_dev_to_pci_dev(edev)->bus;
+
+ /*
+ * Put the new EEH PE into hierarchy tree. If the parent
+ * can't be found, the newly created PE will be attached
+ * to PHB directly. Otherwise, we have to associate the
+ * PE with its parent.
+ */
+ parent = eeh_pe_get_parent(edev);
+ if (!parent) {
+ parent = eeh_phb_pe_get(edev->phb);
+ if (!parent) {
+ pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
+ __func__, edev->phb->global_number);
+ edev->pe = NULL;
+ kfree(pe);
+ return -EEXIST;
+ }
+ }
+ pe->parent = parent;
+
+ /*
+ * Put the newly created PE into the child list and
+ * link the EEH device accordingly.
+ */
+ list_add_tail(&pe->child, &parent->child_list);
+ list_add_tail(&edev->list, &pe->edevs);
+ edev->pe = pe;
+ pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
+ edev->dn->full_name, pe->addr, pe->parent->addr);
+
+ return 0;
+}
+
+/**
+ * eeh_rmv_from_parent_pe - Remove one EEH device from the associated PE
+ * @edev: EEH device
+ *
+ * The PE hierarchy tree might be changed when doing PCI hotplug.
+ * Also, the PCI devices or buses could be removed from the system
+ * during EEH recovery. So we have to call the function remove the
+ * corresponding PE accordingly if necessary.
+ */
+int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
+{
+ struct eeh_pe *pe, *parent, *child;
+ int cnt;
+
+ if (!edev->pe) {
+ pr_debug("%s: No PE found for EEH device %s\n",
+ __func__, edev->dn->full_name);
+ return -EEXIST;
+ }
+
+ /* Remove the EEH device */
+ pe = edev->pe;
+ edev->pe = NULL;
+ list_del(&edev->list);
+
+ /*
+ * Check if the parent PE includes any EEH devices.
+ * If not, we should delete that. Also, we should
+ * delete the parent PE if it doesn't have associated
+ * child PEs and EEH devices.
+ */
+ while (1) {
+ parent = pe->parent;
+ if (pe->type & EEH_PE_PHB)
+ break;
+
+ if (!(pe->state & EEH_PE_KEEP)) {
+ if (list_empty(&pe->edevs) &&
+ list_empty(&pe->child_list)) {
+ list_del(&pe->child);
+ kfree(pe);
+ } else {
+ break;
+ }
+ } else {
+ if (list_empty(&pe->edevs)) {
+ cnt = 0;
+ list_for_each_entry(child, &pe->child_list, child) {
+ if (!(child->type & EEH_PE_INVALID)) {
+ cnt++;
+ break;
+ }
+ }
+
+ if (!cnt)
+ pe->type |= EEH_PE_INVALID;
+ else
+ break;
+ }
+ }
+
+ pe = parent;
+ }
+
+ return 0;
+}
+
+/**
+ * eeh_pe_update_time_stamp - Update PE's frozen time stamp
+ * @pe: EEH PE
+ *
+ * We have time stamp for each PE to trace its time of getting
+ * frozen in last hour. The function should be called to update
+ * the time stamp on first error of the specific PE. On the other
+ * handle, we needn't account for errors happened in last hour.
+ */
+void eeh_pe_update_time_stamp(struct eeh_pe *pe)
+{
+ struct timeval tstamp;
+
+ if (!pe) return;
+
+ if (pe->freeze_count <= 0) {
+ pe->freeze_count = 0;
+ do_gettimeofday(&pe->tstamp);
+ } else {
+ do_gettimeofday(&tstamp);
+ if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) {
+ pe->tstamp = tstamp;
+ pe->freeze_count = 0;
+ }
+ }
+}
+
+/**
+ * __eeh_pe_state_mark - Mark the state for the PE
+ * @data: EEH PE
+ * @flag: state
+ *
+ * The function is used to mark the indicated state for the given
+ * PE. Also, the associated PCI devices will be put into IO frozen
+ * state as well.
+ */
+static void *__eeh_pe_state_mark(void *data, void *flag)
+{
+ struct eeh_pe *pe = (struct eeh_pe *)data;
+ int state = *((int *)flag);
+ struct eeh_dev *edev, *tmp;
+ struct pci_dev *pdev;
+
+ /* Keep the state of permanently removed PE intact */
+ if ((pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) &&
+ (state & (EEH_PE_ISOLATED | EEH_PE_RECOVERING)))
+ return NULL;
+
+ pe->state |= state;
+
+ /* Offline PCI devices if applicable */
+ if (state != EEH_PE_ISOLATED)
+ return NULL;
+
+ eeh_pe_for_each_dev(pe, edev, tmp) {
+ pdev = eeh_dev_to_pci_dev(edev);
+ if (pdev)
+ pdev->error_state = pci_channel_io_frozen;
+ }
+
+ return NULL;
+}
+
+/**
+ * eeh_pe_state_mark - Mark specified state for PE and its associated device
+ * @pe: EEH PE
+ *
+ * EEH error affects the current PE and its child PEs. The function
+ * is used to mark appropriate state for the affected PEs and the
+ * associated devices.
+ */
+void eeh_pe_state_mark(struct eeh_pe *pe, int state)
+{
+ eeh_pe_traverse(pe, __eeh_pe_state_mark, &state);
+}
+
+static void *__eeh_pe_dev_mode_mark(void *data, void *flag)
+{
+ struct eeh_dev *edev = data;
+ int mode = *((int *)flag);
+
+ edev->mode |= mode;
+
+ return NULL;
+}
+
+/**
+ * eeh_pe_dev_state_mark - Mark state for all device under the PE
+ * @pe: EEH PE
+ *
+ * Mark specific state for all child devices of the PE.
+ */
+void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode)
+{
+ eeh_pe_dev_traverse(pe, __eeh_pe_dev_mode_mark, &mode);
+}
+
+/**
+ * __eeh_pe_state_clear - Clear state for the PE
+ * @data: EEH PE
+ * @flag: state
+ *
+ * The function is used to clear the indicated state from the
+ * given PE. Besides, we also clear the check count of the PE
+ * as well.
+ */
+static void *__eeh_pe_state_clear(void *data, void *flag)
+{
+ struct eeh_pe *pe = (struct eeh_pe *)data;
+ int state = *((int *)flag);
+
+ /* Keep the state of permanently removed PE intact */
+ if ((pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) &&
+ (state & EEH_PE_ISOLATED))
+ return NULL;
+
+ pe->state &= ~state;
+
+ /* Clear check count since last isolation */
+ if (state & EEH_PE_ISOLATED)
+ pe->check_count = 0;
+
+ return NULL;
+}
+
+/**
+ * eeh_pe_state_clear - Clear state for the PE and its children
+ * @pe: PE
+ * @state: state to be cleared
+ *
+ * When the PE and its children has been recovered from error,
+ * we need clear the error state for that. The function is used
+ * for the purpose.
+ */
+void eeh_pe_state_clear(struct eeh_pe *pe, int state)
+{
+ eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
+}
+
+/*
+ * Some PCI bridges (e.g. PLX bridges) have primary/secondary
+ * buses assigned explicitly by firmware, and we probably have
+ * lost that after reset. So we have to delay the check until
+ * the PCI-CFG registers have been restored for the parent
+ * bridge.
+ *
+ * Don't use normal PCI-CFG accessors, which probably has been
+ * blocked on normal path during the stage. So we need utilize
+ * eeh operations, which is always permitted.
+ */
+static void eeh_bridge_check_link(struct eeh_dev *edev,
+ struct device_node *dn)
+{
+ int cap;
+ uint32_t val;
+ int timeout = 0;
+
+ /*
+ * We only check root port and downstream ports of
+ * PCIe switches
+ */
+ if (!(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT)))
+ return;
+
+ pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n",
+ __func__, edev->phb->global_number,
+ edev->config_addr >> 8,
+ PCI_SLOT(edev->config_addr & 0xFF),
+ PCI_FUNC(edev->config_addr & 0xFF));
+
+ /* Check slot status */
+ cap = edev->pcie_cap;
+ eeh_ops->read_config(dn, cap + PCI_EXP_SLTSTA, 2, &val);
+ if (!(val & PCI_EXP_SLTSTA_PDS)) {
+ pr_debug(" No card in the slot (0x%04x) !\n", val);
+ return;
+ }
+
+ /* Check power status if we have the capability */
+ eeh_ops->read_config(dn, cap + PCI_EXP_SLTCAP, 2, &val);
+ if (val & PCI_EXP_SLTCAP_PCP) {
+ eeh_ops->read_config(dn, cap + PCI_EXP_SLTCTL, 2, &val);
+ if (val & PCI_EXP_SLTCTL_PCC) {
+ pr_debug(" In power-off state, power it on ...\n");
+ val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC);
+ val |= (0x0100 & PCI_EXP_SLTCTL_PIC);
+ eeh_ops->write_config(dn, cap + PCI_EXP_SLTCTL, 2, val);
+ msleep(2 * 1000);
+ }
+ }
+
+ /* Enable link */
+ eeh_ops->read_config(dn, cap + PCI_EXP_LNKCTL, 2, &val);
+ val &= ~PCI_EXP_LNKCTL_LD;
+ eeh_ops->write_config(dn, cap + PCI_EXP_LNKCTL, 2, val);
+
+ /* Check link */
+ eeh_ops->read_config(dn, cap + PCI_EXP_LNKCAP, 4, &val);
+ if (!(val & PCI_EXP_LNKCAP_DLLLARC)) {
+ pr_debug(" No link reporting capability (0x%08x) \n", val);
+ msleep(1000);
+ return;
+ }
+
+ /* Wait the link is up until timeout (5s) */
+ timeout = 0;
+ while (timeout < 5000) {
+ msleep(20);
+ timeout += 20;
+
+ eeh_ops->read_config(dn, cap + PCI_EXP_LNKSTA, 2, &val);
+ if (val & PCI_EXP_LNKSTA_DLLLA)
+ break;
+ }
+
+ if (val & PCI_EXP_LNKSTA_DLLLA)
+ pr_debug(" Link up (%s)\n",
+ (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB");
+ else
+ pr_debug(" Link not ready (0x%04x)\n", val);
+}
+
+#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
+#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
+
+static void eeh_restore_bridge_bars(struct eeh_dev *edev,
+ struct device_node *dn)
+{
+ int i;
+
+ /*
+ * Device BARs: 0x10 - 0x18
+ * Bus numbers and windows: 0x18 - 0x30
+ */
+ for (i = 4; i < 13; i++)
+ eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
+ /* Rom: 0x38 */
+ eeh_ops->write_config(dn, 14*4, 4, edev->config_space[14]);
+
+ /* Cache line & Latency timer: 0xC 0xD */
+ eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
+ SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+ eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
+ SAVED_BYTE(PCI_LATENCY_TIMER));
+ /* Max latency, min grant, interrupt ping and line: 0x3C */
+ eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]);
+
+ /* PCI Command: 0x4 */
+ eeh_ops->write_config(dn, PCI_COMMAND, 4, edev->config_space[1]);
+
+ /* Check the PCIe link is ready */
+ eeh_bridge_check_link(edev, dn);
+}
+
+static void eeh_restore_device_bars(struct eeh_dev *edev,
+ struct device_node *dn)
+{
+ int i;
+ u32 cmd;
+
+ for (i = 4; i < 10; i++)
+ eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
+ /* 12 == Expansion ROM Address */
+ eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]);
+
+ eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
+ SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+ eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
+ SAVED_BYTE(PCI_LATENCY_TIMER));
+
+ /* max latency, min grant, interrupt pin and line */
+ eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]);
+
+ /*
+ * Restore PERR & SERR bits, some devices require it,
+ * don't touch the other command bits
+ */
+ eeh_ops->read_config(dn, PCI_COMMAND, 4, &cmd);
+ if (edev->config_space[1] & PCI_COMMAND_PARITY)
+ cmd |= PCI_COMMAND_PARITY;
+ else
+ cmd &= ~PCI_COMMAND_PARITY;
+ if (edev->config_space[1] & PCI_COMMAND_SERR)
+ cmd |= PCI_COMMAND_SERR;
+ else
+ cmd &= ~PCI_COMMAND_SERR;
+ eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd);
+}
+
+/**
+ * eeh_restore_one_device_bars - Restore the Base Address Registers for one device
+ * @data: EEH device
+ * @flag: Unused
+ *
+ * Loads the PCI configuration space base address registers,
+ * the expansion ROM base address, the latency timer, and etc.
+ * from the saved values in the device node.
+ */
+static void *eeh_restore_one_device_bars(void *data, void *flag)
+{
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+ struct device_node *dn = eeh_dev_to_of_node(edev);
+
+ /* Do special restore for bridges */
+ if (edev->mode & EEH_DEV_BRIDGE)
+ eeh_restore_bridge_bars(edev, dn);
+ else
+ eeh_restore_device_bars(edev, dn);
+
+ if (eeh_ops->restore_config)
+ eeh_ops->restore_config(dn);
+
+ return NULL;
+}
+
+/**
+ * eeh_pe_restore_bars - Restore the PCI config space info
+ * @pe: EEH PE
+ *
+ * This routine performs a recursive walk to the children
+ * of this device as well.
+ */
+void eeh_pe_restore_bars(struct eeh_pe *pe)
+{
+ /*
+ * We needn't take the EEH lock since eeh_pe_dev_traverse()
+ * will take that.
+ */
+ eeh_pe_dev_traverse(pe, eeh_restore_one_device_bars, NULL);
+}
+
+/**
+ * eeh_pe_loc_get - Retrieve location code binding to the given PE
+ * @pe: EEH PE
+ *
+ * Retrieve the location code of the given PE. If the primary PE bus
+ * is root bus, we will grab location code from PHB device tree node
+ * or root port. Otherwise, the upstream bridge's device tree node
+ * of the primary PE bus will be checked for the location code.
+ */
+const char *eeh_pe_loc_get(struct eeh_pe *pe)
+{
+ struct pci_controller *hose;
+ struct pci_bus *bus = eeh_pe_bus_get(pe);
+ struct pci_dev *pdev;
+ struct device_node *dn;
+ const char *loc;
+
+ if (!bus)
+ return "N/A";
+
+ /* PHB PE or root PE ? */
+ if (pci_is_root_bus(bus)) {
+ hose = pci_bus_to_host(bus);
+ loc = of_get_property(hose->dn,
+ "ibm,loc-code", NULL);
+ if (loc)
+ return loc;
+ loc = of_get_property(hose->dn,
+ "ibm,io-base-loc-code", NULL);
+ if (loc)
+ return loc;
+
+ pdev = pci_get_slot(bus, 0x0);
+ } else {
+ pdev = bus->self;
+ }
+
+ if (!pdev) {
+ loc = "N/A";
+ goto out;
+ }
+
+ dn = pci_device_to_OF_node(pdev);
+ if (!dn) {
+ loc = "N/A";
+ goto out;
+ }
+
+ loc = of_get_property(dn, "ibm,loc-code", NULL);
+ if (!loc)
+ loc = of_get_property(dn, "ibm,slot-location-code", NULL);
+ if (!loc)
+ loc = "N/A";
+
+out:
+ if (pci_is_root_bus(bus) && pdev)
+ pci_dev_put(pdev);
+ return loc;
+}
+
+/**
+ * eeh_pe_bus_get - Retrieve PCI bus according to the given PE
+ * @pe: EEH PE
+ *
+ * Retrieve the PCI bus according to the given PE. Basically,
+ * there're 3 types of PEs: PHB/Bus/Device. For PHB PE, the
+ * primary PCI bus will be retrieved. The parent bus will be
+ * returned for BUS PE. However, we don't have associated PCI
+ * bus for DEVICE PE.
+ */
+struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
+{
+ struct pci_bus *bus = NULL;
+ struct eeh_dev *edev;
+ struct pci_dev *pdev;
+
+ if (pe->type & EEH_PE_PHB) {
+ bus = pe->phb->bus;
+ } else if (pe->type & EEH_PE_BUS ||
+ pe->type & EEH_PE_DEVICE) {
+ if (pe->bus) {
+ bus = pe->bus;
+ goto out;
+ }
+
+ edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
+ pdev = eeh_dev_to_pci_dev(edev);
+ if (pdev)
+ bus = pdev->bus;
+ }
+
+out:
+ return bus;
+}
diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c
new file mode 100644
index 00000000000..e2595ba4b72
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_sysfs.c
@@ -0,0 +1,98 @@
+/*
+ * Sysfs entries for PCI Error Recovery for PAPR-compliant platform.
+ * Copyright IBM Corporation 2007
+ * Copyright Linas Vepstas <linas@austin.ibm.com> 2007
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+#include <linux/pci.h>
+#include <linux/stat.h>
+#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
+
+/**
+ * EEH_SHOW_ATTR -- Create sysfs entry for eeh statistic
+ * @_name: name of file in sysfs directory
+ * @_memb: name of member in struct pci_dn to access
+ * @_format: printf format for display
+ *
+ * All of the attributes look very similar, so just
+ * auto-gen a cut-n-paste routine to display them.
+ */
+#define EEH_SHOW_ATTR(_name,_memb,_format) \
+static ssize_t eeh_show_##_name(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ struct pci_dev *pdev = to_pci_dev(dev); \
+ struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); \
+ \
+ if (!edev) \
+ return 0; \
+ \
+ return sprintf(buf, _format "\n", edev->_memb); \
+} \
+static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL);
+
+EEH_SHOW_ATTR(eeh_mode, mode, "0x%x");
+EEH_SHOW_ATTR(eeh_config_addr, config_addr, "0x%x");
+EEH_SHOW_ATTR(eeh_pe_config_addr, pe_config_addr, "0x%x");
+
+void eeh_sysfs_add_device(struct pci_dev *pdev)
+{
+ struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+ int rc=0;
+
+ if (!eeh_enabled())
+ return;
+
+ if (edev && (edev->mode & EEH_DEV_SYSFS))
+ return;
+
+ rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode);
+ rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr);
+ rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
+
+ if (rc)
+ printk(KERN_WARNING "EEH: Unable to create sysfs entries\n");
+ else if (edev)
+ edev->mode |= EEH_DEV_SYSFS;
+}
+
+void eeh_sysfs_remove_device(struct pci_dev *pdev)
+{
+ struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+
+ /*
+ * The parent directory might have been removed. We needn't
+ * continue for that case.
+ */
+ if (!pdev->dev.kobj.sd) {
+ if (edev)
+ edev->mode &= ~EEH_DEV_SYSFS;
+ return;
+ }
+
+ device_remove_file(&pdev->dev, &dev_attr_eeh_mode);
+ device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr);
+ device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
+
+ if (edev)
+ edev->mode &= ~EEH_DEV_SYSFS;
+}
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index ed4aeb96398..22b45a4955c 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -31,6 +31,7 @@
#include <asm/asm-offsets.h>
#include <asm/unistd.h>
#include <asm/ftrace.h>
+#include <asm/ptrace.h>
#undef SHOW_SYSCALLS
#undef SHOW_SYSCALLS_TASK
@@ -88,6 +89,10 @@ crit_transfer_to_handler:
mfspr r0,SPRN_SRR1
stw r0,_SRR1(r11)
+ /* set the stack limit to the current stack
+ * and set the limit to protect the thread_info
+ * struct
+ */
mfspr r8,SPRN_SPRG_THREAD
lwz r0,KSP_LIMIT(r8)
stw r0,SAVED_KSP_LIMIT(r11)
@@ -108,6 +113,10 @@ crit_transfer_to_handler:
mfspr r0,SPRN_SRR1
stw r0,crit_srr1@l(0)
+ /* set the stack limit to the current stack
+ * and set the limit to protect the thread_info
+ * struct
+ */
mfspr r8,SPRN_SPRG_THREAD
lwz r0,KSP_LIMIT(r8)
stw r0,saved_ksp_limit@l(0)
@@ -157,7 +166,7 @@ transfer_to_handler:
tophys(r11,r11)
addi r11,r11,global_dbcr0@l
#ifdef CONFIG_SMP
- rlwinm r9,r1,0,0,(31-THREAD_SHIFT)
+ CURRENT_THREAD_INFO(r9, r1)
lwz r9,TI_CPU(r9)
slwi r9,r9,3
add r11,r11,r9
@@ -178,7 +187,7 @@ transfer_to_handler:
ble- stack_ovf /* then the kernel stack overflowed */
5:
#if defined(CONFIG_6xx) || defined(CONFIG_E500)
- rlwinm r9,r1,0,0,31-THREAD_SHIFT
+ CURRENT_THREAD_INFO(r9, r1)
tophys(r9,r9) /* check local flags */
lwz r12,TI_LOCAL_FLAGS(r9)
mtcrf 0x01,r12
@@ -205,25 +214,37 @@ reenable_mmu: /* re-enable mmu so we can */
andi. r10,r10,MSR_EE /* Did EE change? */
beq 1f
- /* Save handler and return address into the 2 unused words
- * of the STACK_FRAME_OVERHEAD (sneak sneak sneak). Everything
- * else can be recovered from the pt_regs except r3 which for
- * normal interrupts has been set to pt_regs and for syscalls
- * is an argument, so we temporarily use ORIG_GPR3 to save it
+ /*
+ * The trace_hardirqs_off will use CALLER_ADDR0 and CALLER_ADDR1.
+ * If from user mode there is only one stack frame on the stack, and
+ * accessing CALLER_ADDR1 will cause oops. So we need create a dummy
+ * stack frame to make trace_hardirqs_off happy.
+ *
+ * This is handy because we also need to save a bunch of GPRs,
+ * r3 can be different from GPR3(r1) at this point, r9 and r11
+ * contains the old MSR and handler address respectively,
+ * r4 & r5 can contain page fault arguments that need to be passed
+ * along as well. r12, CCR, CTR, XER etc... are left clobbered as
+ * they aren't useful past this point (aren't syscall arguments),
+ * the rest is restored from the exception frame.
*/
+ stwu r1,-32(r1)
stw r9,8(r1)
stw r11,12(r1)
- stw r3,ORIG_GPR3(r1)
+ stw r3,16(r1)
+ stw r4,20(r1)
+ stw r5,24(r1)
bl trace_hardirqs_off
+ lwz r5,24(r1)
+ lwz r4,20(r1)
+ lwz r3,16(r1)
+ lwz r11,12(r1)
+ lwz r9,8(r1)
+ addi r1,r1,32
lwz r0,GPR0(r1)
- lwz r3,ORIG_GPR3(r1)
- lwz r4,GPR4(r1)
- lwz r5,GPR5(r1)
lwz r6,GPR6(r1)
lwz r7,GPR7(r1)
lwz r8,GPR8(r1)
- lwz r9,8(r1)
- lwz r11,12(r1)
1: mtctr r11
mtlr r9
bctr /* jump to handler */
@@ -314,7 +335,7 @@ _GLOBAL(DoSyscall)
mtmsr r11
1:
#endif /* CONFIG_TRACE_IRQFLAGS */
- rlwinm r10,r1,0,0,(31-THREAD_SHIFT) /* current_thread_info() */
+ CURRENT_THREAD_INFO(r10, r1)
lwz r11,TI_FLAGS(r10)
andi. r11,r11,_TIF_SYSCALL_T_OR_A
bne- syscall_dotrace
@@ -335,7 +356,7 @@ ret_from_syscall:
bl do_show_syscall_exit
#endif
mr r6,r3
- rlwinm r12,r1,0,0,(31-THREAD_SHIFT) /* current_thread_info() */
+ CURRENT_THREAD_INFO(r12, r1)
/* disable interrupts so current_thread_info()->flags can't change */
LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* doesn't include MSR_EE */
/* Note: We don't bother telling lockdep about it */
@@ -414,6 +435,17 @@ ret_from_fork:
li r3,0
b ret_from_syscall
+ .globl ret_from_kernel_thread
+ret_from_kernel_thread:
+ REST_NVGPRS(r1)
+ bl schedule_tail
+ mtlr r14
+ mr r3,r15
+ PPC440EP_ERR42
+ blrl
+ li r3,0
+ b ret_from_syscall
+
/* Traced system call support */
syscall_dotrace:
SAVE_NVGPRS(r1)
@@ -796,7 +828,7 @@ ret_from_except:
user_exc_return: /* r10 contains MSR_KERNEL here */
/* Check current_thread_info()->flags */
- rlwinm r9,r1,0,0,(31-THREAD_SHIFT)
+ CURRENT_THREAD_INFO(r9, r1)
lwz r9,TI_FLAGS(r9)
andi. r0,r9,_TIF_USER_WORK_MASK
bne do_work
@@ -810,19 +842,56 @@ restore_user:
bnel- load_dbcr0
#endif
-#ifdef CONFIG_PREEMPT
b restore
/* N.B. the only way to get here is from the beq following ret_from_except. */
resume_kernel:
+ /* check current_thread_info, _TIF_EMULATE_STACK_STORE */
+ CURRENT_THREAD_INFO(r9, r1)
+ lwz r8,TI_FLAGS(r9)
+ andis. r0,r8,_TIF_EMULATE_STACK_STORE@h
+ beq+ 1f
+
+ addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */
+
+ lwz r3,GPR1(r1)
+ subi r3,r3,INT_FRAME_SIZE /* dst: Allocate a trampoline exception frame */
+ mr r4,r1 /* src: current exception frame */
+ mr r1,r3 /* Reroute the trampoline frame to r1 */
+
+ /* Copy from the original to the trampoline. */
+ li r5,INT_FRAME_SIZE/4 /* size: INT_FRAME_SIZE */
+ li r6,0 /* start offset: 0 */
+ mtctr r5
+2: lwzx r0,r6,r4
+ stwx r0,r6,r3
+ addi r6,r6,4
+ bdnz 2b
+
+ /* Do real store operation to complete stwu */
+ lwz r5,GPR1(r1)
+ stw r8,0(r5)
+
+ /* Clear _TIF_EMULATE_STACK_STORE flag */
+ lis r11,_TIF_EMULATE_STACK_STORE@h
+ addi r5,r9,TI_FLAGS
+0: lwarx r8,0,r5
+ andc r8,r8,r11
+#ifdef CONFIG_IBM405_ERR77
+ dcbt 0,r5
+#endif
+ stwcx. r8,0,r5
+ bne- 0b
+1:
+
+#ifdef CONFIG_PREEMPT
/* check current_thread_info->preempt_count */
- rlwinm r9,r1,0,0,(31-THREAD_SHIFT)
lwz r0,TI_PREEMPT(r9)
cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
bne restore
- lwz r0,TI_FLAGS(r9)
- andi. r0,r0,_TIF_NEED_RESCHED
+ andi. r8,r8,_TIF_NEED_RESCHED
beq+ restore
+ lwz r3,_MSR(r1)
andi. r0,r3,MSR_EE /* interrupts off? */
beq restore /* don't schedule if so */
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -833,7 +902,7 @@ resume_kernel:
bl trace_hardirqs_off
#endif
1: bl preempt_schedule_irq
- rlwinm r9,r1,0,0,(31-THREAD_SHIFT)
+ CURRENT_THREAD_INFO(r9, r1)
lwz r3,TI_FLAGS(r9)
andi. r0,r3,_TIF_NEED_RESCHED
bne- 1b
@@ -843,8 +912,6 @@ resume_kernel:
*/
bl trace_hardirqs_on
#endif
-#else
-resume_kernel:
#endif /* CONFIG_PREEMPT */
/* interrupts are hard-disabled at this point */
@@ -879,7 +946,18 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
*/
andi. r10,r9,MSR_EE
beq 1f
+ /*
+ * Since the ftrace irqsoff latency trace checks CALLER_ADDR1,
+ * which is the stack frame here, we need to force a stack frame
+ * in case we came from user space.
+ */
+ stwu r1,-32(r1)
+ mflr r0
+ stw r0,4(r1)
+ stwu r1,-32(r1)
bl trace_hardirqs_on
+ lwz r1,0(r1)
+ lwz r1,0(r1)
lwz r9,_MSR(r1)
1:
#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -1092,7 +1170,7 @@ ret_from_debug_exc:
lwz r10,SAVED_KSP_LIMIT(r1)
stw r10,KSP_LIMIT(r9)
lwz r9,THREAD_INFO-THREAD(r9)
- rlwinm r10,r1,0,0,(31-THREAD_SHIFT)
+ CURRENT_THREAD_INFO(r10, r1)
lwz r10,TI_PREEMPT(r10)
stw r10,TI_PREEMPT(r9)
RESTORE_xSRR(SRR0,SRR1);
@@ -1126,7 +1204,7 @@ load_dbcr0:
lis r11,global_dbcr0@ha
addi r11,r11,global_dbcr0@l
#ifdef CONFIG_SMP
- rlwinm r9,r1,0,0,(31-THREAD_SHIFT)
+ CURRENT_THREAD_INFO(r9, r1)
lwz r9,TI_CPU(r9)
slwi r9,r9,3
add r11,r11,r9
@@ -1167,7 +1245,7 @@ recheck:
LOAD_MSR_KERNEL(r10,MSR_KERNEL)
SYNC
MTMSRD(r10) /* disable interrupts */
- rlwinm r9,r1,0,0,(31-THREAD_SHIFT)
+ CURRENT_THREAD_INFO(r9, r1)
lwz r9,TI_FLAGS(r9)
andi. r0,r9,_TIF_NEED_RESCHED
bne- do_resched
@@ -1186,7 +1264,7 @@ do_user_signal: /* r10 contains MSR_KERNEL here */
stw r3,_TRAP(r1)
2: addi r3,r1,STACK_FRAME_OVERHEAD
mr r4,r9
- bl do_signal
+ bl do_notify_resume
REST_NVGPRS(r1)
b recheck
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index d82878c4daa..6528c5e2cc4 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -32,13 +32,15 @@
#include <asm/ptrace.h>
#include <asm/irqflags.h>
#include <asm/ftrace.h>
+#include <asm/hw_irq.h>
+#include <asm/context_tracking.h>
/*
* System calls.
*/
.section ".toc","aw"
-.SYS_CALL_TABLE:
- .tc .sys_call_table[TC],.sys_call_table
+SYS_CALL_TABLE:
+ .tc sys_call_table[TC],sys_call_table
/* This value is used to mark exception frames on the stack. */
exception_marker:
@@ -61,16 +63,11 @@ system_call_common:
std r12,_MSR(r1)
std r0,GPR0(r1)
std r10,GPR1(r1)
+ beq 2f /* if from kernel mode */
ACCOUNT_CPU_USER_ENTRY(r10, r11)
- /*
- * This "crclr so" clears CR0.SO, which is the error indication on
- * return from this system call. There must be no cmp instruction
- * between it and the "mfcr r9" below, otherwise if XER.SO is set,
- * CR0.SO will get set, causing all system calls to appear to fail.
- */
- crclr so
- std r2,GPR2(r1)
+2: std r2,GPR2(r1)
std r3,GPR3(r1)
+ mfcr r2
std r4,GPR4(r1)
std r5,GPR5(r1)
std r6,GPR6(r1)
@@ -81,85 +78,82 @@ system_call_common:
std r11,GPR10(r1)
std r11,GPR11(r1)
std r11,GPR12(r1)
+ std r11,_XER(r1)
+ std r11,_CTR(r1)
std r9,GPR13(r1)
- mfcr r9
mflr r10
+ /*
+ * This clears CR0.SO (bit 28), which is the error indication on
+ * return from this system call.
+ */
+ rldimi r2,r11,28,(63-28)
li r11,0xc01
- std r9,_CCR(r1)
std r10,_LINK(r1)
std r11,_TRAP(r1)
- mfxer r9
- mfctr r10
- std r9,_XER(r1)
- std r10,_CTR(r1)
std r3,ORIG_GPR3(r1)
+ std r2,_CCR(r1)
ld r2,PACATOC(r13)
addi r9,r1,STACK_FRAME_OVERHEAD
ld r11,exception_marker@toc(r2)
std r11,-16(r9) /* "regshere" marker */
-#if defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR)
+#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR)
BEGIN_FW_FTR_SECTION
beq 33f
/* if from user, see if there are any DTL entries to process */
ld r10,PACALPPACAPTR(r13) /* get ptr to VPA */
ld r11,PACA_DTL_RIDX(r13) /* get log read index */
- ld r10,LPPACA_DTLIDX(r10) /* get log write index */
+ addi r10,r10,LPPACA_DTLIDX
+ LDX_BE r10,0,r10 /* get log write index */
cmpd cr1,r11,r10
beq+ cr1,33f
- bl .accumulate_stolen_time
+ bl accumulate_stolen_time
REST_GPR(0,r1)
REST_4GPRS(3,r1)
REST_2GPRS(7,r1)
addi r9,r1,STACK_FRAME_OVERHEAD
33:
END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING && CONFIG_PPC_SPLPAR */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE && CONFIG_PPC_SPLPAR */
-#ifdef CONFIG_TRACE_IRQFLAGS
- bl .trace_hardirqs_on
- REST_GPR(0,r1)
- REST_4GPRS(3,r1)
- REST_2GPRS(7,r1)
- addi r9,r1,STACK_FRAME_OVERHEAD
- ld r12,_MSR(r1)
-#endif /* CONFIG_TRACE_IRQFLAGS */
- li r10,1
- stb r10,PACASOFTIRQEN(r13)
- stb r10,PACAHARDIRQEN(r13)
- std r10,SOFTE(r1)
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
- /* Hack for handling interrupts when soft-enabling on iSeries */
- cmpdi cr1,r0,0x5555 /* syscall 0x5555 */
- andi. r10,r12,MSR_PR /* from kernel */
- crand 4*cr0+eq,4*cr1+eq,4*cr0+eq
- bne 2f
- b hardware_interrupt_entry
-2:
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif /* CONFIG_PPC_ISERIES */
+ /*
+ * A syscall should always be called with interrupts enabled
+ * so we just unconditionally hard-enable here. When some kind
+ * of irq tracing is used, we additionally check that condition
+ * is correct
+ */
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_BUG)
+ lbz r10,PACASOFTIRQEN(r13)
+ xori r10,r10,1
+1: tdnei r10,0
+ EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING
+#endif
- /* Hard enable interrupts */
#ifdef CONFIG_PPC_BOOK3E
wrteei 1
#else
- mfmsr r11
+ ld r11,PACAKMSR(r13)
ori r11,r11,MSR_EE
mtmsrd r11,1
#endif /* CONFIG_PPC_BOOK3E */
+ /* We do need to set SOFTE in the stack frame or the return
+ * from interrupt will be painful
+ */
+ li r10,1
+ std r10,SOFTE(r1)
+
#ifdef SHOW_SYSCALLS
- bl .do_show_syscall
+ bl do_show_syscall
REST_GPR(0,r1)
REST_4GPRS(3,r1)
REST_2GPRS(7,r1)
addi r9,r1,STACK_FRAME_OVERHEAD
#endif
- clrrdi r11,r1,THREAD_SHIFT
+ CURRENT_THREAD_INFO(r11, r1)
ld r10,TI_FLAGS(r11)
andi. r11,r10,_TIF_SYSCALL_T_OR_A
- bne- syscall_dotrace
-syscall_dotrace_cont:
+ bne syscall_dotrace
+.Lsyscall_dotrace_cont:
cmpldi 0,r0,NR_syscalls
bge- syscall_enosys
@@ -168,7 +162,7 @@ system_call: /* label this so stack traces look sane */
* Need to vector to 32 Bit or default sys_call_table here,
* based on caller's run-mode / personality.
*/
- ld r11,.SYS_CALL_TABLE@toc(2)
+ ld r11,SYS_CALL_TABLE@toc(2)
andi. r10,r10,_TIF_32BIT
beq 15f
addi r11,r11,8 /* use 32-bit syscall entries */
@@ -180,17 +174,17 @@ system_call: /* label this so stack traces look sane */
clrldi r8,r8,32
15:
slwi r0,r0,4
- ldx r10,r11,r0 /* Fetch system call handler [ptr] */
- mtctr r10
+ ldx r12,r11,r0 /* Fetch system call handler [ptr] */
+ mtctr r12
bctrl /* Call handler */
syscall_exit:
std r3,RESULT(r1)
#ifdef SHOW_SYSCALLS
- bl .do_show_syscall_exit
+ bl do_show_syscall_exit
ld r3,RESULT(r1)
#endif
- clrrdi r12,r1,THREAD_SHIFT
+ CURRENT_THREAD_INFO(r12, r1)
ld r8,_MSR(r1)
#ifdef CONFIG_PPC_BOOK3S
@@ -198,17 +192,24 @@ syscall_exit:
andi. r10,r8,MSR_RI
beq- unrecov_restore
#endif
-
- /* Disable interrupts so current_thread_info()->flags can't change,
+ /*
+ * Disable interrupts so current_thread_info()->flags can't change,
* and so that we don't get interrupted after loading SRR0/1.
*/
#ifdef CONFIG_PPC_BOOK3E
wrteei 0
#else
- mfmsr r10
- rldicl r10,r10,48,1
- rotldi r10,r10,16
- mtmsrd r10,1
+ ld r10,PACAKMSR(r13)
+ /*
+ * For performance reasons we clear RI the same time that we
+ * clear EE. We only need to clear RI just before we restore r13
+ * below, but batching it with EE saves us one expensive mtmsrd call.
+ * We have to be careful to restore RI if we branch anywhere from
+ * here (eg syscall_exit_work).
+ */
+ li r9,MSR_RI
+ andc r11,r10,r9
+ mtmsrd r11,1
#endif /* CONFIG_PPC_BOOK3E */
ld r9,TI_FLAGS(r12)
@@ -218,27 +219,17 @@ syscall_exit:
cmpld r3,r11
ld r5,_CCR(r1)
bge- syscall_error
-syscall_error_cont:
+.Lsyscall_error_cont:
ld r7,_NIP(r1)
BEGIN_FTR_SECTION
stdcx. r0,0,r1 /* to clear the reservation */
END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
andi. r6,r8,MSR_PR
ld r4,_LINK(r1)
- /*
- * Clear RI before restoring r13. If we are returning to
- * userspace and we take an exception after restoring r13,
- * we end up corrupting the userspace r13 value.
- */
-#ifdef CONFIG_PPC_BOOK3S
- /* No MSR:RI on BookE */
- li r12,MSR_RI
- andc r11,r10,r12
- mtmsrd r11,1 /* clear MSR.RI */
-#endif /* CONFIG_PPC_BOOK3S */
beq- 1f
ACCOUNT_CPU_USER_EXIT(r11, r12)
+ HMT_MEDIUM_LOW_HAS_PPR
ld r13,GPR13(r1) /* only restore r13 if returning to usermode */
1: ld r2,GPR2(r1)
ld r1,GPR1(r1)
@@ -253,13 +244,13 @@ syscall_error:
oris r5,r5,0x1000 /* Set SO bit in CR */
neg r3,r3
std r5,_CCR(r1)
- b syscall_error_cont
+ b .Lsyscall_error_cont
/* Traced system call support */
syscall_dotrace:
- bl .save_nvgprs
+ bl save_nvgprs
addi r3,r1,STACK_FRAME_OVERHEAD
- bl .do_syscall_trace_enter
+ bl do_syscall_trace_enter
/*
* Restore argument registers possibly just changed.
* We use the return value of do_syscall_trace_enter
@@ -273,15 +264,18 @@ syscall_dotrace:
ld r7,GPR7(r1)
ld r8,GPR8(r1)
addi r9,r1,STACK_FRAME_OVERHEAD
- clrrdi r10,r1,THREAD_SHIFT
+ CURRENT_THREAD_INFO(r10, r1)
ld r10,TI_FLAGS(r10)
- b syscall_dotrace_cont
+ b .Lsyscall_dotrace_cont
syscall_enosys:
li r3,-ENOSYS
b syscall_exit
syscall_exit_work:
+#ifdef CONFIG_PPC_BOOK3S
+ mtmsrd r10,1 /* Restore RI */
+#endif
/* If TIF_RESTOREALL is set, don't scribble on either r3 or ccr.
If TIF_NOERROR is set, just save r3 as it is. */
@@ -312,22 +306,23 @@ syscall_exit_work:
subi r12,r12,TI_FLAGS
4: /* Anything else left to do? */
+ SET_DEFAULT_THREAD_PPR(r3, r10) /* Set thread.ppr = 3 */
andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SINGLESTEP)
- beq .ret_from_except_lite
+ beq ret_from_except_lite
/* Re-enable interrupts */
#ifdef CONFIG_PPC_BOOK3E
wrteei 1
#else
- mfmsr r10
+ ld r10,PACAKMSR(r13)
ori r10,r10,MSR_EE
mtmsrd r10,1
#endif /* CONFIG_PPC_BOOK3E */
- bl .save_nvgprs
+ bl save_nvgprs
addi r3,r1,STACK_FRAME_OVERHEAD
- bl .do_syscall_trace_leave
- b .ret_from_except
+ bl do_syscall_trace_leave
+ b ret_from_except
/* Save non-volatile GPRs, if not already saved. */
_GLOBAL(save_nvgprs)
@@ -350,36 +345,48 @@ _GLOBAL(save_nvgprs)
*/
_GLOBAL(ppc_fork)
- bl .save_nvgprs
- bl .sys_fork
+ bl save_nvgprs
+ bl sys_fork
b syscall_exit
_GLOBAL(ppc_vfork)
- bl .save_nvgprs
- bl .sys_vfork
+ bl save_nvgprs
+ bl sys_vfork
b syscall_exit
_GLOBAL(ppc_clone)
- bl .save_nvgprs
- bl .sys_clone
+ bl save_nvgprs
+ bl sys_clone
b syscall_exit
_GLOBAL(ppc32_swapcontext)
- bl .save_nvgprs
- bl .compat_sys_swapcontext
+ bl save_nvgprs
+ bl compat_sys_swapcontext
b syscall_exit
_GLOBAL(ppc64_swapcontext)
- bl .save_nvgprs
- bl .sys_swapcontext
+ bl save_nvgprs
+ bl sys_swapcontext
b syscall_exit
_GLOBAL(ret_from_fork)
- bl .schedule_tail
+ bl schedule_tail
REST_NVGPRS(r1)
li r3,0
b syscall_exit
+_GLOBAL(ret_from_kernel_thread)
+ bl schedule_tail
+ REST_NVGPRS(r1)
+ mtlr r14
+ mr r3,r15
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+ mr r12,r14
+#endif
+ blrl
+ li r3,0
+ b syscall_exit
+
/*
* This routine switches between two different tasks. The process
* state of one is saved on its kernel stack. Then the state
@@ -431,6 +438,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
std r23,_CCR(r1)
std r1,KSP(r3) /* Set old stack pointer */
+#ifdef CONFIG_PPC_BOOK3S_64
+BEGIN_FTR_SECTION
+ /* Event based branch registers */
+ mfspr r0, SPRN_BESCR
+ std r0, THREAD_BESCR(r3)
+ mfspr r0, SPRN_EBBHR
+ std r0, THREAD_EBBHR(r3)
+ mfspr r0, SPRN_EBBRR
+ std r0, THREAD_EBBRR(r3)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+#endif
+
#ifdef CONFIG_SMP
/* We need a sync somewhere here to make sure that if the
* previous task gets rescheduled on another CPU, it sees all
@@ -450,6 +469,13 @@ BEGIN_FTR_SECTION
ldarx r6,0,r1
END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS)
+#ifdef CONFIG_PPC_BOOK3S
+/* Cancel all explict user streams as they will have no use after context
+ * switch and will stop the HW from creating streams itself
+ */
+ DCBT_STOP_ALL_STREAM_IDS(r6)
+#endif
+
addi r6,r4,-THREAD /* Convert THREAD to 'current' */
std r6,PACACURRENT(r13) /* Set new 'current' */
@@ -462,10 +488,10 @@ BEGIN_FTR_SECTION
FTR_SECTION_ELSE_NESTED(95)
clrrdi r6,r8,40 /* get its 1T ESID */
clrrdi r9,r1,40 /* get current sp 1T ESID */
- ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_1T_SEGMENT, 95)
+ ALT_MMU_FTR_SECTION_END_NESTED_IFCLR(MMU_FTR_1T_SEGMENT, 95)
FTR_SECTION_ELSE
b 2f
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_SLB)
+ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_SLB)
clrldi. r0,r6,2 /* is new ESID c00000000? */
cmpd cr1,r6,r9 /* or is new ESID the same as current ESID? */
cror eq,4*cr1+eq,eq
@@ -479,7 +505,7 @@ BEGIN_FTR_SECTION
li r9,MMU_SEGSIZE_1T /* insert B field */
oris r6,r6,(MMU_SEGSIZE_1T << SLBIE_SSIZE_SHIFT)@h
rldimi r7,r9,SLB_VSID_SSIZE_SHIFT,0
-END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
/* Update the last bolted SLB. No write barriers are needed
* here, provided we only update the current CPU's SLB shadow
@@ -487,11 +513,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
*/
ld r9,PACA_SLBSHADOWPTR(r13)
li r12,0
- std r12,SLBSHADOW_STACKESID(r9) /* Clear ESID */
- std r7,SLBSHADOW_STACKVSID(r9) /* Save VSID */
- std r0,SLBSHADOW_STACKESID(r9) /* Save ESID */
+ std r12,SLBSHADOW_STACKESID(r9) /* Clear ESID */
+ li r12,SLBSHADOW_STACKVSID
+ STDX_BE r7,r12,r9 /* Save VSID */
+ li r12,SLBSHADOW_STACKESID
+ STDX_BE r0,r12,r9 /* Save ESID */
- /* No need to check for CPU_FTR_NO_SLBIE_B here, since when
+ /* No need to check for MMU_FTR_NO_SLBIE_B here, since when
* we have 1TB segments, the only CPUs known to have the errata
* only support less than 1TB of system memory and we'll never
* actually hit this code path.
@@ -504,7 +532,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
2:
#endif /* !CONFIG_PPC_BOOK3S */
- clrrdi r7,r8,THREAD_SHIFT /* base of new stack */
+ CURRENT_THREAD_INFO(r7, r8) /* base of new stack */
/* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE
because we don't need to leave the 288-byte ABI gap at the
top of the kernel stack. */
@@ -513,8 +541,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT)
mr r1,r8 /* start using new stack pointer */
std r7,PACAKSAVE(r13)
- ld r6,_CCR(r1)
- mtcrf 0xFF,r6
+#ifdef CONFIG_PPC_BOOK3S_64
+BEGIN_FTR_SECTION
+ /* Event based branch registers */
+ ld r0, THREAD_BESCR(r4)
+ mtspr SPRN_BESCR, r0
+ ld r0, THREAD_EBBHR(r4)
+ mtspr SPRN_EBBHR, r0
+ ld r0, THREAD_EBBRR(r4)
+ mtspr SPRN_EBBRR, r0
+
+ ld r0,THREAD_TAR(r4)
+ mtspr SPRN_TAR,r0
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+#endif
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
@@ -522,6 +562,28 @@ BEGIN_FTR_SECTION
mtspr SPRN_VRSAVE,r0 /* if G4, restore VRSAVE reg */
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif /* CONFIG_ALTIVEC */
+#ifdef CONFIG_PPC64
+BEGIN_FTR_SECTION
+ lwz r6,THREAD_DSCR_INHERIT(r4)
+ ld r0,THREAD_DSCR(r4)
+ cmpwi r6,0
+ bne 1f
+ ld r0,PACA_DSCR(r13)
+1:
+BEGIN_FTR_SECTION_NESTED(70)
+ mfspr r8, SPRN_FSCR
+ rldimi r8, r6, FSCR_DSCR_LG, (63 - FSCR_DSCR_LG)
+ mtspr SPRN_FSCR, r8
+END_FTR_SECTION_NESTED(CPU_FTR_ARCH_207S, CPU_FTR_ARCH_207S, 70)
+ cmpd r0,r25
+ beq 2f
+ mtspr SPRN_DSCR,r0
+2:
+END_FTR_SECTION_IFSET(CPU_FTR_DSCR)
+#endif
+
+ ld r6,_CCR(r1)
+ mtcrf 0xFF,r6
/* r3-r13 are destroyed -- Cort */
REST_8GPRS(14, r1)
@@ -538,7 +600,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
_GLOBAL(ret_from_except)
ld r11,_TRAP(r1)
andi. r0,r11,1
- bne .ret_from_except_lite
+ bne ret_from_except_lite
REST_NVGPRS(r1)
_GLOBAL(ret_from_except_lite)
@@ -550,51 +612,197 @@ _GLOBAL(ret_from_except_lite)
#ifdef CONFIG_PPC_BOOK3E
wrteei 0
#else
- mfmsr r10 /* Get current interrupt state */
- rldicl r9,r10,48,1 /* clear MSR_EE */
- rotldi r9,r9,16
- mtmsrd r9,1 /* Update machine state */
+ ld r10,PACAKMSR(r13) /* Get kernel MSR without EE */
+ mtmsrd r10,1 /* Update machine state */
#endif /* CONFIG_PPC_BOOK3E */
-#ifdef CONFIG_PREEMPT
- clrrdi r9,r1,THREAD_SHIFT /* current_thread_info() */
- li r0,_TIF_NEED_RESCHED /* bits to check */
+ CURRENT_THREAD_INFO(r9, r1)
ld r3,_MSR(r1)
+#ifdef CONFIG_PPC_BOOK3E
+ ld r10,PACACURRENT(r13)
+#endif /* CONFIG_PPC_BOOK3E */
ld r4,TI_FLAGS(r9)
- /* Move MSR_PR bit in r3 to _TIF_SIGPENDING position in r0 */
- rlwimi r0,r3,32+TIF_SIGPENDING-MSR_PR_LG,_TIF_SIGPENDING
- and. r0,r4,r0 /* check NEED_RESCHED and maybe SIGPENDING */
- bne do_work
-
-#else /* !CONFIG_PREEMPT */
- ld r3,_MSR(r1) /* Returning to user mode? */
andi. r3,r3,MSR_PR
- beq restore /* if not, just restore regs and return */
+ beq resume_kernel
+#ifdef CONFIG_PPC_BOOK3E
+ lwz r3,(THREAD+THREAD_DBCR0)(r10)
+#endif /* CONFIG_PPC_BOOK3E */
/* Check current_thread_info()->flags */
- clrrdi r9,r1,THREAD_SHIFT
- ld r4,TI_FLAGS(r9)
andi. r0,r4,_TIF_USER_WORK_MASK
- bne do_work
+#ifdef CONFIG_PPC_BOOK3E
+ bne 1f
+ /*
+ * Check to see if the dbcr0 register is set up to debug.
+ * Use the internal debug mode bit to do this.
+ */
+ andis. r0,r3,DBCR0_IDM@h
+ beq restore
+ mfmsr r0
+ rlwinm r0,r0,0,~MSR_DE /* Clear MSR.DE */
+ mtmsr r0
+ mtspr SPRN_DBCR0,r3
+ li r10, -1
+ mtspr SPRN_DBSR,r10
+ b restore
+#else
+ beq restore
#endif
+1: andi. r0,r4,_TIF_NEED_RESCHED
+ beq 2f
+ bl restore_interrupts
+ SCHEDULE_USER
+ b ret_from_except_lite
+2:
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ andi. r0,r4,_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM
+ bne 3f /* only restore TM if nothing else to do */
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl restore_tm_state
+ b restore
+3:
+#endif
+ bl save_nvgprs
+ bl restore_interrupts
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl do_notify_resume
+ b ret_from_except
+
+resume_kernel:
+ /* check current_thread_info, _TIF_EMULATE_STACK_STORE */
+ andis. r8,r4,_TIF_EMULATE_STACK_STORE@h
+ beq+ 1f
+
+ addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */
+
+ lwz r3,GPR1(r1)
+ subi r3,r3,INT_FRAME_SIZE /* dst: Allocate a trampoline exception frame */
+ mr r4,r1 /* src: current exception frame */
+ mr r1,r3 /* Reroute the trampoline frame to r1 */
+ /* Copy from the original to the trampoline. */
+ li r5,INT_FRAME_SIZE/8 /* size: INT_FRAME_SIZE */
+ li r6,0 /* start offset: 0 */
+ mtctr r5
+2: ldx r0,r6,r4
+ stdx r0,r6,r3
+ addi r6,r6,8
+ bdnz 2b
+
+ /* Do real store operation to complete stwu */
+ lwz r5,GPR1(r1)
+ std r8,0(r5)
+
+ /* Clear _TIF_EMULATE_STACK_STORE flag */
+ lis r11,_TIF_EMULATE_STACK_STORE@h
+ addi r5,r9,TI_FLAGS
+0: ldarx r4,0,r5
+ andc r4,r4,r11
+ stdcx. r4,0,r5
+ bne- 0b
+1:
+
+#ifdef CONFIG_PREEMPT
+ /* Check if we need to preempt */
+ andi. r0,r4,_TIF_NEED_RESCHED
+ beq+ restore
+ /* Check that preempt_count() == 0 and interrupts are enabled */
+ lwz r8,TI_PREEMPT(r9)
+ cmpwi cr1,r8,0
+ ld r0,SOFTE(r1)
+ cmpdi r0,0
+ crandc eq,cr1*4+eq,eq
+ bne restore
+
+ /*
+ * Here we are preempting the current task. We want to make
+ * sure we are soft-disabled first and reconcile irq state.
+ */
+ RECONCILE_IRQ_STATE(r3,r4)
+1: bl preempt_schedule_irq
+
+ /* Re-test flags and eventually loop */
+ CURRENT_THREAD_INFO(r9, r1)
+ ld r4,TI_FLAGS(r9)
+ andi. r0,r4,_TIF_NEED_RESCHED
+ bne 1b
+
+ /*
+ * arch_local_irq_restore() from preempt_schedule_irq above may
+ * enable hard interrupt but we really should disable interrupts
+ * when we return from the interrupt, and so that we don't get
+ * interrupted after loading SRR0/1.
+ */
+#ifdef CONFIG_PPC_BOOK3E
+ wrteei 0
+#else
+ ld r10,PACAKMSR(r13) /* Get kernel MSR without EE */
+ mtmsrd r10,1 /* Update machine state */
+#endif /* CONFIG_PPC_BOOK3E */
+#endif /* CONFIG_PREEMPT */
+
+ .globl fast_exc_return_irq
+fast_exc_return_irq:
restore:
-BEGIN_FW_FTR_SECTION
+ /*
+ * This is the main kernel exit path. First we check if we
+ * are about to re-enable interrupts
+ */
ld r5,SOFTE(r1)
-FW_FTR_SECTION_ELSE
- b .Liseries_check_pending_irqs
-ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
-2:
- TRACE_AND_RESTORE_IRQ(r5);
+ lbz r6,PACASOFTIRQEN(r13)
+ cmpwi cr0,r5,0
+ beq restore_irq_off
- /* extract EE bit and use it to restore paca->hard_enabled */
- ld r3,_MSR(r1)
- rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */
- stb r4,PACAHARDIRQEN(r13)
+ /* We are enabling, were we already enabled ? Yes, just return */
+ cmpwi cr0,r6,1
+ beq cr0,do_restore
+
+ /*
+ * We are about to soft-enable interrupts (we are hard disabled
+ * at this point). We check if there's anything that needs to
+ * be replayed first.
+ */
+ lbz r0,PACAIRQHAPPENED(r13)
+ cmpwi cr0,r0,0
+ bne- restore_check_irq_replay
+
+ /*
+ * Get here when nothing happened while soft-disabled, just
+ * soft-enable and move-on. We will hard-enable as a side
+ * effect of rfi
+ */
+restore_no_replay:
+ TRACE_ENABLE_INTS
+ li r0,1
+ stb r0,PACASOFTIRQEN(r13);
+ /*
+ * Final return path. BookE is handled in a different file
+ */
+do_restore:
#ifdef CONFIG_PPC_BOOK3E
- b .exception_return_book3e
+ b exception_return_book3e
#else
+ /*
+ * Clear the reservation. If we know the CPU tracks the address of
+ * the reservation then we can potentially save some cycles and use
+ * a larx. On POWER6 and POWER7 this is significantly faster.
+ */
+BEGIN_FTR_SECTION
+ stdcx. r0,0,r1 /* to clear the reservation */
+FTR_SECTION_ELSE
+ ldarx r4,0,r1
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
+
+ /*
+ * Some code path such as load_up_fpu or altivec return directly
+ * here. They run entirely hard disabled and do not alter the
+ * interrupt state. They also don't use lwarx/stwcx. and thus
+ * are known not to leave dangling reservations.
+ */
+ .globl fast_exception_return
+fast_exception_return:
+ ld r3,_MSR(r1)
ld r4,_CTR(r1)
ld r0,_LINK(r1)
mtctr r4
@@ -607,32 +815,35 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
andi. r0,r3,MSR_RI
beq- unrecov_restore
- /*
- * Clear the reservation. If we know the CPU tracks the address of
- * the reservation then we can potentially save some cycles and use
- * a larx. On POWER6 and POWER7 this is significantly faster.
- */
+ /* Load PPR from thread struct before we clear MSR:RI */
BEGIN_FTR_SECTION
- stdcx. r0,0,r1 /* to clear the reservation */
-FTR_SECTION_ELSE
- ldarx r4,0,r1
-ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
+ ld r2,PACACURRENT(r13)
+ ld r2,TASKTHREADPPR(r2)
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
/*
* Clear RI before restoring r13. If we are returning to
* userspace and we take an exception after restoring r13,
* we end up corrupting the userspace r13 value.
*/
- mfmsr r4
- andc r4,r4,r0 /* r0 contains MSR_RI here */
+ ld r4,PACAKMSR(r13) /* Get kernel MSR without EE */
+ andc r4,r4,r0 /* r0 contains MSR_RI here */
mtmsrd r4,1
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ /* TM debug */
+ std r3, PACATMSCRATCH(r13) /* Stash returned-to MSR */
+#endif
/*
* r13 is our per cpu area, only restore it if we are returning to
- * userspace
+ * userspace the value stored in the stack frame may belong to
+ * another CPU.
*/
andi. r0,r3,MSR_PR
beq 1f
+BEGIN_FTR_SECTION
+ mtspr SPRN_PPR,r2 /* Restore PPR */
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
ACCOUNT_CPU_USER_EXIT(r2, r4)
REST_GPR(13, r1)
1:
@@ -654,99 +865,86 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
#endif /* CONFIG_PPC_BOOK3E */
-.Liseries_check_pending_irqs:
-#ifdef CONFIG_PPC_ISERIES
- ld r5,SOFTE(r1)
- cmpdi 0,r5,0
- beq 2b
- /* Check for pending interrupts (iSeries) */
- ld r3,PACALPPACAPTR(r13)
- ld r3,LPPACAANYINT(r3)
- cmpdi r3,0
- beq+ 2b /* skip do_IRQ if no interrupts */
-
- li r3,0
- stb r3,PACASOFTIRQEN(r13) /* ensure we are soft-disabled */
-#ifdef CONFIG_TRACE_IRQFLAGS
- bl .trace_hardirqs_off
- mfmsr r10
-#endif
- ori r10,r10,MSR_EE
- mtmsrd r10 /* hard-enable again */
- addi r3,r1,STACK_FRAME_OVERHEAD
- bl .do_IRQ
- b .ret_from_except_lite /* loop back and handle more */
-#endif
-
-do_work:
-#ifdef CONFIG_PREEMPT
- andi. r0,r3,MSR_PR /* Returning to user mode? */
- bne user_work
- /* Check that preempt_count() == 0 and interrupts are enabled */
- lwz r8,TI_PREEMPT(r9)
- cmpwi cr1,r8,0
- ld r0,SOFTE(r1)
- cmpdi r0,0
- crandc eq,cr1*4+eq,eq
- bne restore
-
- /* Here we are preempting the current task.
+ /*
+ * We are returning to a context with interrupts soft disabled.
*
- * Ensure interrupts are soft-disabled. We also properly mark
- * the PACA to reflect the fact that they are hard-disabled
- * and trace the change
+ * However, we may also about to hard enable, so we need to
+ * make sure that in this case, we also clear PACA_IRQ_HARD_DIS
+ * or that bit can get out of sync and bad things will happen
*/
- li r0,0
- stb r0,PACASOFTIRQEN(r13)
- stb r0,PACAHARDIRQEN(r13)
+restore_irq_off:
+ ld r3,_MSR(r1)
+ lbz r7,PACAIRQHAPPENED(r13)
+ andi. r0,r3,MSR_EE
+ beq 1f
+ rlwinm r7,r7,0,~PACA_IRQ_HARD_DIS
+ stb r7,PACAIRQHAPPENED(r13)
+1: li r0,0
+ stb r0,PACASOFTIRQEN(r13);
TRACE_DISABLE_INTS
+ b do_restore
- /* Call the scheduler with soft IRQs off */
-1: bl .preempt_schedule_irq
-
- /* Hard-disable interrupts again (and update PACA) */
-#ifdef CONFIG_PPC_BOOK3E
- wrteei 0
-#else
- mfmsr r10
- rldicl r10,r10,48,1
- rotldi r10,r10,16
- mtmsrd r10,1
-#endif /* CONFIG_PPC_BOOK3E */
- li r0,0
- stb r0,PACAHARDIRQEN(r13)
-
- /* Re-test flags and eventually loop */
- clrrdi r9,r1,THREAD_SHIFT
- ld r4,TI_FLAGS(r9)
- andi. r0,r4,_TIF_NEED_RESCHED
- bne 1b
- b restore
-
-user_work:
-#endif /* CONFIG_PREEMPT */
+ /*
+ * Something did happen, check if a re-emit is needed
+ * (this also clears paca->irq_happened)
+ */
+restore_check_irq_replay:
+ /* XXX: We could implement a fast path here where we check
+ * for irq_happened being just 0x01, in which case we can
+ * clear it and return. That means that we would potentially
+ * miss a decrementer having wrapped all the way around.
+ *
+ * Still, this might be useful for things like hash_page
+ */
+ bl __check_irq_replay
+ cmpwi cr0,r3,0
+ beq restore_no_replay
+
+ /*
+ * We need to re-emit an interrupt. We do so by re-using our
+ * existing exception frame. We first change the trap value,
+ * but we need to ensure we preserve the low nibble of it
+ */
+ ld r4,_TRAP(r1)
+ clrldi r4,r4,60
+ or r4,r4,r3
+ std r4,_TRAP(r1)
- /* Enable interrupts */
+ /*
+ * Then find the right handler and call it. Interrupts are
+ * still soft-disabled and we keep them that way.
+ */
+ cmpwi cr0,r3,0x500
+ bne 1f
+ addi r3,r1,STACK_FRAME_OVERHEAD;
+ bl do_IRQ
+ b ret_from_except
+1: cmpwi cr0,r3,0x900
+ bne 1f
+ addi r3,r1,STACK_FRAME_OVERHEAD;
+ bl timer_interrupt
+ b ret_from_except
+#ifdef CONFIG_PPC_DOORBELL
+1:
#ifdef CONFIG_PPC_BOOK3E
- wrteei 1
+ cmpwi cr0,r3,0x280
#else
- ori r10,r10,MSR_EE
- mtmsrd r10,1
+ BEGIN_FTR_SECTION
+ cmpwi cr0,r3,0xe80
+ FTR_SECTION_ELSE
+ cmpwi cr0,r3,0xa00
+ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
#endif /* CONFIG_PPC_BOOK3E */
-
- andi. r0,r4,_TIF_NEED_RESCHED
- beq 1f
- bl .schedule
- b .ret_from_except_lite
-
-1: bl .save_nvgprs
- addi r3,r1,STACK_FRAME_OVERHEAD
- bl .do_signal
- b .ret_from_except
-
+ bne 1f
+ addi r3,r1,STACK_FRAME_OVERHEAD;
+ bl doorbell_exception
+ b ret_from_except
+#endif /* CONFIG_PPC_DOORBELL */
+1: b ret_from_except /* What else to do here ? */
+
unrecov_restore:
addi r3,r1,STACK_FRAME_OVERHEAD
- bl .unrecoverable_exception
+ bl unrecoverable_exception
b unrecov_restore
#ifdef CONFIG_PPC_RTAS
@@ -812,7 +1010,7 @@ _GLOBAL(enter_rtas)
std r6,PACASAVEDMSR(r13)
/* Setup our real return addr */
- LOAD_REG_ADDR(r4,.rtas_return_loc)
+ LOAD_REG_ADDR(r4,rtas_return_loc)
clrldi r4,r4,2 /* convert to realmode address */
mtlr r4
@@ -822,7 +1020,7 @@ _GLOBAL(enter_rtas)
li r9,1
rldicr r9,r9,MSR_SF_LG,(63-MSR_SF_LG)
- ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI
+ ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI|MSR_LE
andc r6,r0,r9
sync /* disable interrupts so SRR0/1 */
mtmsrd r0 /* don't get trashed */
@@ -836,14 +1034,16 @@ _GLOBAL(enter_rtas)
rfid
b . /* prevent speculative execution */
-_STATIC(rtas_return_loc)
+rtas_return_loc:
+ FIXUP_ENDIAN
+
/* relocation is off at this point */
- mfspr r4,SPRN_SPRG_PACA /* Get PACA */
+ GET_PACA(r4)
clrldi r4,r4,2 /* convert to realmode address */
bcl 20,31,$+4
0: mflr r3
- ld r3,(1f-0b)(r3) /* get &.rtas_restore_regs */
+ ld r3,(1f-0b)(r3) /* get &rtas_restore_regs */
mfmsr r6
li r0,MSR_RI
@@ -860,16 +1060,16 @@ _STATIC(rtas_return_loc)
b . /* prevent speculative execution */
.align 3
-1: .llong .rtas_restore_regs
+1: .llong rtas_restore_regs
-_STATIC(rtas_restore_regs)
+rtas_restore_regs:
/* relocation is on at this point */
REST_GPR(2, r1) /* Restore the TOC */
REST_GPR(13, r1) /* Restore paca */
REST_8GPRS(14, r1) /* Restore the non-volatiles */
REST_10GPRS(22, r1) /* ditto */
- mfspr r13,SPRN_SPRG_PACA
+ GET_PACA(r13)
ld r4,_CCR(r1)
mtcr r4
@@ -908,28 +1108,30 @@ _GLOBAL(enter_prom)
std r10,_CCR(r1)
std r11,_MSR(r1)
- /* Get the PROM entrypoint */
- mtlr r4
+ /* Put PROM address in SRR0 */
+ mtsrr0 r4
- /* Switch MSR to 32 bits mode
+ /* Setup our trampoline return addr in LR */
+ bcl 20,31,$+4
+0: mflr r4
+ addi r4,r4,(1f - 0b)
+ mtlr r4
+
+ /* Prepare a 32-bit mode big endian MSR
*/
#ifdef CONFIG_PPC_BOOK3E
rlwinm r11,r11,0,1,31
- mtmsr r11
+ mtsrr1 r11
+ rfi
#else /* CONFIG_PPC_BOOK3E */
- mfmsr r11
- li r12,1
- rldicr r12,r12,MSR_SF_LG,(63-MSR_SF_LG)
- andc r11,r11,r12
- li r12,1
- rldicr r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG)
- andc r11,r11,r12
- mtmsrd r11
+ LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE)
+ andc r11,r11,r12
+ mtsrr1 r11
+ rfid
#endif /* CONFIG_PPC_BOOK3E */
- isync
- /* Enter PROM here... */
- blrl
+1: /* Return from OF */
+ FIXUP_ENDIAN
/* Just make sure that r1 top 32 bits didn't get
* corrupt by OF
@@ -960,7 +1162,7 @@ _GLOBAL(mcount)
_GLOBAL(_mcount)
blr
-_GLOBAL(ftrace_caller)
+_GLOBAL_TOC(ftrace_caller)
/* Taken from output of objdump from lib64/glibc */
mflr r3
ld r11, 0(r1)
@@ -984,10 +1186,7 @@ _GLOBAL(ftrace_graph_stub)
_GLOBAL(ftrace_stub)
blr
#else
-_GLOBAL(mcount)
- blr
-
-_GLOBAL(_mcount)
+_GLOBAL_TOC(_mcount)
/* Taken from output of objdump from lib64/glibc */
mflr r3
ld r11, 0(r1)
@@ -1025,7 +1224,7 @@ _GLOBAL(ftrace_graph_caller)
ld r11, 112(r1)
addi r3, r11, 16
- bl .prepare_ftrace_return
+ bl prepare_ftrace_return
nop
ld r0, 128(r1)
@@ -1041,7 +1240,7 @@ _GLOBAL(return_to_handler)
mr r31, r1
stdu r1, -112(r1)
- bl .ftrace_return_to_handler
+ bl ftrace_return_to_handler
nop
/* return value has real return address */
@@ -1071,7 +1270,7 @@ _GLOBAL(mod_return_to_handler)
*/
ld r2, PACATOC(r13)
- bl .ftrace_return_to_handler
+ bl ftrace_return_to_handler
nop
/* return value has real return address */
diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S
new file mode 100644
index 00000000000..9f1ebf7338f
--- /dev/null
+++ b/arch/powerpc/kernel/epapr_hcalls.S
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2012 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <asm/epapr_hcalls.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-compat.h>
+#include <asm/asm-offsets.h>
+
+#ifndef CONFIG_PPC64
+/* epapr_ev_idle() was derived from e500_idle() */
+_GLOBAL(epapr_ev_idle)
+ CURRENT_THREAD_INFO(r3, r1)
+ PPC_LL r4, TI_LOCAL_FLAGS(r3) /* set napping bit */
+ ori r4, r4,_TLF_NAPPING /* so when we take an exception */
+ PPC_STL r4, TI_LOCAL_FLAGS(r3) /* it will return to our caller */
+
+ wrteei 1
+
+idle_loop:
+ LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE))
+
+.global epapr_ev_idle_start
+epapr_ev_idle_start:
+ li r3, -1
+ nop
+ nop
+ nop
+
+ /*
+ * Guard against spurious wakeups from a hypervisor --
+ * only interrupt will cause us to return to LR due to
+ * _TLF_NAPPING.
+ */
+ b idle_loop
+#endif
+
+/* Hypercall entry point. Will be patched with device tree instructions. */
+.global epapr_hypercall_start
+epapr_hypercall_start:
+ li r3, -1
+ nop
+ nop
+ nop
+ blr
diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c
new file mode 100644
index 00000000000..59e4ba74975
--- /dev/null
+++ b/arch/powerpc/kernel/epapr_paravirt.c
@@ -0,0 +1,85 @@
+/*
+ * ePAPR para-virtualization support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright (C) 2012 Freescale Semiconductor, Inc.
+ */
+
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <asm/epapr_hcalls.h>
+#include <asm/cacheflush.h>
+#include <asm/code-patching.h>
+#include <asm/machdep.h>
+
+#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64)
+extern void epapr_ev_idle(void);
+extern u32 epapr_ev_idle_start[];
+#endif
+
+bool epapr_paravirt_enabled;
+static bool __maybe_unused epapr_has_idle;
+
+static int __init early_init_dt_scan_epapr(unsigned long node,
+ const char *uname,
+ int depth, void *data)
+{
+ const u32 *insts;
+ int len;
+ int i;
+
+ insts = of_get_flat_dt_prop(node, "hcall-instructions", &len);
+ if (!insts)
+ return 0;
+
+ if (len % 4 || len > (4 * 4))
+ return -1;
+
+ for (i = 0; i < (len / 4); i++) {
+ u32 inst = be32_to_cpu(insts[i]);
+ patch_instruction(epapr_hypercall_start + i, inst);
+#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64)
+ patch_instruction(epapr_ev_idle_start + i, inst);
+#endif
+ }
+
+#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64)
+ if (of_get_flat_dt_prop(node, "has-idle", NULL))
+ epapr_has_idle = true;
+#endif
+
+ epapr_paravirt_enabled = true;
+
+ return 1;
+}
+
+int __init epapr_paravirt_early_init(void)
+{
+ of_scan_flat_dt(early_init_dt_scan_epapr, NULL);
+
+ return 0;
+}
+
+static int __init epapr_idle_init(void)
+{
+#if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64)
+ if (epapr_has_idle)
+ ppc_md.power_save = epapr_ev_idle;
+#endif
+
+ return 0;
+}
+
+postcore_initcall(epapr_idle_init);
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index 5c43063d250..bb9cac6c805 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -17,12 +17,16 @@
#include <asm/cputable.h>
#include <asm/setup.h>
#include <asm/thread_info.h>
+#include <asm/reg_a2.h>
#include <asm/exception-64e.h>
#include <asm/bug.h>
#include <asm/irqflags.h>
#include <asm/ptrace.h>
#include <asm/ppc-opcode.h>
#include <asm/mmu.h>
+#include <asm/hw_irq.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_booke_hv_asm.h>
/* XXX This will ultimately add space for a special exception save
* structure used to save things like SRR0/SRR1, SPRGs, MAS, etc...
@@ -30,19 +34,263 @@
* special interrupts from within a non-standard level will probably
* blow you up
*/
-#define SPECIAL_EXC_FRAME_SIZE INT_FRAME_SIZE
+#define SPECIAL_EXC_SRR0 0
+#define SPECIAL_EXC_SRR1 1
+#define SPECIAL_EXC_SPRG_GEN 2
+#define SPECIAL_EXC_SPRG_TLB 3
+#define SPECIAL_EXC_MAS0 4
+#define SPECIAL_EXC_MAS1 5
+#define SPECIAL_EXC_MAS2 6
+#define SPECIAL_EXC_MAS3 7
+#define SPECIAL_EXC_MAS6 8
+#define SPECIAL_EXC_MAS7 9
+#define SPECIAL_EXC_MAS5 10 /* E.HV only */
+#define SPECIAL_EXC_MAS8 11 /* E.HV only */
+#define SPECIAL_EXC_IRQHAPPENED 12
+#define SPECIAL_EXC_DEAR 13
+#define SPECIAL_EXC_ESR 14
+#define SPECIAL_EXC_SOFTE 15
+#define SPECIAL_EXC_CSRR0 16
+#define SPECIAL_EXC_CSRR1 17
+/* must be even to keep 16-byte stack alignment */
+#define SPECIAL_EXC_END 18
+
+#define SPECIAL_EXC_FRAME_SIZE (INT_FRAME_SIZE + SPECIAL_EXC_END * 8)
+#define SPECIAL_EXC_FRAME_OFFS (INT_FRAME_SIZE - 288)
+
+#define SPECIAL_EXC_STORE(reg, name) \
+ std reg, (SPECIAL_EXC_##name * 8 + SPECIAL_EXC_FRAME_OFFS)(r1)
+
+#define SPECIAL_EXC_LOAD(reg, name) \
+ ld reg, (SPECIAL_EXC_##name * 8 + SPECIAL_EXC_FRAME_OFFS)(r1)
+
+special_reg_save:
+ lbz r9,PACAIRQHAPPENED(r13)
+ RECONCILE_IRQ_STATE(r3,r4)
+
+ /*
+ * We only need (or have stack space) to save this stuff if
+ * we interrupted the kernel.
+ */
+ ld r3,_MSR(r1)
+ andi. r3,r3,MSR_PR
+ bnelr
+
+ /* Copy info into temporary exception thread info */
+ ld r11,PACAKSAVE(r13)
+ CURRENT_THREAD_INFO(r11, r11)
+ CURRENT_THREAD_INFO(r12, r1)
+ ld r10,TI_FLAGS(r11)
+ std r10,TI_FLAGS(r12)
+ ld r10,TI_PREEMPT(r11)
+ std r10,TI_PREEMPT(r12)
+ ld r10,TI_TASK(r11)
+ std r10,TI_TASK(r12)
+
+ /*
+ * Advance to the next TLB exception frame for handler
+ * types that don't do it automatically.
+ */
+ LOAD_REG_ADDR(r11,extlb_level_exc)
+ lwz r12,0(r11)
+ mfspr r10,SPRN_SPRG_TLB_EXFRAME
+ add r10,r10,r12
+ mtspr SPRN_SPRG_TLB_EXFRAME,r10
+
+ /*
+ * Save registers needed to allow nesting of certain exceptions
+ * (such as TLB misses) inside special exception levels
+ */
+ mfspr r10,SPRN_SRR0
+ SPECIAL_EXC_STORE(r10,SRR0)
+ mfspr r10,SPRN_SRR1
+ SPECIAL_EXC_STORE(r10,SRR1)
+ mfspr r10,SPRN_SPRG_GEN_SCRATCH
+ SPECIAL_EXC_STORE(r10,SPRG_GEN)
+ mfspr r10,SPRN_SPRG_TLB_SCRATCH
+ SPECIAL_EXC_STORE(r10,SPRG_TLB)
+ mfspr r10,SPRN_MAS0
+ SPECIAL_EXC_STORE(r10,MAS0)
+ mfspr r10,SPRN_MAS1
+ SPECIAL_EXC_STORE(r10,MAS1)
+ mfspr r10,SPRN_MAS2
+ SPECIAL_EXC_STORE(r10,MAS2)
+ mfspr r10,SPRN_MAS3
+ SPECIAL_EXC_STORE(r10,MAS3)
+ mfspr r10,SPRN_MAS6
+ SPECIAL_EXC_STORE(r10,MAS6)
+ mfspr r10,SPRN_MAS7
+ SPECIAL_EXC_STORE(r10,MAS7)
+BEGIN_FTR_SECTION
+ mfspr r10,SPRN_MAS5
+ SPECIAL_EXC_STORE(r10,MAS5)
+ mfspr r10,SPRN_MAS8
+ SPECIAL_EXC_STORE(r10,MAS8)
+
+ /* MAS5/8 could have inappropriate values if we interrupted KVM code */
+ li r10,0
+ mtspr SPRN_MAS5,r10
+ mtspr SPRN_MAS8,r10
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+ SPECIAL_EXC_STORE(r9,IRQHAPPENED)
+
+ mfspr r10,SPRN_DEAR
+ SPECIAL_EXC_STORE(r10,DEAR)
+ mfspr r10,SPRN_ESR
+ SPECIAL_EXC_STORE(r10,ESR)
+
+ lbz r10,PACASOFTIRQEN(r13)
+ SPECIAL_EXC_STORE(r10,SOFTE)
+ ld r10,_NIP(r1)
+ SPECIAL_EXC_STORE(r10,CSRR0)
+ ld r10,_MSR(r1)
+ SPECIAL_EXC_STORE(r10,CSRR1)
+
+ blr
+
+ret_from_level_except:
+ ld r3,_MSR(r1)
+ andi. r3,r3,MSR_PR
+ beq 1f
+ b ret_from_except
+1:
+
+ LOAD_REG_ADDR(r11,extlb_level_exc)
+ lwz r12,0(r11)
+ mfspr r10,SPRN_SPRG_TLB_EXFRAME
+ sub r10,r10,r12
+ mtspr SPRN_SPRG_TLB_EXFRAME,r10
+
+ /*
+ * It's possible that the special level exception interrupted a
+ * TLB miss handler, and inserted the same entry that the
+ * interrupted handler was about to insert. On CPUs without TLB
+ * write conditional, this can result in a duplicate TLB entry.
+ * Wipe all non-bolted entries to be safe.
+ *
+ * Note that this doesn't protect against any TLB misses
+ * we may take accessing the stack from here to the end of
+ * the special level exception. It's not clear how we can
+ * reasonably protect against that, but only CPUs with
+ * neither TLB write conditional nor bolted kernel memory
+ * are affected. Do any such CPUs even exist?
+ */
+ PPC_TLBILX_ALL(0,R0)
+
+ REST_NVGPRS(r1)
+
+ SPECIAL_EXC_LOAD(r10,SRR0)
+ mtspr SPRN_SRR0,r10
+ SPECIAL_EXC_LOAD(r10,SRR1)
+ mtspr SPRN_SRR1,r10
+ SPECIAL_EXC_LOAD(r10,SPRG_GEN)
+ mtspr SPRN_SPRG_GEN_SCRATCH,r10
+ SPECIAL_EXC_LOAD(r10,SPRG_TLB)
+ mtspr SPRN_SPRG_TLB_SCRATCH,r10
+ SPECIAL_EXC_LOAD(r10,MAS0)
+ mtspr SPRN_MAS0,r10
+ SPECIAL_EXC_LOAD(r10,MAS1)
+ mtspr SPRN_MAS1,r10
+ SPECIAL_EXC_LOAD(r10,MAS2)
+ mtspr SPRN_MAS2,r10
+ SPECIAL_EXC_LOAD(r10,MAS3)
+ mtspr SPRN_MAS3,r10
+ SPECIAL_EXC_LOAD(r10,MAS6)
+ mtspr SPRN_MAS6,r10
+ SPECIAL_EXC_LOAD(r10,MAS7)
+ mtspr SPRN_MAS7,r10
+BEGIN_FTR_SECTION
+ SPECIAL_EXC_LOAD(r10,MAS5)
+ mtspr SPRN_MAS5,r10
+ SPECIAL_EXC_LOAD(r10,MAS8)
+ mtspr SPRN_MAS8,r10
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+
+ lbz r6,PACASOFTIRQEN(r13)
+ ld r5,SOFTE(r1)
+
+ /* Interrupts had better not already be enabled... */
+ twnei r6,0
+
+ cmpwi cr0,r5,0
+ beq 1f
+
+ TRACE_ENABLE_INTS
+ stb r5,PACASOFTIRQEN(r13)
+1:
+ /*
+ * Restore PACAIRQHAPPENED rather than setting it based on
+ * the return MSR[EE], since we could have interrupted
+ * __check_irq_replay() or other inconsistent transitory
+ * states that must remain that way.
+ */
+ SPECIAL_EXC_LOAD(r10,IRQHAPPENED)
+ stb r10,PACAIRQHAPPENED(r13)
+
+ SPECIAL_EXC_LOAD(r10,DEAR)
+ mtspr SPRN_DEAR,r10
+ SPECIAL_EXC_LOAD(r10,ESR)
+ mtspr SPRN_ESR,r10
+
+ stdcx. r0,0,r1 /* to clear the reservation */
+
+ REST_4GPRS(2, r1)
+ REST_4GPRS(6, r1)
+
+ ld r10,_CTR(r1)
+ ld r11,_XER(r1)
+ mtctr r10
+ mtxer r11
+
+ blr
+
+.macro ret_from_level srr0 srr1 paca_ex scratch
+ bl ret_from_level_except
+
+ ld r10,_LINK(r1)
+ ld r11,_CCR(r1)
+ ld r0,GPR13(r1)
+ mtlr r10
+ mtcr r11
+
+ ld r10,GPR10(r1)
+ ld r11,GPR11(r1)
+ ld r12,GPR12(r1)
+ mtspr \scratch,r0
+
+ std r10,\paca_ex+EX_R10(r13);
+ std r11,\paca_ex+EX_R11(r13);
+ ld r10,_NIP(r1)
+ ld r11,_MSR(r1)
+ ld r0,GPR0(r1)
+ ld r1,GPR1(r1)
+ mtspr \srr0,r10
+ mtspr \srr1,r11
+ ld r10,\paca_ex+EX_R10(r13)
+ ld r11,\paca_ex+EX_R11(r13)
+ mfspr r13,\scratch
+.endm
+
+ret_from_crit_except:
+ ret_from_level SPRN_CSRR0 SPRN_CSRR1 PACA_EXCRIT SPRN_SPRG_CRIT_SCRATCH
+ rfci
+
+ret_from_mc_except:
+ ret_from_level SPRN_MCSRR0 SPRN_MCSRR1 PACA_EXMC SPRN_SPRG_MC_SCRATCH
+ rfmci
/* Exception prolog code for all exceptions */
-#define EXCEPTION_PROLOG(n, type, addition) \
+#define EXCEPTION_PROLOG(n, intnum, type, addition) \
mtspr SPRN_SPRG_##type##_SCRATCH,r13; /* get spare registers */ \
mfspr r13,SPRN_SPRG_PACA; /* get PACA */ \
std r10,PACA_EX##type+EX_R10(r13); \
std r11,PACA_EX##type+EX_R11(r13); \
mfcr r10; /* save CR */ \
+ mfspr r11,SPRN_##type##_SRR1;/* what are we coming from */ \
+ DO_KVM intnum,SPRN_##type##_SRR1; /* KVM hook */ \
+ stw r10,PACA_EX##type+EX_CR(r13); /* save old CR in the PACA */ \
addition; /* additional code for that exc. */ \
std r1,PACA_EX##type+EX_R1(r13); /* save old r1 in the PACA */ \
- stw r10,PACA_EX##type+EX_CR(r13); /* save old CR in the PACA */ \
- mfspr r11,SPRN_##type##_SRR1;/* what are we coming from */ \
type##_SET_KSTACK; /* get special stack if necessary */\
andi. r10,r11,MSR_PR; /* save stack pointer */ \
beq 1f; /* branch around if supervisor */ \
@@ -57,72 +305,79 @@
#define SPRN_GEN_SRR0 SPRN_SRR0
#define SPRN_GEN_SRR1 SPRN_SRR1
+#define GDBELL_SET_KSTACK GEN_SET_KSTACK
+#define SPRN_GDBELL_SRR0 SPRN_GSRR0
+#define SPRN_GDBELL_SRR1 SPRN_GSRR1
+
#define CRIT_SET_KSTACK \
ld r1,PACA_CRIT_STACK(r13); \
- subi r1,r1,SPECIAL_EXC_FRAME_SIZE;
+ subi r1,r1,SPECIAL_EXC_FRAME_SIZE
#define SPRN_CRIT_SRR0 SPRN_CSRR0
#define SPRN_CRIT_SRR1 SPRN_CSRR1
#define DBG_SET_KSTACK \
ld r1,PACA_DBG_STACK(r13); \
- subi r1,r1,SPECIAL_EXC_FRAME_SIZE;
+ subi r1,r1,SPECIAL_EXC_FRAME_SIZE
#define SPRN_DBG_SRR0 SPRN_DSRR0
#define SPRN_DBG_SRR1 SPRN_DSRR1
#define MC_SET_KSTACK \
ld r1,PACA_MC_STACK(r13); \
- subi r1,r1,SPECIAL_EXC_FRAME_SIZE;
+ subi r1,r1,SPECIAL_EXC_FRAME_SIZE
#define SPRN_MC_SRR0 SPRN_MCSRR0
#define SPRN_MC_SRR1 SPRN_MCSRR1
-#define NORMAL_EXCEPTION_PROLOG(n, addition) \
- EXCEPTION_PROLOG(n, GEN, addition##_GEN)
+#define NORMAL_EXCEPTION_PROLOG(n, intnum, addition) \
+ EXCEPTION_PROLOG(n, intnum, GEN, addition##_GEN(n))
-#define CRIT_EXCEPTION_PROLOG(n, addition) \
- EXCEPTION_PROLOG(n, CRIT, addition##_CRIT)
+#define CRIT_EXCEPTION_PROLOG(n, intnum, addition) \
+ EXCEPTION_PROLOG(n, intnum, CRIT, addition##_CRIT(n))
-#define DBG_EXCEPTION_PROLOG(n, addition) \
- EXCEPTION_PROLOG(n, DBG, addition##_DBG)
+#define DBG_EXCEPTION_PROLOG(n, intnum, addition) \
+ EXCEPTION_PROLOG(n, intnum, DBG, addition##_DBG(n))
-#define MC_EXCEPTION_PROLOG(n, addition) \
- EXCEPTION_PROLOG(n, MC, addition##_MC)
+#define MC_EXCEPTION_PROLOG(n, intnum, addition) \
+ EXCEPTION_PROLOG(n, intnum, MC, addition##_MC(n))
+#define GDBELL_EXCEPTION_PROLOG(n, intnum, addition) \
+ EXCEPTION_PROLOG(n, intnum, GDBELL, addition##_GDBELL(n))
/* Variants of the "addition" argument for the prolog
*/
-#define PROLOG_ADDITION_NONE_GEN
-#define PROLOG_ADDITION_NONE_CRIT
-#define PROLOG_ADDITION_NONE_DBG
-#define PROLOG_ADDITION_NONE_MC
-
-#define PROLOG_ADDITION_MASKABLE_GEN \
- lbz r11,PACASOFTIRQEN(r13); /* are irqs soft-disabled ? */ \
- cmpwi cr0,r11,0; /* yes -> go out of line */ \
- beq masked_interrupt_book3e;
-
-#define PROLOG_ADDITION_2REGS_GEN \
+#define PROLOG_ADDITION_NONE_GEN(n)
+#define PROLOG_ADDITION_NONE_GDBELL(n)
+#define PROLOG_ADDITION_NONE_CRIT(n)
+#define PROLOG_ADDITION_NONE_DBG(n)
+#define PROLOG_ADDITION_NONE_MC(n)
+
+#define PROLOG_ADDITION_MASKABLE_GEN(n) \
+ lbz r10,PACASOFTIRQEN(r13); /* are irqs soft-disabled ? */ \
+ cmpwi cr0,r10,0; /* yes -> go out of line */ \
+ beq masked_interrupt_book3e_##n
+
+#define PROLOG_ADDITION_2REGS_GEN(n) \
std r14,PACA_EXGEN+EX_R14(r13); \
std r15,PACA_EXGEN+EX_R15(r13)
-#define PROLOG_ADDITION_1REG_GEN \
+#define PROLOG_ADDITION_1REG_GEN(n) \
std r14,PACA_EXGEN+EX_R14(r13);
-#define PROLOG_ADDITION_2REGS_CRIT \
+#define PROLOG_ADDITION_2REGS_CRIT(n) \
std r14,PACA_EXCRIT+EX_R14(r13); \
std r15,PACA_EXCRIT+EX_R15(r13)
-#define PROLOG_ADDITION_2REGS_DBG \
+#define PROLOG_ADDITION_2REGS_DBG(n) \
std r14,PACA_EXDBG+EX_R14(r13); \
std r15,PACA_EXDBG+EX_R15(r13)
-#define PROLOG_ADDITION_2REGS_MC \
+#define PROLOG_ADDITION_2REGS_MC(n) \
std r14,PACA_EXMC+EX_R14(r13); \
std r15,PACA_EXMC+EX_R15(r13)
-/* Core exception code for all exceptions except TLB misses.
- * XXX: Needs to make SPRN_SPRG_GEN depend on exception type
- */
-#define EXCEPTION_COMMON(n, excf, ints) \
+
+/* Core exception code for all exceptions except TLB misses. */
+#define EXCEPTION_COMMON_LVL(n, scratch, excf) \
+exc_##n##_common: \
std r0,GPR0(r1); /* save r0 in stackframe */ \
std r2,GPR2(r1); /* save r2 in stackframe */ \
SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \
@@ -130,10 +385,11 @@
std r9,GPR9(r1); /* save r9 in stackframe */ \
std r10,_NIP(r1); /* save SRR0 to stackframe */ \
std r11,_MSR(r1); /* save SRR1 to stackframe */ \
+ beq 2f; /* if from kernel mode */ \
ACCOUNT_CPU_USER_ENTRY(r10,r11);/* accounting (uses cr0+eq) */ \
- ld r3,excf+EX_R10(r13); /* get back r10 */ \
+2: ld r3,excf+EX_R10(r13); /* get back r10 */ \
ld r4,excf+EX_R11(r13); /* get back r11 */ \
- mfspr r5,SPRN_SPRG_GEN_SCRATCH;/* get back r13 */ \
+ mfspr r5,scratch; /* get back r13 */ \
std r12,GPR12(r1); /* save r12 in stackframe */ \
ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \
mflr r6; /* save LR in stackframe */ \
@@ -157,23 +413,29 @@
std r11,SOFTE(r1); /* and save it to stackframe */ \
std r12,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ \
std r3,_TRAP(r1); /* set trap number */ \
- std r0,RESULT(r1); /* clear regs->result */ \
- ints;
-
-/* Variants for the "ints" argument */
-#define INTS_KEEP
-#define INTS_DISABLE_SOFT \
- stb r0,PACASOFTIRQEN(r13); /* mark interrupts soft-disabled */ \
- TRACE_DISABLE_INTS;
-#define INTS_DISABLE_HARD \
- stb r0,PACAHARDIRQEN(r13); /* and hard disabled */
-#define INTS_DISABLE_ALL \
- INTS_DISABLE_SOFT \
- INTS_DISABLE_HARD
-
-/* This is called by exceptions that used INTS_KEEP (that is did not clear
- * neither soft nor hard IRQ indicators in the PACA. This will restore MSR:EE
- * to it's previous value
+ std r0,RESULT(r1); /* clear regs->result */
+
+#define EXCEPTION_COMMON(n) \
+ EXCEPTION_COMMON_LVL(n, SPRN_SPRG_GEN_SCRATCH, PACA_EXGEN)
+#define EXCEPTION_COMMON_CRIT(n) \
+ EXCEPTION_COMMON_LVL(n, SPRN_SPRG_CRIT_SCRATCH, PACA_EXCRIT)
+#define EXCEPTION_COMMON_MC(n) \
+ EXCEPTION_COMMON_LVL(n, SPRN_SPRG_MC_SCRATCH, PACA_EXMC)
+#define EXCEPTION_COMMON_DBG(n) \
+ EXCEPTION_COMMON_LVL(n, SPRN_SPRG_DBG_SCRATCH, PACA_EXDBG)
+
+/*
+ * This is meant for exceptions that don't immediately hard-enable. We
+ * set a bit in paca->irq_happened to ensure that a subsequent call to
+ * arch_local_irq_restore() will properly hard-enable and avoid the
+ * fast-path, and then reconcile irq state.
+ */
+#define INTS_DISABLE RECONCILE_IRQ_STATE(r3,r4)
+
+/*
+ * This is called by exceptions that don't use INTS_DISABLE (that did not
+ * touch irq indicators in the PACA). This will restore MSR:EE to it's
+ * previous value
*
* XXX In the long run, we may want to open-code it in order to separate the
* load from the wrtee, thus limiting the latency caused by the dependency
@@ -217,7 +479,7 @@ exc_##n##_bad_stack: \
* interrupts happen before the wait instruction.
*/
#define CHECK_NAPPING() \
- clrrdi r11,r1,THREAD_SHIFT; \
+ CURRENT_THREAD_INFO(r11, r1); \
ld r10,TI_LOCAL_FLAGS(r11); \
andi. r9,r10,_TLF_NAPPING; \
beq+ 1f; \
@@ -228,15 +490,16 @@ exc_##n##_bad_stack: \
1:
-#define MASKABLE_EXCEPTION(trapnum, label, hdlr, ack) \
+#define MASKABLE_EXCEPTION(trapnum, intnum, label, hdlr, ack) \
START_EXCEPTION(label); \
- NORMAL_EXCEPTION_PROLOG(trapnum, PROLOG_ADDITION_MASKABLE) \
- EXCEPTION_COMMON(trapnum, PACA_EXGEN, INTS_DISABLE_ALL) \
+ NORMAL_EXCEPTION_PROLOG(trapnum, intnum, PROLOG_ADDITION_MASKABLE)\
+ EXCEPTION_COMMON(trapnum) \
+ INTS_DISABLE; \
ack(r8); \
CHECK_NAPPING(); \
addi r3,r1,STACK_FRAME_OVERHEAD; \
bl hdlr; \
- b .ret_from_except_lite;
+ b ret_from_except_lite;
/* This value is used to mark exception frames on the stack. */
.section ".toc","aw"
@@ -252,11 +515,8 @@ exception_marker:
.balign 0x1000
.globl interrupt_base_book3e
interrupt_base_book3e: /* fake trap */
- /* Note: If real debug exceptions are supported by the HW, the vector
- * below will have to be patched up to point to an appropriate handler
- */
- EXCEPTION_STUB(0x000, machine_check) /* 0x0200 */
- EXCEPTION_STUB(0x020, critical_input) /* 0x0580 */
+ EXCEPTION_STUB(0x000, machine_check)
+ EXCEPTION_STUB(0x020, critical_input) /* 0x0100 */
EXCEPTION_STUB(0x040, debug_crit) /* 0x0d00 */
EXCEPTION_STUB(0x060, data_storage) /* 0x0300 */
EXCEPTION_STUB(0x080, instruction_storage) /* 0x0400 */
@@ -271,105 +531,172 @@ interrupt_base_book3e: /* fake trap */
EXCEPTION_STUB(0x1a0, watchdog) /* 0x09f0 */
EXCEPTION_STUB(0x1c0, data_tlb_miss)
EXCEPTION_STUB(0x1e0, instruction_tlb_miss)
+ EXCEPTION_STUB(0x200, altivec_unavailable)
+ EXCEPTION_STUB(0x220, altivec_assist)
+ EXCEPTION_STUB(0x260, perfmon)
EXCEPTION_STUB(0x280, doorbell)
EXCEPTION_STUB(0x2a0, doorbell_crit)
+ EXCEPTION_STUB(0x2c0, guest_doorbell)
+ EXCEPTION_STUB(0x2e0, guest_doorbell_crit)
+ EXCEPTION_STUB(0x300, hypercall)
+ EXCEPTION_STUB(0x320, ehpriv)
+ EXCEPTION_STUB(0x340, lrat_error)
.globl interrupt_end_book3e
interrupt_end_book3e:
/* Critical Input Interrupt */
START_EXCEPTION(critical_input);
- CRIT_EXCEPTION_PROLOG(0x100, PROLOG_ADDITION_NONE)
-// EXCEPTION_COMMON(0x100, PACA_EXCRIT, INTS_DISABLE_ALL)
-// bl special_reg_save_crit
-// CHECK_NAPPING();
-// addi r3,r1,STACK_FRAME_OVERHEAD
-// bl .critical_exception
-// b ret_from_crit_except
- b .
+ CRIT_EXCEPTION_PROLOG(0x100, BOOKE_INTERRUPT_CRITICAL,
+ PROLOG_ADDITION_NONE)
+ EXCEPTION_COMMON_CRIT(0x100)
+ bl save_nvgprs
+ bl special_reg_save
+ CHECK_NAPPING();
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl unknown_exception
+ b ret_from_crit_except
/* Machine Check Interrupt */
START_EXCEPTION(machine_check);
- CRIT_EXCEPTION_PROLOG(0x200, PROLOG_ADDITION_NONE)
-// EXCEPTION_COMMON(0x200, PACA_EXMC, INTS_DISABLE_ALL)
-// bl special_reg_save_mc
-// addi r3,r1,STACK_FRAME_OVERHEAD
-// CHECK_NAPPING();
-// bl .machine_check_exception
-// b ret_from_mc_except
- b .
+ MC_EXCEPTION_PROLOG(0x000, BOOKE_INTERRUPT_MACHINE_CHECK,
+ PROLOG_ADDITION_NONE)
+ EXCEPTION_COMMON_MC(0x000)
+ bl save_nvgprs
+ bl special_reg_save
+ CHECK_NAPPING();
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl machine_check_exception
+ b ret_from_mc_except
/* Data Storage Interrupt */
START_EXCEPTION(data_storage)
- NORMAL_EXCEPTION_PROLOG(0x300, PROLOG_ADDITION_2REGS)
+ NORMAL_EXCEPTION_PROLOG(0x300, BOOKE_INTERRUPT_DATA_STORAGE,
+ PROLOG_ADDITION_2REGS)
mfspr r14,SPRN_DEAR
mfspr r15,SPRN_ESR
- EXCEPTION_COMMON(0x300, PACA_EXGEN, INTS_KEEP)
+ EXCEPTION_COMMON(0x300)
+ INTS_DISABLE
b storage_fault_common
/* Instruction Storage Interrupt */
START_EXCEPTION(instruction_storage);
- NORMAL_EXCEPTION_PROLOG(0x400, PROLOG_ADDITION_2REGS)
+ NORMAL_EXCEPTION_PROLOG(0x400, BOOKE_INTERRUPT_INST_STORAGE,
+ PROLOG_ADDITION_2REGS)
li r15,0
mr r14,r10
- EXCEPTION_COMMON(0x400, PACA_EXGEN, INTS_KEEP)
+ EXCEPTION_COMMON(0x400)
+ INTS_DISABLE
b storage_fault_common
/* External Input Interrupt */
- MASKABLE_EXCEPTION(0x500, external_input, .do_IRQ, ACK_NONE)
+ MASKABLE_EXCEPTION(0x500, BOOKE_INTERRUPT_EXTERNAL,
+ external_input, do_IRQ, ACK_NONE)
/* Alignment */
START_EXCEPTION(alignment);
- NORMAL_EXCEPTION_PROLOG(0x600, PROLOG_ADDITION_2REGS)
+ NORMAL_EXCEPTION_PROLOG(0x600, BOOKE_INTERRUPT_ALIGNMENT,
+ PROLOG_ADDITION_2REGS)
mfspr r14,SPRN_DEAR
mfspr r15,SPRN_ESR
- EXCEPTION_COMMON(0x600, PACA_EXGEN, INTS_KEEP)
+ EXCEPTION_COMMON(0x600)
b alignment_more /* no room, go out of line */
/* Program Interrupt */
START_EXCEPTION(program);
- NORMAL_EXCEPTION_PROLOG(0x700, PROLOG_ADDITION_1REG)
+ NORMAL_EXCEPTION_PROLOG(0x700, BOOKE_INTERRUPT_PROGRAM,
+ PROLOG_ADDITION_1REG)
mfspr r14,SPRN_ESR
- EXCEPTION_COMMON(0x700, PACA_EXGEN, INTS_DISABLE_SOFT)
+ EXCEPTION_COMMON(0x700)
+ INTS_DISABLE
std r14,_DSISR(r1)
addi r3,r1,STACK_FRAME_OVERHEAD
ld r14,PACA_EXGEN+EX_R14(r13)
- bl .save_nvgprs
- INTS_RESTORE_HARD
- bl .program_check_exception
- b .ret_from_except
+ bl save_nvgprs
+ bl program_check_exception
+ b ret_from_except
/* Floating Point Unavailable Interrupt */
START_EXCEPTION(fp_unavailable);
- NORMAL_EXCEPTION_PROLOG(0x800, PROLOG_ADDITION_NONE)
+ NORMAL_EXCEPTION_PROLOG(0x800, BOOKE_INTERRUPT_FP_UNAVAIL,
+ PROLOG_ADDITION_NONE)
/* we can probably do a shorter exception entry for that one... */
- EXCEPTION_COMMON(0x800, PACA_EXGEN, INTS_KEEP)
- bne 1f /* if from user, just load it up */
- bl .save_nvgprs
+ EXCEPTION_COMMON(0x800)
+ ld r12,_MSR(r1)
+ andi. r0,r12,MSR_PR;
+ beq- 1f
+ bl load_up_fpu
+ b fast_exception_return
+1: INTS_DISABLE
+ bl save_nvgprs
addi r3,r1,STACK_FRAME_OVERHEAD
- INTS_RESTORE_HARD
- bl .kernel_fp_unavailable_exception
- BUG_OPCODE
-1: ld r12,_MSR(r1)
- bl .load_up_fpu
+ bl kernel_fp_unavailable_exception
+ b ret_from_except
+
+/* Altivec Unavailable Interrupt */
+ START_EXCEPTION(altivec_unavailable);
+ NORMAL_EXCEPTION_PROLOG(0x200, BOOKE_INTERRUPT_SPE_ALTIVEC_UNAVAIL,
+ PROLOG_ADDITION_NONE)
+ /* we can probably do a shorter exception entry for that one... */
+ EXCEPTION_COMMON(0x200)
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+ ld r12,_MSR(r1)
+ andi. r0,r12,MSR_PR;
+ beq- 1f
+ bl load_up_altivec
b fast_exception_return
+1:
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+ INTS_DISABLE
+ bl save_nvgprs
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl altivec_unavailable_exception
+ b ret_from_except
+
+/* AltiVec Assist */
+ START_EXCEPTION(altivec_assist);
+ NORMAL_EXCEPTION_PROLOG(0x220,
+ BOOKE_INTERRUPT_SPE_FP_DATA_ALTIVEC_ASSIST,
+ PROLOG_ADDITION_NONE)
+ EXCEPTION_COMMON(0x220)
+ INTS_DISABLE
+ bl save_nvgprs
+ addi r3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+ bl altivec_assist_exception
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#else
+ bl unknown_exception
+#endif
+ b ret_from_except
+
/* Decrementer Interrupt */
- MASKABLE_EXCEPTION(0x900, decrementer, .timer_interrupt, ACK_DEC)
+ MASKABLE_EXCEPTION(0x900, BOOKE_INTERRUPT_DECREMENTER,
+ decrementer, timer_interrupt, ACK_DEC)
/* Fixed Interval Timer Interrupt */
- MASKABLE_EXCEPTION(0x980, fixed_interval, .unknown_exception, ACK_FIT)
+ MASKABLE_EXCEPTION(0x980, BOOKE_INTERRUPT_FIT,
+ fixed_interval, unknown_exception, ACK_FIT)
/* Watchdog Timer Interrupt */
START_EXCEPTION(watchdog);
- CRIT_EXCEPTION_PROLOG(0x9f0, PROLOG_ADDITION_NONE)
-// EXCEPTION_COMMON(0x9f0, PACA_EXCRIT, INTS_DISABLE_ALL)
-// bl special_reg_save_crit
-// CHECK_NAPPING();
-// addi r3,r1,STACK_FRAME_OVERHEAD
-// bl .unknown_exception
-// b ret_from_crit_except
- b .
+ CRIT_EXCEPTION_PROLOG(0x9f0, BOOKE_INTERRUPT_WATCHDOG,
+ PROLOG_ADDITION_NONE)
+ EXCEPTION_COMMON_CRIT(0x9f0)
+ bl save_nvgprs
+ bl special_reg_save
+ CHECK_NAPPING();
+ addi r3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_BOOKE_WDT
+ bl WatchdogException
+#else
+ bl unknown_exception
+#endif
+ b ret_from_crit_except
/* System Call Interrupt */
START_EXCEPTION(system_call)
@@ -379,19 +706,21 @@ interrupt_end_book3e:
mfspr r13,SPRN_SPRG_PACA /* get our PACA */
b system_call_common
-/* Auxillary Processor Unavailable Interrupt */
+/* Auxiliary Processor Unavailable Interrupt */
START_EXCEPTION(ap_unavailable);
- NORMAL_EXCEPTION_PROLOG(0xf20, PROLOG_ADDITION_NONE)
- EXCEPTION_COMMON(0xf20, PACA_EXGEN, INTS_KEEP)
+ NORMAL_EXCEPTION_PROLOG(0xf20, BOOKE_INTERRUPT_AP_UNAVAIL,
+ PROLOG_ADDITION_NONE)
+ EXCEPTION_COMMON(0xf20)
+ INTS_DISABLE
+ bl save_nvgprs
addi r3,r1,STACK_FRAME_OVERHEAD
- bl .save_nvgprs
- INTS_RESTORE_HARD
- bl .unknown_exception
- b .ret_from_except
+ bl unknown_exception
+ b ret_from_except
/* Debug exception as a critical interrupt*/
START_EXCEPTION(debug_crit);
- CRIT_EXCEPTION_PROLOG(0xd00, PROLOG_ADDITION_2REGS)
+ CRIT_EXCEPTION_PROLOG(0xd00, BOOKE_INTERRUPT_DEBUG,
+ PROLOG_ADDITION_2REGS)
/*
* If there is a single step or branch-taken exception in an
@@ -403,7 +732,7 @@ interrupt_end_book3e:
*/
mfspr r14,SPRN_DBSR /* check single-step/branch taken */
- andis. r15,r14,DBSR_IC@h
+ andis. r15,r14,(DBSR_IC|DBSR_BT)@h
beq+ 1f
LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
@@ -414,7 +743,7 @@ interrupt_end_book3e:
bge+ cr1,1f
/* here it looks like we got an inappropriate debug exception. */
- lis r14,DBSR_IC@h /* clear the IC event */
+ lis r14,(DBSR_IC|DBSR_BT)@h /* clear the event */
rlwinm r11,r11,0,~MSR_DE /* clear DE in the CSRR1 value */
mtspr SPRN_DBSR,r14
mtspr SPRN_CSRR1,r11
@@ -438,53 +767,239 @@ interrupt_end_book3e:
/* Now we mash up things to make it look like we are coming on a
* normal exception
*/
- mfspr r15,SPRN_SPRG_CRIT_SCRATCH
- mtspr SPRN_SPRG_GEN_SCRATCH,r15
mfspr r14,SPRN_DBSR
- EXCEPTION_COMMON(0xd00, PACA_EXCRIT, INTS_DISABLE_ALL)
+ EXCEPTION_COMMON_CRIT(0xd00)
std r14,_DSISR(r1)
addi r3,r1,STACK_FRAME_OVERHEAD
mr r4,r14
ld r14,PACA_EXCRIT+EX_R14(r13)
ld r15,PACA_EXCRIT+EX_R15(r13)
- bl .save_nvgprs
- bl .DebugException
- b .ret_from_except
+ bl save_nvgprs
+ bl DebugException
+ b ret_from_except
kernel_dbg_exc:
b . /* NYI */
+/* Debug exception as a debug interrupt*/
+ START_EXCEPTION(debug_debug);
+ DBG_EXCEPTION_PROLOG(0xd00, BOOKE_INTERRUPT_DEBUG,
+ PROLOG_ADDITION_2REGS)
+
+ /*
+ * If there is a single step or branch-taken exception in an
+ * exception entry sequence, it was probably meant to apply to
+ * the code where the exception occurred (since exception entry
+ * doesn't turn off DE automatically). We simulate the effect
+ * of turning off DE on entry to an exception handler by turning
+ * off DE in the DSRR1 value and clearing the debug status.
+ */
+
+ mfspr r14,SPRN_DBSR /* check single-step/branch taken */
+ andis. r15,r14,(DBSR_IC|DBSR_BT)@h
+ beq+ 1f
+
+ LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
+ LOAD_REG_IMMEDIATE(r15,interrupt_end_book3e)
+ cmpld cr0,r10,r14
+ cmpld cr1,r10,r15
+ blt+ cr0,1f
+ bge+ cr1,1f
+
+ /* here it looks like we got an inappropriate debug exception. */
+ lis r14,(DBSR_IC|DBSR_BT)@h /* clear the event */
+ rlwinm r11,r11,0,~MSR_DE /* clear DE in the DSRR1 value */
+ mtspr SPRN_DBSR,r14
+ mtspr SPRN_DSRR1,r11
+ lwz r10,PACA_EXDBG+EX_CR(r13) /* restore registers */
+ ld r1,PACA_EXDBG+EX_R1(r13)
+ ld r14,PACA_EXDBG+EX_R14(r13)
+ ld r15,PACA_EXDBG+EX_R15(r13)
+ mtcr r10
+ ld r10,PACA_EXDBG+EX_R10(r13) /* restore registers */
+ ld r11,PACA_EXDBG+EX_R11(r13)
+ mfspr r13,SPRN_SPRG_DBG_SCRATCH
+ rfdi
+
+ /* Normal debug exception */
+ /* XXX We only handle coming from userspace for now since we can't
+ * quite save properly an interrupted kernel state yet
+ */
+1: andi. r14,r11,MSR_PR; /* check for userspace again */
+ beq kernel_dbg_exc; /* if from kernel mode */
+
+ /* Now we mash up things to make it look like we are coming on a
+ * normal exception
+ */
+ mfspr r14,SPRN_DBSR
+ EXCEPTION_COMMON_DBG(0xd08)
+ INTS_DISABLE
+ std r14,_DSISR(r1)
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ mr r4,r14
+ ld r14,PACA_EXDBG+EX_R14(r13)
+ ld r15,PACA_EXDBG+EX_R15(r13)
+ bl save_nvgprs
+ bl DebugException
+ b ret_from_except
+
+ START_EXCEPTION(perfmon);
+ NORMAL_EXCEPTION_PROLOG(0x260, BOOKE_INTERRUPT_PERFORMANCE_MONITOR,
+ PROLOG_ADDITION_NONE)
+ EXCEPTION_COMMON(0x260)
+ INTS_DISABLE
+ CHECK_NAPPING()
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl performance_monitor_exception
+ b ret_from_except_lite
+
/* Doorbell interrupt */
- MASKABLE_EXCEPTION(0x2070, doorbell, .doorbell_exception, ACK_NONE)
+ MASKABLE_EXCEPTION(0x280, BOOKE_INTERRUPT_DOORBELL,
+ doorbell, doorbell_exception, ACK_NONE)
/* Doorbell critical Interrupt */
START_EXCEPTION(doorbell_crit);
- CRIT_EXCEPTION_PROLOG(0x2080, PROLOG_ADDITION_NONE)
-// EXCEPTION_COMMON(0x2080, PACA_EXCRIT, INTS_DISABLE_ALL)
-// bl special_reg_save_crit
-// CHECK_NAPPING();
-// addi r3,r1,STACK_FRAME_OVERHEAD
-// bl .doorbell_critical_exception
-// b ret_from_crit_except
- b .
+ CRIT_EXCEPTION_PROLOG(0x2a0, BOOKE_INTERRUPT_DOORBELL_CRITICAL,
+ PROLOG_ADDITION_NONE)
+ EXCEPTION_COMMON_CRIT(0x2a0)
+ bl save_nvgprs
+ bl special_reg_save
+ CHECK_NAPPING();
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl unknown_exception
+ b ret_from_crit_except
+/*
+ * Guest doorbell interrupt
+ * This general exception use GSRRx save/restore registers
+ */
+ START_EXCEPTION(guest_doorbell);
+ GDBELL_EXCEPTION_PROLOG(0x2c0, BOOKE_INTERRUPT_GUEST_DBELL,
+ PROLOG_ADDITION_NONE)
+ EXCEPTION_COMMON(0x2c0)
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl save_nvgprs
+ INTS_RESTORE_HARD
+ bl unknown_exception
+ b ret_from_except
+
+/* Guest Doorbell critical Interrupt */
+ START_EXCEPTION(guest_doorbell_crit);
+ CRIT_EXCEPTION_PROLOG(0x2e0, BOOKE_INTERRUPT_GUEST_DBELL_CRIT,
+ PROLOG_ADDITION_NONE)
+ EXCEPTION_COMMON_CRIT(0x2e0)
+ bl save_nvgprs
+ bl special_reg_save
+ CHECK_NAPPING();
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl unknown_exception
+ b ret_from_crit_except
+
+/* Hypervisor call */
+ START_EXCEPTION(hypercall);
+ NORMAL_EXCEPTION_PROLOG(0x310, BOOKE_INTERRUPT_HV_SYSCALL,
+ PROLOG_ADDITION_NONE)
+ EXCEPTION_COMMON(0x310)
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl save_nvgprs
+ INTS_RESTORE_HARD
+ bl unknown_exception
+ b ret_from_except
+
+/* Embedded Hypervisor priviledged */
+ START_EXCEPTION(ehpriv);
+ NORMAL_EXCEPTION_PROLOG(0x320, BOOKE_INTERRUPT_HV_PRIV,
+ PROLOG_ADDITION_NONE)
+ EXCEPTION_COMMON(0x320)
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl save_nvgprs
+ INTS_RESTORE_HARD
+ bl unknown_exception
+ b ret_from_except
+
+/* LRAT Error interrupt */
+ START_EXCEPTION(lrat_error);
+ NORMAL_EXCEPTION_PROLOG(0x340, BOOKE_INTERRUPT_LRAT_ERROR,
+ PROLOG_ADDITION_NONE)
+ EXCEPTION_COMMON(0x340)
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl .save_nvgprs
+ INTS_RESTORE_HARD
+ bl .unknown_exception
+ b .ret_from_except
/*
- * An interrupt came in while soft-disabled; clear EE in SRR1,
- * clear paca->hard_enabled and return.
+ * An interrupt came in while soft-disabled; We mark paca->irq_happened
+ * accordingly and if the interrupt is level sensitive, we hard disable
*/
-masked_interrupt_book3e:
- mtcr r10
- stb r11,PACAHARDIRQEN(r13)
- mfspr r10,SPRN_SRR1
- rldicl r11,r10,48,1 /* clear MSR_EE */
- rotldi r10,r11,16
- mtspr SPRN_SRR1,r10
- ld r10,PACA_EXGEN+EX_R10(r13); /* restore registers */
- ld r11,PACA_EXGEN+EX_R11(r13);
- mfspr r13,SPRN_SPRG_GEN_SCRATCH;
+
+.macro masked_interrupt_book3e paca_irq full_mask
+ lbz r10,PACAIRQHAPPENED(r13)
+ ori r10,r10,\paca_irq
+ stb r10,PACAIRQHAPPENED(r13)
+
+ .if \full_mask == 1
+ rldicl r10,r11,48,1 /* clear MSR_EE */
+ rotldi r11,r10,16
+ mtspr SPRN_SRR1,r11
+ .endif
+
+ lwz r11,PACA_EXGEN+EX_CR(r13)
+ mtcr r11
+ ld r10,PACA_EXGEN+EX_R10(r13)
+ ld r11,PACA_EXGEN+EX_R11(r13)
+ mfspr r13,SPRN_SPRG_GEN_SCRATCH
rfi
b .
+.endm
+
+masked_interrupt_book3e_0x500:
+ // XXX When adding support for EPR, use PACA_IRQ_EE_EDGE
+ masked_interrupt_book3e PACA_IRQ_EE 1
+
+masked_interrupt_book3e_0x900:
+ ACK_DEC(r10);
+ masked_interrupt_book3e PACA_IRQ_DEC 0
+
+masked_interrupt_book3e_0x980:
+ ACK_FIT(r10);
+ masked_interrupt_book3e PACA_IRQ_DEC 0
+
+masked_interrupt_book3e_0x280:
+masked_interrupt_book3e_0x2c0:
+ masked_interrupt_book3e PACA_IRQ_DBELL 0
+
+/*
+ * Called from arch_local_irq_enable when an interrupt needs
+ * to be resent. r3 contains either 0x500,0x900,0x260 or 0x280
+ * to indicate the kind of interrupt. MSR:EE is already off.
+ * We generate a stackframe like if a real interrupt had happened.
+ *
+ * Note: While MSR:EE is off, we need to make sure that _MSR
+ * in the generated frame has EE set to 1 or the exception
+ * handler will not properly re-enable them.
+ */
+_GLOBAL(__replay_interrupt)
+ /* We are going to jump to the exception common code which
+ * will retrieve various register values from the PACA which
+ * we don't give a damn about.
+ */
+ mflr r10
+ mfmsr r11
+ mfcr r4
+ mtspr SPRN_SPRG_GEN_SCRATCH,r13;
+ std r1,PACA_EXGEN+EX_R1(r13);
+ stw r4,PACA_EXGEN+EX_CR(r13);
+ ori r11,r11,MSR_EE
+ subi r1,r1,INT_FRAME_SIZE;
+ cmpwi cr0,r3,0x500
+ beq exc_0x500_common
+ cmpwi cr0,r3,0x900
+ beq exc_0x900_common
+ cmpwi cr0,r3,0x280
+ beq exc_0x280_common
+ blr
+
/*
* This is called from 0x300 and 0x400 handlers after the prologs with
@@ -499,17 +1014,16 @@ storage_fault_common:
mr r5,r15
ld r14,PACA_EXGEN+EX_R14(r13)
ld r15,PACA_EXGEN+EX_R15(r13)
- INTS_RESTORE_HARD
- bl .do_page_fault
+ bl do_page_fault
cmpdi r3,0
bne- 1f
- b .ret_from_except_lite
-1: bl .save_nvgprs
+ b ret_from_except_lite
+1: bl save_nvgprs
mr r5,r3
addi r3,r1,STACK_FRAME_OVERHEAD
ld r4,_DAR(r1)
- bl .bad_page_fault
- b .ret_from_except
+ bl bad_page_fault
+ b ret_from_except
/*
* Alignment exception doesn't fit entirely in the 0x100 bytes so it
@@ -521,10 +1035,10 @@ alignment_more:
addi r3,r1,STACK_FRAME_OVERHEAD
ld r14,PACA_EXGEN+EX_R14(r13)
ld r15,PACA_EXGEN+EX_R15(r13)
- bl .save_nvgprs
+ bl save_nvgprs
INTS_RESTORE_HARD
- bl .alignment_exception
- b .ret_from_except
+ bl alignment_exception
+ b ret_from_except
/*
* We branch here from entry_64.S for the last stage of the exception
@@ -587,7 +1101,16 @@ fast_exception_return:
BAD_STACK_TRAMPOLINE(0x000)
BAD_STACK_TRAMPOLINE(0x100)
BAD_STACK_TRAMPOLINE(0x200)
+BAD_STACK_TRAMPOLINE(0x220)
+BAD_STACK_TRAMPOLINE(0x260)
+BAD_STACK_TRAMPOLINE(0x280)
+BAD_STACK_TRAMPOLINE(0x2a0)
+BAD_STACK_TRAMPOLINE(0x2c0)
+BAD_STACK_TRAMPOLINE(0x2e0)
BAD_STACK_TRAMPOLINE(0x300)
+BAD_STACK_TRAMPOLINE(0x310)
+BAD_STACK_TRAMPOLINE(0x320)
+BAD_STACK_TRAMPOLINE(0x340)
BAD_STACK_TRAMPOLINE(0x400)
BAD_STACK_TRAMPOLINE(0x500)
BAD_STACK_TRAMPOLINE(0x600)
@@ -600,11 +1123,10 @@ BAD_STACK_TRAMPOLINE(0xa00)
BAD_STACK_TRAMPOLINE(0xb00)
BAD_STACK_TRAMPOLINE(0xc00)
BAD_STACK_TRAMPOLINE(0xd00)
+BAD_STACK_TRAMPOLINE(0xd08)
BAD_STACK_TRAMPOLINE(0xe00)
BAD_STACK_TRAMPOLINE(0xf00)
BAD_STACK_TRAMPOLINE(0xf20)
-BAD_STACK_TRAMPOLINE(0x2070)
-BAD_STACK_TRAMPOLINE(0x2080)
.globl bad_stack_book3e
bad_stack_book3e:
@@ -650,7 +1172,7 @@ bad_stack_book3e:
std r12,0(r11)
ld r2,PACATOC(r13)
1: addi r3,r1,STACK_FRAME_OVERHEAD
- bl .kernel_bad_stack
+ bl kernel_bad_stack
b 1b
/*
@@ -732,7 +1254,7 @@ skpinv: addi r6,r6,1 /* Increment */
bne 1b /* If not, repeat */
/* Invalidate all TLBs */
- PPC_TLBILX_ALL(0,0)
+ PPC_TLBILX_ALL(0,R0)
sync
isync
@@ -785,12 +1307,9 @@ skpinv: addi r6,r6,1 /* Increment */
mtspr SPRN_MAS0,r3
tlbre
mfspr r6,SPRN_MAS1
- rlwinm r6,r6,0,2,0 /* clear IPROT */
+ rlwinm r6,r6,0,2,31 /* clear IPROT and VALID */
mtspr SPRN_MAS1,r6
tlbwe
-
- /* Invalidate TLB1 */
- PPC_TLBILX_ALL(0,0)
sync
isync
@@ -844,12 +1363,9 @@ skpinv: addi r6,r6,1 /* Increment */
mtspr SPRN_MAS0,r4
tlbre
mfspr r5,SPRN_MAS1
- rlwinm r5,r5,0,2,0 /* clear IPROT */
+ rlwinm r5,r5,0,2,31 /* clear IPROT and VALID */
mtspr SPRN_MAS1,r5
tlbwe
-
- /* Invalidate TLB1 */
- PPC_TLBILX_ALL(0,0)
sync
isync
@@ -864,8 +1380,23 @@ have_hes:
* that will have to be made dependent on whether we are running under
* a hypervisor I suppose.
*/
- ori r3,r3,MAS0_HES | MAS0_WQ_ALLWAYS
- mtspr SPRN_MAS0,r3
+
+ /* BEWARE, MAGIC
+ * This code is called as an ordinary function on the boot CPU. But to
+ * avoid duplication, this code is also used in SCOM bringup of
+ * secondary CPUs. We read the code between the initial_tlb_code_start
+ * and initial_tlb_code_end labels one instruction at a time and RAM it
+ * into the new core via SCOM. That doesn't process branches, so there
+ * must be none between those two labels. It also means if this code
+ * ever takes any parameters, the SCOM code must also be updated to
+ * provide them.
+ */
+ .globl a2_tlbinit_code_start
+a2_tlbinit_code_start:
+
+ ori r11,r3,MAS0_WQ_ALLWAYS
+ oris r11,r11,MAS0_ESEL(3)@h /* Use way 3: workaround A2 erratum 376 */
+ mtspr SPRN_MAS0,r11
lis r3,(MAS1_VALID | MAS1_IPROT)@h
ori r3,r3,BOOK3E_PAGESZ_1GB << MAS1_TSIZE_SHIFT
mtspr SPRN_MAS1,r3
@@ -879,18 +1410,70 @@ have_hes:
/* Write the TLB entry */
tlbwe
+ .globl a2_tlbinit_after_linear_map
+a2_tlbinit_after_linear_map:
+
/* Now we branch the new virtual address mapped by this entry */
LOAD_REG_IMMEDIATE(r3,1f)
mtctr r3
bctr
1: /* We are now running at PAGE_OFFSET, clean the TLB of everything
- * else (XXX we should scan for bolted crap from the firmware too)
+ * else (including IPROTed things left by firmware)
+ * r4 = TLBnCFG
+ * r3 = current address (more or less)
*/
- PPC_TLBILX(0,0,0)
+
+ li r5,0
+ mtspr SPRN_MAS6,r5
+ tlbsx 0,r3
+
+ rlwinm r9,r4,0,TLBnCFG_N_ENTRY
+ rlwinm r10,r4,8,0xff
+ addi r10,r10,-1 /* Get inner loop mask */
+
+ li r3,1
+
+ mfspr r5,SPRN_MAS1
+ rlwinm r5,r5,0,(~(MAS1_VALID|MAS1_IPROT))
+
+ mfspr r6,SPRN_MAS2
+ rldicr r6,r6,0,51 /* Extract EPN */
+
+ mfspr r7,SPRN_MAS0
+ rlwinm r7,r7,0,0xffff0fff /* Clear HES and WQ */
+
+ rlwinm r8,r7,16,0xfff /* Extract ESEL */
+
+2: add r4,r3,r8
+ and r4,r4,r10
+
+ rlwimi r7,r4,16,MAS0_ESEL_MASK
+
+ mtspr SPRN_MAS0,r7
+ mtspr SPRN_MAS1,r5
+ mtspr SPRN_MAS2,r6
+ tlbwe
+
+ addi r3,r3,1
+ and. r4,r3,r10
+
+ bne 3f
+ addis r6,r6,(1<<30)@h
+3:
+ cmpw r3,r9
+ blt 2b
+
+ .globl a2_tlbinit_after_iprot_flush
+a2_tlbinit_after_iprot_flush:
+
+ PPC_TLBILX(0,0,R0)
sync
isync
+ .globl a2_tlbinit_code_end
+a2_tlbinit_code_end:
+
/* We translate LR and return */
mflr r3
tovirt(r3,r3)
@@ -922,13 +1505,13 @@ _GLOBAL(start_initialization_book3e)
* and always use AS 0, so we just set it up to match our link
* address and never use 0 based addresses.
*/
- bl .initial_tlb_book3e
+ bl initial_tlb_book3e
/* Init global core bits */
- bl .init_core_book3e
+ bl init_core_book3e
/* Init per-thread bits */
- bl .init_thread_book3e
+ bl init_thread_book3e
/* Return to common init code */
tovirt(r28,r28)
@@ -949,7 +1532,7 @@ _GLOBAL(start_initialization_book3e)
*/
_GLOBAL(book3e_secondary_core_init_tlb_set)
li r4,1
- b .generic_secondary_smp_init
+ b generic_secondary_smp_init
_GLOBAL(book3e_secondary_core_init)
mflr r28
@@ -959,18 +1542,18 @@ _GLOBAL(book3e_secondary_core_init)
bne 2f
/* Setup TLB for this core */
- bl .initial_tlb_book3e
+ bl initial_tlb_book3e
/* We can return from the above running at a different
* address, so recalculate r2 (TOC)
*/
- bl .relative_toc
+ bl relative_toc
/* Init global core bits */
-2: bl .init_core_book3e
+2: bl init_core_book3e
/* Init per-thread bits */
-3: bl .init_thread_book3e
+3: bl init_thread_book3e
/* Return to common init code at proper virtual address.
*
@@ -997,14 +1580,14 @@ _GLOBAL(book3e_secondary_thread_init)
mflr r28
b 3b
-_STATIC(init_core_book3e)
+init_core_book3e:
/* Establish the interrupt vector base */
LOAD_REG_IMMEDIATE(r3, interrupt_base_book3e)
mtspr SPRN_IVPR,r3
sync
blr
-_STATIC(init_thread_book3e)
+init_thread_book3e:
lis r3,(SPRN_EPCR_ICM | SPRN_EPCR_GICM)@h
mtspr SPRN_EPCR,r3
@@ -1040,3 +1623,28 @@ _GLOBAL(__setup_base_ivors)
sync
blr
+
+_GLOBAL(setup_altivec_ivors)
+ SET_IVOR(32, 0x200) /* AltiVec Unavailable */
+ SET_IVOR(33, 0x220) /* AltiVec Assist */
+ blr
+
+_GLOBAL(setup_perfmon_ivor)
+ SET_IVOR(35, 0x260) /* Performance Monitor */
+ blr
+
+_GLOBAL(setup_doorbell_ivors)
+ SET_IVOR(36, 0x280) /* Processor Doorbell */
+ SET_IVOR(37, 0x2a0) /* Processor Doorbell Crit */
+ blr
+
+_GLOBAL(setup_ehv_ivors)
+ SET_IVOR(40, 0x300) /* Embedded Hypervisor System Call */
+ SET_IVOR(41, 0x320) /* Embedded Hypervisor Privilege */
+ SET_IVOR(38, 0x2c0) /* Guest Processor Doorbell */
+ SET_IVOR(39, 0x2e0) /* Guest Processor Doorbell Crit/MC */
+ blr
+
+_GLOBAL(setup_lrat_ivor)
+ SET_IVOR(42, 0x340) /* LRAT Error */
+ blr
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 9f8b01d6466..a7d36b19221 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -5,24 +5,82 @@
* handling and other fixed offset specific things.
*
* This file is meant to be #included from head_64.S due to
- * position dependant assembly.
+ * position dependent assembly.
*
* Most of this originates from head_64.S and thus has the same
* copyright history.
*
*/
+#include <asm/hw_irq.h>
#include <asm/exception-64s.h>
+#include <asm/ptrace.h>
/*
* We layout physical memory as follows:
* 0x0000 - 0x00ff : Secondary processor spin code
- * 0x0100 - 0x2fff : pSeries Interrupt prologs
- * 0x3000 - 0x5fff : interrupt support, iSeries and common interrupt prologs
- * 0x6000 - 0x6fff : Initial (CPU0) segment table
+ * 0x0100 - 0x17ff : pSeries Interrupt prologs
+ * 0x1800 - 0x4000 : interrupt support common interrupt prologs
+ * 0x4000 - 0x5fff : pSeries interrupts with IR=1,DR=1
+ * 0x6000 - 0x6fff : more interrupt support including for IR=1,DR=1
* 0x7000 - 0x7fff : FWNMI data area
- * 0x8000 - : Early init and support code
+ * 0x8000 - 0x8fff : Initial (CPU0) segment table
+ * 0x9000 - : Early init and support code
*/
+ /* Syscall routine is used twice, in reloc-off and reloc-on paths */
+#define SYSCALL_PSERIES_1 \
+BEGIN_FTR_SECTION \
+ cmpdi r0,0x1ebe ; \
+ beq- 1f ; \
+END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
+ mr r9,r13 ; \
+ GET_PACA(r13) ; \
+ mfspr r11,SPRN_SRR0 ; \
+0:
+
+#define SYSCALL_PSERIES_2_RFID \
+ mfspr r12,SPRN_SRR1 ; \
+ ld r10,PACAKBASE(r13) ; \
+ LOAD_HANDLER(r10, system_call_entry) ; \
+ mtspr SPRN_SRR0,r10 ; \
+ ld r10,PACAKMSR(r13) ; \
+ mtspr SPRN_SRR1,r10 ; \
+ rfid ; \
+ b . ; /* prevent speculative execution */
+
+#define SYSCALL_PSERIES_3 \
+ /* Fast LE/BE switch system call */ \
+1: mfspr r12,SPRN_SRR1 ; \
+ xori r12,r12,MSR_LE ; \
+ mtspr SPRN_SRR1,r12 ; \
+ rfid ; /* return to userspace */ \
+ b . ; /* prevent speculative execution */
+
+#if defined(CONFIG_RELOCATABLE)
+ /*
+ * We can't branch directly; in the direct case we use LR
+ * and system_call_entry restores LR. (We thus need to move
+ * LR to r10 in the RFID case too.)
+ */
+#define SYSCALL_PSERIES_2_DIRECT \
+ mflr r10 ; \
+ ld r12,PACAKBASE(r13) ; \
+ LOAD_HANDLER(r12, system_call_entry_direct) ; \
+ mtctr r12 ; \
+ mfspr r12,SPRN_SRR1 ; \
+ /* Re-use of r13... No spare regs to do this */ \
+ li r13,MSR_RI ; \
+ mtmsrd r13,1 ; \
+ GET_PACA(r13) ; /* get r13 back */ \
+ bctr ;
+#else
+ /* We can branch directly */
+#define SYSCALL_PSERIES_2_DIRECT \
+ mfspr r12,SPRN_SRR1 ; \
+ li r10,MSR_RI ; \
+ mtmsrd r10,1 ; /* Set RI (EE=0) */ \
+ b system_call_entry_direct ;
+#endif
/*
* This is the start of the interrupt handlers for pSeries
@@ -36,220 +94,618 @@
.globl __start_interrupts
__start_interrupts:
- STD_EXCEPTION_PSERIES(0x100, system_reset)
+ .globl system_reset_pSeries;
+system_reset_pSeries:
+ HMT_MEDIUM_PPR_DISCARD
+ SET_SCRATCH0(r13)
+#ifdef CONFIG_PPC_P7_NAP
+BEGIN_FTR_SECTION
+ /* Running native on arch 2.06 or later, check if we are
+ * waking up from nap. We only handle no state loss and
+ * supervisor state loss. We do -not- handle hypervisor
+ * state loss at this time.
+ */
+ mfspr r13,SPRN_SRR1
+ rlwinm. r13,r13,47-31,30,31
+ beq 9f
+
+ /* waking up from powersave (nap) state */
+ cmpwi cr1,r13,2
+ /* Total loss of HV state is fatal, we could try to use the
+ * PIR to locate a PACA, then use an emergency stack etc...
+ * OPAL v3 based powernv platforms have new idle states
+ * which fall in this catagory.
+ */
+ bgt cr1,8f
+ GET_PACA(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ li r0,KVM_HWTHREAD_IN_KERNEL
+ stb r0,HSTATE_HWTHREAD_STATE(r13)
+ /* Order setting hwthread_state vs. testing hwthread_req */
+ sync
+ lbz r0,HSTATE_HWTHREAD_REQ(r13)
+ cmpwi r0,0
+ beq 1f
+ b kvm_start_guest
+1:
+#endif
+
+ beq cr1,2f
+ b power7_wakeup_noloss
+2: b power7_wakeup_loss
+
+ /* Fast Sleep wakeup on PowerNV */
+8: GET_PACA(r13)
+ b power7_wakeup_tb_loss
+
+9:
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+#endif /* CONFIG_PPC_P7_NAP */
+ EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
+ NOTEST, 0x100)
. = 0x200
-_machine_check_pSeries:
- HMT_MEDIUM
- DO_KVM 0x200
- mtspr SPRN_SPRG_SCRATCH0,r13 /* save r13 */
- EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common)
+machine_check_pSeries_1:
+ /* This is moved out of line as it can be patched by FW, but
+ * some code path might still want to branch into the original
+ * vector
+ */
+ HMT_MEDIUM_PPR_DISCARD
+ SET_SCRATCH0(r13) /* save r13 */
+#ifdef CONFIG_PPC_P7_NAP
+BEGIN_FTR_SECTION
+ /* Running native on arch 2.06 or later, check if we are
+ * waking up from nap. We only handle no state loss and
+ * supervisor state loss. We do -not- handle hypervisor
+ * state loss at this time.
+ */
+ mfspr r13,SPRN_SRR1
+ rlwinm. r13,r13,47-31,30,31
+ OPT_GET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR)
+ beq 9f
+
+ mfspr r13,SPRN_SRR1
+ rlwinm. r13,r13,47-31,30,31
+ /* waking up from powersave (nap) state */
+ cmpwi cr1,r13,2
+ /* Total loss of HV state is fatal. let's just stay stuck here */
+ OPT_GET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR)
+ bgt cr1,.
+9:
+ OPT_SET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR)
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+#endif /* CONFIG_PPC_P7_NAP */
+ EXCEPTION_PROLOG_0(PACA_EXMC)
+BEGIN_FTR_SECTION
+ b machine_check_pSeries_early
+FTR_SECTION_ELSE
+ b machine_check_pSeries_0
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
. = 0x300
.globl data_access_pSeries
data_access_pSeries:
- HMT_MEDIUM
- DO_KVM 0x300
- mtspr SPRN_SPRG_SCRATCH0,r13
+ HMT_MEDIUM_PPR_DISCARD
+ SET_SCRATCH0(r13)
BEGIN_FTR_SECTION
- mfspr r13,SPRN_SPRG_PACA
- std r9,PACA_EXSLB+EX_R9(r13)
- std r10,PACA_EXSLB+EX_R10(r13)
- mfspr r10,SPRN_DAR
- mfspr r9,SPRN_DSISR
- srdi r10,r10,60
- rlwimi r10,r9,16,0x20
- mfcr r9
- cmpwi r10,0x2c
- beq do_stab_bolted_pSeries
- ld r10,PACA_EXSLB+EX_R10(r13)
- std r11,PACA_EXGEN+EX_R11(r13)
- ld r11,PACA_EXSLB+EX_R9(r13)
- std r12,PACA_EXGEN+EX_R12(r13)
- mfspr r12,SPRN_SPRG_SCRATCH0
- std r10,PACA_EXGEN+EX_R10(r13)
- std r11,PACA_EXGEN+EX_R9(r13)
- std r12,PACA_EXGEN+EX_R13(r13)
- EXCEPTION_PROLOG_PSERIES_1(data_access_common)
-FTR_SECTION_ELSE
- EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common)
-ALT_FTR_SECTION_END_IFCLR(CPU_FTR_SLB)
+ b data_access_check_stab
+data_access_not_stab:
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
+ EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
+ KVMTEST, 0x300)
. = 0x380
.globl data_access_slb_pSeries
data_access_slb_pSeries:
- HMT_MEDIUM
- DO_KVM 0x380
- mtspr SPRN_SPRG_SCRATCH0,r13
- mfspr r13,SPRN_SPRG_PACA /* get paca address into r13 */
+ HMT_MEDIUM_PPR_DISCARD
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXSLB)
+ EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
std r3,PACA_EXSLB+EX_R3(r13)
mfspr r3,SPRN_DAR
- std r9,PACA_EXSLB+EX_R9(r13) /* save r9 - r12 */
- mfcr r9
#ifdef __DISABLED__
/* Keep that around for when we re-implement dynamic VSIDs */
cmpdi r3,0
bge slb_miss_user_pseries
#endif /* __DISABLED__ */
- std r10,PACA_EXSLB+EX_R10(r13)
- std r11,PACA_EXSLB+EX_R11(r13)
- std r12,PACA_EXSLB+EX_R12(r13)
- mfspr r10,SPRN_SPRG_SCRATCH0
- std r10,PACA_EXSLB+EX_R13(r13)
- mfspr r12,SPRN_SRR1 /* and SRR1 */
+ mfspr r12,SPRN_SRR1
#ifndef CONFIG_RELOCATABLE
- b .slb_miss_realmode
+ b slb_miss_realmode
#else
/*
- * We can't just use a direct branch to .slb_miss_realmode
+ * We can't just use a direct branch to slb_miss_realmode
* because the distance from here to there depends on where
* the kernel ends up being put.
*/
mfctr r11
ld r10,PACAKBASE(r13)
- LOAD_HANDLER(r10, .slb_miss_realmode)
+ LOAD_HANDLER(r10, slb_miss_realmode)
mtctr r10
bctr
#endif
- STD_EXCEPTION_PSERIES(0x400, instruction_access)
+ STD_EXCEPTION_PSERIES(0x400, 0x400, instruction_access)
. = 0x480
.globl instruction_access_slb_pSeries
instruction_access_slb_pSeries:
- HMT_MEDIUM
- DO_KVM 0x480
- mtspr SPRN_SPRG_SCRATCH0,r13
- mfspr r13,SPRN_SPRG_PACA /* get paca address into r13 */
+ HMT_MEDIUM_PPR_DISCARD
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXSLB)
+ EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480)
std r3,PACA_EXSLB+EX_R3(r13)
mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */
- std r9,PACA_EXSLB+EX_R9(r13) /* save r9 - r12 */
- mfcr r9
#ifdef __DISABLED__
/* Keep that around for when we re-implement dynamic VSIDs */
cmpdi r3,0
bge slb_miss_user_pseries
#endif /* __DISABLED__ */
- std r10,PACA_EXSLB+EX_R10(r13)
- std r11,PACA_EXSLB+EX_R11(r13)
- std r12,PACA_EXSLB+EX_R12(r13)
- mfspr r10,SPRN_SPRG_SCRATCH0
- std r10,PACA_EXSLB+EX_R13(r13)
- mfspr r12,SPRN_SRR1 /* and SRR1 */
+ mfspr r12,SPRN_SRR1
#ifndef CONFIG_RELOCATABLE
- b .slb_miss_realmode
+ b slb_miss_realmode
#else
mfctr r11
ld r10,PACAKBASE(r13)
- LOAD_HANDLER(r10, .slb_miss_realmode)
+ LOAD_HANDLER(r10, slb_miss_realmode)
mtctr r10
bctr
#endif
- MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt)
- STD_EXCEPTION_PSERIES(0x600, alignment)
- STD_EXCEPTION_PSERIES(0x700, program_check)
- STD_EXCEPTION_PSERIES(0x800, fp_unavailable)
- MASKABLE_EXCEPTION_PSERIES(0x900, decrementer)
- STD_EXCEPTION_PSERIES(0xa00, trap_0a)
- STD_EXCEPTION_PSERIES(0xb00, trap_0b)
+ /* We open code these as we can't have a ". = x" (even with
+ * x = "." within a feature section
+ */
+ . = 0x500;
+ .globl hardware_interrupt_pSeries;
+ .globl hardware_interrupt_hv;
+hardware_interrupt_pSeries:
+hardware_interrupt_hv:
+ HMT_MEDIUM_PPR_DISCARD
+ BEGIN_FTR_SECTION
+ _MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
+ EXC_HV, SOFTEN_TEST_HV)
+ KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
+ FTR_SECTION_ELSE
+ _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
+ EXC_STD, SOFTEN_TEST_HV_201)
+ KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
+ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+
+ STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
+ KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x600)
+
+ STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
+ KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x700)
+
+ STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
+ KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800)
+
+ . = 0x900
+ .globl decrementer_pSeries
+decrementer_pSeries:
+ _MASKABLE_EXCEPTION_PSERIES(0x900, decrementer, EXC_STD, SOFTEN_TEST_PR)
+
+ STD_EXCEPTION_HV(0x980, 0x982, hdecrementer)
+
+ MASKABLE_EXCEPTION_PSERIES(0xa00, 0xa00, doorbell_super)
+ KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xa00)
+
+ STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
+ KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xb00)
. = 0xc00
.globl system_call_pSeries
system_call_pSeries:
HMT_MEDIUM
- DO_KVM 0xc00
-BEGIN_FTR_SECTION
- cmpdi r0,0x1ebe
- beq- 1f
-END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
- mr r9,r13
- mfspr r13,SPRN_SPRG_PACA
- mfspr r11,SPRN_SRR0
- ld r12,PACAKBASE(r13)
- ld r10,PACAKMSR(r13)
- LOAD_HANDLER(r12, system_call_entry)
- mtspr SPRN_SRR0,r12
- mfspr r12,SPRN_SRR1
- mtspr SPRN_SRR1,r10
- rfid
- b . /* prevent speculative execution */
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+ SET_SCRATCH0(r13)
+ GET_PACA(r13)
+ std r9,PACA_EXGEN+EX_R9(r13)
+ std r10,PACA_EXGEN+EX_R10(r13)
+ mfcr r9
+ KVMTEST(0xc00)
+ GET_SCRATCH0(r13)
+#endif
+ SYSCALL_PSERIES_1
+ SYSCALL_PSERIES_2_RFID
+ SYSCALL_PSERIES_3
+ KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00)
-/* Fast LE/BE switch system call */
-1: mfspr r12,SPRN_SRR1
- xori r12,r12,MSR_LE
- mtspr SPRN_SRR1,r12
- rfid /* return to userspace */
- b .
+ STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
+ KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xd00)
- STD_EXCEPTION_PSERIES(0xd00, single_step)
- STD_EXCEPTION_PSERIES(0xe00, trap_0e)
+ /* At 0xe??? we have a bunch of hypervisor exceptions, we branch
+ * out of line to handle them
+ */
+ . = 0xe00
+hv_data_storage_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b h_data_storage_hv
+
+ . = 0xe20
+hv_instr_storage_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b h_instr_storage_hv
+
+ . = 0xe40
+emulation_assist_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b emulation_assist_hv
+
+ . = 0xe60
+hv_exception_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b hmi_exception_hv
+
+ . = 0xe80
+hv_doorbell_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b h_doorbell_hv
/* We need to deal with the Altivec unavailable exception
* here which is at 0xf20, thus in the middle of the
* prolog code of the PerformanceMonitor one. A little
* trickery is thus necessary
*/
-performance_monitor_pSeries_1:
. = 0xf00
- DO_KVM 0xf00
+performance_monitor_pseries_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
b performance_monitor_pSeries
-altivec_unavailable_pSeries_1:
. = 0xf20
- DO_KVM 0xf20
+altivec_unavailable_pseries_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
b altivec_unavailable_pSeries
-vsx_unavailable_pSeries_1:
. = 0xf40
- DO_KVM 0xf40
+vsx_unavailable_pseries_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
b vsx_unavailable_pSeries
+ . = 0xf60
+facility_unavailable_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b facility_unavailable_pSeries
+
+ . = 0xf80
+hv_facility_unavailable_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b facility_unavailable_hv
+
#ifdef CONFIG_CBE_RAS
- HSTD_EXCEPTION_PSERIES(0x1200, cbe_system_error)
+ STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
+ KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
#endif /* CONFIG_CBE_RAS */
- STD_EXCEPTION_PSERIES(0x1300, instruction_breakpoint)
+
+ STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
+ KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
+
+ . = 0x1500
+ .global denorm_exception_hv
+denorm_exception_hv:
+ HMT_MEDIUM_PPR_DISCARD
+ mtspr SPRN_SPRG_HSCRATCH0,r13
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x1500)
+
+#ifdef CONFIG_PPC_DENORMALISATION
+ mfspr r10,SPRN_HSRR1
+ mfspr r11,SPRN_HSRR0 /* save HSRR0 */
+ andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */
+ addi r11,r11,-4 /* HSRR0 is next instruction */
+ bne+ denorm_assist
+#endif
+
+ KVMTEST(0x1500)
+ EXCEPTION_PROLOG_PSERIES_1(denorm_common, EXC_HV)
+ KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1500)
+
#ifdef CONFIG_CBE_RAS
- HSTD_EXCEPTION_PSERIES(0x1600, cbe_maintenance)
+ STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
+ KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
#endif /* CONFIG_CBE_RAS */
- STD_EXCEPTION_PSERIES(0x1700, altivec_assist)
+
+ STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
+ KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x1700)
+
#ifdef CONFIG_CBE_RAS
- HSTD_EXCEPTION_PSERIES(0x1800, cbe_thermal)
+ STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
+ KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
+#else
+ . = 0x1800
#endif /* CONFIG_CBE_RAS */
- . = 0x3000
-/*** pSeries interrupt support ***/
+/*** Out of line interrupts support ***/
- /* moved from 0xf00 */
- STD_EXCEPTION_PSERIES(., performance_monitor)
- STD_EXCEPTION_PSERIES(., altivec_unavailable)
- STD_EXCEPTION_PSERIES(., vsx_unavailable)
+ .align 7
+ /* moved from 0x200 */
+machine_check_pSeries_early:
+BEGIN_FTR_SECTION
+ EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
+ /*
+ * Register contents:
+ * R13 = PACA
+ * R9 = CR
+ * Original R9 to R13 is saved on PACA_EXMC
+ *
+ * Switch to mc_emergency stack and handle re-entrancy (we limit
+ * the nested MCE upto level 4 to avoid stack overflow).
+ * Save MCE registers srr1, srr0, dar and dsisr and then set ME=1
+ *
+ * We use paca->in_mce to check whether this is the first entry or
+ * nested machine check. We increment paca->in_mce to track nested
+ * machine checks.
+ *
+ * If this is the first entry then set stack pointer to
+ * paca->mc_emergency_sp, otherwise r1 is already pointing to
+ * stack frame on mc_emergency stack.
+ *
+ * NOTE: We are here with MSR_ME=0 (off), which means we risk a
+ * checkstop if we get another machine check exception before we do
+ * rfid with MSR_ME=1.
+ */
+ mr r11,r1 /* Save r1 */
+ lhz r10,PACA_IN_MCE(r13)
+ cmpwi r10,0 /* Are we in nested machine check */
+ bne 0f /* Yes, we are. */
+ /* First machine check entry */
+ ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */
+0: subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */
+ addi r10,r10,1 /* increment paca->in_mce */
+ sth r10,PACA_IN_MCE(r13)
+ /* Limit nested MCE to level 4 to avoid stack overflow */
+ cmpwi r10,4
+ bgt 2f /* Check if we hit limit of 4 */
+ std r11,GPR1(r1) /* Save r1 on the stack. */
+ std r11,0(r1) /* make stack chain pointer */
+ mfspr r11,SPRN_SRR0 /* Save SRR0 */
+ std r11,_NIP(r1)
+ mfspr r11,SPRN_SRR1 /* Save SRR1 */
+ std r11,_MSR(r1)
+ mfspr r11,SPRN_DAR /* Save DAR */
+ std r11,_DAR(r1)
+ mfspr r11,SPRN_DSISR /* Save DSISR */
+ std r11,_DSISR(r1)
+ std r9,_CCR(r1) /* Save CR in stackframe */
+ /* Save r9 through r13 from EXMC save area to stack frame. */
+ EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
+ mfmsr r11 /* get MSR value */
+ ori r11,r11,MSR_ME /* turn on ME bit */
+ ori r11,r11,MSR_RI /* turn on RI bit */
+ ld r12,PACAKBASE(r13) /* get high part of &label */
+ LOAD_HANDLER(r12, machine_check_handle_early)
+1: mtspr SPRN_SRR0,r12
+ mtspr SPRN_SRR1,r11
+ rfid
+ b . /* prevent speculative execution */
+2:
+ /* Stack overflow. Stay on emergency stack and panic.
+ * Keep the ME bit off while panic-ing, so that if we hit
+ * another machine check we checkstop.
+ */
+ addi r1,r1,INT_FRAME_SIZE /* go back to previous stack frame */
+ ld r11,PACAKMSR(r13)
+ ld r12,PACAKBASE(r13)
+ LOAD_HANDLER(r12, unrecover_mce)
+ li r10,MSR_ME
+ andc r11,r11,r10 /* Turn off MSR_ME */
+ b 1b
+ b . /* prevent speculative execution */
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+machine_check_pSeries:
+ .globl machine_check_fwnmi
+machine_check_fwnmi:
+ HMT_MEDIUM_PPR_DISCARD
+ SET_SCRATCH0(r13) /* save r13 */
+ EXCEPTION_PROLOG_0(PACA_EXMC)
+machine_check_pSeries_0:
+ EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST, 0x200)
+ EXCEPTION_PROLOG_PSERIES_1(machine_check_common, EXC_STD)
+ KVM_HANDLER_SKIP(PACA_EXMC, EXC_STD, 0x200)
+
+ /* moved from 0x300 */
+data_access_check_stab:
+ GET_PACA(r13)
+ std r9,PACA_EXSLB+EX_R9(r13)
+ std r10,PACA_EXSLB+EX_R10(r13)
+ mfspr r10,SPRN_DAR
+ mfspr r9,SPRN_DSISR
+ srdi r10,r10,60
+ rlwimi r10,r9,16,0x20
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+ lbz r9,HSTATE_IN_GUEST(r13)
+ rlwimi r10,r9,8,0x300
+#endif
+ mfcr r9
+ cmpwi r10,0x2c
+ beq do_stab_bolted_pSeries
+ mtcrf 0x80,r9
+ ld r9,PACA_EXSLB+EX_R9(r13)
+ ld r10,PACA_EXSLB+EX_R10(r13)
+ b data_access_not_stab
+do_stab_bolted_pSeries:
+ std r11,PACA_EXSLB+EX_R11(r13)
+ std r12,PACA_EXSLB+EX_R12(r13)
+ GET_SCRATCH0(r10)
+ std r10,PACA_EXSLB+EX_R13(r13)
+ EXCEPTION_PROLOG_PSERIES_1(do_stab_bolted, EXC_STD)
+
+ KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
+ KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
+ KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400)
+ KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480)
+ KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900)
+ KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
+
+#ifdef CONFIG_PPC_DENORMALISATION
+denorm_assist:
+BEGIN_FTR_SECTION
+/*
+ * To denormalise we need to move a copy of the register to itself.
+ * For POWER6 do that here for all FP regs.
+ */
+ mfmsr r10
+ ori r10,r10,(MSR_FP|MSR_FE0|MSR_FE1)
+ xori r10,r10,(MSR_FE0|MSR_FE1)
+ mtmsrd r10
+ sync
+
+#define FMR2(n) fmr (n), (n) ; fmr n+1, n+1
+#define FMR4(n) FMR2(n) ; FMR2(n+2)
+#define FMR8(n) FMR4(n) ; FMR4(n+4)
+#define FMR16(n) FMR8(n) ; FMR8(n+8)
+#define FMR32(n) FMR16(n) ; FMR16(n+16)
+ FMR32(0)
+
+FTR_SECTION_ELSE
+/*
+ * To denormalise we need to move a copy of the register to itself.
+ * For POWER7 do that here for the first 32 VSX registers only.
+ */
+ mfmsr r10
+ oris r10,r10,MSR_VSX@h
+ mtmsrd r10
+ sync
+
+#define XVCPSGNDP2(n) XVCPSGNDP(n,n,n) ; XVCPSGNDP(n+1,n+1,n+1)
+#define XVCPSGNDP4(n) XVCPSGNDP2(n) ; XVCPSGNDP2(n+2)
+#define XVCPSGNDP8(n) XVCPSGNDP4(n) ; XVCPSGNDP4(n+4)
+#define XVCPSGNDP16(n) XVCPSGNDP8(n) ; XVCPSGNDP8(n+8)
+#define XVCPSGNDP32(n) XVCPSGNDP16(n) ; XVCPSGNDP16(n+16)
+ XVCPSGNDP32(0)
+
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_206)
+
+BEGIN_FTR_SECTION
+ b denorm_done
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
/*
- * An interrupt came in while soft-disabled; clear EE in SRR1,
- * clear paca->hard_enabled and return.
+ * To denormalise we need to move a copy of the register to itself.
+ * For POWER8 we need to do that for all 64 VSX registers
*/
-masked_interrupt:
- stb r10,PACAHARDIRQEN(r13)
+ XVCPSGNDP32(32)
+denorm_done:
+ mtspr SPRN_HSRR0,r11
mtcrf 0x80,r9
ld r9,PACA_EXGEN+EX_R9(r13)
- mfspr r10,SPRN_SRR1
- rldicl r10,r10,48,1 /* clear MSR_EE */
- rotldi r10,r10,16
- mtspr SPRN_SRR1,r10
+ RESTORE_PPR_PACA(PACA_EXGEN, r10)
+BEGIN_FTR_SECTION
+ ld r10,PACA_EXGEN+EX_CFAR(r13)
+ mtspr SPRN_CFAR,r10
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
ld r10,PACA_EXGEN+EX_R10(r13)
- mfspr r13,SPRN_SPRG_SCRATCH0
- rfid
+ ld r11,PACA_EXGEN+EX_R11(r13)
+ ld r12,PACA_EXGEN+EX_R12(r13)
+ ld r13,PACA_EXGEN+EX_R13(r13)
+ HRFID
b .
+#endif
.align 7
-do_stab_bolted_pSeries:
- std r11,PACA_EXSLB+EX_R11(r13)
- std r12,PACA_EXSLB+EX_R12(r13)
- mfspr r10,SPRN_SPRG_SCRATCH0
- std r10,PACA_EXSLB+EX_R13(r13)
- EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted)
+ /* moved from 0xe00 */
+ STD_EXCEPTION_HV_OOL(0xe02, h_data_storage)
+ KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0xe02)
+ STD_EXCEPTION_HV_OOL(0xe22, h_instr_storage)
+ KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe22)
+ STD_EXCEPTION_HV_OOL(0xe42, emulation_assist)
+ KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe42)
+ STD_EXCEPTION_HV_OOL(0xe62, hmi_exception) /* need to flush cache ? */
+ KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62)
+ MASKABLE_EXCEPTION_HV_OOL(0xe82, h_doorbell)
+ KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe82)
+
+ /* moved from 0xf00 */
+ STD_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
+ KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf00)
+ STD_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
+ KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20)
+ STD_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
+ KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
+ STD_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
+ KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf60)
+ STD_EXCEPTION_HV_OOL(0xf82, facility_unavailable)
+ KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xf82)
+
+/*
+ * An interrupt came in while soft-disabled. We set paca->irq_happened, then:
+ * - If it was a decrementer interrupt, we bump the dec to max and and return.
+ * - If it was a doorbell we return immediately since doorbells are edge
+ * triggered and won't automatically refire.
+ * - else we hard disable and return.
+ * This is called with r10 containing the value to OR to the paca field.
+ */
+#define MASKED_INTERRUPT(_H) \
+masked_##_H##interrupt: \
+ std r11,PACA_EXGEN+EX_R11(r13); \
+ lbz r11,PACAIRQHAPPENED(r13); \
+ or r11,r11,r10; \
+ stb r11,PACAIRQHAPPENED(r13); \
+ cmpwi r10,PACA_IRQ_DEC; \
+ bne 1f; \
+ lis r10,0x7fff; \
+ ori r10,r10,0xffff; \
+ mtspr SPRN_DEC,r10; \
+ b 2f; \
+1: cmpwi r10,PACA_IRQ_DBELL; \
+ beq 2f; \
+ mfspr r10,SPRN_##_H##SRR1; \
+ rldicl r10,r10,48,1; /* clear MSR_EE */ \
+ rotldi r10,r10,16; \
+ mtspr SPRN_##_H##SRR1,r10; \
+2: mtcrf 0x80,r9; \
+ ld r9,PACA_EXGEN+EX_R9(r13); \
+ ld r10,PACA_EXGEN+EX_R10(r13); \
+ ld r11,PACA_EXGEN+EX_R11(r13); \
+ GET_SCRATCH0(r13); \
+ ##_H##rfid; \
+ b .
+
+ MASKED_INTERRUPT()
+ MASKED_INTERRUPT(H)
+
+/*
+ * Called from arch_local_irq_enable when an interrupt needs
+ * to be resent. r3 contains 0x500, 0x900, 0xa00 or 0xe80 to indicate
+ * which kind of interrupt. MSR:EE is already off. We generate a
+ * stackframe like if a real interrupt had happened.
+ *
+ * Note: While MSR:EE is off, we need to make sure that _MSR
+ * in the generated frame has EE set to 1 or the exception
+ * handler will not properly re-enable them.
+ */
+_GLOBAL(__replay_interrupt)
+ /* We are going to jump to the exception common code which
+ * will retrieve various register values from the PACA which
+ * we don't give a damn about, so we don't bother storing them.
+ */
+ mfmsr r12
+ mflr r11
+ mfcr r9
+ ori r12,r12,MSR_EE
+ cmpwi r3,0x900
+ beq decrementer_common
+ cmpwi r3,0x500
+ beq hardware_interrupt_common
+BEGIN_FTR_SECTION
+ cmpwi r3,0xe80
+ beq h_doorbell_common
+FTR_SECTION_ELSE
+ cmpwi r3,0xa00
+ beq doorbell_super_common
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+ blr
#ifdef CONFIG_PPC_PSERIES
/*
@@ -258,16 +714,10 @@ do_stab_bolted_pSeries:
.globl system_reset_fwnmi
.align 7
system_reset_fwnmi:
- HMT_MEDIUM
- mtspr SPRN_SPRG_SCRATCH0,r13 /* save r13 */
- EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common)
-
- .globl machine_check_fwnmi
- .align 7
-machine_check_fwnmi:
- HMT_MEDIUM
- mtspr SPRN_SPRG_SCRATCH0,r13 /* save r13 */
- EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common)
+ HMT_MEDIUM_PPR_DISCARD
+ SET_SCRATCH0(r13) /* save r13 */
+ EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
+ NOTEST, 0x100)
#endif /* CONFIG_PPC_PSERIES */
@@ -281,7 +731,7 @@ slb_miss_user_pseries:
std r10,PACA_EXGEN+EX_R10(r13)
std r11,PACA_EXGEN+EX_R11(r13)
std r12,PACA_EXGEN+EX_R12(r13)
- mfspr r10,SPRG_SCRATCH0
+ GET_SCRATCH0(r10)
ld r11,PACA_EXSLB+EX_R9(r13)
ld r12,PACA_EXSLB+EX_R3(r13)
std r10,PACA_EXGEN+EX_R13(r13)
@@ -299,65 +749,241 @@ slb_miss_user_pseries:
b . /* prevent spec. execution */
#endif /* __DISABLED__ */
-/* KVM's trampoline code needs to be close to the interrupt handlers */
-
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-#include "../kvm/book3s_rmhandlers.S"
-#endif
+kvmppc_skip_interrupt:
+ /*
+ * Here all GPRs are unchanged from when the interrupt happened
+ * except for r13, which is saved in SPRG_SCRATCH0.
+ */
+ mfspr r13, SPRN_SRR0
+ addi r13, r13, 4
+ mtspr SPRN_SRR0, r13
+ GET_SCRATCH0(r13)
+ rfid
+ b .
- .align 7
- .globl __end_interrupts
-__end_interrupts:
+kvmppc_skip_Hinterrupt:
+ /*
+ * Here all GPRs are unchanged from when the interrupt happened
+ * except for r13, which is saved in SPRG_SCRATCH0.
+ */
+ mfspr r13, SPRN_HSRR0
+ addi r13, r13, 4
+ mtspr SPRN_HSRR0, r13
+ GET_SCRATCH0(r13)
+ hrfid
+ b .
+#endif
/*
* Code from here down to __end_handlers is invoked from the
* exception prologs above. Because the prologs assemble the
* addresses of these handlers using the LOAD_HANDLER macro,
- * which uses an addi instruction, these handlers must be in
- * the first 32k of the kernel image.
+ * which uses an ori instruction, these handlers must be in
+ * the first 64k of the kernel image.
*/
/*** Common interrupt handlers ***/
- STD_EXCEPTION_COMMON(0x100, system_reset, .system_reset_exception)
+ STD_EXCEPTION_COMMON(0x100, system_reset, system_reset_exception)
- /*
- * Machine check is different because we use a different
- * save area: PACA_EXMC instead of PACA_EXGEN.
- */
- .align 7
- .globl machine_check_common
-machine_check_common:
- EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC)
- FINISH_NAP
- DISABLE_INTS
- bl .save_nvgprs
- addi r3,r1,STACK_FRAME_OVERHEAD
- bl .machine_check_exception
- b .ret_from_except
-
- STD_EXCEPTION_COMMON_LITE(0x900, decrementer, .timer_interrupt)
- STD_EXCEPTION_COMMON(0xa00, trap_0a, .unknown_exception)
- STD_EXCEPTION_COMMON(0xb00, trap_0b, .unknown_exception)
- STD_EXCEPTION_COMMON(0xd00, single_step, .single_step_exception)
- STD_EXCEPTION_COMMON(0xe00, trap_0e, .unknown_exception)
- STD_EXCEPTION_COMMON_IDLE(0xf00, performance_monitor, .performance_monitor_exception)
- STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, .instruction_breakpoint_exception)
+ STD_EXCEPTION_COMMON_ASYNC(0x500, hardware_interrupt, do_IRQ)
+ STD_EXCEPTION_COMMON_ASYNC(0x900, decrementer, timer_interrupt)
+ STD_EXCEPTION_COMMON(0x980, hdecrementer, hdec_interrupt)
+#ifdef CONFIG_PPC_DOORBELL
+ STD_EXCEPTION_COMMON_ASYNC(0xa00, doorbell_super, doorbell_exception)
+#else
+ STD_EXCEPTION_COMMON_ASYNC(0xa00, doorbell_super, unknown_exception)
+#endif
+ STD_EXCEPTION_COMMON(0xb00, trap_0b, unknown_exception)
+ STD_EXCEPTION_COMMON(0xd00, single_step, single_step_exception)
+ STD_EXCEPTION_COMMON(0xe00, trap_0e, unknown_exception)
+ STD_EXCEPTION_COMMON(0xe40, emulation_assist, emulation_assist_interrupt)
+ STD_EXCEPTION_COMMON(0xe60, hmi_exception, unknown_exception)
+#ifdef CONFIG_PPC_DOORBELL
+ STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, doorbell_exception)
+#else
+ STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, unknown_exception)
+#endif
+ STD_EXCEPTION_COMMON_ASYNC(0xf00, performance_monitor, performance_monitor_exception)
+ STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, instruction_breakpoint_exception)
+ STD_EXCEPTION_COMMON(0x1502, denorm, unknown_exception)
#ifdef CONFIG_ALTIVEC
- STD_EXCEPTION_COMMON(0x1700, altivec_assist, .altivec_assist_exception)
+ STD_EXCEPTION_COMMON(0x1700, altivec_assist, altivec_assist_exception)
#else
- STD_EXCEPTION_COMMON(0x1700, altivec_assist, .unknown_exception)
+ STD_EXCEPTION_COMMON(0x1700, altivec_assist, unknown_exception)
#endif
#ifdef CONFIG_CBE_RAS
- STD_EXCEPTION_COMMON(0x1200, cbe_system_error, .cbe_system_error_exception)
- STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, .cbe_maintenance_exception)
- STD_EXCEPTION_COMMON(0x1800, cbe_thermal, .cbe_thermal_exception)
+ STD_EXCEPTION_COMMON(0x1200, cbe_system_error, cbe_system_error_exception)
+ STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, cbe_maintenance_exception)
+ STD_EXCEPTION_COMMON(0x1800, cbe_thermal, cbe_thermal_exception)
#endif /* CONFIG_CBE_RAS */
+ /*
+ * Relocation-on interrupts: A subset of the interrupts can be delivered
+ * with IR=1/DR=1, if AIL==2 and MSR.HV won't be changed by delivering
+ * it. Addresses are the same as the original interrupt addresses, but
+ * offset by 0xc000000000004000.
+ * It's impossible to receive interrupts below 0x300 via this mechanism.
+ * KVM: None of these traps are from the guest ; anything that escalated
+ * to HV=1 from HV=0 is delivered via real mode handlers.
+ */
+
+ /*
+ * This uses the standard macro, since the original 0x300 vector
+ * only has extra guff for STAB-based processors -- which never
+ * come here.
+ */
+ STD_RELON_EXCEPTION_PSERIES(0x4300, 0x300, data_access)
+ . = 0x4380
+ .globl data_access_slb_relon_pSeries
+data_access_slb_relon_pSeries:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXSLB)
+ EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380)
+ std r3,PACA_EXSLB+EX_R3(r13)
+ mfspr r3,SPRN_DAR
+ mfspr r12,SPRN_SRR1
+#ifndef CONFIG_RELOCATABLE
+ b slb_miss_realmode
+#else
+ /*
+ * We can't just use a direct branch to slb_miss_realmode
+ * because the distance from here to there depends on where
+ * the kernel ends up being put.
+ */
+ mfctr r11
+ ld r10,PACAKBASE(r13)
+ LOAD_HANDLER(r10, slb_miss_realmode)
+ mtctr r10
+ bctr
+#endif
+
+ STD_RELON_EXCEPTION_PSERIES(0x4400, 0x400, instruction_access)
+ . = 0x4480
+ .globl instruction_access_slb_relon_pSeries
+instruction_access_slb_relon_pSeries:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXSLB)
+ EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480)
+ std r3,PACA_EXSLB+EX_R3(r13)
+ mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */
+ mfspr r12,SPRN_SRR1
+#ifndef CONFIG_RELOCATABLE
+ b slb_miss_realmode
+#else
+ mfctr r11
+ ld r10,PACAKBASE(r13)
+ LOAD_HANDLER(r10, slb_miss_realmode)
+ mtctr r10
+ bctr
+#endif
+
+ . = 0x4500
+ .globl hardware_interrupt_relon_pSeries;
+ .globl hardware_interrupt_relon_hv;
+hardware_interrupt_relon_pSeries:
+hardware_interrupt_relon_hv:
+ BEGIN_FTR_SECTION
+ _MASKABLE_RELON_EXCEPTION_PSERIES(0x502, hardware_interrupt, EXC_HV, SOFTEN_TEST_HV)
+ FTR_SECTION_ELSE
+ _MASKABLE_RELON_EXCEPTION_PSERIES(0x500, hardware_interrupt, EXC_STD, SOFTEN_TEST_PR)
+ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+ STD_RELON_EXCEPTION_PSERIES(0x4600, 0x600, alignment)
+ STD_RELON_EXCEPTION_PSERIES(0x4700, 0x700, program_check)
+ STD_RELON_EXCEPTION_PSERIES(0x4800, 0x800, fp_unavailable)
+ MASKABLE_RELON_EXCEPTION_PSERIES(0x4900, 0x900, decrementer)
+ STD_RELON_EXCEPTION_HV(0x4980, 0x982, hdecrementer)
+ MASKABLE_RELON_EXCEPTION_PSERIES(0x4a00, 0xa00, doorbell_super)
+ STD_RELON_EXCEPTION_PSERIES(0x4b00, 0xb00, trap_0b)
+
+ . = 0x4c00
+ .globl system_call_relon_pSeries
+system_call_relon_pSeries:
+ HMT_MEDIUM
+ SYSCALL_PSERIES_1
+ SYSCALL_PSERIES_2_DIRECT
+ SYSCALL_PSERIES_3
+
+ STD_RELON_EXCEPTION_PSERIES(0x4d00, 0xd00, single_step)
+
+ . = 0x4e00
+ b . /* Can't happen, see v2.07 Book III-S section 6.5 */
+
+ . = 0x4e20
+ b . /* Can't happen, see v2.07 Book III-S section 6.5 */
+
+ . = 0x4e40
+emulation_assist_relon_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b emulation_assist_relon_hv
+
+ . = 0x4e60
+ b . /* Can't happen, see v2.07 Book III-S section 6.5 */
+
+ . = 0x4e80
+h_doorbell_relon_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b h_doorbell_relon_hv
+
+ . = 0x4f00
+performance_monitor_relon_pseries_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b performance_monitor_relon_pSeries
+
+ . = 0x4f20
+altivec_unavailable_relon_pseries_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b altivec_unavailable_relon_pSeries
+
+ . = 0x4f40
+vsx_unavailable_relon_pseries_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b vsx_unavailable_relon_pSeries
+
+ . = 0x4f60
+facility_unavailable_relon_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b facility_unavailable_relon_pSeries
+
+ . = 0x4f80
+hv_facility_unavailable_relon_trampoline:
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b hv_facility_unavailable_relon_hv
+
+ STD_RELON_EXCEPTION_PSERIES(0x5300, 0x1300, instruction_breakpoint)
+#ifdef CONFIG_PPC_DENORMALISATION
+ . = 0x5500
+ b denorm_exception_hv
+#endif
+ STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist)
+
+ /* Other future vectors */
+ .align 7
+ .globl __end_interrupts
+__end_interrupts:
+
.align 7
+system_call_entry_direct:
+#if defined(CONFIG_RELOCATABLE)
+ /* The first level prologue may have used LR to get here, saving
+ * orig in r10. To save hacking/ifdeffing common code, restore here.
+ */
+ mtlr r10
+#endif
system_call_entry:
b system_call_common
+ppc64_runlatch_on_trampoline:
+ b __ppc64_runlatch_on
+
/*
* Here we have detected that the kernel stack pointer is bad.
* R9 contains the saved CR, r13 points to the paca,
@@ -385,9 +1011,24 @@ bad_stack:
std r12,_XER(r1)
SAVE_GPR(0,r1)
SAVE_GPR(2,r1)
- SAVE_4GPRS(3,r1)
- SAVE_2GPRS(7,r1)
- SAVE_10GPRS(12,r1)
+ ld r10,EX_R3(r3)
+ std r10,GPR3(r1)
+ SAVE_GPR(4,r1)
+ SAVE_4GPRS(5,r1)
+ ld r9,EX_R9(r3)
+ ld r10,EX_R10(r3)
+ SAVE_2GPRS(9,r1)
+ ld r9,EX_R11(r3)
+ ld r10,EX_R12(r3)
+ ld r11,EX_R13(r3)
+ std r9,GPR11(r1)
+ std r10,GPR12(r1)
+ std r11,GPR13(r1)
+BEGIN_FTR_SECTION
+ ld r10,EX_CFAR(r3)
+ std r10,ORIG_GPR3(r1)
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+ SAVE_8GPRS(14,r1)
SAVE_10GPRS(22,r1)
lhz r12,PACA_TRAP_SAVE(r13)
std r12,_TRAP(r1)
@@ -396,8 +1037,11 @@ bad_stack:
li r12,0
std r12,0(r11)
ld r2,PACATOC(r13)
+ ld r11,exception_marker@toc(r2)
+ std r12,RESULT(r1)
+ std r11,STACK_FRAME_OVERHEAD-16(r1)
1: addi r3,r1,STACK_FRAME_OVERHEAD
- bl .kernel_bad_stack
+ bl kernel_bad_stack
b 1b
/*
@@ -413,19 +1057,39 @@ data_access_common:
mfspr r10,SPRN_DSISR
stw r10,PACA_EXGEN+EX_DSISR(r13)
EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN)
+ DISABLE_INTS
+ ld r12,_MSR(r1)
ld r3,PACA_EXGEN+EX_DAR(r13)
lwz r4,PACA_EXGEN+EX_DSISR(r13)
li r5,0x300
- b .do_hash_page /* Try to handle as hpte fault */
+ b do_hash_page /* Try to handle as hpte fault */
+
+ .align 7
+ .globl h_data_storage_common
+h_data_storage_common:
+ mfspr r10,SPRN_HDAR
+ std r10,PACA_EXGEN+EX_DAR(r13)
+ mfspr r10,SPRN_HDSISR
+ stw r10,PACA_EXGEN+EX_DSISR(r13)
+ EXCEPTION_PROLOG_COMMON(0xe00, PACA_EXGEN)
+ bl save_nvgprs
+ DISABLE_INTS
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl unknown_exception
+ b ret_from_except
.align 7
.globl instruction_access_common
instruction_access_common:
EXCEPTION_PROLOG_COMMON(0x400, PACA_EXGEN)
+ DISABLE_INTS
+ ld r12,_MSR(r1)
ld r3,_NIP(r1)
andis. r4,r12,0x5820
li r5,0x400
- b .do_hash_page /* Try to handle as hpte fault */
+ b do_hash_page /* Try to handle as hpte fault */
+
+ STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception)
/*
* Here is the common SLB miss user that is used when going to virtual
@@ -440,7 +1104,7 @@ slb_miss_user_common:
stw r9,PACA_EXGEN+EX_CCR(r13)
std r10,PACA_EXGEN+EX_LR(r13)
std r11,PACA_EXGEN+EX_SRR0(r13)
- bl .slb_allocate_user
+ bl slb_allocate_user
ld r10,PACA_EXGEN+EX_LR(r13)
ld r3,PACA_EXGEN+EX_R3(r13)
@@ -483,116 +1147,37 @@ slb_miss_fault:
unrecov_user_slb:
EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN)
DISABLE_INTS
- bl .save_nvgprs
+ bl save_nvgprs
1: addi r3,r1,STACK_FRAME_OVERHEAD
- bl .unrecoverable_exception
+ bl unrecoverable_exception
b 1b
#endif /* __DISABLED__ */
-/*
- * r13 points to the PACA, r9 contains the saved CR,
- * r12 contain the saved SRR1, SRR0 is still ready for return
- * r3 has the faulting address
- * r9 - r13 are saved in paca->exslb.
- * r3 is saved in paca->slb_r3
- * We assume we aren't going to take any exceptions during this procedure.
- */
-_GLOBAL(slb_miss_realmode)
- mflr r10
-#ifdef CONFIG_RELOCATABLE
- mtctr r11
-#endif
-
- stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */
- std r10,PACA_EXSLB+EX_LR(r13) /* save LR */
-
- bl .slb_allocate_realmode
-
- /* All done -- return from exception. */
-
- ld r10,PACA_EXSLB+EX_LR(r13)
- ld r3,PACA_EXSLB+EX_R3(r13)
- lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
- ld r11,PACALPPACAPTR(r13)
- ld r11,LPPACASRR0(r11) /* get SRR0 value */
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif /* CONFIG_PPC_ISERIES */
-
- mtlr r10
-
- andi. r10,r12,MSR_RI /* check for unrecoverable exception */
- beq- 2f
-
-.machine push
-.machine "power4"
- mtcrf 0x80,r9
- mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */
-.machine pop
-
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
- mtspr SPRN_SRR0,r11
- mtspr SPRN_SRR1,r12
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif /* CONFIG_PPC_ISERIES */
- ld r9,PACA_EXSLB+EX_R9(r13)
- ld r10,PACA_EXSLB+EX_R10(r13)
- ld r11,PACA_EXSLB+EX_R11(r13)
- ld r12,PACA_EXSLB+EX_R12(r13)
- ld r13,PACA_EXSLB+EX_R13(r13)
- rfid
- b . /* prevent speculative execution */
-
-2:
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
- b unrecov_slb
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif /* CONFIG_PPC_ISERIES */
- mfspr r11,SPRN_SRR0
- ld r10,PACAKBASE(r13)
- LOAD_HANDLER(r10,unrecov_slb)
- mtspr SPRN_SRR0,r10
- ld r10,PACAKMSR(r13)
- mtspr SPRN_SRR1,r10
- rfid
- b .
-
-unrecov_slb:
- EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB)
- DISABLE_INTS
- bl .save_nvgprs
-1: addi r3,r1,STACK_FRAME_OVERHEAD
- bl .unrecoverable_exception
- b 1b
-
+ /*
+ * Machine check is different because we use a different
+ * save area: PACA_EXMC instead of PACA_EXGEN.
+ */
.align 7
- .globl hardware_interrupt_common
- .globl hardware_interrupt_entry
-hardware_interrupt_common:
- EXCEPTION_PROLOG_COMMON(0x500, PACA_EXGEN)
+ .globl machine_check_common
+machine_check_common:
+
+ mfspr r10,SPRN_DAR
+ std r10,PACA_EXGEN+EX_DAR(r13)
+ mfspr r10,SPRN_DSISR
+ stw r10,PACA_EXGEN+EX_DSISR(r13)
+ EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC)
FINISH_NAP
-hardware_interrupt_entry:
DISABLE_INTS
-BEGIN_FTR_SECTION
- bl .ppc64_runlatch_on
-END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
+ ld r3,PACA_EXGEN+EX_DAR(r13)
+ lwz r4,PACA_EXGEN+EX_DSISR(r13)
+ std r3,_DAR(r1)
+ std r4,_DSISR(r1)
+ bl save_nvgprs
addi r3,r1,STACK_FRAME_OVERHEAD
- bl .do_IRQ
- b .ret_from_except_lite
-
-#ifdef CONFIG_PPC_970_NAP
-power4_fixup_nap:
- andc r9,r9,r10
- std r9,TI_LOCAL_FLAGS(r11)
- ld r10,_LINK(r1) /* make idle task do the */
- std r10,_NIP(r1) /* equivalent of a blr */
- blr
-#endif
+ bl machine_check_exception
+ b ret_from_except
.align 7
.globl alignment_common
@@ -606,35 +1191,52 @@ alignment_common:
lwz r4,PACA_EXGEN+EX_DSISR(r13)
std r3,_DAR(r1)
std r4,_DSISR(r1)
- bl .save_nvgprs
+ bl save_nvgprs
+ DISABLE_INTS
addi r3,r1,STACK_FRAME_OVERHEAD
- ENABLE_INTS
- bl .alignment_exception
- b .ret_from_except
+ bl alignment_exception
+ b ret_from_except
.align 7
.globl program_check_common
program_check_common:
EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN)
- bl .save_nvgprs
+ bl save_nvgprs
+ DISABLE_INTS
addi r3,r1,STACK_FRAME_OVERHEAD
- ENABLE_INTS
- bl .program_check_exception
- b .ret_from_except
+ bl program_check_exception
+ b ret_from_except
.align 7
.globl fp_unavailable_common
fp_unavailable_common:
EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN)
bne 1f /* if from user, just load it up */
- bl .save_nvgprs
+ bl save_nvgprs
+ DISABLE_INTS
addi r3,r1,STACK_FRAME_OVERHEAD
- ENABLE_INTS
- bl .kernel_fp_unavailable_exception
+ bl kernel_fp_unavailable_exception
BUG_OPCODE
-1: bl .load_up_fpu
+1:
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+ /* Test if 2 TM state bits are zero. If non-zero (ie. userspace was in
+ * transaction), go do TM stuff
+ */
+ rldicl. r0, r12, (64-MSR_TS_LG), (64-2)
+ bne- 2f
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
+ bl load_up_fpu
b fast_exception_return
-
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+2: /* User process was in a transaction */
+ bl save_nvgprs
+ DISABLE_INTS
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl fp_unavailable_tm
+ b ret_from_except
+#endif
.align 7
.globl altivec_unavailable_common
altivec_unavailable_common:
@@ -642,16 +1244,33 @@ altivec_unavailable_common:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
beq 1f
- bl .load_up_altivec
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ BEGIN_FTR_SECTION_NESTED(69)
+ /* Test if 2 TM state bits are zero. If non-zero (ie. userspace was in
+ * transaction), go do TM stuff
+ */
+ rldicl. r0, r12, (64-MSR_TS_LG), (64-2)
+ bne- 2f
+ END_FTR_SECTION_NESTED(CPU_FTR_TM, CPU_FTR_TM, 69)
+#endif
+ bl load_up_altivec
b fast_exception_return
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+2: /* User process was in a transaction */
+ bl save_nvgprs
+ DISABLE_INTS
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl altivec_unavailable_tm
+ b ret_from_except
+#endif
1:
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif
- bl .save_nvgprs
+ bl save_nvgprs
+ DISABLE_INTS
addi r3,r1,STACK_FRAME_OVERHEAD
- ENABLE_INTS
- bl .altivec_unavailable_exception
- b .ret_from_except
+ bl altivec_unavailable_exception
+ b ret_from_except
.align 7
.globl vsx_unavailable_common
@@ -659,85 +1278,315 @@ vsx_unavailable_common:
EXCEPTION_PROLOG_COMMON(0xf40, PACA_EXGEN)
#ifdef CONFIG_VSX
BEGIN_FTR_SECTION
- bne .load_up_vsx
+ beq 1f
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ BEGIN_FTR_SECTION_NESTED(69)
+ /* Test if 2 TM state bits are zero. If non-zero (ie. userspace was in
+ * transaction), go do TM stuff
+ */
+ rldicl. r0, r12, (64-MSR_TS_LG), (64-2)
+ bne- 2f
+ END_FTR_SECTION_NESTED(CPU_FTR_TM, CPU_FTR_TM, 69)
+#endif
+ b load_up_vsx
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+2: /* User process was in a transaction */
+ bl save_nvgprs
+ DISABLE_INTS
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl vsx_unavailable_tm
+ b ret_from_except
+#endif
1:
END_FTR_SECTION_IFSET(CPU_FTR_VSX)
#endif
- bl .save_nvgprs
+ bl save_nvgprs
+ DISABLE_INTS
addi r3,r1,STACK_FRAME_OVERHEAD
- ENABLE_INTS
- bl .vsx_unavailable_exception
- b .ret_from_except
+ bl vsx_unavailable_exception
+ b ret_from_except
+
+ STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception)
+ STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception)
.align 7
.globl __end_handlers
__end_handlers:
+ /* Equivalents to the above handlers for relocation-on interrupt vectors */
+ STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
+ MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
+
+ STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
+ STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
+ STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
+ STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
+ STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable)
+
+#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
/*
- * Return from an exception with minimal checks.
- * The caller is assumed to have done EXCEPTION_PROLOG_COMMON.
- * If interrupts have been enabled, or anything has been
- * done that might have changed the scheduling status of
- * any task or sent any task a signal, you should use
- * ret_from_except or ret_from_except_lite instead of this.
+ * Data area reserved for FWNMI option.
+ * This address (0x7000) is fixed by the RPA.
*/
-fast_exc_return_irq: /* restores irq state too */
- ld r3,SOFTE(r1)
- TRACE_AND_RESTORE_IRQ(r3);
- ld r12,_MSR(r1)
- rldicl r4,r12,49,63 /* get MSR_EE to LSB */
- stb r4,PACAHARDIRQEN(r13) /* restore paca->hard_enabled */
- b 1f
+ .= 0x7000
+ .globl fwnmi_data_area
+fwnmi_data_area:
- .globl fast_exception_return
-fast_exception_return:
+ /* pseries and powernv need to keep the whole page from
+ * 0x7000 to 0x8000 free for use by the firmware
+ */
+ . = 0x8000
+#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
+
+/* Space for CPU0's segment table */
+ .balign 4096
+ .globl initial_stab
+initial_stab:
+ .space 4096
+
+#ifdef CONFIG_PPC_POWERNV
+_GLOBAL(opal_mc_secondary_handler)
+ HMT_MEDIUM_PPR_DISCARD
+ SET_SCRATCH0(r13)
+ GET_PACA(r13)
+ clrldi r3,r3,2
+ tovirt(r3,r3)
+ std r3,PACA_OPAL_MC_EVT(r13)
+ ld r13,OPAL_MC_SRR0(r3)
+ mtspr SPRN_SRR0,r13
+ ld r13,OPAL_MC_SRR1(r3)
+ mtspr SPRN_SRR1,r13
+ ld r3,OPAL_MC_GPR3(r3)
+ GET_SCRATCH0(r13)
+ b machine_check_pSeries
+#endif /* CONFIG_PPC_POWERNV */
+
+
+#define MACHINE_CHECK_HANDLER_WINDUP \
+ /* Clear MSR_RI before setting SRR0 and SRR1. */\
+ li r0,MSR_RI; \
+ mfmsr r9; /* get MSR value */ \
+ andc r9,r9,r0; \
+ mtmsrd r9,1; /* Clear MSR_RI */ \
+ /* Move original SRR0 and SRR1 into the respective regs */ \
+ ld r9,_MSR(r1); \
+ mtspr SPRN_SRR1,r9; \
+ ld r3,_NIP(r1); \
+ mtspr SPRN_SRR0,r3; \
+ ld r9,_CTR(r1); \
+ mtctr r9; \
+ ld r9,_XER(r1); \
+ mtxer r9; \
+ ld r9,_LINK(r1); \
+ mtlr r9; \
+ REST_GPR(0, r1); \
+ REST_8GPRS(2, r1); \
+ REST_GPR(10, r1); \
+ ld r11,_CCR(r1); \
+ mtcr r11; \
+ /* Decrement paca->in_mce. */ \
+ lhz r12,PACA_IN_MCE(r13); \
+ subi r12,r12,1; \
+ sth r12,PACA_IN_MCE(r13); \
+ REST_GPR(11, r1); \
+ REST_2GPRS(12, r1); \
+ /* restore original r1. */ \
+ ld r1,GPR1(r1)
+
+ /*
+ * Handle machine check early in real mode. We come here with
+ * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
+ */
+ .align 7
+ .globl machine_check_handle_early
+machine_check_handle_early:
+ std r0,GPR0(r1) /* Save r0 */
+ EXCEPTION_PROLOG_COMMON_3(0x200)
+ bl save_nvgprs
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl machine_check_early
+ std r3,RESULT(r1) /* Save result */
ld r12,_MSR(r1)
-1: ld r11,_NIP(r1)
- andi. r3,r12,MSR_RI /* check if RI is set */
- beq- unrecov_fer
+#ifdef CONFIG_PPC_P7_NAP
+ /*
+ * Check if thread was in power saving mode. We come here when any
+ * of the following is true:
+ * a. thread wasn't in power saving mode
+ * b. thread was in power saving mode with no state loss or
+ * supervisor state loss
+ *
+ * Go back to nap again if (b) is true.
+ */
+ rlwinm. r11,r12,47-31,30,31 /* Was it in power saving mode? */
+ beq 4f /* No, it wasn;t */
+ /* Thread was in power saving mode. Go back to nap again. */
+ cmpwi r11,2
+ bne 3f
+ /* Supervisor state loss */
+ li r0,1
+ stb r0,PACA_NAPSTATELOST(r13)
+3: bl machine_check_queue_event
+ MACHINE_CHECK_HANDLER_WINDUP
+ GET_PACA(r13)
+ ld r1,PACAR1(r13)
+ b power7_enter_nap_mode
+4:
+#endif
+ /*
+ * Check if we are coming from hypervisor userspace. If yes then we
+ * continue in host kernel in V mode to deliver the MC event.
+ */
+ rldicl. r11,r12,4,63 /* See if MC hit while in HV mode. */
+ beq 5f
+ andi. r11,r12,MSR_PR /* See if coming from user. */
+ bne 9f /* continue in V mode if we are. */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
- andi. r3,r12,MSR_PR
- beq 2f
- ACCOUNT_CPU_USER_EXIT(r3, r4)
+5:
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+ /*
+ * We are coming from kernel context. Check if we are coming from
+ * guest. if yes, then we can continue. We will fall through
+ * do_kvm_200->kvmppc_interrupt to deliver the MC event to guest.
+ */
+ lbz r11,HSTATE_IN_GUEST(r13)
+ cmpwi r11,0 /* Check if coming from guest */
+ bne 9f /* continue if we are. */
+#endif
+ /*
+ * At this point we are not sure about what context we come from.
+ * Queue up the MCE event and return from the interrupt.
+ * But before that, check if this is an un-recoverable exception.
+ * If yes, then stay on emergency stack and panic.
+ */
+ andi. r11,r12,MSR_RI
+ bne 2f
+1: mfspr r11,SPRN_SRR0
+ ld r10,PACAKBASE(r13)
+ LOAD_HANDLER(r10,unrecover_mce)
+ mtspr SPRN_SRR0,r10
+ ld r10,PACAKMSR(r13)
+ /*
+ * We are going down. But there are chances that we might get hit by
+ * another MCE during panic path and we may run into unstable state
+ * with no way out. Hence, turn ME bit off while going down, so that
+ * when another MCE is hit during panic path, system will checkstop
+ * and hypervisor will get restarted cleanly by SP.
+ */
+ li r3,MSR_ME
+ andc r10,r10,r3 /* Turn off MSR_ME */
+ mtspr SPRN_SRR1,r10
+ rfid
+ b .
2:
+ /*
+ * Check if we have successfully handled/recovered from error, if not
+ * then stay on emergency stack and panic.
+ */
+ ld r3,RESULT(r1) /* Load result */
+ cmpdi r3,0 /* see if we handled MCE successfully */
+
+ beq 1b /* if !handled then panic */
+ /*
+ * Return from MC interrupt.
+ * Queue up the MCE event so that we can log it later, while
+ * returning from kernel or opal call.
+ */
+ bl machine_check_queue_event
+ MACHINE_CHECK_HANDLER_WINDUP
+ rfid
+9:
+ /* Deliver the machine check to host kernel in V mode. */
+ MACHINE_CHECK_HANDLER_WINDUP
+ b machine_check_pSeries
+
+unrecover_mce:
+ /* Invoke machine_check_exception to print MCE event and panic. */
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl machine_check_exception
+ /*
+ * We will not reach here. Even if we did, there is no way out. Call
+ * unrecoverable_exception and die.
+ */
+1: addi r3,r1,STACK_FRAME_OVERHEAD
+ bl unrecoverable_exception
+ b 1b
+/*
+ * r13 points to the PACA, r9 contains the saved CR,
+ * r12 contain the saved SRR1, SRR0 is still ready for return
+ * r3 has the faulting address
+ * r9 - r13 are saved in paca->exslb.
+ * r3 is saved in paca->slb_r3
+ * We assume we aren't going to take any exceptions during this procedure.
+ */
+slb_miss_realmode:
+ mflr r10
+#ifdef CONFIG_RELOCATABLE
+ mtctr r11
#endif
- ld r3,_CCR(r1)
- ld r4,_LINK(r1)
- ld r5,_CTR(r1)
- ld r6,_XER(r1)
- mtcr r3
- mtlr r4
- mtctr r5
- mtxer r6
- REST_GPR(0, r1)
- REST_8GPRS(2, r1)
+ stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */
+ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */
+
+ bl slb_allocate_realmode
- mfmsr r10
- rldicl r10,r10,48,1 /* clear EE */
- rldicr r10,r10,16,61 /* clear RI (LE is 0 already) */
- mtmsrd r10,1
+ /* All done -- return from exception. */
- mtspr SPRN_SRR1,r12
- mtspr SPRN_SRR0,r11
- REST_4GPRS(10, r1)
- ld r1,GPR1(r1)
+ ld r10,PACA_EXSLB+EX_LR(r13)
+ ld r3,PACA_EXSLB+EX_R3(r13)
+ lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */
+
+ mtlr r10
+
+ andi. r10,r12,MSR_RI /* check for unrecoverable exception */
+ beq- 2f
+
+.machine push
+.machine "power4"
+ mtcrf 0x80,r9
+ mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */
+.machine pop
+
+ RESTORE_PPR_PACA(PACA_EXSLB, r9)
+ ld r9,PACA_EXSLB+EX_R9(r13)
+ ld r10,PACA_EXSLB+EX_R10(r13)
+ ld r11,PACA_EXSLB+EX_R11(r13)
+ ld r12,PACA_EXSLB+EX_R12(r13)
+ ld r13,PACA_EXSLB+EX_R13(r13)
rfid
b . /* prevent speculative execution */
-unrecov_fer:
- bl .save_nvgprs
+2: mfspr r11,SPRN_SRR0
+ ld r10,PACAKBASE(r13)
+ LOAD_HANDLER(r10,unrecov_slb)
+ mtspr SPRN_SRR0,r10
+ ld r10,PACAKMSR(r13)
+ mtspr SPRN_SRR1,r10
+ rfid
+ b .
+
+unrecov_slb:
+ EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB)
+ DISABLE_INTS
+ bl save_nvgprs
1: addi r3,r1,STACK_FRAME_OVERHEAD
- bl .unrecoverable_exception
+ bl unrecoverable_exception
b 1b
+#ifdef CONFIG_PPC_970_NAP
+power4_fixup_nap:
+ andc r9,r9,r10
+ std r9,TI_LOCAL_FLAGS(r11)
+ ld r10,_LINK(r1) /* make idle task do the */
+ std r10,_NIP(r1) /* equivalent of a blr */
+ blr
+#endif
+
/*
* Hash table stuff
*/
.align 7
-_STATIC(do_hash_page)
+do_hash_page:
std r3,_DAR(r1)
std r4,_DSISR(r1)
@@ -749,34 +1598,12 @@ _STATIC(do_hash_page)
BEGIN_FTR_SECTION
andis. r0,r4,0x0020 /* Is it a segment table fault? */
bne- do_ste_alloc /* If so handle it */
-END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
- clrrdi r11,r1,THREAD_SHIFT
+ CURRENT_THREAD_INFO(r11, r1)
lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */
andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */
bne 77f /* then don't call hash_page now */
-
- /*
- * On iSeries, we soft-disable interrupts here, then
- * hard-enable interrupts so that the hash_page code can spin on
- * the hash_table_lock without problems on a shared processor.
- */
- DISABLE_INTS
-
- /*
- * Currently, trace_hardirqs_off() will be called by DISABLE_INTS
- * and will clobber volatile registers when irq tracing is enabled
- * so we need to reload them. It may be possible to be smarter here
- * and move the irq tracing elsewhere but let's keep it simple for
- * now
- */
-#ifdef CONFIG_TRACE_IRQFLAGS
- ld r3,_DAR(r1)
- ld r4,_DSISR(r1)
- ld r5,_TRAP(r1)
- ld r12,_MSR(r1)
- clrrdi r5,r5,4
-#endif /* CONFIG_TRACE_IRQFLAGS */
/*
* We need to set the _PAGE_USER bit if MSR_PR is set or if we are
* accessing a userspace segment (even from the kernel). We assume
@@ -794,80 +1621,51 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
* r4 contains the required access permissions
* r5 contains the trap number
*
- * at return r3 = 0 for success
+ * at return r3 = 0 for success, 1 for page fault, negative for error
*/
- bl .hash_page /* build HPTE if possible */
+ bl hash_page /* build HPTE if possible */
cmpdi r3,0 /* see if hash_page succeeded */
-BEGIN_FW_FTR_SECTION
- /*
- * If we had interrupts soft-enabled at the point where the
- * DSI/ISI occurred, and an interrupt came in during hash_page,
- * handle it now.
- * We jump to ret_from_except_lite rather than fast_exception_return
- * because ret_from_except_lite will check for and handle pending
- * interrupts if necessary.
- */
- beq 13f
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-
-BEGIN_FW_FTR_SECTION
- /*
- * Here we have interrupts hard-disabled, so it is sufficient
- * to restore paca->{soft,hard}_enable and get out.
- */
+ /* Success */
beq fast_exc_return_irq /* Return from exception on success */
-END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISERIES)
-
- /* For a hash failure, we don't bother re-enabling interrupts */
- ble- 12f
- /*
- * hash_page couldn't handle it, set soft interrupt enable back
- * to what it was before the trap. Note that .arch_local_irq_restore
- * handles any interrupts pending at this point.
- */
- ld r3,SOFTE(r1)
- TRACE_AND_RESTORE_IRQ_PARTIAL(r3, 11f)
- bl .arch_local_irq_restore
- b 11f
-
-/* We have a data breakpoint exception - handle it */
-handle_dabr_fault:
- bl .save_nvgprs
- ld r4,_DAR(r1)
- ld r5,_DSISR(r1)
- addi r3,r1,STACK_FRAME_OVERHEAD
- bl .do_dabr
- b .ret_from_except_lite
+ /* Error */
+ blt- 13f
/* Here we have a page fault that hash_page can't handle. */
handle_page_fault:
- ENABLE_INTS
11: ld r4,_DAR(r1)
ld r5,_DSISR(r1)
addi r3,r1,STACK_FRAME_OVERHEAD
- bl .do_page_fault
+ bl do_page_fault
cmpdi r3,0
- beq+ 13f
- bl .save_nvgprs
+ beq+ 12f
+ bl save_nvgprs
mr r5,r3
addi r3,r1,STACK_FRAME_OVERHEAD
lwz r4,_DAR(r1)
- bl .bad_page_fault
- b .ret_from_except
+ bl bad_page_fault
+ b ret_from_except
+
+/* We have a data breakpoint exception - handle it */
+handle_dabr_fault:
+ bl save_nvgprs
+ ld r4,_DAR(r1)
+ ld r5,_DSISR(r1)
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl do_break
+12: b ret_from_except_lite
-13: b .ret_from_except_lite
/* We have a page fault that hash_page could handle but HV refused
* the PTE insertion
*/
-12: bl .save_nvgprs
+13: bl save_nvgprs
mr r5,r3
addi r3,r1,STACK_FRAME_OVERHEAD
ld r4,_DAR(r1)
- bl .low_hash_fault
- b .ret_from_except
+ bl low_hash_fault
+ b ret_from_except
/*
* We come here as a result of a DSI at a point where we don't want
@@ -876,16 +1674,16 @@ handle_page_fault:
* were soft-disabled. We want to invoke the exception handler for
* the access, or panic if there isn't a handler.
*/
-77: bl .save_nvgprs
+77: bl save_nvgprs
mr r4,r3
addi r3,r1,STACK_FRAME_OVERHEAD
li r5,SIGSEGV
- bl .bad_page_fault
- b .ret_from_except
+ bl bad_page_fault
+ b ret_from_except
/* here we have a segment miss */
do_ste_alloc:
- bl .ste_allocate /* try to insert stab entry */
+ bl ste_allocate /* try to insert stab entry */
cmpdi r3,0
bne- handle_page_fault
b fast_exception_return
@@ -898,21 +1696,39 @@ do_ste_alloc:
* We assume (DAR >> 60) == 0xc.
*/
.align 7
-_GLOBAL(do_stab_bolted)
+do_stab_bolted:
stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */
std r11,PACA_EXSLB+EX_SRR0(r13) /* save SRR0 in exc. frame */
+ mfspr r11,SPRN_DAR /* ea */
+
+ /*
+ * check for bad kernel/user address
+ * (ea & ~REGION_MASK) >= PGTABLE_RANGE
+ */
+ rldicr. r9,r11,4,(63 - 46 - 4)
+ li r9,0 /* VSID = 0 for bad address */
+ bne- 0f
+
+ /*
+ * Calculate VSID:
+ * This is the kernel vsid, we take the top for context from
+ * the range. context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+ * Here we know that (ea >> 60) == 0xc
+ */
+ lis r9,(MAX_USER_CONTEXT + 1)@ha
+ addi r9,r9,(MAX_USER_CONTEXT + 1)@l
+ srdi r10,r11,SID_SHIFT
+ rldimi r10,r9,ESID_BITS,0 /* proto vsid */
+ ASM_VSID_SCRAMBLE(r10, r9, 256M)
+ rldic r9,r10,12,16 /* r9 = vsid << 12 */
+
+0:
/* Hash to the primary group */
ld r10,PACASTABVIRT(r13)
- mfspr r11,SPRN_DAR
- srdi r11,r11,28
+ srdi r11,r11,SID_SHIFT
rldimi r10,r11,7,52 /* r10 = first ste of the group */
- /* Calculate VSID */
- /* This is a kernel address, so protovsid = ESID */
- ASM_VSID_SCRAMBLE(r11, r9, 256M)
- rldic r9,r11,12,16 /* r9 = vsid << 12 */
-
/* Search the primary group for a free entry */
1: ld r11,0(r10) /* Test valid bit of the current ste */
andi. r11,r11,0x80
@@ -975,54 +1791,3 @@ _GLOBAL(do_stab_bolted)
ld r13,PACA_EXSLB+EX_R13(r13)
rfid
b . /* prevent speculative execution */
-
-/*
- * Space for CPU0's segment table.
- *
- * On iSeries, the hypervisor must fill in at least one entry before
- * we get control (with relocate on). The address is given to the hv
- * as a page number (see xLparMap below), so this must be at a
- * fixed address (the linker can't compute (u64)&initial_stab >>
- * PAGE_SHIFT).
- */
- . = STAB0_OFFSET /* 0x6000 */
- .globl initial_stab
-initial_stab:
- .space 4096
-
-#ifdef CONFIG_PPC_PSERIES
-/*
- * Data area reserved for FWNMI option.
- * This address (0x7000) is fixed by the RPA.
- */
- .= 0x7000
- .globl fwnmi_data_area
-fwnmi_data_area:
-#endif /* CONFIG_PPC_PSERIES */
-
- /* iSeries does not use the FWNMI stuff, so it is safe to put
- * this here, even if we later allow kernels that will boot on
- * both pSeries and iSeries */
-#ifdef CONFIG_PPC_ISERIES
- . = LPARMAP_PHYS
- .globl xLparMap
-xLparMap:
- .quad HvEsidsToMap /* xNumberEsids */
- .quad HvRangesToMap /* xNumberRanges */
- .quad STAB0_PAGE /* xSegmentTableOffs */
- .zero 40 /* xRsvd */
- /* xEsids (HvEsidsToMap entries of 2 quads) */
- .quad PAGE_OFFSET_ESID /* xKernelEsid */
- .quad PAGE_OFFSET_VSID /* xKernelVsid */
- .quad VMALLOC_START_ESID /* xKernelEsid */
- .quad VMALLOC_START_VSID /* xKernelVsid */
- /* xRanges (HvRangesToMap entries of 3 quads) */
- .quad HvPagesToMap /* xPages */
- .quad 0 /* xOffset */
- .quad PAGE_OFFSET_VSID << (SID_SHIFT - HW_PAGE_SHIFT) /* xVPN */
-
-#endif /* CONFIG_PPC_ISERIES */
-
-#ifdef CONFIG_PPC_PSERIES
- . = 0x8000
-#endif /* CONFIG_PPC_PSERIES */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
new file mode 100644
index 00000000000..742694c1d85
--- /dev/null
+++ b/arch/powerpc/kernel/fadump.c
@@ -0,0 +1,1316 @@
+/*
+ * Firmware Assisted dump: A robust mechanism to get reliable kernel crash
+ * dump with assistance from firmware. This approach does not use kexec,
+ * instead firmware assists in booting the kdump kernel while preserving
+ * memory contents. The most of the code implementation has been adapted
+ * from phyp assisted dump implementation written by Linas Vepstas and
+ * Manish Ahuja
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2011 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "fadump: " fmt
+
+#include <linux/string.h>
+#include <linux/memblock.h>
+#include <linux/delay.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/crash_dump.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/fadump.h>
+#include <asm/debug.h>
+#include <asm/setup.h>
+
+static struct fw_dump fw_dump;
+static struct fadump_mem_struct fdm;
+static const struct fadump_mem_struct *fdm_active;
+
+static DEFINE_MUTEX(fadump_mutex);
+struct fad_crash_memory_ranges crash_memory_ranges[INIT_CRASHMEM_RANGES];
+int crash_mem_ranges;
+
+/* Scan the Firmware Assisted dump configuration details. */
+int __init early_init_dt_scan_fw_dump(unsigned long node,
+ const char *uname, int depth, void *data)
+{
+ const __be32 *sections;
+ int i, num_sections;
+ int size;
+ const int *token;
+
+ if (depth != 1 || strcmp(uname, "rtas") != 0)
+ return 0;
+
+ /*
+ * Check if Firmware Assisted dump is supported. if yes, check
+ * if dump has been initiated on last reboot.
+ */
+ token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
+ if (!token)
+ return 1;
+
+ fw_dump.fadump_supported = 1;
+ fw_dump.ibm_configure_kernel_dump = *token;
+
+ /*
+ * The 'ibm,kernel-dump' rtas node is present only if there is
+ * dump data waiting for us.
+ */
+ fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL);
+ if (fdm_active)
+ fw_dump.dump_active = 1;
+
+ /* Get the sizes required to store dump data for the firmware provided
+ * dump sections.
+ * For each dump section type supported, a 32bit cell which defines
+ * the ID of a supported section followed by two 32 bit cells which
+ * gives teh size of the section in bytes.
+ */
+ sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
+ &size);
+
+ if (!sections)
+ return 1;
+
+ num_sections = size / (3 * sizeof(u32));
+
+ for (i = 0; i < num_sections; i++, sections += 3) {
+ u32 type = (u32)of_read_number(sections, 1);
+
+ switch (type) {
+ case FADUMP_CPU_STATE_DATA:
+ fw_dump.cpu_state_data_size =
+ of_read_ulong(&sections[1], 2);
+ break;
+ case FADUMP_HPTE_REGION:
+ fw_dump.hpte_region_size =
+ of_read_ulong(&sections[1], 2);
+ break;
+ }
+ }
+
+ return 1;
+}
+
+int is_fadump_active(void)
+{
+ return fw_dump.dump_active;
+}
+
+/* Print firmware assisted dump configurations for debugging purpose. */
+static void fadump_show_config(void)
+{
+ pr_debug("Support for firmware-assisted dump (fadump): %s\n",
+ (fw_dump.fadump_supported ? "present" : "no support"));
+
+ if (!fw_dump.fadump_supported)
+ return;
+
+ pr_debug("Fadump enabled : %s\n",
+ (fw_dump.fadump_enabled ? "yes" : "no"));
+ pr_debug("Dump Active : %s\n",
+ (fw_dump.dump_active ? "yes" : "no"));
+ pr_debug("Dump section sizes:\n");
+ pr_debug(" CPU state data size: %lx\n", fw_dump.cpu_state_data_size);
+ pr_debug(" HPTE region size : %lx\n", fw_dump.hpte_region_size);
+ pr_debug("Boot memory size : %lx\n", fw_dump.boot_memory_size);
+}
+
+static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm,
+ unsigned long addr)
+{
+ if (!fdm)
+ return 0;
+
+ memset(fdm, 0, sizeof(struct fadump_mem_struct));
+ addr = addr & PAGE_MASK;
+
+ fdm->header.dump_format_version = 0x00000001;
+ fdm->header.dump_num_sections = 3;
+ fdm->header.dump_status_flag = 0;
+ fdm->header.offset_first_dump_section =
+ (u32)offsetof(struct fadump_mem_struct, cpu_state_data);
+
+ /*
+ * Fields for disk dump option.
+ * We are not using disk dump option, hence set these fields to 0.
+ */
+ fdm->header.dd_block_size = 0;
+ fdm->header.dd_block_offset = 0;
+ fdm->header.dd_num_blocks = 0;
+ fdm->header.dd_offset_disk_path = 0;
+
+ /* set 0 to disable an automatic dump-reboot. */
+ fdm->header.max_time_auto = 0;
+
+ /* Kernel dump sections */
+ /* cpu state data section. */
+ fdm->cpu_state_data.request_flag = FADUMP_REQUEST_FLAG;
+ fdm->cpu_state_data.source_data_type = FADUMP_CPU_STATE_DATA;
+ fdm->cpu_state_data.source_address = 0;
+ fdm->cpu_state_data.source_len = fw_dump.cpu_state_data_size;
+ fdm->cpu_state_data.destination_address = addr;
+ addr += fw_dump.cpu_state_data_size;
+
+ /* hpte region section */
+ fdm->hpte_region.request_flag = FADUMP_REQUEST_FLAG;
+ fdm->hpte_region.source_data_type = FADUMP_HPTE_REGION;
+ fdm->hpte_region.source_address = 0;
+ fdm->hpte_region.source_len = fw_dump.hpte_region_size;
+ fdm->hpte_region.destination_address = addr;
+ addr += fw_dump.hpte_region_size;
+
+ /* RMA region section */
+ fdm->rmr_region.request_flag = FADUMP_REQUEST_FLAG;
+ fdm->rmr_region.source_data_type = FADUMP_REAL_MODE_REGION;
+ fdm->rmr_region.source_address = RMA_START;
+ fdm->rmr_region.source_len = fw_dump.boot_memory_size;
+ fdm->rmr_region.destination_address = addr;
+ addr += fw_dump.boot_memory_size;
+
+ return addr;
+}
+
+/**
+ * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM
+ *
+ * Function to find the largest memory size we need to reserve during early
+ * boot process. This will be the size of the memory that is required for a
+ * kernel to boot successfully.
+ *
+ * This function has been taken from phyp-assisted dump feature implementation.
+ *
+ * returns larger of 256MB or 5% rounded down to multiples of 256MB.
+ *
+ * TODO: Come up with better approach to find out more accurate memory size
+ * that is required for a kernel to boot successfully.
+ *
+ */
+static inline unsigned long fadump_calculate_reserve_size(void)
+{
+ unsigned long size;
+
+ /*
+ * Check if the size is specified through fadump_reserve_mem= cmdline
+ * option. If yes, then use that.
+ */
+ if (fw_dump.reserve_bootvar)
+ return fw_dump.reserve_bootvar;
+
+ /* divide by 20 to get 5% of value */
+ size = memblock_end_of_DRAM() / 20;
+
+ /* round it down in multiples of 256 */
+ size = size & ~0x0FFFFFFFUL;
+
+ /* Truncate to memory_limit. We don't want to over reserve the memory.*/
+ if (memory_limit && size > memory_limit)
+ size = memory_limit;
+
+ return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM);
+}
+
+/*
+ * Calculate the total memory size required to be reserved for
+ * firmware-assisted dump registration.
+ */
+static unsigned long get_fadump_area_size(void)
+{
+ unsigned long size = 0;
+
+ size += fw_dump.cpu_state_data_size;
+ size += fw_dump.hpte_region_size;
+ size += fw_dump.boot_memory_size;
+ size += sizeof(struct fadump_crash_info_header);
+ size += sizeof(struct elfhdr); /* ELF core header.*/
+ size += sizeof(struct elf_phdr); /* place holder for cpu notes */
+ /* Program headers for crash memory regions. */
+ size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2);
+
+ size = PAGE_ALIGN(size);
+ return size;
+}
+
+int __init fadump_reserve_mem(void)
+{
+ unsigned long base, size, memory_boundary;
+
+ if (!fw_dump.fadump_enabled)
+ return 0;
+
+ if (!fw_dump.fadump_supported) {
+ printk(KERN_INFO "Firmware-assisted dump is not supported on"
+ " this hardware\n");
+ fw_dump.fadump_enabled = 0;
+ return 0;
+ }
+ /*
+ * Initialize boot memory size
+ * If dump is active then we have already calculated the size during
+ * first kernel.
+ */
+ if (fdm_active)
+ fw_dump.boot_memory_size = fdm_active->rmr_region.source_len;
+ else
+ fw_dump.boot_memory_size = fadump_calculate_reserve_size();
+
+ /*
+ * Calculate the memory boundary.
+ * If memory_limit is less than actual memory boundary then reserve
+ * the memory for fadump beyond the memory_limit and adjust the
+ * memory_limit accordingly, so that the running kernel can run with
+ * specified memory_limit.
+ */
+ if (memory_limit && memory_limit < memblock_end_of_DRAM()) {
+ size = get_fadump_area_size();
+ if ((memory_limit + size) < memblock_end_of_DRAM())
+ memory_limit += size;
+ else
+ memory_limit = memblock_end_of_DRAM();
+ printk(KERN_INFO "Adjusted memory_limit for firmware-assisted"
+ " dump, now %#016llx\n", memory_limit);
+ }
+ if (memory_limit)
+ memory_boundary = memory_limit;
+ else
+ memory_boundary = memblock_end_of_DRAM();
+
+ if (fw_dump.dump_active) {
+ printk(KERN_INFO "Firmware-assisted dump is active.\n");
+ /*
+ * If last boot has crashed then reserve all the memory
+ * above boot_memory_size so that we don't touch it until
+ * dump is written to disk by userspace tool. This memory
+ * will be released for general use once the dump is saved.
+ */
+ base = fw_dump.boot_memory_size;
+ size = memory_boundary - base;
+ memblock_reserve(base, size);
+ printk(KERN_INFO "Reserved %ldMB of memory at %ldMB "
+ "for saving crash dump\n",
+ (unsigned long)(size >> 20),
+ (unsigned long)(base >> 20));
+
+ fw_dump.fadumphdr_addr =
+ fdm_active->rmr_region.destination_address +
+ fdm_active->rmr_region.source_len;
+ pr_debug("fadumphdr_addr = %p\n",
+ (void *) fw_dump.fadumphdr_addr);
+ } else {
+ /* Reserve the memory at the top of memory. */
+ size = get_fadump_area_size();
+ base = memory_boundary - size;
+ memblock_reserve(base, size);
+ printk(KERN_INFO "Reserved %ldMB of memory at %ldMB "
+ "for firmware-assisted dump\n",
+ (unsigned long)(size >> 20),
+ (unsigned long)(base >> 20));
+ }
+ fw_dump.reserve_dump_area_start = base;
+ fw_dump.reserve_dump_area_size = size;
+ return 1;
+}
+
+/* Look for fadump= cmdline option. */
+static int __init early_fadump_param(char *p)
+{
+ if (!p)
+ return 1;
+
+ if (strncmp(p, "on", 2) == 0)
+ fw_dump.fadump_enabled = 1;
+ else if (strncmp(p, "off", 3) == 0)
+ fw_dump.fadump_enabled = 0;
+
+ return 0;
+}
+early_param("fadump", early_fadump_param);
+
+/* Look for fadump_reserve_mem= cmdline option */
+static int __init early_fadump_reserve_mem(char *p)
+{
+ if (p)
+ fw_dump.reserve_bootvar = memparse(p, &p);
+ return 0;
+}
+early_param("fadump_reserve_mem", early_fadump_reserve_mem);
+
+static void register_fw_dump(struct fadump_mem_struct *fdm)
+{
+ int rc;
+ unsigned int wait_time;
+
+ pr_debug("Registering for firmware-assisted kernel dump...\n");
+
+ /* TODO: Add upper time limit for the delay */
+ do {
+ rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+ FADUMP_REGISTER, fdm,
+ sizeof(struct fadump_mem_struct));
+
+ wait_time = rtas_busy_delay_time(rc);
+ if (wait_time)
+ mdelay(wait_time);
+
+ } while (wait_time);
+
+ switch (rc) {
+ case -1:
+ printk(KERN_ERR "Failed to register firmware-assisted kernel"
+ " dump. Hardware Error(%d).\n", rc);
+ break;
+ case -3:
+ printk(KERN_ERR "Failed to register firmware-assisted kernel"
+ " dump. Parameter Error(%d).\n", rc);
+ break;
+ case -9:
+ printk(KERN_ERR "firmware-assisted kernel dump is already "
+ " registered.");
+ fw_dump.dump_registered = 1;
+ break;
+ case 0:
+ printk(KERN_INFO "firmware-assisted kernel dump registration"
+ " is successful\n");
+ fw_dump.dump_registered = 1;
+ break;
+ }
+}
+
+void crash_fadump(struct pt_regs *regs, const char *str)
+{
+ struct fadump_crash_info_header *fdh = NULL;
+
+ if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr)
+ return;
+
+ fdh = __va(fw_dump.fadumphdr_addr);
+ crashing_cpu = smp_processor_id();
+ fdh->crashing_cpu = crashing_cpu;
+ crash_save_vmcoreinfo();
+
+ if (regs)
+ fdh->regs = *regs;
+ else
+ ppc_save_regs(&fdh->regs);
+
+ fdh->cpu_online_mask = *cpu_online_mask;
+
+ /* Call ibm,os-term rtas call to trigger firmware assisted dump */
+ rtas_os_term((char *)str);
+}
+
+#define GPR_MASK 0xffffff0000000000
+static inline int fadump_gpr_index(u64 id)
+{
+ int i = -1;
+ char str[3];
+
+ if ((id & GPR_MASK) == REG_ID("GPR")) {
+ /* get the digits at the end */
+ id &= ~GPR_MASK;
+ id >>= 24;
+ str[2] = '\0';
+ str[1] = id & 0xff;
+ str[0] = (id >> 8) & 0xff;
+ sscanf(str, "%d", &i);
+ if (i > 31)
+ i = -1;
+ }
+ return i;
+}
+
+static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id,
+ u64 reg_val)
+{
+ int i;
+
+ i = fadump_gpr_index(reg_id);
+ if (i >= 0)
+ regs->gpr[i] = (unsigned long)reg_val;
+ else if (reg_id == REG_ID("NIA"))
+ regs->nip = (unsigned long)reg_val;
+ else if (reg_id == REG_ID("MSR"))
+ regs->msr = (unsigned long)reg_val;
+ else if (reg_id == REG_ID("CTR"))
+ regs->ctr = (unsigned long)reg_val;
+ else if (reg_id == REG_ID("LR"))
+ regs->link = (unsigned long)reg_val;
+ else if (reg_id == REG_ID("XER"))
+ regs->xer = (unsigned long)reg_val;
+ else if (reg_id == REG_ID("CR"))
+ regs->ccr = (unsigned long)reg_val;
+ else if (reg_id == REG_ID("DAR"))
+ regs->dar = (unsigned long)reg_val;
+ else if (reg_id == REG_ID("DSISR"))
+ regs->dsisr = (unsigned long)reg_val;
+}
+
+static struct fadump_reg_entry*
+fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs)
+{
+ memset(regs, 0, sizeof(struct pt_regs));
+
+ while (reg_entry->reg_id != REG_ID("CPUEND")) {
+ fadump_set_regval(regs, reg_entry->reg_id,
+ reg_entry->reg_value);
+ reg_entry++;
+ }
+ reg_entry++;
+ return reg_entry;
+}
+
+static u32 *fadump_append_elf_note(u32 *buf, char *name, unsigned type,
+ void *data, size_t data_len)
+{
+ struct elf_note note;
+
+ note.n_namesz = strlen(name) + 1;
+ note.n_descsz = data_len;
+ note.n_type = type;
+ memcpy(buf, &note, sizeof(note));
+ buf += (sizeof(note) + 3)/4;
+ memcpy(buf, name, note.n_namesz);
+ buf += (note.n_namesz + 3)/4;
+ memcpy(buf, data, note.n_descsz);
+ buf += (note.n_descsz + 3)/4;
+
+ return buf;
+}
+
+static void fadump_final_note(u32 *buf)
+{
+ struct elf_note note;
+
+ note.n_namesz = 0;
+ note.n_descsz = 0;
+ note.n_type = 0;
+ memcpy(buf, &note, sizeof(note));
+}
+
+static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
+{
+ struct elf_prstatus prstatus;
+
+ memset(&prstatus, 0, sizeof(prstatus));
+ /*
+ * FIXME: How do i get PID? Do I really need it?
+ * prstatus.pr_pid = ????
+ */
+ elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+ buf = fadump_append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
+ return buf;
+}
+
+static void fadump_update_elfcore_header(char *bufp)
+{
+ struct elfhdr *elf;
+ struct elf_phdr *phdr;
+
+ elf = (struct elfhdr *)bufp;
+ bufp += sizeof(struct elfhdr);
+
+ /* First note is a place holder for cpu notes info. */
+ phdr = (struct elf_phdr *)bufp;
+
+ if (phdr->p_type == PT_NOTE) {
+ phdr->p_paddr = fw_dump.cpu_notes_buf;
+ phdr->p_offset = phdr->p_paddr;
+ phdr->p_filesz = fw_dump.cpu_notes_buf_size;
+ phdr->p_memsz = fw_dump.cpu_notes_buf_size;
+ }
+ return;
+}
+
+static void *fadump_cpu_notes_buf_alloc(unsigned long size)
+{
+ void *vaddr;
+ struct page *page;
+ unsigned long order, count, i;
+
+ order = get_order(size);
+ vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+ if (!vaddr)
+ return NULL;
+
+ count = 1 << order;
+ page = virt_to_page(vaddr);
+ for (i = 0; i < count; i++)
+ SetPageReserved(page + i);
+ return vaddr;
+}
+
+static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size)
+{
+ struct page *page;
+ unsigned long order, count, i;
+
+ order = get_order(size);
+ count = 1 << order;
+ page = virt_to_page(vaddr);
+ for (i = 0; i < count; i++)
+ ClearPageReserved(page + i);
+ __free_pages(page, order);
+}
+
+/*
+ * Read CPU state dump data and convert it into ELF notes.
+ * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be
+ * used to access the data to allow for additional fields to be added without
+ * affecting compatibility. Each list of registers for a CPU starts with
+ * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes,
+ * 8 Byte ASCII identifier and 8 Byte register value. The register entry
+ * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part
+ * of register value. For more details refer to PAPR document.
+ *
+ * Only for the crashing cpu we ignore the CPU dump data and get exact
+ * state from fadump crash info structure populated by first kernel at the
+ * time of crash.
+ */
+static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
+{
+ struct fadump_reg_save_area_header *reg_header;
+ struct fadump_reg_entry *reg_entry;
+ struct fadump_crash_info_header *fdh = NULL;
+ void *vaddr;
+ unsigned long addr;
+ u32 num_cpus, *note_buf;
+ struct pt_regs regs;
+ int i, rc = 0, cpu = 0;
+
+ if (!fdm->cpu_state_data.bytes_dumped)
+ return -EINVAL;
+
+ addr = fdm->cpu_state_data.destination_address;
+ vaddr = __va(addr);
+
+ reg_header = vaddr;
+ if (reg_header->magic_number != REGSAVE_AREA_MAGIC) {
+ printk(KERN_ERR "Unable to read register save area.\n");
+ return -ENOENT;
+ }
+ pr_debug("--------CPU State Data------------\n");
+ pr_debug("Magic Number: %llx\n", reg_header->magic_number);
+ pr_debug("NumCpuOffset: %x\n", reg_header->num_cpu_offset);
+
+ vaddr += reg_header->num_cpu_offset;
+ num_cpus = *((u32 *)(vaddr));
+ pr_debug("NumCpus : %u\n", num_cpus);
+ vaddr += sizeof(u32);
+ reg_entry = (struct fadump_reg_entry *)vaddr;
+
+ /* Allocate buffer to hold cpu crash notes. */
+ fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t);
+ fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size);
+ note_buf = fadump_cpu_notes_buf_alloc(fw_dump.cpu_notes_buf_size);
+ if (!note_buf) {
+ printk(KERN_ERR "Failed to allocate 0x%lx bytes for "
+ "cpu notes buffer\n", fw_dump.cpu_notes_buf_size);
+ return -ENOMEM;
+ }
+ fw_dump.cpu_notes_buf = __pa(note_buf);
+
+ pr_debug("Allocated buffer for cpu notes of size %ld at %p\n",
+ (num_cpus * sizeof(note_buf_t)), note_buf);
+
+ if (fw_dump.fadumphdr_addr)
+ fdh = __va(fw_dump.fadumphdr_addr);
+
+ for (i = 0; i < num_cpus; i++) {
+ if (reg_entry->reg_id != REG_ID("CPUSTRT")) {
+ printk(KERN_ERR "Unable to read CPU state data\n");
+ rc = -ENOENT;
+ goto error_out;
+ }
+ /* Lower 4 bytes of reg_value contains logical cpu id */
+ cpu = reg_entry->reg_value & FADUMP_CPU_ID_MASK;
+ if (fdh && !cpumask_test_cpu(cpu, &fdh->cpu_online_mask)) {
+ SKIP_TO_NEXT_CPU(reg_entry);
+ continue;
+ }
+ pr_debug("Reading register data for cpu %d...\n", cpu);
+ if (fdh && fdh->crashing_cpu == cpu) {
+ regs = fdh->regs;
+ note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+ SKIP_TO_NEXT_CPU(reg_entry);
+ } else {
+ reg_entry++;
+ reg_entry = fadump_read_registers(reg_entry, &regs);
+ note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+ }
+ }
+ fadump_final_note(note_buf);
+
+ if (fdh) {
+ pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+ fdh->elfcorehdr_addr);
+ fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr));
+ }
+ return 0;
+
+error_out:
+ fadump_cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf),
+ fw_dump.cpu_notes_buf_size);
+ fw_dump.cpu_notes_buf = 0;
+ fw_dump.cpu_notes_buf_size = 0;
+ return rc;
+
+}
+
+/*
+ * Validate and process the dump data stored by firmware before exporting
+ * it through '/proc/vmcore'.
+ */
+static int __init process_fadump(const struct fadump_mem_struct *fdm_active)
+{
+ struct fadump_crash_info_header *fdh;
+ int rc = 0;
+
+ if (!fdm_active || !fw_dump.fadumphdr_addr)
+ return -EINVAL;
+
+ /* Check if the dump data is valid. */
+ if ((fdm_active->header.dump_status_flag == FADUMP_ERROR_FLAG) ||
+ (fdm_active->cpu_state_data.error_flags != 0) ||
+ (fdm_active->rmr_region.error_flags != 0)) {
+ printk(KERN_ERR "Dump taken by platform is not valid\n");
+ return -EINVAL;
+ }
+ if ((fdm_active->rmr_region.bytes_dumped !=
+ fdm_active->rmr_region.source_len) ||
+ !fdm_active->cpu_state_data.bytes_dumped) {
+ printk(KERN_ERR "Dump taken by platform is incomplete\n");
+ return -EINVAL;
+ }
+
+ /* Validate the fadump crash info header */
+ fdh = __va(fw_dump.fadumphdr_addr);
+ if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+ printk(KERN_ERR "Crash info header is not valid.\n");
+ return -EINVAL;
+ }
+
+ rc = fadump_build_cpu_notes(fdm_active);
+ if (rc)
+ return rc;
+
+ /*
+ * We are done validating dump info and elfcore header is now ready
+ * to be exported. set elfcorehdr_addr so that vmcore module will
+ * export the elfcore header through '/proc/vmcore'.
+ */
+ elfcorehdr_addr = fdh->elfcorehdr_addr;
+
+ return 0;
+}
+
+static inline void fadump_add_crash_memory(unsigned long long base,
+ unsigned long long end)
+{
+ if (base == end)
+ return;
+
+ pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n",
+ crash_mem_ranges, base, end - 1, (end - base));
+ crash_memory_ranges[crash_mem_ranges].base = base;
+ crash_memory_ranges[crash_mem_ranges].size = end - base;
+ crash_mem_ranges++;
+}
+
+static void fadump_exclude_reserved_area(unsigned long long start,
+ unsigned long long end)
+{
+ unsigned long long ra_start, ra_end;
+
+ ra_start = fw_dump.reserve_dump_area_start;
+ ra_end = ra_start + fw_dump.reserve_dump_area_size;
+
+ if ((ra_start < end) && (ra_end > start)) {
+ if ((start < ra_start) && (end > ra_end)) {
+ fadump_add_crash_memory(start, ra_start);
+ fadump_add_crash_memory(ra_end, end);
+ } else if (start < ra_start) {
+ fadump_add_crash_memory(start, ra_start);
+ } else if (ra_end < end) {
+ fadump_add_crash_memory(ra_end, end);
+ }
+ } else
+ fadump_add_crash_memory(start, end);
+}
+
+static int fadump_init_elfcore_header(char *bufp)
+{
+ struct elfhdr *elf;
+
+ elf = (struct elfhdr *) bufp;
+ bufp += sizeof(struct elfhdr);
+ memcpy(elf->e_ident, ELFMAG, SELFMAG);
+ elf->e_ident[EI_CLASS] = ELF_CLASS;
+ elf->e_ident[EI_DATA] = ELF_DATA;
+ elf->e_ident[EI_VERSION] = EV_CURRENT;
+ elf->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+ elf->e_type = ET_CORE;
+ elf->e_machine = ELF_ARCH;
+ elf->e_version = EV_CURRENT;
+ elf->e_entry = 0;
+ elf->e_phoff = sizeof(struct elfhdr);
+ elf->e_shoff = 0;
+ elf->e_flags = ELF_CORE_EFLAGS;
+ elf->e_ehsize = sizeof(struct elfhdr);
+ elf->e_phentsize = sizeof(struct elf_phdr);
+ elf->e_phnum = 0;
+ elf->e_shentsize = 0;
+ elf->e_shnum = 0;
+ elf->e_shstrndx = 0;
+
+ return 0;
+}
+
+/*
+ * Traverse through memblock structure and setup crash memory ranges. These
+ * ranges will be used create PT_LOAD program headers in elfcore header.
+ */
+static void fadump_setup_crash_memory_ranges(void)
+{
+ struct memblock_region *reg;
+ unsigned long long start, end;
+
+ pr_debug("Setup crash memory ranges.\n");
+ crash_mem_ranges = 0;
+ /*
+ * add the first memory chunk (RMA_START through boot_memory_size) as
+ * a separate memory chunk. The reason is, at the time crash firmware
+ * will move the content of this memory chunk to different location
+ * specified during fadump registration. We need to create a separate
+ * program header for this chunk with the correct offset.
+ */
+ fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size);
+
+ for_each_memblock(memory, reg) {
+ start = (unsigned long long)reg->base;
+ end = start + (unsigned long long)reg->size;
+ if (start == RMA_START && end >= fw_dump.boot_memory_size)
+ start = fw_dump.boot_memory_size;
+
+ /* add this range excluding the reserved dump area. */
+ fadump_exclude_reserved_area(start, end);
+ }
+}
+
+/*
+ * If the given physical address falls within the boot memory region then
+ * return the relocated address that points to the dump region reserved
+ * for saving initial boot memory contents.
+ */
+static inline unsigned long fadump_relocate(unsigned long paddr)
+{
+ if (paddr > RMA_START && paddr < fw_dump.boot_memory_size)
+ return fdm.rmr_region.destination_address + paddr;
+ else
+ return paddr;
+}
+
+static int fadump_create_elfcore_headers(char *bufp)
+{
+ struct elfhdr *elf;
+ struct elf_phdr *phdr;
+ int i;
+
+ fadump_init_elfcore_header(bufp);
+ elf = (struct elfhdr *)bufp;
+ bufp += sizeof(struct elfhdr);
+
+ /*
+ * setup ELF PT_NOTE, place holder for cpu notes info. The notes info
+ * will be populated during second kernel boot after crash. Hence
+ * this PT_NOTE will always be the first elf note.
+ *
+ * NOTE: Any new ELF note addition should be placed after this note.
+ */
+ phdr = (struct elf_phdr *)bufp;
+ bufp += sizeof(struct elf_phdr);
+ phdr->p_type = PT_NOTE;
+ phdr->p_flags = 0;
+ phdr->p_vaddr = 0;
+ phdr->p_align = 0;
+
+ phdr->p_offset = 0;
+ phdr->p_paddr = 0;
+ phdr->p_filesz = 0;
+ phdr->p_memsz = 0;
+
+ (elf->e_phnum)++;
+
+ /* setup ELF PT_NOTE for vmcoreinfo */
+ phdr = (struct elf_phdr *)bufp;
+ bufp += sizeof(struct elf_phdr);
+ phdr->p_type = PT_NOTE;
+ phdr->p_flags = 0;
+ phdr->p_vaddr = 0;
+ phdr->p_align = 0;
+
+ phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note());
+ phdr->p_offset = phdr->p_paddr;
+ phdr->p_memsz = vmcoreinfo_max_size;
+ phdr->p_filesz = vmcoreinfo_max_size;
+
+ /* Increment number of program headers. */
+ (elf->e_phnum)++;
+
+ /* setup PT_LOAD sections. */
+
+ for (i = 0; i < crash_mem_ranges; i++) {
+ unsigned long long mbase, msize;
+ mbase = crash_memory_ranges[i].base;
+ msize = crash_memory_ranges[i].size;
+
+ if (!msize)
+ continue;
+
+ phdr = (struct elf_phdr *)bufp;
+ bufp += sizeof(struct elf_phdr);
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_offset = mbase;
+
+ if (mbase == RMA_START) {
+ /*
+ * The entire RMA region will be moved by firmware
+ * to the specified destination_address. Hence set
+ * the correct offset.
+ */
+ phdr->p_offset = fdm.rmr_region.destination_address;
+ }
+
+ phdr->p_paddr = mbase;
+ phdr->p_vaddr = (unsigned long)__va(mbase);
+ phdr->p_filesz = msize;
+ phdr->p_memsz = msize;
+ phdr->p_align = 0;
+
+ /* Increment number of program headers. */
+ (elf->e_phnum)++;
+ }
+ return 0;
+}
+
+static unsigned long init_fadump_header(unsigned long addr)
+{
+ struct fadump_crash_info_header *fdh;
+
+ if (!addr)
+ return 0;
+
+ fw_dump.fadumphdr_addr = addr;
+ fdh = __va(addr);
+ addr += sizeof(struct fadump_crash_info_header);
+
+ memset(fdh, 0, sizeof(struct fadump_crash_info_header));
+ fdh->magic_number = FADUMP_CRASH_INFO_MAGIC;
+ fdh->elfcorehdr_addr = addr;
+ /* We will set the crashing cpu id in crash_fadump() during crash. */
+ fdh->crashing_cpu = CPU_UNKNOWN;
+
+ return addr;
+}
+
+static void register_fadump(void)
+{
+ unsigned long addr;
+ void *vaddr;
+
+ /*
+ * If no memory is reserved then we can not register for firmware-
+ * assisted dump.
+ */
+ if (!fw_dump.reserve_dump_area_size)
+ return;
+
+ fadump_setup_crash_memory_ranges();
+
+ addr = fdm.rmr_region.destination_address + fdm.rmr_region.source_len;
+ /* Initialize fadump crash info header. */
+ addr = init_fadump_header(addr);
+ vaddr = __va(addr);
+
+ pr_debug("Creating ELF core headers at %#016lx\n", addr);
+ fadump_create_elfcore_headers(vaddr);
+
+ /* register the future kernel dump with firmware. */
+ register_fw_dump(&fdm);
+}
+
+static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
+{
+ int rc = 0;
+ unsigned int wait_time;
+
+ pr_debug("Un-register firmware-assisted dump\n");
+
+ /* TODO: Add upper time limit for the delay */
+ do {
+ rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+ FADUMP_UNREGISTER, fdm,
+ sizeof(struct fadump_mem_struct));
+
+ wait_time = rtas_busy_delay_time(rc);
+ if (wait_time)
+ mdelay(wait_time);
+ } while (wait_time);
+
+ if (rc) {
+ printk(KERN_ERR "Failed to un-register firmware-assisted dump."
+ " unexpected error(%d).\n", rc);
+ return rc;
+ }
+ fw_dump.dump_registered = 0;
+ return 0;
+}
+
+static int fadump_invalidate_dump(struct fadump_mem_struct *fdm)
+{
+ int rc = 0;
+ unsigned int wait_time;
+
+ pr_debug("Invalidating firmware-assisted dump registration\n");
+
+ /* TODO: Add upper time limit for the delay */
+ do {
+ rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+ FADUMP_INVALIDATE, fdm,
+ sizeof(struct fadump_mem_struct));
+
+ wait_time = rtas_busy_delay_time(rc);
+ if (wait_time)
+ mdelay(wait_time);
+ } while (wait_time);
+
+ if (rc) {
+ printk(KERN_ERR "Failed to invalidate firmware-assisted dump "
+ "rgistration. unexpected error(%d).\n", rc);
+ return rc;
+ }
+ fw_dump.dump_active = 0;
+ fdm_active = NULL;
+ return 0;
+}
+
+void fadump_cleanup(void)
+{
+ /* Invalidate the registration only if dump is active. */
+ if (fw_dump.dump_active) {
+ init_fadump_mem_struct(&fdm,
+ fdm_active->cpu_state_data.destination_address);
+ fadump_invalidate_dump(&fdm);
+ }
+}
+
+/*
+ * Release the memory that was reserved in early boot to preserve the memory
+ * contents. The released memory will be available for general use.
+ */
+static void fadump_release_memory(unsigned long begin, unsigned long end)
+{
+ unsigned long addr;
+ unsigned long ra_start, ra_end;
+
+ ra_start = fw_dump.reserve_dump_area_start;
+ ra_end = ra_start + fw_dump.reserve_dump_area_size;
+
+ for (addr = begin; addr < end; addr += PAGE_SIZE) {
+ /*
+ * exclude the dump reserve area. Will reuse it for next
+ * fadump registration.
+ */
+ if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start))
+ continue;
+
+ free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+ }
+}
+
+static void fadump_invalidate_release_mem(void)
+{
+ unsigned long reserved_area_start, reserved_area_end;
+ unsigned long destination_address;
+
+ mutex_lock(&fadump_mutex);
+ if (!fw_dump.dump_active) {
+ mutex_unlock(&fadump_mutex);
+ return;
+ }
+
+ destination_address = fdm_active->cpu_state_data.destination_address;
+ fadump_cleanup();
+ mutex_unlock(&fadump_mutex);
+
+ /*
+ * Save the current reserved memory bounds we will require them
+ * later for releasing the memory for general use.
+ */
+ reserved_area_start = fw_dump.reserve_dump_area_start;
+ reserved_area_end = reserved_area_start +
+ fw_dump.reserve_dump_area_size;
+ /*
+ * Setup reserve_dump_area_start and its size so that we can
+ * reuse this reserved memory for Re-registration.
+ */
+ fw_dump.reserve_dump_area_start = destination_address;
+ fw_dump.reserve_dump_area_size = get_fadump_area_size();
+
+ fadump_release_memory(reserved_area_start, reserved_area_end);
+ if (fw_dump.cpu_notes_buf) {
+ fadump_cpu_notes_buf_free(
+ (unsigned long)__va(fw_dump.cpu_notes_buf),
+ fw_dump.cpu_notes_buf_size);
+ fw_dump.cpu_notes_buf = 0;
+ fw_dump.cpu_notes_buf_size = 0;
+ }
+ /* Initialize the kernel dump memory structure for FAD registration. */
+ init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
+}
+
+static ssize_t fadump_release_memory_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!fw_dump.dump_active)
+ return -EPERM;
+
+ if (buf[0] == '1') {
+ /*
+ * Take away the '/proc/vmcore'. We are releasing the dump
+ * memory, hence it will not be valid anymore.
+ */
+ vmcore_cleanup();
+ fadump_invalidate_release_mem();
+
+ } else
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t fadump_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", fw_dump.fadump_enabled);
+}
+
+static ssize_t fadump_register_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", fw_dump.dump_registered);
+}
+
+static ssize_t fadump_register_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ret = 0;
+
+ if (!fw_dump.fadump_enabled || fdm_active)
+ return -EPERM;
+
+ mutex_lock(&fadump_mutex);
+
+ switch (buf[0]) {
+ case '0':
+ if (fw_dump.dump_registered == 0) {
+ ret = -EINVAL;
+ goto unlock_out;
+ }
+ /* Un-register Firmware-assisted dump */
+ fadump_unregister_dump(&fdm);
+ break;
+ case '1':
+ if (fw_dump.dump_registered == 1) {
+ ret = -EINVAL;
+ goto unlock_out;
+ }
+ /* Register Firmware-assisted dump */
+ register_fadump();
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+unlock_out:
+ mutex_unlock(&fadump_mutex);
+ return ret < 0 ? ret : count;
+}
+
+static int fadump_region_show(struct seq_file *m, void *private)
+{
+ const struct fadump_mem_struct *fdm_ptr;
+
+ if (!fw_dump.fadump_enabled)
+ return 0;
+
+ mutex_lock(&fadump_mutex);
+ if (fdm_active)
+ fdm_ptr = fdm_active;
+ else {
+ mutex_unlock(&fadump_mutex);
+ fdm_ptr = &fdm;
+ }
+
+ seq_printf(m,
+ "CPU : [%#016llx-%#016llx] %#llx bytes, "
+ "Dumped: %#llx\n",
+ fdm_ptr->cpu_state_data.destination_address,
+ fdm_ptr->cpu_state_data.destination_address +
+ fdm_ptr->cpu_state_data.source_len - 1,
+ fdm_ptr->cpu_state_data.source_len,
+ fdm_ptr->cpu_state_data.bytes_dumped);
+ seq_printf(m,
+ "HPTE: [%#016llx-%#016llx] %#llx bytes, "
+ "Dumped: %#llx\n",
+ fdm_ptr->hpte_region.destination_address,
+ fdm_ptr->hpte_region.destination_address +
+ fdm_ptr->hpte_region.source_len - 1,
+ fdm_ptr->hpte_region.source_len,
+ fdm_ptr->hpte_region.bytes_dumped);
+ seq_printf(m,
+ "DUMP: [%#016llx-%#016llx] %#llx bytes, "
+ "Dumped: %#llx\n",
+ fdm_ptr->rmr_region.destination_address,
+ fdm_ptr->rmr_region.destination_address +
+ fdm_ptr->rmr_region.source_len - 1,
+ fdm_ptr->rmr_region.source_len,
+ fdm_ptr->rmr_region.bytes_dumped);
+
+ if (!fdm_active ||
+ (fw_dump.reserve_dump_area_start ==
+ fdm_ptr->cpu_state_data.destination_address))
+ goto out;
+
+ /* Dump is active. Show reserved memory region. */
+ seq_printf(m,
+ " : [%#016llx-%#016llx] %#llx bytes, "
+ "Dumped: %#llx\n",
+ (unsigned long long)fw_dump.reserve_dump_area_start,
+ fdm_ptr->cpu_state_data.destination_address - 1,
+ fdm_ptr->cpu_state_data.destination_address -
+ fw_dump.reserve_dump_area_start,
+ fdm_ptr->cpu_state_data.destination_address -
+ fw_dump.reserve_dump_area_start);
+out:
+ if (fdm_active)
+ mutex_unlock(&fadump_mutex);
+ return 0;
+}
+
+static struct kobj_attribute fadump_release_attr = __ATTR(fadump_release_mem,
+ 0200, NULL,
+ fadump_release_memory_store);
+static struct kobj_attribute fadump_attr = __ATTR(fadump_enabled,
+ 0444, fadump_enabled_show,
+ NULL);
+static struct kobj_attribute fadump_register_attr = __ATTR(fadump_registered,
+ 0644, fadump_register_show,
+ fadump_register_store);
+
+static int fadump_region_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, fadump_region_show, inode->i_private);
+}
+
+static const struct file_operations fadump_region_fops = {
+ .open = fadump_region_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void fadump_init_files(void)
+{
+ struct dentry *debugfs_file;
+ int rc = 0;
+
+ rc = sysfs_create_file(kernel_kobj, &fadump_attr.attr);
+ if (rc)
+ printk(KERN_ERR "fadump: unable to create sysfs file"
+ " fadump_enabled (%d)\n", rc);
+
+ rc = sysfs_create_file(kernel_kobj, &fadump_register_attr.attr);
+ if (rc)
+ printk(KERN_ERR "fadump: unable to create sysfs file"
+ " fadump_registered (%d)\n", rc);
+
+ debugfs_file = debugfs_create_file("fadump_region", 0444,
+ powerpc_debugfs_root, NULL,
+ &fadump_region_fops);
+ if (!debugfs_file)
+ printk(KERN_ERR "fadump: unable to create debugfs file"
+ " fadump_region\n");
+
+ if (fw_dump.dump_active) {
+ rc = sysfs_create_file(kernel_kobj, &fadump_release_attr.attr);
+ if (rc)
+ printk(KERN_ERR "fadump: unable to create sysfs file"
+ " fadump_release_mem (%d)\n", rc);
+ }
+ return;
+}
+
+/*
+ * Prepare for firmware-assisted dump.
+ */
+int __init setup_fadump(void)
+{
+ if (!fw_dump.fadump_enabled)
+ return 0;
+
+ if (!fw_dump.fadump_supported) {
+ printk(KERN_ERR "Firmware-assisted dump is not supported on"
+ " this hardware\n");
+ return 0;
+ }
+
+ fadump_show_config();
+ /*
+ * If dump data is available then see if it is valid and prepare for
+ * saving it to the disk.
+ */
+ if (fw_dump.dump_active) {
+ /*
+ * if dump process fails then invalidate the registration
+ * and release memory before proceeding for re-registration.
+ */
+ if (process_fadump(fdm_active) < 0)
+ fadump_invalidate_release_mem();
+ }
+ /* Initialize the kernel dump memory structure for FAD registration. */
+ else if (fw_dump.reserve_dump_area_size)
+ init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
+ fadump_init_files();
+
+ return 1;
+}
+subsys_initcall(setup_fadump);
diff --git a/arch/powerpc/kernel/firmware.c b/arch/powerpc/kernel/firmware.c
index 6b1f4271eb5..2eae4478f7a 100644
--- a/arch/powerpc/kernel/firmware.c
+++ b/arch/powerpc/kernel/firmware.c
@@ -13,7 +13,8 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/cache.h>
#include <asm/firmware.h>
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index e86c040ae58..9ad236e5d2c 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -23,9 +23,10 @@
#include <asm/thread_info.h>
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
#ifdef CONFIG_VSX
-#define REST_32FPVSRS(n,c,base) \
+#define __REST_32FPVSRS(n,c,base) \
BEGIN_FTR_SECTION \
b 2f; \
END_FTR_SECTION_IFSET(CPU_FTR_VSX); \
@@ -34,7 +35,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX); \
2: REST_32VSRS(n,c,base); \
3:
-#define SAVE_32FPVSRS(n,c,base) \
+#define __SAVE_32FPVSRS(n,c,base) \
BEGIN_FTR_SECTION \
b 2f; \
END_FTR_SECTION_IFSET(CPU_FTR_VSX); \
@@ -43,9 +44,77 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX); \
2: SAVE_32VSRS(n,c,base); \
3:
#else
-#define REST_32FPVSRS(n,b,base) REST_32FPRS(n, base)
-#define SAVE_32FPVSRS(n,b,base) SAVE_32FPRS(n, base)
+#define __REST_32FPVSRS(n,b,base) REST_32FPRS(n, base)
+#define __SAVE_32FPVSRS(n,b,base) SAVE_32FPRS(n, base)
#endif
+#define REST_32FPVSRS(n,c,base) __REST_32FPVSRS(n,__REG_##c,__REG_##base)
+#define SAVE_32FPVSRS(n,c,base) __SAVE_32FPVSRS(n,__REG_##c,__REG_##base)
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/* void do_load_up_transact_fpu(struct thread_struct *thread)
+ *
+ * This is similar to load_up_fpu but for the transactional version of the FP
+ * register set. It doesn't mess with the task MSR or valid flags.
+ * Furthermore, we don't do lazy FP with TM currently.
+ */
+_GLOBAL(do_load_up_transact_fpu)
+ mfmsr r6
+ ori r5,r6,MSR_FP
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+ oris r5,r5,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+ SYNC
+ MTMSRD(r5)
+
+ addi r7,r3,THREAD_TRANSACT_FPSTATE
+ lfd fr0,FPSTATE_FPSCR(r7)
+ MTFSF_L(fr0)
+ REST_32FPVSRS(0, R4, R7)
+
+ /* FP/VSX off again */
+ MTMSRD(r6)
+ SYNC
+
+ blr
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+/*
+ * Enable use of the FPU, and VSX if possible, for the caller.
+ */
+_GLOBAL(fp_enable)
+ mfmsr r3
+ ori r3,r3,MSR_FP
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+ oris r3,r3,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+ SYNC
+ MTMSRD(r3)
+ isync /* (not necessary for arch 2.02 and later) */
+ blr
+
+/*
+ * Load state from memory into FP registers including FPSCR.
+ * Assumes the caller has enabled FP in the MSR.
+ */
+_GLOBAL(load_fp_state)
+ lfd fr0,FPSTATE_FPSCR(r3)
+ MTFSF_L(fr0)
+ REST_32FPVSRS(0, R4, R3)
+ blr
+
+/*
+ * Store FP state into memory, including FPSCR
+ * Assumes the caller has enabled FP in the MSR.
+ */
+_GLOBAL(store_fp_state)
+ SAVE_32FPVSRS(0, R4, R3)
+ mffs fr0
+ stfd fr0,FPSTATE_FPSCR(r3)
+ blr
/*
* This task wants to use the FPU now.
@@ -53,6 +122,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX); \
* and save its floating-point registers in its thread_struct.
* Load up this task's FP registers from its thread_struct,
* enable the FPU for the current task and return to the task.
+ * Note that on 32-bit this can only use registers that will be
+ * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
*/
_GLOBAL(load_up_fpu)
mfmsr r5
@@ -78,9 +149,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
beq 1f
toreal(r4)
addi r4,r4,THREAD /* want last_task_used_math->thread */
- SAVE_32FPVSRS(0, r5, r4)
+ addi r10,r4,THREAD_FPSTATE
+ SAVE_32FPVSRS(0, R5, R10)
mffs fr0
- stfd fr0,THREAD_FPSCR(r4)
+ stfd fr0,FPSTATE_FPSCR(r10)
PPC_LL r5,PT_REGS(r4)
toreal(r5)
PPC_LL r4,_MSR-STACK_FRAME_OVERHEAD(r5)
@@ -91,7 +163,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
#endif /* CONFIG_SMP */
/* enable use of FP after return */
#ifdef CONFIG_PPC32
- mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */
+ mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */
lwz r4,THREAD_FPEXC_MODE(r5)
ori r9,r9,MSR_FP /* enable FP for current */
or r9,r9,r4
@@ -103,9 +175,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
or r12,r12,r4
std r12,_MSR(r1)
#endif
- lfd fr0,THREAD_FPSCR(r5)
+ addi r10,r5,THREAD_FPSTATE
+ lfd fr0,FPSTATE_FPSCR(r10)
MTFSF_L(fr0)
- REST_32FPVSRS(0, r4, r5)
+ REST_32FPVSRS(0, R4, R10)
#ifndef CONFIG_SMP
subi r4,r5,THREAD
fromreal(r4)
@@ -137,11 +210,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
PPC_LCMPI 0,r3,0
beqlr- /* if no previous owner, done */
addi r3,r3,THREAD /* want THREAD of task */
+ PPC_LL r6,THREAD_FPSAVEAREA(r3)
PPC_LL r5,PT_REGS(r3)
- PPC_LCMPI 0,r5,0
- SAVE_32FPVSRS(0, r4 ,r3)
+ PPC_LCMPI 0,r6,0
+ bne 2f
+ addi r6,r3,THREAD_FPSTATE
+2: PPC_LCMPI 0,r5,0
+ SAVE_32FPVSRS(0, R4, R6)
mffs fr0
- stfd fr0,THREAD_FPSCR(r3)
+ stfd fr0,FPSTATE_FPSCR(r6)
beq 1f
PPC_LL r4,_MSR-STACK_FRAME_OVERHEAD(r5)
li r3,MSR_FP|MSR_FE0|MSR_FE1
diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/fsl_booke_entry_mapping.S
index a92c79be272..f22e7e44fbf 100644
--- a/arch/powerpc/kernel/fsl_booke_entry_mapping.S
+++ b/arch/powerpc/kernel/fsl_booke_entry_mapping.S
@@ -176,6 +176,8 @@ skpinv: addi r6,r6,1 /* Increment */
/* 7. Jump to KERNELBASE mapping */
lis r6,(KERNELBASE & ~0xfff)@h
ori r6,r6,(KERNELBASE & ~0xfff)@l
+ rlwinm r7,r25,0,0x03ffffff
+ add r6,r7,r6
#elif defined(ENTRY_MAPPING_KEXEC_SETUP)
/*
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index ce1f3e44c24..d178834fe50 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -10,6 +10,8 @@
*
*/
+#define pr_fmt(fmt) "ftrace-powerpc: " fmt
+
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include <linux/uaccess.h>
@@ -22,6 +24,7 @@
#include <asm/cacheflush.h>
#include <asm/code-patching.h>
#include <asm/ftrace.h>
+#include <asm/syscall.h>
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -62,11 +65,9 @@ ftrace_modify_code(unsigned long ip, unsigned int old, unsigned int new)
return -EINVAL;
/* replace the text with the new text */
- if (probe_kernel_write((void *)ip, &new, MCOUNT_INSN_SIZE))
+ if (patch_instruction((unsigned int *)ip, new))
return -EPERM;
- flush_icache_range(ip, ip + 8);
-
return 0;
}
@@ -75,6 +76,7 @@ ftrace_modify_code(unsigned long ip, unsigned int old, unsigned int new)
*/
static int test_24bit_addr(unsigned long ip, unsigned long addr)
{
+ addr = ppc_function_entry((void *)addr);
/* use the create_branch to verify that this offset can be branched */
return create_branch((unsigned int *)ip, addr, 0);
@@ -105,11 +107,9 @@ __ftrace_make_nop(struct module *mod,
struct dyn_ftrace *rec, unsigned long addr)
{
unsigned int op;
- unsigned int jmp[5];
- unsigned long ptr;
+ unsigned long entry, ptr;
unsigned long ip = rec->ip;
- unsigned long tramp;
- int offset;
+ void *tramp;
/* read where this goes */
if (probe_kernel_read(&op, (void *)ip, sizeof(int)))
@@ -117,106 +117,52 @@ __ftrace_make_nop(struct module *mod,
/* Make sure that that this is still a 24bit jump */
if (!is_bl_op(op)) {
- printk(KERN_ERR "Not expected bl: opcode is %x\n", op);
+ pr_err("Not expected bl: opcode is %x\n", op);
return -EINVAL;
}
/* lets find where the pointer goes */
- tramp = find_bl_target(ip, op);
+ tramp = (void *)find_bl_target(ip, op);
- /*
- * On PPC64 the trampoline looks like:
- * 0x3d, 0x82, 0x00, 0x00, addis r12,r2, <high>
- * 0x39, 0x8c, 0x00, 0x00, addi r12,r12, <low>
- * Where the bytes 2,3,6 and 7 make up the 32bit offset
- * to the TOC that holds the pointer.
- * to jump to.
- * 0xf8, 0x41, 0x00, 0x28, std r2,40(r1)
- * 0xe9, 0x6c, 0x00, 0x20, ld r11,32(r12)
- * The actually address is 32 bytes from the offset
- * into the TOC.
- * 0xe8, 0x4c, 0x00, 0x28, ld r2,40(r12)
- */
-
- pr_devel("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc);
+ pr_devel("ip:%lx jumps to %p", ip, tramp);
- /* Find where the trampoline jumps to */
- if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
- printk(KERN_ERR "Failed to read %lx\n", tramp);
- return -EFAULT;
- }
-
- pr_devel(" %08x %08x", jmp[0], jmp[1]);
-
- /* verify that this is what we expect it to be */
- if (((jmp[0] & 0xffff0000) != 0x3d820000) ||
- ((jmp[1] & 0xffff0000) != 0x398c0000) ||
- (jmp[2] != 0xf8410028) ||
- (jmp[3] != 0xe96c0020) ||
- (jmp[4] != 0xe84c0028)) {
- printk(KERN_ERR "Not a trampoline\n");
+ if (!is_module_trampoline(tramp)) {
+ pr_err("Not a trampoline\n");
return -EINVAL;
}
- /* The bottom half is signed extended */
- offset = ((unsigned)((unsigned short)jmp[0]) << 16) +
- (int)((short)jmp[1]);
-
- pr_devel(" %x ", offset);
-
- /* get the address this jumps too */
- tramp = mod->arch.toc + offset + 32;
- pr_devel("toc: %lx", tramp);
-
- if (probe_kernel_read(jmp, (void *)tramp, 8)) {
- printk(KERN_ERR "Failed to read %lx\n", tramp);
+ if (module_trampoline_target(mod, tramp, &ptr)) {
+ pr_err("Failed to get trampoline target\n");
return -EFAULT;
}
- pr_devel(" %08x %08x\n", jmp[0], jmp[1]);
-
- ptr = ((unsigned long)jmp[0] << 32) + jmp[1];
+ pr_devel("trampoline target %lx", ptr);
+ entry = ppc_global_function_entry((void *)addr);
/* This should match what was called */
- if (ptr != ppc_function_entry((void *)addr)) {
- printk(KERN_ERR "addr does not match %lx\n", ptr);
- return -EINVAL;
- }
-
- /*
- * We want to nop the line, but the next line is
- * 0xe8, 0x41, 0x00, 0x28 ld r2,40(r1)
- * This needs to be turned to a nop too.
- */
- if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- if (op != 0xe8410028) {
- printk(KERN_ERR "Next line is not ld! (%08x)\n", op);
+ if (ptr != entry) {
+ pr_err("addr %lx does not match expected %lx\n", ptr, entry);
return -EINVAL;
}
/*
- * Milton Miller pointed out that we can not blindly do nops.
- * If a task was preempted when calling a trace function,
- * the nops will remove the way to restore the TOC in r2
- * and the r2 TOC will get corrupted.
- */
-
- /*
- * Replace:
- * bl <tramp> <==== will be replaced with "b 1f"
- * ld r2,40(r1)
- * 1:
+ * Our original call site looks like:
+ *
+ * bl <tramp>
+ * ld r2,XX(r1)
+ *
+ * Milton Miller pointed out that we can not simply nop the branch.
+ * If a task was preempted when calling a trace function, the nops
+ * will remove the way to restore the TOC in r2 and the r2 TOC will
+ * get corrupted.
+ *
+ * Use a b +8 to jump over the load.
*/
op = 0x48000008; /* b +8 */
- if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+ if (patch_instruction((unsigned int *)ip, op))
return -EPERM;
-
- flush_icache_range(ip, ip + 8);
-
return 0;
}
@@ -235,7 +181,7 @@ __ftrace_make_nop(struct module *mod,
/* Make sure that that this is still a 24bit jump */
if (!is_bl_op(op)) {
- printk(KERN_ERR "Not expected bl: opcode is %x\n", op);
+ pr_err("Not expected bl: opcode is %x\n", op);
return -EINVAL;
}
@@ -244,9 +190,9 @@ __ftrace_make_nop(struct module *mod,
/*
* On PPC32 the trampoline looks like:
- * 0x3d, 0x60, 0x00, 0x00 lis r11,sym@ha
- * 0x39, 0x6b, 0x00, 0x00 addi r11,r11,sym@l
- * 0x7d, 0x69, 0x03, 0xa6 mtctr r11
+ * 0x3d, 0x80, 0x00, 0x00 lis r12,sym@ha
+ * 0x39, 0x8c, 0x00, 0x00 addi r12,r12,sym@l
+ * 0x7d, 0x89, 0x03, 0xa6 mtctr r12
* 0x4e, 0x80, 0x04, 0x20 bctr
*/
@@ -254,18 +200,18 @@ __ftrace_make_nop(struct module *mod,
/* Find where the trampoline jumps to */
if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
- printk(KERN_ERR "Failed to read %lx\n", tramp);
+ pr_err("Failed to read %lx\n", tramp);
return -EFAULT;
}
pr_devel(" %08x %08x ", jmp[0], jmp[1]);
/* verify that this is what we expect it to be */
- if (((jmp[0] & 0xffff0000) != 0x3d600000) ||
- ((jmp[1] & 0xffff0000) != 0x396b0000) ||
- (jmp[2] != 0x7d6903a6) ||
+ if (((jmp[0] & 0xffff0000) != 0x3d800000) ||
+ ((jmp[1] & 0xffff0000) != 0x398c0000) ||
+ (jmp[2] != 0x7d8903a6) ||
(jmp[3] != 0x4e800420)) {
- printk(KERN_ERR "Not a trampoline\n");
+ pr_err("Not a trampoline\n");
return -EINVAL;
}
@@ -277,19 +223,16 @@ __ftrace_make_nop(struct module *mod,
pr_devel(" %lx ", tramp);
if (tramp != addr) {
- printk(KERN_ERR
- "Trampoline location %08lx does not match addr\n",
+ pr_err("Trampoline location %08lx does not match addr\n",
tramp);
return -EINVAL;
}
op = PPC_INST_NOP;
- if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+ if (patch_instruction((unsigned int *)ip, op))
return -EPERM;
- flush_icache_range(ip, ip + 8);
-
return 0;
}
#endif /* PPC64 */
@@ -321,15 +264,13 @@ int ftrace_make_nop(struct module *mod,
*/
if (!rec->arch.mod) {
if (!mod) {
- printk(KERN_ERR "No module loaded addr=%lx\n",
- addr);
+ pr_err("No module loaded addr=%lx\n", addr);
return -EFAULT;
}
rec->arch.mod = mod;
} else if (mod) {
if (mod != rec->arch.mod) {
- printk(KERN_ERR
- "Record mod %p not equal to passed in mod %p\n",
+ pr_err("Record mod %p not equal to passed in mod %p\n",
rec->arch.mod, mod);
return -EINVAL;
}
@@ -350,45 +291,42 @@ static int
__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
unsigned int op[2];
- unsigned long ip = rec->ip;
+ void *ip = (void *)rec->ip;
/* read where this goes */
- if (probe_kernel_read(op, (void *)ip, MCOUNT_INSN_SIZE * 2))
+ if (probe_kernel_read(op, ip, sizeof(op)))
return -EFAULT;
/*
- * It should be pointing to two nops or
- * b +8; ld r2,40(r1)
+ * We expect to see:
+ *
+ * b +8
+ * ld r2,XX(r1)
+ *
+ * The load offset is different depending on the ABI. For simplicity
+ * just mask it out when doing the compare.
*/
- if (((op[0] != 0x48000008) || (op[1] != 0xe8410028)) &&
- ((op[0] != PPC_INST_NOP) || (op[1] != PPC_INST_NOP))) {
- printk(KERN_ERR "Expected NOPs but have %x %x\n", op[0], op[1]);
+ if ((op[0] != 0x48000008) || ((op[1] & 0xffff0000) != 0xe8410000)) {
+ pr_err("Unexpected call sequence: %x %x\n", op[0], op[1]);
return -EINVAL;
}
/* If we never set up a trampoline to ftrace_caller, then bail */
if (!rec->arch.mod->arch.tramp) {
- printk(KERN_ERR "No ftrace trampoline\n");
+ pr_err("No ftrace trampoline\n");
return -EINVAL;
}
- /* create the branch to the trampoline */
- op[0] = create_branch((unsigned int *)ip,
- rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
- if (!op[0]) {
- printk(KERN_ERR "REL24 out of range!\n");
+ /* Ensure branch is within 24 bits */
+ if (!create_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) {
+ pr_err("Branch out of range\n");
return -EINVAL;
}
- /* ld r2,40(r1) */
- op[1] = 0xe8410028;
-
- pr_devel("write to %lx\n", rec->ip);
-
- if (probe_kernel_write((void *)ip, op, MCOUNT_INSN_SIZE * 2))
- return -EPERM;
-
- flush_icache_range(ip, ip + 8);
+ if (patch_branch(ip, rec->arch.mod->arch.tramp, BRANCH_SET_LINK)) {
+ pr_err("REL24 out of range!\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -405,13 +343,13 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
/* It should be pointing to a nop */
if (op != PPC_INST_NOP) {
- printk(KERN_ERR "Expected NOP but have %x\n", op);
+ pr_err("Expected NOP but have %x\n", op);
return -EINVAL;
}
/* If we never set up a trampoline to ftrace_caller, then bail */
if (!rec->arch.mod->arch.tramp) {
- printk(KERN_ERR "No ftrace trampoline\n");
+ pr_err("No ftrace trampoline\n");
return -EINVAL;
}
@@ -419,17 +357,15 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
op = create_branch((unsigned int *)ip,
rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
if (!op) {
- printk(KERN_ERR "REL24 out of range!\n");
+ pr_err("REL24 out of range!\n");
return -EINVAL;
}
pr_devel("write to %lx\n", rec->ip);
- if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+ if (patch_instruction((unsigned int *)ip, op))
return -EPERM;
- flush_icache_range(ip, ip + 8);
-
return 0;
}
#endif /* CONFIG_PPC64 */
@@ -459,7 +395,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
* already have a module defined.
*/
if (!rec->arch.mod) {
- printk(KERN_ERR "No module loaded\n");
+ pr_err("No module loaded\n");
return -EINVAL;
}
@@ -483,13 +419,60 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
return ret;
}
-int __init ftrace_dyn_arch_init(void *data)
+static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr = (unsigned long)FTRACE_ADDR;
+ int ret;
+
+ ret = ftrace_update_record(rec, enable);
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+ case FTRACE_UPDATE_MAKE_CALL:
+ return ftrace_make_call(rec, ftrace_addr);
+ case FTRACE_UPDATE_MAKE_NOP:
+ return ftrace_make_nop(NULL, rec, ftrace_addr);
+ }
+
+ return 0;
+}
+
+void ftrace_replace_code(int enable)
{
- /* caller expects data to be zero */
- unsigned long *p = data;
+ struct ftrace_rec_iter *iter;
+ struct dyn_ftrace *rec;
+ int ret;
+
+ for (iter = ftrace_rec_iter_start(); iter;
+ iter = ftrace_rec_iter_next(iter)) {
+ rec = ftrace_rec_iter_record(iter);
+ ret = __ftrace_replace_code(rec, enable);
+ if (ret) {
+ ftrace_bug(ret, rec->ip);
+ return;
+ }
+ }
+}
- *p = 0;
+void arch_ftrace_update_code(int command)
+{
+ if (command & FTRACE_UPDATE_CALLS)
+ ftrace_replace_code(1);
+ else if (command & FTRACE_DISABLE_CALLS)
+ ftrace_replace_code(0);
+
+ if (command & FTRACE_UPDATE_TRACE_FUNC)
+ ftrace_update_ftrace_func(ftrace_trace_function);
+
+ if (command & FTRACE_START_FUNC_RET)
+ ftrace_enable_ftrace_graph_caller();
+ else if (command & FTRACE_STOP_FUNC_RET)
+ ftrace_disable_ftrace_graph_caller();
+}
+int __init ftrace_dyn_arch_init(void)
+{
return 0;
}
#endif /* CONFIG_DYNAMIC_FTRACE */
@@ -586,17 +569,23 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
return;
}
- if (ftrace_push_return_trace(old, self_addr, &trace.depth, 0) == -EBUSY) {
- *parent = old;
- return;
- }
-
trace.func = self_addr;
+ trace.depth = current->curr_ret_stack + 1;
/* Only trace if the calling function expects to */
if (!ftrace_graph_entry(&trace)) {
- current->curr_ret_stack--;
*parent = old;
+ return;
}
+
+ if (ftrace_push_return_trace(old, self_addr, &trace.depth, 0) == -EBUSY)
+ *parent = old;
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_PPC64)
+unsigned long __init arch_syscall_addr(int nr)
+{
+ return sys_call_table[nr*2];
+}
+#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 98c4b29a56f..dc0488b6f6e 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -139,8 +139,7 @@ __start:
trap
#endif /* CONFIG_PPC_PMAC */
-1: mr r31,r3 /* save parameters */
- mr r30,r4
+1: mr r31,r3 /* save device tree ptr */
li r24,0 /* cpu # */
/*
@@ -396,7 +395,7 @@ DataAccess:
bl hash_page
1: lwz r5,_DSISR(r11) /* get DSISR value */
mfspr r4,SPRN_DAR
- EXC_XFER_EE_LITE(0x300, handle_page_fault)
+ EXC_XFER_LITE(0x300, handle_page_fault)
/* Instruction access exception. */
@@ -411,7 +410,7 @@ InstructionAccess:
bl hash_page
1: mr r4,r12
mr r5,r9
- EXC_XFER_EE_LITE(0x400, handle_page_fault)
+ EXC_XFER_LITE(0x400, handle_page_fault)
/* External interrupt */
EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE)
@@ -805,19 +804,6 @@ _ENTRY(copy_and_flush)
blr
#ifdef CONFIG_SMP
-#ifdef CONFIG_GEMINI
- .globl __secondary_start_gemini
-__secondary_start_gemini:
- mfspr r4,SPRN_HID0
- ori r4,r4,HID0_ICFI
- li r3,0
- ori r3,r3,HID0_ICE
- andc r4,r4,r3
- mtspr SPRN_HID0,r4
- sync
- b __secondary_start
-#endif /* CONFIG_GEMINI */
-
.globl __secondary_start_mpc86xx
__secondary_start_mpc86xx:
mfspr r3, SPRN_PIR
@@ -977,8 +963,8 @@ start_here:
* Do early platform-specific initialization,
* and set up the MMU.
*/
- mr r3,r31
- mr r4,r30
+ li r3,0
+ mr r4,r31
bl machine_init
bl __save_cpu_setup
bl MMU_init
diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S
index 8278e8bad5a..7d7d8635227 100644
--- a/arch/powerpc/kernel/head_40x.S
+++ b/arch/powerpc/kernel/head_40x.S
@@ -40,6 +40,7 @@
#include <asm/thread_info.h>
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
/* As with the other PowerPC ports, it is expected that when code
* execution begins here, the following registers contain valid, yet
@@ -57,13 +58,7 @@
_ENTRY(_stext);
_ENTRY(_start);
- /* Save parameters we are passed.
- */
- mr r31,r3
- mr r30,r4
- mr r29,r5
- mr r28,r6
- mr r27,r7
+ mr r31,r3 /* save device tree ptr */
/* We have to turn on the MMU right away so we get cache modes
* set correctly.
@@ -399,7 +394,7 @@ label:
NORMAL_EXCEPTION_PROLOG
mr r4,r12 /* Pass SRR0 as arg2 */
li r5,0 /* Pass zero as arg3 */
- EXC_XFER_EE_LITE(0x400, handle_page_fault)
+ EXC_XFER_LITE(0x400, handle_page_fault)
/* 0x0500 - External Interrupt Exception */
EXCEPTION(0x0500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE)
@@ -435,30 +430,18 @@ label:
EXCEPTION(0x0F00, Trap_0F, unknown_exception, EXC_XFER_EE)
/* 0x1000 - Programmable Interval Timer (PIT) Exception */
- START_EXCEPTION(0x1000, Decrementer)
- NORMAL_EXCEPTION_PROLOG
- lis r0,TSR_PIS@h
- mtspr SPRN_TSR,r0 /* Clear the PIT exception */
- addi r3,r1,STACK_FRAME_OVERHEAD
- EXC_XFER_LITE(0x1000, timer_interrupt)
-
-#if 0
-/* NOTE:
- * FIT and WDT handlers are not implemented yet.
- */
+ . = 0x1000
+ b Decrementer
/* 0x1010 - Fixed Interval Timer (FIT) Exception
*/
- STND_EXCEPTION(0x1010, FITException, unknown_exception)
+ . = 0x1010
+ b FITException
/* 0x1020 - Watchdog Timer (WDT) Exception
*/
-#ifdef CONFIG_BOOKE_WDT
- CRITICAL_EXCEPTION(0x1020, WDTException, WatchdogException)
-#else
- CRITICAL_EXCEPTION(0x1020, WDTException, unknown_exception)
-#endif
-#endif
+ . = 0x1020
+ b WDTException
/* 0x1100 - Data TLB Miss Exception
* As the name implies, translation is not in the MMU, so search the
@@ -743,6 +726,29 @@ label:
(MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \
NOCOPY, crit_transfer_to_handler, ret_from_crit_exc)
+ /* Programmable Interval Timer (PIT) Exception. (from 0x1000) */
+Decrementer:
+ NORMAL_EXCEPTION_PROLOG
+ lis r0,TSR_PIS@h
+ mtspr SPRN_TSR,r0 /* Clear the PIT exception */
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ EXC_XFER_LITE(0x1000, timer_interrupt)
+
+ /* Fixed Interval Timer (FIT) Exception. (from 0x1010) */
+FITException:
+ NORMAL_EXCEPTION_PROLOG
+ addi r3,r1,STACK_FRAME_OVERHEAD;
+ EXC_XFER_EE(0x1010, unknown_exception)
+
+ /* Watchdog Timer (WDT) Exception. (from 0x1020) */
+WDTException:
+ CRITICAL_EXCEPTION_PROLOG;
+ addi r3,r1,STACK_FRAME_OVERHEAD;
+ EXC_XFER_TEMPLATE(WatchdogException, 0x1020+2,
+ (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)),
+ NOCOPY, crit_transfer_to_handler,
+ ret_from_crit_exc)
+
/*
* The other Data TLB exceptions bail out to this point
* if they can't resolve the lightweight TLB fault.
@@ -752,7 +758,7 @@ DataAccess:
mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */
stw r5,_ESR(r11)
mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */
- EXC_XFER_EE_LITE(0x300, handle_page_fault)
+ EXC_XFER_LITE(0x300, handle_page_fault)
/* Other PowerPC processors, namely those derived from the 6xx-series
* have vectors from 0x2100 through 0x2F00 defined, but marked as reserved.
@@ -765,7 +771,7 @@ DataAccess:
* miss get to this point to load the TLB.
* r10 - TLB_TAG value
* r11 - Linux PTE
- * r12, r9 - avilable to use
+ * r12, r9 - available to use
* PID - loaded with proper value when we get here
* Upon exit, we reload everything and RFI.
* Actually, it will fit now, but oh well.....a common place
@@ -816,14 +822,6 @@ finish_tlb_load:
rfi /* Should sync shadow TLBs */
b . /* prevent prefetch past rfi */
-/* extern void giveup_fpu(struct task_struct *prev)
- *
- * The PowerPC 4xx family of processors do not have an FPU, so this just
- * returns.
- */
-_ENTRY(giveup_fpu)
- blr
-
/* This is where the main kernel code starts.
*/
start_here:
@@ -848,11 +846,8 @@ start_here:
/*
* Decide what sort of machine this is and initialize the MMU.
*/
- mr r3,r31
- mr r4,r30
- mr r5,r29
- mr r6,r28
- mr r7,r27
+ li r3,0
+ mr r4,r31
bl machine_init
bl MMU_init
@@ -935,25 +930,6 @@ initial_mmu:
tlbwe r4,r0,TLB_DATA /* Load the data portion of the entry */
tlbwe r3,r0,TLB_TAG /* Load the tag portion of the entry */
-#if defined(CONFIG_SERIAL_TEXT_DEBUG) && defined(SERIAL_DEBUG_IO_BASE)
-
- /* Load a TLB entry for the UART, so that ppc4xx_progress() can use
- * the UARTs nice and early. We use a 4k real==virtual mapping. */
-
- lis r3,SERIAL_DEBUG_IO_BASE@h
- ori r3,r3,SERIAL_DEBUG_IO_BASE@l
- mr r4,r3
- clrrwi r4,r4,12
- ori r4,r4,(TLB_WR|TLB_I|TLB_M|TLB_G)
-
- clrrwi r3,r3,12
- ori r3,r3,(TLB_VALID | TLB_PAGESZ(PAGESZ_4K))
-
- li r0,0 /* TLB slot 0 */
- tlbwe r4,r0,TLB_DATA
- tlbwe r3,r0,TLB_TAG
-#endif /* CONFIG_SERIAL_DEBUG_TEXT && SERIAL_DEBUG_IO_BASE */
-
isync
/* Establish the exception vector base
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index 562305b40a8..c334f53453f 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -37,6 +37,7 @@
#include <asm/thread_info.h>
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
#include <asm/synch.h>
#include "head_booke.h"
@@ -60,15 +61,37 @@ _ENTRY(_start);
* of abatron_pteptrs
*/
nop
+ mr r31,r3 /* save device tree ptr */
+ li r24,0 /* CPU number */
+
+#ifdef CONFIG_RELOCATABLE
/*
- * Save parameters we are passed
+ * Relocate ourselves to the current runtime address.
+ * This is called only by the Boot CPU.
+ * "relocate" is called with our current runtime virutal
+ * address.
+ * r21 will be loaded with the physical runtime address of _stext
*/
- mr r31,r3
- mr r30,r4
- mr r29,r5
- mr r28,r6
- mr r27,r7
- li r24,0 /* CPU number */
+ bl 0f /* Get our runtime address */
+0: mflr r21 /* Make it accessible */
+ addis r21,r21,(_stext - 0b)@ha
+ addi r21,r21,(_stext - 0b)@l /* Get our current runtime base */
+
+ /*
+ * We have the runtime (virutal) address of our base.
+ * We calculate our shift of offset from a 256M page.
+ * We could map the 256M page we belong to at PAGE_OFFSET and
+ * get going from there.
+ */
+ lis r4,KERNELBASE@h
+ ori r4,r4,KERNELBASE@l
+ rlwinm r6,r21,0,4,31 /* r6 = PHYS_START % 256M */
+ rlwinm r5,r4,0,4,31 /* r5 = KERNELBASE % 256M */
+ subf r3,r5,r6 /* r3 = r6 - r5 */
+ add r3,r4,r3 /* Required Virutal Address */
+
+ bl relocate
+#endif
bl init_cpu_state
@@ -92,14 +115,94 @@ _ENTRY(_start);
bl early_init
+#ifdef CONFIG_RELOCATABLE
+ /*
+ * Relocatable kernel support based on processing of dynamic
+ * relocation entries.
+ *
+ * r25 will contain RPN/ERPN for the start address of memory
+ * r21 will contain the current offset of _stext
+ */
+ lis r3,kernstart_addr@ha
+ la r3,kernstart_addr@l(r3)
+
+ /*
+ * Compute the kernstart_addr.
+ * kernstart_addr => (r6,r8)
+ * kernstart_addr & ~0xfffffff => (r6,r7)
+ */
+ rlwinm r6,r25,0,28,31 /* ERPN. Bits 32-35 of Address */
+ rlwinm r7,r25,0,0,3 /* RPN - assuming 256 MB page size */
+ rlwinm r8,r21,0,4,31 /* r8 = (_stext & 0xfffffff) */
+ or r8,r7,r8 /* Compute the lower 32bit of kernstart_addr */
+
+ /* Store kernstart_addr */
+ stw r6,0(r3) /* higher 32bit */
+ stw r8,4(r3) /* lower 32bit */
+
+ /*
+ * Compute the virt_phys_offset :
+ * virt_phys_offset = stext.run - kernstart_addr
+ *
+ * stext.run = (KERNELBASE & ~0xfffffff) + (kernstart_addr & 0xfffffff)
+ * When we relocate, we have :
+ *
+ * (kernstart_addr & 0xfffffff) = (stext.run & 0xfffffff)
+ *
+ * hence:
+ * virt_phys_offset = (KERNELBASE & ~0xfffffff) - (kernstart_addr & ~0xfffffff)
+ *
+ */
+
+ /* KERNELBASE&~0xfffffff => (r4,r5) */
+ li r4, 0 /* higer 32bit */
+ lis r5,KERNELBASE@h
+ rlwinm r5,r5,0,0,3 /* Align to 256M, lower 32bit */
+
+ /*
+ * 64bit subtraction.
+ */
+ subfc r5,r7,r5
+ subfe r4,r6,r4
+
+ /* Store virt_phys_offset */
+ lis r3,virt_phys_offset@ha
+ la r3,virt_phys_offset@l(r3)
+
+ stw r4,0(r3)
+ stw r5,4(r3)
+
+#elif defined(CONFIG_DYNAMIC_MEMSTART)
+ /*
+ * Mapping based, page aligned dynamic kernel loading.
+ *
+ * r25 will contain RPN/ERPN for the start address of memory
+ *
+ * Add the difference between KERNELBASE and PAGE_OFFSET to the
+ * start of physical memory to get kernstart_addr.
+ */
+ lis r3,kernstart_addr@ha
+ la r3,kernstart_addr@l(r3)
+
+ lis r4,KERNELBASE@h
+ ori r4,r4,KERNELBASE@l
+ lis r5,PAGE_OFFSET@h
+ ori r5,r5,PAGE_OFFSET@l
+ subf r4,r5,r4
+
+ rlwinm r6,r25,0,28,31 /* ERPN */
+ rlwinm r7,r25,0,0,3 /* RPN - assuming 256 MB page size */
+ add r7,r7,r4
+
+ stw r6,0(r3)
+ stw r7,4(r3)
+#endif
+
/*
* Decide what sort of machine this is and initialize the MMU.
*/
- mr r3,r31
- mr r4,r30
- mr r5,r29
- mr r6,r28
- mr r7,r27
+ li r3,0
+ mr r4,r31
bl machine_init
bl MMU_init
@@ -145,10 +248,11 @@ _ENTRY(_start);
interrupt_base:
/* Critical Input Interrupt */
- CRITICAL_EXCEPTION(0x0100, CriticalInput, unknown_exception)
+ CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception)
/* Machine Check Interrupt */
- CRITICAL_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
+ CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \
+ machine_check_exception)
MCHECK_EXCEPTION(0x0210, MachineCheckA, machine_check_exception)
/* Data Storage Interrupt */
@@ -158,7 +262,8 @@ interrupt_base:
INSTRUCTION_STORAGE_EXCEPTION
/* External Input Interrupt */
- EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE)
+ EXCEPTION(0x0500, BOOKE_INTERRUPT_EXTERNAL, ExternalInput, \
+ do_IRQ, EXC_XFER_LITE)
/* Alignment Interrupt */
ALIGNMENT_EXCEPTION
@@ -170,29 +275,32 @@ interrupt_base:
#ifdef CONFIG_PPC_FPU
FP_UNAVAILABLE_EXCEPTION
#else
- EXCEPTION(0x2010, FloatingPointUnavailable, unknown_exception, EXC_XFER_EE)
+ EXCEPTION(0x2010, BOOKE_INTERRUPT_FP_UNAVAIL, \
+ FloatingPointUnavailable, unknown_exception, EXC_XFER_EE)
#endif
/* System Call Interrupt */
START_EXCEPTION(SystemCall)
- NORMAL_EXCEPTION_PROLOG
+ NORMAL_EXCEPTION_PROLOG(BOOKE_INTERRUPT_SYSCALL)
EXC_XFER_EE_LITE(0x0c00, DoSyscall)
- /* Auxillary Processor Unavailable Interrupt */
- EXCEPTION(0x2020, AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE)
+ /* Auxiliary Processor Unavailable Interrupt */
+ EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \
+ AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE)
/* Decrementer Interrupt */
DECREMENTER_EXCEPTION
/* Fixed Internal Timer Interrupt */
/* TODO: Add FIT support */
- EXCEPTION(0x1010, FixedIntervalTimer, unknown_exception, EXC_XFER_EE)
+ EXCEPTION(0x1010, BOOKE_INTERRUPT_FIT, FixedIntervalTimer, \
+ unknown_exception, EXC_XFER_EE)
/* Watchdog Timer Interrupt */
/* TODO: Add watchdog support */
#ifdef CONFIG_BOOKE_WDT
- CRITICAL_EXCEPTION(0x1020, WatchdogTimer, WatchdogException)
+ CRITICAL_EXCEPTION(0x1020, WATCHDOG, WatchdogTimer, WatchdogException)
#else
- CRITICAL_EXCEPTION(0x1020, WatchdogTimer, unknown_exception)
+ CRITICAL_EXCEPTION(0x1020, WATCHDOG, WatchdogTimer, unknown_exception)
#endif
/* Data TLB Error Interrupt */
@@ -661,6 +769,8 @@ finish_tlb_load_47x:
*/
DEBUG_CRIT_EXCEPTION
+interrupt_end:
+
/*
* Global functions
*/
@@ -674,24 +784,6 @@ _GLOBAL(__fixup_440A_mcheck)
sync
blr
-/*
- * extern void giveup_altivec(struct task_struct *prev)
- *
- * The 44x core does not have an AltiVec unit.
- */
-_GLOBAL(giveup_altivec)
- blr
-
-/*
- * extern void giveup_fpu(struct task_struct *prev)
- *
- * The 44x core does not have an FPU.
- */
-#ifndef CONFIG_PPC_FPU
-_GLOBAL(giveup_fpu)
- blr
-#endif
-
_GLOBAL(set_context)
#ifdef CONFIG_BDI_SWITCH
@@ -717,6 +809,8 @@ _GLOBAL(init_cpu_state)
/* We use the PVR to differenciate 44x cores from 476 */
mfspr r3,SPRN_PVR
srwi r3,r3,16
+ cmplwi cr0,r3,PVR_476FPE@h
+ beq head_start_47x
cmplwi cr0,r3,PVR_476@h
beq head_start_47x
cmplwi cr0,r3,PVR_476_ISS@h
@@ -785,12 +879,29 @@ skpinv: addi r4,r4,1 /* Increment */
/*
* Configure and load pinned entry into TLB slot 63.
*/
+#ifdef CONFIG_NONSTATIC_KERNEL
+ /*
+ * In case of a NONSTATIC_KERNEL we reuse the TLB XLAT
+ * entries of the initial mapping set by the boot loader.
+ * The XLAT entry is stored in r25
+ */
+
+ /* Read the XLAT entry for our current mapping */
+ tlbre r25,r23,PPC44x_TLB_XLAT
+
+ lis r3,KERNELBASE@h
+ ori r3,r3,KERNELBASE@l
+
+ /* Use our current RPN entry */
+ mr r4,r25
+#else
lis r3,PAGE_OFFSET@h
ori r3,r3,PAGE_OFFSET@l
/* Kernel is at the base of RAM */
li r4, 0 /* Load the kernel physical address */
+#endif
/* Load the kernel PID = 0 */
li r0,0
@@ -1000,9 +1111,6 @@ clear_utlb_entry:
lis r3,PAGE_OFFSET@h
ori r3,r3,PAGE_OFFSET@l
- /* Kernel is at the base of RAM */
- li r4, 0 /* Load the kernel physical address */
-
/* Load the kernel PID = 0 */
li r0,0
mtspr SPRN_PID,r0
@@ -1012,9 +1120,8 @@ clear_utlb_entry:
clrrwi r3,r3,12 /* Mask off the effective page number */
ori r3,r3,PPC47x_TLB0_VALID | PPC47x_TLB0_256M
- /* Word 1 */
- clrrwi r4,r4,12 /* Mask off the real page number */
- /* ERPN is 0 for first 4GB page */
+ /* Word 1 - use r25. RPN is the same as the original entry */
+
/* Word 2 */
li r5,0
ori r5,r5,PPC47x_TLB2_S_RWX
@@ -1025,7 +1132,7 @@ clear_utlb_entry:
/* We write to way 0 and bolted 0 */
lis r0,0x8800
tlbwe r3,r0,0
- tlbwe r4,r0,1
+ tlbwe r25,r0,1
tlbwe r5,r0,2
/*
@@ -1123,7 +1230,13 @@ head_start_common:
lis r4,interrupt_base@h /* IVPR only uses the high 16-bits */
mtspr SPRN_IVPR,r4
- addis r22,r22,KERNELBASE@h
+ /*
+ * If the kernel was loaded at a non-zero 256 MB page, we need to
+ * mask off the most significant 4 bits to get the relative address
+ * from the start of physical memory
+ */
+ rlwinm r22,r22,0,4,31
+ addis r22,r22,PAGE_OFFSET@h
mtlr r22
isync
blr
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index f0dd577e4a5..a95145d7f61 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -23,6 +23,7 @@
*/
#include <linux/threads.h>
+#include <linux/init.h>
#include <asm/reg.h>
#include <asm/page.h>
#include <asm/mmu.h>
@@ -32,14 +33,15 @@
#include <asm/cputable.h>
#include <asm/setup.h>
#include <asm/hvcall.h>
-#include <asm/iseries/lpar_map.h>
#include <asm/thread_info.h>
#include <asm/firmware.h>
#include <asm/page_64.h>
#include <asm/irqflags.h>
#include <asm/kvm_book3s_asm.h>
+#include <asm/ptrace.h>
+#include <asm/hw_irq.h>
-/* The physical memory is layed out such that the secondary processor
+/* The physical memory is laid out such that the secondary processor
* spin code sits at 0x0000...0x00ff. On server, the vectors follow
* using the layout described in exceptions-64s.S
*/
@@ -50,10 +52,11 @@
* For pSeries or server processors:
* 1. The MMU is off & open firmware is running in real mode.
* 2. The kernel is entered at __start
- *
- * For iSeries:
- * 1. The MMU is on (as it always is for iSeries)
- * 2. The kernel is entered at system_reset_iSeries
+ * -or- For OPAL entry:
+ * 1. The MMU is off, processor in HV mode, primary CPU enters at 0
+ * with device-tree in gpr3. We also get OPAL base in r8 and
+ * entry in r9 for debugging purposes
+ * 2. Secondary processors enter at 0x60 with PIR in gpr3
*
* For Book3E processors:
* 1. The MMU is on running in AS0 in a state defined in ePAPR
@@ -66,17 +69,18 @@ _stext:
_GLOBAL(__start)
/* NOP this out unconditionally */
BEGIN_FTR_SECTION
- b .__start_initialization_multiplatform
+ FIXUP_ENDIAN
+ b __start_initialization_multiplatform
END_FTR_SECTION(0, 1)
/* Catch branch to 0 in real mode */
trap
- /* Secondary processors spin on this value until it becomes nonzero.
- * When it does it contains the real address of the descriptor
- * of the function that the cpu should jump to to continue
- * initialization.
+ /* Secondary processors spin on this value until it becomes non-zero.
+ * When non-zero, it contains the real address of the function the cpu
+ * should jump to.
*/
+ .balign 8
.globl __secondary_hold_spinloop
__secondary_hold_spinloop:
.llong 0x0
@@ -87,16 +91,7 @@ __secondary_hold_spinloop:
__secondary_hold_acknowledge:
.llong 0x0
-#ifdef CONFIG_PPC_ISERIES
- /*
- * At offset 0x20, there is a pointer to iSeries LPAR data.
- * This is required by the hypervisor
- */
- . = 0x20
- .llong hvReleaseData-KERNELBASE
-#endif /* CONFIG_PPC_ISERIES */
-
-#ifdef CONFIG_CRASH_DUMP
+#ifdef CONFIG_RELOCATABLE
/* This flag is set to 1 by a loader if the kernel should run
* at the loaded address instead of the linked address. This
* is used by kexec-tools to keep the the kdump kernel in the
@@ -122,6 +117,7 @@ __run_at_load:
*/
.globl __secondary_hold
__secondary_hold:
+ FIXUP_ENDIAN
#ifndef CONFIG_PPC_BOOK3E
mfmsr r24
ori r24,r24,MSR_RI
@@ -129,6 +125,8 @@ __secondary_hold:
#endif
/* Grab our physical cpu number */
mr r24,r3
+ /* stash r4 for book3e */
+ mr r25,r4
/* Tell the master cpu we're here */
/* Relocation is off & we are located at an address less */
@@ -136,16 +134,32 @@ __secondary_hold:
std r24,__secondary_hold_acknowledge-_stext(0)
sync
+ li r26,0
+#ifdef CONFIG_PPC_BOOK3E
+ tovirt(r26,r26)
+#endif
/* All secondary cpus wait here until told to start. */
-100: ld r4,__secondary_hold_spinloop-_stext(0)
- cmpdi 0,r4,0
+100: ld r12,__secondary_hold_spinloop-_stext(r26)
+ cmpdi 0,r12,0
beq 100b
#if defined(CONFIG_SMP) || defined(CONFIG_KEXEC)
- ld r4,0(r4) /* deref function descriptor */
- mtctr r4
+#ifdef CONFIG_PPC_BOOK3E
+ tovirt(r12,r12)
+#endif
+ mtctr r12
mr r3,r24
+ /*
+ * it may be the case that other platforms have r4 right to
+ * begin with, this gives us some safety in case it is not
+ */
+#ifdef CONFIG_PPC_BOOK3E
+ mr r4,r25
+#else
li r4,0
+#endif
+ /* Make sure that patched code is visible */
+ isync
bctr
#else
BUG_OPCODE
@@ -170,15 +184,16 @@ _GLOBAL(generic_secondary_thread_init)
mr r24,r3
/* turn on 64-bit mode */
- bl .enable_64b_mode
+ bl enable_64b_mode
/* get a valid TOC pointer, wherever we're mapped at */
- bl .relative_toc
+ bl relative_toc
+ tovirt(r2,r2)
#ifdef CONFIG_PPC_BOOK3E
/* Book3E initialization */
mr r3,r24
- bl .book3e_secondary_thread_init
+ bl book3e_secondary_thread_init
#endif
b generic_secondary_common_init
@@ -192,20 +207,22 @@ _GLOBAL(generic_secondary_thread_init)
* as SCOM before entry).
*/
_GLOBAL(generic_secondary_smp_init)
+ FIXUP_ENDIAN
mr r24,r3
mr r25,r4
/* turn on 64-bit mode */
- bl .enable_64b_mode
+ bl enable_64b_mode
/* get a valid TOC pointer, wherever we're mapped at */
- bl .relative_toc
+ bl relative_toc
+ tovirt(r2,r2)
#ifdef CONFIG_PPC_BOOK3E
/* Book3E initialization */
mr r3,r24
mr r4,r25
- bl .book3e_secondary_core_init
+ bl book3e_secondary_core_init
#endif
generic_secondary_common_init:
@@ -215,19 +232,25 @@ generic_secondary_common_init:
*/
LOAD_REG_ADDR(r13, paca) /* Load paca pointer */
ld r13,0(r13) /* Get base vaddr of paca array */
+#ifndef CONFIG_SMP
+ addi r13,r13,PACA_SIZE /* know r13 if used accidentally */
+ b kexec_wait /* wait for next kernel if !SMP */
+#else
+ LOAD_REG_ADDR(r7, nr_cpu_ids) /* Load nr_cpu_ids address */
+ lwz r7,0(r7) /* also the max paca allocated */
li r5,0 /* logical cpu id */
1: lhz r6,PACAHWCPUID(r13) /* Load HW procid from paca */
cmpw r6,r24 /* Compare to our id */
beq 2f
addi r13,r13,PACA_SIZE /* Loop to next PACA on miss */
addi r5,r5,1
- cmpwi r5,NR_CPUS
+ cmpw r5,r7 /* Check if more pacas exist */
blt 1b
mr r3,r24 /* not found, copy phys to r3 */
- b .kexec_wait /* next kernel might do better */
+ b kexec_wait /* next kernel might do better */
-2: mtspr SPRN_SPRG_PACA,r13 /* Save vaddr of paca in an SPRG */
+2: SET_PACA(r13)
#ifdef CONFIG_PPC_BOOK3E
addi r12,r13,PACA_EXTLB /* and TLB exc frame in another */
mtspr SPRN_SPRG_TLB_EXFRAME,r12
@@ -235,41 +258,48 @@ generic_secondary_common_init:
/* From now on, r24 is expected to be logical cpuid */
mr r24,r5
-3: HMT_LOW
- lbz r23,PACAPROCSTART(r13) /* Test if this processor should */
- /* start. */
-
-#ifndef CONFIG_SMP
- b 3b /* Never go on non-SMP */
-#else
- cmpwi 0,r23,0
- beq 3b /* Loop until told to go */
-
- sync /* order paca.run and cur_cpu_spec */
/* See if we need to call a cpu state restore handler */
LOAD_REG_ADDR(r23, cur_cpu_spec)
ld r23,0(r23)
- ld r23,CPU_SPEC_RESTORE(r23)
- cmpdi 0,r23,0
- beq 4f
- ld r23,0(r23)
- mtctr r23
+ ld r12,CPU_SPEC_RESTORE(r23)
+ cmpdi 0,r12,0
+ beq 3f
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
+ ld r12,0(r12)
+#endif
+ mtctr r12
bctrl
-4: /* Create a temp kernel stack for use before relocation is on. */
+3: LOAD_REG_ADDR(r3, spinning_secondaries) /* Decrement spinning_secondaries */
+ lwarx r4,0,r3
+ subi r4,r4,1
+ stwcx. r4,0,r3
+ bne 3b
+ isync
+
+4: HMT_LOW
+ lbz r23,PACAPROCSTART(r13) /* Test if this processor should */
+ /* start. */
+ cmpwi 0,r23,0
+ beq 4b /* Loop until told to go */
+
+ sync /* order paca.run and cur_cpu_spec */
+ isync /* In case code patching happened */
+
+ /* Create a temp kernel stack for use before relocation is on. */
ld r1,PACAEMERGSP(r13)
subi r1,r1,STACK_FRAME_OVERHEAD
b __secondary_start
-#endif
+#endif /* SMP */
/*
* Turn the MMU off.
* Assumes we're mapped EA == RA if the MMU is on.
*/
#ifdef CONFIG_PPC_BOOK3S
-_STATIC(__mmu_off)
+__mmu_off:
mfmsr r3
andi. r0,r3,MSR_IR|MSR_DR
beqlr
@@ -294,12 +324,12 @@ _STATIC(__mmu_off)
* DT block, r4 is a physical pointer to the kernel itself
*
*/
-_GLOBAL(__start_initialization_multiplatform)
+__start_initialization_multiplatform:
/* Make sure we are running in 64 bits mode */
- bl .enable_64b_mode
+ bl enable_64b_mode
/* Get TOC pointer (current runtime address) */
- bl .relative_toc
+ bl relative_toc
/* find out where we are now */
bcl 20,31,$+4
@@ -312,15 +342,20 @@ _GLOBAL(__start_initialization_multiplatform)
*/
cmpldi cr0,r5,0
beq 1f
- b .__boot_from_prom /* yes -> prom */
+ b __boot_from_prom /* yes -> prom */
1:
/* Save parameters */
mr r31,r3
mr r30,r4
+#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL
+ /* Save OPAL entry */
+ mr r28,r8
+ mr r29,r9
+#endif
#ifdef CONFIG_PPC_BOOK3E
- bl .start_initialization_book3e
- b .__after_prom_start
+ bl start_initialization_book3e
+ b __after_prom_start
#else
/* Setup some critical 970 SPRs before switching MMU off */
mfspr r0,SPRN_PVR
@@ -333,15 +368,15 @@ _GLOBAL(__start_initialization_multiplatform)
beq 1f
cmpwi r0,0x45 /* 970GX */
bne 2f
-1: bl .__cpu_preinit_ppc970
+1: bl __cpu_preinit_ppc970
2:
/* Switch off MMU if not already off */
- bl .__mmu_off
- b .__after_prom_start
+ bl __mmu_off
+ b __after_prom_start
#endif /* CONFIG_PPC_BOOK3E */
-_INIT_STATIC(__boot_from_prom)
+__boot_from_prom:
#ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE
/* Save parameters */
mr r31,r3
@@ -360,7 +395,7 @@ _INIT_STATIC(__boot_from_prom)
#ifdef CONFIG_RELOCATABLE
/* Relocate code for where we are now */
mr r3,r26
- bl .relocate
+ bl relocate
#endif
/* Restore parameters */
@@ -372,26 +407,24 @@ _INIT_STATIC(__boot_from_prom)
/* Do all of the interaction with OF client interface */
mr r8,r26
- bl .prom_init
+ bl prom_init
#endif /* #CONFIG_PPC_OF_BOOT_TRAMPOLINE */
/* We never return. We also hit that trap if trying to boot
* from OF while CONFIG_PPC_OF_BOOT_TRAMPOLINE isn't selected */
trap
-_STATIC(__after_prom_start)
+__after_prom_start:
#ifdef CONFIG_RELOCATABLE
/* process relocations for the final address of the kernel */
lis r25,PAGE_OFFSET@highest /* compute virtual base of kernel */
sldi r25,r25,32
-#ifdef CONFIG_CRASH_DUMP
lwz r7,__run_at_load-_stext(r26)
- cmplwi cr0,r7,1 /* kdump kernel ? - stay where we are */
+ cmplwi cr0,r7,1 /* flagged to stay where we are ? */
bne 1f
add r25,r25,r26
-#endif
1: mr r3,r25
- bl .relocate
+ bl relocate
#endif
/*
@@ -413,7 +446,7 @@ _STATIC(__after_prom_start)
tovirt(r6,r6) /* on booke, we already run at PAGE_OFFSET */
#endif
-#ifdef CONFIG_CRASH_DUMP
+#ifdef CONFIG_RELOCATABLE
/*
* Check if the kernel has to be running as relocatable kernel based on the
* variable __run_at_load, if it is set the kernel is treated as relocatable
@@ -423,29 +456,31 @@ _STATIC(__after_prom_start)
cmplwi cr0,r7,1
bne 3f
- li r5,__end_interrupts - _stext /* just copy interrupts */
+ /* just copy interrupts */
+ LOAD_REG_IMMEDIATE(r5, __end_interrupts - _stext)
b 5f
3:
#endif
lis r5,(copy_to_here - _stext)@ha
addi r5,r5,(copy_to_here - _stext)@l /* # bytes of memory to copy */
- bl .copy_and_flush /* copy the first n bytes */
+ bl copy_and_flush /* copy the first n bytes */
/* this includes the code being */
/* executed here. */
addis r8,r3,(4f - _stext)@ha /* Jump to the copy of this code */
- addi r8,r8,(4f - _stext)@l /* that we just made */
- mtctr r8
+ addi r12,r8,(4f - _stext)@l /* that we just made */
+ mtctr r12
bctr
+.balign 8
p_end: .llong _end - _stext
4: /* Now copy the rest of the kernel up to _end */
addis r5,r26,(p_end - _stext)@ha
ld r5,(p_end - _stext)@l(r5) /* get _end */
-5: bl .copy_and_flush /* copy the rest */
+5: bl copy_and_flush /* copy the rest */
-9: b .start_here_multiplatform
+9: b start_here_multiplatform
/*
* Copy routine used to copy the kernel to start at physical address 0
@@ -480,6 +515,7 @@ _GLOBAL(copy_and_flush)
sync
addi r5,r5,8
addi r6,r6,8
+ isync
blr
.align 8
@@ -508,7 +544,7 @@ __secondary_start_pmac_0:
_GLOBAL(pmac_secondary_start)
/* turn on 64-bit mode */
- bl .enable_64b_mode
+ bl enable_64b_mode
li r0,0
mfspr r3,SPRN_HID4
@@ -520,10 +556,11 @@ _GLOBAL(pmac_secondary_start)
slbia
/* get TOC pointer (real address) */
- bl .relative_toc
+ bl relative_toc
+ tovirt(r2,r2)
/* Copy some CPU settings from CPU 0 */
- bl .__restore_cpu_ppc970
+ bl __restore_cpu_ppc970
/* pSeries do that early though I don't think we really need it */
mfmsr r3
@@ -535,7 +572,15 @@ _GLOBAL(pmac_secondary_start)
ld r4,0(r4) /* Get base vaddr of paca array */
mulli r13,r24,PACA_SIZE /* Calculate vaddr of right paca */
add r13,r13,r4 /* for this processor. */
- mtspr SPRN_SPRG_PACA,r13 /* Save vaddr of paca in an SPRG*/
+ SET_PACA(r13) /* Save vaddr of paca in an SPRG*/
+
+ /* Mark interrupts soft and hard disabled (they might be enabled
+ * in the PACA when doing hotplug)
+ */
+ li r0,0
+ stb r0,PACASOFTIRQEN(r13)
+ li r0,PACA_IRQ_HARD_DIS
+ stb r0,PACAIRQHAPPENED(r13)
/* Create a temp kernel stack for use before relocation is on. */
ld r1,PACAEMERGSP(r13)
@@ -553,7 +598,7 @@ _GLOBAL(pmac_secondary_start)
* 1. Processor number
* 2. Segment table pointer (virtual address)
* On entry the following are set:
- * r1 = stack pointer. vaddr for iSeries, raddr (temp stack) for pSeries
+ * r1 = stack pointer (real addr of temp stack)
* r24 = cpu# (in Linux terms)
* r13 = paca virtual address
* SPRG_PACA = paca virtual address
@@ -566,7 +611,7 @@ __secondary_start:
/* Set thread priority to MEDIUM */
HMT_MEDIUM
- /* Initialize the kernel stack. Just a repeat for iSeries. */
+ /* Initialize the kernel stack */
LOAD_REG_ADDR(r3, current_set)
sldi r28,r24,3 /* get current_set[cpu#] */
ldx r14,r3,r28
@@ -574,7 +619,7 @@ __secondary_start:
std r14,PACAKSAVE(r13)
/* Do early setup for that CPU (stab, slb, hash table pointer) */
- bl .early_setup_secondary
+ bl early_setup_secondary
/*
* setup the new stack pointer, but *don't* use this until
@@ -586,20 +631,16 @@ __secondary_start:
li r7,0
mtlr r7
+ /* Mark interrupts soft and hard disabled (they might be enabled
+ * in the PACA when doing hotplug)
+ */
+ stb r7,PACASOFTIRQEN(r13)
+ li r0,PACA_IRQ_HARD_DIS
+ stb r0,PACAIRQHAPPENED(r13)
+
/* enable MMU and jump to start_secondary */
- LOAD_REG_ADDR(r3, .start_secondary_prolog)
+ LOAD_REG_ADDR(r3, start_secondary_prolog)
LOAD_REG_IMMEDIATE(r4, MSR_KERNEL)
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
- ori r4,r4,MSR_EE
- li r8,1
- stb r8,PACAHARDIRQEN(r13)
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif
-BEGIN_FW_FTR_SECTION
- stb r7,PACAHARDIRQEN(r13)
-END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISERIES)
- stb r7,PACASOFTIRQEN(r13)
mtspr SPRN_SRR0,r3
mtspr SPRN_SRR1,r4
@@ -611,11 +652,11 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISERIES)
* zero the stack back-chain pointer and get the TOC virtual address
* before going into C code.
*/
-_GLOBAL(start_secondary_prolog)
+start_secondary_prolog:
ld r2,PACATOC(r13)
li r3,0
std r3,0(r1) /* Zero the stack frame pointer */
- bl .start_secondary
+ bl start_secondary
b .
/*
* Reset stack pointer and call start_secondary
@@ -626,20 +667,20 @@ _GLOBAL(start_secondary_resume)
ld r1,PACAKSAVE(r13) /* Reload kernel stack pointer */
li r3,0
std r3,0(r1) /* Zero the stack frame pointer */
- bl .start_secondary
+ bl start_secondary
b .
#endif
/*
* This subroutine clobbers r11 and r12
*/
-_GLOBAL(enable_64b_mode)
+enable_64b_mode:
mfmsr r11 /* grab the current MSR */
#ifdef CONFIG_PPC_BOOK3E
oris r11,r11,0x8000 /* CM bit set, we'll set ICM later */
mtmsr r11
#else /* CONFIG_PPC_BOOK3E */
- li r12,(MSR_SF | MSR_ISF)@highest
+ li r12,(MSR_64BIT | MSR_ISF)@highest
sldi r12,r12,48
or r11,r11,r12
mtmsrd r11
@@ -651,24 +692,33 @@ _GLOBAL(enable_64b_mode)
* This puts the TOC pointer into r2, offset by 0x8000 (as expected
* by the toolchain). It computes the correct value for wherever we
* are running at the moment, using position-independent code.
+ *
+ * Note: The compiler constructs pointers using offsets from the
+ * TOC in -mcmodel=medium mode. After we relocate to 0 but before
+ * the MMU is on we need our TOC to be a virtual address otherwise
+ * these pointers will be real addresses which may get stored and
+ * accessed later with the MMU on. We use tovirt() at the call
+ * sites to handle this.
*/
_GLOBAL(relative_toc)
mflr r0
bcl 20,31,$+4
-0: mflr r9
- ld r2,(p_toc - 0b)(r9)
- add r2,r2,r9
+0: mflr r11
+ ld r2,(p_toc - 0b)(r11)
+ add r2,r2,r11
mtlr r0
blr
+.balign 8
p_toc: .llong __toc_start + 0x8000 - 0b
/*
* This is where the main kernel code starts.
*/
-_INIT_STATIC(start_here_multiplatform)
- /* set up the TOC (real address) */
- bl .relative_toc
+start_here_multiplatform:
+ /* set up the TOC */
+ bl relative_toc
+ tovirt(r2,r2)
/* Clear out the BSS. It may have been done in prom_init,
* already but that's irrelevant since prom_init will soon
@@ -688,6 +738,13 @@ _INIT_STATIC(start_here_multiplatform)
bdnz 3b
4:
+#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL
+ /* Setup OPAL entry */
+ LOAD_REG_ADDR(r11, opal)
+ std r28,0(r11);
+ std r29,8(r11);
+#endif
+
#ifndef CONFIG_PPC_BOOK3E
mfmsr r6
ori r6,r6,MSR_RI
@@ -719,9 +776,9 @@ _INIT_STATIC(start_here_multiplatform)
/* Restore parameters passed from prom_init/kexec */
mr r3,r31
- bl .early_setup /* also sets r13 and SPRG_PACA */
+ bl early_setup /* also sets r13 and SPRG_PACA */
- LOAD_REG_ADDR(r3, .start_here_common)
+ LOAD_REG_ADDR(r3, start_here_common)
ld r4,PACAKMSR(r13)
mtspr SPRN_SRR0,r3
mtspr SPRN_SRR1,r4
@@ -729,30 +786,27 @@ _INIT_STATIC(start_here_multiplatform)
b . /* prevent speculative execution */
/* This is where all platforms converge execution */
-_INIT_GLOBAL(start_here_common)
+
+start_here_common:
/* relocation is on at this point */
std r1,PACAKSAVE(r13)
/* Load the TOC (virtual address) */
ld r2,PACATOC(r13)
- bl .setup_system
-
- /* Load up the kernel context */
-5:
- li r5,0
- stb r5,PACASOFTIRQEN(r13) /* Soft Disabled */
-#ifdef CONFIG_PPC_ISERIES
-BEGIN_FW_FTR_SECTION
- mfmsr r5
- ori r5,r5,MSR_EE /* Hard Enabled on iSeries*/
- mtmsrd r5
- li r5,1
-END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
-#endif
- stb r5,PACAHARDIRQEN(r13) /* Hard Disabled on others */
+ /* Do more system initializations in virtual mode */
+ bl setup_system
+
+ /* Mark interrupts soft and hard disabled (they might be enabled
+ * in the PACA when doing hotplug)
+ */
+ li r0,0
+ stb r0,PACASOFTIRQEN(r13)
+ li r0,PACA_IRQ_HARD_DIS
+ stb r0,PACAIRQHAPPENED(r13)
- bl .start_kernel
+ /* Generic kernel entry */
+ bl start_kernel
/* Not reached */
BUG_OPCODE
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 1f1a04b5c2a..7ee876d2adb 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -29,6 +29,7 @@
#include <asm/thread_info.h>
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
/* Macro to make the code more readable. */
#ifdef CONFIG_8xx_CPU6
@@ -75,11 +76,7 @@ _ENTRY(_start);
*/
.globl __start
__start:
- mr r31,r3 /* save parameters */
- mr r30,r4
- mr r29,r5
- mr r28,r6
- mr r27,r7
+ mr r31,r3 /* save device tree ptr */
/* We have to turn on the MMU right away so we get cache modes
* set correctly.
@@ -223,7 +220,7 @@ DataAccess:
mfspr r4,SPRN_DAR
li r10,0x00f0
mtspr SPRN_DAR,r10 /* Tag DAR, to be used in DTLB Error */
- EXC_XFER_EE_LITE(0x300, handle_page_fault)
+ EXC_XFER_LITE(0x300, handle_page_fault)
/* Instruction access exception.
* This is "never generated" by the MPC8xx. We jump to it for other
@@ -234,7 +231,7 @@ InstructionAccess:
EXCEPTION_PROLOG
mr r4,r12
mr r5,r9
- EXC_XFER_EE_LITE(0x400, handle_page_fault)
+ EXC_XFER_LITE(0x400, handle_page_fault)
/* External interrupt */
EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE)
@@ -694,10 +691,6 @@ modified_instr:
b 151b
#endif
- .globl giveup_fpu
-giveup_fpu:
- blr
-
/*
* This is where the main kernel code starts.
*/
@@ -722,11 +715,8 @@ start_here:
/*
* Decide what sort of machine this is and initialize the MMU.
*/
- mr r3,r31
- mr r4,r30
- mr r5,r29
- mr r6,r28
- mr r7,r27
+ li r3,0
+ mr r4,r31
bl machine_init
bl MMU_init
@@ -868,6 +858,9 @@ initial_mmu:
addis r11, r11, 0x0080 /* Add 8M */
mtspr SPRN_MD_RPN, r11
+ addi r10, r10, 0x0100
+ mtspr SPRN_MD_CTR, r10
+
addis r8, r8, 0x0080 /* Add 8M */
mtspr SPRN_MD_EPN, r8
mtspr SPRN_MD_TWC, r9
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index a0bf158c8b4..a620203f7de 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -2,6 +2,9 @@
#define __HEAD_BOOKE_H__
#include <asm/ptrace.h> /* for STACK_FRAME_REGS_MARKER */
+#include <asm/kvm_asm.h>
+#include <asm/kvm_booke_hv_asm.h>
+
/*
* Macros used for common Book-e exception handling
*/
@@ -20,33 +23,44 @@
addi reg,reg,val@l
#endif
-#define NORMAL_EXCEPTION_PROLOG \
- mtspr SPRN_SPRG_WSCRATCH0,r10;/* save two registers to work with */\
- mtspr SPRN_SPRG_WSCRATCH1,r11; \
- mtspr SPRN_SPRG_WSCRATCH2,r1; \
- mfcr r10; /* save CR in r10 for now */\
- mfspr r11,SPRN_SRR1; /* check whether user or kernel */\
- andi. r11,r11,MSR_PR; \
+/*
+ * Macro used to get to thread save registers.
+ * Note that entries 0-3 are used for the prolog code, and the remaining
+ * entries are available for specific exception use in the event a handler
+ * requires more than 4 scratch registers.
+ */
+#define THREAD_NORMSAVE(offset) (THREAD_NORMSAVES + (offset * 4))
+
+#define NORMAL_EXCEPTION_PROLOG(intno) \
+ mtspr SPRN_SPRG_WSCRATCH0, r10; /* save one register */ \
+ mfspr r10, SPRN_SPRG_THREAD; \
+ stw r11, THREAD_NORMSAVE(0)(r10); \
+ stw r13, THREAD_NORMSAVE(2)(r10); \
+ mfcr r13; /* save CR in r13 for now */\
+ mfspr r11, SPRN_SRR1; \
+ DO_KVM BOOKE_INTERRUPT_##intno SPRN_SRR1; \
+ andi. r11, r11, MSR_PR; /* check whether user or kernel */\
+ mr r11, r1; \
beq 1f; \
- mfspr r1,SPRN_SPRG_THREAD; /* if from user, start at top of */\
- lwz r1,THREAD_INFO-THREAD(r1); /* this thread's kernel stack */\
- ALLOC_STACK_FRAME(r1, THREAD_SIZE); \
-1: subi r1,r1,INT_FRAME_SIZE; /* Allocate an exception frame */\
- mr r11,r1; \
- stw r10,_CCR(r11); /* save various registers */\
+ /* if from user, start at top of this thread's kernel stack */ \
+ lwz r11, THREAD_INFO-THREAD(r10); \
+ ALLOC_STACK_FRAME(r11, THREAD_SIZE); \
+1 : subi r11, r11, INT_FRAME_SIZE; /* Allocate exception frame */ \
+ stw r13, _CCR(r11); /* save various registers */ \
stw r12,GPR12(r11); \
stw r9,GPR9(r11); \
- mfspr r10,SPRN_SPRG_RSCRATCH0; \
- stw r10,GPR10(r11); \
- mfspr r12,SPRN_SPRG_RSCRATCH1; \
+ mfspr r13, SPRN_SPRG_RSCRATCH0; \
+ stw r13, GPR10(r11); \
+ lwz r12, THREAD_NORMSAVE(0)(r10); \
stw r12,GPR11(r11); \
+ lwz r13, THREAD_NORMSAVE(2)(r10); /* restore r13 */ \
mflr r10; \
stw r10,_LINK(r11); \
- mfspr r10,SPRN_SPRG_RSCRATCH2; \
mfspr r12,SPRN_SRR0; \
- stw r10,GPR1(r11); \
+ stw r1, GPR1(r11); \
mfspr r9,SPRN_SRR1; \
- stw r10,0(r11); \
+ stw r1, 0(r11); \
+ mr r1, r11; \
rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\
stw r0,GPR0(r11); \
lis r10, STACK_FRAME_REGS_MARKER@ha;/* exception frame marker */ \
@@ -103,7 +117,7 @@
* registers as the normal prolog above. Instead we use a portion of the
* critical/machine check exception stack at low physical addresses.
*/
-#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, exc_level_srr0, exc_level_srr1) \
+#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, intno, exc_level_srr0, exc_level_srr1) \
mtspr SPRN_SPRG_WSCRATCH_##exc_level,r8; \
BOOKE_LOAD_EXC_LEVEL_STACK(exc_level);/* r8 points to the exc_level stack*/ \
stw r9,GPR9(r8); /* save various registers */\
@@ -111,8 +125,9 @@
stw r10,GPR10(r8); \
stw r11,GPR11(r8); \
stw r9,_CCR(r8); /* save CR on stack */\
- mfspr r10,exc_level_srr1; /* check whether user or kernel */\
- andi. r10,r10,MSR_PR; \
+ mfspr r11,exc_level_srr1; /* check whether user or kernel */\
+ DO_KVM BOOKE_INTERRUPT_##intno exc_level_srr1; \
+ andi. r11,r11,MSR_PR; \
mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\
lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\
addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\
@@ -152,12 +167,30 @@
SAVE_4GPRS(3, r11); \
SAVE_2GPRS(7, r11)
-#define CRITICAL_EXCEPTION_PROLOG \
- EXC_LEVEL_EXCEPTION_PROLOG(CRIT, SPRN_CSRR0, SPRN_CSRR1)
+#define CRITICAL_EXCEPTION_PROLOG(intno) \
+ EXC_LEVEL_EXCEPTION_PROLOG(CRIT, intno, SPRN_CSRR0, SPRN_CSRR1)
#define DEBUG_EXCEPTION_PROLOG \
- EXC_LEVEL_EXCEPTION_PROLOG(DBG, SPRN_DSRR0, SPRN_DSRR1)
+ EXC_LEVEL_EXCEPTION_PROLOG(DBG, DEBUG, SPRN_DSRR0, SPRN_DSRR1)
#define MCHECK_EXCEPTION_PROLOG \
- EXC_LEVEL_EXCEPTION_PROLOG(MC, SPRN_MCSRR0, SPRN_MCSRR1)
+ EXC_LEVEL_EXCEPTION_PROLOG(MC, MACHINE_CHECK, \
+ SPRN_MCSRR0, SPRN_MCSRR1)
+
+/*
+ * Guest Doorbell -- this is a bit odd in that uses GSRR0/1 despite
+ * being delivered to the host. This exception can only happen
+ * inside a KVM guest -- so we just handle up to the DO_KVM rather
+ * than try to fit this into one of the existing prolog macros.
+ */
+#define GUEST_DOORBELL_EXCEPTION \
+ START_EXCEPTION(GuestDoorbell); \
+ mtspr SPRN_SPRG_WSCRATCH0, r10; /* save one register */ \
+ mfspr r10, SPRN_SPRG_THREAD; \
+ stw r11, THREAD_NORMSAVE(0)(r10); \
+ mfspr r11, SPRN_SRR1; \
+ stw r13, THREAD_NORMSAVE(2)(r10); \
+ mfcr r13; /* save CR in r13 for now */\
+ DO_KVM BOOKE_INTERRUPT_GUEST_DBELL SPRN_GSRR1; \
+ trap
/*
* Exception vectors.
@@ -166,21 +199,16 @@
.align 5; \
label:
-#define FINISH_EXCEPTION(func) \
- bl transfer_to_handler_full; \
- .long func; \
- .long ret_from_except_full
-
-#define EXCEPTION(n, label, hdlr, xfer) \
+#define EXCEPTION(n, intno, label, hdlr, xfer) \
START_EXCEPTION(label); \
- NORMAL_EXCEPTION_PROLOG; \
+ NORMAL_EXCEPTION_PROLOG(intno); \
addi r3,r1,STACK_FRAME_OVERHEAD; \
xfer(n, hdlr)
-#define CRITICAL_EXCEPTION(n, label, hdlr) \
- START_EXCEPTION(label); \
- CRITICAL_EXCEPTION_PROLOG; \
- addi r3,r1,STACK_FRAME_OVERHEAD; \
+#define CRITICAL_EXCEPTION(n, intno, label, hdlr) \
+ START_EXCEPTION(label); \
+ CRITICAL_EXCEPTION_PROLOG(intno); \
+ addi r3,r1,STACK_FRAME_OVERHEAD; \
EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \
NOCOPY, crit_transfer_to_handler, \
ret_from_crit_exc)
@@ -253,13 +281,13 @@ label:
andis. r10,r10,(DBSR_IC|DBSR_BT)@h; \
beq+ 2f; \
\
- lis r10,KERNELBASE@h; /* check if exception in vectors */ \
- ori r10,r10,KERNELBASE@l; \
+ lis r10,interrupt_base@h; /* check if exception in vectors */ \
+ ori r10,r10,interrupt_base@l; \
cmplw r12,r10; \
blt+ 2f; /* addr below exception vectors */ \
\
- lis r10,DebugDebug@h; \
- ori r10,r10,DebugDebug@l; \
+ lis r10,interrupt_end@h; \
+ ori r10,r10,interrupt_end@l; \
cmplw r12,r10; \
bgt+ 2f; /* addr above exception vectors */ \
\
@@ -292,7 +320,7 @@ label:
#define DEBUG_CRIT_EXCEPTION \
START_EXCEPTION(DebugCrit); \
- CRITICAL_EXCEPTION_PROLOG; \
+ CRITICAL_EXCEPTION_PROLOG(DEBUG); \
\
/* \
* If there is a single step or branch-taken exception in an \
@@ -306,13 +334,13 @@ label:
andis. r10,r10,(DBSR_IC|DBSR_BT)@h; \
beq+ 2f; \
\
- lis r10,KERNELBASE@h; /* check if exception in vectors */ \
- ori r10,r10,KERNELBASE@l; \
+ lis r10,interrupt_base@h; /* check if exception in vectors */ \
+ ori r10,r10,interrupt_base@l; \
cmplw r12,r10; \
blt+ 2f; /* addr below exception vectors */ \
\
- lis r10,DebugCrit@h; \
- ori r10,r10,DebugCrit@l; \
+ lis r10,interrupt_end@h; \
+ ori r10,r10,interrupt_end@l; \
cmplw r12,r10; \
bgt+ 2f; /* addr above exception vectors */ \
\
@@ -345,24 +373,24 @@ label:
#define DATA_STORAGE_EXCEPTION \
START_EXCEPTION(DataStorage) \
- NORMAL_EXCEPTION_PROLOG; \
+ NORMAL_EXCEPTION_PROLOG(DATA_STORAGE); \
mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \
stw r5,_ESR(r11); \
mfspr r4,SPRN_DEAR; /* Grab the DEAR */ \
- EXC_XFER_EE_LITE(0x0300, handle_page_fault)
+ EXC_XFER_LITE(0x0300, handle_page_fault)
#define INSTRUCTION_STORAGE_EXCEPTION \
START_EXCEPTION(InstructionStorage) \
- NORMAL_EXCEPTION_PROLOG; \
+ NORMAL_EXCEPTION_PROLOG(INST_STORAGE); \
mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \
stw r5,_ESR(r11); \
mr r4,r12; /* Pass SRR0 as arg2 */ \
li r5,0; /* Pass zero as arg3 */ \
- EXC_XFER_EE_LITE(0x0400, handle_page_fault)
+ EXC_XFER_LITE(0x0400, handle_page_fault)
#define ALIGNMENT_EXCEPTION \
START_EXCEPTION(Alignment) \
- NORMAL_EXCEPTION_PROLOG; \
+ NORMAL_EXCEPTION_PROLOG(ALIGNMENT); \
mfspr r4,SPRN_DEAR; /* Grab the DEAR and save it */ \
stw r4,_DEAR(r11); \
addi r3,r1,STACK_FRAME_OVERHEAD; \
@@ -370,7 +398,7 @@ label:
#define PROGRAM_EXCEPTION \
START_EXCEPTION(Program) \
- NORMAL_EXCEPTION_PROLOG; \
+ NORMAL_EXCEPTION_PROLOG(PROGRAM); \
mfspr r4,SPRN_ESR; /* Grab the ESR and save it */ \
stw r4,_ESR(r11); \
addi r3,r1,STACK_FRAME_OVERHEAD; \
@@ -378,7 +406,7 @@ label:
#define DECREMENTER_EXCEPTION \
START_EXCEPTION(Decrementer) \
- NORMAL_EXCEPTION_PROLOG; \
+ NORMAL_EXCEPTION_PROLOG(DECREMENTER); \
lis r0,TSR_DIS@h; /* Setup the DEC interrupt mask */ \
mtspr SPRN_TSR,r0; /* Clear the DEC interrupt */ \
addi r3,r1,STACK_FRAME_OVERHEAD; \
@@ -386,7 +414,7 @@ label:
#define FP_UNAVAILABLE_EXCEPTION \
START_EXCEPTION(FloatingPointUnavailable) \
- NORMAL_EXCEPTION_PROLOG; \
+ NORMAL_EXCEPTION_PROLOG(FP_UNAVAIL); \
beq 1f; \
bl load_up_fpu; /* if from user, just load it up */ \
b fast_exception_return; \
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 529b817f473..b497188a94a 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -41,6 +41,7 @@
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/cache.h>
+#include <asm/ptrace.h>
#include "head_booke.h"
/* As with the other PowerPC ports, it is expected that when code
@@ -62,17 +63,79 @@ _ENTRY(_start);
* of abatron_pteptrs
*/
nop
-/*
- * Save parameters we are passed
- */
- mr r31,r3
- mr r30,r4
- mr r29,r5
- mr r28,r6
- mr r27,r7
- li r25,0 /* phys kernel start (low) */
- li r24,0 /* CPU number */
- li r23,0 /* phys kernel start (high) */
+
+ /* Translate device tree address to physical, save in r30/r31 */
+ bl get_phys_addr
+ mr r30,r3
+ mr r31,r4
+
+ li r25,0 /* phys kernel start (low) */
+ li r24,0 /* CPU number */
+ li r23,0 /* phys kernel start (high) */
+
+#ifdef CONFIG_RELOCATABLE
+ LOAD_REG_ADDR_PIC(r3, _stext) /* Get our current runtime base */
+
+ /* Translate _stext address to physical, save in r23/r25 */
+ bl get_phys_addr
+ mr r23,r3
+ mr r25,r4
+
+ bl 0f
+0: mflr r8
+ addis r3,r8,(is_second_reloc - 0b)@ha
+ lwz r19,(is_second_reloc - 0b)@l(r3)
+
+ /* Check if this is the second relocation. */
+ cmpwi r19,1
+ bne 1f
+
+ /*
+ * For the second relocation, we already get the real memstart_addr
+ * from device tree. So we will map PAGE_OFFSET to memstart_addr,
+ * then the virtual address of start kernel should be:
+ * PAGE_OFFSET + (kernstart_addr - memstart_addr)
+ * Since the offset between kernstart_addr and memstart_addr should
+ * never be beyond 1G, so we can just use the lower 32bit of them
+ * for the calculation.
+ */
+ lis r3,PAGE_OFFSET@h
+
+ addis r4,r8,(kernstart_addr - 0b)@ha
+ addi r4,r4,(kernstart_addr - 0b)@l
+ lwz r5,4(r4)
+
+ addis r6,r8,(memstart_addr - 0b)@ha
+ addi r6,r6,(memstart_addr - 0b)@l
+ lwz r7,4(r6)
+
+ subf r5,r7,r5
+ add r3,r3,r5
+ b 2f
+
+1:
+ /*
+ * We have the runtime (virutal) address of our base.
+ * We calculate our shift of offset from a 64M page.
+ * We could map the 64M page we belong to at PAGE_OFFSET and
+ * get going from there.
+ */
+ lis r4,KERNELBASE@h
+ ori r4,r4,KERNELBASE@l
+ rlwinm r6,r25,0,0x3ffffff /* r6 = PHYS_START % 64M */
+ rlwinm r5,r4,0,0x3ffffff /* r5 = KERNELBASE % 64M */
+ subf r3,r5,r6 /* r3 = r6 - r5 */
+ add r3,r4,r3 /* Required Virtual Address */
+
+2: bl relocate
+
+ /*
+ * For the second relocation, we already set the right tlb entries
+ * for the kernel space, so skip the code in fsl_booke_entry_mapping.S
+ */
+ cmpwi r19,1
+ beq set_ivor
+#endif
/* We try to not make any assumptions about how the boot loader
* setup or used the TLBs. We invalidate all mappings from the
@@ -99,6 +162,7 @@ _ENTRY(__early_start)
#include "fsl_booke_entry_mapping.S"
#undef ENTRY_MAPPING_BOOT_SETUP
+set_ivor:
/* Establish the interrupt vector offsets */
SET_IVOR(0, CriticalInput);
SET_IVOR(1, MachineCheck);
@@ -152,8 +216,7 @@ _ENTRY(__early_start)
/* Check to see if we're the second processor, and jump
* to the secondary_start code if so
*/
- lis r24, boot_cpuid@h
- ori r24, r24, boot_cpuid@l
+ LOAD_REG_ADDR_PIC(r24, boot_cpuid)
lwz r24, 0(r24)
cmpwi r24, -1
mfspr r24,SPRN_PIR
@@ -178,12 +241,24 @@ _ENTRY(__early_start)
li r0,0
stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1)
- rlwinm r22,r1,0,0,31-THREAD_SHIFT /* current thread_info */
+ CURRENT_THREAD_INFO(r22, r1)
stw r24, TI_CPU(r22)
bl early_init
#ifdef CONFIG_RELOCATABLE
+ mr r3,r30
+ mr r4,r31
+#ifdef CONFIG_PHYS_64BIT
+ mr r5,r23
+ mr r6,r25
+#else
+ mr r5,r25
+#endif
+ bl relocate_init
+#endif
+
+#ifdef CONFIG_DYNAMIC_MEMSTART
lis r3,kernstart_addr@ha
la r3,kernstart_addr@l(r3)
#ifdef CONFIG_PHYS_64BIT
@@ -197,11 +272,8 @@ _ENTRY(__early_start)
/*
* Decide what sort of machine this is and initialize the MMU.
*/
- mr r3,r31
- mr r4,r30
- mr r5,r29
- mr r6,r28
- mr r7,r27
+ mr r3,r30
+ mr r4,r31
bl machine_init
bl MMU_init
@@ -235,8 +307,24 @@ _ENTRY(__early_start)
* if we find the pte (fall through):
* r11 is low pte word
* r12 is pointer to the pte
+ * r10 is the pshift from the PGD, if we're a hugepage
*/
#ifdef CONFIG_PTE_64BIT
+#ifdef CONFIG_HUGETLB_PAGE
+#define FIND_PTE \
+ rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \
+ lwzx r11, r12, r11; /* Get pgd/pmd entry */ \
+ rlwinm. r12, r11, 0, 0, 20; /* Extract pt base address */ \
+ blt 1000f; /* Normal non-huge page */ \
+ beq 2f; /* Bail if no table */ \
+ oris r11, r11, PD_HUGE@h; /* Put back address bit */ \
+ andi. r10, r11, HUGEPD_SHIFT_MASK@l; /* extract size field */ \
+ xor r12, r10, r11; /* drop size bits from pointer */ \
+ b 1001f; \
+1000: rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \
+ li r10, 0; /* clear r10 */ \
+1001: lwz r11, 4(r12); /* Get pte entry */
+#else
#define FIND_PTE \
rlwinm r12, r10, 13, 19, 29; /* Compute pgdir/pmd offset */ \
lwzx r11, r12, r11; /* Get pgd/pmd entry */ \
@@ -244,7 +332,8 @@ _ENTRY(__early_start)
beq 2f; /* Bail if no table */ \
rlwimi r12, r10, 23, 20, 28; /* Compute pte address */ \
lwz r11, 4(r12); /* Get pte entry */
-#else
+#endif /* HUGEPAGE */
+#else /* !PTE_64BIT */
#define FIND_PTE \
rlwimi r11, r10, 12, 20, 29; /* Create L1 (pgdir/pmd) address */ \
lwz r11, 0(r11); /* Get L1 entry */ \
@@ -273,25 +362,26 @@ _ENTRY(__early_start)
interrupt_base:
/* Critical Input Interrupt */
- CRITICAL_EXCEPTION(0x0100, CriticalInput, unknown_exception)
+ CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception)
/* Machine Check Interrupt */
#ifdef CONFIG_E200
/* no RFMCI, MCSRRs on E200 */
- CRITICAL_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
+ CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \
+ machine_check_exception)
#else
MCHECK_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
#endif
/* Data Storage Interrupt */
START_EXCEPTION(DataStorage)
- NORMAL_EXCEPTION_PROLOG
+ NORMAL_EXCEPTION_PROLOG(DATA_STORAGE)
mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */
stw r5,_ESR(r11)
mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */
andis. r10,r5,(ESR_ILK|ESR_DLK)@h
bne 1f
- EXC_XFER_EE_LITE(0x0300, handle_page_fault)
+ EXC_XFER_LITE(0x0300, handle_page_fault)
1:
addi r3,r1,STACK_FRAME_OVERHEAD
EXC_XFER_EE_LITE(0x0300, CacheLockingException)
@@ -300,7 +390,7 @@ interrupt_base:
INSTRUCTION_STORAGE_EXCEPTION
/* External Input Interrupt */
- EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE)
+ EXCEPTION(0x0500, EXTERNAL, ExternalInput, do_IRQ, EXC_XFER_LITE)
/* Alignment Interrupt */
ALIGNMENT_EXCEPTION
@@ -314,42 +404,53 @@ interrupt_base:
#else
#ifdef CONFIG_E200
/* E200 treats 'normal' floating point instructions as FP Unavail exception */
- EXCEPTION(0x0800, FloatingPointUnavailable, program_check_exception, EXC_XFER_EE)
+ EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \
+ program_check_exception, EXC_XFER_EE)
#else
- EXCEPTION(0x0800, FloatingPointUnavailable, unknown_exception, EXC_XFER_EE)
+ EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \
+ unknown_exception, EXC_XFER_EE)
#endif
#endif
/* System Call Interrupt */
START_EXCEPTION(SystemCall)
- NORMAL_EXCEPTION_PROLOG
+ NORMAL_EXCEPTION_PROLOG(SYSCALL)
EXC_XFER_EE_LITE(0x0c00, DoSyscall)
- /* Auxillary Processor Unavailable Interrupt */
- EXCEPTION(0x2900, AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE)
+ /* Auxiliary Processor Unavailable Interrupt */
+ EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \
+ unknown_exception, EXC_XFER_EE)
/* Decrementer Interrupt */
DECREMENTER_EXCEPTION
/* Fixed Internal Timer Interrupt */
/* TODO: Add FIT support */
- EXCEPTION(0x3100, FixedIntervalTimer, unknown_exception, EXC_XFER_EE)
+ EXCEPTION(0x3100, FIT, FixedIntervalTimer, \
+ unknown_exception, EXC_XFER_EE)
/* Watchdog Timer Interrupt */
#ifdef CONFIG_BOOKE_WDT
- CRITICAL_EXCEPTION(0x3200, WatchdogTimer, WatchdogException)
+ CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, WatchdogException)
#else
- CRITICAL_EXCEPTION(0x3200, WatchdogTimer, unknown_exception)
+ CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, unknown_exception)
#endif
/* Data TLB Error Interrupt */
START_EXCEPTION(DataTLBError)
mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */
- mtspr SPRN_SPRG_WSCRATCH1, r11
- mtspr SPRN_SPRG_WSCRATCH2, r12
- mtspr SPRN_SPRG_WSCRATCH3, r13
- mfcr r11
- mtspr SPRN_SPRG_WSCRATCH4, r11
+ mfspr r10, SPRN_SPRG_THREAD
+ stw r11, THREAD_NORMSAVE(0)(r10)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+ mfspr r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
+ stw r12, THREAD_NORMSAVE(1)(r10)
+ stw r13, THREAD_NORMSAVE(2)(r10)
+ mfcr r13
+ stw r13, THREAD_NORMSAVE(3)(r10)
+ DO_KVM BOOKE_INTERRUPT_DTLB_MISS SPRN_SRR1
mfspr r10, SPRN_DEAR /* Get faulting address */
/* If we are faulting a kernel address, we have to use the
@@ -400,8 +501,8 @@ interrupt_base:
#ifdef CONFIG_PTE_64BIT
#ifdef CONFIG_SMP
- subf r10,r11,r12 /* create false data dep */
- lwzx r13,r11,r10 /* Get upper pte bits */
+ subf r13,r11,r12 /* create false data dep */
+ lwzx r13,r11,r13 /* Get upper pte bits */
#else
lwz r13,0(r12) /* Get upper pte bits */
#endif
@@ -415,11 +516,12 @@ interrupt_base:
/* The bailout. Restore registers to pre-exception conditions
* and call the heavyweights to help us out.
*/
- mfspr r11, SPRN_SPRG_RSCRATCH4
+ mfspr r10, SPRN_SPRG_THREAD
+ lwz r11, THREAD_NORMSAVE(3)(r10)
mtcr r11
- mfspr r13, SPRN_SPRG_RSCRATCH3
- mfspr r12, SPRN_SPRG_RSCRATCH2
- mfspr r11, SPRN_SPRG_RSCRATCH1
+ lwz r13, THREAD_NORMSAVE(2)(r10)
+ lwz r12, THREAD_NORMSAVE(1)(r10)
+ lwz r11, THREAD_NORMSAVE(0)(r10)
mfspr r10, SPRN_SPRG_RSCRATCH0
b DataStorage
@@ -431,11 +533,18 @@ interrupt_base:
*/
START_EXCEPTION(InstructionTLBError)
mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */
- mtspr SPRN_SPRG_WSCRATCH1, r11
- mtspr SPRN_SPRG_WSCRATCH2, r12
- mtspr SPRN_SPRG_WSCRATCH3, r13
- mfcr r11
- mtspr SPRN_SPRG_WSCRATCH4, r11
+ mfspr r10, SPRN_SPRG_THREAD
+ stw r11, THREAD_NORMSAVE(0)(r10)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+ mfspr r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
+ stw r12, THREAD_NORMSAVE(1)(r10)
+ stw r13, THREAD_NORMSAVE(2)(r10)
+ mfcr r13
+ stw r13, THREAD_NORMSAVE(3)(r10)
+ DO_KVM BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR1
mfspr r10, SPRN_SRR0 /* Get faulting address */
/* If we are faulting a kernel address, we have to use the
@@ -479,8 +588,8 @@ interrupt_base:
#ifdef CONFIG_PTE_64BIT
#ifdef CONFIG_SMP
- subf r10,r11,r12 /* create false data dep */
- lwzx r13,r11,r10 /* Get upper pte bits */
+ subf r13,r11,r12 /* create false data dep */
+ lwzx r13,r11,r13 /* Get upper pte bits */
#else
lwz r13,0(r12) /* Get upper pte bits */
#endif
@@ -495,47 +604,70 @@ interrupt_base:
/* The bailout. Restore registers to pre-exception conditions
* and call the heavyweights to help us out.
*/
- mfspr r11, SPRN_SPRG_RSCRATCH4
+ mfspr r10, SPRN_SPRG_THREAD
+ lwz r11, THREAD_NORMSAVE(3)(r10)
mtcr r11
- mfspr r13, SPRN_SPRG_RSCRATCH3
- mfspr r12, SPRN_SPRG_RSCRATCH2
- mfspr r11, SPRN_SPRG_RSCRATCH1
+ lwz r13, THREAD_NORMSAVE(2)(r10)
+ lwz r12, THREAD_NORMSAVE(1)(r10)
+ lwz r11, THREAD_NORMSAVE(0)(r10)
mfspr r10, SPRN_SPRG_RSCRATCH0
b InstructionStorage
#ifdef CONFIG_SPE
/* SPE Unavailable */
START_EXCEPTION(SPEUnavailable)
- NORMAL_EXCEPTION_PROLOG
- bne load_up_spe
- addi r3,r1,STACK_FRAME_OVERHEAD
+ NORMAL_EXCEPTION_PROLOG(SPE_ALTIVEC_UNAVAIL)
+ beq 1f
+ bl load_up_spe
+ b fast_exception_return
+1: addi r3,r1,STACK_FRAME_OVERHEAD
EXC_XFER_EE_LITE(0x2010, KernelSPE)
#else
- EXCEPTION(0x2020, SPEUnavailable, unknown_exception, EXC_XFER_EE)
+ EXCEPTION(0x2020, SPE_ALTIVEC_UNAVAIL, SPEUnavailable, \
+ unknown_exception, EXC_XFER_EE)
#endif /* CONFIG_SPE */
/* SPE Floating Point Data */
#ifdef CONFIG_SPE
- EXCEPTION(0x2030, SPEFloatingPointData, SPEFloatingPointException, EXC_XFER_EE);
+ EXCEPTION(0x2030, SPE_FP_DATA_ALTIVEC_ASSIST, SPEFloatingPointData,
+ SPEFloatingPointException, EXC_XFER_EE)
/* SPE Floating Point Round */
- EXCEPTION(0x2050, SPEFloatingPointRound, SPEFloatingPointRoundException, EXC_XFER_EE)
+ EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \
+ SPEFloatingPointRoundException, EXC_XFER_EE)
#else
- EXCEPTION(0x2040, SPEFloatingPointData, unknown_exception, EXC_XFER_EE)
- EXCEPTION(0x2050, SPEFloatingPointRound, unknown_exception, EXC_XFER_EE)
+ EXCEPTION(0x2040, SPE_FP_DATA_ALTIVEC_ASSIST, SPEFloatingPointData,
+ unknown_exception, EXC_XFER_EE)
+ EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \
+ unknown_exception, EXC_XFER_EE)
#endif /* CONFIG_SPE */
/* Performance Monitor */
- EXCEPTION(0x2060, PerformanceMonitor, performance_monitor_exception, EXC_XFER_STD)
+ EXCEPTION(0x2060, PERFORMANCE_MONITOR, PerformanceMonitor, \
+ performance_monitor_exception, EXC_XFER_STD)
- EXCEPTION(0x2070, Doorbell, doorbell_exception, EXC_XFER_STD)
+ EXCEPTION(0x2070, DOORBELL, Doorbell, doorbell_exception, EXC_XFER_STD)
- CRITICAL_EXCEPTION(0x2080, CriticalDoorbell, unknown_exception)
+ CRITICAL_EXCEPTION(0x2080, DOORBELL_CRITICAL, \
+ CriticalDoorbell, unknown_exception)
/* Debug Interrupt */
DEBUG_DEBUG_EXCEPTION
DEBUG_CRIT_EXCEPTION
+ GUEST_DOORBELL_EXCEPTION
+
+ CRITICAL_EXCEPTION(0, GUEST_DBELL_CRIT, CriticalGuestDoorbell, \
+ unknown_exception)
+
+ /* Hypercall */
+ EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception, EXC_XFER_EE)
+
+ /* Embedded Hypervisor Privilege */
+ EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception, EXC_XFER_EE)
+
+interrupt_end:
+
/*
* Local functions
*/
@@ -543,7 +675,7 @@ interrupt_base:
/*
* Both the instruction and data TLB miss get to this
* point to load the TLB.
- * r10 - available to use
+ * r10 - tsize encoding (if HUGETLB_PAGE) or available to use
* r11 - TLB (info from Linux PTE)
* r12 - available to use
* r13 - upper bits of PTE (if PTE_64BIT) or available to use
@@ -553,21 +685,73 @@ interrupt_base:
* Upon exit, we reload everything and RFI.
*/
finish_tlb_load:
+#ifdef CONFIG_HUGETLB_PAGE
+ cmpwi 6, r10, 0 /* check for huge page */
+ beq 6, finish_tlb_load_cont /* !huge */
+
+ /* Alas, we need more scratch registers for hugepages */
+ mfspr r12, SPRN_SPRG_THREAD
+ stw r14, THREAD_NORMSAVE(4)(r12)
+ stw r15, THREAD_NORMSAVE(5)(r12)
+ stw r16, THREAD_NORMSAVE(6)(r12)
+ stw r17, THREAD_NORMSAVE(7)(r12)
+
+ /* Get the next_tlbcam_idx percpu var */
+#ifdef CONFIG_SMP
+ lwz r12, THREAD_INFO-THREAD(r12)
+ lwz r15, TI_CPU(r12)
+ lis r14, __per_cpu_offset@h
+ ori r14, r14, __per_cpu_offset@l
+ rlwinm r15, r15, 2, 0, 29
+ lwzx r16, r14, r15
+#else
+ li r16, 0
+#endif
+ lis r17, next_tlbcam_idx@h
+ ori r17, r17, next_tlbcam_idx@l
+ add r17, r17, r16 /* r17 = *next_tlbcam_idx */
+ lwz r15, 0(r17) /* r15 = next_tlbcam_idx */
+
+ lis r14, MAS0_TLBSEL(1)@h /* select TLB1 (TLBCAM) */
+ rlwimi r14, r15, 16, 4, 15 /* next_tlbcam_idx entry */
+ mtspr SPRN_MAS0, r14
+
+ /* Extract TLB1CFG(NENTRY) */
+ mfspr r16, SPRN_TLB1CFG
+ andi. r16, r16, 0xfff
+
+ /* Update next_tlbcam_idx, wrapping when necessary */
+ addi r15, r15, 1
+ cmpw r15, r16
+ blt 100f
+ lis r14, tlbcam_index@h
+ ori r14, r14, tlbcam_index@l
+ lwz r15, 0(r14)
+100: stw r15, 0(r17)
+
+ /*
+ * Calc MAS1_TSIZE from r10 (which has pshift encoded)
+ * tlb_enc = (pshift - 10).
+ */
+ subi r15, r10, 10
+ mfspr r16, SPRN_MAS1
+ rlwimi r16, r15, 7, 20, 24
+ mtspr SPRN_MAS1, r16
+
+ /* copy the pshift for use later */
+ mr r14, r10
+
+ /* fall through */
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
/*
* We set execute, because we don't have the granularity to
* properly set this at the page level (Linux problem).
* Many of these bits are software only. Bits we don't set
* here we (properly should) assume have the appropriate value.
*/
-
- mfspr r12, SPRN_MAS2
-#ifdef CONFIG_PTE_64BIT
- rlwimi r12, r11, 32-19, 27, 31 /* extract WIMGE from pte */
-#else
- rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */
-#endif
- mtspr SPRN_MAS2, r12
-
+finish_tlb_load_cont:
#ifdef CONFIG_PTE_64BIT
rlwinm r12, r11, 32-2, 26, 31 /* Move in perm bits */
andi. r10, r11, _PAGE_DIRTY
@@ -576,22 +760,40 @@ finish_tlb_load:
andc r12, r12, r10
1: rlwimi r12, r13, 20, 0, 11 /* grab RPN[32:43] */
rlwimi r12, r11, 20, 12, 19 /* grab RPN[44:51] */
- mtspr SPRN_MAS3, r12
+2: mtspr SPRN_MAS3, r12
BEGIN_MMU_FTR_SECTION
srwi r10, r13, 12 /* grab RPN[12:31] */
mtspr SPRN_MAS7, r10
END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
#else
li r10, (_PAGE_EXEC | _PAGE_PRESENT)
+ mr r13, r11
rlwimi r10, r11, 31, 29, 29 /* extract _PAGE_DIRTY into SW */
and r12, r11, r10
andi. r10, r11, _PAGE_USER /* Test for _PAGE_USER */
slwi r10, r12, 1
or r10, r10, r12
iseleq r12, r12, r10
- rlwimi r11, r12, 0, 20, 31 /* Extract RPN from PTE and merge with perms */
- mtspr SPRN_MAS3, r11
+ rlwimi r13, r12, 0, 20, 31 /* Get RPN from PTE, merge w/ perms */
+ mtspr SPRN_MAS3, r13
#endif
+
+ mfspr r12, SPRN_MAS2
+#ifdef CONFIG_PTE_64BIT
+ rlwimi r12, r11, 32-19, 27, 31 /* extract WIMGE from pte */
+#else
+ rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */
+#endif
+#ifdef CONFIG_HUGETLB_PAGE
+ beq 6, 3f /* don't mask if page isn't huge */
+ li r13, 1
+ slw r13, r13, r14
+ subi r13, r13, 1
+ rlwinm r13, r13, 0, 0, 19 /* bottom bits used for WIMGE/etc */
+ andc r12, r12, r13 /* mask off ea bits within the page */
+#endif
+3: mtspr SPRN_MAS2, r12
+
#ifdef CONFIG_E200
/* Round robin TLB1 entries assignment */
mfspr r12, SPRN_MAS0
@@ -617,14 +819,23 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
mtspr SPRN_MAS0,r12
#endif /* CONFIG_E200 */
+tlb_write_entry:
tlbwe
/* Done...restore registers and get out of here. */
- mfspr r11, SPRN_SPRG_RSCRATCH4
+ mfspr r10, SPRN_SPRG_THREAD
+#ifdef CONFIG_HUGETLB_PAGE
+ beq 6, 8f /* skip restore for 4k page faults */
+ lwz r14, THREAD_NORMSAVE(4)(r10)
+ lwz r15, THREAD_NORMSAVE(5)(r10)
+ lwz r16, THREAD_NORMSAVE(6)(r10)
+ lwz r17, THREAD_NORMSAVE(7)(r10)
+#endif
+8: lwz r11, THREAD_NORMSAVE(3)(r10)
mtcr r11
- mfspr r13, SPRN_SPRG_RSCRATCH3
- mfspr r12, SPRN_SPRG_RSCRATCH2
- mfspr r11, SPRN_SPRG_RSCRATCH1
+ lwz r13, THREAD_NORMSAVE(2)(r10)
+ lwz r12, THREAD_NORMSAVE(1)(r10)
+ lwz r11, THREAD_NORMSAVE(0)(r10)
mfspr r10, SPRN_SPRG_RSCRATCH0
rfi /* Force context change */
@@ -632,7 +843,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
/* Note that the SPE support is closely modeled after the AltiVec
* support. Changes to one are likely to be applicable to the
* other! */
-load_up_spe:
+_GLOBAL(load_up_spe)
/*
* Disable SPE for the task which had SPE previously,
* and save its SPE registers in its thread_struct.
@@ -655,7 +866,7 @@ load_up_spe:
cmpi 0,r4,0
beq 1f
addi r4,r4,THREAD /* want THREAD of last_task_used_spe */
- SAVE_32EVRS(0,r10,r4)
+ SAVE_32EVRS(0,r10,r4,THREAD_EVR0)
evxor evr10, evr10, evr10 /* clear out evr10 */
evmwumiaa evr10, evr10, evr10 /* evr10 <- ACC = 0 * 0 + ACC */
li r5,THREAD_ACC
@@ -675,25 +886,12 @@ load_up_spe:
stw r4,THREAD_USED_SPE(r5)
evlddx evr4,r10,r5
evmra evr4,evr4
- REST_32EVRS(0,r10,r5)
+ REST_32EVRS(0,r10,r5,THREAD_EVR0)
#ifndef CONFIG_SMP
subi r4,r5,THREAD
stw r4,last_task_used_spe@l(r3)
#endif /* !CONFIG_SMP */
- /* restore registers and return */
-2: REST_4GPRS(3, r11)
- lwz r10,_CCR(r11)
- REST_GPR(1, r11)
- mtcr r10
- lwz r10,_LINK(r11)
- mtlr r10
- REST_GPR(10, r11)
- mtspr SPRN_SRR1,r9
- mtspr SPRN_SRR0,r12
- REST_GPR(9, r11)
- REST_GPR(12, r11)
- lwz r11,GPR11(r11)
- rfi
+ blr
/*
* SPE unavailable trap from kernel - print a message, but let
@@ -719,6 +917,33 @@ KernelSPE:
#endif /* CONFIG_SPE */
/*
+ * Translate the effec addr in r3 to phys addr. The phys addr will be put
+ * into r3(higher 32bit) and r4(lower 32bit)
+ */
+get_phys_addr:
+ mfmsr r8
+ mfspr r9,SPRN_PID
+ rlwinm r9,r9,16,0x3fff0000 /* turn PID into MAS6[SPID] */
+ rlwimi r9,r8,28,0x00000001 /* turn MSR[DS] into MAS6[SAS] */
+ mtspr SPRN_MAS6,r9
+
+ tlbsx 0,r3 /* must succeed */
+
+ mfspr r8,SPRN_MAS1
+ mfspr r12,SPRN_MAS3
+ rlwinm r9,r8,25,0x1f /* r9 = log2(page size) */
+ li r10,1024
+ slw r10,r10,r9 /* r10 = page size */
+ addi r10,r10,-1
+ and r11,r3,r10 /* r11 = page offset */
+ andc r4,r12,r10 /* r4 = page base */
+ or r4,r4,r11 /* r4 = devtree phys addr */
+#ifdef CONFIG_PHYS_64BIT
+ mfspr r3,SPRN_MAS7
+#endif
+ blr
+
+/*
* Global functions
*/
@@ -763,12 +988,17 @@ _GLOBAL(__setup_e500mc_ivors)
sync
blr
-/*
- * extern void giveup_altivec(struct task_struct *prev)
- *
- * The e500 core does not have an AltiVec unit.
- */
-_GLOBAL(giveup_altivec)
+/* setup ehv ivors for */
+_GLOBAL(__setup_ehv_ivors)
+ li r3,GuestDoorbell@l
+ mtspr SPRN_IVOR38,r3
+ li r3,CriticalGuestDoorbell@l
+ mtspr SPRN_IVOR39,r3
+ li r3,Hypercall@l
+ mtspr SPRN_IVOR40,r3
+ li r3,Ehvpriv@l
+ mtspr SPRN_IVOR41,r3
+ sync
blr
#ifdef CONFIG_SPE
@@ -786,13 +1016,11 @@ _GLOBAL(giveup_spe)
addi r3,r3,THREAD /* want THREAD of task */
lwz r5,PT_REGS(r3)
cmpi 0,r5,0
- SAVE_32EVRS(0, r4, r3)
+ SAVE_32EVRS(0, r4, r3, THREAD_EVR0)
evxor evr6, evr6, evr6 /* clear out evr6 */
evmwumiaa evr6, evr6, evr6 /* evr6 <- ACC = 0 * 0 + ACC */
li r4,THREAD_ACC
evstddx evr6, r4, r3 /* save off accumulator */
- mfspr r6,SPRN_SPEFSCR
- stw r6,THREAD_SPEFSCR(r3) /* save spefscr register value */
beq 1f
lwz r4,_MSR-STACK_FRAME_OVERHEAD(r5)
lis r3,MSR_SPE@h
@@ -808,16 +1036,6 @@ _GLOBAL(giveup_spe)
#endif /* CONFIG_SPE */
/*
- * extern void giveup_fpu(struct task_struct *prev)
- *
- * Not all FSL Book-E cores have an FPU
- */
-#ifndef CONFIG_PPC_FPU
-_GLOBAL(giveup_fpu)
- blr
-#endif
-
-/*
* extern void abort(void)
*
* At present, this routine just applies a system reset.
@@ -895,28 +1113,68 @@ _GLOBAL(flush_dcache_L1)
blr
+/* Flush L1 d-cache, invalidate and disable d-cache and i-cache */
+_GLOBAL(__flush_disable_L1)
+ mflr r10
+ bl flush_dcache_L1 /* Flush L1 d-cache */
+ mtlr r10
+
+ mfspr r4, SPRN_L1CSR0 /* Invalidate and disable d-cache */
+ li r5, 2
+ rlwimi r4, r5, 0, 3
+
+ msync
+ isync
+ mtspr SPRN_L1CSR0, r4
+ isync
+
+1: mfspr r4, SPRN_L1CSR0 /* Wait for the invalidate to finish */
+ andi. r4, r4, 2
+ bne 1b
+
+ mfspr r4, SPRN_L1CSR1 /* Invalidate and disable i-cache */
+ li r5, 2
+ rlwimi r4, r5, 0, 3
+
+ mtspr SPRN_L1CSR1, r4
+ isync
+
+ blr
+
#ifdef CONFIG_SMP
/* When we get here, r24 needs to hold the CPU # */
.globl __secondary_start
__secondary_start:
- lis r3,__secondary_hold_acknowledge@h
- ori r3,r3,__secondary_hold_acknowledge@l
- stw r24,0(r3)
-
- li r3,0
- mr r4,r24 /* Why? */
- bl call_setup_cpu
-
- lis r3,tlbcam_index@ha
- lwz r3,tlbcam_index@l(r3)
+ LOAD_REG_ADDR_PIC(r3, tlbcam_index)
+ lwz r3,0(r3)
mtctr r3
li r26,0 /* r26 safe? */
+ bl switch_to_as1
+ mr r27,r3 /* tlb entry */
/* Load each CAM entry */
1: mr r3,r26
bl loadcam_entry
addi r26,r26,1
bdnz 1b
+ mr r3,r27 /* tlb entry */
+ LOAD_REG_ADDR_PIC(r4, memstart_addr)
+ lwz r4,0(r4)
+ mr r5,r25 /* phys kernel start */
+ rlwinm r5,r5,0,~0x3ffffff /* aligned 64M */
+ subf r4,r5,r4 /* memstart_addr - phys kernel start */
+ li r5,0 /* no device tree */
+ li r6,0 /* not boot cpu */
+ bl restore_to_as0
+
+
+ lis r3,__secondary_hold_acknowledge@h
+ ori r3,r3,__secondary_hold_acknowledge@l
+ stw r24,0(r3)
+
+ li r3,0
+ mr r4,r24 /* Why? */
+ bl call_setup_cpu
/* get current_thread_info and current */
lis r1,secondary_ti@ha
@@ -953,6 +1211,112 @@ __secondary_hold_acknowledge:
#endif
/*
+ * Create a tlb entry with the same effective and physical address as
+ * the tlb entry used by the current running code. But set the TS to 1.
+ * Then switch to the address space 1. It will return with the r3 set to
+ * the ESEL of the new created tlb.
+ */
+_GLOBAL(switch_to_as1)
+ mflr r5
+
+ /* Find a entry not used */
+ mfspr r3,SPRN_TLB1CFG
+ andi. r3,r3,0xfff
+ mfspr r4,SPRN_PID
+ rlwinm r4,r4,16,0x3fff0000 /* turn PID into MAS6[SPID] */
+ mtspr SPRN_MAS6,r4
+1: lis r4,0x1000 /* Set MAS0(TLBSEL) = 1 */
+ addi r3,r3,-1
+ rlwimi r4,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */
+ mtspr SPRN_MAS0,r4
+ tlbre
+ mfspr r4,SPRN_MAS1
+ andis. r4,r4,MAS1_VALID@h
+ bne 1b
+
+ /* Get the tlb entry used by the current running code */
+ bl 0f
+0: mflr r4
+ tlbsx 0,r4
+
+ mfspr r4,SPRN_MAS1
+ ori r4,r4,MAS1_TS /* Set the TS = 1 */
+ mtspr SPRN_MAS1,r4
+
+ mfspr r4,SPRN_MAS0
+ rlwinm r4,r4,0,~MAS0_ESEL_MASK
+ rlwimi r4,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */
+ mtspr SPRN_MAS0,r4
+ tlbwe
+ isync
+ sync
+
+ mfmsr r4
+ ori r4,r4,MSR_IS | MSR_DS
+ mtspr SPRN_SRR0,r5
+ mtspr SPRN_SRR1,r4
+ sync
+ rfi
+
+/*
+ * Restore to the address space 0 and also invalidate the tlb entry created
+ * by switch_to_as1.
+ * r3 - the tlb entry which should be invalidated
+ * r4 - __pa(PAGE_OFFSET in AS1) - __pa(PAGE_OFFSET in AS0)
+ * r5 - device tree virtual address. If r4 is 0, r5 is ignored.
+ * r6 - boot cpu
+*/
+_GLOBAL(restore_to_as0)
+ mflr r0
+
+ bl 0f
+0: mflr r9
+ addi r9,r9,1f - 0b
+
+ /*
+ * We may map the PAGE_OFFSET in AS0 to a different physical address,
+ * so we need calculate the right jump and device tree address based
+ * on the offset passed by r4.
+ */
+ add r9,r9,r4
+ add r5,r5,r4
+ add r0,r0,r4
+
+2: mfmsr r7
+ li r8,(MSR_IS | MSR_DS)
+ andc r7,r7,r8
+
+ mtspr SPRN_SRR0,r9
+ mtspr SPRN_SRR1,r7
+ sync
+ rfi
+
+ /* Invalidate the temporary tlb entry for AS1 */
+1: lis r9,0x1000 /* Set MAS0(TLBSEL) = 1 */
+ rlwimi r9,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r3) */
+ mtspr SPRN_MAS0,r9
+ tlbre
+ mfspr r9,SPRN_MAS1
+ rlwinm r9,r9,0,2,31 /* Clear MAS1 Valid and IPPROT */
+ mtspr SPRN_MAS1,r9
+ tlbwe
+ isync
+
+ cmpwi r4,0
+ cmpwi cr1,r6,0
+ cror eq,4*cr1+eq,eq
+ bne 3f /* offset != 0 && is_boot_cpu */
+ mtlr r0
+ blr
+
+ /*
+ * The PAGE_OFFSET will map to a different physical address,
+ * jump to _start to do another relocation again.
+ */
+3: mr r3,r5
+ bl _start
+
+/*
* We put a few things here that have to be page-aligned. This stuff
* goes at the beginning of the data segment, which is page-aligned.
*/
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 5ecd0401cdb..0bb5918faaa 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -27,9 +27,7 @@
#include <linux/kprobes.h>
#include <linux/percpu.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/sched.h>
-#include <linux/init.h>
#include <linux/smp.h>
#include <asm/hw_breakpoint.h>
@@ -74,7 +72,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
* If so, DABR will be populated in single_step_dabr_instruction().
*/
if (current->thread.last_hit_ubp != bp)
- set_dabr(info->address | info->type | DABR_TRANSLATION);
+ __set_breakpoint(info);
return 0;
}
@@ -98,7 +96,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
}
*slot = NULL;
- set_dabr(0);
+ hw_breakpoint_disable();
}
/*
@@ -112,7 +110,7 @@ void arch_unregister_hw_breakpoint(struct perf_event *bp)
* and the single_step_dabr_instruction(), then cleanup the breakpoint
* restoration variables to prevent dangling pointers.
*/
- if (bp->ctx->task)
+ if (bp->ctx && bp->ctx->task)
bp->ctx->task->thread.last_hit_ubp = NULL;
}
@@ -128,19 +126,13 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
int arch_bp_generic_fields(int type, int *gen_bp_type)
{
- switch (type) {
- case DABR_DATA_READ:
- *gen_bp_type = HW_BREAKPOINT_R;
- break;
- case DABR_DATA_WRITE:
- *gen_bp_type = HW_BREAKPOINT_W;
- break;
- case (DABR_DATA_WRITE | DABR_DATA_READ):
- *gen_bp_type = (HW_BREAKPOINT_W | HW_BREAKPOINT_R);
- break;
- default:
+ *gen_bp_type = 0;
+ if (type & HW_BRK_TYPE_READ)
+ *gen_bp_type |= HW_BREAKPOINT_R;
+ if (type & HW_BRK_TYPE_WRITE)
+ *gen_bp_type |= HW_BREAKPOINT_W;
+ if (*gen_bp_type == 0)
return -EINVAL;
- }
return 0;
}
@@ -149,26 +141,26 @@ int arch_bp_generic_fields(int type, int *gen_bp_type)
*/
int arch_validate_hwbkpt_settings(struct perf_event *bp)
{
- int ret = -EINVAL;
+ int ret = -EINVAL, length_max;
struct arch_hw_breakpoint *info = counter_arch_bp(bp);
if (!bp)
return ret;
- switch (bp->attr.bp_type) {
- case HW_BREAKPOINT_R:
- info->type = DABR_DATA_READ;
- break;
- case HW_BREAKPOINT_W:
- info->type = DABR_DATA_WRITE;
- break;
- case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
- info->type = (DABR_DATA_READ | DABR_DATA_WRITE);
- break;
- default:
+ info->type = HW_BRK_TYPE_TRANSLATE;
+ if (bp->attr.bp_type & HW_BREAKPOINT_R)
+ info->type |= HW_BRK_TYPE_READ;
+ if (bp->attr.bp_type & HW_BREAKPOINT_W)
+ info->type |= HW_BRK_TYPE_WRITE;
+ if (info->type == HW_BRK_TYPE_TRANSLATE)
+ /* must set alteast read or write */
return ret;
- }
-
+ if (!(bp->attr.exclude_user))
+ info->type |= HW_BRK_TYPE_USER;
+ if (!(bp->attr.exclude_kernel))
+ info->type |= HW_BRK_TYPE_KERNEL;
+ if (!(bp->attr.exclude_hv))
+ info->type |= HW_BRK_TYPE_HYP;
info->address = bp->attr.bp_addr;
info->len = bp->attr.bp_len;
@@ -178,8 +170,16 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
* HW_BREAKPOINT_ALIGN by rounding off to the lower address, the
* 'symbolsize' should satisfy the check below.
*/
+ length_max = 8; /* DABR */
+ if (cpu_has_feature(CPU_FTR_DAWR)) {
+ length_max = 512 ; /* 64 doublewords */
+ /* DAWR region can't cross 512 boundary */
+ if ((bp->attr.bp_addr >> 10) !=
+ ((bp->attr.bp_addr + bp->attr.bp_len - 1) >> 10))
+ return -EINVAL;
+ }
if (info->len >
- (HW_BREAKPOINT_LEN - (info->address & HW_BREAKPOINT_ALIGN)))
+ (length_max - (info->address & HW_BREAKPOINT_ALIGN)))
return -EINVAL;
return 0;
}
@@ -198,7 +198,7 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs)
info = counter_arch_bp(tsk->thread.last_hit_ubp);
regs->msr &= ~MSR_SE;
- set_dabr(info->address | info->type | DABR_TRANSLATION);
+ __set_breakpoint(info);
tsk->thread.last_hit_ubp = NULL;
}
@@ -216,7 +216,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
unsigned long dar = regs->dar;
/* Disable breakpoints during exception handling */
- set_dabr(0);
+ hw_breakpoint_disable();
/*
* The counter may be concurrently released but that can only
@@ -249,12 +249,14 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
* we still need to single-step the instruction, but we don't
* generate an event.
*/
- info->extraneous_interrupt = !((bp->attr.bp_addr <= dar) &&
- (dar - bp->attr.bp_addr < bp->attr.bp_len));
+ info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ;
+ if (!((bp->attr.bp_addr <= dar) &&
+ (dar - bp->attr.bp_addr < bp->attr.bp_len)))
+ info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
/* Do not emulate user-space instructions, instead single-step them */
if (user_mode(regs)) {
- bp->ctx->task->thread.last_hit_ubp = bp;
+ current->thread.last_hit_ubp = bp;
regs->msr |= MSR_SE;
goto out;
}
@@ -279,10 +281,10 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
* As a policy, the callback is invoked in a 'trigger-after-execute'
* fashion
*/
- if (!info->extraneous_interrupt)
+ if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ))
perf_bp_event(bp, regs);
- set_dabr(info->address | info->type | DABR_TRANSLATION);
+ __set_breakpoint(info);
out:
rcu_read_unlock();
return rc;
@@ -295,7 +297,7 @@ int __kprobes single_step_dabr_instruction(struct die_args *args)
{
struct pt_regs *regs = args->regs;
struct perf_event *bp = NULL;
- struct arch_hw_breakpoint *bp_info;
+ struct arch_hw_breakpoint *info;
bp = current->thread.last_hit_ubp;
/*
@@ -305,16 +307,16 @@ int __kprobes single_step_dabr_instruction(struct die_args *args)
if (!bp)
return NOTIFY_DONE;
- bp_info = counter_arch_bp(bp);
+ info = counter_arch_bp(bp);
/*
* We shall invoke the user-defined callback function in the single
* stepping handler to confirm to 'trigger-after-execute' semantics
*/
- if (!bp_info->extraneous_interrupt)
+ if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ))
perf_bp_event(bp, regs);
- set_dabr(bp_info->address | bp_info->type | DABR_TRANSLATION);
+ __set_breakpoint(info);
current->thread.last_hit_ubp = NULL;
/*
diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c
index f62efdfd176..1114d13ac19 100644
--- a/arch/powerpc/kernel/ibmebus.c
+++ b/arch/powerpc/kernel/ibmebus.c
@@ -37,15 +37,16 @@
*/
#include <linux/init.h>
+#include <linux/export.h>
#include <linux/console.h>
#include <linux/kobject.h>
#include <linux/dma-mapping.h>
#include <linux/interrupt.h>
#include <linux/of.h>
#include <linux/slab.h>
+#include <linux/stat.h>
#include <linux/of_platform.h>
#include <asm/ibmebus.h>
-#include <asm/abs_addr.h>
static struct device ibmebus_bus_device = { /* fake "parent" device */
.init_name = "ibmebus",
@@ -63,7 +64,8 @@ static struct of_device_id __initdata ibmebus_matches[] = {
static void *ibmebus_alloc_coherent(struct device *dev,
size_t size,
dma_addr_t *dma_handle,
- gfp_t flag)
+ gfp_t flag,
+ struct dma_attrs *attrs)
{
void *mem;
@@ -75,7 +77,8 @@ static void *ibmebus_alloc_coherent(struct device *dev,
static void ibmebus_free_coherent(struct device *dev,
size_t size, void *vaddr,
- dma_addr_t dma_handle)
+ dma_addr_t dma_handle,
+ struct dma_attrs *attrs)
{
kfree(vaddr);
}
@@ -125,17 +128,23 @@ static void ibmebus_unmap_sg(struct device *dev,
static int ibmebus_dma_supported(struct device *dev, u64 mask)
{
- return 1;
+ return mask == DMA_BIT_MASK(64);
+}
+
+static u64 ibmebus_dma_get_required_mask(struct device *dev)
+{
+ return DMA_BIT_MASK(64);
}
static struct dma_map_ops ibmebus_dma_ops = {
- .alloc_coherent = ibmebus_alloc_coherent,
- .free_coherent = ibmebus_free_coherent,
- .map_sg = ibmebus_map_sg,
- .unmap_sg = ibmebus_unmap_sg,
- .dma_supported = ibmebus_dma_supported,
- .map_page = ibmebus_map_page,
- .unmap_page = ibmebus_unmap_page,
+ .alloc = ibmebus_alloc_coherent,
+ .free = ibmebus_free_coherent,
+ .map_sg = ibmebus_map_sg,
+ .unmap_sg = ibmebus_unmap_sg,
+ .dma_supported = ibmebus_dma_supported,
+ .get_required_mask = ibmebus_dma_get_required_mask,
+ .map_page = ibmebus_map_page,
+ .unmap_page = ibmebus_unmap_page,
};
static int ibmebus_match_path(struct device *dev, void *data)
@@ -196,18 +205,19 @@ static int ibmebus_create_devices(const struct of_device_id *matches)
return ret;
}
-int ibmebus_register_driver(struct of_platform_driver *drv)
+int ibmebus_register_driver(struct platform_driver *drv)
{
/* If the driver uses devices that ibmebus doesn't know, add them */
ibmebus_create_devices(drv->driver.of_match_table);
- return of_register_driver(drv, &ibmebus_bus_type);
+ drv->driver.bus = &ibmebus_bus_type;
+ return driver_register(&drv->driver);
}
EXPORT_SYMBOL(ibmebus_register_driver);
-void ibmebus_unregister_driver(struct of_platform_driver *drv)
+void ibmebus_unregister_driver(struct platform_driver *drv)
{
- of_unregister_driver(drv);
+ driver_unregister(&drv->driver);
}
EXPORT_SYMBOL(ibmebus_unregister_driver);
@@ -282,6 +292,7 @@ out:
return rc;
return count;
}
+static BUS_ATTR(probe, S_IWUSR, NULL, ibmebus_store_probe);
static ssize_t ibmebus_store_remove(struct bus_type *bus,
const char *buf, size_t count)
@@ -307,16 +318,410 @@ static ssize_t ibmebus_store_remove(struct bus_type *bus,
return -ENODEV;
}
}
+static BUS_ATTR(remove, S_IWUSR, NULL, ibmebus_store_remove);
+
+static struct attribute *ibmbus_bus_attrs[] = {
+ &bus_attr_probe.attr,
+ &bus_attr_remove.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(ibmbus_bus);
+
+static int ibmebus_bus_bus_match(struct device *dev, struct device_driver *drv)
+{
+ const struct of_device_id *matches = drv->of_match_table;
+
+ if (!matches)
+ return 0;
+
+ return of_match_device(matches, dev) != NULL;
+}
+
+static int ibmebus_bus_device_probe(struct device *dev)
+{
+ int error = -ENODEV;
+ struct platform_driver *drv;
+ struct platform_device *of_dev;
+
+ drv = to_platform_driver(dev->driver);
+ of_dev = to_platform_device(dev);
+
+ if (!drv->probe)
+ return error;
+
+ of_dev_get(of_dev);
+
+ if (of_driver_match_device(dev, dev->driver))
+ error = drv->probe(of_dev);
+ if (error)
+ of_dev_put(of_dev);
+
+ return error;
+}
-static struct bus_attribute ibmebus_bus_attrs[] = {
- __ATTR(probe, S_IWUSR, NULL, ibmebus_store_probe),
- __ATTR(remove, S_IWUSR, NULL, ibmebus_store_remove),
+static int ibmebus_bus_device_remove(struct device *dev)
+{
+ struct platform_device *of_dev = to_platform_device(dev);
+ struct platform_driver *drv = to_platform_driver(dev->driver);
+
+ if (dev->driver && drv->remove)
+ drv->remove(of_dev);
+ return 0;
+}
+
+static void ibmebus_bus_device_shutdown(struct device *dev)
+{
+ struct platform_device *of_dev = to_platform_device(dev);
+ struct platform_driver *drv = to_platform_driver(dev->driver);
+
+ if (dev->driver && drv->shutdown)
+ drv->shutdown(of_dev);
+}
+
+/*
+ * ibmebus_bus_device_attrs
+ */
+static ssize_t devspec_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct platform_device *ofdev;
+
+ ofdev = to_platform_device(dev);
+ return sprintf(buf, "%s\n", ofdev->dev.of_node->full_name);
+}
+
+static ssize_t name_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct platform_device *ofdev;
+
+ ofdev = to_platform_device(dev);
+ return sprintf(buf, "%s\n", ofdev->dev.of_node->name);
+}
+
+static ssize_t modalias_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ ssize_t len = of_device_get_modalias(dev, buf, PAGE_SIZE - 2);
+ buf[len] = '\n';
+ buf[len+1] = 0;
+ return len+1;
+}
+
+struct device_attribute ibmebus_bus_device_attrs[] = {
+ __ATTR_RO(devspec),
+ __ATTR_RO(name),
+ __ATTR_RO(modalias),
__ATTR_NULL
};
+#ifdef CONFIG_PM_SLEEP
+static int ibmebus_bus_legacy_suspend(struct device *dev, pm_message_t mesg)
+{
+ struct platform_device *of_dev = to_platform_device(dev);
+ struct platform_driver *drv = to_platform_driver(dev->driver);
+ int ret = 0;
+
+ if (dev->driver && drv->suspend)
+ ret = drv->suspend(of_dev, mesg);
+ return ret;
+}
+
+static int ibmebus_bus_legacy_resume(struct device *dev)
+{
+ struct platform_device *of_dev = to_platform_device(dev);
+ struct platform_driver *drv = to_platform_driver(dev->driver);
+ int ret = 0;
+
+ if (dev->driver && drv->resume)
+ ret = drv->resume(of_dev);
+ return ret;
+}
+
+static int ibmebus_bus_pm_prepare(struct device *dev)
+{
+ struct device_driver *drv = dev->driver;
+ int ret = 0;
+
+ if (drv && drv->pm && drv->pm->prepare)
+ ret = drv->pm->prepare(dev);
+
+ return ret;
+}
+
+static void ibmebus_bus_pm_complete(struct device *dev)
+{
+ struct device_driver *drv = dev->driver;
+
+ if (drv && drv->pm && drv->pm->complete)
+ drv->pm->complete(dev);
+}
+
+#ifdef CONFIG_SUSPEND
+
+static int ibmebus_bus_pm_suspend(struct device *dev)
+{
+ struct device_driver *drv = dev->driver;
+ int ret = 0;
+
+ if (!drv)
+ return 0;
+
+ if (drv->pm) {
+ if (drv->pm->suspend)
+ ret = drv->pm->suspend(dev);
+ } else {
+ ret = ibmebus_bus_legacy_suspend(dev, PMSG_SUSPEND);
+ }
+
+ return ret;
+}
+
+static int ibmebus_bus_pm_suspend_noirq(struct device *dev)
+{
+ struct device_driver *drv = dev->driver;
+ int ret = 0;
+
+ if (!drv)
+ return 0;
+
+ if (drv->pm) {
+ if (drv->pm->suspend_noirq)
+ ret = drv->pm->suspend_noirq(dev);
+ }
+
+ return ret;
+}
+
+static int ibmebus_bus_pm_resume(struct device *dev)
+{
+ struct device_driver *drv = dev->driver;
+ int ret = 0;
+
+ if (!drv)
+ return 0;
+
+ if (drv->pm) {
+ if (drv->pm->resume)
+ ret = drv->pm->resume(dev);
+ } else {
+ ret = ibmebus_bus_legacy_resume(dev);
+ }
+
+ return ret;
+}
+
+static int ibmebus_bus_pm_resume_noirq(struct device *dev)
+{
+ struct device_driver *drv = dev->driver;
+ int ret = 0;
+
+ if (!drv)
+ return 0;
+
+ if (drv->pm) {
+ if (drv->pm->resume_noirq)
+ ret = drv->pm->resume_noirq(dev);
+ }
+
+ return ret;
+}
+
+#else /* !CONFIG_SUSPEND */
+
+#define ibmebus_bus_pm_suspend NULL
+#define ibmebus_bus_pm_resume NULL
+#define ibmebus_bus_pm_suspend_noirq NULL
+#define ibmebus_bus_pm_resume_noirq NULL
+
+#endif /* !CONFIG_SUSPEND */
+
+#ifdef CONFIG_HIBERNATE_CALLBACKS
+
+static int ibmebus_bus_pm_freeze(struct device *dev)
+{
+ struct device_driver *drv = dev->driver;
+ int ret = 0;
+
+ if (!drv)
+ return 0;
+
+ if (drv->pm) {
+ if (drv->pm->freeze)
+ ret = drv->pm->freeze(dev);
+ } else {
+ ret = ibmebus_bus_legacy_suspend(dev, PMSG_FREEZE);
+ }
+
+ return ret;
+}
+
+static int ibmebus_bus_pm_freeze_noirq(struct device *dev)
+{
+ struct device_driver *drv = dev->driver;
+ int ret = 0;
+
+ if (!drv)
+ return 0;
+
+ if (drv->pm) {
+ if (drv->pm->freeze_noirq)
+ ret = drv->pm->freeze_noirq(dev);
+ }
+
+ return ret;
+}
+
+static int ibmebus_bus_pm_thaw(struct device *dev)
+{
+ struct device_driver *drv = dev->driver;
+ int ret = 0;
+
+ if (!drv)
+ return 0;
+
+ if (drv->pm) {
+ if (drv->pm->thaw)
+ ret = drv->pm->thaw(dev);
+ } else {
+ ret = ibmebus_bus_legacy_resume(dev);
+ }
+
+ return ret;
+}
+
+static int ibmebus_bus_pm_thaw_noirq(struct device *dev)
+{
+ struct device_driver *drv = dev->driver;
+ int ret = 0;
+
+ if (!drv)
+ return 0;
+
+ if (drv->pm) {
+ if (drv->pm->thaw_noirq)
+ ret = drv->pm->thaw_noirq(dev);
+ }
+
+ return ret;
+}
+
+static int ibmebus_bus_pm_poweroff(struct device *dev)
+{
+ struct device_driver *drv = dev->driver;
+ int ret = 0;
+
+ if (!drv)
+ return 0;
+
+ if (drv->pm) {
+ if (drv->pm->poweroff)
+ ret = drv->pm->poweroff(dev);
+ } else {
+ ret = ibmebus_bus_legacy_suspend(dev, PMSG_HIBERNATE);
+ }
+
+ return ret;
+}
+
+static int ibmebus_bus_pm_poweroff_noirq(struct device *dev)
+{
+ struct device_driver *drv = dev->driver;
+ int ret = 0;
+
+ if (!drv)
+ return 0;
+
+ if (drv->pm) {
+ if (drv->pm->poweroff_noirq)
+ ret = drv->pm->poweroff_noirq(dev);
+ }
+
+ return ret;
+}
+
+static int ibmebus_bus_pm_restore(struct device *dev)
+{
+ struct device_driver *drv = dev->driver;
+ int ret = 0;
+
+ if (!drv)
+ return 0;
+
+ if (drv->pm) {
+ if (drv->pm->restore)
+ ret = drv->pm->restore(dev);
+ } else {
+ ret = ibmebus_bus_legacy_resume(dev);
+ }
+
+ return ret;
+}
+
+static int ibmebus_bus_pm_restore_noirq(struct device *dev)
+{
+ struct device_driver *drv = dev->driver;
+ int ret = 0;
+
+ if (!drv)
+ return 0;
+
+ if (drv->pm) {
+ if (drv->pm->restore_noirq)
+ ret = drv->pm->restore_noirq(dev);
+ }
+
+ return ret;
+}
+
+#else /* !CONFIG_HIBERNATE_CALLBACKS */
+
+#define ibmebus_bus_pm_freeze NULL
+#define ibmebus_bus_pm_thaw NULL
+#define ibmebus_bus_pm_poweroff NULL
+#define ibmebus_bus_pm_restore NULL
+#define ibmebus_bus_pm_freeze_noirq NULL
+#define ibmebus_bus_pm_thaw_noirq NULL
+#define ibmebus_bus_pm_poweroff_noirq NULL
+#define ibmebus_bus_pm_restore_noirq NULL
+
+#endif /* !CONFIG_HIBERNATE_CALLBACKS */
+
+static struct dev_pm_ops ibmebus_bus_dev_pm_ops = {
+ .prepare = ibmebus_bus_pm_prepare,
+ .complete = ibmebus_bus_pm_complete,
+ .suspend = ibmebus_bus_pm_suspend,
+ .resume = ibmebus_bus_pm_resume,
+ .freeze = ibmebus_bus_pm_freeze,
+ .thaw = ibmebus_bus_pm_thaw,
+ .poweroff = ibmebus_bus_pm_poweroff,
+ .restore = ibmebus_bus_pm_restore,
+ .suspend_noirq = ibmebus_bus_pm_suspend_noirq,
+ .resume_noirq = ibmebus_bus_pm_resume_noirq,
+ .freeze_noirq = ibmebus_bus_pm_freeze_noirq,
+ .thaw_noirq = ibmebus_bus_pm_thaw_noirq,
+ .poweroff_noirq = ibmebus_bus_pm_poweroff_noirq,
+ .restore_noirq = ibmebus_bus_pm_restore_noirq,
+};
+
+#define IBMEBUS_BUS_PM_OPS_PTR (&ibmebus_bus_dev_pm_ops)
+
+#else /* !CONFIG_PM_SLEEP */
+
+#define IBMEBUS_BUS_PM_OPS_PTR NULL
+
+#endif /* !CONFIG_PM_SLEEP */
+
struct bus_type ibmebus_bus_type = {
- .uevent = of_device_uevent,
- .bus_attrs = ibmebus_bus_attrs
+ .name = "ibmebus",
+ .uevent = of_device_uevent_modalias,
+ .bus_groups = ibmbus_bus_groups,
+ .match = ibmebus_bus_bus_match,
+ .probe = ibmebus_bus_device_probe,
+ .remove = ibmebus_bus_device_remove,
+ .shutdown = ibmebus_bus_device_shutdown,
+ .dev_attrs = ibmebus_bus_device_attrs,
+ .pm = IBMEBUS_BUS_PM_OPS_PTR,
};
EXPORT_SYMBOL(ibmebus_bus_type);
@@ -326,7 +731,7 @@ static int __init ibmebus_bus_init(void)
printk(KERN_INFO "IBM eBus Device Driver\n");
- err = of_bus_type_init(&ibmebus_bus_type, "ibmebus");
+ err = bus_register(&ibmebus_bus_type);
if (err) {
printk(KERN_ERR "%s: failed to register IBM eBus.\n",
__func__);
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index 39a2baa6ad5..d7216c9abda 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -26,80 +26,57 @@
#include <linux/sysctl.h>
#include <linux/tick.h>
-#include <asm/system.h>
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/time.h>
#include <asm/machdep.h>
+#include <asm/runlatch.h>
#include <asm/smp.h>
-#ifdef CONFIG_HOTPLUG_CPU
-#define cpu_should_die() cpu_is_offline(smp_processor_id())
-#else
-#define cpu_should_die() 0
-#endif
+
+unsigned long cpuidle_disable = IDLE_NO_OVERRIDE;
+EXPORT_SYMBOL(cpuidle_disable);
static int __init powersave_off(char *arg)
{
ppc_md.power_save = NULL;
+ cpuidle_disable = IDLE_POWERSAVE_OFF;
return 0;
}
__setup("powersave=off", powersave_off);
-/*
- * The body of the idle task.
- */
-void cpu_idle(void)
+#ifdef CONFIG_HOTPLUG_CPU
+void arch_cpu_idle_dead(void)
{
- if (ppc_md.idle_loop)
- ppc_md.idle_loop(); /* doesn't return */
-
- set_thread_flag(TIF_POLLING_NRFLAG);
- while (1) {
- tick_nohz_stop_sched_tick(1);
- while (!need_resched() && !cpu_should_die()) {
- ppc64_runlatch_off();
-
- if (ppc_md.power_save) {
- clear_thread_flag(TIF_POLLING_NRFLAG);
- /*
- * smp_mb is so clearing of TIF_POLLING_NRFLAG
- * is ordered w.r.t. need_resched() test.
- */
- smp_mb();
- local_irq_disable();
-
- /* Don't trace irqs off for idle */
- stop_critical_timings();
-
- /* check again after disabling irqs */
- if (!need_resched() && !cpu_should_die())
- ppc_md.power_save();
-
- start_critical_timings();
-
- local_irq_enable();
- set_thread_flag(TIF_POLLING_NRFLAG);
-
- } else {
- /*
- * Go into low thread priority and possibly
- * low power mode.
- */
- HMT_low();
- HMT_very_low();
- }
- }
+ sched_preempt_enable_no_resched();
+ cpu_die();
+}
+#endif
- HMT_medium();
- ppc64_runlatch_on();
- tick_nohz_restart_sched_tick();
- preempt_enable_no_resched();
- if (cpu_should_die())
- cpu_die();
- schedule();
- preempt_disable();
+void arch_cpu_idle(void)
+{
+ ppc64_runlatch_off();
+
+ if (ppc_md.power_save) {
+ ppc_md.power_save();
+ /*
+ * Some power_save functions return with
+ * interrupts enabled, some don't.
+ */
+ if (irqs_disabled())
+ local_irq_enable();
+ } else {
+ local_irq_enable();
+ /*
+ * Go into low thread priority and possibly
+ * low power mode.
+ */
+ HMT_low();
+ HMT_very_low();
}
+
+ HMT_medium();
+ ppc64_runlatch_on();
}
int powersave_nap;
@@ -108,7 +85,7 @@ int powersave_nap;
/*
* Register the sysctl to set/clear powersave_nap.
*/
-static ctl_table powersave_nap_ctl_table[]={
+static struct ctl_table powersave_nap_ctl_table[] = {
{
.procname = "powersave-nap",
.data = &powersave_nap,
@@ -118,7 +95,7 @@ static ctl_table powersave_nap_ctl_table[]={
},
{}
};
-static ctl_table powersave_nap_sysctl_root[] = {
+static struct ctl_table powersave_nap_sysctl_root[] = {
{
.procname = "kernel",
.mode = 0555,
diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S
index 15c611de1ee..1686916cc7f 100644
--- a/arch/powerpc/kernel/idle_6xx.S
+++ b/arch/powerpc/kernel/idle_6xx.S
@@ -135,7 +135,7 @@ BEGIN_FTR_SECTION
DSSALL
sync
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
- rlwinm r9,r1,0,0,31-THREAD_SHIFT /* current thread_info */
+ CURRENT_THREAD_INFO(r9, r1)
lwz r8,TI_LOCAL_FLAGS(r9) /* set napping bit */
ori r8,r8,_TLF_NAPPING /* so when we take an exception */
stw r8,TI_LOCAL_FLAGS(r9) /* it will return to our caller */
@@ -158,7 +158,7 @@ _GLOBAL(power_save_ppc32_restore)
stw r9,_NIP(r11) /* make it do a blr */
#ifdef CONFIG_SMP
- rlwinm r12,r11,0,0,31-THREAD_SHIFT
+ CURRENT_THREAD_INFO(r12, r11)
lwz r11,TI_CPU(r12) /* get cpu number * 4 */
slwi r11,r11,2
#else
diff --git a/arch/powerpc/kernel/idle_book3e.S b/arch/powerpc/kernel/idle_book3e.S
index 16c002d6bdf..48c21acef91 100644
--- a/arch/powerpc/kernel/idle_book3e.S
+++ b/arch/powerpc/kernel/idle_book3e.S
@@ -16,11 +16,13 @@
#include <asm/ppc-opcode.h>
#include <asm/processor.h>
#include <asm/thread_info.h>
+#include <asm/epapr_hcalls.h>
/* 64-bit version only for now */
#ifdef CONFIG_PPC64
-_GLOBAL(book3e_idle)
+.macro BOOK3E_IDLE name loop
+_GLOBAL(\name)
/* Save LR for later */
mflr r0
std r0,16(r1)
@@ -29,43 +31,30 @@ _GLOBAL(book3e_idle)
wrteei 0
/* Now check if an interrupt came in while we were soft disabled
- * since we may otherwise lose it (doorbells etc...). We know
- * that since PACAHARDIRQEN will have been cleared in that case.
+ * since we may otherwise lose it (doorbells etc...).
*/
- lbz r3,PACAHARDIRQEN(r13)
+ lbz r3,PACAIRQHAPPENED(r13)
cmpwi cr0,r3,0
- beqlr
+ bnelr
- /* Now we are going to mark ourselves as soft and hard enables in
+ /* Now we are going to mark ourselves as soft and hard enabled in
* order to be able to take interrupts while asleep. We inform lockdep
* of that. We don't actually turn interrupts on just yet tho.
*/
#ifdef CONFIG_TRACE_IRQFLAGS
stdu r1,-128(r1)
- bl .trace_hardirqs_on
+ bl trace_hardirqs_on
+ addi r1,r1,128
#endif
li r0,1
stb r0,PACASOFTIRQEN(r13)
- stb r0,PACAHARDIRQEN(r13)
/* Interrupts will make use return to LR, so get something we want
* in there
*/
bl 1f
- /* Hard disable interrupts again */
- wrteei 0
-
- /* Mark them off again in the PACA as well */
- li r0,0
- stb r0,PACASOFTIRQEN(r13)
- stb r0,PACAHARDIRQEN(r13)
-
- /* Tell lockdep about it */
-#ifdef CONFIG_TRACE_IRQFLAGS
- bl .trace_hardirqs_off
- addi r1,r1,128
-#endif
+ /* And return (interrupts are on) */
ld r0,16(r1)
mtlr r0
blr
@@ -73,14 +62,40 @@ _GLOBAL(book3e_idle)
1: /* Let's set the _TLF_NAPPING flag so interrupts make us return
* to the right spot
*/
- clrrdi r11,r1,THREAD_SHIFT
+ CURRENT_THREAD_INFO(r11, r1)
ld r10,TI_LOCAL_FLAGS(r11)
ori r10,r10,_TLF_NAPPING
std r10,TI_LOCAL_FLAGS(r11)
/* We can now re-enable hard interrupts and go to sleep */
wrteei 1
-1: PPC_WAIT(0)
+ \loop
+
+.endm
+
+.macro BOOK3E_IDLE_LOOP
+1:
+ PPC_WAIT(0)
b 1b
+.endm
+
+/* epapr_ev_idle_start below is patched with the proper hcall
+ opcodes during kernel initialization */
+.macro EPAPR_EV_IDLE_LOOP
+idle_loop:
+ LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE))
+
+.global epapr_ev_idle_start
+epapr_ev_idle_start:
+ li r3, -1
+ nop
+ nop
+ nop
+ b idle_loop
+.endm
+
+BOOK3E_IDLE epapr_ev_idle EPAPR_EV_IDLE_LOOP
+
+BOOK3E_IDLE book3e_idle BOOK3E_IDLE_LOOP
#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S
index 47a1a983ff8..15448668988 100644
--- a/arch/powerpc/kernel/idle_e500.S
+++ b/arch/powerpc/kernel/idle_e500.S
@@ -21,11 +21,22 @@
.text
_GLOBAL(e500_idle)
- rlwinm r3,r1,0,0,31-THREAD_SHIFT /* current thread_info */
+ CURRENT_THREAD_INFO(r3, r1)
lwz r4,TI_LOCAL_FLAGS(r3) /* set napping bit */
ori r4,r4,_TLF_NAPPING /* so when we take an exception */
stw r4,TI_LOCAL_FLAGS(r3) /* it will return to our caller */
+#ifdef CONFIG_PPC_E500MC
+ wrteei 1
+1: wait
+
+ /*
+ * Guard against spurious wakeups (e.g. from a hypervisor) --
+ * any real interrupt will cause us to return to LR due to
+ * _TLF_NAPPING.
+ */
+ b 1b
+#else
/* Check if we can nap or doze, put HID0 mask in r3 */
lis r3,0
BEGIN_FTR_SECTION
@@ -72,6 +83,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_L2CSR|CPU_FTR_CAN_NAP)
mtmsr r7
isync
2: b 2b
+#endif /* !E500MC */
/*
* Return from NAP/DOZE mode, restore some CPU specific registers,
@@ -84,7 +96,7 @@ _GLOBAL(power_save_ppc32_restore)
stw r9,_NIP(r11) /* make it do a blr */
#ifdef CONFIG_SMP
- rlwinm r12,r1,0,0,31-THREAD_SHIFT
+ CURRENT_THREAD_INFO(r12, r1)
lwz r11,TI_CPU(r12) /* get cpu number * 4 */
slwi r11,r11,2
#else
diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S
index 5328709eeed..f57a19348bd 100644
--- a/arch/powerpc/kernel/idle_power4.S
+++ b/arch/powerpc/kernel/idle_power4.S
@@ -14,6 +14,7 @@
#include <asm/thread_info.h>
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
+#include <asm/irqflags.h>
#undef DEBUG
@@ -29,19 +30,36 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
cmpwi 0,r4,0
beqlr
- /* Go to NAP now */
+ /* Hard disable interrupts */
mfmsr r7
rldicl r0,r7,48,1
rotldi r0,r0,16
- mtmsrd r0,1 /* hard-disable interrupts */
+ mtmsrd r0,1
+
+ /* Check if something happened while soft-disabled */
+ lbz r0,PACAIRQHAPPENED(r13)
+ cmpwi cr0,r0,0
+ bnelr
+
+ /* Soft-enable interrupts */
+#ifdef CONFIG_TRACE_IRQFLAGS
+ mflr r0
+ std r0,16(r1)
+ stdu r1,-128(r1)
+ bl trace_hardirqs_on
+ addi r1,r1,128
+ ld r0,16(r1)
+ mtlr r0
+ mfmsr r7
+#endif /* CONFIG_TRACE_IRQFLAGS */
+
li r0,1
stb r0,PACASOFTIRQEN(r13) /* we'll hard-enable shortly */
- stb r0,PACAHARDIRQEN(r13)
BEGIN_FTR_SECTION
DSSALL
sync
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
- clrrdi r9,r1,THREAD_SHIFT /* current thread_info */
+ CURRENT_THREAD_INFO(r9, r1)
ld r8,TI_LOCAL_FLAGS(r9) /* set napping bit */
ori r8,r8,_TLF_NAPPING /* so when we take an exception */
std r8,TI_LOCAL_FLAGS(r9) /* it will return to our caller */
@@ -53,24 +71,3 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
isync
b 1b
-_GLOBAL(power4_cpu_offline_powersave)
- /* Go to NAP now */
- mfmsr r7
- rldicl r0,r7,48,1
- rotldi r0,r0,16
- mtmsrd r0,1 /* hard-disable interrupts */
- li r0,1
- li r6,0
- stb r0,PACAHARDIRQEN(r13) /* we'll hard-enable shortly */
- stb r6,PACASOFTIRQEN(r13) /* soft-disable irqs */
-BEGIN_FTR_SECTION
- DSSALL
- sync
-END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
- ori r7,r7,MSR_EE
- oris r7,r7,MSR_POW@h
- sync
- isync
- mtmsrd r7
- isync
- blr
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
new file mode 100644
index 00000000000..5cf3d367190
--- /dev/null
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -0,0 +1,187 @@
+/*
+ * This file contains the power_save function for Power7 CPUs.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/threads.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/cputable.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ppc-opcode.h>
+#include <asm/hw_irq.h>
+#include <asm/kvm_book3s_asm.h>
+#include <asm/opal.h>
+
+#undef DEBUG
+
+/* Idle state entry routines */
+
+#define IDLE_STATE_ENTER_SEQ(IDLE_INST) \
+ /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \
+ std r0,0(r1); \
+ ptesync; \
+ ld r0,0(r1); \
+1: cmp cr0,r0,r0; \
+ bne 1b; \
+ IDLE_INST; \
+ b .
+
+ .text
+
+/*
+ * Pass requested state in r3:
+ * 0 - nap
+ * 1 - sleep
+ *
+ * To check IRQ_HAPPENED in r4
+ * 0 - don't check
+ * 1 - check
+ */
+_GLOBAL(power7_powersave_common)
+ /* Use r3 to pass state nap/sleep/winkle */
+ /* NAP is a state loss, we create a regs frame on the
+ * stack, fill it up with the state we care about and
+ * stick a pointer to it in PACAR1. We really only
+ * need to save PC, some CR bits and the NV GPRs,
+ * but for now an interrupt frame will do.
+ */
+ mflr r0
+ std r0,16(r1)
+ stdu r1,-INT_FRAME_SIZE(r1)
+ std r0,_LINK(r1)
+ std r0,_NIP(r1)
+
+#ifndef CONFIG_SMP
+ /* Make sure FPU, VSX etc... are flushed as we may lose
+ * state when going to nap mode
+ */
+ bl discard_lazy_cpu_state
+#endif /* CONFIG_SMP */
+
+ /* Hard disable interrupts */
+ mfmsr r9
+ rldicl r9,r9,48,1
+ rotldi r9,r9,16
+ mtmsrd r9,1 /* hard-disable interrupts */
+
+ /* Check if something happened while soft-disabled */
+ lbz r0,PACAIRQHAPPENED(r13)
+ cmpwi cr0,r0,0
+ beq 1f
+ cmpwi cr0,r4,0
+ beq 1f
+ addi r1,r1,INT_FRAME_SIZE
+ ld r0,16(r1)
+ mtlr r0
+ blr
+
+1: /* We mark irqs hard disabled as this is the state we'll
+ * be in when returning and we need to tell arch_local_irq_restore()
+ * about it
+ */
+ li r0,PACA_IRQ_HARD_DIS
+ stb r0,PACAIRQHAPPENED(r13)
+
+ /* We haven't lost state ... yet */
+ li r0,0
+ stb r0,PACA_NAPSTATELOST(r13)
+
+ /* Continue saving state */
+ SAVE_GPR(2, r1)
+ SAVE_NVGPRS(r1)
+ mfcr r4
+ std r4,_CCR(r1)
+ std r9,_MSR(r1)
+ std r1,PACAR1(r13)
+
+_GLOBAL(power7_enter_nap_mode)
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ /* Tell KVM we're napping */
+ li r4,KVM_HWTHREAD_IN_NAP
+ stb r4,HSTATE_HWTHREAD_STATE(r13)
+#endif
+ cmpwi cr0,r3,1
+ beq 2f
+ IDLE_STATE_ENTER_SEQ(PPC_NAP)
+ /* No return */
+2: IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
+ /* No return */
+
+_GLOBAL(power7_idle)
+ /* Now check if user or arch enabled NAP mode */
+ LOAD_REG_ADDRBASE(r3,powersave_nap)
+ lwz r4,ADDROFF(powersave_nap)(r3)
+ cmpwi 0,r4,0
+ beqlr
+ li r3, 1
+ /* fall through */
+
+_GLOBAL(power7_nap)
+ mr r4,r3
+ li r3,0
+ b power7_powersave_common
+ /* No return */
+
+_GLOBAL(power7_sleep)
+ li r3,1
+ li r4,1
+ b power7_powersave_common
+ /* No return */
+
+_GLOBAL(power7_wakeup_tb_loss)
+ ld r2,PACATOC(r13);
+ ld r1,PACAR1(r13)
+
+ /* Time base re-sync */
+ li r0,OPAL_RESYNC_TIMEBASE
+ LOAD_REG_ADDR(r11,opal);
+ ld r12,8(r11);
+ ld r2,0(r11);
+ mtctr r12
+ bctrl
+
+ /* TODO: Check r3 for failure */
+
+ REST_NVGPRS(r1)
+ REST_GPR(2, r1)
+ ld r3,_CCR(r1)
+ ld r4,_MSR(r1)
+ ld r5,_NIP(r1)
+ addi r1,r1,INT_FRAME_SIZE
+ mtcr r3
+ mfspr r3,SPRN_SRR1 /* Return SRR1 */
+ mtspr SPRN_SRR1,r4
+ mtspr SPRN_SRR0,r5
+ rfid
+
+_GLOBAL(power7_wakeup_loss)
+ ld r1,PACAR1(r13)
+ REST_NVGPRS(r1)
+ REST_GPR(2, r1)
+ ld r3,_CCR(r1)
+ ld r4,_MSR(r1)
+ ld r5,_NIP(r1)
+ addi r1,r1,INT_FRAME_SIZE
+ mtcr r3
+ mtspr SPRN_SRR1,r4
+ mtspr SPRN_SRR0,r5
+ rfid
+
+_GLOBAL(power7_wakeup_noloss)
+ lbz r0,PACA_NAPSTATELOST(r13)
+ cmpwi r0,0
+ bne power7_wakeup_loss
+ ld r1,PACAR1(r13)
+ ld r4,_MSR(r1)
+ ld r5,_NIP(r1)
+ addi r1,r1,INT_FRAME_SIZE
+ mtspr SPRN_SRR1,r4
+ mtspr SPRN_SRR0,r5
+ rfid
diff --git a/arch/powerpc/kernel/init_task.c b/arch/powerpc/kernel/init_task.c
deleted file mode 100644
index 2375b7eb1c7..00000000000
--- a/arch/powerpc/kernel/init_task.c
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-#include <linux/mqueue.h>
-#include <asm/uaccess.h>
-
-static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
-static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-/*
- * Initial thread structure.
- *
- * We need to make sure that this is 16384-byte aligned due to the
- * way process stacks are handled. This is done by having a special
- * "init_task" linker map entry..
- */
-union thread_union init_thread_union __init_task_data =
- { INIT_THREAD_INFO(init_task) };
-
-/*
- * Initial task structure.
- *
- * All other task structs will be allocated on slabs in fork.c
- */
-struct task_struct init_task = INIT_TASK(init_task);
-
-EXPORT_SYMBOL(init_task);
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
new file mode 100644
index 00000000000..24b968f8e4d
--- /dev/null
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -0,0 +1,212 @@
+/*
+ * Support PCI IO workaround
+ *
+ * Copyright (C) 2006 Benjamin Herrenschmidt <benh@kernel.crashing.org>
+ * IBM, Corp.
+ * (C) Copyright 2007-2008 TOSHIBA CORPORATION
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/sched.h> /* for init_mm */
+
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/pgtable.h>
+#include <asm/ppc-pci.h>
+#include <asm/io-workarounds.h>
+
+#define IOWA_MAX_BUS 8
+
+static struct iowa_bus iowa_busses[IOWA_MAX_BUS];
+static unsigned int iowa_bus_count;
+
+static struct iowa_bus *iowa_pci_find(unsigned long vaddr, unsigned long paddr)
+{
+ int i, j;
+ struct resource *res;
+ unsigned long vstart, vend;
+
+ for (i = 0; i < iowa_bus_count; i++) {
+ struct iowa_bus *bus = &iowa_busses[i];
+ struct pci_controller *phb = bus->phb;
+
+ if (vaddr) {
+ vstart = (unsigned long)phb->io_base_virt;
+ vend = vstart + phb->pci_io_size - 1;
+ if ((vaddr >= vstart) && (vaddr <= vend))
+ return bus;
+ }
+
+ if (paddr)
+ for (j = 0; j < 3; j++) {
+ res = &phb->mem_resources[j];
+ if (paddr >= res->start && paddr <= res->end)
+ return bus;
+ }
+ }
+
+ return NULL;
+}
+
+#ifdef CONFIG_PPC_INDIRECT_MMIO
+struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
+{
+ unsigned hugepage_shift;
+ struct iowa_bus *bus;
+ int token;
+
+ token = PCI_GET_ADDR_TOKEN(addr);
+
+ if (token && token <= iowa_bus_count)
+ bus = &iowa_busses[token - 1];
+ else {
+ unsigned long vaddr, paddr;
+ pte_t *ptep;
+
+ vaddr = (unsigned long)PCI_FIX_ADDR(addr);
+ if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
+ return NULL;
+
+ ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
+ &hugepage_shift);
+ if (ptep == NULL)
+ paddr = 0;
+ else {
+ /*
+ * we don't have hugepages backing iomem
+ */
+ WARN_ON(hugepage_shift);
+ paddr = pte_pfn(*ptep) << PAGE_SHIFT;
+ }
+ bus = iowa_pci_find(vaddr, paddr);
+
+ if (bus == NULL)
+ return NULL;
+ }
+
+ return bus;
+}
+#else /* CONFIG_PPC_INDIRECT_MMIO */
+struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
+{
+ return NULL;
+}
+#endif /* !CONFIG_PPC_INDIRECT_MMIO */
+
+#ifdef CONFIG_PPC_INDIRECT_PIO
+struct iowa_bus *iowa_pio_find_bus(unsigned long port)
+{
+ unsigned long vaddr = (unsigned long)pci_io_base + port;
+ return iowa_pci_find(vaddr, 0);
+}
+#else
+struct iowa_bus *iowa_pio_find_bus(unsigned long port)
+{
+ return NULL;
+}
+#endif
+
+#define DEF_PCI_AC_RET(name, ret, at, al, space, aa) \
+static ret iowa_##name at \
+{ \
+ struct iowa_bus *bus; \
+ bus = iowa_##space##_find_bus(aa); \
+ if (bus && bus->ops && bus->ops->name) \
+ return bus->ops->name al; \
+ return __do_##name al; \
+}
+
+#define DEF_PCI_AC_NORET(name, at, al, space, aa) \
+static void iowa_##name at \
+{ \
+ struct iowa_bus *bus; \
+ bus = iowa_##space##_find_bus(aa); \
+ if (bus && bus->ops && bus->ops->name) { \
+ bus->ops->name al; \
+ return; \
+ } \
+ __do_##name al; \
+}
+
+#include <asm/io-defs.h>
+
+#undef DEF_PCI_AC_RET
+#undef DEF_PCI_AC_NORET
+
+static const struct ppc_pci_io iowa_pci_io = {
+
+#define DEF_PCI_AC_RET(name, ret, at, al, space, aa) .name = iowa_##name,
+#define DEF_PCI_AC_NORET(name, at, al, space, aa) .name = iowa_##name,
+
+#include <asm/io-defs.h>
+
+#undef DEF_PCI_AC_RET
+#undef DEF_PCI_AC_NORET
+
+};
+
+#ifdef CONFIG_PPC_INDIRECT_MMIO
+static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size,
+ unsigned long flags, void *caller)
+{
+ struct iowa_bus *bus;
+ void __iomem *res = __ioremap_caller(addr, size, flags, caller);
+ int busno;
+
+ bus = iowa_pci_find(0, (unsigned long)addr);
+ if (bus != NULL) {
+ busno = bus - iowa_busses;
+ PCI_SET_ADDR_TOKEN(res, busno + 1);
+ }
+ return res;
+}
+#else /* CONFIG_PPC_INDIRECT_MMIO */
+#define iowa_ioremap NULL
+#endif /* !CONFIG_PPC_INDIRECT_MMIO */
+
+/* Enable IO workaround */
+static void io_workaround_init(void)
+{
+ static int io_workaround_inited;
+
+ if (io_workaround_inited)
+ return;
+ ppc_pci_io = iowa_pci_io;
+ ppc_md.ioremap = iowa_ioremap;
+ io_workaround_inited = 1;
+}
+
+/* Register new bus to support workaround */
+void iowa_register_bus(struct pci_controller *phb, struct ppc_pci_io *ops,
+ int (*initfunc)(struct iowa_bus *, void *), void *data)
+{
+ struct iowa_bus *bus;
+ struct device_node *np = phb->dn;
+
+ io_workaround_init();
+
+ if (iowa_bus_count >= IOWA_MAX_BUS) {
+ pr_err("IOWA:Too many pci bridges, "
+ "workarounds disabled for %s\n", np->full_name);
+ return;
+ }
+
+ bus = &iowa_busses[iowa_bus_count];
+ bus->phb = phb;
+ bus->ops = ops;
+ bus->private = data;
+
+ if (initfunc)
+ if ((*initfunc)(bus, data))
+ return;
+
+ iowa_bus_count++;
+
+ pr_debug("IOWA:[%d]Add bus, %s.\n", iowa_bus_count-1, np->full_name);
+}
+
diff --git a/arch/powerpc/kernel/io.c b/arch/powerpc/kernel/io.c
index 8dc7547c237..2a2b4aeab80 100644
--- a/arch/powerpc/kernel/io.c
+++ b/arch/powerpc/kernel/io.c
@@ -19,12 +19,15 @@
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/compiler.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <asm/io.h>
#include <asm/firmware.h>
#include <asm/bug.h>
+/* See definition in io.h */
+bool isa_io_special;
+
void _insb(const volatile u8 __iomem *port, void *buf, long count)
{
u8 *tbuf = buf;
diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c
index 1577434f408..12e48d56f77 100644
--- a/arch/powerpc/kernel/iomap.c
+++ b/arch/powerpc/kernel/iomap.c
@@ -3,9 +3,9 @@
*
* (C) Copyright 2004 Linus Torvalds
*/
-#include <linux/init.h>
#include <linux/pci.h>
#include <linux/mm.h>
+#include <linux/export.h>
#include <asm/io.h>
#include <asm/pci-bridge.h>
@@ -23,7 +23,7 @@ unsigned int ioread16(void __iomem *addr)
}
unsigned int ioread16be(void __iomem *addr)
{
- return in_be16(addr);
+ return readw_be(addr);
}
unsigned int ioread32(void __iomem *addr)
{
@@ -31,7 +31,7 @@ unsigned int ioread32(void __iomem *addr)
}
unsigned int ioread32be(void __iomem *addr)
{
- return in_be32(addr);
+ return readl_be(addr);
}
EXPORT_SYMBOL(ioread8);
EXPORT_SYMBOL(ioread16);
@@ -49,7 +49,7 @@ void iowrite16(u16 val, void __iomem *addr)
}
void iowrite16be(u16 val, void __iomem *addr)
{
- out_be16(addr, val);
+ writew_be(val, addr);
}
void iowrite32(u32 val, void __iomem *addr)
{
@@ -57,7 +57,7 @@ void iowrite32(u32 val, void __iomem *addr)
}
void iowrite32be(u32 val, void __iomem *addr)
{
- out_be32(addr, val);
+ writel_be(val, addr);
}
EXPORT_SYMBOL(iowrite8);
EXPORT_SYMBOL(iowrite16);
@@ -75,15 +75,15 @@ EXPORT_SYMBOL(iowrite32be);
*/
void ioread8_rep(void __iomem *addr, void *dst, unsigned long count)
{
- _insb((u8 __iomem *) addr, dst, count);
+ readsb(addr, dst, count);
}
void ioread16_rep(void __iomem *addr, void *dst, unsigned long count)
{
- _insw_ns((u16 __iomem *) addr, dst, count);
+ readsw(addr, dst, count);
}
void ioread32_rep(void __iomem *addr, void *dst, unsigned long count)
{
- _insl_ns((u32 __iomem *) addr, dst, count);
+ readsl(addr, dst, count);
}
EXPORT_SYMBOL(ioread8_rep);
EXPORT_SYMBOL(ioread16_rep);
@@ -91,15 +91,15 @@ EXPORT_SYMBOL(ioread32_rep);
void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count)
{
- _outsb((u8 __iomem *) addr, src, count);
+ writesb(addr, src, count);
}
void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count)
{
- _outsw_ns((u16 __iomem *) addr, src, count);
+ writesw(addr, src, count);
}
void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count)
{
- _outsl_ns((u32 __iomem *) addr, src, count);
+ writesl(addr, src, count);
}
EXPORT_SYMBOL(iowrite8_rep);
EXPORT_SYMBOL(iowrite16_rep);
@@ -117,24 +117,7 @@ void ioport_unmap(void __iomem *addr)
EXPORT_SYMBOL(ioport_map);
EXPORT_SYMBOL(ioport_unmap);
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max)
-{
- resource_size_t start = pci_resource_start(dev, bar);
- resource_size_t len = pci_resource_len(dev, bar);
- unsigned long flags = pci_resource_flags(dev, bar);
-
- if (!len)
- return NULL;
- if (max && len > max)
- len = max;
- if (flags & IORESOURCE_IO)
- return ioport_map(start, len);
- if (flags & IORESOURCE_MEM)
- return ioremap(start, len);
- /* What? */
- return NULL;
-}
-
+#ifdef CONFIG_PCI
void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
{
if (isa_vaddr_is_ioport(addr))
@@ -144,5 +127,5 @@ void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
iounmap(addr);
}
-EXPORT_SYMBOL(pci_iomap);
EXPORT_SYMBOL(pci_iounmap);
+#endif /* CONFIG_PCI */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index d5839179ec7..88e3ec6e1d9 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -33,12 +33,20 @@
#include <linux/bitmap.h>
#include <linux/iommu-helper.h>
#include <linux/crash_dump.h>
+#include <linux/hash.h>
+#include <linux/fault-inject.h>
+#include <linux/pci.h>
+#include <linux/iommu.h>
+#include <linux/sched.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/iommu.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/kdump.h>
+#include <asm/fadump.h>
+#include <asm/vio.h>
+#include <asm/tce.h>
#define DBG(...)
@@ -57,6 +65,114 @@ static int __init setup_iommu(char *str)
__setup("iommu=", setup_iommu);
+static DEFINE_PER_CPU(unsigned int, iommu_pool_hash);
+
+/*
+ * We precalculate the hash to avoid doing it on every allocation.
+ *
+ * The hash is important to spread CPUs across all the pools. For example,
+ * on a POWER7 with 4 way SMT we want interrupts on the primary threads and
+ * with 4 pools all primary threads would map to the same pool.
+ */
+static int __init setup_iommu_pool_hash(void)
+{
+ unsigned int i;
+
+ for_each_possible_cpu(i)
+ per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS);
+
+ return 0;
+}
+subsys_initcall(setup_iommu_pool_hash);
+
+#ifdef CONFIG_FAIL_IOMMU
+
+static DECLARE_FAULT_ATTR(fail_iommu);
+
+static int __init setup_fail_iommu(char *str)
+{
+ return setup_fault_attr(&fail_iommu, str);
+}
+__setup("fail_iommu=", setup_fail_iommu);
+
+static bool should_fail_iommu(struct device *dev)
+{
+ return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1);
+}
+
+static int __init fail_iommu_debugfs(void)
+{
+ struct dentry *dir = fault_create_debugfs_attr("fail_iommu",
+ NULL, &fail_iommu);
+
+ return PTR_ERR_OR_ZERO(dir);
+}
+late_initcall(fail_iommu_debugfs);
+
+static ssize_t fail_iommu_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", dev->archdata.fail_iommu);
+}
+
+static ssize_t fail_iommu_store(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ int i;
+
+ if (count > 0 && sscanf(buf, "%d", &i) > 0)
+ dev->archdata.fail_iommu = (i == 0) ? 0 : 1;
+
+ return count;
+}
+
+static DEVICE_ATTR(fail_iommu, S_IRUGO|S_IWUSR, fail_iommu_show,
+ fail_iommu_store);
+
+static int fail_iommu_bus_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+
+ if (action == BUS_NOTIFY_ADD_DEVICE) {
+ if (device_create_file(dev, &dev_attr_fail_iommu))
+ pr_warn("Unable to create IOMMU fault injection sysfs "
+ "entries\n");
+ } else if (action == BUS_NOTIFY_DEL_DEVICE) {
+ device_remove_file(dev, &dev_attr_fail_iommu);
+ }
+
+ return 0;
+}
+
+static struct notifier_block fail_iommu_bus_notifier = {
+ .notifier_call = fail_iommu_bus_notify
+};
+
+static int __init fail_iommu_setup(void)
+{
+#ifdef CONFIG_PCI
+ bus_register_notifier(&pci_bus_type, &fail_iommu_bus_notifier);
+#endif
+#ifdef CONFIG_IBMVIO
+ bus_register_notifier(&vio_bus_type, &fail_iommu_bus_notifier);
+#endif
+
+ return 0;
+}
+/*
+ * Must execute after PCI and VIO subsystem have initialised but before
+ * devices are probed.
+ */
+arch_initcall(fail_iommu_setup);
+#else
+static inline bool should_fail_iommu(struct device *dev)
+{
+ return false;
+}
+#endif
+
static unsigned long iommu_range_alloc(struct device *dev,
struct iommu_table *tbl,
unsigned long npages,
@@ -70,6 +186,9 @@ static unsigned long iommu_range_alloc(struct device *dev,
int pass = 0;
unsigned long align_mask;
unsigned long boundary_size;
+ unsigned long flags;
+ unsigned int pool_nr;
+ struct iommu_pool *pool;
align_mask = 0xffffffffffffffffl >> (64 - align_order);
@@ -82,59 +201,83 @@ static unsigned long iommu_range_alloc(struct device *dev,
return DMA_ERROR_CODE;
}
- if (handle && *handle)
- start = *handle;
+ if (should_fail_iommu(dev))
+ return DMA_ERROR_CODE;
+
+ /*
+ * We don't need to disable preemption here because any CPU can
+ * safely use any IOMMU pool.
+ */
+ pool_nr = __raw_get_cpu_var(iommu_pool_hash) & (tbl->nr_pools - 1);
+
+ if (largealloc)
+ pool = &(tbl->large_pool);
else
- start = largealloc ? tbl->it_largehint : tbl->it_hint;
+ pool = &(tbl->pools[pool_nr]);
- /* Use only half of the table for small allocs (15 pages or less) */
- limit = largealloc ? tbl->it_size : tbl->it_halfpoint;
+ spin_lock_irqsave(&(pool->lock), flags);
- if (largealloc && start < tbl->it_halfpoint)
- start = tbl->it_halfpoint;
+again:
+ if ((pass == 0) && handle && *handle &&
+ (*handle >= pool->start) && (*handle < pool->end))
+ start = *handle;
+ else
+ start = pool->hint;
+
+ limit = pool->end;
/* The case below can happen if we have a small segment appended
* to a large, or when the previous alloc was at the very end of
* the available space. If so, go back to the initial start.
*/
if (start >= limit)
- start = largealloc ? tbl->it_largehint : tbl->it_hint;
-
- again:
+ start = pool->start;
if (limit + tbl->it_offset > mask) {
limit = mask - tbl->it_offset + 1;
/* If we're constrained on address range, first try
* at the masked hint to avoid O(n) search complexity,
- * but on second pass, start at 0.
+ * but on second pass, start at 0 in pool 0.
*/
- if ((start & mask) >= limit || pass > 0)
- start = 0;
- else
+ if ((start & mask) >= limit || pass > 0) {
+ spin_unlock(&(pool->lock));
+ pool = &(tbl->pools[0]);
+ spin_lock(&(pool->lock));
+ start = pool->start;
+ } else {
start &= mask;
+ }
}
if (dev)
boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- 1 << IOMMU_PAGE_SHIFT);
+ 1 << tbl->it_page_shift);
else
- boundary_size = ALIGN(1UL << 32, 1 << IOMMU_PAGE_SHIFT);
+ boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift);
/* 4GB boundary for iseries_hv_alloc and iseries_hv_map */
- n = iommu_area_alloc(tbl->it_map, limit, start, npages,
- tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT,
- align_mask);
+ n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
+ boundary_size >> tbl->it_page_shift, align_mask);
if (n == -1) {
- if (likely(pass < 2)) {
- /* First failure, just rescan the half of the table.
- * Second failure, rescan the other half of the table.
- */
- start = (largealloc ^ pass) ? tbl->it_halfpoint : 0;
- limit = pass ? tbl->it_size : limit;
+ if (likely(pass == 0)) {
+ /* First try the pool from the start */
+ pool->hint = pool->start;
+ pass++;
+ goto again;
+
+ } else if (pass <= tbl->nr_pools) {
+ /* Now try scanning all the other pools */
+ spin_unlock(&(pool->lock));
+ pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
+ pool = &tbl->pools[pool_nr];
+ spin_lock(&(pool->lock));
+ pool->hint = pool->start;
pass++;
goto again;
+
} else {
- /* Third failure, give up */
+ /* Give up */
+ spin_unlock_irqrestore(&(pool->lock), flags);
return DMA_ERROR_CODE;
}
}
@@ -144,10 +287,10 @@ static unsigned long iommu_range_alloc(struct device *dev,
/* Bump the hint to a new block for small allocs. */
if (largealloc) {
/* Don't bump to new block to avoid fragmentation */
- tbl->it_largehint = end;
+ pool->hint = end;
} else {
/* Overflow will be taken care of at the next allocation */
- tbl->it_hint = (end + tbl->it_blocksize - 1) &
+ pool->hint = (end + tbl->it_blocksize - 1) &
~(tbl->it_blocksize - 1);
}
@@ -155,6 +298,8 @@ static unsigned long iommu_range_alloc(struct device *dev,
if (handle)
*handle = end;
+ spin_unlock_irqrestore(&(pool->lock), flags);
+
return n;
}
@@ -164,26 +309,22 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
unsigned long mask, unsigned int align_order,
struct dma_attrs *attrs)
{
- unsigned long entry, flags;
+ unsigned long entry;
dma_addr_t ret = DMA_ERROR_CODE;
int build_fail;
- spin_lock_irqsave(&(tbl->it_lock), flags);
-
entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
- if (unlikely(entry == DMA_ERROR_CODE)) {
- spin_unlock_irqrestore(&(tbl->it_lock), flags);
+ if (unlikely(entry == DMA_ERROR_CODE))
return DMA_ERROR_CODE;
- }
entry += tbl->it_offset; /* Offset into real TCE table */
- ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */
+ ret = entry << tbl->it_page_shift; /* Set the return dma address */
/* Put the TCEs in the HW table */
build_fail = ppc_md.tce_build(tbl, entry, npages,
- (unsigned long)page & IOMMU_PAGE_MASK,
- direction, attrs);
+ (unsigned long)page &
+ IOMMU_PAGE_MASK(tbl), direction, attrs);
/* ppc_md.tce_build() only returns non-zero for transient errors.
* Clean up the table bitmap in this case and return
@@ -192,8 +333,6 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
*/
if (unlikely(build_fail)) {
__iommu_free(tbl, ret, npages);
-
- spin_unlock_irqrestore(&(tbl->it_lock), flags);
return DMA_ERROR_CODE;
}
@@ -201,20 +340,18 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
if (ppc_md.tce_flush)
ppc_md.tce_flush(tbl);
- spin_unlock_irqrestore(&(tbl->it_lock), flags);
-
/* Make sure updates are seen by hardware */
mb();
return ret;
}
-static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
- unsigned int npages)
+static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
+ unsigned int npages)
{
unsigned long entry, free_entry;
- entry = dma_addr >> IOMMU_PAGE_SHIFT;
+ entry = dma_addr >> tbl->it_page_shift;
free_entry = entry - tbl->it_offset;
if (((free_entry + npages) > tbl->it_size) ||
@@ -230,20 +367,57 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
printk(KERN_INFO "\tindex = 0x%llx\n", (u64)tbl->it_index);
WARN_ON(1);
}
- return;
+
+ return false;
+ }
+
+ return true;
+}
+
+static struct iommu_pool *get_pool(struct iommu_table *tbl,
+ unsigned long entry)
+{
+ struct iommu_pool *p;
+ unsigned long largepool_start = tbl->large_pool.start;
+
+ /* The large pool is the last pool at the top of the table */
+ if (entry >= largepool_start) {
+ p = &tbl->large_pool;
+ } else {
+ unsigned int pool_nr = entry / tbl->poolsize;
+
+ BUG_ON(pool_nr > tbl->nr_pools);
+ p = &tbl->pools[pool_nr];
}
+ return p;
+}
+
+static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+ unsigned int npages)
+{
+ unsigned long entry, free_entry;
+ unsigned long flags;
+ struct iommu_pool *pool;
+
+ entry = dma_addr >> tbl->it_page_shift;
+ free_entry = entry - tbl->it_offset;
+
+ pool = get_pool(tbl, free_entry);
+
+ if (!iommu_free_check(tbl, dma_addr, npages))
+ return;
+
ppc_md.tce_free(tbl, entry, npages);
+
+ spin_lock_irqsave(&(pool->lock), flags);
bitmap_clear(tbl->it_map, free_entry, npages);
+ spin_unlock_irqrestore(&(pool->lock), flags);
}
static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
unsigned int npages)
{
- unsigned long flags;
-
- spin_lock_irqsave(&(tbl->it_lock), flags);
-
__iommu_free(tbl, dma_addr, npages);
/* Make sure TLB cache is flushed if the HW needs it. We do
@@ -252,8 +426,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
*/
if (ppc_md.tce_flush)
ppc_md.tce_flush(tbl);
-
- spin_unlock_irqrestore(&(tbl->it_lock), flags);
}
int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
@@ -262,7 +434,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
struct dma_attrs *attrs)
{
dma_addr_t dma_next = 0, dma_addr;
- unsigned long flags;
struct scatterlist *s, *outs, *segstart;
int outcount, incount, i, build_fail = 0;
unsigned int align;
@@ -284,8 +455,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
DBG("sg mapping %d elements:\n", nelems);
- spin_lock_irqsave(&(tbl->it_lock), flags);
-
max_seg_size = dma_get_max_seg_size(dev);
for_each_sg(sglist, s, nelems, i) {
unsigned long vaddr, npages, entry, slen;
@@ -298,36 +467,37 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
}
/* Allocate iommu entries for that segment */
vaddr = (unsigned long) sg_virt(s);
- npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE);
+ npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl));
align = 0;
- if (IOMMU_PAGE_SHIFT < PAGE_SHIFT && slen >= PAGE_SIZE &&
+ if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE &&
(vaddr & ~PAGE_MASK) == 0)
- align = PAGE_SHIFT - IOMMU_PAGE_SHIFT;
+ align = PAGE_SHIFT - tbl->it_page_shift;
entry = iommu_range_alloc(dev, tbl, npages, &handle,
- mask >> IOMMU_PAGE_SHIFT, align);
+ mask >> tbl->it_page_shift, align);
DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen);
/* Handle failure */
if (unlikely(entry == DMA_ERROR_CODE)) {
if (printk_ratelimit())
- printk(KERN_INFO "iommu_alloc failed, tbl %p vaddr %lx"
- " npages %lx\n", tbl, vaddr, npages);
+ dev_info(dev, "iommu_alloc failed, tbl %p "
+ "vaddr %lx npages %lu\n", tbl, vaddr,
+ npages);
goto failure;
}
/* Convert entry to a dma_addr_t */
entry += tbl->it_offset;
- dma_addr = entry << IOMMU_PAGE_SHIFT;
- dma_addr |= (s->offset & ~IOMMU_PAGE_MASK);
+ dma_addr = entry << tbl->it_page_shift;
+ dma_addr |= (s->offset & ~IOMMU_PAGE_MASK(tbl));
DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n",
npages, entry, dma_addr);
/* Insert into HW table */
build_fail = ppc_md.tce_build(tbl, entry, npages,
- vaddr & IOMMU_PAGE_MASK,
- direction, attrs);
+ vaddr & IOMMU_PAGE_MASK(tbl),
+ direction, attrs);
if(unlikely(build_fail))
goto failure;
@@ -367,8 +537,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
if (ppc_md.tce_flush)
ppc_md.tce_flush(tbl);
- spin_unlock_irqrestore(&(tbl->it_lock), flags);
-
DBG("mapped %d elements:\n", outcount);
/* For the sake of iommu_unmap_sg, we clear out the length in the
@@ -390,9 +558,9 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
if (s->dma_length != 0) {
unsigned long vaddr, npages;
- vaddr = s->dma_address & IOMMU_PAGE_MASK;
+ vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl);
npages = iommu_num_pages(s->dma_address, s->dma_length,
- IOMMU_PAGE_SIZE);
+ IOMMU_PAGE_SIZE(tbl));
__iommu_free(tbl, vaddr, npages);
s->dma_address = DMA_ERROR_CODE;
s->dma_length = 0;
@@ -400,7 +568,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
if (s == outs)
break;
}
- spin_unlock_irqrestore(&(tbl->it_lock), flags);
return 0;
}
@@ -410,15 +577,12 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
struct dma_attrs *attrs)
{
struct scatterlist *sg;
- unsigned long flags;
BUG_ON(direction == DMA_NONE);
if (!tbl)
return;
- spin_lock_irqsave(&(tbl->it_lock), flags);
-
sg = sglist;
while (nelems--) {
unsigned int npages;
@@ -427,7 +591,7 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
if (sg->dma_length == 0)
break;
npages = iommu_num_pages(dma_handle, sg->dma_length,
- IOMMU_PAGE_SIZE);
+ IOMMU_PAGE_SIZE(tbl));
__iommu_free(tbl, dma_handle, npages);
sg = sg_next(sg);
}
@@ -438,13 +602,16 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
*/
if (ppc_md.tce_flush)
ppc_md.tce_flush(tbl);
-
- spin_unlock_irqrestore(&(tbl->it_lock), flags);
}
static void iommu_table_clear(struct iommu_table *tbl)
{
- if (!is_kdump_kernel()) {
+ /*
+ * In case of firmware assisted dump system goes through clean
+ * reboot process at the time of system crash. Hence it's safe to
+ * clear the TCE entries if firmware assisted dump is active.
+ */
+ if (!is_kdump_kernel() || is_fadump_active()) {
/* Clear the table in case firmware left allocations in it */
ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size);
return;
@@ -487,22 +654,48 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
unsigned long sz;
static int welcomed = 0;
struct page *page;
-
- /* Set aside 1/4 of the table for large allocations. */
- tbl->it_halfpoint = tbl->it_size * 3 / 4;
+ unsigned int i;
+ struct iommu_pool *p;
/* number of bytes needed for the bitmap */
- sz = (tbl->it_size + 7) >> 3;
+ sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
- page = alloc_pages_node(nid, GFP_ATOMIC, get_order(sz));
+ page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz));
if (!page)
panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
tbl->it_map = page_address(page);
memset(tbl->it_map, 0, sz);
- tbl->it_hint = 0;
- tbl->it_largehint = tbl->it_halfpoint;
- spin_lock_init(&tbl->it_lock);
+ /*
+ * Reserve page 0 so it will not be used for any mappings.
+ * This avoids buggy drivers that consider page 0 to be invalid
+ * to crash the machine or even lose data.
+ */
+ if (tbl->it_offset == 0)
+ set_bit(0, tbl->it_map);
+
+ /* We only split the IOMMU table if we have 1GB or more of space */
+ if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
+ tbl->nr_pools = IOMMU_NR_POOLS;
+ else
+ tbl->nr_pools = 1;
+
+ /* We reserve the top 1/4 of the table for large allocations */
+ tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools;
+
+ for (i = 0; i < tbl->nr_pools; i++) {
+ p = &tbl->pools[i];
+ spin_lock_init(&(p->lock));
+ p->start = tbl->poolsize * i;
+ p->hint = p->start;
+ p->end = p->start + tbl->poolsize;
+ }
+
+ p = &tbl->large_pool;
+ spin_lock_init(&(p->lock));
+ p->start = tbl->poolsize * i;
+ p->hint = p->start;
+ p->end = tbl->it_size;
iommu_table_clear(tbl);
@@ -517,7 +710,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
void iommu_free_table(struct iommu_table *tbl, const char *node_name)
{
- unsigned long bitmap_sz, i;
+ unsigned long bitmap_sz;
unsigned int order;
if (!tbl || !tbl->it_map) {
@@ -526,18 +719,26 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
return;
}
- /* verify that table contains no entries */
- /* it_size is in entries, and we're examining 64 at a time */
- for (i = 0; i < (tbl->it_size/64); i++) {
- if (tbl->it_map[i] != 0) {
- printk(KERN_WARNING "%s: Unexpected TCEs for %s\n",
- __func__, node_name);
- break;
- }
+ /*
+ * In case we have reserved the first bit, we should not emit
+ * the warning below.
+ */
+ if (tbl->it_offset == 0)
+ clear_bit(0, tbl->it_map);
+
+#ifdef CONFIG_IOMMU_API
+ if (tbl->it_group) {
+ iommu_group_put(tbl->it_group);
+ BUG_ON(tbl->it_group);
}
+#endif
+
+ /* verify that table contains no entries */
+ if (!bitmap_empty(tbl->it_map, tbl->it_size))
+ pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
/* calculate bitmap size in bytes */
- bitmap_sz = (tbl->it_size + 7) / 8;
+ bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
/* free bitmap */
order = get_order(bitmap_sz);
@@ -566,25 +767,25 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
vaddr = page_address(page) + offset;
uaddr = (unsigned long)vaddr;
- npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE);
+ npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl));
if (tbl) {
align = 0;
- if (IOMMU_PAGE_SHIFT < PAGE_SHIFT && size >= PAGE_SIZE &&
+ if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE &&
((unsigned long)vaddr & ~PAGE_MASK) == 0)
- align = PAGE_SHIFT - IOMMU_PAGE_SHIFT;
+ align = PAGE_SHIFT - tbl->it_page_shift;
dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
- mask >> IOMMU_PAGE_SHIFT, align,
+ mask >> tbl->it_page_shift, align,
attrs);
if (dma_handle == DMA_ERROR_CODE) {
if (printk_ratelimit()) {
- printk(KERN_INFO "iommu_alloc failed, "
- "tbl %p vaddr %p npages %d\n",
- tbl, vaddr, npages);
+ dev_info(dev, "iommu_alloc failed, tbl %p "
+ "vaddr %p npages %d\n", tbl, vaddr,
+ npages);
}
} else
- dma_handle |= (uaddr & ~IOMMU_PAGE_MASK);
+ dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl));
}
return dma_handle;
@@ -599,7 +800,8 @@ void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle,
BUG_ON(direction == DMA_NONE);
if (tbl) {
- npages = iommu_num_pages(dma_handle, size, IOMMU_PAGE_SIZE);
+ npages = iommu_num_pages(dma_handle, size,
+ IOMMU_PAGE_SIZE(tbl));
iommu_free(tbl, dma_handle, npages);
}
}
@@ -627,7 +829,8 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
* the tce tables.
*/
if (order >= IOMAP_MAX_ORDER) {
- printk("iommu_alloc_consistent size too large: 0x%lx\n", size);
+ dev_info(dev, "iommu_alloc_consistent size too large: 0x%lx\n",
+ size);
return NULL;
}
@@ -642,10 +845,10 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
memset(ret, 0, size);
/* Set up tces to cover the allocated range */
- nio_pages = size >> IOMMU_PAGE_SHIFT;
- io_order = get_iommu_order(size);
+ nio_pages = size >> tbl->it_page_shift;
+ io_order = get_iommu_order(size, tbl);
mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
- mask >> IOMMU_PAGE_SHIFT, io_order, NULL);
+ mask >> tbl->it_page_shift, io_order, NULL);
if (mapping == DMA_ERROR_CODE) {
free_pages((unsigned long)ret, order);
return NULL;
@@ -661,9 +864,311 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
unsigned int nio_pages;
size = PAGE_ALIGN(size);
- nio_pages = size >> IOMMU_PAGE_SHIFT;
+ nio_pages = size >> tbl->it_page_shift;
iommu_free(tbl, dma_handle, nio_pages);
size = PAGE_ALIGN(size);
free_pages((unsigned long)vaddr, get_order(size));
}
}
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static void group_release(void *iommu_data)
+{
+ struct iommu_table *tbl = iommu_data;
+ tbl->it_group = NULL;
+}
+
+void iommu_register_group(struct iommu_table *tbl,
+ int pci_domain_number, unsigned long pe_num)
+{
+ struct iommu_group *grp;
+ char *name;
+
+ grp = iommu_group_alloc();
+ if (IS_ERR(grp)) {
+ pr_warn("powerpc iommu api: cannot create new group, err=%ld\n",
+ PTR_ERR(grp));
+ return;
+ }
+ tbl->it_group = grp;
+ iommu_group_set_iommudata(grp, tbl, group_release);
+ name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
+ pci_domain_number, pe_num);
+ if (!name)
+ return;
+ iommu_group_set_name(grp, name);
+ kfree(name);
+}
+
+enum dma_data_direction iommu_tce_direction(unsigned long tce)
+{
+ if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE))
+ return DMA_BIDIRECTIONAL;
+ else if (tce & TCE_PCI_READ)
+ return DMA_TO_DEVICE;
+ else if (tce & TCE_PCI_WRITE)
+ return DMA_FROM_DEVICE;
+ else
+ return DMA_NONE;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_direction);
+
+void iommu_flush_tce(struct iommu_table *tbl)
+{
+ /* Flush/invalidate TLB caches if necessary */
+ if (ppc_md.tce_flush)
+ ppc_md.tce_flush(tbl);
+
+ /* Make sure updates are seen by hardware */
+ mb();
+}
+EXPORT_SYMBOL_GPL(iommu_flush_tce);
+
+int iommu_tce_clear_param_check(struct iommu_table *tbl,
+ unsigned long ioba, unsigned long tce_value,
+ unsigned long npages)
+{
+ /* ppc_md.tce_free() does not support any value but 0 */
+ if (tce_value)
+ return -EINVAL;
+
+ if (ioba & ~IOMMU_PAGE_MASK(tbl))
+ return -EINVAL;
+
+ ioba >>= tbl->it_page_shift;
+ if (ioba < tbl->it_offset)
+ return -EINVAL;
+
+ if ((ioba + npages) > (tbl->it_offset + tbl->it_size))
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check);
+
+int iommu_tce_put_param_check(struct iommu_table *tbl,
+ unsigned long ioba, unsigned long tce)
+{
+ if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+ return -EINVAL;
+
+ if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ))
+ return -EINVAL;
+
+ if (ioba & ~IOMMU_PAGE_MASK(tbl))
+ return -EINVAL;
+
+ ioba >>= tbl->it_page_shift;
+ if (ioba < tbl->it_offset)
+ return -EINVAL;
+
+ if ((ioba + 1) > (tbl->it_offset + tbl->it_size))
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
+
+unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
+{
+ unsigned long oldtce;
+ struct iommu_pool *pool = get_pool(tbl, entry);
+
+ spin_lock(&(pool->lock));
+
+ oldtce = ppc_md.tce_get(tbl, entry);
+ if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
+ ppc_md.tce_free(tbl, entry, 1);
+ else
+ oldtce = 0;
+
+ spin_unlock(&(pool->lock));
+
+ return oldtce;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tce);
+
+int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
+ unsigned long entry, unsigned long pages)
+{
+ unsigned long oldtce;
+ struct page *page;
+
+ for ( ; pages; --pages, ++entry) {
+ oldtce = iommu_clear_tce(tbl, entry);
+ if (!oldtce)
+ continue;
+
+ page = pfn_to_page(oldtce >> PAGE_SHIFT);
+ WARN_ON(!page);
+ if (page) {
+ if (oldtce & TCE_PCI_WRITE)
+ SetPageDirty(page);
+ put_page(page);
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
+
+/*
+ * hwaddr is a kernel virtual address here (0xc... bazillion),
+ * tce_build converts it to a physical address.
+ */
+int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
+ unsigned long hwaddr, enum dma_data_direction direction)
+{
+ int ret = -EBUSY;
+ unsigned long oldtce;
+ struct iommu_pool *pool = get_pool(tbl, entry);
+
+ spin_lock(&(pool->lock));
+
+ oldtce = ppc_md.tce_get(tbl, entry);
+ /* Add new entry if it is not busy */
+ if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+ ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL);
+
+ spin_unlock(&(pool->lock));
+
+ /* if (unlikely(ret))
+ pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
+ __func__, hwaddr, entry << IOMMU_PAGE_SHIFT(tbl),
+ hwaddr, ret); */
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_build);
+
+int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
+ unsigned long tce)
+{
+ int ret;
+ struct page *page = NULL;
+ unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
+ enum dma_data_direction direction = iommu_tce_direction(tce);
+
+ ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+ direction != DMA_TO_DEVICE, &page);
+ if (unlikely(ret != 1)) {
+ /* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
+ tce, entry << IOMMU_PAGE_SHIFT(tbl), ret); */
+ return -EFAULT;
+ }
+ hwaddr = (unsigned long) page_address(page) + offset;
+
+ ret = iommu_tce_build(tbl, entry, hwaddr, direction);
+ if (ret)
+ put_page(page);
+
+ if (ret < 0)
+ pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
+ __func__, entry << tbl->it_page_shift, tce, ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
+
+int iommu_take_ownership(struct iommu_table *tbl)
+{
+ unsigned long sz = (tbl->it_size + 7) >> 3;
+
+ if (tbl->it_offset == 0)
+ clear_bit(0, tbl->it_map);
+
+ if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+ pr_err("iommu_tce: it_map is not empty");
+ return -EBUSY;
+ }
+
+ memset(tbl->it_map, 0xff, sz);
+ iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+
+ /*
+ * Disable iommu bypass, otherwise the user can DMA to all of
+ * our physical memory via the bypass window instead of just
+ * the pages that has been explicitly mapped into the iommu
+ */
+ if (tbl->set_bypass)
+ tbl->set_bypass(tbl, false);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_take_ownership);
+
+void iommu_release_ownership(struct iommu_table *tbl)
+{
+ unsigned long sz = (tbl->it_size + 7) >> 3;
+
+ iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+ memset(tbl->it_map, 0, sz);
+
+ /* Restore bit#0 set by iommu_init_table() */
+ if (tbl->it_offset == 0)
+ set_bit(0, tbl->it_map);
+
+ /* The kernel owns the device now, we can restore the iommu bypass */
+ if (tbl->set_bypass)
+ tbl->set_bypass(tbl, true);
+}
+EXPORT_SYMBOL_GPL(iommu_release_ownership);
+
+int iommu_add_device(struct device *dev)
+{
+ struct iommu_table *tbl;
+ int ret = 0;
+
+ if (WARN_ON(dev->iommu_group)) {
+ pr_warn("iommu_tce: device %s is already in iommu group %d, skipping\n",
+ dev_name(dev),
+ iommu_group_id(dev->iommu_group));
+ return -EBUSY;
+ }
+
+ tbl = get_iommu_table_base(dev);
+ if (!tbl || !tbl->it_group) {
+ pr_debug("iommu_tce: skipping device %s with no tbl\n",
+ dev_name(dev));
+ return 0;
+ }
+
+ pr_debug("iommu_tce: adding %s to iommu group %d\n",
+ dev_name(dev), iommu_group_id(tbl->it_group));
+
+ if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
+ pr_err("iommu_tce: unsupported iommu page size.");
+ pr_err("%s has not been added\n", dev_name(dev));
+ return -EINVAL;
+ }
+
+ ret = iommu_group_add_device(tbl->it_group, dev);
+ if (ret < 0)
+ pr_err("iommu_tce: %s has not been added, ret=%d\n",
+ dev_name(dev), ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_add_device);
+
+void iommu_del_device(struct device *dev)
+{
+ /*
+ * Some devices might not have IOMMU table and group
+ * and we needn't detach them from the associated
+ * IOMMU groups
+ */
+ if (!dev->iommu_group) {
+ pr_debug("iommu_tce: skipping device %s with no tbl\n",
+ dev_name(dev));
+ return;
+ }
+
+ iommu_group_remove_device(dev);
+}
+EXPORT_SYMBOL_GPL(iommu_del_device);
+
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index ce557f6f00f..248ee7e5beb 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -30,7 +30,7 @@
#undef DEBUG
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/threads.h>
#include <linux/kernel_stat.h>
#include <linux/signal.h>
@@ -57,7 +57,6 @@
#include <linux/of_irq.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <asm/io.h>
#include <asm/pgtable.h>
#include <asm/irq.h>
@@ -66,8 +65,8 @@
#include <asm/ptrace.h>
#include <asm/machdep.h>
#include <asm/udbg.h>
-#include <asm/dbell.h>
#include <asm/smp.h>
+#include <asm/debug.h>
#ifdef CONFIG_PPC64
#include <asm/paca.h>
@@ -94,20 +93,16 @@ extern int tau_interrupts(int);
#ifdef CONFIG_PPC64
-#ifndef CONFIG_SPARSE_IRQ
-EXPORT_SYMBOL(irq_desc);
-#endif
-
int distribute_irqs = 1;
-static inline notrace unsigned long get_hard_enabled(void)
+static inline notrace unsigned long get_irq_happened(void)
{
- unsigned long enabled;
+ unsigned long happened;
__asm__ __volatile__("lbz %0,%1(13)"
- : "=r" (enabled) : "i" (offsetof(struct paca_struct, hard_enabled)));
+ : "=r" (happened) : "i" (offsetof(struct paca_struct, irq_happened)));
- return enabled;
+ return happened;
}
static inline notrace void set_soft_enabled(unsigned long enable)
@@ -116,86 +111,235 @@ static inline notrace void set_soft_enabled(unsigned long enable)
: : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
}
-notrace void arch_local_irq_restore(unsigned long en)
+static inline notrace int decrementer_check_overflow(void)
+{
+ u64 now = get_tb_or_rtc();
+ u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
+
+ return now >= *next_tb;
+}
+
+/* This is called whenever we are re-enabling interrupts
+ * and returns either 0 (nothing to do) or 500/900/280/a00/e80 if
+ * there's an EE, DEC or DBELL to generate.
+ *
+ * This is called in two contexts: From arch_local_irq_restore()
+ * before soft-enabling interrupts, and from the exception exit
+ * path when returning from an interrupt from a soft-disabled to
+ * a soft enabled context. In both case we have interrupts hard
+ * disabled.
+ *
+ * We take care of only clearing the bits we handled in the
+ * PACA irq_happened field since we can only re-emit one at a
+ * time and we don't want to "lose" one.
+ */
+notrace unsigned int __check_irq_replay(void)
{
/*
- * get_paca()->soft_enabled = en;
- * Is it ever valid to use local_irq_restore(0) when soft_enabled is 1?
- * That was allowed before, and in such a case we do need to take care
- * that gcc will set soft_enabled directly via r13, not choose to use
- * an intermediate register, lest we're preempted to a different cpu.
+ * We use local_paca rather than get_paca() to avoid all
+ * the debug_smp_processor_id() business in this low level
+ * function
+ */
+ unsigned char happened = local_paca->irq_happened;
+
+ /* Clear bit 0 which we wouldn't clear otherwise */
+ local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
+
+ /*
+ * Force the delivery of pending soft-disabled interrupts on PS3.
+ * Any HV call will have this side effect.
+ */
+ if (firmware_has_feature(FW_FEATURE_PS3_LV1)) {
+ u64 tmp, tmp2;
+ lv1_get_version_info(&tmp, &tmp2);
+ }
+
+ /*
+ * We may have missed a decrementer interrupt. We check the
+ * decrementer itself rather than the paca irq_happened field
+ * in case we also had a rollover while hard disabled
+ */
+ local_paca->irq_happened &= ~PACA_IRQ_DEC;
+ if ((happened & PACA_IRQ_DEC) || decrementer_check_overflow())
+ return 0x900;
+
+ /* Finally check if an external interrupt happened */
+ local_paca->irq_happened &= ~PACA_IRQ_EE;
+ if (happened & PACA_IRQ_EE)
+ return 0x500;
+
+#ifdef CONFIG_PPC_BOOK3E
+ /* Finally check if an EPR external interrupt happened
+ * this bit is typically set if we need to handle another
+ * "edge" interrupt from within the MPIC "EPR" handler
*/
+ local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE;
+ if (happened & PACA_IRQ_EE_EDGE)
+ return 0x500;
+
+ local_paca->irq_happened &= ~PACA_IRQ_DBELL;
+ if (happened & PACA_IRQ_DBELL)
+ return 0x280;
+#else
+ local_paca->irq_happened &= ~PACA_IRQ_DBELL;
+ if (happened & PACA_IRQ_DBELL) {
+ if (cpu_has_feature(CPU_FTR_HVMODE))
+ return 0xe80;
+ return 0xa00;
+ }
+#endif /* CONFIG_PPC_BOOK3E */
+
+ /* There should be nothing left ! */
+ BUG_ON(local_paca->irq_happened != 0);
+
+ return 0;
+}
+
+notrace void arch_local_irq_restore(unsigned long en)
+{
+ unsigned char irq_happened;
+ unsigned int replay;
+
+ /* Write the new soft-enabled value */
set_soft_enabled(en);
if (!en)
return;
+ /*
+ * From this point onward, we can take interrupts, preempt,
+ * etc... unless we got hard-disabled. We check if an event
+ * happened. If none happened, we know we can just return.
+ *
+ * We may have preempted before the check below, in which case
+ * we are checking the "new" CPU instead of the old one. This
+ * is only a problem if an event happened on the "old" CPU.
+ *
+ * External interrupt events will have caused interrupts to
+ * be hard-disabled, so there is no problem, we
+ * cannot have preempted.
+ */
+ irq_happened = get_irq_happened();
+ if (!irq_happened)
+ return;
-#ifdef CONFIG_PPC_STD_MMU_64
- if (firmware_has_feature(FW_FEATURE_ISERIES)) {
+ /*
+ * We need to hard disable to get a trusted value from
+ * __check_irq_replay(). We also need to soft-disable
+ * again to avoid warnings in there due to the use of
+ * per-cpu variables.
+ *
+ * We know that if the value in irq_happened is exactly 0x01
+ * then we are already hard disabled (there are other less
+ * common cases that we'll ignore for now), so we skip the
+ * (expensive) mtmsrd.
+ */
+ if (unlikely(irq_happened != PACA_IRQ_HARD_DIS))
+ __hard_irq_disable();
+#ifdef CONFIG_TRACE_IRQFLAGS
+ else {
/*
- * Do we need to disable preemption here? Not really: in the
- * unlikely event that we're preempted to a different cpu in
- * between getting r13, loading its lppaca_ptr, and loading
- * its any_int, we might call iseries_handle_interrupts without
- * an interrupt pending on the new cpu, but that's no disaster,
- * is it? And the business of preempting us off the old cpu
- * would itself involve a local_irq_restore which handles the
- * interrupt to that cpu.
- *
- * But use "local_paca->lppaca_ptr" instead of "get_lppaca()"
- * to avoid any preemption checking added into get_paca().
+ * We should already be hard disabled here. We had bugs
+ * where that wasn't the case so let's dbl check it and
+ * warn if we are wrong. Only do that when IRQ tracing
+ * is enabled as mfmsr() can be costly.
*/
- if (local_paca->lppaca_ptr->int_dword.any_int)
- iseries_handle_interrupts();
+ if (WARN_ON(mfmsr() & MSR_EE))
+ __hard_irq_disable();
}
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_TRACE_IRQFLAG */
+
+ set_soft_enabled(0);
+
+ /*
+ * Check if anything needs to be re-emitted. We haven't
+ * soft-enabled yet to avoid warnings in decrementer_check_overflow
+ * accessing per-cpu variables
+ */
+ replay = __check_irq_replay();
+
+ /* We can soft-enable now */
+ set_soft_enabled(1);
/*
- * if (get_paca()->hard_enabled) return;
- * But again we need to take care that gcc gets hard_enabled directly
- * via r13, not choose to use an intermediate register, lest we're
- * preempted to a different cpu in between the two instructions.
+ * And replay if we have to. This will return with interrupts
+ * hard-enabled.
*/
- if (get_hard_enabled())
+ if (replay) {
+ __replay_interrupt(replay);
return;
+ }
-#if defined(CONFIG_BOOKE) && defined(CONFIG_SMP)
- /* Check for pending doorbell interrupts and resend to ourself */
- doorbell_check_self();
-#endif
+ /* Finally, let's ensure we are hard enabled */
+ __hard_irq_enable();
+}
+EXPORT_SYMBOL(arch_local_irq_restore);
+
+/*
+ * This is specifically called by assembly code to re-enable interrupts
+ * if they are currently disabled. This is typically called before
+ * schedule() or do_signal() when returning to userspace. We do it
+ * in C to avoid the burden of dealing with lockdep etc...
+ *
+ * NOTE: This is called with interrupts hard disabled but not marked
+ * as such in paca->irq_happened, so we need to resync this.
+ */
+void notrace restore_interrupts(void)
+{
+ if (irqs_disabled()) {
+ local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+ local_irq_enable();
+ } else
+ __hard_irq_enable();
+}
+/*
+ * This is a helper to use when about to go into idle low-power
+ * when the latter has the side effect of re-enabling interrupts
+ * (such as calling H_CEDE under pHyp).
+ *
+ * You call this function with interrupts soft-disabled (this is
+ * already the case when ppc_md.power_save is called). The function
+ * will return whether to enter power save or just return.
+ *
+ * In the former case, it will have notified lockdep of interrupts
+ * being re-enabled and generally sanitized the lazy irq state,
+ * and in the latter case it will leave with interrupts hard
+ * disabled and marked as such, so the local_irq_enable() call
+ * in arch_cpu_idle() will properly re-enable everything.
+ */
+bool prep_irq_for_idle(void)
+{
/*
- * Need to hard-enable interrupts here. Since currently disabled,
- * no need to take further asm precautions against preemption; but
- * use local_paca instead of get_paca() to avoid preemption checking.
+ * First we need to hard disable to ensure no interrupt
+ * occurs before we effectively enter the low power state
*/
- local_paca->hard_enabled = en;
+ hard_irq_disable();
-#ifndef CONFIG_BOOKE
- /* On server, re-trigger the decrementer if it went negative since
- * some processors only trigger on edge transitions of the sign bit.
- *
- * BookE has a level sensitive decrementer (latches in TSR) so we
- * don't need that
+ /*
+ * If anything happened while we were soft-disabled,
+ * we return now and do not enter the low power state.
*/
- if ((int)mfspr(SPRN_DEC) < 0)
- mtspr(SPRN_DEC, 1);
-#endif /* CONFIG_BOOKE */
+ if (lazy_irq_pending())
+ return false;
+
+ /* Tell lockdep we are about to re-enable */
+ trace_hardirqs_on();
/*
- * Force the delivery of pending soft-disabled interrupts on PS3.
- * Any HV call will have this side effect.
+ * Mark interrupts as soft-enabled and clear the
+ * PACA_IRQ_HARD_DIS from the pending mask since we
+ * are about to hard enable as well as a side effect
+ * of entering the low power state.
*/
- if (firmware_has_feature(FW_FEATURE_PS3_LV1)) {
- u64 tmp;
- lv1_get_version_info(&tmp);
- }
+ local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
+ local_paca->soft_enabled = 1;
- __hard_irq_enable();
+ /* Tell the caller to enter the low power state */
+ return true;
}
-EXPORT_SYMBOL(arch_local_irq_restore);
+
#endif /* CONFIG_PPC64 */
-static int show_other_interrupts(struct seq_file *p, int prec)
+int arch_show_interrupts(struct seq_file *p, int prec)
{
int j;
@@ -210,15 +354,20 @@ static int show_other_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%*s: ", prec, "LOC");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs);
- seq_printf(p, " Local timer interrupts\n");
+ seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_event);
+ seq_printf(p, " Local timer interrupts for timer event device\n");
+
+ seq_printf(p, "%*s: ", prec, "LOC");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_others);
+ seq_printf(p, " Local timer interrupts for others\n");
seq_printf(p, "%*s: ", prec, "SPU");
for_each_online_cpu(j)
seq_printf(p, "%10u ", per_cpu(irq_stat, j).spurious_irqs);
seq_printf(p, " Spurious interrupts\n");
- seq_printf(p, "%*s: ", prec, "CNT");
+ seq_printf(p, "%*s: ", prec, "PMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", per_cpu(irq_stat, j).pmu_irqs);
seq_printf(p, " Performance monitoring interrupts\n");
@@ -228,63 +377,15 @@ static int show_other_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%10u ", per_cpu(irq_stat, j).mce_exceptions);
seq_printf(p, " Machine check exceptions\n");
- return 0;
-}
-
-int show_interrupts(struct seq_file *p, void *v)
-{
- unsigned long flags, any_count = 0;
- int i = *(loff_t *) v, j, prec;
- struct irqaction *action;
- struct irq_desc *desc;
-
- if (i > nr_irqs)
- return 0;
-
- for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
- j *= 10;
-
- if (i == nr_irqs)
- return show_other_interrupts(p, prec);
-
- /* print header */
- if (i == 0) {
- seq_printf(p, "%*s", prec + 8, "");
+#ifdef CONFIG_PPC_DOORBELL
+ if (cpu_has_feature(CPU_FTR_DBELL)) {
+ seq_printf(p, "%*s: ", prec, "DBL");
for_each_online_cpu(j)
- seq_printf(p, "CPU%-8d", j);
- seq_putc(p, '\n');
- }
-
- desc = irq_to_desc(i);
- if (!desc)
- return 0;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- for_each_online_cpu(j)
- any_count |= kstat_irqs_cpu(i, j);
- action = desc->action;
- if (!action && !any_count)
- goto out;
-
- seq_printf(p, "%*d: ", prec, i);
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-
- if (desc->chip)
- seq_printf(p, " %-16s", desc->chip->name);
- else
- seq_printf(p, " %-16s", "None");
- seq_printf(p, " %-8s", (desc->status & IRQ_LEVEL) ? "Level" : "Edge");
-
- if (action) {
- seq_printf(p, " %s", action->name);
- while ((action = action->next) != NULL)
- seq_printf(p, ", %s", action->name);
+ seq_printf(p, "%10u ", per_cpu(irq_stat, j).doorbell_irqs);
+ seq_printf(p, " Doorbell interrupts\n");
}
+#endif
- seq_putc(p, '\n');
-out:
- raw_spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
@@ -293,40 +394,47 @@ out:
*/
u64 arch_irq_stat_cpu(unsigned int cpu)
{
- u64 sum = per_cpu(irq_stat, cpu).timer_irqs;
+ u64 sum = per_cpu(irq_stat, cpu).timer_irqs_event;
sum += per_cpu(irq_stat, cpu).pmu_irqs;
sum += per_cpu(irq_stat, cpu).mce_exceptions;
sum += per_cpu(irq_stat, cpu).spurious_irqs;
+ sum += per_cpu(irq_stat, cpu).timer_irqs_others;
+#ifdef CONFIG_PPC_DOORBELL
+ sum += per_cpu(irq_stat, cpu).doorbell_irqs;
+#endif
return sum;
}
#ifdef CONFIG_HOTPLUG_CPU
-void fixup_irqs(const struct cpumask *map)
+void migrate_irqs(void)
{
struct irq_desc *desc;
unsigned int irq;
static int warned;
cpumask_var_t mask;
+ const struct cpumask *map = cpu_online_mask;
alloc_cpumask_var(&mask, GFP_KERNEL);
- for_each_irq(irq) {
- desc = irq_to_desc(irq);
- if (!desc)
- continue;
+ for_each_irq_desc(irq, desc) {
+ struct irq_data *data;
+ struct irq_chip *chip;
- if (desc->status & IRQ_PER_CPU)
+ data = irq_desc_get_irq_data(desc);
+ if (irqd_is_per_cpu(data))
continue;
- cpumask_and(mask, desc->affinity, map);
+ chip = irq_data_get_irq_chip(data);
+
+ cpumask_and(mask, data->affinity, map);
if (cpumask_any(mask) >= nr_cpu_ids) {
printk("Breaking affinity for irq %i\n", irq);
cpumask_copy(mask, map);
}
- if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, mask);
+ if (chip->irq_set_affinity)
+ chip->irq_set_affinity(data, mask, true);
else if (desc->action && !(warned++))
printk("Cannot set affinity for irq %i\n", irq);
}
@@ -339,47 +447,6 @@ void fixup_irqs(const struct cpumask *map)
}
#endif
-static inline void handle_one_irq(unsigned int irq)
-{
- struct thread_info *curtp, *irqtp;
- unsigned long saved_sp_limit;
- struct irq_desc *desc;
-
- /* Switch to the irq stack to handle this */
- curtp = current_thread_info();
- irqtp = hardirq_ctx[smp_processor_id()];
-
- if (curtp == irqtp) {
- /* We're already on the irq stack, just handle it */
- generic_handle_irq(irq);
- return;
- }
-
- desc = irq_to_desc(irq);
- saved_sp_limit = current->thread.ksp_limit;
-
- irqtp->task = curtp->task;
- irqtp->flags = 0;
-
- /* Copy the softirq bits in preempt_count so that the
- * softirq checks work in the hardirq context. */
- irqtp->preempt_count = (irqtp->preempt_count & ~SOFTIRQ_MASK) |
- (curtp->preempt_count & SOFTIRQ_MASK);
-
- current->thread.ksp_limit = (unsigned long)irqtp +
- _ALIGN_UP(sizeof(struct thread_info), 16);
-
- call_handle_irq(irq, desc, irqtp, desc->handle_irq);
- current->thread.ksp_limit = saved_sp_limit;
- irqtp->task = NULL;
-
- /* Set any flag that may have been set on the
- * alternate stack
- */
- if (irqtp->flags)
- set_bits(irqtp->flags, &curtp->flags);
-}
-
static inline void check_stack_overflow(void)
{
#ifdef CONFIG_DEBUG_STACKOVERFLOW
@@ -396,37 +463,72 @@ static inline void check_stack_overflow(void)
#endif
}
-void do_IRQ(struct pt_regs *regs)
+void __do_irq(struct pt_regs *regs)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
unsigned int irq;
- trace_irq_entry(regs);
-
irq_enter();
+ trace_irq_entry(regs);
+
check_stack_overflow();
+ /*
+ * Query the platform PIC for the interrupt & ack it.
+ *
+ * This will typically lower the interrupt line to the CPU
+ */
irq = ppc_md.get_irq();
- if (irq != NO_IRQ && irq != NO_IRQ_IGNORE)
- handle_one_irq(irq);
- else if (irq != NO_IRQ_IGNORE)
+ /* We can hard enable interrupts now to allow perf interrupts */
+ may_hard_irq_enable();
+
+ /* And finally process it */
+ if (unlikely(irq == NO_IRQ))
__get_cpu_var(irq_stat).spurious_irqs++;
+ else
+ generic_handle_irq(irq);
+
+ trace_irq_exit(regs);
irq_exit();
- set_irq_regs(old_regs);
+}
+
+void do_IRQ(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ struct thread_info *curtp, *irqtp, *sirqtp;
-#ifdef CONFIG_PPC_ISERIES
- if (firmware_has_feature(FW_FEATURE_ISERIES) &&
- get_lppaca()->int_dword.fields.decr_int) {
- get_lppaca()->int_dword.fields.decr_int = 0;
- /* Signal a fake decrementer interrupt */
- timer_interrupt(regs);
+ /* Switch to the irq stack to handle this */
+ curtp = current_thread_info();
+ irqtp = hardirq_ctx[raw_smp_processor_id()];
+ sirqtp = softirq_ctx[raw_smp_processor_id()];
+
+ /* Already there ? */
+ if (unlikely(curtp == irqtp || curtp == sirqtp)) {
+ __do_irq(regs);
+ set_irq_regs(old_regs);
+ return;
}
-#endif
- trace_irq_exit(regs);
+ /* Prepare the thread_info in the irq stack */
+ irqtp->task = curtp->task;
+ irqtp->flags = 0;
+
+ /* Copy the preempt_count so that the [soft]irq checks work. */
+ irqtp->preempt_count = curtp->preempt_count;
+
+ /* Switch stack and call */
+ call_do_irq(regs, irqtp);
+
+ /* Restore stack limit */
+ irqtp->task = NULL;
+
+ /* Copy back updates to the thread_info */
+ if (irqtp->flags)
+ set_bits(irqtp->flags, &curtp->flags);
+
+ set_irq_regs(old_regs);
}
void __init init_IRQ(void)
@@ -447,24 +549,33 @@ struct thread_info *mcheckirq_ctx[NR_CPUS] __read_mostly;
void exc_lvl_ctx_init(void)
{
struct thread_info *tp;
- int i, hw_cpu;
+ int i, cpu_nr;
for_each_possible_cpu(i) {
- hw_cpu = get_hard_smp_processor_id(i);
- memset((void *)critirq_ctx[hw_cpu], 0, THREAD_SIZE);
- tp = critirq_ctx[hw_cpu];
- tp->cpu = i;
+#ifdef CONFIG_PPC64
+ cpu_nr = i;
+#else
+#ifdef CONFIG_SMP
+ cpu_nr = get_hard_smp_processor_id(i);
+#else
+ cpu_nr = 0;
+#endif
+#endif
+
+ memset((void *)critirq_ctx[cpu_nr], 0, THREAD_SIZE);
+ tp = critirq_ctx[cpu_nr];
+ tp->cpu = cpu_nr;
tp->preempt_count = 0;
#ifdef CONFIG_BOOKE
- memset((void *)dbgirq_ctx[hw_cpu], 0, THREAD_SIZE);
- tp = dbgirq_ctx[hw_cpu];
- tp->cpu = i;
+ memset((void *)dbgirq_ctx[cpu_nr], 0, THREAD_SIZE);
+ tp = dbgirq_ctx[cpu_nr];
+ tp->cpu = cpu_nr;
tp->preempt_count = 0;
- memset((void *)mcheckirq_ctx[hw_cpu], 0, THREAD_SIZE);
- tp = mcheckirq_ctx[hw_cpu];
- tp->cpu = i;
+ memset((void *)mcheckirq_ctx[cpu_nr], 0, THREAD_SIZE);
+ tp = mcheckirq_ctx[cpu_nr];
+ tp->cpu = cpu_nr;
tp->preempt_count = HARDIRQ_OFFSET;
#endif
}
@@ -483,726 +594,78 @@ void irq_ctx_init(void)
memset((void *)softirq_ctx[i], 0, THREAD_SIZE);
tp = softirq_ctx[i];
tp->cpu = i;
- tp->preempt_count = 0;
memset((void *)hardirq_ctx[i], 0, THREAD_SIZE);
tp = hardirq_ctx[i];
tp->cpu = i;
- tp->preempt_count = HARDIRQ_OFFSET;
}
}
-static inline void do_softirq_onstack(void)
+void do_softirq_own_stack(void)
{
struct thread_info *curtp, *irqtp;
- unsigned long saved_sp_limit = current->thread.ksp_limit;
curtp = current_thread_info();
irqtp = softirq_ctx[smp_processor_id()];
irqtp->task = curtp->task;
- current->thread.ksp_limit = (unsigned long)irqtp +
- _ALIGN_UP(sizeof(struct thread_info), 16);
+ irqtp->flags = 0;
call_do_softirq(irqtp);
- current->thread.ksp_limit = saved_sp_limit;
irqtp->task = NULL;
-}
-
-void do_softirq(void)
-{
- unsigned long flags;
-
- if (in_interrupt())
- return;
-
- local_irq_save(flags);
-
- if (local_softirq_pending())
- do_softirq_onstack();
- local_irq_restore(flags);
+ /* Set any flag that may have been set on the
+ * alternate stack
+ */
+ if (irqtp->flags)
+ set_bits(irqtp->flags, &curtp->flags);
}
-
-/*
- * IRQ controller and virtual interrupts
- */
-
-static LIST_HEAD(irq_hosts);
-static DEFINE_RAW_SPINLOCK(irq_big_lock);
-static unsigned int revmap_trees_allocated;
-static DEFINE_MUTEX(revmap_trees_mutex);
-struct irq_map_entry irq_map[NR_IRQS];
-static unsigned int irq_virq_count = NR_IRQS;
-static struct irq_host *irq_default_host;
-
irq_hw_number_t virq_to_hw(unsigned int virq)
{
- return irq_map[virq].hwirq;
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ return WARN_ON(!irq_data) ? 0 : irq_data->hwirq;
}
EXPORT_SYMBOL_GPL(virq_to_hw);
-static int default_irq_host_match(struct irq_host *h, struct device_node *np)
+#ifdef CONFIG_SMP
+int irq_choose_cpu(const struct cpumask *mask)
{
- return h->of_node != NULL && h->of_node == np;
-}
+ int cpuid;
-struct irq_host *irq_alloc_host(struct device_node *of_node,
- unsigned int revmap_type,
- unsigned int revmap_arg,
- struct irq_host_ops *ops,
- irq_hw_number_t inval_irq)
-{
- struct irq_host *host;
- unsigned int size = sizeof(struct irq_host);
- unsigned int i;
- unsigned int *rmap;
- unsigned long flags;
-
- /* Allocate structure and revmap table if using linear mapping */
- if (revmap_type == IRQ_HOST_MAP_LINEAR)
- size += revmap_arg * sizeof(unsigned int);
- host = zalloc_maybe_bootmem(size, GFP_KERNEL);
- if (host == NULL)
- return NULL;
-
- /* Fill structure */
- host->revmap_type = revmap_type;
- host->inval_irq = inval_irq;
- host->ops = ops;
- host->of_node = of_node_get(of_node);
-
- if (host->ops->match == NULL)
- host->ops->match = default_irq_host_match;
-
- raw_spin_lock_irqsave(&irq_big_lock, flags);
-
- /* If it's a legacy controller, check for duplicates and
- * mark it as allocated (we use irq 0 host pointer for that
- */
- if (revmap_type == IRQ_HOST_MAP_LEGACY) {
- if (irq_map[0].host != NULL) {
- raw_spin_unlock_irqrestore(&irq_big_lock, flags);
- /* If we are early boot, we can't free the structure,
- * too bad...
- * this will be fixed once slab is made available early
- * instead of the current cruft
- */
- if (mem_init_done) {
- of_node_put(host->of_node);
- kfree(host);
- }
- return NULL;
- }
- irq_map[0].host = host;
- }
-
- list_add(&host->link, &irq_hosts);
- raw_spin_unlock_irqrestore(&irq_big_lock, flags);
-
- /* Additional setups per revmap type */
- switch(revmap_type) {
- case IRQ_HOST_MAP_LEGACY:
- /* 0 is always the invalid number for legacy */
- host->inval_irq = 0;
- /* setup us as the host for all legacy interrupts */
- for (i = 1; i < NUM_ISA_INTERRUPTS; i++) {
- irq_map[i].hwirq = i;
- smp_wmb();
- irq_map[i].host = host;
- smp_wmb();
-
- /* Clear norequest flags */
- irq_to_desc(i)->status &= ~IRQ_NOREQUEST;
-
- /* Legacy flags are left to default at this point,
- * one can then use irq_create_mapping() to
- * explicitly change them
- */
- ops->map(host, i, i);
- }
- break;
- case IRQ_HOST_MAP_LINEAR:
- rmap = (unsigned int *)(host + 1);
- for (i = 0; i < revmap_arg; i++)
- rmap[i] = NO_IRQ;
- host->revmap_data.linear.size = revmap_arg;
- smp_wmb();
- host->revmap_data.linear.revmap = rmap;
- break;
- default:
- break;
- }
-
- pr_debug("irq: Allocated host of type %d @0x%p\n", revmap_type, host);
-
- return host;
-}
-
-struct irq_host *irq_find_host(struct device_node *node)
-{
- struct irq_host *h, *found = NULL;
- unsigned long flags;
-
- /* We might want to match the legacy controller last since
- * it might potentially be set to match all interrupts in
- * the absence of a device node. This isn't a problem so far
- * yet though...
- */
- raw_spin_lock_irqsave(&irq_big_lock, flags);
- list_for_each_entry(h, &irq_hosts, link)
- if (h->ops->match(h, node)) {
- found = h;
- break;
- }
- raw_spin_unlock_irqrestore(&irq_big_lock, flags);
- return found;
-}
-EXPORT_SYMBOL_GPL(irq_find_host);
-
-void irq_set_default_host(struct irq_host *host)
-{
- pr_debug("irq: Default host set to @0x%p\n", host);
-
- irq_default_host = host;
-}
-
-void irq_set_virq_count(unsigned int count)
-{
- pr_debug("irq: Trying to set virq count to %d\n", count);
-
- BUG_ON(count < NUM_ISA_INTERRUPTS);
- if (count < NR_IRQS)
- irq_virq_count = count;
-}
-
-static int irq_setup_virq(struct irq_host *host, unsigned int virq,
- irq_hw_number_t hwirq)
-{
- struct irq_desc *desc;
+ if (cpumask_equal(mask, cpu_online_mask)) {
+ static int irq_rover;
+ static DEFINE_RAW_SPINLOCK(irq_rover_lock);
+ unsigned long flags;
- desc = irq_to_desc_alloc_node(virq, 0);
- if (!desc) {
- pr_debug("irq: -> allocating desc failed\n");
- goto error;
- }
-
- /* Clear IRQ_NOREQUEST flag */
- desc->status &= ~IRQ_NOREQUEST;
-
- /* map it */
- smp_wmb();
- irq_map[virq].hwirq = hwirq;
- smp_mb();
-
- if (host->ops->map(host, virq, hwirq)) {
- pr_debug("irq: -> mapping failed, freeing\n");
- goto error;
- }
-
- return 0;
-
-error:
- irq_free_virt(virq, 1);
- return -1;
-}
-
-unsigned int irq_create_direct_mapping(struct irq_host *host)
-{
- unsigned int virq;
-
- if (host == NULL)
- host = irq_default_host;
+ /* Round-robin distribution... */
+do_round_robin:
+ raw_spin_lock_irqsave(&irq_rover_lock, flags);
- BUG_ON(host == NULL);
- WARN_ON(host->revmap_type != IRQ_HOST_MAP_NOMAP);
+ irq_rover = cpumask_next(irq_rover, cpu_online_mask);
+ if (irq_rover >= nr_cpu_ids)
+ irq_rover = cpumask_first(cpu_online_mask);
- virq = irq_alloc_virt(host, 1, 0);
- if (virq == NO_IRQ) {
- pr_debug("irq: create_direct virq allocation failed\n");
- return NO_IRQ;
- }
-
- pr_debug("irq: create_direct obtained virq %d\n", virq);
-
- if (irq_setup_virq(host, virq, virq))
- return NO_IRQ;
-
- return virq;
-}
-
-unsigned int irq_create_mapping(struct irq_host *host,
- irq_hw_number_t hwirq)
-{
- unsigned int virq, hint;
-
- pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", host, hwirq);
-
- /* Look for default host if nececssary */
- if (host == NULL)
- host = irq_default_host;
- if (host == NULL) {
- printk(KERN_WARNING "irq_create_mapping called for"
- " NULL host, hwirq=%lx\n", hwirq);
- WARN_ON(1);
- return NO_IRQ;
- }
- pr_debug("irq: -> using host @%p\n", host);
-
- /* Check if mapping already exist, if it does, call
- * host->ops->map() to update the flags
- */
- virq = irq_find_mapping(host, hwirq);
- if (virq != NO_IRQ) {
- if (host->ops->remap)
- host->ops->remap(host, virq, hwirq);
- pr_debug("irq: -> existing mapping on virq %d\n", virq);
- return virq;
- }
+ cpuid = irq_rover;
- /* Get a virtual interrupt number */
- if (host->revmap_type == IRQ_HOST_MAP_LEGACY) {
- /* Handle legacy */
- virq = (unsigned int)hwirq;
- if (virq == 0 || virq >= NUM_ISA_INTERRUPTS)
- return NO_IRQ;
- return virq;
+ raw_spin_unlock_irqrestore(&irq_rover_lock, flags);
} else {
- /* Allocate a virtual interrupt number */
- hint = hwirq % irq_virq_count;
- virq = irq_alloc_virt(host, 1, hint);
- if (virq == NO_IRQ) {
- pr_debug("irq: -> virq allocation failed\n");
- return NO_IRQ;
- }
+ cpuid = cpumask_first_and(mask, cpu_online_mask);
+ if (cpuid >= nr_cpu_ids)
+ goto do_round_robin;
}
- if (irq_setup_virq(host, virq, hwirq))
- return NO_IRQ;
-
- printk(KERN_DEBUG "irq: irq %lu on host %s mapped to virtual irq %u\n",
- hwirq, host->of_node ? host->of_node->full_name : "null", virq);
-
- return virq;
+ return get_hard_smp_processor_id(cpuid);
}
-EXPORT_SYMBOL_GPL(irq_create_mapping);
-
-unsigned int irq_create_of_mapping(struct device_node *controller,
- const u32 *intspec, unsigned int intsize)
+#else
+int irq_choose_cpu(const struct cpumask *mask)
{
- struct irq_host *host;
- irq_hw_number_t hwirq;
- unsigned int type = IRQ_TYPE_NONE;
- unsigned int virq;
-
- if (controller == NULL)
- host = irq_default_host;
- else
- host = irq_find_host(controller);
- if (host == NULL) {
- printk(KERN_WARNING "irq: no irq host found for %s !\n",
- controller->full_name);
- return NO_IRQ;
- }
-
- /* If host has no translation, then we assume interrupt line */
- if (host->ops->xlate == NULL)
- hwirq = intspec[0];
- else {
- if (host->ops->xlate(host, controller, intspec, intsize,
- &hwirq, &type))
- return NO_IRQ;
- }
-
- /* Create mapping */
- virq = irq_create_mapping(host, hwirq);
- if (virq == NO_IRQ)
- return virq;
-
- /* Set type if specified and different than the current one */
- if (type != IRQ_TYPE_NONE &&
- type != (irq_to_desc(virq)->status & IRQF_TRIGGER_MASK))
- set_irq_type(virq, type);
- return virq;
-}
-EXPORT_SYMBOL_GPL(irq_create_of_mapping);
-
-void irq_dispose_mapping(unsigned int virq)
-{
- struct irq_host *host;
- irq_hw_number_t hwirq;
-
- if (virq == NO_IRQ)
- return;
-
- host = irq_map[virq].host;
- WARN_ON (host == NULL);
- if (host == NULL)
- return;
-
- /* Never unmap legacy interrupts */
- if (host->revmap_type == IRQ_HOST_MAP_LEGACY)
- return;
-
- /* remove chip and handler */
- set_irq_chip_and_handler(virq, NULL, NULL);
-
- /* Make sure it's completed */
- synchronize_irq(virq);
-
- /* Tell the PIC about it */
- if (host->ops->unmap)
- host->ops->unmap(host, virq);
- smp_mb();
-
- /* Clear reverse map */
- hwirq = irq_map[virq].hwirq;
- switch(host->revmap_type) {
- case IRQ_HOST_MAP_LINEAR:
- if (hwirq < host->revmap_data.linear.size)
- host->revmap_data.linear.revmap[hwirq] = NO_IRQ;
- break;
- case IRQ_HOST_MAP_TREE:
- /*
- * Check if radix tree allocated yet, if not then nothing to
- * remove.
- */
- smp_rmb();
- if (revmap_trees_allocated < 1)
- break;
- mutex_lock(&revmap_trees_mutex);
- radix_tree_delete(&host->revmap_data.tree, hwirq);
- mutex_unlock(&revmap_trees_mutex);
- break;
- }
-
- /* Destroy map */
- smp_mb();
- irq_map[virq].hwirq = host->inval_irq;
-
- /* Set some flags */
- irq_to_desc(virq)->status |= IRQ_NOREQUEST;
-
- /* Free it */
- irq_free_virt(virq, 1);
-}
-EXPORT_SYMBOL_GPL(irq_dispose_mapping);
-
-unsigned int irq_find_mapping(struct irq_host *host,
- irq_hw_number_t hwirq)
-{
- unsigned int i;
- unsigned int hint = hwirq % irq_virq_count;
-
- /* Look for default host if nececssary */
- if (host == NULL)
- host = irq_default_host;
- if (host == NULL)
- return NO_IRQ;
-
- /* legacy -> bail early */
- if (host->revmap_type == IRQ_HOST_MAP_LEGACY)
- return hwirq;
-
- /* Slow path does a linear search of the map */
- if (hint < NUM_ISA_INTERRUPTS)
- hint = NUM_ISA_INTERRUPTS;
- i = hint;
- do {
- if (irq_map[i].host == host &&
- irq_map[i].hwirq == hwirq)
- return i;
- i++;
- if (i >= irq_virq_count)
- i = NUM_ISA_INTERRUPTS;
- } while(i != hint);
- return NO_IRQ;
-}
-EXPORT_SYMBOL_GPL(irq_find_mapping);
-
-
-unsigned int irq_radix_revmap_lookup(struct irq_host *host,
- irq_hw_number_t hwirq)
-{
- struct irq_map_entry *ptr;
- unsigned int virq;
-
- WARN_ON(host->revmap_type != IRQ_HOST_MAP_TREE);
-
- /*
- * Check if the radix tree exists and has bee initialized.
- * If not, we fallback to slow mode
- */
- if (revmap_trees_allocated < 2)
- return irq_find_mapping(host, hwirq);
-
- /* Now try to resolve */
- /*
- * No rcu_read_lock(ing) needed, the ptr returned can't go under us
- * as it's referencing an entry in the static irq_map table.
- */
- ptr = radix_tree_lookup(&host->revmap_data.tree, hwirq);
-
- /*
- * If found in radix tree, then fine.
- * Else fallback to linear lookup - this should not happen in practice
- * as it means that we failed to insert the node in the radix tree.
- */
- if (ptr)
- virq = ptr - irq_map;
- else
- virq = irq_find_mapping(host, hwirq);
-
- return virq;
-}
-
-void irq_radix_revmap_insert(struct irq_host *host, unsigned int virq,
- irq_hw_number_t hwirq)
-{
-
- WARN_ON(host->revmap_type != IRQ_HOST_MAP_TREE);
-
- /*
- * Check if the radix tree exists yet.
- * If not, then the irq will be inserted into the tree when it gets
- * initialized.
- */
- smp_rmb();
- if (revmap_trees_allocated < 1)
- return;
-
- if (virq != NO_IRQ) {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_insert(&host->revmap_data.tree, hwirq,
- &irq_map[virq]);
- mutex_unlock(&revmap_trees_mutex);
- }
-}
-
-unsigned int irq_linear_revmap(struct irq_host *host,
- irq_hw_number_t hwirq)
-{
- unsigned int *revmap;
-
- WARN_ON(host->revmap_type != IRQ_HOST_MAP_LINEAR);
-
- /* Check revmap bounds */
- if (unlikely(hwirq >= host->revmap_data.linear.size))
- return irq_find_mapping(host, hwirq);
-
- /* Check if revmap was allocated */
- revmap = host->revmap_data.linear.revmap;
- if (unlikely(revmap == NULL))
- return irq_find_mapping(host, hwirq);
-
- /* Fill up revmap with slow path if no mapping found */
- if (unlikely(revmap[hwirq] == NO_IRQ))
- revmap[hwirq] = irq_find_mapping(host, hwirq);
-
- return revmap[hwirq];
-}
-
-unsigned int irq_alloc_virt(struct irq_host *host,
- unsigned int count,
- unsigned int hint)
-{
- unsigned long flags;
- unsigned int i, j, found = NO_IRQ;
-
- if (count == 0 || count > (irq_virq_count - NUM_ISA_INTERRUPTS))
- return NO_IRQ;
-
- raw_spin_lock_irqsave(&irq_big_lock, flags);
-
- /* Use hint for 1 interrupt if any */
- if (count == 1 && hint >= NUM_ISA_INTERRUPTS &&
- hint < irq_virq_count && irq_map[hint].host == NULL) {
- found = hint;
- goto hint_found;
- }
-
- /* Look for count consecutive numbers in the allocatable
- * (non-legacy) space
- */
- for (i = NUM_ISA_INTERRUPTS, j = 0; i < irq_virq_count; i++) {
- if (irq_map[i].host != NULL)
- j = 0;
- else
- j++;
-
- if (j == count) {
- found = i - count + 1;
- break;
- }
- }
- if (found == NO_IRQ) {
- raw_spin_unlock_irqrestore(&irq_big_lock, flags);
- return NO_IRQ;
- }
- hint_found:
- for (i = found; i < (found + count); i++) {
- irq_map[i].hwirq = host->inval_irq;
- smp_wmb();
- irq_map[i].host = host;
- }
- raw_spin_unlock_irqrestore(&irq_big_lock, flags);
- return found;
-}
-
-void irq_free_virt(unsigned int virq, unsigned int count)
-{
- unsigned long flags;
- unsigned int i;
-
- WARN_ON (virq < NUM_ISA_INTERRUPTS);
- WARN_ON (count == 0 || (virq + count) > irq_virq_count);
-
- raw_spin_lock_irqsave(&irq_big_lock, flags);
- for (i = virq; i < (virq + count); i++) {
- struct irq_host *host;
-
- if (i < NUM_ISA_INTERRUPTS ||
- (virq + count) > irq_virq_count)
- continue;
-
- host = irq_map[i].host;
- irq_map[i].hwirq = host->inval_irq;
- smp_wmb();
- irq_map[i].host = NULL;
- }
- raw_spin_unlock_irqrestore(&irq_big_lock, flags);
+ return hard_smp_processor_id();
}
+#endif
int arch_early_irq_init(void)
{
- struct irq_desc *desc;
- int i;
-
- for (i = 0; i < NR_IRQS; i++) {
- desc = irq_to_desc(i);
- if (desc)
- desc->status |= IRQ_NOREQUEST;
- }
-
- return 0;
-}
-
-int arch_init_chip_data(struct irq_desc *desc, int node)
-{
- desc->status |= IRQ_NOREQUEST;
- return 0;
-}
-
-/* We need to create the radix trees late */
-static int irq_late_init(void)
-{
- struct irq_host *h;
- unsigned int i;
-
- /*
- * No mutual exclusion with respect to accessors of the tree is needed
- * here as the synchronization is done via the state variable
- * revmap_trees_allocated.
- */
- list_for_each_entry(h, &irq_hosts, link) {
- if (h->revmap_type == IRQ_HOST_MAP_TREE)
- INIT_RADIX_TREE(&h->revmap_data.tree, GFP_KERNEL);
- }
-
- /*
- * Make sure the radix trees inits are visible before setting
- * the flag
- */
- smp_wmb();
- revmap_trees_allocated = 1;
-
- /*
- * Insert the reverse mapping for those interrupts already present
- * in irq_map[].
- */
- mutex_lock(&revmap_trees_mutex);
- for (i = 0; i < irq_virq_count; i++) {
- if (irq_map[i].host &&
- (irq_map[i].host->revmap_type == IRQ_HOST_MAP_TREE))
- radix_tree_insert(&irq_map[i].host->revmap_data.tree,
- irq_map[i].hwirq, &irq_map[i]);
- }
- mutex_unlock(&revmap_trees_mutex);
-
- /*
- * Make sure the radix trees insertions are visible before setting
- * the flag
- */
- smp_wmb();
- revmap_trees_allocated = 2;
-
- return 0;
-}
-arch_initcall(irq_late_init);
-
-#ifdef CONFIG_VIRQ_DEBUG
-static int virq_debug_show(struct seq_file *m, void *private)
-{
- unsigned long flags;
- struct irq_desc *desc;
- const char *p;
- static const char none[] = "none";
- int i;
-
- seq_printf(m, "%-5s %-7s %-15s %s\n", "virq", "hwirq",
- "chip name", "host name");
-
- for (i = 1; i < nr_irqs; i++) {
- desc = irq_to_desc(i);
- if (!desc)
- continue;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
-
- if (desc->action && desc->action->handler) {
- seq_printf(m, "%5d ", i);
- seq_printf(m, "0x%05lx ", virq_to_hw(i));
-
- if (desc->chip && desc->chip->name)
- p = desc->chip->name;
- else
- p = none;
- seq_printf(m, "%-15s ", p);
-
- if (irq_map[i].host && irq_map[i].host->of_node)
- p = irq_map[i].host->of_node->full_name;
- else
- p = none;
- seq_printf(m, "%s\n", p);
- }
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- }
-
- return 0;
-}
-
-static int virq_debug_open(struct inode *inode, struct file *file)
-{
- return single_open(file, virq_debug_show, inode->i_private);
-}
-
-static const struct file_operations virq_debug_fops = {
- .open = virq_debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int __init irq_debugfs_init(void)
-{
- if (debugfs_create_file("virq_mapping", S_IRUGO, powerpc_debugfs_root,
- NULL, &virq_debug_fops) == NULL)
- return -ENOMEM;
-
return 0;
}
-__initcall(irq_debugfs_init);
-#endif /* CONFIG_VIRQ_DEBUG */
#ifdef CONFIG_PPC64
static int __init setup_noirqdistrib(char *str)
diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c
index 4d5731b2429..0f199709796 100644
--- a/arch/powerpc/kernel/isa-bridge.c
+++ b/arch/powerpc/kernel/isa-bridge.c
@@ -18,6 +18,7 @@
#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/string.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/notifier.h>
@@ -28,7 +29,6 @@
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/ppc-pci.h>
-#include <asm/firmware.h>
unsigned long isa_io_base; /* NULL if no ISA bus */
EXPORT_SYMBOL(isa_io_base);
@@ -41,8 +41,8 @@ EXPORT_SYMBOL_GPL(isa_bridge_pcidev);
#define ISA_SPACE_MASK 0x1
#define ISA_SPACE_IO 0x1
-static void __devinit pci_process_ISA_OF_ranges(struct device_node *isa_node,
- unsigned long phb_io_base_phys)
+static void pci_process_ISA_OF_ranges(struct device_node *isa_node,
+ unsigned long phb_io_base_phys)
{
/* We should get some saner parsing here and remove these structs */
struct pci_address {
@@ -170,8 +170,8 @@ void __init isa_bridge_find_early(struct pci_controller *hose)
* isa_bridge_find_late - Find and map the ISA IO space upon discovery of
* a new ISA bridge
*/
-static void __devinit isa_bridge_find_late(struct pci_dev *pdev,
- struct device_node *devnode)
+static void isa_bridge_find_late(struct pci_dev *pdev,
+ struct device_node *devnode)
{
struct pci_controller *hose = pci_bus_to_host(pdev->bus);
@@ -215,8 +215,8 @@ static void isa_bridge_remove(void)
/**
* isa_bridge_notify - Get notified of PCI devices addition/removal
*/
-static int __devinit isa_bridge_notify(struct notifier_block *nb,
- unsigned long action, void *data)
+static int isa_bridge_notify(struct notifier_block *nb, unsigned long action,
+ void *data)
{
struct device *dev = data;
struct pci_dev *pdev = to_pci_dev(dev);
@@ -260,8 +260,6 @@ static struct notifier_block isa_bridge_notifier = {
*/
static int __init isa_bridge_init(void)
{
- if (firmware_has_feature(FW_FEATURE_ISERIES))
- return 0;
bus_register_notifier(&pci_bus_type, &isa_bridge_notifier);
return 0;
}
diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c
new file mode 100644
index 00000000000..a1ed8a8c7cb
--- /dev/null
+++ b/arch/powerpc/kernel/jump_label.c
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2010 Michael Ellerman, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/jump_label.h>
+#include <asm/code-patching.h>
+
+#ifdef HAVE_JUMP_LABEL
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ u32 *addr = (u32 *)(unsigned long)entry->code;
+
+ if (type == JUMP_LABEL_ENABLE)
+ patch_branch(addr, entry->target, 0);
+ else
+ patch_instruction(addr, PPC_INST_NOP);
+}
+#endif
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 7a9db64f3f0..8504657379f 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -15,7 +15,6 @@
*/
#include <linux/kernel.h>
-#include <linux/init.h>
#include <linux/kgdb.h>
#include <linux/smp.h>
#include <linux/signal.h>
@@ -24,6 +23,8 @@
#include <asm/current.h>
#include <asm/processor.h>
#include <asm/machdep.h>
+#include <asm/debug.h>
+#include <linux/slab.h>
/*
* This table contains the mapping between PowerPC hardware trap types, and
@@ -100,6 +101,21 @@ static int computeSignal(unsigned int tt)
return SIGHUP; /* default for things we don't know about */
}
+/**
+ *
+ * kgdb_skipexception - Bail out of KGDB when we've been triggered.
+ * @exception: Exception vector number
+ * @regs: Current &struct pt_regs.
+ *
+ * On some architectures we need to skip a breakpoint exception when
+ * it occurs after a breakpoint has been removed.
+ *
+ */
+int kgdb_skipexception(int exception, struct pt_regs *regs)
+{
+ return kgdb_isremovedbreak(regs->nip);
+}
+
static int kgdb_call_nmi_hook(struct pt_regs *regs)
{
kgdb_nmicallback(raw_smp_processor_id(), regs);
@@ -109,7 +125,7 @@ static int kgdb_call_nmi_hook(struct pt_regs *regs)
#ifdef CONFIG_SMP
void kgdb_roundup_cpus(unsigned long flags)
{
- smp_send_debugger_break(MSG_ALL_BUT_SELF);
+ smp_send_debugger_break();
}
#endif
@@ -134,15 +150,18 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs)
return 1;
}
+static DEFINE_PER_CPU(struct thread_info, kgdb_thread_info);
static int kgdb_singlestep(struct pt_regs *regs)
{
struct thread_info *thread_info, *exception_thread_info;
+ struct thread_info *backup_current_thread_info =
+ &__get_cpu_var(kgdb_thread_info);
if (user_mode(regs))
return 0;
/*
- * On Book E and perhaps other processsors, singlestep is handled on
+ * On Book E and perhaps other processors, singlestep is handled on
* the critical exception stack. This causes current_thread_info()
* to fail, since it it locates the thread_info by masking off
* the low bits of the current stack pointer. We work around
@@ -154,13 +173,17 @@ static int kgdb_singlestep(struct pt_regs *regs)
thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1));
exception_thread_info = current_thread_info();
- if (thread_info != exception_thread_info)
+ if (thread_info != exception_thread_info) {
+ /* Save the original current_thread_info. */
+ memcpy(backup_current_thread_info, exception_thread_info, sizeof *thread_info);
memcpy(exception_thread_info, thread_info, sizeof *thread_info);
+ }
kgdb_handle_exception(0, SIGTRAP, 0, regs);
if (thread_info != exception_thread_info)
- memcpy(thread_info, exception_thread_info, sizeof *thread_info);
+ /* Restore current_thread_info lastly. */
+ memcpy(exception_thread_info, backup_current_thread_info, sizeof *thread_info);
return 1;
}
@@ -175,7 +198,7 @@ static int kgdb_iabr_match(struct pt_regs *regs)
return 1;
}
-static int kgdb_dabr_match(struct pt_regs *regs)
+static int kgdb_break_match(struct pt_regs *regs)
{
if (user_mode(regs))
return 0;
@@ -337,7 +360,7 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
/* FP registers 32 -> 63 */
#if defined(CONFIG_FSL_BOOKE) && defined(CONFIG_SPE)
if (current)
- memcpy(mem, current->thread.evr[regno-32],
+ memcpy(mem, &current->thread.evr[regno-32],
dbg_reg_def[regno].size);
#else
/* fp registers not used by kernel, leave zero */
@@ -362,7 +385,7 @@ int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
if (regno >= 32 && regno < 64) {
/* FP registers 32 -> 63 */
#if defined(CONFIG_FSL_BOOKE) && defined(CONFIG_SPE)
- memcpy(current->thread.evr[regno-32], mem,
+ memcpy(&current->thread.evr[regno-32], mem,
dbg_reg_def[regno].size);
#else
/* fp registers not used by kernel, leave zero */
@@ -409,7 +432,6 @@ int kgdb_arch_handle_exception(int vector, int signo, int err_code,
#else
linux_regs->msr |= MSR_SE;
#endif
- kgdb_single_step = 1;
atomic_set(&kgdb_cpu_doing_single_step,
raw_smp_processor_id());
}
@@ -436,7 +458,7 @@ static void *old__debugger;
static void *old__debugger_bpt;
static void *old__debugger_sstep;
static void *old__debugger_iabr_match;
-static void *old__debugger_dabr_match;
+static void *old__debugger_break_match;
static void *old__debugger_fault_handler;
int kgdb_arch_init(void)
@@ -446,7 +468,7 @@ int kgdb_arch_init(void)
old__debugger_bpt = __debugger_bpt;
old__debugger_sstep = __debugger_sstep;
old__debugger_iabr_match = __debugger_iabr_match;
- old__debugger_dabr_match = __debugger_dabr_match;
+ old__debugger_break_match = __debugger_break_match;
old__debugger_fault_handler = __debugger_fault_handler;
__debugger_ipi = kgdb_call_nmi_hook;
@@ -454,7 +476,7 @@ int kgdb_arch_init(void)
__debugger_bpt = kgdb_handle_breakpoint;
__debugger_sstep = kgdb_singlestep;
__debugger_iabr_match = kgdb_iabr_match;
- __debugger_dabr_match = kgdb_dabr_match;
+ __debugger_break_match = kgdb_break_match;
__debugger_fault_handler = kgdb_not_implemented;
return 0;
@@ -467,6 +489,6 @@ void kgdb_arch_exit(void)
__debugger_bpt = old__debugger_bpt;
__debugger_sstep = old__debugger_sstep;
__debugger_iabr_match = old__debugger_iabr_match;
- __debugger_dabr_match = old__debugger_dabr_match;
+ __debugger_break_match = old__debugger_break_match;
__debugger_fault_handler = old__debugger_fault_handler;
}
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index bc47352deb1..2f72af82513 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -32,16 +32,10 @@
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/slab.h>
+#include <asm/code-patching.h>
#include <asm/cacheflush.h>
#include <asm/sstep.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
-
-#ifdef CONFIG_PPC_ADV_DEBUG_REGS
-#define MSR_SINGLESTEP (MSR_DE)
-#else
-#define MSR_SINGLESTEP (MSR_SE)
-#endif
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
@@ -105,19 +99,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
{
- /* We turn off async exceptions to ensure that the single step will
- * be for the instruction we have the kprobe on, if we dont its
- * possible we'd get the single step reported for an exception handler
- * like Decrementer or External Interrupt */
- regs->msr &= ~MSR_EE;
- regs->msr |= MSR_SINGLESTEP;
-#ifdef CONFIG_PPC_ADV_DEBUG_REGS
- regs->msr &= ~MSR_CE;
- mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
-#ifdef CONFIG_PPC_47x
- isync();
-#endif
-#endif
+ enable_single_step(regs);
/*
* On powerpc we should single step on the original
@@ -311,7 +293,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
{
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
@@ -331,7 +313,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
* real return address, and all the rest will point to
* kretprobe_trampoline
*/
- hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
@@ -358,7 +340,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
kretprobe_hash_unlock(current, &flags);
preempt_enable_no_resched();
- hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+ hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
@@ -448,7 +430,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
case KPROBE_HIT_SSDONE:
/*
* We increment the nmissed count for accounting,
- * we can also use npre/npostfault count for accouting
+ * we can also use npre/npostfault count for accounting
* these specific fault cases.
*/
kprobes_inc_nmissed_count(cur);
@@ -510,12 +492,10 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
return ret;
}
-#ifdef CONFIG_PPC64
unsigned long arch_deref_entry_point(void *entry)
{
- return ((func_descr_t *)entry)->entry;
+ return ppc_global_function_entry(entry);
}
-#endif
int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
@@ -527,8 +507,12 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
/* setup return addr to the jprobe handler routine */
regs->nip = arch_deref_entry_point(jp->entry);
#ifdef CONFIG_PPC64
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+ regs->gpr[12] = (unsigned long)jp->entry;
+#else
regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc);
#endif
+#endif
return 1;
}
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index b06bdae0406..33aa4ddf597 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ * Copyright 2010-2011 Freescale Semiconductor, Inc.
*
* Authors:
* Alexander Graf <agraf@suse.de>
@@ -20,6 +21,7 @@
#include <linux/kvm_host.h>
#include <linux/init.h>
+#include <linux/export.h>
#include <linux/kvm_para.h>
#include <linux/slab.h>
#include <linux/of.h>
@@ -28,6 +30,8 @@
#include <asm/sections.h>
#include <asm/cacheflush.h>
#include <asm/disassemble.h>
+#include <asm/ppc-opcode.h>
+#include <asm/epapr_hcalls.h>
#define KVM_MAGIC_PAGE (-4096L)
#define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
@@ -40,41 +44,37 @@
#define KVM_INST_B 0x48000000
#define KVM_INST_B_MASK 0x03ffffff
#define KVM_INST_B_MAX 0x01ffffff
+#define KVM_INST_LI 0x38000000
#define KVM_MASK_RT 0x03e00000
#define KVM_RT_30 0x03c00000
#define KVM_MASK_RB 0x0000f800
#define KVM_INST_MFMSR 0x7c0000a6
-#define KVM_INST_MFSPR_SPRG0 0x7c1042a6
-#define KVM_INST_MFSPR_SPRG1 0x7c1142a6
-#define KVM_INST_MFSPR_SPRG2 0x7c1242a6
-#define KVM_INST_MFSPR_SPRG3 0x7c1342a6
-#define KVM_INST_MFSPR_SRR0 0x7c1a02a6
-#define KVM_INST_MFSPR_SRR1 0x7c1b02a6
-#define KVM_INST_MFSPR_DAR 0x7c1302a6
-#define KVM_INST_MFSPR_DSISR 0x7c1202a6
-
-#define KVM_INST_MTSPR_SPRG0 0x7c1043a6
-#define KVM_INST_MTSPR_SPRG1 0x7c1143a6
-#define KVM_INST_MTSPR_SPRG2 0x7c1243a6
-#define KVM_INST_MTSPR_SPRG3 0x7c1343a6
-#define KVM_INST_MTSPR_SRR0 0x7c1a03a6
-#define KVM_INST_MTSPR_SRR1 0x7c1b03a6
-#define KVM_INST_MTSPR_DAR 0x7c1303a6
-#define KVM_INST_MTSPR_DSISR 0x7c1203a6
+
+#define SPR_FROM 0
+#define SPR_TO 0x100
+
+#define KVM_INST_SPR(sprn, moveto) (0x7c0002a6 | \
+ (((sprn) & 0x1f) << 16) | \
+ (((sprn) & 0x3e0) << 6) | \
+ (moveto))
+
+#define KVM_INST_MFSPR(sprn) KVM_INST_SPR(sprn, SPR_FROM)
+#define KVM_INST_MTSPR(sprn) KVM_INST_SPR(sprn, SPR_TO)
#define KVM_INST_TLBSYNC 0x7c00046c
#define KVM_INST_MTMSRD_L0 0x7c000164
#define KVM_INST_MTMSRD_L1 0x7c010164
#define KVM_INST_MTMSR 0x7c000124
+#define KVM_INST_WRTEE 0x7c000106
#define KVM_INST_WRTEEI_0 0x7c000146
#define KVM_INST_WRTEEI_1 0x7c008146
#define KVM_INST_MTSRIN 0x7c0001e4
static bool kvm_patching_worked = true;
-static char kvm_tmp[1024 * 1024];
+char kvm_tmp[1024 * 1024];
static int kvm_tmp_index;
static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
@@ -131,7 +131,6 @@ static void kvm_patch_ins_b(u32 *inst, int addr)
/* On relocatable kernels interrupts handlers and our code
can be in different regions, so we don't patch them */
- extern u32 __end_interrupts;
if ((ulong)inst < (ulong)&__end_interrupts)
return;
#endif
@@ -270,26 +269,27 @@ static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
#ifdef CONFIG_BOOKE
-extern u32 kvm_emulate_wrteei_branch_offs;
-extern u32 kvm_emulate_wrteei_ee_offs;
-extern u32 kvm_emulate_wrteei_len;
-extern u32 kvm_emulate_wrteei[];
+extern u32 kvm_emulate_wrtee_branch_offs;
+extern u32 kvm_emulate_wrtee_reg_offs;
+extern u32 kvm_emulate_wrtee_orig_ins_offs;
+extern u32 kvm_emulate_wrtee_len;
+extern u32 kvm_emulate_wrtee[];
-static void kvm_patch_ins_wrteei(u32 *inst)
+static void kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one)
{
u32 *p;
int distance_start;
int distance_end;
ulong next_inst;
- p = kvm_alloc(kvm_emulate_wrteei_len * 4);
+ p = kvm_alloc(kvm_emulate_wrtee_len * 4);
if (!p)
return;
/* Find out where we are and put everything there */
distance_start = (ulong)p - (ulong)inst;
next_inst = ((ulong)inst + 4);
- distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_branch_offs];
+ distance_end = next_inst - (ulong)&p[kvm_emulate_wrtee_branch_offs];
/* Make sure we only write valid b instructions */
if (distance_start > KVM_INST_B_MAX) {
@@ -298,10 +298,65 @@ static void kvm_patch_ins_wrteei(u32 *inst)
}
/* Modify the chunk to fit the invocation */
- memcpy(p, kvm_emulate_wrteei, kvm_emulate_wrteei_len * 4);
- p[kvm_emulate_wrteei_branch_offs] |= distance_end & KVM_INST_B_MASK;
- p[kvm_emulate_wrteei_ee_offs] |= (*inst & MSR_EE);
- flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_len * 4);
+ memcpy(p, kvm_emulate_wrtee, kvm_emulate_wrtee_len * 4);
+ p[kvm_emulate_wrtee_branch_offs] |= distance_end & KVM_INST_B_MASK;
+
+ if (imm_one) {
+ p[kvm_emulate_wrtee_reg_offs] =
+ KVM_INST_LI | __PPC_RT(R30) | MSR_EE;
+ } else {
+ /* Make clobbered registers work too */
+ switch (get_rt(rt)) {
+ case 30:
+ kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs],
+ magic_var(scratch2), KVM_RT_30);
+ break;
+ case 31:
+ kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs],
+ magic_var(scratch1), KVM_RT_30);
+ break;
+ default:
+ p[kvm_emulate_wrtee_reg_offs] |= rt;
+ break;
+ }
+ }
+
+ p[kvm_emulate_wrtee_orig_ins_offs] = *inst;
+ flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrtee_len * 4);
+
+ /* Patch the invocation */
+ kvm_patch_ins_b(inst, distance_start);
+}
+
+extern u32 kvm_emulate_wrteei_0_branch_offs;
+extern u32 kvm_emulate_wrteei_0_len;
+extern u32 kvm_emulate_wrteei_0[];
+
+static void kvm_patch_ins_wrteei_0(u32 *inst)
+{
+ u32 *p;
+ int distance_start;
+ int distance_end;
+ ulong next_inst;
+
+ p = kvm_alloc(kvm_emulate_wrteei_0_len * 4);
+ if (!p)
+ return;
+
+ /* Find out where we are and put everything there */
+ distance_start = (ulong)p - (ulong)inst;
+ next_inst = ((ulong)inst + 4);
+ distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_0_branch_offs];
+
+ /* Make sure we only write valid b instructions */
+ if (distance_start > KVM_INST_B_MAX) {
+ kvm_patching_worked = false;
+ return;
+ }
+
+ memcpy(p, kvm_emulate_wrteei_0, kvm_emulate_wrteei_0_len * 4);
+ p[kvm_emulate_wrteei_0_branch_offs] |= distance_end & KVM_INST_B_MASK;
+ flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_0_len * 4);
/* Patch the invocation */
kvm_patch_ins_b(inst, distance_start);
@@ -358,13 +413,13 @@ static void kvm_map_magic_page(void *data)
{
u32 *features = data;
- ulong in[8];
+ ulong in[8] = {0};
ulong out[8];
in[0] = KVM_MAGIC_PAGE;
- in[1] = KVM_MAGIC_PAGE;
+ in[1] = KVM_MAGIC_PAGE | MAGIC_PAGE_FLAG_NOT_MAPPED_NX;
- kvm_hypercall(in, out, HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE);
+ epapr_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE));
*features = out[0];
}
@@ -380,56 +435,191 @@ static void kvm_check_ins(u32 *inst, u32 features)
case KVM_INST_MFMSR:
kvm_patch_ins_ld(inst, magic_var(msr), inst_rt);
break;
- case KVM_INST_MFSPR_SPRG0:
+ case KVM_INST_MFSPR(SPRN_SPRG0):
kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt);
break;
- case KVM_INST_MFSPR_SPRG1:
+ case KVM_INST_MFSPR(SPRN_SPRG1):
kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt);
break;
- case KVM_INST_MFSPR_SPRG2:
+ case KVM_INST_MFSPR(SPRN_SPRG2):
kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt);
break;
- case KVM_INST_MFSPR_SPRG3:
+ case KVM_INST_MFSPR(SPRN_SPRG3):
kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt);
break;
- case KVM_INST_MFSPR_SRR0:
+ case KVM_INST_MFSPR(SPRN_SRR0):
kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt);
break;
- case KVM_INST_MFSPR_SRR1:
+ case KVM_INST_MFSPR(SPRN_SRR1):
kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt);
break;
- case KVM_INST_MFSPR_DAR:
+#ifdef CONFIG_BOOKE
+ case KVM_INST_MFSPR(SPRN_DEAR):
+#else
+ case KVM_INST_MFSPR(SPRN_DAR):
+#endif
kvm_patch_ins_ld(inst, magic_var(dar), inst_rt);
break;
- case KVM_INST_MFSPR_DSISR:
+ case KVM_INST_MFSPR(SPRN_DSISR):
kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt);
break;
+#ifdef CONFIG_PPC_BOOK3E_MMU
+ case KVM_INST_MFSPR(SPRN_MAS0):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_lwz(inst, magic_var(mas0), inst_rt);
+ break;
+ case KVM_INST_MFSPR(SPRN_MAS1):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_lwz(inst, magic_var(mas1), inst_rt);
+ break;
+ case KVM_INST_MFSPR(SPRN_MAS2):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_ld(inst, magic_var(mas2), inst_rt);
+ break;
+ case KVM_INST_MFSPR(SPRN_MAS3):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_lwz(inst, magic_var(mas7_3) + 4, inst_rt);
+ break;
+ case KVM_INST_MFSPR(SPRN_MAS4):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_lwz(inst, magic_var(mas4), inst_rt);
+ break;
+ case KVM_INST_MFSPR(SPRN_MAS6):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_lwz(inst, magic_var(mas6), inst_rt);
+ break;
+ case KVM_INST_MFSPR(SPRN_MAS7):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_lwz(inst, magic_var(mas7_3), inst_rt);
+ break;
+#endif /* CONFIG_PPC_BOOK3E_MMU */
+
+ case KVM_INST_MFSPR(SPRN_SPRG4):
+#ifdef CONFIG_BOOKE
+ case KVM_INST_MFSPR(SPRN_SPRG4R):
+#endif
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_ld(inst, magic_var(sprg4), inst_rt);
+ break;
+ case KVM_INST_MFSPR(SPRN_SPRG5):
+#ifdef CONFIG_BOOKE
+ case KVM_INST_MFSPR(SPRN_SPRG5R):
+#endif
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_ld(inst, magic_var(sprg5), inst_rt);
+ break;
+ case KVM_INST_MFSPR(SPRN_SPRG6):
+#ifdef CONFIG_BOOKE
+ case KVM_INST_MFSPR(SPRN_SPRG6R):
+#endif
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_ld(inst, magic_var(sprg6), inst_rt);
+ break;
+ case KVM_INST_MFSPR(SPRN_SPRG7):
+#ifdef CONFIG_BOOKE
+ case KVM_INST_MFSPR(SPRN_SPRG7R):
+#endif
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_ld(inst, magic_var(sprg7), inst_rt);
+ break;
+
+#ifdef CONFIG_BOOKE
+ case KVM_INST_MFSPR(SPRN_ESR):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_lwz(inst, magic_var(esr), inst_rt);
+ break;
+#endif
+
+ case KVM_INST_MFSPR(SPRN_PIR):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_lwz(inst, magic_var(pir), inst_rt);
+ break;
+
+
/* Stores */
- case KVM_INST_MTSPR_SPRG0:
+ case KVM_INST_MTSPR(SPRN_SPRG0):
kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt);
break;
- case KVM_INST_MTSPR_SPRG1:
+ case KVM_INST_MTSPR(SPRN_SPRG1):
kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt);
break;
- case KVM_INST_MTSPR_SPRG2:
+ case KVM_INST_MTSPR(SPRN_SPRG2):
kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt);
break;
- case KVM_INST_MTSPR_SPRG3:
+ case KVM_INST_MTSPR(SPRN_SPRG3):
kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt);
break;
- case KVM_INST_MTSPR_SRR0:
+ case KVM_INST_MTSPR(SPRN_SRR0):
kvm_patch_ins_std(inst, magic_var(srr0), inst_rt);
break;
- case KVM_INST_MTSPR_SRR1:
+ case KVM_INST_MTSPR(SPRN_SRR1):
kvm_patch_ins_std(inst, magic_var(srr1), inst_rt);
break;
- case KVM_INST_MTSPR_DAR:
+#ifdef CONFIG_BOOKE
+ case KVM_INST_MTSPR(SPRN_DEAR):
+#else
+ case KVM_INST_MTSPR(SPRN_DAR):
+#endif
kvm_patch_ins_std(inst, magic_var(dar), inst_rt);
break;
- case KVM_INST_MTSPR_DSISR:
+ case KVM_INST_MTSPR(SPRN_DSISR):
kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt);
break;
+#ifdef CONFIG_PPC_BOOK3E_MMU
+ case KVM_INST_MTSPR(SPRN_MAS0):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_stw(inst, magic_var(mas0), inst_rt);
+ break;
+ case KVM_INST_MTSPR(SPRN_MAS1):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_stw(inst, magic_var(mas1), inst_rt);
+ break;
+ case KVM_INST_MTSPR(SPRN_MAS2):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_std(inst, magic_var(mas2), inst_rt);
+ break;
+ case KVM_INST_MTSPR(SPRN_MAS3):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_stw(inst, magic_var(mas7_3) + 4, inst_rt);
+ break;
+ case KVM_INST_MTSPR(SPRN_MAS4):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_stw(inst, magic_var(mas4), inst_rt);
+ break;
+ case KVM_INST_MTSPR(SPRN_MAS6):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_stw(inst, magic_var(mas6), inst_rt);
+ break;
+ case KVM_INST_MTSPR(SPRN_MAS7):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_stw(inst, magic_var(mas7_3), inst_rt);
+ break;
+#endif /* CONFIG_PPC_BOOK3E_MMU */
+
+ case KVM_INST_MTSPR(SPRN_SPRG4):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_std(inst, magic_var(sprg4), inst_rt);
+ break;
+ case KVM_INST_MTSPR(SPRN_SPRG5):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_std(inst, magic_var(sprg5), inst_rt);
+ break;
+ case KVM_INST_MTSPR(SPRN_SPRG6):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_std(inst, magic_var(sprg6), inst_rt);
+ break;
+ case KVM_INST_MTSPR(SPRN_SPRG7):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_std(inst, magic_var(sprg7), inst_rt);
+ break;
+
+#ifdef CONFIG_BOOKE
+ case KVM_INST_MTSPR(SPRN_ESR):
+ if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7)
+ kvm_patch_ins_stw(inst, magic_var(esr), inst_rt);
+ break;
+#endif
/* Nops */
case KVM_INST_TLBSYNC:
@@ -444,6 +634,11 @@ static void kvm_check_ins(u32 *inst, u32 features)
case KVM_INST_MTMSRD_L0:
kvm_patch_ins_mtmsr(inst, inst_rt);
break;
+#ifdef CONFIG_BOOKE
+ case KVM_INST_WRTEE:
+ kvm_patch_ins_wrtee(inst, inst_rt, 0);
+ break;
+#endif
}
switch (inst_no_rt & ~KVM_MASK_RB) {
@@ -461,13 +656,19 @@ static void kvm_check_ins(u32 *inst, u32 features)
switch (_inst) {
#ifdef CONFIG_BOOKE
case KVM_INST_WRTEEI_0:
+ kvm_patch_ins_wrteei_0(inst);
+ break;
+
case KVM_INST_WRTEEI_1:
- kvm_patch_ins_wrteei(inst);
+ kvm_patch_ins_wrtee(inst, 0, 1);
break;
#endif
}
}
+extern u32 kvm_template_start[];
+extern u32 kvm_template_end[];
+
static void kvm_use_magic_page(void)
{
u32 *p;
@@ -488,87 +689,32 @@ static void kvm_use_magic_page(void)
start = (void*)_stext;
end = (void*)_etext;
- for (p = start; p < end; p++)
+ /*
+ * Being interrupted in the middle of patching would
+ * be bad for SPRG4-7, which KVM can't keep in sync
+ * with emulated accesses because reads don't trap.
+ */
+ local_irq_disable();
+
+ for (p = start; p < end; p++) {
+ /* Avoid patching the template code */
+ if (p >= kvm_template_start && p < kvm_template_end) {
+ p = kvm_template_end - 1;
+ continue;
+ }
kvm_check_ins(p, features);
+ }
+
+ local_irq_enable();
printk(KERN_INFO "KVM: Live patching for a fast VM %s\n",
kvm_patching_worked ? "worked" : "failed");
}
-unsigned long kvm_hypercall(unsigned long *in,
- unsigned long *out,
- unsigned long nr)
-{
- unsigned long register r0 asm("r0");
- unsigned long register r3 asm("r3") = in[0];
- unsigned long register r4 asm("r4") = in[1];
- unsigned long register r5 asm("r5") = in[2];
- unsigned long register r6 asm("r6") = in[3];
- unsigned long register r7 asm("r7") = in[4];
- unsigned long register r8 asm("r8") = in[5];
- unsigned long register r9 asm("r9") = in[6];
- unsigned long register r10 asm("r10") = in[7];
- unsigned long register r11 asm("r11") = nr;
- unsigned long register r12 asm("r12");
-
- asm volatile("bl kvm_hypercall_start"
- : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
- "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11),
- "=r"(r12)
- : "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7), "r"(r8),
- "r"(r9), "r"(r10), "r"(r11)
- : "memory", "cc", "xer", "ctr", "lr");
-
- out[0] = r4;
- out[1] = r5;
- out[2] = r6;
- out[3] = r7;
- out[4] = r8;
- out[5] = r9;
- out[6] = r10;
- out[7] = r11;
-
- return r3;
-}
-EXPORT_SYMBOL_GPL(kvm_hypercall);
-
-static int kvm_para_setup(void)
-{
- extern u32 kvm_hypercall_start;
- struct device_node *hyper_node;
- u32 *insts;
- int len, i;
-
- hyper_node = of_find_node_by_path("/hypervisor");
- if (!hyper_node)
- return -1;
-
- insts = (u32*)of_get_property(hyper_node, "hcall-instructions", &len);
- if (len % 4)
- return -1;
- if (len > (4 * 4))
- return -1;
-
- for (i = 0; i < (len / 4); i++)
- kvm_patch_ins(&(&kvm_hypercall_start)[i], insts[i]);
-
- return 0;
-}
-
static __init void kvm_free_tmp(void)
{
- unsigned long start, end;
-
- start = (ulong)&kvm_tmp[kvm_tmp_index + (PAGE_SIZE - 1)] & PAGE_MASK;
- end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK;
-
- /* Free the tmp space we don't need */
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- }
+ free_reserved_area(&kvm_tmp[kvm_tmp_index],
+ &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL);
}
static int __init kvm_guest_init(void)
@@ -576,7 +722,7 @@ static int __init kvm_guest_init(void)
if (!kvm_para_available())
goto free_tmp;
- if (kvm_para_setup())
+ if (!epapr_paravirt_enabled)
goto free_tmp;
if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index f2b1b2523e6..e100ff324a8 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -13,6 +13,7 @@
* Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright SUSE Linux Products GmbH 2010
+ * Copyright 2010-2011 Freescale Semiconductor, Inc.
*
* Authors: Alexander Graf <agraf@suse.de>
*/
@@ -23,16 +24,6 @@
#include <asm/page.h>
#include <asm/asm-offsets.h>
-/* Hypercall entry point. Will be patched with device tree instructions. */
-
-.global kvm_hypercall_start
-kvm_hypercall_start:
- li r3, -1
- nop
- nop
- nop
- blr
-
#define KVM_MAGIC_PAGE (-4096)
#ifdef CONFIG_64BIT
@@ -65,6 +56,9 @@ kvm_hypercall_start:
shared->critical == r1 and r2 is always != r1 */ \
STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0);
+.global kvm_template_start
+kvm_template_start:
+
.global kvm_emulate_mtmsrd
kvm_emulate_mtmsrd:
@@ -128,7 +122,7 @@ kvm_emulate_mtmsrd_len:
.long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4
-#define MSR_SAFE_BITS (MSR_EE | MSR_CE | MSR_ME | MSR_RI)
+#define MSR_SAFE_BITS (MSR_EE | MSR_RI)
#define MSR_CRITICAL_BITS ~MSR_SAFE_BITS
.global kvm_emulate_mtmsr
@@ -167,6 +161,9 @@ maybe_stay_in_guest:
kvm_emulate_mtmsr_reg2:
ori r30, r0, 0
+ /* Put MSR into magic page because we don't call mtmsr */
+ STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
/* Check if we have to fetch an interrupt */
lwz r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
cmpwi r31, 0
@@ -174,15 +171,10 @@ kvm_emulate_mtmsr_reg2:
/* Check if we may trigger an interrupt */
andi. r31, r30, MSR_EE
- beq no_mtmsr
-
- b do_mtmsr
+ bne do_mtmsr
no_mtmsr:
- /* Put MSR into magic page because we don't call mtmsr */
- STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
-
SCRATCH_RESTORE
/* Go back to caller */
@@ -210,24 +202,80 @@ kvm_emulate_mtmsr_orig_ins_offs:
kvm_emulate_mtmsr_len:
.long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4
+/* also used for wrteei 1 */
+.global kvm_emulate_wrtee
+kvm_emulate_wrtee:
+
+ SCRATCH_SAVE
+ /* Fetch old MSR in r31 */
+ LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
-.global kvm_emulate_wrteei
-kvm_emulate_wrteei:
+ /* Insert new MSR[EE] */
+kvm_emulate_wrtee_reg:
+ ori r30, r0, 0
+ rlwimi r31, r30, 0, MSR_EE
+
+ /*
+ * If MSR[EE] is now set, check for a pending interrupt.
+ * We could skip this if MSR[EE] was already on, but that
+ * should be rare, so don't bother.
+ */
+ andi. r30, r30, MSR_EE
+
+ /* Put MSR into magic page because we don't call wrtee */
+ STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+ beq no_wrtee
+
+ /* Check if we have to fetch an interrupt */
+ lwz r30, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
+ cmpwi r30, 0
+ bne do_wrtee
+
+no_wrtee:
+ SCRATCH_RESTORE
+
+ /* Go back to caller */
+kvm_emulate_wrtee_branch:
+ b .
+
+do_wrtee:
+ SCRATCH_RESTORE
+
+ /* Just fire off the wrtee if it's critical */
+kvm_emulate_wrtee_orig_ins:
+ wrtee r0
+
+ b kvm_emulate_wrtee_branch
+
+kvm_emulate_wrtee_end:
+
+.global kvm_emulate_wrtee_branch_offs
+kvm_emulate_wrtee_branch_offs:
+ .long (kvm_emulate_wrtee_branch - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrtee_reg_offs
+kvm_emulate_wrtee_reg_offs:
+ .long (kvm_emulate_wrtee_reg - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrtee_orig_ins_offs
+kvm_emulate_wrtee_orig_ins_offs:
+ .long (kvm_emulate_wrtee_orig_ins - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrtee_len
+kvm_emulate_wrtee_len:
+ .long (kvm_emulate_wrtee_end - kvm_emulate_wrtee) / 4
+
+.global kvm_emulate_wrteei_0
+kvm_emulate_wrteei_0:
SCRATCH_SAVE
/* Fetch old MSR in r31 */
LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
/* Remove MSR_EE from old MSR */
- li r30, 0
- ori r30, r30, MSR_EE
- andc r31, r31, r30
-
- /* OR new MSR_EE onto the old MSR */
-kvm_emulate_wrteei_ee:
- ori r31, r31, 0
+ rlwinm r31, r31, 0, ~MSR_EE
/* Write new MSR value back */
STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
@@ -235,22 +283,17 @@ kvm_emulate_wrteei_ee:
SCRATCH_RESTORE
/* Go back to caller */
-kvm_emulate_wrteei_branch:
+kvm_emulate_wrteei_0_branch:
b .
-kvm_emulate_wrteei_end:
-
-.global kvm_emulate_wrteei_branch_offs
-kvm_emulate_wrteei_branch_offs:
- .long (kvm_emulate_wrteei_branch - kvm_emulate_wrteei) / 4
+kvm_emulate_wrteei_0_end:
-.global kvm_emulate_wrteei_ee_offs
-kvm_emulate_wrteei_ee_offs:
- .long (kvm_emulate_wrteei_ee - kvm_emulate_wrteei) / 4
-
-.global kvm_emulate_wrteei_len
-kvm_emulate_wrteei_len:
- .long (kvm_emulate_wrteei_end - kvm_emulate_wrteei) / 4
+.global kvm_emulate_wrteei_0_branch_offs
+kvm_emulate_wrteei_0_branch_offs:
+ .long (kvm_emulate_wrteei_0_branch - kvm_emulate_wrteei_0) / 4
+.global kvm_emulate_wrteei_0_len
+kvm_emulate_wrteei_0_len:
+ .long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4
.global kvm_emulate_mtsrin
kvm_emulate_mtsrin:
@@ -300,3 +343,6 @@ kvm_emulate_mtsrin_orig_ins_offs:
.global kvm_emulate_mtsrin_len
kvm_emulate_mtsrin_len:
.long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4
+
+.global kvm_template_end
+kvm_template_end:
diff --git a/arch/powerpc/kernel/l2cr_6xx.S b/arch/powerpc/kernel/l2cr_6xx.S
index 2a2f3c3f6d8..97ec8557f97 100644
--- a/arch/powerpc/kernel/l2cr_6xx.S
+++ b/arch/powerpc/kernel/l2cr_6xx.S
@@ -151,7 +151,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
/**** Might be a good idea to set L2DO here - to prevent instructions
from getting into the cache. But since we invalidate
the next time we enable the cache it doesn't really matter.
- Don't do this unless you accomodate all processor variations.
+ Don't do this unless you accommodate all processor variations.
The bit moved on the 7450.....
****/
diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c
index c834757bebc..936258881c9 100644
--- a/arch/powerpc/kernel/legacy_serial.c
+++ b/arch/powerpc/kernel/legacy_serial.c
@@ -6,6 +6,7 @@
#include <linux/pci.h>
#include <linux/of_address.h>
#include <linux/of_device.h>
+#include <linux/serial_reg.h>
#include <asm/io.h>
#include <asm/mmu.h>
#include <asm/prom.h>
@@ -34,7 +35,7 @@ static struct legacy_serial_info {
phys_addr_t taddr;
} legacy_serial_infos[MAX_LEGACY_SERIAL_PORTS];
-static struct __initdata of_device_id legacy_serial_parents[] = {
+static struct of_device_id legacy_serial_parents[] __initdata = {
{.type = "soc",},
{.type = "tsi-bridge",},
{.type = "opb", },
@@ -47,13 +48,35 @@ static struct __initdata of_device_id legacy_serial_parents[] = {
static unsigned int legacy_serial_count;
static int legacy_serial_console = -1;
+static const upf_t legacy_port_flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST |
+ UPF_SHARE_IRQ | UPF_FIXED_PORT;
+
+static unsigned int tsi_serial_in(struct uart_port *p, int offset)
+{
+ unsigned int tmp;
+ offset = offset << p->regshift;
+ if (offset == UART_IIR) {
+ tmp = readl(p->membase + (UART_IIR & ~3));
+ return (tmp >> 16) & 0xff; /* UART_IIR % 4 == 2 */
+ } else
+ return readb(p->membase + offset);
+}
+
+static void tsi_serial_out(struct uart_port *p, int offset, int value)
+{
+ offset = offset << p->regshift;
+ if (!((offset == UART_IER) && (value & UART_IER_UUE)))
+ writeb(value, p->membase + offset);
+}
+
static int __init add_legacy_port(struct device_node *np, int want_index,
int iotype, phys_addr_t base,
phys_addr_t taddr, unsigned long irq,
upf_t flags, int irq_check_parent)
{
- const __be32 *clk, *spd;
+ const __be32 *clk, *spd, *rs;
u32 clock = BASE_BAUD * 16;
+ u32 shift = 0;
int index;
/* get clock freq. if present */
@@ -64,6 +87,11 @@ static int __init add_legacy_port(struct device_node *np, int want_index,
/* get default speed if present */
spd = of_get_property(np, "current-speed", NULL);
+ /* get register shift if present */
+ rs = of_get_property(np, "reg-shift", NULL);
+ if (rs && *rs)
+ shift = be32_to_cpup(rs);
+
/* If we have a location index, then try to use it */
if (want_index >= 0 && want_index < MAX_LEGACY_SERIAL_PORTS)
index = want_index;
@@ -80,7 +108,7 @@ static int __init add_legacy_port(struct device_node *np, int want_index,
legacy_serial_count = index + 1;
/* Check if there is a port who already claimed our slot */
- if (legacy_serial_infos[index].np != 0) {
+ if (legacy_serial_infos[index].np != NULL) {
/* if we still have some room, move it, else override */
if (legacy_serial_count < MAX_LEGACY_SERIAL_PORTS) {
printk(KERN_DEBUG "Moved legacy port %d -> %d\n",
@@ -102,16 +130,23 @@ static int __init add_legacy_port(struct device_node *np, int want_index,
legacy_serial_ports[index].iobase = base;
else
legacy_serial_ports[index].mapbase = base;
+
legacy_serial_ports[index].iotype = iotype;
legacy_serial_ports[index].uartclk = clock;
legacy_serial_ports[index].irq = irq;
legacy_serial_ports[index].flags = flags;
+ legacy_serial_ports[index].regshift = shift;
legacy_serial_infos[index].taddr = taddr;
legacy_serial_infos[index].np = of_node_get(np);
legacy_serial_infos[index].clock = clock;
legacy_serial_infos[index].speed = spd ? be32_to_cpup(spd) : 0;
legacy_serial_infos[index].irq_check_parent = irq_check_parent;
+ if (iotype == UPIO_TSI) {
+ legacy_serial_ports[index].serial_in = tsi_serial_in;
+ legacy_serial_ports[index].serial_out = tsi_serial_out;
+ }
+
printk(KERN_DEBUG "Found legacy serial port %d for %s\n",
index, np->full_name);
printk(KERN_DEBUG " %s=%llx, taddr=%llx, irq=%lx, clk=%d, speed=%d\n",
@@ -127,9 +162,7 @@ static int __init add_legacy_soc_port(struct device_node *np,
struct device_node *soc_dev)
{
u64 addr;
- const u32 *addrp;
- upf_t flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_SHARE_IRQ
- | UPF_FIXED_PORT;
+ const __be32 *addrp;
struct device_node *tsi = of_get_parent(np);
/* We only support ports that have a clock frequency properly
@@ -138,9 +171,8 @@ static int __init add_legacy_soc_port(struct device_node *np,
if (of_get_property(np, "clock-frequency", NULL) == NULL)
return -1;
- /* if reg-shift or offset, don't try to use it */
- if ((of_get_property(np, "reg-shift", NULL) != NULL) ||
- (of_get_property(np, "reg-offset", NULL) != NULL))
+ /* if reg-offset don't try to use it */
+ if ((of_get_property(np, "reg-offset", NULL) != NULL))
return -1;
/* if rtas uses this device, don't try to use it as well */
@@ -160,9 +192,11 @@ static int __init add_legacy_soc_port(struct device_node *np,
* IO port value. It will be fixed up later along with the irq
*/
if (tsi && !strcmp(tsi->type, "tsi-bridge"))
- return add_legacy_port(np, -1, UPIO_TSI, addr, addr, NO_IRQ, flags, 0);
+ return add_legacy_port(np, -1, UPIO_TSI, addr, addr,
+ NO_IRQ, legacy_port_flags, 0);
else
- return add_legacy_port(np, -1, UPIO_MEM, addr, addr, NO_IRQ, flags, 0);
+ return add_legacy_port(np, -1, UPIO_MEM, addr, addr,
+ NO_IRQ, legacy_port_flags, 0);
}
static int __init add_legacy_isa_port(struct device_node *np,
@@ -196,14 +230,19 @@ static int __init add_legacy_isa_port(struct device_node *np,
/* Translate ISA address. If it fails, we still register the port
* with no translated address so that it can be picked up as an IO
* port later by the serial driver
+ *
+ * Note: Don't even try on P8 lpc, we know it's not directly mapped
*/
- taddr = of_translate_address(np, reg);
- if (taddr == OF_BAD_ADDR)
+ if (!of_device_is_compatible(isa_brg, "ibm,power8-lpc")) {
+ taddr = of_translate_address(np, reg);
+ if (taddr == OF_BAD_ADDR)
+ taddr = 0;
+ } else
taddr = 0;
/* Add port, irq will be dealt with later */
- return add_legacy_port(np, index, UPIO_PORT, be32_to_cpu(reg[1]), taddr,
- NO_IRQ, UPF_BOOT_AUTOCONF, 0);
+ return add_legacy_port(np, index, UPIO_PORT, be32_to_cpu(reg[1]),
+ taddr, NO_IRQ, legacy_port_flags, 0);
}
@@ -212,7 +251,7 @@ static int __init add_legacy_pci_port(struct device_node *np,
struct device_node *pci_dev)
{
u64 addr, base;
- const u32 *addrp;
+ const __be32 *addrp;
unsigned int flags;
int iotype, index = -1, lindex = 0;
@@ -245,7 +284,7 @@ static int __init add_legacy_pci_port(struct device_node *np,
if (iotype == UPIO_MEM)
base = addr;
else
- base = addrp[2];
+ base = of_read_number(&addrp[2], 1);
/* Try to guess an index... If we have subdevices of the pci dev,
* we get to their "reg" property
@@ -276,25 +315,40 @@ static int __init add_legacy_pci_port(struct device_node *np,
* IO port value. It will be fixed up later along with the irq
*/
return add_legacy_port(np, index, iotype, base, addr, NO_IRQ,
- UPF_BOOT_AUTOCONF, np != pci_dev);
+ legacy_port_flags, np != pci_dev);
}
#endif
static void __init setup_legacy_serial_console(int console)
{
- struct legacy_serial_info *info =
- &legacy_serial_infos[console];
+ struct legacy_serial_info *info = &legacy_serial_infos[console];
+ struct plat_serial8250_port *port = &legacy_serial_ports[console];
void __iomem *addr;
+ unsigned int stride;
- if (info->taddr == 0)
- return;
- addr = ioremap(info->taddr, 0x1000);
- if (addr == NULL)
- return;
+ stride = 1 << port->regshift;
+
+ /* Check if a translated MMIO address has been found */
+ if (info->taddr) {
+ addr = ioremap(info->taddr, 0x1000);
+ if (addr == NULL)
+ return;
+ udbg_uart_init_mmio(addr, stride);
+ } else {
+ /* Check if it's PIO and we support untranslated PIO */
+ if (port->iotype == UPIO_PORT && isa_io_special)
+ udbg_uart_init_pio(port->iobase, stride);
+ else
+ return;
+ }
+
+ /* Try to query the current speed */
if (info->speed == 0)
- info->speed = udbg_probe_uart_speed(addr, info->clock);
+ info->speed = udbg_probe_uart_speed(info->clock);
+
+ /* Set it up */
DBG("default console speed = %d\n", info->speed);
- udbg_init_uart(addr, info->speed, info->clock);
+ udbg_uart_setup(info->speed, info->clock);
}
/*
@@ -330,9 +384,11 @@ void __init find_legacy_serial_ports(void)
if (!parent)
continue;
if (of_match_node(legacy_serial_parents, parent) != NULL) {
- index = add_legacy_soc_port(np, np);
- if (index >= 0 && np == stdout)
- legacy_serial_console = index;
+ if (of_device_is_available(np)) {
+ index = add_legacy_soc_port(np, np);
+ if (index >= 0 && np == stdout)
+ legacy_serial_console = index;
+ }
}
of_node_put(parent);
}
@@ -340,10 +396,13 @@ void __init find_legacy_serial_ports(void)
/* Next, fill our array with ISA ports */
for_each_node_by_type(np, "serial") {
struct device_node *isa = of_get_parent(np);
- if (isa && !strcmp(isa->name, "isa")) {
- index = add_legacy_isa_port(np, isa);
- if (index >= 0 && np == stdout)
- legacy_serial_console = index;
+ if (isa && (!strcmp(isa->name, "isa") ||
+ !strcmp(isa->name, "lpc"))) {
+ if (of_device_is_available(np)) {
+ index = add_legacy_isa_port(np, isa);
+ if (index >= 0 && np == stdout)
+ legacy_serial_console = index;
+ }
}
of_node_put(isa);
}
@@ -360,7 +419,7 @@ void __init find_legacy_serial_ports(void)
of_node_put(parent);
continue;
}
- /* Check for known pciclass, and also check wether we have
+ /* Check for known pciclass, and also check whether we have
* a device with child nodes for ports or not
*/
if (of_device_is_compatible(np, "pciclass,0700") ||
@@ -414,6 +473,11 @@ static void __init fixup_port_irq(int index,
return;
port->irq = virq;
+
+#ifdef CONFIG_SERIAL_8250_FSL
+ if (of_device_is_compatible(np, "fsl,ns16550"))
+ port->handle_irq = fsl8250_handle_irq;
+#endif
}
static void __init fixup_port_pio(int index,
diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
deleted file mode 100644
index 16468362ad5..00000000000
--- a/arch/powerpc/kernel/lparcfg.c
+++ /dev/null
@@ -1,816 +0,0 @@
-/*
- * PowerPC64 LPAR Configuration Information Driver
- *
- * Dave Engebretsen engebret@us.ibm.com
- * Copyright (c) 2003 Dave Engebretsen
- * Will Schmidt willschm@us.ibm.com
- * SPLPAR updates, Copyright (c) 2003 Will Schmidt IBM Corporation.
- * seq_file updates, Copyright (c) 2004 Will Schmidt IBM Corporation.
- * Nathan Lynch nathanl@austin.ibm.com
- * Added lparcfg_write, Copyright (C) 2004 Nathan Lynch IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * This driver creates a proc file at /proc/ppc64/lparcfg which contains
- * keyword - value pairs that specify the configuration of the partition.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/proc_fs.h>
-#include <linux/init.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <asm/uaccess.h>
-#include <asm/iseries/hv_lp_config.h>
-#include <asm/lppaca.h>
-#include <asm/hvcall.h>
-#include <asm/firmware.h>
-#include <asm/rtas.h>
-#include <asm/system.h>
-#include <asm/time.h>
-#include <asm/prom.h>
-#include <asm/vdso_datapage.h>
-#include <asm/vio.h>
-#include <asm/mmu.h>
-
-#define MODULE_VERS "1.9"
-#define MODULE_NAME "lparcfg"
-
-/* #define LPARCFG_DEBUG */
-
-static struct proc_dir_entry *proc_ppc64_lparcfg;
-
-/*
- * Track sum of all purrs across all processors. This is used to further
- * calculate usage values by different applications
- */
-static unsigned long get_purr(void)
-{
- unsigned long sum_purr = 0;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- if (firmware_has_feature(FW_FEATURE_ISERIES))
- sum_purr += lppaca_of(cpu).emulated_time_base;
- else {
- struct cpu_usage *cu;
-
- cu = &per_cpu(cpu_usage_array, cpu);
- sum_purr += cu->current_tb;
- }
- }
- return sum_purr;
-}
-
-#ifdef CONFIG_PPC_ISERIES
-
-/*
- * Methods used to fetch LPAR data when running on an iSeries platform.
- */
-static int iseries_lparcfg_data(struct seq_file *m, void *v)
-{
- unsigned long pool_id;
- int shared, entitled_capacity, max_entitled_capacity;
- int processors, max_processors;
- unsigned long purr = get_purr();
-
- shared = (int)(local_paca->lppaca_ptr->shared_proc);
-
- seq_printf(m, "system_active_processors=%d\n",
- (int)HvLpConfig_getSystemPhysicalProcessors());
-
- seq_printf(m, "system_potential_processors=%d\n",
- (int)HvLpConfig_getSystemPhysicalProcessors());
-
- processors = (int)HvLpConfig_getPhysicalProcessors();
- seq_printf(m, "partition_active_processors=%d\n", processors);
-
- max_processors = (int)HvLpConfig_getMaxPhysicalProcessors();
- seq_printf(m, "partition_potential_processors=%d\n", max_processors);
-
- if (shared) {
- entitled_capacity = HvLpConfig_getSharedProcUnits();
- max_entitled_capacity = HvLpConfig_getMaxSharedProcUnits();
- } else {
- entitled_capacity = processors * 100;
- max_entitled_capacity = max_processors * 100;
- }
- seq_printf(m, "partition_entitled_capacity=%d\n", entitled_capacity);
-
- seq_printf(m, "partition_max_entitled_capacity=%d\n",
- max_entitled_capacity);
-
- if (shared) {
- pool_id = HvLpConfig_getSharedPoolIndex();
- seq_printf(m, "pool=%d\n", (int)pool_id);
- seq_printf(m, "pool_capacity=%d\n",
- (int)(HvLpConfig_getNumProcsInSharedPool(pool_id) *
- 100));
- seq_printf(m, "purr=%ld\n", purr);
- }
-
- seq_printf(m, "shared_processor_mode=%d\n", shared);
-
- return 0;
-}
-
-#else /* CONFIG_PPC_ISERIES */
-
-static int iseries_lparcfg_data(struct seq_file *m, void *v)
-{
- return 0;
-}
-
-#endif /* CONFIG_PPC_ISERIES */
-
-#ifdef CONFIG_PPC_PSERIES
-/*
- * Methods used to fetch LPAR data when running on a pSeries platform.
- */
-/**
- * h_get_mpp
- * H_GET_MPP hcall returns info in 7 parms
- */
-int h_get_mpp(struct hvcall_mpp_data *mpp_data)
-{
- int rc;
- unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
-
- rc = plpar_hcall9(H_GET_MPP, retbuf);
-
- mpp_data->entitled_mem = retbuf[0];
- mpp_data->mapped_mem = retbuf[1];
-
- mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
- mpp_data->pool_num = retbuf[2] & 0xffff;
-
- mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
- mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
- mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff;
-
- mpp_data->pool_size = retbuf[4];
- mpp_data->loan_request = retbuf[5];
- mpp_data->backing_mem = retbuf[6];
-
- return rc;
-}
-EXPORT_SYMBOL(h_get_mpp);
-
-struct hvcall_ppp_data {
- u64 entitlement;
- u64 unallocated_entitlement;
- u16 group_num;
- u16 pool_num;
- u8 capped;
- u8 weight;
- u8 unallocated_weight;
- u16 active_procs_in_pool;
- u16 active_system_procs;
- u16 phys_platform_procs;
- u32 max_proc_cap_avail;
- u32 entitled_proc_cap_avail;
-};
-
-/*
- * H_GET_PPP hcall returns info in 4 parms.
- * entitled_capacity,unallocated_capacity,
- * aggregation, resource_capability).
- *
- * R4 = Entitled Processor Capacity Percentage.
- * R5 = Unallocated Processor Capacity Percentage.
- * R6 (AABBCCDDEEFFGGHH).
- * XXXX - reserved (0)
- * XXXX - reserved (0)
- * XXXX - Group Number
- * XXXX - Pool Number.
- * R7 (IIJJKKLLMMNNOOPP).
- * XX - reserved. (0)
- * XX - bit 0-6 reserved (0). bit 7 is Capped indicator.
- * XX - variable processor Capacity Weight
- * XX - Unallocated Variable Processor Capacity Weight.
- * XXXX - Active processors in Physical Processor Pool.
- * XXXX - Processors active on platform.
- * R8 (QQQQRRRRRRSSSSSS). if ibm,partition-performance-parameters-level >= 1
- * XXXX - Physical platform procs allocated to virtualization.
- * XXXXXX - Max procs capacity % available to the partitions pool.
- * XXXXXX - Entitled procs capacity % available to the
- * partitions pool.
- */
-static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
-{
- unsigned long rc;
- unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
-
- rc = plpar_hcall9(H_GET_PPP, retbuf);
-
- ppp_data->entitlement = retbuf[0];
- ppp_data->unallocated_entitlement = retbuf[1];
-
- ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
- ppp_data->pool_num = retbuf[2] & 0xffff;
-
- ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01;
- ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff;
- ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff;
- ppp_data->active_procs_in_pool = (retbuf[3] >> 2 * 8) & 0xffff;
- ppp_data->active_system_procs = retbuf[3] & 0xffff;
-
- ppp_data->phys_platform_procs = retbuf[4] >> 6 * 8;
- ppp_data->max_proc_cap_avail = (retbuf[4] >> 3 * 8) & 0xffffff;
- ppp_data->entitled_proc_cap_avail = retbuf[4] & 0xffffff;
-
- return rc;
-}
-
-static unsigned h_pic(unsigned long *pool_idle_time,
- unsigned long *num_procs)
-{
- unsigned long rc;
- unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
- rc = plpar_hcall(H_PIC, retbuf);
-
- *pool_idle_time = retbuf[0];
- *num_procs = retbuf[1];
-
- return rc;
-}
-
-/*
- * parse_ppp_data
- * Parse out the data returned from h_get_ppp and h_pic
- */
-static void parse_ppp_data(struct seq_file *m)
-{
- struct hvcall_ppp_data ppp_data;
- struct device_node *root;
- const int *perf_level;
- int rc;
-
- rc = h_get_ppp(&ppp_data);
- if (rc)
- return;
-
- seq_printf(m, "partition_entitled_capacity=%lld\n",
- ppp_data.entitlement);
- seq_printf(m, "group=%d\n", ppp_data.group_num);
- seq_printf(m, "system_active_processors=%d\n",
- ppp_data.active_system_procs);
-
- /* pool related entries are apropriate for shared configs */
- if (lppaca_of(0).shared_proc) {
- unsigned long pool_idle_time, pool_procs;
-
- seq_printf(m, "pool=%d\n", ppp_data.pool_num);
-
- /* report pool_capacity in percentage */
- seq_printf(m, "pool_capacity=%d\n",
- ppp_data.active_procs_in_pool * 100);
-
- h_pic(&pool_idle_time, &pool_procs);
- seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time);
- seq_printf(m, "pool_num_procs=%ld\n", pool_procs);
- }
-
- seq_printf(m, "unallocated_capacity_weight=%d\n",
- ppp_data.unallocated_weight);
- seq_printf(m, "capacity_weight=%d\n", ppp_data.weight);
- seq_printf(m, "capped=%d\n", ppp_data.capped);
- seq_printf(m, "unallocated_capacity=%lld\n",
- ppp_data.unallocated_entitlement);
-
- /* The last bits of information returned from h_get_ppp are only
- * valid if the ibm,partition-performance-parameters-level
- * property is >= 1.
- */
- root = of_find_node_by_path("/");
- if (root) {
- perf_level = of_get_property(root,
- "ibm,partition-performance-parameters-level",
- NULL);
- if (perf_level && (*perf_level >= 1)) {
- seq_printf(m,
- "physical_procs_allocated_to_virtualization=%d\n",
- ppp_data.phys_platform_procs);
- seq_printf(m, "max_proc_capacity_available=%d\n",
- ppp_data.max_proc_cap_avail);
- seq_printf(m, "entitled_proc_capacity_available=%d\n",
- ppp_data.entitled_proc_cap_avail);
- }
-
- of_node_put(root);
- }
-}
-
-/**
- * parse_mpp_data
- * Parse out data returned from h_get_mpp
- */
-static void parse_mpp_data(struct seq_file *m)
-{
- struct hvcall_mpp_data mpp_data;
- int rc;
-
- rc = h_get_mpp(&mpp_data);
- if (rc)
- return;
-
- seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem);
-
- if (mpp_data.mapped_mem != -1)
- seq_printf(m, "mapped_entitled_memory=%ld\n",
- mpp_data.mapped_mem);
-
- seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num);
- seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num);
-
- seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight);
- seq_printf(m, "unallocated_entitled_memory_weight=%d\n",
- mpp_data.unallocated_mem_weight);
- seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n",
- mpp_data.unallocated_entitlement);
-
- if (mpp_data.pool_size != -1)
- seq_printf(m, "entitled_memory_pool_size=%ld bytes\n",
- mpp_data.pool_size);
-
- seq_printf(m, "entitled_memory_loan_request=%ld\n",
- mpp_data.loan_request);
-
- seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem);
-}
-
-#define SPLPAR_CHARACTERISTICS_TOKEN 20
-#define SPLPAR_MAXLENGTH 1026*(sizeof(char))
-
-/*
- * parse_system_parameter_string()
- * Retrieve the potential_processors, max_entitled_capacity and friends
- * through the get-system-parameter rtas call. Replace keyword strings as
- * necessary.
- */
-static void parse_system_parameter_string(struct seq_file *m)
-{
- int call_status;
-
- unsigned char *local_buffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL);
- if (!local_buffer) {
- printk(KERN_ERR "%s %s kmalloc failure at line %d\n",
- __FILE__, __func__, __LINE__);
- return;
- }
-
- spin_lock(&rtas_data_buf_lock);
- memset(rtas_data_buf, 0, SPLPAR_MAXLENGTH);
- call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
- NULL,
- SPLPAR_CHARACTERISTICS_TOKEN,
- __pa(rtas_data_buf),
- RTAS_DATA_BUF_SIZE);
- memcpy(local_buffer, rtas_data_buf, SPLPAR_MAXLENGTH);
- spin_unlock(&rtas_data_buf_lock);
-
- if (call_status != 0) {
- printk(KERN_INFO
- "%s %s Error calling get-system-parameter (0x%x)\n",
- __FILE__, __func__, call_status);
- } else {
- int splpar_strlen;
- int idx, w_idx;
- char *workbuffer = kzalloc(SPLPAR_MAXLENGTH, GFP_KERNEL);
- if (!workbuffer) {
- printk(KERN_ERR "%s %s kmalloc failure at line %d\n",
- __FILE__, __func__, __LINE__);
- kfree(local_buffer);
- return;
- }
-#ifdef LPARCFG_DEBUG
- printk(KERN_INFO "success calling get-system-parameter\n");
-#endif
- splpar_strlen = local_buffer[0] * 256 + local_buffer[1];
- local_buffer += 2; /* step over strlen value */
-
- w_idx = 0;
- idx = 0;
- while ((*local_buffer) && (idx < splpar_strlen)) {
- workbuffer[w_idx++] = local_buffer[idx++];
- if ((local_buffer[idx] == ',')
- || (local_buffer[idx] == '\0')) {
- workbuffer[w_idx] = '\0';
- if (w_idx) {
- /* avoid the empty string */
- seq_printf(m, "%s\n", workbuffer);
- }
- memset(workbuffer, 0, SPLPAR_MAXLENGTH);
- idx++; /* skip the comma */
- w_idx = 0;
- } else if (local_buffer[idx] == '=') {
- /* code here to replace workbuffer contents
- with different keyword strings */
- if (0 == strcmp(workbuffer, "MaxEntCap")) {
- strcpy(workbuffer,
- "partition_max_entitled_capacity");
- w_idx = strlen(workbuffer);
- }
- if (0 == strcmp(workbuffer, "MaxPlatProcs")) {
- strcpy(workbuffer,
- "system_potential_processors");
- w_idx = strlen(workbuffer);
- }
- }
- }
- kfree(workbuffer);
- local_buffer -= 2; /* back up over strlen value */
- }
- kfree(local_buffer);
-}
-
-/* Return the number of processors in the system.
- * This function reads through the device tree and counts
- * the virtual processors, this does not include threads.
- */
-static int lparcfg_count_active_processors(void)
-{
- struct device_node *cpus_dn = NULL;
- int count = 0;
-
- while ((cpus_dn = of_find_node_by_type(cpus_dn, "cpu"))) {
-#ifdef LPARCFG_DEBUG
- printk(KERN_ERR "cpus_dn %p\n", cpus_dn);
-#endif
- count++;
- }
- return count;
-}
-
-static void pseries_cmo_data(struct seq_file *m)
-{
- int cpu;
- unsigned long cmo_faults = 0;
- unsigned long cmo_fault_time = 0;
-
- seq_printf(m, "cmo_enabled=%d\n", firmware_has_feature(FW_FEATURE_CMO));
-
- if (!firmware_has_feature(FW_FEATURE_CMO))
- return;
-
- for_each_possible_cpu(cpu) {
- cmo_faults += lppaca_of(cpu).cmo_faults;
- cmo_fault_time += lppaca_of(cpu).cmo_fault_time;
- }
-
- seq_printf(m, "cmo_faults=%lu\n", cmo_faults);
- seq_printf(m, "cmo_fault_time_usec=%lu\n",
- cmo_fault_time / tb_ticks_per_usec);
- seq_printf(m, "cmo_primary_psp=%d\n", cmo_get_primary_psp());
- seq_printf(m, "cmo_secondary_psp=%d\n", cmo_get_secondary_psp());
- seq_printf(m, "cmo_page_size=%lu\n", cmo_get_page_size());
-}
-
-static void splpar_dispatch_data(struct seq_file *m)
-{
- int cpu;
- unsigned long dispatches = 0;
- unsigned long dispatch_dispersions = 0;
-
- for_each_possible_cpu(cpu) {
- dispatches += lppaca_of(cpu).yield_count;
- dispatch_dispersions += lppaca_of(cpu).dispersion_count;
- }
-
- seq_printf(m, "dispatches=%lu\n", dispatches);
- seq_printf(m, "dispatch_dispersions=%lu\n", dispatch_dispersions);
-}
-
-static void parse_em_data(struct seq_file *m)
-{
- unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
- if (plpar_hcall(H_GET_EM_PARMS, retbuf) == H_SUCCESS)
- seq_printf(m, "power_mode_data=%016lx\n", retbuf[0]);
-}
-
-static int pseries_lparcfg_data(struct seq_file *m, void *v)
-{
- int partition_potential_processors;
- int partition_active_processors;
- struct device_node *rtas_node;
- const int *lrdrp = NULL;
-
- rtas_node = of_find_node_by_path("/rtas");
- if (rtas_node)
- lrdrp = of_get_property(rtas_node, "ibm,lrdr-capacity", NULL);
-
- if (lrdrp == NULL) {
- partition_potential_processors = vdso_data->processorCount;
- } else {
- partition_potential_processors = *(lrdrp + 4);
- }
- of_node_put(rtas_node);
-
- partition_active_processors = lparcfg_count_active_processors();
-
- if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
- /* this call handles the ibm,get-system-parameter contents */
- parse_system_parameter_string(m);
- parse_ppp_data(m);
- parse_mpp_data(m);
- pseries_cmo_data(m);
- splpar_dispatch_data(m);
-
- seq_printf(m, "purr=%ld\n", get_purr());
- } else { /* non SPLPAR case */
-
- seq_printf(m, "system_active_processors=%d\n",
- partition_potential_processors);
-
- seq_printf(m, "system_potential_processors=%d\n",
- partition_potential_processors);
-
- seq_printf(m, "partition_max_entitled_capacity=%d\n",
- partition_potential_processors * 100);
-
- seq_printf(m, "partition_entitled_capacity=%d\n",
- partition_active_processors * 100);
- }
-
- seq_printf(m, "partition_active_processors=%d\n",
- partition_active_processors);
-
- seq_printf(m, "partition_potential_processors=%d\n",
- partition_potential_processors);
-
- seq_printf(m, "shared_processor_mode=%d\n", lppaca_of(0).shared_proc);
-
- seq_printf(m, "slb_size=%d\n", mmu_slb_size);
-
- parse_em_data(m);
-
- return 0;
-}
-
-static ssize_t update_ppp(u64 *entitlement, u8 *weight)
-{
- struct hvcall_ppp_data ppp_data;
- u8 new_weight;
- u64 new_entitled;
- ssize_t retval;
-
- /* Get our current parameters */
- retval = h_get_ppp(&ppp_data);
- if (retval)
- return retval;
-
- if (entitlement) {
- new_weight = ppp_data.weight;
- new_entitled = *entitlement;
- } else if (weight) {
- new_weight = *weight;
- new_entitled = ppp_data.entitlement;
- } else
- return -EINVAL;
-
- pr_debug("%s: current_entitled = %llu, current_weight = %u\n",
- __func__, ppp_data.entitlement, ppp_data.weight);
-
- pr_debug("%s: new_entitled = %llu, new_weight = %u\n",
- __func__, new_entitled, new_weight);
-
- retval = plpar_hcall_norets(H_SET_PPP, new_entitled, new_weight);
- return retval;
-}
-
-/**
- * update_mpp
- *
- * Update the memory entitlement and weight for the partition. Caller must
- * specify either a new entitlement or weight, not both, to be updated
- * since the h_set_mpp call takes both entitlement and weight as parameters.
- */
-static ssize_t update_mpp(u64 *entitlement, u8 *weight)
-{
- struct hvcall_mpp_data mpp_data;
- u64 new_entitled;
- u8 new_weight;
- ssize_t rc;
-
- if (entitlement) {
- /* Check with vio to ensure the new memory entitlement
- * can be handled.
- */
- rc = vio_cmo_entitlement_update(*entitlement);
- if (rc)
- return rc;
- }
-
- rc = h_get_mpp(&mpp_data);
- if (rc)
- return rc;
-
- if (entitlement) {
- new_weight = mpp_data.mem_weight;
- new_entitled = *entitlement;
- } else if (weight) {
- new_weight = *weight;
- new_entitled = mpp_data.entitled_mem;
- } else
- return -EINVAL;
-
- pr_debug("%s: current_entitled = %lu, current_weight = %u\n",
- __func__, mpp_data.entitled_mem, mpp_data.mem_weight);
-
- pr_debug("%s: new_entitled = %llu, new_weight = %u\n",
- __func__, new_entitled, new_weight);
-
- rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight);
- return rc;
-}
-
-/*
- * Interface for changing system parameters (variable capacity weight
- * and entitled capacity). Format of input is "param_name=value";
- * anything after value is ignored. Valid parameters at this time are
- * "partition_entitled_capacity" and "capacity_weight". We use
- * H_SET_PPP to alter parameters.
- *
- * This function should be invoked only on systems with
- * FW_FEATURE_SPLPAR.
- */
-static ssize_t lparcfg_write(struct file *file, const char __user * buf,
- size_t count, loff_t * off)
-{
- int kbuf_sz = 64;
- char kbuf[kbuf_sz];
- char *tmp;
- u64 new_entitled, *new_entitled_ptr = &new_entitled;
- u8 new_weight, *new_weight_ptr = &new_weight;
- ssize_t retval;
-
- if (!firmware_has_feature(FW_FEATURE_SPLPAR) ||
- firmware_has_feature(FW_FEATURE_ISERIES))
- return -EINVAL;
-
- if (count > kbuf_sz)
- return -EINVAL;
-
- if (copy_from_user(kbuf, buf, count))
- return -EFAULT;
-
- kbuf[count - 1] = '\0';
- tmp = strchr(kbuf, '=');
- if (!tmp)
- return -EINVAL;
-
- *tmp++ = '\0';
-
- if (!strcmp(kbuf, "partition_entitled_capacity")) {
- char *endp;
- *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
- if (endp == tmp)
- return -EINVAL;
-
- retval = update_ppp(new_entitled_ptr, NULL);
- } else if (!strcmp(kbuf, "capacity_weight")) {
- char *endp;
- *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
- if (endp == tmp)
- return -EINVAL;
-
- retval = update_ppp(NULL, new_weight_ptr);
- } else if (!strcmp(kbuf, "entitled_memory")) {
- char *endp;
- *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10);
- if (endp == tmp)
- return -EINVAL;
-
- retval = update_mpp(new_entitled_ptr, NULL);
- } else if (!strcmp(kbuf, "entitled_memory_weight")) {
- char *endp;
- *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10);
- if (endp == tmp)
- return -EINVAL;
-
- retval = update_mpp(NULL, new_weight_ptr);
- } else
- return -EINVAL;
-
- if (retval == H_SUCCESS || retval == H_CONSTRAINED) {
- retval = count;
- } else if (retval == H_BUSY) {
- retval = -EBUSY;
- } else if (retval == H_HARDWARE) {
- retval = -EIO;
- } else if (retval == H_PARAMETER) {
- retval = -EINVAL;
- }
-
- return retval;
-}
-
-#else /* CONFIG_PPC_PSERIES */
-
-static int pseries_lparcfg_data(struct seq_file *m, void *v)
-{
- return 0;
-}
-
-static ssize_t lparcfg_write(struct file *file, const char __user * buf,
- size_t count, loff_t * off)
-{
- return -EINVAL;
-}
-
-#endif /* CONFIG_PPC_PSERIES */
-
-static int lparcfg_data(struct seq_file *m, void *v)
-{
- struct device_node *rootdn;
- const char *model = "";
- const char *system_id = "";
- const char *tmp;
- const unsigned int *lp_index_ptr;
- unsigned int lp_index = 0;
-
- seq_printf(m, "%s %s\n", MODULE_NAME, MODULE_VERS);
-
- rootdn = of_find_node_by_path("/");
- if (rootdn) {
- tmp = of_get_property(rootdn, "model", NULL);
- if (tmp) {
- model = tmp;
- /* Skip "IBM," - see platforms/iseries/dt.c */
- if (firmware_has_feature(FW_FEATURE_ISERIES))
- model += 4;
- }
- tmp = of_get_property(rootdn, "system-id", NULL);
- if (tmp) {
- system_id = tmp;
- /* Skip "IBM," - see platforms/iseries/dt.c */
- if (firmware_has_feature(FW_FEATURE_ISERIES))
- system_id += 4;
- }
- lp_index_ptr = of_get_property(rootdn, "ibm,partition-no",
- NULL);
- if (lp_index_ptr)
- lp_index = *lp_index_ptr;
- of_node_put(rootdn);
- }
- seq_printf(m, "serial_number=%s\n", system_id);
- seq_printf(m, "system_type=%s\n", model);
- seq_printf(m, "partition_id=%d\n", (int)lp_index);
-
- if (firmware_has_feature(FW_FEATURE_ISERIES))
- return iseries_lparcfg_data(m, v);
- return pseries_lparcfg_data(m, v);
-}
-
-static int lparcfg_open(struct inode *inode, struct file *file)
-{
- return single_open(file, lparcfg_data, NULL);
-}
-
-static const struct file_operations lparcfg_fops = {
- .owner = THIS_MODULE,
- .read = seq_read,
- .write = lparcfg_write,
- .open = lparcfg_open,
- .release = single_release,
- .llseek = seq_lseek,
-};
-
-static int __init lparcfg_init(void)
-{
- struct proc_dir_entry *ent;
- mode_t mode = S_IRUSR | S_IRGRP | S_IROTH;
-
- /* Allow writing if we have FW_FEATURE_SPLPAR */
- if (firmware_has_feature(FW_FEATURE_SPLPAR) &&
- !firmware_has_feature(FW_FEATURE_ISERIES))
- mode |= S_IWUSR;
-
- ent = proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops);
- if (!ent) {
- printk(KERN_ERR "Failed to create powerpc/lparcfg\n");
- return -EIO;
- }
-
- proc_ppc64_lparcfg = ent;
- return 0;
-}
-
-static void __exit lparcfg_cleanup(void)
-{
- if (proc_ppc64_lparcfg)
- remove_proc_entry("lparcfg", proc_ppc64_lparcfg->parent);
-}
-
-module_init(lparcfg_init);
-module_exit(lparcfg_cleanup);
-MODULE_DESCRIPTION("Interface for LPAR configuration data");
-MODULE_AUTHOR("Dave Engebretsen");
-MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c
index df7e20c191c..015ae55c186 100644
--- a/arch/powerpc/kernel/machine_kexec.c
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -15,39 +15,38 @@
#include <linux/memblock.h>
#include <linux/of.h>
#include <linux/irq.h>
+#include <linux/ftrace.h>
#include <asm/machdep.h>
+#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/sections.h>
void machine_kexec_mask_interrupts(void) {
unsigned int i;
+ struct irq_desc *desc;
- for_each_irq(i) {
- struct irq_desc *desc = irq_to_desc(i);
+ for_each_irq_desc(i, desc) {
+ struct irq_chip *chip;
- if (!desc || !desc->chip)
+ chip = irq_desc_get_chip(desc);
+ if (!chip)
continue;
- if (desc->chip->eoi &&
- desc->status & IRQ_INPROGRESS)
- desc->chip->eoi(i);
+ if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
+ chip->irq_eoi(&desc->irq_data);
- if (desc->chip->mask)
- desc->chip->mask(i);
+ if (chip->irq_mask)
+ chip->irq_mask(&desc->irq_data);
- if (desc->chip->disable &&
- !(desc->status & IRQ_DISABLED))
- desc->chip->disable(i);
+ if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
+ chip->irq_disable(&desc->irq_data);
}
}
void machine_crash_shutdown(struct pt_regs *regs)
{
- if (ppc_md.machine_crash_shutdown)
- ppc_md.machine_crash_shutdown(regs);
- else
- default_machine_crash_shutdown(regs);
+ default_machine_crash_shutdown(regs);
}
/*
@@ -65,8 +64,6 @@ int machine_kexec_prepare(struct kimage *image)
void machine_kexec_cleanup(struct kimage *image)
{
- if (ppc_md.machine_kexec_cleanup)
- ppc_md.machine_kexec_cleanup(image);
}
void arch_crash_save_vmcoreinfo(void)
@@ -79,6 +76,17 @@ void arch_crash_save_vmcoreinfo(void)
#ifndef CONFIG_NEED_MULTIPLE_NODES
VMCOREINFO_SYMBOL(contig_page_data);
#endif
+#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP)
+ VMCOREINFO_SYMBOL(vmemmap_list);
+ VMCOREINFO_SYMBOL(mmu_vmemmap_psize);
+ VMCOREINFO_SYMBOL(mmu_psize_defs);
+ VMCOREINFO_STRUCT_SIZE(vmemmap_backing);
+ VMCOREINFO_OFFSET(vmemmap_backing, list);
+ VMCOREINFO_OFFSET(vmemmap_backing, phys);
+ VMCOREINFO_OFFSET(vmemmap_backing, virt_addr);
+ VMCOREINFO_STRUCT_SIZE(mmu_psize_def);
+ VMCOREINFO_OFFSET(mmu_psize_def, shift);
+#endif
}
/*
@@ -87,11 +95,17 @@ void arch_crash_save_vmcoreinfo(void)
*/
void machine_kexec(struct kimage *image)
{
+ int save_ftrace_enabled;
+
+ save_ftrace_enabled = __ftrace_enabled_save();
+
if (ppc_md.machine_kexec)
ppc_md.machine_kexec(image);
else
default_machine_kexec(image);
+ __ftrace_enabled_restore(save_ftrace_enabled);
+
/* Fall back to normal restart if we're still alive. */
machine_restart(NULL);
for(;;);
@@ -102,9 +116,6 @@ void __init reserve_crashkernel(void)
unsigned long long crash_size, crash_base;
int ret;
- /* this is necessary because of memblock_phys_mem_size() */
- memblock_analyze();
-
/* use common parsing */
ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
&crash_size, &crash_base);
@@ -121,9 +132,9 @@ void __init reserve_crashkernel(void)
/* We might have got these values via the command line or the
* device tree, either way sanitise them now. */
- crash_size = crashk_res.end - crashk_res.start + 1;
+ crash_size = resource_size(&crashk_res);
-#ifndef CONFIG_RELOCATABLE
+#ifndef CONFIG_NONSTATIC_KERNEL
if (crashk_res.start != KDUMP_KERNELBASE)
printk("Crash kernel location must be 0x%x\n",
KDUMP_KERNELBASE);
@@ -131,12 +142,16 @@ void __init reserve_crashkernel(void)
crashk_res.start = KDUMP_KERNELBASE;
#else
if (!crashk_res.start) {
+#ifdef CONFIG_PPC64
/*
- * unspecified address, choose a region of specified size
- * can overlap with initrd (ignoring corruption when retained)
- * ppc64 requires kernel and some stacks to be in first segemnt
+ * On 64bit we split the RMO in half but cap it at half of
+ * a small SLB (128MB) since the crash kernel needs to place
+ * itself and some stacks to be in the first segment.
*/
+ crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2));
+#else
crashk_res.start = KDUMP_KERNELBASE;
+#endif
}
crash_base = PAGE_ALIGN(crashk_res.start);
@@ -162,7 +177,7 @@ void __init reserve_crashkernel(void)
if (memory_limit && memory_limit <= crashk_res.end) {
memory_limit = crashk_res.end + 1;
printk("Adjusted memory limit for crashkernel, now 0x%llx\n",
- (unsigned long long)memory_limit);
+ memory_limit);
}
printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
@@ -181,7 +196,9 @@ int overlaps_crashkernel(unsigned long start, unsigned long size)
/* Values we need to export to the second kernel via the device tree. */
static phys_addr_t kernel_end;
+static phys_addr_t crashk_base;
static phys_addr_t crashk_size;
+static unsigned long long mem_limit;
static struct property kernel_end_prop = {
.name = "linux,kernel-end",
@@ -192,7 +209,7 @@ static struct property kernel_end_prop = {
static struct property crashk_base_prop = {
.name = "linux,crashkernel-base",
.length = sizeof(phys_addr_t),
- .value = &crashk_res.start,
+ .value = &crashk_base
};
static struct property crashk_size_prop = {
@@ -201,6 +218,14 @@ static struct property crashk_size_prop = {
.value = &crashk_size,
};
+static struct property memory_limit_prop = {
+ .name = "linux,memory-limit",
+ .length = sizeof(unsigned long long),
+ .value = &mem_limit,
+};
+
+#define cpu_to_be_ulong __PASTE(cpu_to_be, BITS_PER_LONG)
+
static void __init export_crashk_values(struct device_node *node)
{
struct property *prop;
@@ -209,17 +234,25 @@ static void __init export_crashk_values(struct device_node *node)
* be sure what's in them, so remove them. */
prop = of_find_property(node, "linux,crashkernel-base", NULL);
if (prop)
- prom_remove_property(node, prop);
+ of_remove_property(node, prop);
prop = of_find_property(node, "linux,crashkernel-size", NULL);
if (prop)
- prom_remove_property(node, prop);
+ of_remove_property(node, prop);
if (crashk_res.start != 0) {
- prom_add_property(node, &crashk_base_prop);
- crashk_size = crashk_res.end - crashk_res.start + 1;
- prom_add_property(node, &crashk_size_prop);
+ crashk_base = cpu_to_be_ulong(crashk_res.start),
+ of_add_property(node, &crashk_base_prop);
+ crashk_size = cpu_to_be_ulong(resource_size(&crashk_res));
+ of_add_property(node, &crashk_size_prop);
}
+
+ /*
+ * memory_limit is required by the kexec-tools to limit the
+ * crash regions to the actual memory used.
+ */
+ mem_limit = cpu_to_be_ulong(memory_limit);
+ of_update_property(node, &memory_limit_prop);
}
static int __init kexec_setup(void)
@@ -234,11 +267,11 @@ static int __init kexec_setup(void)
/* remove any stale properties so ours can be found */
prop = of_find_property(node, kernel_end_prop.name, NULL);
if (prop)
- prom_remove_property(node, prop);
+ of_remove_property(node, prop);
/* information needed by userspace when using default_machine_kexec */
- kernel_end = __pa(_end);
- prom_add_property(node, &kernel_end_prop);
+ kernel_end = cpu_to_be_ulong(__pa(_end));
+ of_add_property(node, &kernel_end_prop);
export_crashk_values(node);
diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c
index e63f2e7d2ef..affe5dcce7f 100644
--- a/arch/powerpc/kernel/machine_kexec_32.c
+++ b/arch/powerpc/kernel/machine_kexec_32.c
@@ -16,10 +16,10 @@
#include <asm/hw_irq.h>
#include <asm/io.h>
-typedef NORET_TYPE void (*relocate_new_kernel_t)(
+typedef void (*relocate_new_kernel_t)(
unsigned long indirection_page,
unsigned long reboot_code_buffer,
- unsigned long start_address) ATTRIB_NORET;
+ unsigned long start_address) __noreturn;
/*
* This is a generic machine_kexec function suitable at least for
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index 583af70c4b1..879b3aacac3 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -17,6 +17,7 @@
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/cpu.h>
+#include <linux/hardirq.h>
#include <asm/page.h>
#include <asm/current.h>
@@ -74,8 +75,7 @@ int default_machine_kexec_prepare(struct kimage *image)
}
/* We also should not overwrite the tce tables */
- for (node = of_find_node_by_type(NULL, "pci"); node != NULL;
- node = of_find_node_by_type(node, "pci")) {
+ for_each_node_by_type(node, "pci") {
basep = of_get_property(node, "linux,tce-base", NULL);
sizep = of_get_property(node, "linux,tce-size", NULL);
if (basep == NULL || sizep == NULL)
@@ -163,6 +163,8 @@ static int kexec_all_irq_disabled = 0;
static void kexec_smp_down(void *arg)
{
local_irq_disable();
+ hard_irq_disable();
+
mb(); /* make sure our irqs are disabled before we say they are */
get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
while(kexec_all_irq_disabled == 0)
@@ -235,7 +237,7 @@ static void wake_offline_cpus(void)
if (!cpu_online(cpu)) {
printk(KERN_INFO "kexec: Waking offline cpu %d.\n",
cpu);
- cpu_up(cpu);
+ WARN_ON(cpu_up(cpu));
}
}
}
@@ -245,6 +247,8 @@ static void kexec_prepare_cpus(void)
wake_offline_cpus();
smp_call_function(kexec_smp_down, NULL, /* wait */0);
local_irq_disable();
+ hard_irq_disable();
+
mb(); /* make sure IRQs are disabled before we say they are */
get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
@@ -282,6 +286,7 @@ static void kexec_prepare_cpus(void)
if (ppc_md.kexec_cpu_down)
ppc_md.kexec_cpu_down(0, 0);
local_irq_disable();
+ hard_irq_disable();
}
#endif /* SMP */
@@ -307,10 +312,10 @@ static union thread_union kexec_stack __init_task_data =
*/
struct paca_struct kexec_paca;
-/* Our assembly helper, in kexec_stub.S */
-extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start,
- void *image, void *control,
- void (*clear_all)(void)) ATTRIB_NORET;
+/* Our assembly helper, in misc_64.S */
+extern void kexec_sequence(void *newstack, unsigned long start,
+ void *image, void *control,
+ void (*clear_all)(void)) __noreturn;
/* too late to fail here */
void default_machine_kexec(struct kimage *image)
@@ -331,10 +336,13 @@ void default_machine_kexec(struct kimage *image)
pr_debug("kexec: Starting switchover sequence.\n");
/* switch to a staticly allocated stack. Based on irq stack code.
+ * We setup preempt_count to avoid using VMX in memcpy.
* XXX: the task struct will likely be invalid once we do the copy!
*/
kexec_stack.thread_info.task = current_thread_info()->task;
kexec_stack.thread_info.flags = 0;
+ kexec_stack.thread_info.preempt_count = HARDIRQ_OFFSET;
+ kexec_stack.thread_info.cpu = current_thread_info()->cpu;
/* We need a static PACA, too; copy this CPU's PACA over and switch to
* it. Also poison per_cpu_offset to catch anyone using non-static
@@ -361,6 +369,7 @@ void default_machine_kexec(struct kimage *image)
/* Values we need to export to the second kernel via the device tree. */
static unsigned long htab_base;
+static unsigned long htab_size;
static struct property htab_base_prop = {
.name = "linux,htab-base",
@@ -371,7 +380,7 @@ static struct property htab_base_prop = {
static struct property htab_size_prop = {
.name = "linux,htab-size",
.length = sizeof(unsigned long),
- .value = &htab_size_bytes,
+ .value = &htab_size,
};
static int __init export_htab_values(void)
@@ -390,14 +399,15 @@ static int __init export_htab_values(void)
/* remove any stale propertys so ours can be found */
prop = of_find_property(node, htab_base_prop.name, NULL);
if (prop)
- prom_remove_property(node, prop);
+ of_remove_property(node, prop);
prop = of_find_property(node, htab_size_prop.name, NULL);
if (prop)
- prom_remove_property(node, prop);
+ of_remove_property(node, prop);
- htab_base = __pa(htab_address);
- prom_add_property(node, &htab_base_prop);
- prom_add_property(node, &htab_size_prop);
+ htab_base = cpu_to_be64(__pa(htab_address));
+ of_add_property(node, &htab_base_prop);
+ htab_size = cpu_to_be64(htab_size_bytes);
+ of_add_property(node, &htab_size_prop);
of_node_put(node);
return 0;
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
new file mode 100644
index 00000000000..a7fd4cb78b7
--- /dev/null
+++ b/arch/powerpc/kernel/mce.c
@@ -0,0 +1,352 @@
+/*
+ * Machine check exception handling.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "mce: " fmt
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+#include <linux/export.h>
+#include <linux/irq_work.h>
+#include <asm/mce.h>
+
+static DEFINE_PER_CPU(int, mce_nest_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
+
+/* Queue for delayed MCE events. */
+static DEFINE_PER_CPU(int, mce_queue_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
+
+static void machine_check_process_queued_event(struct irq_work *work);
+struct irq_work mce_event_process_work = {
+ .func = machine_check_process_queued_event,
+};
+
+static void mce_set_error_info(struct machine_check_event *mce,
+ struct mce_error_info *mce_err)
+{
+ mce->error_type = mce_err->error_type;
+ switch (mce_err->error_type) {
+ case MCE_ERROR_TYPE_UE:
+ mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type;
+ break;
+ case MCE_ERROR_TYPE_SLB:
+ mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type;
+ break;
+ case MCE_ERROR_TYPE_ERAT:
+ mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type;
+ break;
+ case MCE_ERROR_TYPE_TLB:
+ mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type;
+ break;
+ case MCE_ERROR_TYPE_UNKNOWN:
+ default:
+ break;
+ }
+}
+
+/*
+ * Decode and save high level MCE information into per cpu buffer which
+ * is an array of machine_check_event structure.
+ */
+void save_mce_event(struct pt_regs *regs, long handled,
+ struct mce_error_info *mce_err,
+ uint64_t nip, uint64_t addr)
+{
+ uint64_t srr1;
+ int index = __get_cpu_var(mce_nest_count)++;
+ struct machine_check_event *mce = &__get_cpu_var(mce_event[index]);
+
+ /*
+ * Return if we don't have enough space to log mce event.
+ * mce_nest_count may go beyond MAX_MC_EVT but that's ok,
+ * the check below will stop buffer overrun.
+ */
+ if (index >= MAX_MC_EVT)
+ return;
+
+ /* Populate generic machine check info */
+ mce->version = MCE_V1;
+ mce->srr0 = nip;
+ mce->srr1 = regs->msr;
+ mce->gpr3 = regs->gpr[3];
+ mce->in_use = 1;
+
+ mce->initiator = MCE_INITIATOR_CPU;
+ if (handled)
+ mce->disposition = MCE_DISPOSITION_RECOVERED;
+ else
+ mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;
+ mce->severity = MCE_SEV_ERROR_SYNC;
+
+ srr1 = regs->msr;
+
+ /*
+ * Populate the mce error_type and type-specific error_type.
+ */
+ mce_set_error_info(mce, mce_err);
+
+ if (!addr)
+ return;
+
+ if (mce->error_type == MCE_ERROR_TYPE_TLB) {
+ mce->u.tlb_error.effective_address_provided = true;
+ mce->u.tlb_error.effective_address = addr;
+ } else if (mce->error_type == MCE_ERROR_TYPE_SLB) {
+ mce->u.slb_error.effective_address_provided = true;
+ mce->u.slb_error.effective_address = addr;
+ } else if (mce->error_type == MCE_ERROR_TYPE_ERAT) {
+ mce->u.erat_error.effective_address_provided = true;
+ mce->u.erat_error.effective_address = addr;
+ } else if (mce->error_type == MCE_ERROR_TYPE_UE) {
+ mce->u.ue_error.effective_address_provided = true;
+ mce->u.ue_error.effective_address = addr;
+ }
+ return;
+}
+
+/*
+ * get_mce_event:
+ * mce Pointer to machine_check_event structure to be filled.
+ * release Flag to indicate whether to free the event slot or not.
+ * 0 <= do not release the mce event. Caller will invoke
+ * release_mce_event() once event has been consumed.
+ * 1 <= release the slot.
+ *
+ * return 1 = success
+ * 0 = failure
+ *
+ * get_mce_event() will be called by platform specific machine check
+ * handle routine and in KVM.
+ * When we call get_mce_event(), we are still in interrupt context and
+ * preemption will not be scheduled until ret_from_expect() routine
+ * is called.
+ */
+int get_mce_event(struct machine_check_event *mce, bool release)
+{
+ int index = __get_cpu_var(mce_nest_count) - 1;
+ struct machine_check_event *mc_evt;
+ int ret = 0;
+
+ /* Sanity check */
+ if (index < 0)
+ return ret;
+
+ /* Check if we have MCE info to process. */
+ if (index < MAX_MC_EVT) {
+ mc_evt = &__get_cpu_var(mce_event[index]);
+ /* Copy the event structure and release the original */
+ if (mce)
+ *mce = *mc_evt;
+ if (release)
+ mc_evt->in_use = 0;
+ ret = 1;
+ }
+ /* Decrement the count to free the slot. */
+ if (release)
+ __get_cpu_var(mce_nest_count)--;
+
+ return ret;
+}
+
+void release_mce_event(void)
+{
+ get_mce_event(NULL, true);
+}
+
+/*
+ * Queue up the MCE event which then can be handled later.
+ */
+void machine_check_queue_event(void)
+{
+ int index;
+ struct machine_check_event evt;
+
+ if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+ return;
+
+ index = __get_cpu_var(mce_queue_count)++;
+ /* If queue is full, just return for now. */
+ if (index >= MAX_MC_EVT) {
+ __get_cpu_var(mce_queue_count)--;
+ return;
+ }
+ __get_cpu_var(mce_event_queue[index]) = evt;
+
+ /* Queue irq work to process this event later. */
+ irq_work_queue(&mce_event_process_work);
+}
+
+/*
+ * process pending MCE event from the mce event queue. This function will be
+ * called during syscall exit.
+ */
+static void machine_check_process_queued_event(struct irq_work *work)
+{
+ int index;
+
+ /*
+ * For now just print it to console.
+ * TODO: log this error event to FSP or nvram.
+ */
+ while (__get_cpu_var(mce_queue_count) > 0) {
+ index = __get_cpu_var(mce_queue_count) - 1;
+ machine_check_print_event_info(
+ &__get_cpu_var(mce_event_queue[index]));
+ __get_cpu_var(mce_queue_count)--;
+ }
+}
+
+void machine_check_print_event_info(struct machine_check_event *evt)
+{
+ const char *level, *sevstr, *subtype;
+ static const char *mc_ue_types[] = {
+ "Indeterminate",
+ "Instruction fetch",
+ "Page table walk ifetch",
+ "Load/Store",
+ "Page table walk Load/Store",
+ };
+ static const char *mc_slb_types[] = {
+ "Indeterminate",
+ "Parity",
+ "Multihit",
+ };
+ static const char *mc_erat_types[] = {
+ "Indeterminate",
+ "Parity",
+ "Multihit",
+ };
+ static const char *mc_tlb_types[] = {
+ "Indeterminate",
+ "Parity",
+ "Multihit",
+ };
+
+ /* Print things out */
+ if (evt->version != MCE_V1) {
+ pr_err("Machine Check Exception, Unknown event version %d !\n",
+ evt->version);
+ return;
+ }
+ switch (evt->severity) {
+ case MCE_SEV_NO_ERROR:
+ level = KERN_INFO;
+ sevstr = "Harmless";
+ break;
+ case MCE_SEV_WARNING:
+ level = KERN_WARNING;
+ sevstr = "";
+ break;
+ case MCE_SEV_ERROR_SYNC:
+ level = KERN_ERR;
+ sevstr = "Severe";
+ break;
+ case MCE_SEV_FATAL:
+ default:
+ level = KERN_ERR;
+ sevstr = "Fatal";
+ break;
+ }
+
+ printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
+ evt->disposition == MCE_DISPOSITION_RECOVERED ?
+ "Recovered" : "[Not recovered");
+ printk("%s Initiator: %s\n", level,
+ evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
+ switch (evt->error_type) {
+ case MCE_ERROR_TYPE_UE:
+ subtype = evt->u.ue_error.ue_error_type <
+ ARRAY_SIZE(mc_ue_types) ?
+ mc_ue_types[evt->u.ue_error.ue_error_type]
+ : "Unknown";
+ printk("%s Error type: UE [%s]\n", level, subtype);
+ if (evt->u.ue_error.effective_address_provided)
+ printk("%s Effective address: %016llx\n",
+ level, evt->u.ue_error.effective_address);
+ if (evt->u.ue_error.physical_address_provided)
+ printk("%s Physial address: %016llx\n",
+ level, evt->u.ue_error.physical_address);
+ break;
+ case MCE_ERROR_TYPE_SLB:
+ subtype = evt->u.slb_error.slb_error_type <
+ ARRAY_SIZE(mc_slb_types) ?
+ mc_slb_types[evt->u.slb_error.slb_error_type]
+ : "Unknown";
+ printk("%s Error type: SLB [%s]\n", level, subtype);
+ if (evt->u.slb_error.effective_address_provided)
+ printk("%s Effective address: %016llx\n",
+ level, evt->u.slb_error.effective_address);
+ break;
+ case MCE_ERROR_TYPE_ERAT:
+ subtype = evt->u.erat_error.erat_error_type <
+ ARRAY_SIZE(mc_erat_types) ?
+ mc_erat_types[evt->u.erat_error.erat_error_type]
+ : "Unknown";
+ printk("%s Error type: ERAT [%s]\n", level, subtype);
+ if (evt->u.erat_error.effective_address_provided)
+ printk("%s Effective address: %016llx\n",
+ level, evt->u.erat_error.effective_address);
+ break;
+ case MCE_ERROR_TYPE_TLB:
+ subtype = evt->u.tlb_error.tlb_error_type <
+ ARRAY_SIZE(mc_tlb_types) ?
+ mc_tlb_types[evt->u.tlb_error.tlb_error_type]
+ : "Unknown";
+ printk("%s Error type: TLB [%s]\n", level, subtype);
+ if (evt->u.tlb_error.effective_address_provided)
+ printk("%s Effective address: %016llx\n",
+ level, evt->u.tlb_error.effective_address);
+ break;
+ default:
+ case MCE_ERROR_TYPE_UNKNOWN:
+ printk("%s Error type: Unknown\n", level);
+ break;
+ }
+}
+
+uint64_t get_mce_fault_addr(struct machine_check_event *evt)
+{
+ switch (evt->error_type) {
+ case MCE_ERROR_TYPE_UE:
+ if (evt->u.ue_error.effective_address_provided)
+ return evt->u.ue_error.effective_address;
+ break;
+ case MCE_ERROR_TYPE_SLB:
+ if (evt->u.slb_error.effective_address_provided)
+ return evt->u.slb_error.effective_address;
+ break;
+ case MCE_ERROR_TYPE_ERAT:
+ if (evt->u.erat_error.effective_address_provided)
+ return evt->u.erat_error.effective_address;
+ break;
+ case MCE_ERROR_TYPE_TLB:
+ if (evt->u.tlb_error.effective_address_provided)
+ return evt->u.tlb_error.effective_address;
+ break;
+ default:
+ case MCE_ERROR_TYPE_UNKNOWN:
+ break;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(get_mce_fault_addr);
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
new file mode 100644
index 00000000000..aa9aff3d6ad
--- /dev/null
+++ b/arch/powerpc/kernel/mce_power.c
@@ -0,0 +1,313 @@
+/*
+ * Machine check exception handling CPU-side for power7 and power8
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "mce_power: " fmt
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <asm/mmu.h>
+#include <asm/mce.h>
+#include <asm/machdep.h>
+
+/* flush SLBs and reload */
+static void flush_and_reload_slb(void)
+{
+ struct slb_shadow *slb;
+ unsigned long i, n;
+
+ /* Invalidate all SLBs */
+ asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+
+#ifdef CONFIG_KVM_BOOK3S_HANDLER
+ /*
+ * If machine check is hit when in guest or in transition, we will
+ * only flush the SLBs and continue.
+ */
+ if (get_paca()->kvm_hstate.in_guest)
+ return;
+#endif
+
+ /* For host kernel, reload the SLBs from shadow SLB buffer. */
+ slb = get_slb_shadow();
+ if (!slb)
+ return;
+
+ n = min_t(u32, be32_to_cpu(slb->persistent), SLB_MIN_SIZE);
+
+ /* Load up the SLB entries from shadow SLB */
+ for (i = 0; i < n; i++) {
+ unsigned long rb = be64_to_cpu(slb->save_area[i].esid);
+ unsigned long rs = be64_to_cpu(slb->save_area[i].vsid);
+
+ rb = (rb & ~0xFFFul) | i;
+ asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
+ }
+}
+
+static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
+{
+ long handled = 1;
+
+ /*
+ * flush and reload SLBs for SLB errors and flush TLBs for TLB errors.
+ * reset the error bits whenever we handle them so that at the end
+ * we can check whether we handled all of them or not.
+ * */
+ if (dsisr & slb_error_bits) {
+ flush_and_reload_slb();
+ /* reset error bits */
+ dsisr &= ~(slb_error_bits);
+ }
+ if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
+ if (cur_cpu_spec && cur_cpu_spec->flush_tlb)
+ cur_cpu_spec->flush_tlb(TLBIEL_INVAL_PAGE);
+ /* reset error bits */
+ dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB;
+ }
+ /* Any other errors we don't understand? */
+ if (dsisr & 0xffffffffUL)
+ handled = 0;
+
+ return handled;
+}
+
+static long mce_handle_derror_p7(uint64_t dsisr)
+{
+ return mce_handle_derror(dsisr, P7_DSISR_MC_SLB_ERRORS);
+}
+
+static long mce_handle_common_ierror(uint64_t srr1)
+{
+ long handled = 0;
+
+ switch (P7_SRR1_MC_IFETCH(srr1)) {
+ case 0:
+ break;
+ case P7_SRR1_MC_IFETCH_SLB_PARITY:
+ case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
+ /* flush and reload SLBs for SLB errors. */
+ flush_and_reload_slb();
+ handled = 1;
+ break;
+ case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
+ if (cur_cpu_spec && cur_cpu_spec->flush_tlb) {
+ cur_cpu_spec->flush_tlb(TLBIEL_INVAL_PAGE);
+ handled = 1;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return handled;
+}
+
+static long mce_handle_ierror_p7(uint64_t srr1)
+{
+ long handled = 0;
+
+ handled = mce_handle_common_ierror(srr1);
+
+ if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
+ flush_and_reload_slb();
+ handled = 1;
+ }
+ return handled;
+}
+
+static void mce_get_common_ierror(struct mce_error_info *mce_err, uint64_t srr1)
+{
+ switch (P7_SRR1_MC_IFETCH(srr1)) {
+ case P7_SRR1_MC_IFETCH_SLB_PARITY:
+ mce_err->error_type = MCE_ERROR_TYPE_SLB;
+ mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
+ break;
+ case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
+ mce_err->error_type = MCE_ERROR_TYPE_SLB;
+ mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
+ break;
+ case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
+ mce_err->error_type = MCE_ERROR_TYPE_TLB;
+ mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
+ break;
+ case P7_SRR1_MC_IFETCH_UE:
+ case P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL:
+ mce_err->error_type = MCE_ERROR_TYPE_UE;
+ mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH;
+ break;
+ case P7_SRR1_MC_IFETCH_UE_TLB_RELOAD:
+ mce_err->error_type = MCE_ERROR_TYPE_UE;
+ mce_err->u.ue_error_type =
+ MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
+ break;
+ }
+}
+
+static void mce_get_ierror_p7(struct mce_error_info *mce_err, uint64_t srr1)
+{
+ mce_get_common_ierror(mce_err, srr1);
+ if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
+ mce_err->error_type = MCE_ERROR_TYPE_SLB;
+ mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
+ }
+}
+
+static void mce_get_derror_p7(struct mce_error_info *mce_err, uint64_t dsisr)
+{
+ if (dsisr & P7_DSISR_MC_UE) {
+ mce_err->error_type = MCE_ERROR_TYPE_UE;
+ mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
+ } else if (dsisr & P7_DSISR_MC_UE_TABLEWALK) {
+ mce_err->error_type = MCE_ERROR_TYPE_UE;
+ mce_err->u.ue_error_type =
+ MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
+ } else if (dsisr & P7_DSISR_MC_ERAT_MULTIHIT) {
+ mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+ mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+ } else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT) {
+ mce_err->error_type = MCE_ERROR_TYPE_SLB;
+ mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
+ } else if (dsisr & P7_DSISR_MC_SLB_PARITY_MFSLB) {
+ mce_err->error_type = MCE_ERROR_TYPE_SLB;
+ mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
+ } else if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
+ mce_err->error_type = MCE_ERROR_TYPE_TLB;
+ mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
+ } else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT_PARITY) {
+ mce_err->error_type = MCE_ERROR_TYPE_SLB;
+ mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
+ }
+}
+
+static long mce_handle_ue_error(struct pt_regs *regs)
+{
+ long handled = 0;
+
+ /*
+ * On specific SCOM read via MMIO we may get a machine check
+ * exception with SRR0 pointing inside opal. If that is the
+ * case OPAL may have recovery address to re-read SCOM data in
+ * different way and hence we can recover from this MC.
+ */
+
+ if (ppc_md.mce_check_early_recovery) {
+ if (ppc_md.mce_check_early_recovery(regs))
+ handled = 1;
+ }
+ return handled;
+}
+
+long __machine_check_early_realmode_p7(struct pt_regs *regs)
+{
+ uint64_t srr1, nip, addr;
+ long handled = 1;
+ struct mce_error_info mce_error_info = { 0 };
+
+ srr1 = regs->msr;
+ nip = regs->nip;
+
+ /*
+ * Handle memory errors depending whether this was a load/store or
+ * ifetch exception. Also, populate the mce error_type and
+ * type-specific error_type from either SRR1 or DSISR, depending
+ * whether this was a load/store or ifetch exception
+ */
+ if (P7_SRR1_MC_LOADSTORE(srr1)) {
+ handled = mce_handle_derror_p7(regs->dsisr);
+ mce_get_derror_p7(&mce_error_info, regs->dsisr);
+ addr = regs->dar;
+ } else {
+ handled = mce_handle_ierror_p7(srr1);
+ mce_get_ierror_p7(&mce_error_info, srr1);
+ addr = regs->nip;
+ }
+
+ /* Handle UE error. */
+ if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
+ handled = mce_handle_ue_error(regs);
+
+ save_mce_event(regs, handled, &mce_error_info, nip, addr);
+ return handled;
+}
+
+static void mce_get_ierror_p8(struct mce_error_info *mce_err, uint64_t srr1)
+{
+ mce_get_common_ierror(mce_err, srr1);
+ if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
+ mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+ mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+ }
+}
+
+static void mce_get_derror_p8(struct mce_error_info *mce_err, uint64_t dsisr)
+{
+ mce_get_derror_p7(mce_err, dsisr);
+ if (dsisr & P8_DSISR_MC_ERAT_MULTIHIT_SEC) {
+ mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+ mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+ }
+}
+
+static long mce_handle_ierror_p8(uint64_t srr1)
+{
+ long handled = 0;
+
+ handled = mce_handle_common_ierror(srr1);
+
+ if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
+ flush_and_reload_slb();
+ handled = 1;
+ }
+ return handled;
+}
+
+static long mce_handle_derror_p8(uint64_t dsisr)
+{
+ return mce_handle_derror(dsisr, P8_DSISR_MC_SLB_ERRORS);
+}
+
+long __machine_check_early_realmode_p8(struct pt_regs *regs)
+{
+ uint64_t srr1, nip, addr;
+ long handled = 1;
+ struct mce_error_info mce_error_info = { 0 };
+
+ srr1 = regs->msr;
+ nip = regs->nip;
+
+ if (P7_SRR1_MC_LOADSTORE(srr1)) {
+ handled = mce_handle_derror_p8(regs->dsisr);
+ mce_get_derror_p8(&mce_error_info, regs->dsisr);
+ addr = regs->dar;
+ } else {
+ handled = mce_handle_ierror_p8(srr1);
+ mce_get_ierror_p8(&mce_error_info, srr1);
+ addr = regs->nip;
+ }
+
+ /* Handle UE error. */
+ if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
+ handled = mce_handle_ue_error(regs);
+
+ save_mce_event(regs, handled, &mce_error_info, nip, addr);
+ return handled;
+}
diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index 2d29752cbe1..7ce26d45777 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -5,7 +5,6 @@
* Largely rewritten by Cort Dougan (cort@cs.nmt.edu)
* and Paul Mackerras.
*
- * Adapted for iSeries by Mike Corrigan (mikejc@us.ibm.com)
* PPC64 updates by Dave Engebretsen (engebret@us.ibm.com)
*
* setjmp/longjmp code by Paul Mackerras.
@@ -55,13 +54,6 @@ _GLOBAL(add_reloc_offset)
.align 3
2: PPC_LONG 1b
-_GLOBAL(kernel_execve)
- li r0,__NR_execve
- sc
- bnslr
- neg r3,r3
- blr
-
_GLOBAL(setjmp)
mflr r0
PPC_STL r0,0(r3)
@@ -122,8 +114,3 @@ _GLOBAL(longjmp)
mtlr r0
mr r3,r4
blr
-
-_GLOBAL(__setup_cpu_power7)
-_GLOBAL(__restore_cpu_power7)
- /* place holder */
- blr
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index a7a570dcdd5..7c6bb4b17b4 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -8,6 +8,8 @@
* kexec bits:
* Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com>
* GameCube/ppc32 port Copyright (C) 2004 Albert Herranz
+ * PPC44x port. Copyright (C) 2011, IBM Corporation
+ * Author: Suzuki Poulose <suzuki@in.ibm.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -30,29 +32,48 @@
#include <asm/processor.h>
#include <asm/kexec.h>
#include <asm/bug.h>
+#include <asm/ptrace.h>
.text
+/*
+ * We store the saved ksp_limit in the unused part
+ * of the STACK_FRAME_OVERHEAD
+ */
_GLOBAL(call_do_softirq)
mflr r0
stw r0,4(r1)
+ lwz r10,THREAD+KSP_LIMIT(r2)
+ addi r11,r3,THREAD_INFO_GAP
stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3)
mr r1,r3
+ stw r10,8(r1)
+ stw r11,THREAD+KSP_LIMIT(r2)
bl __do_softirq
+ lwz r10,8(r1)
lwz r1,0(r1)
lwz r0,4(r1)
+ stw r10,THREAD+KSP_LIMIT(r2)
mtlr r0
blr
-_GLOBAL(call_handle_irq)
+/*
+ * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
+ */
+_GLOBAL(call_do_irq)
mflr r0
stw r0,4(r1)
- mtctr r6
- stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r5)
- mr r1,r5
- bctrl
+ lwz r10,THREAD+KSP_LIMIT(r2)
+ addi r11,r4,THREAD_INFO_GAP
+ stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4)
+ mr r1,r4
+ stw r10,8(r1)
+ stw r11,THREAD+KSP_LIMIT(r2)
+ bl __do_irq
+ lwz r10,8(r1)
lwz r1,0(r1)
lwz r0,4(r1)
+ stw r10,THREAD+KSP_LIMIT(r2)
mtlr r0
blr
@@ -176,7 +197,7 @@ _GLOBAL(low_choose_750fx_pll)
mtspr SPRN_HID1,r4
/* Store new HID1 image */
- rlwinm r6,r1,0,0,(31-THREAD_SHIFT)
+ CURRENT_THREAD_INFO(r6, r1)
lwz r6,TI_CPU(r6)
slwi r6,r6,2
addis r6,r6,nap_save_hid1@ha
@@ -324,8 +345,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE)
*
* flush_icache_range(unsigned long start, unsigned long stop)
*/
-_KPROBE(__flush_icache_range)
+_KPROBE(flush_icache_range)
BEGIN_FTR_SECTION
+ PURGE_PREFETCHED_INS
blr /* for 601, do nothing */
END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
li r5,L1_CACHE_BYTES-1
@@ -429,6 +451,7 @@ _GLOBAL(invalidate_dcache_range)
*/
_GLOBAL(__flush_dcache_icache)
BEGIN_FTR_SECTION
+ PURGE_PREFETCHED_INS
blr
END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
rlwinm r3,r3,0,0,31-PAGE_SHIFT /* Get page base address */
@@ -470,6 +493,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x)
*/
_GLOBAL(__flush_dcache_icache_phys)
BEGIN_FTR_SECTION
+ PURGE_PREFETCHED_INS
blr /* for 601, do nothing */
END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
mfmsr r10
@@ -640,6 +664,20 @@ _GLOBAL(__lshrdi3)
blr
/*
+ * 64-bit comparison: __cmpdi2(s64 a, s64 b)
+ * Returns 0 if a < b, 1 if a == b, 2 if a > b.
+ */
+_GLOBAL(__cmpdi2)
+ cmpw r3,r5
+ li r3,1
+ bne 1f
+ cmplw r4,r6
+ beqlr
+1: li r3,0
+ bltlr
+ li r3,2
+ blr
+/*
* 64-bit comparison: __ucmpdi2(u64 a, u64 b)
* Returns 0 if a < b, 1 if a == b, 2 if a > b.
*/
@@ -654,45 +692,34 @@ _GLOBAL(__ucmpdi2)
li r3,2
blr
+_GLOBAL(__bswapdi2)
+ rotlwi r9,r4,8
+ rotlwi r10,r3,8
+ rlwimi r9,r4,24,0,7
+ rlwimi r10,r3,24,0,7
+ rlwimi r9,r4,24,16,23
+ rlwimi r10,r3,24,16,23
+ mr r3,r9
+ mr r4,r10
+ blr
+
_GLOBAL(abs)
srawi r4,r3,31
xor r3,r3,r4
sub r3,r3,r4
blr
-/*
- * Create a kernel thread
- * kernel_thread(fn, arg, flags)
- */
-_GLOBAL(kernel_thread)
- stwu r1,-16(r1)
- stw r30,8(r1)
- stw r31,12(r1)
- mr r30,r3 /* function */
- mr r31,r4 /* argument */
- ori r3,r5,CLONE_VM /* flags */
- oris r3,r3,CLONE_UNTRACED>>16
- li r4,0 /* new sp (unused) */
- li r0,__NR_clone
- sc
- bns+ 1f /* did system call indicate error? */
- neg r3,r3 /* if so, make return code negative */
-1: cmpwi 0,r3,0 /* parent or child? */
- bne 2f /* return if parent */
- li r0,0 /* make top-level stack frame */
- stwu r0,-16(r1)
- mtlr r30 /* fn addr in lr */
- mr r3,r31 /* load arg and call fn */
- PPC440EP_ERR42
- blrl
- li r0,__NR_exit /* exit if function returns */
+#ifdef CONFIG_SMP
+_GLOBAL(start_secondary_resume)
+ /* Reset stack */
+ CURRENT_THREAD_INFO(r1, r1)
+ addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD
li r3,0
- sc
-2: lwz r30,8(r1)
- lwz r31,12(r1)
- addi r1,r1,16
- blr
-
+ stw r3,0(r1) /* Zero the stack frame pointer */
+ bl start_secondary
+ b .
+#endif /* CONFIG_SMP */
+
/*
* This routine is just here to keep GCC happy - sigh...
*/
@@ -724,6 +751,362 @@ relocate_new_kernel:
mr r5, r31
li r0, 0
+#elif defined(CONFIG_44x)
+
+ /* Save our parameters */
+ mr r29, r3
+ mr r30, r4
+ mr r31, r5
+
+#ifdef CONFIG_PPC_47x
+ /* Check for 47x cores */
+ mfspr r3,SPRN_PVR
+ srwi r3,r3,16
+ cmplwi cr0,r3,PVR_476@h
+ beq setup_map_47x
+ cmplwi cr0,r3,PVR_476_ISS@h
+ beq setup_map_47x
+#endif /* CONFIG_PPC_47x */
+
+/*
+ * Code for setting up 1:1 mapping for PPC440x for KEXEC
+ *
+ * We cannot switch off the MMU on PPC44x.
+ * So we:
+ * 1) Invalidate all the mappings except the one we are running from.
+ * 2) Create a tmp mapping for our code in the other address space(TS) and
+ * jump to it. Invalidate the entry we started in.
+ * 3) Create a 1:1 mapping for 0-2GiB in chunks of 256M in original TS.
+ * 4) Jump to the 1:1 mapping in original TS.
+ * 5) Invalidate the tmp mapping.
+ *
+ * - Based on the kexec support code for FSL BookE
+ *
+ */
+
+ /*
+ * Load the PID with kernel PID (0).
+ * Also load our MSR_IS and TID to MMUCR for TLB search.
+ */
+ li r3, 0
+ mtspr SPRN_PID, r3
+ mfmsr r4
+ andi. r4,r4,MSR_IS@l
+ beq wmmucr
+ oris r3,r3,PPC44x_MMUCR_STS@h
+wmmucr:
+ mtspr SPRN_MMUCR,r3
+ sync
+
+ /*
+ * Invalidate all the TLB entries except the current entry
+ * where we are running from
+ */
+ bl 0f /* Find our address */
+0: mflr r5 /* Make it accessible */
+ tlbsx r23,0,r5 /* Find entry we are in */
+ li r4,0 /* Start at TLB entry 0 */
+ li r3,0 /* Set PAGEID inval value */
+1: cmpw r23,r4 /* Is this our entry? */
+ beq skip /* If so, skip the inval */
+ tlbwe r3,r4,PPC44x_TLB_PAGEID /* If not, inval the entry */
+skip:
+ addi r4,r4,1 /* Increment */
+ cmpwi r4,64 /* Are we done? */
+ bne 1b /* If not, repeat */
+ isync
+
+ /* Create a temp mapping and jump to it */
+ andi. r6, r23, 1 /* Find the index to use */
+ addi r24, r6, 1 /* r24 will contain 1 or 2 */
+
+ mfmsr r9 /* get the MSR */
+ rlwinm r5, r9, 27, 31, 31 /* Extract the MSR[IS] */
+ xori r7, r5, 1 /* Use the other address space */
+
+ /* Read the current mapping entries */
+ tlbre r3, r23, PPC44x_TLB_PAGEID
+ tlbre r4, r23, PPC44x_TLB_XLAT
+ tlbre r5, r23, PPC44x_TLB_ATTRIB
+
+ /* Save our current XLAT entry */
+ mr r25, r4
+
+ /* Extract the TLB PageSize */
+ li r10, 1 /* r10 will hold PageSize */
+ rlwinm r11, r3, 0, 24, 27 /* bits 24-27 */
+
+ /* XXX: As of now we use 256M, 4K pages */
+ cmpwi r11, PPC44x_TLB_256M
+ bne tlb_4k
+ rotlwi r10, r10, 28 /* r10 = 256M */
+ b write_out
+tlb_4k:
+ cmpwi r11, PPC44x_TLB_4K
+ bne default
+ rotlwi r10, r10, 12 /* r10 = 4K */
+ b write_out
+default:
+ rotlwi r10, r10, 10 /* r10 = 1K */
+
+write_out:
+ /*
+ * Write out the tmp 1:1 mapping for this code in other address space
+ * Fixup EPN = RPN , TS=other address space
+ */
+ insrwi r3, r7, 1, 23 /* Bit 23 is TS for PAGEID field */
+
+ /* Write out the tmp mapping entries */
+ tlbwe r3, r24, PPC44x_TLB_PAGEID
+ tlbwe r4, r24, PPC44x_TLB_XLAT
+ tlbwe r5, r24, PPC44x_TLB_ATTRIB
+
+ subi r11, r10, 1 /* PageOffset Mask = PageSize - 1 */
+ not r10, r11 /* Mask for PageNum */
+
+ /* Switch to other address space in MSR */
+ insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */
+
+ bl 1f
+1: mflr r8
+ addi r8, r8, (2f-1b) /* Find the target offset */
+
+ /* Jump to the tmp mapping */
+ mtspr SPRN_SRR0, r8
+ mtspr SPRN_SRR1, r9
+ rfi
+
+2:
+ /* Invalidate the entry we were executing from */
+ li r3, 0
+ tlbwe r3, r23, PPC44x_TLB_PAGEID
+
+ /* attribute fields. rwx for SUPERVISOR mode */
+ li r5, 0
+ ori r5, r5, (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G)
+
+ /* Create 1:1 mapping in 256M pages */
+ xori r7, r7, 1 /* Revert back to Original TS */
+
+ li r8, 0 /* PageNumber */
+ li r6, 3 /* TLB Index, start at 3 */
+
+next_tlb:
+ rotlwi r3, r8, 28 /* Create EPN (bits 0-3) */
+ mr r4, r3 /* RPN = EPN */
+ ori r3, r3, (PPC44x_TLB_VALID | PPC44x_TLB_256M) /* SIZE = 256M, Valid */
+ insrwi r3, r7, 1, 23 /* Set TS from r7 */
+
+ tlbwe r3, r6, PPC44x_TLB_PAGEID /* PageID field : EPN, V, SIZE */
+ tlbwe r4, r6, PPC44x_TLB_XLAT /* Address translation : RPN */
+ tlbwe r5, r6, PPC44x_TLB_ATTRIB /* Attributes */
+
+ addi r8, r8, 1 /* Increment PN */
+ addi r6, r6, 1 /* Increment TLB Index */
+ cmpwi r8, 8 /* Are we done ? */
+ bne next_tlb
+ isync
+
+ /* Jump to the new mapping 1:1 */
+ li r9,0
+ insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */
+
+ bl 1f
+1: mflr r8
+ and r8, r8, r11 /* Get our offset within page */
+ addi r8, r8, (2f-1b)
+
+ and r5, r25, r10 /* Get our target PageNum */
+ or r8, r8, r5 /* Target jump address */
+
+ mtspr SPRN_SRR0, r8
+ mtspr SPRN_SRR1, r9
+ rfi
+2:
+ /* Invalidate the tmp entry we used */
+ li r3, 0
+ tlbwe r3, r24, PPC44x_TLB_PAGEID
+ sync
+ b ppc44x_map_done
+
+#ifdef CONFIG_PPC_47x
+
+ /* 1:1 mapping for 47x */
+
+setup_map_47x:
+
+ /*
+ * Load the kernel pid (0) to PID and also to MMUCR[TID].
+ * Also set the MSR IS->MMUCR STS
+ */
+ li r3, 0
+ mtspr SPRN_PID, r3 /* Set PID */
+ mfmsr r4 /* Get MSR */
+ andi. r4, r4, MSR_IS@l /* TS=1? */
+ beq 1f /* If not, leave STS=0 */
+ oris r3, r3, PPC47x_MMUCR_STS@h /* Set STS=1 */
+1: mtspr SPRN_MMUCR, r3 /* Put MMUCR */
+ sync
+
+ /* Find the entry we are running from */
+ bl 2f
+2: mflr r23
+ tlbsx r23, 0, r23
+ tlbre r24, r23, 0 /* TLB Word 0 */
+ tlbre r25, r23, 1 /* TLB Word 1 */
+ tlbre r26, r23, 2 /* TLB Word 2 */
+
+
+ /*
+ * Invalidates all the tlb entries by writing to 256 RPNs(r4)
+ * of 4k page size in all 4 ways (0-3 in r3).
+ * This would invalidate the entire UTLB including the one we are
+ * running from. However the shadow TLB entries would help us
+ * to continue the execution, until we flush them (rfi/isync).
+ */
+ addis r3, 0, 0x8000 /* specify the way */
+ addi r4, 0, 0 /* TLB Word0 = (EPN=0, VALID = 0) */
+ addi r5, 0, 0
+ b clear_utlb_entry
+
+ /* Align the loop to speed things up. from head_44x.S */
+ .align 6
+
+clear_utlb_entry:
+
+ tlbwe r4, r3, 0
+ tlbwe r5, r3, 1
+ tlbwe r5, r3, 2
+ addis r3, r3, 0x2000 /* Increment the way */
+ cmpwi r3, 0
+ bne clear_utlb_entry
+ addis r3, 0, 0x8000
+ addis r4, r4, 0x100 /* Increment the EPN */
+ cmpwi r4, 0
+ bne clear_utlb_entry
+
+ /* Create the entries in the other address space */
+ mfmsr r5
+ rlwinm r7, r5, 27, 31, 31 /* Get the TS (Bit 26) from MSR */
+ xori r7, r7, 1 /* r7 = !TS */
+
+ insrwi r24, r7, 1, 21 /* Change the TS in the saved TLB word 0 */
+
+ /*
+ * write out the TLB entries for the tmp mapping
+ * Use way '0' so that we could easily invalidate it later.
+ */
+ lis r3, 0x8000 /* Way '0' */
+
+ tlbwe r24, r3, 0
+ tlbwe r25, r3, 1
+ tlbwe r26, r3, 2
+
+ /* Update the msr to the new TS */
+ insrwi r5, r7, 1, 26
+
+ bl 1f
+1: mflr r6
+ addi r6, r6, (2f-1b)
+
+ mtspr SPRN_SRR0, r6
+ mtspr SPRN_SRR1, r5
+ rfi
+
+ /*
+ * Now we are in the tmp address space.
+ * Create a 1:1 mapping for 0-2GiB in the original TS.
+ */
+2:
+ li r3, 0
+ li r4, 0 /* TLB Word 0 */
+ li r5, 0 /* TLB Word 1 */
+ li r6, 0
+ ori r6, r6, PPC47x_TLB2_S_RWX /* TLB word 2 */
+
+ li r8, 0 /* PageIndex */
+
+ xori r7, r7, 1 /* revert back to original TS */
+
+write_utlb:
+ rotlwi r5, r8, 28 /* RPN = PageIndex * 256M */
+ /* ERPN = 0 as we don't use memory above 2G */
+
+ mr r4, r5 /* EPN = RPN */
+ ori r4, r4, (PPC47x_TLB0_VALID | PPC47x_TLB0_256M)
+ insrwi r4, r7, 1, 21 /* Insert the TS to Word 0 */
+
+ tlbwe r4, r3, 0 /* Write out the entries */
+ tlbwe r5, r3, 1
+ tlbwe r6, r3, 2
+ addi r8, r8, 1
+ cmpwi r8, 8 /* Have we completed ? */
+ bne write_utlb
+
+ /* make sure we complete the TLB write up */
+ isync
+
+ /*
+ * Prepare to jump to the 1:1 mapping.
+ * 1) Extract page size of the tmp mapping
+ * DSIZ = TLB_Word0[22:27]
+ * 2) Calculate the physical address of the address
+ * to jump to.
+ */
+ rlwinm r10, r24, 0, 22, 27
+
+ cmpwi r10, PPC47x_TLB0_4K
+ bne 0f
+ li r10, 0x1000 /* r10 = 4k */
+ bl 1f
+
+0:
+ /* Defaults to 256M */
+ lis r10, 0x1000
+
+ bl 1f
+1: mflr r4
+ addi r4, r4, (2f-1b) /* virtual address of 2f */
+
+ subi r11, r10, 1 /* offsetmask = Pagesize - 1 */
+ not r10, r11 /* Pagemask = ~(offsetmask) */
+
+ and r5, r25, r10 /* Physical page */
+ and r6, r4, r11 /* offset within the current page */
+
+ or r5, r5, r6 /* Physical address for 2f */
+
+ /* Switch the TS in MSR to the original one */
+ mfmsr r8
+ insrwi r8, r7, 1, 26
+
+ mtspr SPRN_SRR1, r8
+ mtspr SPRN_SRR0, r5
+ rfi
+
+2:
+ /* Invalidate the tmp mapping */
+ lis r3, 0x8000 /* Way '0' */
+
+ clrrwi r24, r24, 12 /* Clear the valid bit */
+ tlbwe r24, r3, 0
+ tlbwe r25, r3, 1
+ tlbwe r26, r3, 2
+
+ /* Make sure we complete the TLB write and flush the shadow TLB */
+ isync
+
+#endif
+
+ppc44x_map_done:
+
+
+ /* Restore the parameters */
+ mr r3, r29
+ mr r4, r30
+ mr r5, r31
+
+ li r0, 0
#else
li r0, 0
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index e5144906a56..4e314b90c75 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -25,6 +25,7 @@
#include <asm/cputable.h>
#include <asm/thread_info.h>
#include <asm/kexec.h>
+#include <asm/ptrace.h>
.text
@@ -33,20 +34,18 @@ _GLOBAL(call_do_softirq)
std r0,16(r1)
stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3)
mr r1,r3
- bl .__do_softirq
+ bl __do_softirq
ld r1,0(r1)
ld r0,16(r1)
mtlr r0
blr
-_GLOBAL(call_handle_irq)
- ld r8,0(r6)
+_GLOBAL(call_do_irq)
mflr r0
std r0,16(r1)
- mtctr r8
- stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r5)
- mr r1,r5
- bctrl
+ stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4)
+ mr r1,r4
+ bl __do_irq
ld r1,0(r1)
ld r0,16(r1)
mtlr r0
@@ -66,8 +65,11 @@ PPC64_CACHES:
* flush all bytes from start through stop-1 inclusive
*/
-_KPROBE(__flush_icache_range)
-
+_KPROBE(flush_icache_range)
+BEGIN_FTR_SECTION
+ PURGE_PREFETCHED_INS
+ blr
+END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
/*
* Flush the data cache to memory
*
@@ -210,6 +212,11 @@ _GLOBAL(__flush_dcache_icache)
* Different systems have different cache line sizes
*/
+BEGIN_FTR_SECTION
+ PURGE_PREFETCHED_INS
+ blr
+END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
+
/* Flush the dcache */
ld r7,PPC64_CACHES@toc(r2)
clrrdi r3,r3,PAGE_SHIFT /* Page align */
@@ -233,8 +240,53 @@ _GLOBAL(__flush_dcache_icache)
isync
blr
+_GLOBAL(__bswapdi2)
+ srdi r8,r3,32
+ rlwinm r7,r3,8,0xffffffff
+ rlwimi r7,r3,24,0,7
+ rlwinm r9,r8,8,0xffffffff
+ rlwimi r7,r3,24,16,23
+ rlwimi r9,r8,24,0,7
+ rlwimi r9,r8,24,16,23
+ sldi r7,r7,32
+ or r3,r7,r9
+ blr
+
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX
+_GLOBAL(rmci_on)
+ sync
+ isync
+ li r3,0x100
+ rldicl r3,r3,32,0
+ mfspr r5,SPRN_HID4
+ or r5,r5,r3
+ sync
+ mtspr SPRN_HID4,r5
+ isync
+ slbia
+ isync
+ sync
+ blr
+
+_GLOBAL(rmci_off)
+ sync
+ isync
+ li r3,0x100
+ rldicl r3,r3,32,0
+ mfspr r5,SPRN_HID4
+ andc r5,r5,r3
+ sync
+ mtspr SPRN_HID4,r5
+ isync
+ slbia
+ isync
+ sync
+ blr
+#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */
#if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE)
+
/*
* Do an IO access in real mode
*/
@@ -300,11 +352,6 @@ _GLOBAL(real_writeb)
#ifdef CONFIG_PPC_PASEMI
-/* No support in all binutils for these yet, so use defines */
-#define LBZCIX(RT,RA,RB) .long (0x7c0006aa|(RT<<21)|(RA<<16)|(RB << 11))
-#define STBCIX(RS,RA,RB) .long (0x7c0007aa|(RS<<21)|(RA<<16)|(RB << 11))
-
-
_GLOBAL(real_205_readb)
mfmsr r7
ori r0,r7,MSR_DR
@@ -313,7 +360,7 @@ _GLOBAL(real_205_readb)
mtmsrd r0
sync
isync
- LBZCIX(r3,0,r3)
+ LBZCIX(R3,R0,R3)
isync
mtmsrd r7
sync
@@ -328,7 +375,7 @@ _GLOBAL(real_205_writeb)
mtmsrd r0
sync
isync
- STBCIX(r3,0,r4)
+ STBCIX(R3,R0,R4)
isync
mtmsrd r7
sync
@@ -338,7 +385,7 @@ _GLOBAL(real_205_writeb)
#endif /* CONFIG_PPC_PASEMI */
-#ifdef CONFIG_CPU_FREQ_PMAC64
+#if defined(CONFIG_CPU_FREQ_PMAC64) || defined(CONFIG_CPU_FREQ_MAPLE)
/*
* SCOM access functions for 970 (FX only for now)
*
@@ -407,61 +454,15 @@ _GLOBAL(scom970_write)
/* restore interrupts */
mtmsrd r5,1
blr
-#endif /* CONFIG_CPU_FREQ_PMAC64 */
-
-
-/*
- * Create a kernel thread
- * kernel_thread(fn, arg, flags)
- */
-_GLOBAL(kernel_thread)
- std r29,-24(r1)
- std r30,-16(r1)
- stdu r1,-STACK_FRAME_OVERHEAD(r1)
- mr r29,r3
- mr r30,r4
- ori r3,r5,CLONE_VM /* flags */
- oris r3,r3,(CLONE_UNTRACED>>16)
- li r4,0 /* new sp (unused) */
- li r0,__NR_clone
- sc
- bns+ 1f /* did system call indicate error? */
- neg r3,r3 /* if so, make return code negative */
-1: cmpdi 0,r3,0 /* parent or child? */
- bne 2f /* return if parent */
- li r0,0
- stdu r0,-STACK_FRAME_OVERHEAD(r1)
- ld r2,8(r29)
- ld r29,0(r29)
- mtlr r29 /* fn addr in lr */
- mr r3,r30 /* load arg and call fn */
- blrl
- li r0,__NR_exit /* exit after child exits */
- li r3,0
- sc
-2: addi r1,r1,STACK_FRAME_OVERHEAD
- ld r29,-24(r1)
- ld r30,-16(r1)
- blr
-
-/*
- * disable_kernel_fp()
- * Disable the FPU.
- */
-_GLOBAL(disable_kernel_fp)
- mfmsr r3
- rldicl r0,r3,(63-MSR_FP_LG),1
- rldicl r3,r0,(MSR_FP_LG+1),0
- mtmsrd r3 /* disable use of fpu now */
- isync
- blr
+#endif /* CONFIG_CPU_FREQ_PMAC64 || CONFIG_CPU_FREQ_MAPLE */
/* kexec_wait(phys_cpu)
*
* wait for the flag to change, indicating this kernel is going away but
* the slave code for the next one is at addresses 0 to 100.
*
- * This is used by all slaves.
+ * This is used by all slaves, even those that did not find a matching
+ * paca in the secondary startup code.
*
* Physical (hardware) cpu id should be in r3.
*/
@@ -470,10 +471,6 @@ _GLOBAL(kexec_wait)
1: mflr r5
addi r5,r5,kexec_flag-1b
- li r4,KEXEC_STATE_REAL_MODE
- stb r4,PACAKEXECSTATE(r13)
- SYNC
-
99: HMT_LOW
#ifdef CONFIG_KEXEC /* use no memory without kexec */
lwz r4,0(r5)
@@ -498,12 +495,18 @@ kexec_flag:
*
* get phys id from paca
* switch to real mode
+ * mark the paca as no longer used
* join other cpus in kexec_wait(phys_id)
*/
_GLOBAL(kexec_smp_wait)
lhz r3,PACAHWCPUID(r13)
bl real_mode
- b .kexec_wait
+
+ li r4,KEXEC_STATE_REAL_MODE
+ stb r4,PACAKEXECSTATE(r13)
+ SYNC
+
+ b kexec_wait
/*
* switch to real mode (turn mmu off)
@@ -573,7 +576,7 @@ _GLOBAL(kexec_sequence)
/* copy dest pages, flush whole dest image */
mr r3,r29
- bl .kexec_copy_flush /* (image) */
+ bl kexec_copy_flush /* (image) */
/* turn off mmu */
bl real_mode
@@ -583,7 +586,7 @@ _GLOBAL(kexec_sequence)
mr r4,r30 /* start, aka phys mem offset */
li r5,0x100
li r6,0
- bl .copy_and_flush /* (dest, src, copy limit, start offset) */
+ bl copy_and_flush /* (dest, src, copy limit, start offset) */
1: /* assume normal blr return */
/* release other cpus to the new kernel secondary start at 0x60 */
@@ -592,8 +595,12 @@ _GLOBAL(kexec_sequence)
stw r6,kexec_flag-1b(5)
/* clear out hardware hash page table and tlb */
- ld r5,0(r27) /* deref function descriptor */
- mtctr r5
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
+ ld r12,0(r27) /* deref function descriptor */
+#else
+ mr r12,r27
+#endif
+ mtctr r12
bctrl /* ppc_md.hpte_clear_all(void); */
/*
@@ -627,3 +634,31 @@ _GLOBAL(kexec_sequence)
li r5,0
blr /* image->start(physid, image->start, 0); */
#endif /* CONFIG_KEXEC */
+
+#ifdef CONFIG_MODULES
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+
+#ifdef CONFIG_MODVERSIONS
+.weak __crc_TOC.
+.section "___kcrctab+TOC.","a"
+.globl __kcrctab_TOC.
+__kcrctab_TOC.:
+ .llong __crc_TOC.
+#endif
+
+/*
+ * Export a fake .TOC. since both modpost and depmod will complain otherwise.
+ * Both modpost and depmod strip the leading . so we do the same here.
+ */
+.section "__ksymtab_strings","a"
+__kstrtab_TOC.:
+ .asciz "TOC."
+
+.section "___ksymtab+TOC.","a"
+/* This symbol name is important: it's used by modpost to find exported syms */
+.globl __ksymtab_TOC.
+__ksymtab_TOC.:
+ .llong 0 /* .value */
+ .llong __kstrtab_TOC.
+#endif /* ELFv2 */
+#endif /* MODULES */
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index 49cee9df225..9547381b631 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -16,7 +16,6 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include <linux/module.h>
#include <linux/elf.h>
#include <linux/moduleloader.h>
#include <linux/err.h>
@@ -26,25 +25,10 @@
#include <asm/uaccess.h>
#include <asm/firmware.h>
#include <linux/sort.h>
-
-#include "setup.h"
+#include <asm/setup.h>
LIST_HEAD(module_bug_list);
-void *module_alloc(unsigned long size)
-{
- if (size == 0)
- return NULL;
-
- return vmalloc_exec(size);
-}
-
-/* Free memory returned from module_alloc */
-void module_free(struct module *mod, void *module_region)
-{
- vfree(module_region);
-}
-
static const Elf_Shdr *find_section(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
const char *name)
@@ -93,7 +77,3 @@ int module_finalize(const Elf_Ehdr *hdr,
return 0;
}
-
-void module_arch_cleanup(struct module *mod)
-{
-}
diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c
index f832773fc28..6cff040bf45 100644
--- a/arch/powerpc/kernel/module_32.c
+++ b/arch/powerpc/kernel/module_32.c
@@ -26,8 +26,7 @@
#include <linux/cache.h>
#include <linux/bug.h>
#include <linux/sort.h>
-
-#include "setup.h"
+#include <asm/setup.h>
#if 0
#define DEBUGP printk
@@ -174,21 +173,10 @@ int module_frob_arch_sections(Elf32_Ehdr *hdr,
return 0;
}
-int apply_relocate(Elf32_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *module)
-{
- printk(KERN_ERR "%s: Non-ADD RELOCATION unsupported\n",
- module->name);
- return -ENOEXEC;
-}
-
static inline int entry_matches(struct ppc_plt_entry *entry, Elf32_Addr val)
{
- if (entry->jump[0] == 0x3d600000 + ((val + 0x8000) >> 16)
- && entry->jump[1] == 0x396b0000 + (val & 0xffff))
+ if (entry->jump[0] == 0x3d800000 + ((val + 0x8000) >> 16)
+ && entry->jump[1] == 0x398c0000 + (val & 0xffff))
return 1;
return 0;
}
@@ -215,10 +203,9 @@ static uint32_t do_plt_call(void *location,
entry++;
}
- /* Stolen from Paul Mackerras as well... */
- entry->jump[0] = 0x3d600000+((val+0x8000)>>16); /* lis r11,sym@ha */
- entry->jump[1] = 0x396b0000 + (val&0xffff); /* addi r11,r11,sym@l*/
- entry->jump[2] = 0x7d6903a6; /* mtctr r11 */
+ entry->jump[0] = 0x3d800000+((val+0x8000)>>16); /* lis r12,sym@ha */
+ entry->jump[1] = 0x398c0000 + (val&0xffff); /* addi r12,r12,sym@l*/
+ entry->jump[2] = 0x7d8903a6; /* mtctr r12 */
entry->jump[3] = 0x4e800420; /* bctr */
DEBUGP("Initialized plt for 0x%x at %p\n", val, entry);
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 8fbb12508bf..d807ee626af 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -22,12 +22,12 @@
#include <linux/vmalloc.h>
#include <linux/ftrace.h>
#include <linux/bug.h>
+#include <linux/uaccess.h>
#include <asm/module.h>
#include <asm/firmware.h>
#include <asm/code-patching.h>
#include <linux/sort.h>
-
-#include "setup.h"
+#include <asm/setup.h>
/* FIXME: We don't do .init separately. To do this, we'd need to have
a separate r2 value in the init and core section, and stub between
@@ -42,35 +42,170 @@
#define DEBUGP(fmt , ...)
#endif
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+#define R2_STACK_OFFSET 24
+
+/* An address is simply the address of the function. */
+typedef unsigned long func_desc_t;
+
+static func_desc_t func_desc(unsigned long addr)
+{
+ return addr;
+}
+static unsigned long func_addr(unsigned long addr)
+{
+ return addr;
+}
+static unsigned long stub_func_addr(func_desc_t func)
+{
+ return func;
+}
+
+/* PowerPC64 specific values for the Elf64_Sym st_other field. */
+#define STO_PPC64_LOCAL_BIT 5
+#define STO_PPC64_LOCAL_MASK (7 << STO_PPC64_LOCAL_BIT)
+#define PPC64_LOCAL_ENTRY_OFFSET(other) \
+ (((1 << (((other) & STO_PPC64_LOCAL_MASK) >> STO_PPC64_LOCAL_BIT)) >> 2) << 2)
+
+static unsigned int local_entry_offset(const Elf64_Sym *sym)
+{
+ /* sym->st_other indicates offset to local entry point
+ * (otherwise it will assume r12 is the address of the start
+ * of function and try to derive r2 from it). */
+ return PPC64_LOCAL_ENTRY_OFFSET(sym->st_other);
+}
+#else
+#define R2_STACK_OFFSET 40
+
+/* An address is address of the OPD entry, which contains address of fn. */
+typedef struct ppc64_opd_entry func_desc_t;
+
+static func_desc_t func_desc(unsigned long addr)
+{
+ return *(struct ppc64_opd_entry *)addr;
+}
+static unsigned long func_addr(unsigned long addr)
+{
+ return func_desc(addr).funcaddr;
+}
+static unsigned long stub_func_addr(func_desc_t func)
+{
+ return func.funcaddr;
+}
+static unsigned int local_entry_offset(const Elf64_Sym *sym)
+{
+ return 0;
+}
+#endif
+
/* Like PPC32, we need little trampolines to do > 24-bit jumps (into
the kernel itself). But on PPC64, these need to be used for every
jump, actually, to reset r2 (TOC+0x8000). */
struct ppc64_stub_entry
{
- /* 28 byte jump instruction sequence (7 instructions) */
- unsigned char jump[28];
- unsigned char unused[4];
+ /* 28 byte jump instruction sequence (7 instructions). We only
+ * need 6 instructions on ABIv2 but we always allocate 7 so
+ * so we don't have to modify the trampoline load instruction. */
+ u32 jump[7];
+ u32 unused;
/* Data for the above code */
- struct ppc64_opd_entry opd;
+ func_desc_t funcdata;
};
-/* We use a stub to fix up r2 (TOC ptr) and to jump to the (external)
- function which may be more than 24-bits away. We could simply
- patch the new r2 value and function pointer into the stub, but it's
- significantly shorter to put these values at the end of the stub
- code, and patch the stub address (32-bits relative to the TOC ptr,
- r2) into the stub. */
-static struct ppc64_stub_entry ppc64_stub =
-{ .jump = {
- 0x3d, 0x82, 0x00, 0x00, /* addis r12,r2, <high> */
- 0x39, 0x8c, 0x00, 0x00, /* addi r12,r12, <low> */
+/*
+ * PPC64 uses 24 bit jumps, but we need to jump into other modules or
+ * the kernel which may be further. So we jump to a stub.
+ *
+ * For ELFv1 we need to use this to set up the new r2 value (aka TOC
+ * pointer). For ELFv2 it's the callee's responsibility to set up the
+ * new r2, but for both we need to save the old r2.
+ *
+ * We could simply patch the new r2 value and function pointer into
+ * the stub, but it's significantly shorter to put these values at the
+ * end of the stub code, and patch the stub address (32-bits relative
+ * to the TOC ptr, r2) into the stub.
+ */
+
+static u32 ppc64_stub_insns[] = {
+ 0x3d620000, /* addis r11,r2, <high> */
+ 0x396b0000, /* addi r11,r11, <low> */
/* Save current r2 value in magic place on the stack. */
- 0xf8, 0x41, 0x00, 0x28, /* std r2,40(r1) */
- 0xe9, 0x6c, 0x00, 0x20, /* ld r11,32(r12) */
- 0xe8, 0x4c, 0x00, 0x28, /* ld r2,40(r12) */
- 0x7d, 0x69, 0x03, 0xa6, /* mtctr r11 */
- 0x4e, 0x80, 0x04, 0x20 /* bctr */
-} };
+ 0xf8410000|R2_STACK_OFFSET, /* std r2,R2_STACK_OFFSET(r1) */
+ 0xe98b0020, /* ld r12,32(r11) */
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
+ /* Set up new r2 from function descriptor */
+ 0xe84b0028, /* ld r2,40(r11) */
+#endif
+ 0x7d8903a6, /* mtctr r12 */
+ 0x4e800420 /* bctr */
+};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static u32 ppc64_stub_mask[] = {
+ 0xffff0000,
+ 0xffff0000,
+ 0xffffffff,
+ 0xffffffff,
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
+ 0xffffffff,
+#endif
+ 0xffffffff,
+ 0xffffffff
+};
+
+bool is_module_trampoline(u32 *p)
+{
+ unsigned int i;
+ u32 insns[ARRAY_SIZE(ppc64_stub_insns)];
+
+ BUILD_BUG_ON(sizeof(ppc64_stub_insns) != sizeof(ppc64_stub_mask));
+
+ if (probe_kernel_read(insns, p, sizeof(insns)))
+ return -EFAULT;
+
+ for (i = 0; i < ARRAY_SIZE(ppc64_stub_insns); i++) {
+ u32 insna = insns[i];
+ u32 insnb = ppc64_stub_insns[i];
+ u32 mask = ppc64_stub_mask[i];
+
+ if ((insna & mask) != (insnb & mask))
+ return false;
+ }
+
+ return true;
+}
+
+int module_trampoline_target(struct module *mod, u32 *trampoline,
+ unsigned long *target)
+{
+ u32 buf[2];
+ u16 upper, lower;
+ long offset;
+ void *toc_entry;
+
+ if (probe_kernel_read(buf, trampoline, sizeof(buf)))
+ return -EFAULT;
+
+ upper = buf[0] & 0xffff;
+ lower = buf[1] & 0xffff;
+
+ /* perform the addis/addi, both signed */
+ offset = ((short)upper << 16) + (short)lower;
+
+ /*
+ * Now get the address this trampoline jumps to. This
+ * is always 32 bytes into our trampoline stub.
+ */
+ toc_entry = (void *)mod->arch.toc + offset + 32;
+
+ if (probe_kernel_read(target, toc_entry, sizeof(*target)))
+ return -EFAULT;
+
+ return 0;
+}
+
+#endif
/* Count how many different 24-bit relocations (different symbol,
different addend) */
@@ -173,17 +308,27 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
return relocs * sizeof(struct ppc64_stub_entry);
}
+/* Still needed for ELFv2, for .TOC. */
static void dedotify_versions(struct modversion_info *vers,
unsigned long size)
{
struct modversion_info *end;
for (end = (void *)vers + size; vers < end; vers++)
- if (vers->name[0] == '.')
+ if (vers->name[0] == '.') {
memmove(vers->name, vers->name+1, strlen(vers->name));
+#ifdef ARCH_RELOCATES_KCRCTAB
+ /* The TOC symbol has no CRC computed. To avoid CRC
+ * check failing, we must force it to the expected
+ * value (see CRC check in module.c).
+ */
+ if (!strcmp(vers->name, "TOC."))
+ vers->crc = -(unsigned long)reloc_start;
+#endif
+ }
}
-/* Undefined symbols which refer to .funcname, hack to funcname */
+/* Undefined symbols which refer to .funcname, hack to funcname (or .TOC.) */
static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab)
{
unsigned int i;
@@ -197,6 +342,24 @@ static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab)
}
}
+static Elf64_Sym *find_dot_toc(Elf64_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex)
+{
+ unsigned int i, numsyms;
+ Elf64_Sym *syms;
+
+ syms = (Elf64_Sym *)sechdrs[symindex].sh_addr;
+ numsyms = sechdrs[symindex].sh_size / sizeof(Elf64_Sym);
+
+ for (i = 1; i < numsyms; i++) {
+ if (syms[i].st_shndx == SHN_UNDEF
+ && strcmp(strtab + syms[i].st_name, "TOC.") == 0)
+ return &syms[i];
+ }
+ return NULL;
+}
+
int module_frob_arch_sections(Elf64_Ehdr *hdr,
Elf64_Shdr *sechdrs,
char *secstrings,
@@ -243,16 +406,6 @@ int module_frob_arch_sections(Elf64_Ehdr *hdr,
return 0;
}
-int apply_relocate(Elf64_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- printk(KERN_ERR "%s: Non-ADD RELOCATION unsupported\n", me->name);
- return -ENOEXEC;
-}
-
/* r2 is the TOC pointer: it actually points 0x8000 into the TOC (this
gives the value maximum span in an instruction which uses a signed
offset) */
@@ -271,16 +424,12 @@ static inline unsigned long my_r2(Elf64_Shdr *sechdrs, struct module *me)
/* Patch stub to reference function and correct r2 value. */
static inline int create_stub(Elf64_Shdr *sechdrs,
struct ppc64_stub_entry *entry,
- struct ppc64_opd_entry *opd,
+ unsigned long addr,
struct module *me)
{
- Elf64_Half *loc1, *loc2;
long reladdr;
- *entry = ppc64_stub;
-
- loc1 = (Elf64_Half *)&entry->jump[2];
- loc2 = (Elf64_Half *)&entry->jump[6];
+ memcpy(entry->jump, ppc64_stub_insns, sizeof(ppc64_stub_insns));
/* Stub uses address relative to r2. */
reladdr = (unsigned long)entry - my_r2(sechdrs, me);
@@ -291,35 +440,33 @@ static inline int create_stub(Elf64_Shdr *sechdrs,
}
DEBUGP("Stub %p get data from reladdr %li\n", entry, reladdr);
- *loc1 = PPC_HA(reladdr);
- *loc2 = PPC_LO(reladdr);
- entry->opd.funcaddr = opd->funcaddr;
- entry->opd.r2 = opd->r2;
+ entry->jump[0] |= PPC_HA(reladdr);
+ entry->jump[1] |= PPC_LO(reladdr);
+ entry->funcdata = func_desc(addr);
return 1;
}
-/* Create stub to jump to function described in this OPD: we need the
+/* Create stub to jump to function described in this OPD/ptr: we need the
stub to set up the TOC ptr (r2) for the function. */
static unsigned long stub_for_addr(Elf64_Shdr *sechdrs,
- unsigned long opdaddr,
+ unsigned long addr,
struct module *me)
{
struct ppc64_stub_entry *stubs;
- struct ppc64_opd_entry *opd = (void *)opdaddr;
unsigned int i, num_stubs;
num_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*stubs);
/* Find this stub, or if that fails, the next avail. entry */
stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr;
- for (i = 0; stubs[i].opd.funcaddr; i++) {
+ for (i = 0; stub_func_addr(stubs[i].funcdata); i++) {
BUG_ON(i >= num_stubs);
- if (stubs[i].opd.funcaddr == opd->funcaddr)
+ if (stub_func_addr(stubs[i].funcdata) == func_addr(addr))
return (unsigned long)&stubs[i];
}
- if (!create_stub(sechdrs, &stubs[i], opd, me))
+ if (!create_stub(sechdrs, &stubs[i], addr, me))
return 0;
return (unsigned long)&stubs[i];
@@ -334,7 +481,8 @@ static int restore_r2(u32 *instruction, struct module *me)
me->name, *instruction);
return 0;
}
- *instruction = 0xe8410028; /* ld r2,40(r1) */
+ /* ld r2,R2_STACK_OFFSET(r1) */
+ *instruction = 0xe8410000 | R2_STACK_OFFSET;
return 1;
}
@@ -352,6 +500,17 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
DEBUGP("Applying ADD relocate section %u to %u\n", relsec,
sechdrs[relsec].sh_info);
+
+ /* First time we're called, we can fix up .TOC. */
+ if (!me->arch.toc_fixed) {
+ sym = find_dot_toc(sechdrs, strtab, symindex);
+ /* It's theoretically possible that a module doesn't want a
+ * .TOC. so don't fail it just for that. */
+ if (sym)
+ sym->st_value = my_r2(sechdrs, me);
+ me->arch.toc_fixed = true;
+ }
+
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rela); i++) {
/* This is where to make the change */
location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -396,6 +555,14 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
| (value & 0xffff);
break;
+ case R_PPC64_TOC16_LO:
+ /* Subtract TOC pointer */
+ value -= my_r2(sechdrs, me);
+ *((uint16_t *) location)
+ = (*((uint16_t *) location) & ~0xffff)
+ | (value & 0xffff);
+ break;
+
case R_PPC64_TOC16_DS:
/* Subtract TOC pointer */
value -= my_r2(sechdrs, me);
@@ -409,6 +576,28 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
| (value & 0xfffc);
break;
+ case R_PPC64_TOC16_LO_DS:
+ /* Subtract TOC pointer */
+ value -= my_r2(sechdrs, me);
+ if ((value & 3) != 0) {
+ printk("%s: bad TOC16_LO_DS relocation (%lu)\n",
+ me->name, value);
+ return -ENOEXEC;
+ }
+ *((uint16_t *) location)
+ = (*((uint16_t *) location) & ~0xfffc)
+ | (value & 0xfffc);
+ break;
+
+ case R_PPC64_TOC16_HA:
+ /* Subtract TOC pointer */
+ value -= my_r2(sechdrs, me);
+ value = ((value + 0x8000) >> 16);
+ *((uint16_t *) location)
+ = (*((uint16_t *) location) & ~0xffff)
+ | (value & 0xffff);
+ break;
+
case R_PPC_REL24:
/* FIXME: Handle weak symbols here --RR */
if (sym->st_shndx == SHN_UNDEF) {
@@ -418,7 +607,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
return -ENOENT;
if (!restore_r2((u32 *)location + 1, me))
return -ENOEXEC;
- }
+ } else
+ value += local_entry_offset(sym);
/* Convert value to relative */
value -= (unsigned long)location;
@@ -439,6 +629,31 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
*location = value - (unsigned long)location;
break;
+ case R_PPC64_TOCSAVE:
+ /*
+ * Marker reloc indicates we don't have to save r2.
+ * That would only save us one instruction, so ignore
+ * it.
+ */
+ break;
+
+ case R_PPC64_REL16_HA:
+ /* Subtract location pointer */
+ value -= (unsigned long)location;
+ value = ((value + 0x8000) >> 16);
+ *((uint16_t *) location)
+ = (*((uint16_t *) location) & ~0xffff)
+ | (value & 0xffff);
+ break;
+
+ case R_PPC64_REL16_LO:
+ /* Subtract location pointer */
+ value -= (unsigned long)location;
+ *((uint16_t *) location)
+ = (*((uint16_t *) location) & ~0xffff)
+ | (value & 0xffff);
+ break;
+
default:
printk("%s: Unknown ADD relocation: %lu\n",
me->name,
diff --git a/arch/powerpc/kernel/mpc7450-pmu.c b/arch/powerpc/kernel/mpc7450-pmu.c
deleted file mode 100644
index 09d72028f31..00000000000
--- a/arch/powerpc/kernel/mpc7450-pmu.c
+++ /dev/null
@@ -1,417 +0,0 @@
-/*
- * Performance counter support for MPC7450-family processors.
- *
- * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/string.h>
-#include <linux/perf_event.h>
-#include <asm/reg.h>
-#include <asm/cputable.h>
-
-#define N_COUNTER 6 /* Number of hardware counters */
-#define MAX_ALT 3 /* Maximum number of event alternative codes */
-
-/*
- * Bits in event code for MPC7450 family
- */
-#define PM_THRMULT_MSKS 0x40000
-#define PM_THRESH_SH 12
-#define PM_THRESH_MSK 0x3f
-#define PM_PMC_SH 8
-#define PM_PMC_MSK 7
-#define PM_PMCSEL_MSK 0x7f
-
-/*
- * Classify events according to how specific their PMC requirements are.
- * Result is:
- * 0: can go on any PMC
- * 1: can go on PMCs 1-4
- * 2: can go on PMCs 1,2,4
- * 3: can go on PMCs 1 or 2
- * 4: can only go on one PMC
- * -1: event code is invalid
- */
-#define N_CLASSES 5
-
-static int mpc7450_classify_event(u32 event)
-{
- int pmc;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc) {
- if (pmc > N_COUNTER)
- return -1;
- return 4;
- }
- event &= PM_PMCSEL_MSK;
- if (event <= 1)
- return 0;
- if (event <= 7)
- return 1;
- if (event <= 13)
- return 2;
- if (event <= 22)
- return 3;
- return -1;
-}
-
-/*
- * Events using threshold and possible threshold scale:
- * code scale? name
- * 11e N PM_INSTQ_EXCEED_CYC
- * 11f N PM_ALTV_IQ_EXCEED_CYC
- * 128 Y PM_DTLB_SEARCH_EXCEED_CYC
- * 12b Y PM_LD_MISS_EXCEED_L1_CYC
- * 220 N PM_CQ_EXCEED_CYC
- * 30c N PM_GPR_RB_EXCEED_CYC
- * 30d ? PM_FPR_IQ_EXCEED_CYC ?
- * 311 Y PM_ITLB_SEARCH_EXCEED
- * 410 N PM_GPR_IQ_EXCEED_CYC
- */
-
-/*
- * Return use of threshold and threshold scale bits:
- * 0 = uses neither, 1 = uses threshold, 2 = uses both
- */
-static int mpc7450_threshold_use(u32 event)
-{
- int pmc, sel;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- sel = event & PM_PMCSEL_MSK;
- switch (pmc) {
- case 1:
- if (sel == 0x1e || sel == 0x1f)
- return 1;
- if (sel == 0x28 || sel == 0x2b)
- return 2;
- break;
- case 2:
- if (sel == 0x20)
- return 1;
- break;
- case 3:
- if (sel == 0xc || sel == 0xd)
- return 1;
- if (sel == 0x11)
- return 2;
- break;
- case 4:
- if (sel == 0x10)
- return 1;
- break;
- }
- return 0;
-}
-
-/*
- * Layout of constraint bits:
- * 33222222222211111111110000000000
- * 10987654321098765432109876543210
- * |< >< > < > < ><><><><><><>
- * TS TV G4 G3 G2P6P5P4P3P2P1
- *
- * P1 - P6
- * 0 - 11: Count of events needing PMC1 .. PMC6
- *
- * G2
- * 12 - 14: Count of events needing PMC1 or PMC2
- *
- * G3
- * 16 - 18: Count of events needing PMC1, PMC2 or PMC4
- *
- * G4
- * 20 - 23: Count of events needing PMC1, PMC2, PMC3 or PMC4
- *
- * TV
- * 24 - 29: Threshold value requested
- *
- * TS
- * 30: Threshold scale value requested
- */
-
-static u32 pmcbits[N_COUNTER][2] = {
- { 0x00844002, 0x00111001 }, /* PMC1 mask, value: P1,G2,G3,G4 */
- { 0x00844008, 0x00111004 }, /* PMC2: P2,G2,G3,G4 */
- { 0x00800020, 0x00100010 }, /* PMC3: P3,G4 */
- { 0x00840080, 0x00110040 }, /* PMC4: P4,G3,G4 */
- { 0x00000200, 0x00000100 }, /* PMC5: P5 */
- { 0x00000800, 0x00000400 } /* PMC6: P6 */
-};
-
-static u32 classbits[N_CLASSES - 1][2] = {
- { 0x00000000, 0x00000000 }, /* class 0: no constraint */
- { 0x00800000, 0x00100000 }, /* class 1: G4 */
- { 0x00040000, 0x00010000 }, /* class 2: G3 */
- { 0x00004000, 0x00001000 }, /* class 3: G2 */
-};
-
-static int mpc7450_get_constraint(u64 event, unsigned long *maskp,
- unsigned long *valp)
-{
- int pmc, class;
- u32 mask, value;
- int thresh, tuse;
-
- class = mpc7450_classify_event(event);
- if (class < 0)
- return -1;
- if (class == 4) {
- pmc = ((unsigned int)event >> PM_PMC_SH) & PM_PMC_MSK;
- mask = pmcbits[pmc - 1][0];
- value = pmcbits[pmc - 1][1];
- } else {
- mask = classbits[class][0];
- value = classbits[class][1];
- }
-
- tuse = mpc7450_threshold_use(event);
- if (tuse) {
- thresh = ((unsigned int)event >> PM_THRESH_SH) & PM_THRESH_MSK;
- mask |= 0x3f << 24;
- value |= thresh << 24;
- if (tuse == 2) {
- mask |= 0x40000000;
- if ((unsigned int)event & PM_THRMULT_MSKS)
- value |= 0x40000000;
- }
- }
-
- *maskp = mask;
- *valp = value;
- return 0;
-}
-
-static const unsigned int event_alternatives[][MAX_ALT] = {
- { 0x217, 0x317 }, /* PM_L1_DCACHE_MISS */
- { 0x418, 0x50f, 0x60f }, /* PM_SNOOP_RETRY */
- { 0x502, 0x602 }, /* PM_L2_HIT */
- { 0x503, 0x603 }, /* PM_L3_HIT */
- { 0x504, 0x604 }, /* PM_L2_ICACHE_MISS */
- { 0x505, 0x605 }, /* PM_L3_ICACHE_MISS */
- { 0x506, 0x606 }, /* PM_L2_DCACHE_MISS */
- { 0x507, 0x607 }, /* PM_L3_DCACHE_MISS */
- { 0x50a, 0x623 }, /* PM_LD_HIT_L3 */
- { 0x50b, 0x624 }, /* PM_ST_HIT_L3 */
- { 0x50d, 0x60d }, /* PM_L2_TOUCH_HIT */
- { 0x50e, 0x60e }, /* PM_L3_TOUCH_HIT */
- { 0x512, 0x612 }, /* PM_INT_LOCAL */
- { 0x513, 0x61d }, /* PM_L2_MISS */
- { 0x514, 0x61e }, /* PM_L3_MISS */
-};
-
-/*
- * Scan the alternatives table for a match and return the
- * index into the alternatives table if found, else -1.
- */
-static int find_alternative(u32 event)
-{
- int i, j;
-
- for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
- if (event < event_alternatives[i][0])
- break;
- for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
- if (event == event_alternatives[i][j])
- return i;
- }
- return -1;
-}
-
-static int mpc7450_get_alternatives(u64 event, unsigned int flags, u64 alt[])
-{
- int i, j, nalt = 1;
- u32 ae;
-
- alt[0] = event;
- nalt = 1;
- i = find_alternative((u32)event);
- if (i >= 0) {
- for (j = 0; j < MAX_ALT; ++j) {
- ae = event_alternatives[i][j];
- if (ae && ae != (u32)event)
- alt[nalt++] = ae;
- }
- }
- return nalt;
-}
-
-/*
- * Bitmaps of which PMCs each class can use for classes 0 - 3.
- * Bit i is set if PMC i+1 is usable.
- */
-static const u8 classmap[N_CLASSES] = {
- 0x3f, 0x0f, 0x0b, 0x03, 0
-};
-
-/* Bit position and width of each PMCSEL field */
-static const int pmcsel_shift[N_COUNTER] = {
- 6, 0, 27, 22, 17, 11
-};
-static const u32 pmcsel_mask[N_COUNTER] = {
- 0x7f, 0x3f, 0x1f, 0x1f, 0x1f, 0x3f
-};
-
-/*
- * Compute MMCR0/1/2 values for a set of events.
- */
-static int mpc7450_compute_mmcr(u64 event[], int n_ev,
- unsigned int hwc[], unsigned long mmcr[])
-{
- u8 event_index[N_CLASSES][N_COUNTER];
- int n_classevent[N_CLASSES];
- int i, j, class, tuse;
- u32 pmc_inuse = 0, pmc_avail;
- u32 mmcr0 = 0, mmcr1 = 0, mmcr2 = 0;
- u32 ev, pmc, thresh;
-
- if (n_ev > N_COUNTER)
- return -1;
-
- /* First pass: count usage in each class */
- for (i = 0; i < N_CLASSES; ++i)
- n_classevent[i] = 0;
- for (i = 0; i < n_ev; ++i) {
- class = mpc7450_classify_event(event[i]);
- if (class < 0)
- return -1;
- j = n_classevent[class]++;
- event_index[class][j] = i;
- }
-
- /* Second pass: allocate PMCs from most specific event to least */
- for (class = N_CLASSES - 1; class >= 0; --class) {
- for (i = 0; i < n_classevent[class]; ++i) {
- ev = event[event_index[class][i]];
- if (class == 4) {
- pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc_inuse & (1 << (pmc - 1)))
- return -1;
- } else {
- /* Find a suitable PMC */
- pmc_avail = classmap[class] & ~pmc_inuse;
- if (!pmc_avail)
- return -1;
- pmc = ffs(pmc_avail);
- }
- pmc_inuse |= 1 << (pmc - 1);
-
- tuse = mpc7450_threshold_use(ev);
- if (tuse) {
- thresh = (ev >> PM_THRESH_SH) & PM_THRESH_MSK;
- mmcr0 |= thresh << 16;
- if (tuse == 2 && (ev & PM_THRMULT_MSKS))
- mmcr2 = 0x80000000;
- }
- ev &= pmcsel_mask[pmc - 1];
- ev <<= pmcsel_shift[pmc - 1];
- if (pmc <= 2)
- mmcr0 |= ev;
- else
- mmcr1 |= ev;
- hwc[event_index[class][i]] = pmc - 1;
- }
- }
-
- if (pmc_inuse & 1)
- mmcr0 |= MMCR0_PMC1CE;
- if (pmc_inuse & 0x3e)
- mmcr0 |= MMCR0_PMCnCE;
-
- /* Return MMCRx values */
- mmcr[0] = mmcr0;
- mmcr[1] = mmcr1;
- mmcr[2] = mmcr2;
- return 0;
-}
-
-/*
- * Disable counting by a PMC.
- * Note that the pmc argument is 0-based here, not 1-based.
- */
-static void mpc7450_disable_pmc(unsigned int pmc, unsigned long mmcr[])
-{
- if (pmc <= 1)
- mmcr[0] &= ~(pmcsel_mask[pmc] << pmcsel_shift[pmc]);
- else
- mmcr[1] &= ~(pmcsel_mask[pmc] << pmcsel_shift[pmc]);
-}
-
-static int mpc7450_generic_events[] = {
- [PERF_COUNT_HW_CPU_CYCLES] = 1,
- [PERF_COUNT_HW_INSTRUCTIONS] = 2,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x217, /* PM_L1_DCACHE_MISS */
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x122, /* PM_BR_CMPL */
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x41c, /* PM_BR_MPRED */
-};
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-/*
- * Table of generalized cache-related events.
- * 0 means not supported, -1 means nonsensical, other values
- * are event codes.
- */
-static int mpc7450_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
- [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x225 },
- [C(OP_WRITE)] = { 0, 0x227 },
- [C(OP_PREFETCH)] = { 0, 0 },
- },
- [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x129, 0x115 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { 0x634, 0 },
- },
- [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0 },
- [C(OP_WRITE)] = { 0, 0 },
- [C(OP_PREFETCH)] = { 0, 0 },
- },
- [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x312 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x223 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x122, 0x41c },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
-};
-
-struct power_pmu mpc7450_pmu = {
- .name = "MPC7450 family",
- .n_counter = N_COUNTER,
- .max_alternatives = MAX_ALT,
- .add_fields = 0x00111555ul,
- .test_adder = 0x00301000ul,
- .compute_mmcr = mpc7450_compute_mmcr,
- .get_constraint = mpc7450_get_constraint,
- .get_alternatives = mpc7450_get_alternatives,
- .disable_pmc = mpc7450_disable_pmc,
- .n_generic = ARRAY_SIZE(mpc7450_generic_events),
- .generic_events = mpc7450_generic_events,
- .cache_events = &mpc7450_cache_events,
-};
-
-static int init_mpc7450_pmu(void)
-{
- if (!cur_cpu_spec->oprofile_cpu_type ||
- strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc/7450"))
- return -ENODEV;
-
- return register_power_pmu(&mpc7450_pmu);
-}
-
-arch_initcall(init_mpc7450_pmu);
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 9cf197f01e9..28b898e6818 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -34,15 +34,26 @@
#undef DEBUG_NVRAM
-static struct nvram_partition * nvram_part;
-static long nvram_error_log_index = -1;
-static long nvram_error_log_size = 0;
+#define NVRAM_HEADER_LEN sizeof(struct nvram_header)
+#define NVRAM_BLOCK_LEN NVRAM_HEADER_LEN
+
+/* If change this size, then change the size of NVNAME_LEN */
+struct nvram_header {
+ unsigned char signature;
+ unsigned char checksum;
+ unsigned short length;
+ /* Terminating null required only for names < 12 chars. */
+ char name[12];
+};
-struct err_log_info {
- int error_type;
- unsigned int seq_num;
+struct nvram_partition {
+ struct list_head partition;
+ struct nvram_header header;
+ unsigned int index;
};
+static LIST_HEAD(nvram_partitions);
+
static loff_t dev_nvram_llseek(struct file *file, loff_t offset, int origin)
{
int size;
@@ -73,22 +84,30 @@ static ssize_t dev_nvram_read(struct file *file, char __user *buf,
char *tmp = NULL;
ssize_t size;
- ret = -ENODEV;
- if (!ppc_md.nvram_size)
+ if (!ppc_md.nvram_size) {
+ ret = -ENODEV;
goto out;
+ }
- ret = 0;
size = ppc_md.nvram_size();
- if (*ppos >= size || size < 0)
+ if (size < 0) {
+ ret = size;
goto out;
+ }
+
+ if (*ppos >= size) {
+ ret = 0;
+ goto out;
+ }
count = min_t(size_t, count, size - *ppos);
count = min(count, PAGE_SIZE);
- ret = -ENOMEM;
tmp = kmalloc(count, GFP_KERNEL);
- if (!tmp)
+ if (!tmp) {
+ ret = -ENOMEM;
goto out;
+ }
ret = ppc_md.nvram_read(tmp, count, ppos);
if (ret <= 0)
@@ -186,14 +205,12 @@ static struct miscdevice nvram_dev = {
#ifdef DEBUG_NVRAM
static void __init nvram_print_partitions(char * label)
{
- struct list_head * p;
struct nvram_partition * tmp_part;
printk(KERN_WARNING "--------%s---------\n", label);
printk(KERN_WARNING "indx\t\tsig\tchks\tlen\tname\n");
- list_for_each(p, &nvram_part->partition) {
- tmp_part = list_entry(p, struct nvram_partition, partition);
- printk(KERN_WARNING "%4d \t%02x\t%02x\t%d\t%s\n",
+ list_for_each_entry(tmp_part, &nvram_partitions, partition) {
+ printk(KERN_WARNING "%4d \t%02x\t%02x\t%d\t%12.12s\n",
tmp_part->index, tmp_part->header.signature,
tmp_part->header.checksum, tmp_part->header.length,
tmp_part->header.name);
@@ -206,9 +223,13 @@ static int __init nvram_write_header(struct nvram_partition * part)
{
loff_t tmp_index;
int rc;
-
+ struct nvram_header phead;
+
+ memcpy(&phead, &part->header, NVRAM_HEADER_LEN);
+ phead.length = cpu_to_be16(phead.length);
+
tmp_index = part->index;
- rc = ppc_md.nvram_write((char *)&part->header, NVRAM_HEADER_LEN, &tmp_index);
+ rc = ppc_md.nvram_write((char *)&phead, NVRAM_HEADER_LEN, &tmp_index);
return rc;
}
@@ -228,95 +249,136 @@ static unsigned char __init nvram_checksum(struct nvram_header *p)
return c_sum;
}
-static int __init nvram_remove_os_partition(void)
+/*
+ * Per the criteria passed via nvram_remove_partition(), should this
+ * partition be removed? 1=remove, 0=keep
+ */
+static int nvram_can_remove_partition(struct nvram_partition *part,
+ const char *name, int sig, const char *exceptions[])
+{
+ if (part->header.signature != sig)
+ return 0;
+ if (name) {
+ if (strncmp(name, part->header.name, 12))
+ return 0;
+ } else if (exceptions) {
+ const char **except;
+ for (except = exceptions; *except; except++) {
+ if (!strncmp(*except, part->header.name, 12))
+ return 0;
+ }
+ }
+ return 1;
+}
+
+/**
+ * nvram_remove_partition - Remove one or more partitions in nvram
+ * @name: name of the partition to remove, or NULL for a
+ * signature only match
+ * @sig: signature of the partition(s) to remove
+ * @exceptions: When removing all partitions with a matching signature,
+ * leave these alone.
+ */
+
+int __init nvram_remove_partition(const char *name, int sig,
+ const char *exceptions[])
{
- struct list_head *i;
- struct list_head *j;
- struct nvram_partition * part;
- struct nvram_partition * cur_part;
+ struct nvram_partition *part, *prev, *tmp;
int rc;
- list_for_each(i, &nvram_part->partition) {
- part = list_entry(i, struct nvram_partition, partition);
- if (part->header.signature != NVRAM_SIG_OS)
+ list_for_each_entry(part, &nvram_partitions, partition) {
+ if (!nvram_can_remove_partition(part, name, sig, exceptions))
continue;
-
- /* Make os partition a free partition */
+
+ /* Make partition a free partition */
part->header.signature = NVRAM_SIG_FREE;
- sprintf(part->header.name, "wwwwwwwwwwww");
+ strncpy(part->header.name, "wwwwwwwwwwww", 12);
part->header.checksum = nvram_checksum(&part->header);
-
- /* Merge contiguous free partitions backwards */
- list_for_each_prev(j, &part->partition) {
- cur_part = list_entry(j, struct nvram_partition, partition);
- if (cur_part == nvram_part || cur_part->header.signature != NVRAM_SIG_FREE) {
- break;
- }
-
- part->header.length += cur_part->header.length;
- part->header.checksum = nvram_checksum(&part->header);
- part->index = cur_part->index;
-
- list_del(&cur_part->partition);
- kfree(cur_part);
- j = &part->partition; /* fixup our loop */
- }
-
- /* Merge contiguous free partitions forwards */
- list_for_each(j, &part->partition) {
- cur_part = list_entry(j, struct nvram_partition, partition);
- if (cur_part == nvram_part || cur_part->header.signature != NVRAM_SIG_FREE) {
- break;
- }
-
- part->header.length += cur_part->header.length;
- part->header.checksum = nvram_checksum(&part->header);
-
- list_del(&cur_part->partition);
- kfree(cur_part);
- j = &part->partition; /* fixup our loop */
- }
-
rc = nvram_write_header(part);
if (rc <= 0) {
- printk(KERN_ERR "nvram_remove_os_partition: nvram_write failed (%d)\n", rc);
+ printk(KERN_ERR "nvram_remove_partition: nvram_write failed (%d)\n", rc);
return rc;
}
+ }
+ /* Merge contiguous ones */
+ prev = NULL;
+ list_for_each_entry_safe(part, tmp, &nvram_partitions, partition) {
+ if (part->header.signature != NVRAM_SIG_FREE) {
+ prev = NULL;
+ continue;
+ }
+ if (prev) {
+ prev->header.length += part->header.length;
+ prev->header.checksum = nvram_checksum(&part->header);
+ rc = nvram_write_header(part);
+ if (rc <= 0) {
+ printk(KERN_ERR "nvram_remove_partition: nvram_write failed (%d)\n", rc);
+ return rc;
+ }
+ list_del(&part->partition);
+ kfree(part);
+ } else
+ prev = part;
}
return 0;
}
-/* nvram_create_os_partition
+/**
+ * nvram_create_partition - Create a partition in nvram
+ * @name: name of the partition to create
+ * @sig: signature of the partition to create
+ * @req_size: size of data to allocate in bytes
+ * @min_size: minimum acceptable size (0 means req_size)
*
- * Create a OS linux partition to buffer error logs.
- * Will create a partition starting at the first free
- * space found if space has enough room.
+ * Returns a negative error code or a positive nvram index
+ * of the beginning of the data area of the newly created
+ * partition. If you provided a min_size smaller than req_size
+ * you need to query for the actual size yourself after the
+ * call using nvram_partition_get_size().
*/
-static int __init nvram_create_os_partition(void)
+loff_t __init nvram_create_partition(const char *name, int sig,
+ int req_size, int min_size)
{
struct nvram_partition *part;
struct nvram_partition *new_part;
struct nvram_partition *free_part = NULL;
- int seq_init[2] = { 0, 0 };
+ static char nv_init_vals[16];
loff_t tmp_index;
long size = 0;
int rc;
-
+
+ /* Convert sizes from bytes to blocks */
+ req_size = _ALIGN_UP(req_size, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN;
+ min_size = _ALIGN_UP(min_size, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN;
+
+ /* If no minimum size specified, make it the same as the
+ * requested size
+ */
+ if (min_size == 0)
+ min_size = req_size;
+ if (min_size > req_size)
+ return -EINVAL;
+
+ /* Now add one block to each for the header */
+ req_size += 1;
+ min_size += 1;
+
/* Find a free partition that will give us the maximum needed size
If can't find one that will give us the minimum size needed */
- list_for_each_entry(part, &nvram_part->partition, partition) {
+ list_for_each_entry(part, &nvram_partitions, partition) {
if (part->header.signature != NVRAM_SIG_FREE)
continue;
- if (part->header.length >= NVRAM_MAX_REQ) {
- size = NVRAM_MAX_REQ;
+ if (part->header.length >= req_size) {
+ size = req_size;
free_part = part;
break;
}
- if (!size && part->header.length >= NVRAM_MIN_REQ) {
- size = NVRAM_MIN_REQ;
+ if (part->header.length > size &&
+ part->header.length >= min_size) {
+ size = part->header.length;
free_part = part;
}
}
@@ -326,136 +388,95 @@ static int __init nvram_create_os_partition(void)
/* Create our OS partition */
new_part = kmalloc(sizeof(*new_part), GFP_KERNEL);
if (!new_part) {
- printk(KERN_ERR "nvram_create_os_partition: kmalloc failed\n");
+ pr_err("nvram_create_os_partition: kmalloc failed\n");
return -ENOMEM;
}
new_part->index = free_part->index;
- new_part->header.signature = NVRAM_SIG_OS;
+ new_part->header.signature = sig;
new_part->header.length = size;
- strcpy(new_part->header.name, "ppc64,linux");
+ strncpy(new_part->header.name, name, 12);
new_part->header.checksum = nvram_checksum(&new_part->header);
rc = nvram_write_header(new_part);
if (rc <= 0) {
- printk(KERN_ERR "nvram_create_os_partition: nvram_write_header "
- "failed (%d)\n", rc);
- return rc;
- }
-
- /* make sure and initialize to zero the sequence number and the error
- type logged */
- tmp_index = new_part->index + NVRAM_HEADER_LEN;
- rc = ppc_md.nvram_write((char *)&seq_init, sizeof(seq_init), &tmp_index);
- if (rc <= 0) {
- printk(KERN_ERR "nvram_create_os_partition: nvram_write "
+ pr_err("nvram_create_os_partition: nvram_write_header "
"failed (%d)\n", rc);
return rc;
}
-
- nvram_error_log_index = new_part->index + NVRAM_HEADER_LEN;
- nvram_error_log_size = ((part->header.length - 1) *
- NVRAM_BLOCK_LEN) - sizeof(struct err_log_info);
-
list_add_tail(&new_part->partition, &free_part->partition);
- if (free_part->header.length <= size) {
+ /* Adjust or remove the partition we stole the space from */
+ if (free_part->header.length > size) {
+ free_part->index += size * NVRAM_BLOCK_LEN;
+ free_part->header.length -= size;
+ free_part->header.checksum = nvram_checksum(&free_part->header);
+ rc = nvram_write_header(free_part);
+ if (rc <= 0) {
+ pr_err("nvram_create_os_partition: nvram_write_header "
+ "failed (%d)\n", rc);
+ return rc;
+ }
+ } else {
list_del(&free_part->partition);
kfree(free_part);
- return 0;
}
- /* Adjust the partition we stole the space from */
- free_part->index += size * NVRAM_BLOCK_LEN;
- free_part->header.length -= size;
- free_part->header.checksum = nvram_checksum(&free_part->header);
-
- rc = nvram_write_header(free_part);
- if (rc <= 0) {
- printk(KERN_ERR "nvram_create_os_partition: nvram_write_header "
- "failed (%d)\n", rc);
- return rc;
+ /* Clear the new partition */
+ for (tmp_index = new_part->index + NVRAM_HEADER_LEN;
+ tmp_index < ((size - 1) * NVRAM_BLOCK_LEN);
+ tmp_index += NVRAM_BLOCK_LEN) {
+ rc = ppc_md.nvram_write(nv_init_vals, NVRAM_BLOCK_LEN, &tmp_index);
+ if (rc <= 0) {
+ pr_err("nvram_create_partition: nvram_write failed (%d)\n", rc);
+ return rc;
+ }
}
-
- return 0;
+
+ return new_part->index + NVRAM_HEADER_LEN;
}
-
-/* nvram_setup_partition
- *
- * This will setup the partition we need for buffering the
- * error logs and cleanup partitions if needed.
- *
- * The general strategy is the following:
- * 1.) If there is ppc64,linux partition large enough then use it.
- * 2.) If there is not a ppc64,linux partition large enough, search
- * for a free partition that is large enough.
- * 3.) If there is not a free partition large enough remove
- * _all_ OS partitions and consolidate the space.
- * 4.) Will first try getting a chunk that will satisfy the maximum
- * error log size (NVRAM_MAX_REQ).
- * 5.) If the max chunk cannot be allocated then try finding a chunk
- * that will satisfy the minum needed (NVRAM_MIN_REQ).
+/**
+ * nvram_get_partition_size - Get the data size of an nvram partition
+ * @data_index: This is the offset of the start of the data of
+ * the partition. The same value that is returned by
+ * nvram_create_partition().
*/
-static int __init nvram_setup_partition(void)
+int nvram_get_partition_size(loff_t data_index)
{
- struct list_head * p;
- struct nvram_partition * part;
- int rc;
-
- /* For now, we don't do any of this on pmac, until I
- * have figured out if it's worth killing some unused stuffs
- * in our nvram, as Apple defined partitions use pretty much
- * all of the space
- */
- if (machine_is(powermac))
- return -ENOSPC;
-
- /* see if we have an OS partition that meets our needs.
- will try getting the max we need. If not we'll delete
- partitions and try again. */
- list_for_each(p, &nvram_part->partition) {
- part = list_entry(p, struct nvram_partition, partition);
- if (part->header.signature != NVRAM_SIG_OS)
- continue;
+ struct nvram_partition *part;
+
+ list_for_each_entry(part, &nvram_partitions, partition) {
+ if (part->index + NVRAM_HEADER_LEN == data_index)
+ return (part->header.length - 1) * NVRAM_BLOCK_LEN;
+ }
+ return -1;
+}
- if (strcmp(part->header.name, "ppc64,linux"))
- continue;
- if (part->header.length >= NVRAM_MIN_REQ) {
- /* found our partition */
- nvram_error_log_index = part->index + NVRAM_HEADER_LEN;
- nvram_error_log_size = ((part->header.length - 1) *
- NVRAM_BLOCK_LEN) - sizeof(struct err_log_info);
- return 0;
+/**
+ * nvram_find_partition - Find an nvram partition by signature and name
+ * @name: Name of the partition or NULL for any name
+ * @sig: Signature to test against
+ * @out_size: if non-NULL, returns the size of the data part of the partition
+ */
+loff_t nvram_find_partition(const char *name, int sig, int *out_size)
+{
+ struct nvram_partition *p;
+
+ list_for_each_entry(p, &nvram_partitions, partition) {
+ if (p->header.signature == sig &&
+ (!name || !strncmp(p->header.name, name, 12))) {
+ if (out_size)
+ *out_size = (p->header.length - 1) *
+ NVRAM_BLOCK_LEN;
+ return p->index + NVRAM_HEADER_LEN;
}
}
-
- /* try creating a partition with the free space we have */
- rc = nvram_create_os_partition();
- if (!rc) {
- return 0;
- }
-
- /* need to free up some space */
- rc = nvram_remove_os_partition();
- if (rc) {
- return rc;
- }
-
- /* create a partition in this new space */
- rc = nvram_create_os_partition();
- if (rc) {
- printk(KERN_ERR "nvram_create_os_partition: Could not find a "
- "NVRAM partition large enough\n");
- return rc;
- }
-
return 0;
}
-
-static int __init nvram_scan_partitions(void)
+int __init nvram_scan_partitions(void)
{
loff_t cur_index = 0;
struct nvram_header phead;
@@ -465,7 +486,7 @@ static int __init nvram_scan_partitions(void)
int total_size;
int err;
- if (ppc_md.nvram_size == NULL)
+ if (ppc_md.nvram_size == NULL || ppc_md.nvram_size() <= 0)
return -ENODEV;
total_size = ppc_md.nvram_size();
@@ -488,6 +509,8 @@ static int __init nvram_scan_partitions(void)
memcpy(&phead, header, NVRAM_HEADER_LEN);
+ phead.length = be16_to_cpu(phead.length);
+
err = 0;
c_sum = nvram_checksum(&phead);
if (c_sum != phead.checksum) {
@@ -502,8 +525,7 @@ static int __init nvram_scan_partitions(void)
"detected: 0-length partition\n");
goto out;
}
- tmp_part = (struct nvram_partition *)
- kmalloc(sizeof(struct nvram_partition), GFP_KERNEL);
+ tmp_part = kmalloc(sizeof(struct nvram_partition), GFP_KERNEL);
err = -ENOMEM;
if (!tmp_part) {
printk(KERN_ERR "nvram_scan_partitions: kmalloc failed\n");
@@ -512,12 +534,16 @@ static int __init nvram_scan_partitions(void)
memcpy(&tmp_part->header, &phead, NVRAM_HEADER_LEN);
tmp_part->index = cur_index;
- list_add_tail(&tmp_part->partition, &nvram_part->partition);
+ list_add_tail(&tmp_part->partition, &nvram_partitions);
cur_index += phead.length * NVRAM_BLOCK_LEN;
}
err = 0;
+#ifdef DEBUG_NVRAM
+ nvram_print_partitions("NVRAM Partitions");
+#endif
+
out:
kfree(header);
return err;
@@ -525,9 +551,10 @@ static int __init nvram_scan_partitions(void)
static int __init nvram_init(void)
{
- int error;
int rc;
+ BUILD_BUG_ON(NVRAM_BLOCK_LEN != 16);
+
if (ppc_md.nvram_size == NULL || ppc_md.nvram_size() <= 0)
return -ENODEV;
@@ -537,29 +564,6 @@ static int __init nvram_init(void)
return rc;
}
- /* initialize our anchor for the nvram partition list */
- nvram_part = kmalloc(sizeof(struct nvram_partition), GFP_KERNEL);
- if (!nvram_part) {
- printk(KERN_ERR "nvram_init: Failed kmalloc\n");
- return -ENOMEM;
- }
- INIT_LIST_HEAD(&nvram_part->partition);
-
- /* Get all the NVRAM partitions */
- error = nvram_scan_partitions();
- if (error) {
- printk(KERN_ERR "nvram_init: Failed nvram_scan_partitions\n");
- return error;
- }
-
- if(nvram_setup_partition())
- printk(KERN_WARNING "nvram_init: Could not find nvram partition"
- " for nvram buffered error logging.\n");
-
-#ifdef DEBUG_NVRAM
- nvram_print_partitions("NVRAM Partitions");
-#endif
-
return rc;
}
@@ -568,135 +572,6 @@ void __exit nvram_cleanup(void)
misc_deregister( &nvram_dev );
}
-
-#ifdef CONFIG_PPC_PSERIES
-
-/* nvram_write_error_log
- *
- * We need to buffer the error logs into nvram to ensure that we have
- * the failure information to decode. If we have a severe error there
- * is no way to guarantee that the OS or the machine is in a state to
- * get back to user land and write the error to disk. For example if
- * the SCSI device driver causes a Machine Check by writing to a bad
- * IO address, there is no way of guaranteeing that the device driver
- * is in any state that is would also be able to write the error data
- * captured to disk, thus we buffer it in NVRAM for analysis on the
- * next boot.
- *
- * In NVRAM the partition containing the error log buffer will looks like:
- * Header (in bytes):
- * +-----------+----------+--------+------------+------------------+
- * | signature | checksum | length | name | data |
- * |0 |1 |2 3|4 15|16 length-1|
- * +-----------+----------+--------+------------+------------------+
- *
- * The 'data' section would look like (in bytes):
- * +--------------+------------+-----------------------------------+
- * | event_logged | sequence # | error log |
- * |0 3|4 7|8 nvram_error_log_size-1|
- * +--------------+------------+-----------------------------------+
- *
- * event_logged: 0 if event has not been logged to syslog, 1 if it has
- * sequence #: The unique sequence # for each event. (until it wraps)
- * error log: The error log from event_scan
- */
-int nvram_write_error_log(char * buff, int length,
- unsigned int err_type, unsigned int error_log_cnt)
-{
- int rc;
- loff_t tmp_index;
- struct err_log_info info;
-
- if (nvram_error_log_index == -1) {
- return -ESPIPE;
- }
-
- if (length > nvram_error_log_size) {
- length = nvram_error_log_size;
- }
-
- info.error_type = err_type;
- info.seq_num = error_log_cnt;
-
- tmp_index = nvram_error_log_index;
-
- rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index);
- if (rc <= 0) {
- printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc);
- return rc;
- }
-
- rc = ppc_md.nvram_write(buff, length, &tmp_index);
- if (rc <= 0) {
- printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc);
- return rc;
- }
-
- return 0;
-}
-
-/* nvram_read_error_log
- *
- * Reads nvram for error log for at most 'length'
- */
-int nvram_read_error_log(char * buff, int length,
- unsigned int * err_type, unsigned int * error_log_cnt)
-{
- int rc;
- loff_t tmp_index;
- struct err_log_info info;
-
- if (nvram_error_log_index == -1)
- return -1;
-
- if (length > nvram_error_log_size)
- length = nvram_error_log_size;
-
- tmp_index = nvram_error_log_index;
-
- rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index);
- if (rc <= 0) {
- printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
- return rc;
- }
-
- rc = ppc_md.nvram_read(buff, length, &tmp_index);
- if (rc <= 0) {
- printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
- return rc;
- }
-
- *error_log_cnt = info.seq_num;
- *err_type = info.error_type;
-
- return 0;
-}
-
-/* This doesn't actually zero anything, but it sets the event_logged
- * word to tell that this event is safely in syslog.
- */
-int nvram_clear_error_log(void)
-{
- loff_t tmp_index;
- int clear_word = ERR_FLAG_ALREADY_LOGGED;
- int rc;
-
- if (nvram_error_log_index == -1)
- return -1;
-
- tmp_index = nvram_error_log_index;
-
- rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index);
- if (rc <= 0) {
- printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc);
- return rc;
- }
-
- return 0;
-}
-
-#endif /* CONFIG_PPC_PSERIES */
-
module_init(nvram_init);
module_exit(nvram_cleanup);
MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c
index b2c363ef38a..a7b74307672 100644
--- a/arch/powerpc/kernel/of_platform.c
+++ b/arch/powerpc/kernel/of_platform.c
@@ -15,18 +15,19 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/init.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mod_devicetable.h>
#include <linux/pci.h>
#include <linux/of.h>
#include <linux/of_device.h>
#include <linux/of_platform.h>
+#include <linux/atomic.h>
#include <asm/errno.h>
#include <asm/topology.h>
#include <asm/pci-bridge.h>
#include <asm/ppc-pci.h>
-#include <asm/atomic.h>
+#include <asm/eeh.h>
#ifdef CONFIG_PPC_OF_PLATFORM_PCI
@@ -36,8 +37,7 @@
* lacking some bits needed here.
*/
-static int __devinit of_pci_phb_probe(struct platform_device *dev,
- const struct of_device_id *match)
+static int of_pci_phb_probe(struct platform_device *dev)
{
struct pci_controller *phb;
@@ -67,31 +67,33 @@ static int __devinit of_pci_phb_probe(struct platform_device *dev,
/* Init pci_dn data structures */
pci_devs_phb_init_dynamic(phb);
+ /* Create EEH devices for the PHB */
+ eeh_dev_phb_init_dynamic(phb);
+
/* Register devices with EEH */
-#ifdef CONFIG_EEH
if (dev->dev.of_node->child)
eeh_add_device_tree_early(dev->dev.of_node);
-#endif /* CONFIG_EEH */
/* Scan the bus */
- pcibios_scan_phb(phb, dev->dev.of_node);
+ pcibios_scan_phb(phb);
if (phb->bus == NULL)
return -ENXIO;
/* Claim resources. This might need some rework as well depending
- * wether we are doing probe-only or not, like assigning unassigned
+ * whether we are doing probe-only or not, like assigning unassigned
* resources etc...
*/
pcibios_claim_one_bus(phb->bus);
/* Finish EEH setup */
-#ifdef CONFIG_EEH
eeh_add_device_tree_late(phb->bus);
-#endif
/* Add probed PCI devices to the device model */
pci_bus_add_devices(phb->bus);
+ /* sysfs files should only be added after devices are added */
+ eeh_add_sysfs_files(phb->bus);
+
return 0;
}
@@ -104,7 +106,7 @@ static struct of_device_id of_pci_phb_ids[] = {
{}
};
-static struct of_platform_driver of_pci_phb_driver = {
+static struct platform_driver of_pci_phb_driver = {
.probe = of_pci_phb_probe,
.driver = {
.name = "of-pci",
@@ -115,7 +117,7 @@ static struct of_platform_driver of_pci_phb_driver = {
static __init int of_pci_phb_init(void)
{
- return of_register_platform_driver(&of_pci_phb_driver);
+ return platform_driver_register(&of_pci_phb_driver);
}
device_initcall(of_pci_phb_init);
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index ebf9846f3c3..d6e195e8cd4 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -7,17 +7,14 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/threads.h>
-#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/export.h>
#include <linux/memblock.h>
-#include <asm/firmware.h>
#include <asm/lppaca.h>
#include <asm/paca.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
-#include <asm/iseries/lpar_map.h>
-#include <asm/iseries/hv_types.h>
#include <asm/kexec.h>
/* This symbol is provided by the linker - let it fill in the paca
@@ -27,37 +24,20 @@ extern unsigned long __toc_start;
#ifdef CONFIG_PPC_BOOK3S
/*
- * We only have to have statically allocated lppaca structs on
- * legacy iSeries, which supports at most 64 cpus.
- */
-#ifdef CONFIG_PPC_ISERIES
-#if NR_CPUS < 64
-#define NR_LPPACAS NR_CPUS
-#else
-#define NR_LPPACAS 64
-#endif
-#else /* not iSeries */
-#define NR_LPPACAS 1
-#endif
-
-/*
* The structure which the hypervisor knows about - this structure
* should not cross a page boundary. The vpa_init/register_vpa call
* is now known to fail if the lppaca structure crosses a page
- * boundary. The lppaca is also used on legacy iSeries and POWER5
- * pSeries boxes. The lppaca is 640 bytes long, and cannot readily
+ * boundary. The lppaca is also used on POWER5 pSeries boxes.
+ * The lppaca is 640 bytes long, and cannot readily
* change since the hypervisor knows its layout, so a 1kB alignment
* will suffice to ensure that it doesn't cross a page boundary.
*/
struct lppaca lppaca[] = {
[0 ... (NR_LPPACAS-1)] = {
- .desc = 0xd397d781, /* "LpPa" */
- .size = sizeof(struct lppaca),
- .dyn_proc_status = 2,
- .decr_val = 0x00ff0000,
+ .desc = cpu_to_be32(0xd397d781), /* "LpPa" */
+ .size = cpu_to_be16(sizeof(struct lppaca)),
.fpregs_in_use = 1,
- .end_of_quantum = 0xfffffffffffffffful,
- .slb_count = 64,
+ .slb_count = cpu_to_be16(64),
.vmxregs_in_use = 0,
.page_ins = 0,
},
@@ -66,7 +46,7 @@ struct lppaca lppaca[] = {
static struct lppaca *extra_lppacas;
static long __initdata lppaca_size;
-static void allocate_lppacas(int nr_cpus, unsigned long limit)
+static void __init allocate_lppacas(int nr_cpus, unsigned long limit)
{
if (nr_cpus <= NR_LPPACAS)
return;
@@ -77,7 +57,7 @@ static void allocate_lppacas(int nr_cpus, unsigned long limit)
PAGE_SIZE, limit));
}
-static struct lppaca *new_lppaca(int cpu)
+static struct lppaca * __init new_lppaca(int cpu)
{
struct lppaca *lp;
@@ -90,7 +70,7 @@ static struct lppaca *new_lppaca(int cpu)
return lp;
}
-static void free_lppacas(void)
+static void __init free_lppacas(void)
{
long new_size = 0, nr;
@@ -118,13 +98,32 @@ static inline void free_lppacas(void) { }
/*
* 3 persistent SLBs are registered here. The buffer will be zero
* initially, hence will all be invaild until we actually write them.
+ *
+ * If you make the number of persistent SLB entries dynamic, please also
+ * update PR KVM to flush and restore them accordingly.
*/
-struct slb_shadow slb_shadow[] __cacheline_aligned = {
- [0 ... (NR_CPUS-1)] = {
- .persistent = SLB_NUM_BOLTED,
- .buffer_length = sizeof(struct slb_shadow),
- },
-};
+static struct slb_shadow *slb_shadow;
+
+static void __init allocate_slb_shadows(int nr_cpus, int limit)
+{
+ int size = PAGE_ALIGN(sizeof(struct slb_shadow) * nr_cpus);
+ slb_shadow = __va(memblock_alloc_base(size, PAGE_SIZE, limit));
+ memset(slb_shadow, 0, size);
+}
+
+static struct slb_shadow * __init init_slb_shadow(int cpu)
+{
+ struct slb_shadow *s = &slb_shadow[cpu];
+
+ s->persistent = cpu_to_be32(SLB_NUM_BOLTED);
+ s->buffer_length = cpu_to_be32(sizeof(*s));
+
+ return s;
+}
+
+#else /* CONFIG_PPC_STD_MMU_64 */
+
+static void __init allocate_slb_shadows(int nr_cpus, int limit) { }
#endif /* CONFIG_PPC_STD_MMU_64 */
@@ -140,8 +139,6 @@ struct slb_shadow slb_shadow[] __cacheline_aligned = {
struct paca_struct *paca;
EXPORT_SYMBOL(paca);
-struct paca_struct boot_paca;
-
void __init initialise_paca(struct paca_struct *new_paca, int cpu)
{
/* The TOC register (GPR2) points 32kB into the TOC, so that 64kB
@@ -158,58 +155,70 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu)
new_paca->paca_index = cpu;
new_paca->kernel_toc = kernel_toc;
new_paca->kernelbase = (unsigned long) _stext;
- new_paca->kernel_msr = MSR_KERNEL;
+ /* Only set MSR:IR/DR when MMU is initialized */
+ new_paca->kernel_msr = MSR_KERNEL & ~(MSR_IR | MSR_DR);
new_paca->hw_cpu_id = 0xffff;
new_paca->kexec_state = KEXEC_STATE_NONE;
new_paca->__current = &init_task;
+ new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL;
#ifdef CONFIG_PPC_STD_MMU_64
- new_paca->slb_shadow_ptr = &slb_shadow[cpu];
+ new_paca->slb_shadow_ptr = init_slb_shadow(cpu);
#endif /* CONFIG_PPC_STD_MMU_64 */
+
+#ifdef CONFIG_PPC_BOOK3E
+ /* For now -- if we have threads this will be adjusted later */
+ new_paca->tcd_ptr = &new_paca->tcd;
+#endif
}
/* Put the paca pointer into r13 and SPRG_PACA */
void setup_paca(struct paca_struct *new_paca)
{
+ /* Setup r13 */
local_paca = new_paca;
- mtspr(SPRN_SPRG_PACA, local_paca);
+
#ifdef CONFIG_PPC_BOOK3E
+ /* On Book3E, initialize the TLB miss exception frames */
mtspr(SPRN_SPRG_TLB_EXFRAME, local_paca->extlb);
+#else
+ /* In HV mode, we setup both HPACA and PACA to avoid problems
+ * if we do a GET_PACA() before the feature fixups have been
+ * applied
+ */
+ if (cpu_has_feature(CPU_FTR_HVMODE))
+ mtspr(SPRN_SPRG_HPACA, local_paca);
#endif
+ mtspr(SPRN_SPRG_PACA, local_paca);
+
}
static int __initdata paca_size;
void __init allocate_pacas(void)
{
- int nr_cpus, cpu, limit;
+ int cpu, limit;
/*
* We can't take SLB misses on the paca, and we want to access them
* in real mode, so allocate them within the RMA and also within
- * the first segment. On iSeries they must be within the area mapped
- * by the HV, which is HvPagesToMap * HVPAGESIZE bytes.
+ * the first segment.
*/
limit = min(0x10000000ULL, ppc64_rma_size);
- if (firmware_has_feature(FW_FEATURE_ISERIES))
- limit = min(limit, HvPagesToMap * HVPAGESIZE);
-
- nr_cpus = NR_CPUS;
- /* On iSeries we know we can never have more than 64 cpus */
- if (firmware_has_feature(FW_FEATURE_ISERIES))
- nr_cpus = min(64, nr_cpus);
- paca_size = PAGE_ALIGN(sizeof(struct paca_struct) * nr_cpus);
+ paca_size = PAGE_ALIGN(sizeof(struct paca_struct) * nr_cpu_ids);
paca = __va(memblock_alloc_base(paca_size, PAGE_SIZE, limit));
memset(paca, 0, paca_size);
printk(KERN_DEBUG "Allocated %u bytes for %d pacas at %p\n",
- paca_size, nr_cpus, paca);
+ paca_size, nr_cpu_ids, paca);
+
+ allocate_lppacas(nr_cpu_ids, limit);
- allocate_lppacas(nr_cpus, limit);
+ allocate_slb_shadows(nr_cpu_ids, limit);
/* Can't use for_each_*_cpu, as they aren't functional yet */
- for (cpu = 0; cpu < nr_cpus; cpu++)
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++)
initialise_paca(&paca[cpu], cpu);
}
@@ -217,7 +226,7 @@ void __init free_unused_pacas(void)
{
int new_size;
- new_size = PAGE_ALIGN(sizeof(struct paca_struct) * num_possible_cpus());
+ new_size = PAGE_ALIGN(sizeof(struct paca_struct) * nr_cpu_ids);
if (new_size >= paca_size)
return;
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 10a44e68ef1..b49c72fd7f1 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -21,13 +21,17 @@
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
+#include <linux/delay.h>
+#include <linux/export.h>
#include <linux/of_address.h>
+#include <linux/of_pci.h>
#include <linux/mm.h>
#include <linux/list.h>
#include <linux/syscalls.h>
#include <linux/irq.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
+#include <linux/vgaarb.h>
#include <asm/processor.h>
#include <asm/io.h>
@@ -36,7 +40,6 @@
#include <asm/byteorder.h>
#include <asm/machdep.h>
#include <asm/ppc-pci.h>
-#include <asm/firmware.h>
#include <asm/eeh.h>
static DEFINE_SPINLOCK(hose_spinlock);
@@ -48,9 +51,6 @@ static int global_phb_number; /* Global phb counter */
/* ISA Memory physical address */
resource_size_t isa_mem_base;
-/* Default PCI flags is 0 on ppc32, modified at boot on ppc64 */
-unsigned int ppc_pci_flags = 0;
-
static struct dma_map_ops *pci_dma_ops = &dma_direct_ops;
@@ -101,12 +101,51 @@ void pcibios_free_controller(struct pci_controller *phb)
kfree(phb);
}
+/*
+ * The function is used to return the minimal alignment
+ * for memory or I/O windows of the associated P2P bridge.
+ * By default, 4KiB alignment for I/O windows and 1MiB for
+ * memory windows.
+ */
+resource_size_t pcibios_window_alignment(struct pci_bus *bus,
+ unsigned long type)
+{
+ if (ppc_md.pcibios_window_alignment)
+ return ppc_md.pcibios_window_alignment(bus, type);
+
+ /*
+ * PCI core will figure out the default
+ * alignment: 4KiB for I/O and 1MiB for
+ * memory window.
+ */
+ return 1;
+}
+
+void pcibios_reset_secondary_bus(struct pci_dev *dev)
+{
+ u16 ctrl;
+
+ if (ppc_md.pcibios_reset_secondary_bus) {
+ ppc_md.pcibios_reset_secondary_bus(dev);
+ return;
+ }
+
+ pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl);
+ ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
+ pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
+ msleep(2);
+
+ ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
+ pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
+ ssleep(1);
+}
+
static resource_size_t pcibios_io_size(const struct pci_controller *hose)
{
#ifdef CONFIG_PPC64
return hose->pci_io_size;
#else
- return hose->io_resource.end - hose->io_resource.start + 1;
+ return resource_size(&hose->io_resource);
#endif
}
@@ -182,62 +221,23 @@ struct pci_controller* pci_find_hose_for_OF_device(struct device_node* node)
return NULL;
}
-static ssize_t pci_show_devspec(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pci_dev *pdev;
- struct device_node *np;
-
- pdev = to_pci_dev (dev);
- np = pci_device_to_OF_node(pdev);
- if (np == NULL || np->full_name == NULL)
- return 0;
- return sprintf(buf, "%s", np->full_name);
-}
-static DEVICE_ATTR(devspec, S_IRUGO, pci_show_devspec, NULL);
-
-/* Add sysfs properties */
-int pcibios_add_platform_entries(struct pci_dev *pdev)
-{
- return device_create_file(&pdev->dev, &dev_attr_devspec);
-}
-
-char __devinit *pcibios_setup(char *str)
-{
- return str;
-}
-
/*
* Reads the interrupt pin to determine if interrupt is use by card.
* If the interrupt is used, then gets the interrupt line from the
* openfirmware and sets it in the pci_dev and pci_config line.
*/
-int pci_read_irq_line(struct pci_dev *pci_dev)
+static int pci_read_irq_line(struct pci_dev *pci_dev)
{
- struct of_irq oirq;
+ struct of_phandle_args oirq;
unsigned int virq;
- /* The current device-tree that iSeries generates from the HV
- * PCI informations doesn't contain proper interrupt routing,
- * and all the fallback would do is print out crap, so we
- * don't attempt to resolve the interrupts here at all, some
- * iSeries specific fixup does it.
- *
- * In the long run, we will hopefully fix the generated device-tree
- * instead.
- */
-#ifdef CONFIG_PPC_ISERIES
- if (firmware_has_feature(FW_FEATURE_ISERIES))
- return -1;
-#endif
-
pr_debug("PCI: Try to map irq for %s...\n", pci_name(pci_dev));
#ifdef DEBUG
memset(&oirq, 0xff, sizeof(oirq));
#endif
/* Try to get a mapping from the device-tree */
- if (of_irq_map_pci(pci_dev, &oirq)) {
+ if (of_irq_parse_pci(pci_dev, &oirq)) {
u8 line, pin;
/* If that fails, lets fallback to what is in the config
@@ -260,15 +260,13 @@ int pci_read_irq_line(struct pci_dev *pci_dev)
virq = irq_create_mapping(NULL, line);
if (virq != NO_IRQ)
- set_irq_type(virq, IRQ_TYPE_LEVEL_LOW);
+ irq_set_irq_type(virq, IRQ_TYPE_LEVEL_LOW);
} else {
pr_debug(" Got one, spec %d cells (0x%08x 0x%08x...) on %s\n",
- oirq.size, oirq.specifier[0], oirq.specifier[1],
- oirq.controller ? oirq.controller->full_name :
- "<default>");
+ oirq.args_count, oirq.args[0], oirq.args[1],
+ of_node_full_name(oirq.np));
- virq = irq_create_of_mapping(oirq.controller, oirq.specifier,
- oirq.size);
+ virq = irq_create_of_mapping(&oirq);
}
if(virq == NO_IRQ) {
pr_debug(" Failed to map !\n");
@@ -281,7 +279,6 @@ int pci_read_irq_line(struct pci_dev *pci_dev)
return 0;
}
-EXPORT_SYMBOL(pci_read_irq_line);
/*
* Platform support for /proc/bus/pci/X/Y mmap()s,
@@ -308,7 +305,7 @@ static struct resource *__pci_mmap_make_offset(struct pci_dev *dev,
unsigned long io_offset = 0;
int i, res_bit;
- if (hose == 0)
+ if (hose == NULL)
return NULL; /* should never happen */
/* If memory, add on the PCI bridge address offset */
@@ -361,7 +358,6 @@ static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp,
enum pci_mmap_state mmap_state,
int write_combine)
{
- unsigned long prot = pgprot_val(protection);
/* Write combine is always 0 on non-memory space mappings. On
* memory space, if the user didn't pass 1, we check for a
@@ -378,9 +374,9 @@ static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp,
/* XXX would be nice to have a way to ask for write-through */
if (write_combine)
- return pgprot_noncached_wc(prot);
+ return pgprot_noncached_wc(protection);
else
- return pgprot_noncached(prot);
+ return pgprot_noncached(protection);
}
/*
@@ -660,15 +656,6 @@ void pci_resource_to_user(const struct pci_dev *dev, int bar,
* ranges. However, some machines (thanks Apple !) tend to split their
* space into lots of small contiguous ranges. So we have to coalesce.
*
- * - We can only cope with all memory ranges having the same offset
- * between CPU addresses and PCI addresses. Unfortunately, some bridges
- * are setup for a large 1:1 mapping along with a small "window" which
- * maps PCI address 0 to some arbitrary high address of the CPU space in
- * order to give access to the ISA memory hole.
- * The way out of here that I've chosen for now is to always set the
- * offset based on the first resource found, then override it if we
- * have a different offset and the previous was set by an ISA hole.
- *
* - Some busses have IO space not starting at 0, which causes trouble with
* the way we do our IO resource renumbering. The code somewhat deals with
* it for 64 bits but I would expect problems on 32 bits.
@@ -676,65 +663,39 @@ void pci_resource_to_user(const struct pci_dev *dev, int bar,
* - Some 32 bits platforms such as 4xx can have physical space larger than
* 32 bits so we need to use 64 bits values for the parsing
*/
-void __devinit pci_process_bridge_OF_ranges(struct pci_controller *hose,
- struct device_node *dev,
- int primary)
+void pci_process_bridge_OF_ranges(struct pci_controller *hose,
+ struct device_node *dev, int primary)
{
- const u32 *ranges;
- int rlen;
- int pna = of_n_addr_cells(dev);
- int np = pna + 5;
- int memno = 0, isa_hole = -1;
- u32 pci_space;
- unsigned long long pci_addr, cpu_addr, pci_next, cpu_next, size;
- unsigned long long isa_mb = 0;
+ int memno = 0;
struct resource *res;
+ struct of_pci_range range;
+ struct of_pci_range_parser parser;
printk(KERN_INFO "PCI host bridge %s %s ranges:\n",
dev->full_name, primary ? "(primary)" : "");
- /* Get ranges property */
- ranges = of_get_property(dev, "ranges", &rlen);
- if (ranges == NULL)
+ /* Check for ranges property */
+ if (of_pci_range_parser_init(&parser, dev))
return;
/* Parse it */
- while ((rlen -= np * 4) >= 0) {
- /* Read next ranges element */
- pci_space = ranges[0];
- pci_addr = of_read_number(ranges + 1, 2);
- cpu_addr = of_translate_address(dev, ranges + 3);
- size = of_read_number(ranges + pna + 3, 2);
- ranges += np;
-
+ for_each_of_pci_range(&parser, &range) {
/* If we failed translation or got a zero-sized region
* (some FW try to feed us with non sensical zero sized regions
* such as power3 which look like some kind of attempt at exposing
* the VGA memory hole)
*/
- if (cpu_addr == OF_BAD_ADDR || size == 0)
+ if (range.cpu_addr == OF_BAD_ADDR || range.size == 0)
continue;
- /* Now consume following elements while they are contiguous */
- for (; rlen >= np * sizeof(u32);
- ranges += np, rlen -= np * 4) {
- if (ranges[0] != pci_space)
- break;
- pci_next = of_read_number(ranges + 1, 2);
- cpu_next = of_translate_address(dev, ranges + 3);
- if (pci_next != pci_addr + size ||
- cpu_next != cpu_addr + size)
- break;
- size += of_read_number(ranges + pna + 3, 2);
- }
-
/* Act based on address space type */
res = NULL;
- switch ((pci_space >> 24) & 0x3) {
- case 1: /* PCI IO space */
+ switch (range.flags & IORESOURCE_TYPE_BITS) {
+ case IORESOURCE_IO:
printk(KERN_INFO
" IO 0x%016llx..0x%016llx -> 0x%016llx\n",
- cpu_addr, cpu_addr + size - 1, pci_addr);
+ range.cpu_addr, range.cpu_addr + range.size - 1,
+ range.pci_addr);
/* We support only one IO range */
if (hose->pci_io_size) {
@@ -744,11 +705,12 @@ void __devinit pci_process_bridge_OF_ranges(struct pci_controller *hose,
}
#ifdef CONFIG_PPC32
/* On 32 bits, limit I/O space to 16MB */
- if (size > 0x01000000)
- size = 0x01000000;
+ if (range.size > 0x01000000)
+ range.size = 0x01000000;
/* 32 bits needs to map IOs here */
- hose->io_base_virt = ioremap(cpu_addr, size);
+ hose->io_base_virt = ioremap(range.cpu_addr,
+ range.size);
/* Expect trouble if pci_addr is not 0 */
if (primary)
@@ -758,20 +720,20 @@ void __devinit pci_process_bridge_OF_ranges(struct pci_controller *hose,
/* pci_io_size and io_base_phys always represent IO
* space starting at 0 so we factor in pci_addr
*/
- hose->pci_io_size = pci_addr + size;
- hose->io_base_phys = cpu_addr - pci_addr;
+ hose->pci_io_size = range.pci_addr + range.size;
+ hose->io_base_phys = range.cpu_addr - range.pci_addr;
/* Build resource */
res = &hose->io_resource;
- res->flags = IORESOURCE_IO;
- res->start = pci_addr;
+ range.cpu_addr = range.pci_addr;
break;
- case 2: /* PCI Memory space */
- case 3: /* PCI 64 bits Memory space */
+ case IORESOURCE_MEM:
printk(KERN_INFO
" MEM 0x%016llx..0x%016llx -> 0x%016llx %s\n",
- cpu_addr, cpu_addr + size - 1, pci_addr,
- (pci_space & 0x40000000) ? "Prefetch" : "");
+ range.cpu_addr, range.cpu_addr + range.size - 1,
+ range.pci_addr,
+ (range.pci_space & 0x40000000) ?
+ "Prefetch" : "");
/* We support only 3 memory ranges */
if (memno >= 3) {
@@ -780,60 +742,23 @@ void __devinit pci_process_bridge_OF_ranges(struct pci_controller *hose,
continue;
}
/* Handles ISA memory hole space here */
- if (pci_addr == 0) {
- isa_mb = cpu_addr;
- isa_hole = memno;
+ if (range.pci_addr == 0) {
if (primary || isa_mem_base == 0)
- isa_mem_base = cpu_addr;
- hose->isa_mem_phys = cpu_addr;
- hose->isa_mem_size = size;
- }
-
- /* We get the PCI/Mem offset from the first range or
- * the, current one if the offset came from an ISA
- * hole. If they don't match, bugger.
- */
- if (memno == 0 ||
- (isa_hole >= 0 && pci_addr != 0 &&
- hose->pci_mem_offset == isa_mb))
- hose->pci_mem_offset = cpu_addr - pci_addr;
- else if (pci_addr != 0 &&
- hose->pci_mem_offset != cpu_addr - pci_addr) {
- printk(KERN_INFO
- " \\--> Skipped (offset mismatch) !\n");
- continue;
+ isa_mem_base = range.cpu_addr;
+ hose->isa_mem_phys = range.cpu_addr;
+ hose->isa_mem_size = range.size;
}
/* Build resource */
+ hose->mem_offset[memno] = range.cpu_addr -
+ range.pci_addr;
res = &hose->mem_resources[memno++];
- res->flags = IORESOURCE_MEM;
- if (pci_space & 0x40000000)
- res->flags |= IORESOURCE_PREFETCH;
- res->start = cpu_addr;
break;
}
if (res != NULL) {
- res->name = dev->full_name;
- res->end = res->start + size - 1;
- res->parent = NULL;
- res->sibling = NULL;
- res->child = NULL;
+ of_pci_range_to_resource(&range, dev, res);
}
}
-
- /* If there's an ISA hole and the pci_mem_offset is -not- matching
- * the ISA hole offset, then we need to remove the ISA hole from
- * the resource list for that brige
- */
- if (isa_hole >= 0 && hose->pci_mem_offset != isa_mb) {
- unsigned int next = isa_hole + 1;
- printk(KERN_INFO " Removing ISA hole at 0x%016llx\n", isa_mb);
- if (next < memno)
- memmove(&hose->mem_resources[isa_hole],
- &hose->mem_resources[next],
- sizeof(struct resource) * (memno - next));
- hose->mem_resources[--memno].flags = 0;
- }
}
/* Decide whether to display the domain number in /proc */
@@ -841,71 +766,25 @@ int pci_proc_domain(struct pci_bus *bus)
{
struct pci_controller *hose = pci_bus_to_host(bus);
- if (!(ppc_pci_flags & PPC_PCI_ENABLE_PROC_DOMAINS))
+ if (!pci_has_flag(PCI_ENABLE_PROC_DOMAINS))
return 0;
- if (ppc_pci_flags & PPC_PCI_COMPAT_DOMAIN_0)
+ if (pci_has_flag(PCI_COMPAT_DOMAIN_0))
return hose->global_number != 0;
return 1;
}
-void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
- struct resource *res)
-{
- resource_size_t offset = 0, mask = (resource_size_t)-1;
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
-
- if (!hose)
- return;
- if (res->flags & IORESOURCE_IO) {
- offset = (unsigned long)hose->io_base_virt - _IO_BASE;
- mask = 0xffffffffu;
- } else if (res->flags & IORESOURCE_MEM)
- offset = hose->pci_mem_offset;
-
- region->start = (res->start - offset) & mask;
- region->end = (res->end - offset) & mask;
-}
-EXPORT_SYMBOL(pcibios_resource_to_bus);
-
-void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
- struct pci_bus_region *region)
+int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
{
- resource_size_t offset = 0, mask = (resource_size_t)-1;
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ if (ppc_md.pcibios_root_bridge_prepare)
+ return ppc_md.pcibios_root_bridge_prepare(bridge);
- if (!hose)
- return;
- if (res->flags & IORESOURCE_IO) {
- offset = (unsigned long)hose->io_base_virt - _IO_BASE;
- mask = 0xffffffffu;
- } else if (res->flags & IORESOURCE_MEM)
- offset = hose->pci_mem_offset;
- res->start = (region->start + offset) & mask;
- res->end = (region->end + offset) & mask;
-}
-EXPORT_SYMBOL(pcibios_bus_to_resource);
-
-/* Fixup a bus resource into a linux resource */
-static void __devinit fixup_resource(struct resource *res, struct pci_dev *dev)
-{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- resource_size_t offset = 0, mask = (resource_size_t)-1;
-
- if (res->flags & IORESOURCE_IO) {
- offset = (unsigned long)hose->io_base_virt - _IO_BASE;
- mask = 0xffffffffu;
- } else if (res->flags & IORESOURCE_MEM)
- offset = hose->pci_mem_offset;
-
- res->start = (res->start + offset) & mask;
- res->end = (res->end + offset) & mask;
+ return 0;
}
-
/* This header fixup will do the resource fixup for all devices as they are
* probed, but not for bridge ranges
*/
-static void __devinit pcibios_fixup_resources(struct pci_dev *dev)
+static void pcibios_fixup_resources(struct pci_dev *dev)
{
struct pci_controller *hose = pci_bus_to_host(dev->bus);
int i;
@@ -917,38 +796,37 @@ static void __devinit pcibios_fixup_resources(struct pci_dev *dev)
}
for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
struct resource *res = dev->resource + i;
+ struct pci_bus_region reg;
if (!res->flags)
continue;
- /* On platforms that have PPC_PCI_PROBE_ONLY set, we don't
- * consider 0 as an unassigned BAR value. It's technically
- * a valid value, but linux doesn't like it... so when we can
- * re-assign things, we do so, but if we can't, we keep it
- * around and hope for the best...
+
+ /* If we're going to re-assign everything, we mark all resources
+ * as unset (and 0-base them). In addition, we mark BARs starting
+ * at 0 as unset as well, except if PCI_PROBE_ONLY is also set
+ * since in that case, we don't want to re-assign anything
*/
- if (res->start == 0 && !(ppc_pci_flags & PPC_PCI_PROBE_ONLY)) {
- pr_debug("PCI:%s Resource %d %016llx-%016llx [%x] is unassigned\n",
- pci_name(dev), i,
- (unsigned long long)res->start,
- (unsigned long long)res->end,
- (unsigned int)res->flags);
+ pcibios_resource_to_bus(dev->bus, &reg, res);
+ if (pci_has_flag(PCI_REASSIGN_ALL_RSRC) ||
+ (reg.start == 0 && !pci_has_flag(PCI_PROBE_ONLY))) {
+ /* Only print message if not re-assigning */
+ if (!pci_has_flag(PCI_REASSIGN_ALL_RSRC))
+ pr_debug("PCI:%s Resource %d %016llx-%016llx [%x] "
+ "is unassigned\n",
+ pci_name(dev), i,
+ (unsigned long long)res->start,
+ (unsigned long long)res->end,
+ (unsigned int)res->flags);
res->end -= res->start;
res->start = 0;
res->flags |= IORESOURCE_UNSET;
continue;
}
- pr_debug("PCI:%s Resource %d %016llx-%016llx [%x] fixup...\n",
+ pr_debug("PCI:%s Resource %d %016llx-%016llx [%x]\n",
pci_name(dev), i,
(unsigned long long)res->start,\
(unsigned long long)res->end,
(unsigned int)res->flags);
-
- fixup_resource(res, dev);
-
- pr_debug("PCI:%s %016llx-%016llx\n",
- pci_name(dev),
- (unsigned long long)res->start,
- (unsigned long long)res->end);
}
/* Call machine specific resource fixup */
@@ -962,25 +840,26 @@ DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_fixup_resources);
* things go more smoothly when it gets it right. It should covers cases such
* as Apple "closed" bridge resources and bare-metal pSeries unassigned bridges
*/
-static int __devinit pcibios_uninitialized_bridge_resource(struct pci_bus *bus,
- struct resource *res)
+static int pcibios_uninitialized_bridge_resource(struct pci_bus *bus,
+ struct resource *res)
{
struct pci_controller *hose = pci_bus_to_host(bus);
struct pci_dev *dev = bus->self;
resource_size_t offset;
+ struct pci_bus_region region;
u16 command;
int i;
/* We don't do anything if PCI_PROBE_ONLY is set */
- if (ppc_pci_flags & PPC_PCI_PROBE_ONLY)
+ if (pci_has_flag(PCI_PROBE_ONLY))
return 0;
/* Job is a bit different between memory and IO */
if (res->flags & IORESOURCE_MEM) {
- /* If the BAR is non-0 (res != pci_mem_offset) then it's probably been
- * initialized by somebody
- */
- if (res->start != hose->pci_mem_offset)
+ pcibios_resource_to_bus(dev->bus, &region, res);
+
+ /* If the BAR is non-0 then it's probably been initialized */
+ if (region.start != 0)
return 0;
/* The BAR is 0, let's check if memory decoding is enabled on
@@ -992,11 +871,11 @@ static int __devinit pcibios_uninitialized_bridge_resource(struct pci_bus *bus,
/* Memory decoding is enabled and the BAR is 0. If any of the bridge
* resources covers that starting address (0 then it's good enough for
- * us for memory
+ * us for memory space)
*/
for (i = 0; i < 3; i++) {
if ((hose->mem_resources[i].flags & IORESOURCE_MEM) &&
- hose->mem_resources[i].start == hose->pci_mem_offset)
+ hose->mem_resources[i].start == hose->mem_offset[i])
return 0;
}
@@ -1027,7 +906,7 @@ static int __devinit pcibios_uninitialized_bridge_resource(struct pci_bus *bus,
}
/* Fixup resources of a PCI<->PCI bridge */
-static void __devinit pcibios_fixup_bridge(struct pci_bus *bus)
+static void pcibios_fixup_bridge(struct pci_bus *bus)
{
struct resource *res;
int i;
@@ -1040,32 +919,34 @@ static void __devinit pcibios_fixup_bridge(struct pci_bus *bus)
if (i >= 3 && bus->self->transparent)
continue;
- pr_debug("PCI:%s Bus rsrc %d %016llx-%016llx [%x] fixup...\n",
+ /* If we're going to reassign everything, we can
+ * shrink the P2P resource to have size as being
+ * of 0 in order to save space.
+ */
+ if (pci_has_flag(PCI_REASSIGN_ALL_RSRC)) {
+ res->flags |= IORESOURCE_UNSET;
+ res->start = 0;
+ res->end = -1;
+ continue;
+ }
+
+ pr_debug("PCI:%s Bus rsrc %d %016llx-%016llx [%x]\n",
pci_name(dev), i,
(unsigned long long)res->start,\
(unsigned long long)res->end,
(unsigned int)res->flags);
- /* Perform fixup */
- fixup_resource(res, dev);
-
/* Try to detect uninitialized P2P bridge resources,
* and clear them out so they get re-assigned later
*/
if (pcibios_uninitialized_bridge_resource(bus, res)) {
res->flags = 0;
pr_debug("PCI:%s (unassigned)\n", pci_name(dev));
- } else {
-
- pr_debug("PCI:%s %016llx-%016llx\n",
- pci_name(dev),
- (unsigned long long)res->start,
- (unsigned long long)res->end);
}
}
}
-void __devinit pcibios_setup_bus_self(struct pci_bus *bus)
+void pcibios_setup_bus_self(struct pci_bus *bus)
{
/* Fix up the bus resources for P2P bridges */
if (bus->self != NULL)
@@ -1082,7 +963,39 @@ void __devinit pcibios_setup_bus_self(struct pci_bus *bus)
ppc_md.pci_dma_bus_setup(bus);
}
-void __devinit pcibios_setup_bus_devices(struct pci_bus *bus)
+static void pcibios_setup_device(struct pci_dev *dev)
+{
+ /* Fixup NUMA node as it may not be setup yet by the generic
+ * code and is needed by the DMA init
+ */
+ set_dev_node(&dev->dev, pcibus_to_node(dev->bus));
+
+ /* Hook up default DMA ops */
+ set_dma_ops(&dev->dev, pci_dma_ops);
+ set_dma_offset(&dev->dev, PCI_DRAM_OFFSET);
+
+ /* Additional platform DMA/iommu setup */
+ if (ppc_md.pci_dma_dev_setup)
+ ppc_md.pci_dma_dev_setup(dev);
+
+ /* Read default IRQs and fixup if necessary */
+ pci_read_irq_line(dev);
+ if (ppc_md.pci_irq_fixup)
+ ppc_md.pci_irq_fixup(dev);
+}
+
+int pcibios_add_device(struct pci_dev *dev)
+{
+ /*
+ * We can only call pcibios_setup_device() after bus setup is complete,
+ * since some of the platform specific DMA setup code depends on it.
+ */
+ if (dev->bus->is_added)
+ pcibios_setup_device(dev);
+ return 0;
+}
+
+void pcibios_setup_bus_devices(struct pci_bus *bus)
{
struct pci_dev *dev;
@@ -1096,37 +1009,22 @@ void __devinit pcibios_setup_bus_devices(struct pci_bus *bus)
if (dev->is_added)
continue;
- /* Setup OF node pointer in the device */
- dev->dev.of_node = pci_device_to_OF_node(dev);
-
- /* Fixup NUMA node as it may not be setup yet by the generic
- * code and is needed by the DMA init
- */
- set_dev_node(&dev->dev, pcibus_to_node(dev->bus));
-
- /* Hook up default DMA ops */
- set_dma_ops(&dev->dev, pci_dma_ops);
- set_dma_offset(&dev->dev, PCI_DRAM_OFFSET);
-
- /* Additional platform DMA/iommu setup */
- if (ppc_md.pci_dma_dev_setup)
- ppc_md.pci_dma_dev_setup(dev);
-
- /* Read default IRQs and fixup if necessary */
- pci_read_irq_line(dev);
- if (ppc_md.pci_irq_fixup)
- ppc_md.pci_irq_fixup(dev);
+ pcibios_setup_device(dev);
}
}
-void __devinit pcibios_fixup_bus(struct pci_bus *bus)
+void pcibios_set_master(struct pci_dev *dev)
+{
+ /* No special bus mastering setup handling */
+}
+
+void pcibios_fixup_bus(struct pci_bus *bus)
{
/* When called from the generic PCI probe, read PCI<->PCI bridge
* bases. This is -not- called when generating the PCI tree from
* the OF device-tree.
*/
- if (bus->self != NULL)
- pci_read_bridge_bases(bus);
+ pci_read_bridge_bases(bus);
/* Now fixup the bus bus */
pcibios_setup_bus_self(bus);
@@ -1136,7 +1034,7 @@ void __devinit pcibios_fixup_bus(struct pci_bus *bus)
}
EXPORT_SYMBOL(pcibios_fixup_bus);
-void __devinit pci_fixup_cardbus(struct pci_bus *bus)
+void pci_fixup_cardbus(struct pci_bus *bus)
{
/* Now fixup devices on that bus */
pcibios_setup_bus_devices(bus);
@@ -1145,7 +1043,7 @@ void __devinit pci_fixup_cardbus(struct pci_bus *bus)
static int skip_isa_ioresource_align(struct pci_dev *dev)
{
- if ((ppc_pci_flags & PPC_PCI_CAN_SKIP_ISA_ALIGN) &&
+ if (pci_has_flag(PCI_CAN_SKIP_ISA_ALIGN) &&
!(dev->bus->bridge_ctl & PCI_BRIDGE_CTL_ISA))
return 1;
return 0;
@@ -1263,18 +1161,15 @@ void pcibios_allocate_bus_resources(struct pci_bus *bus)
pci_bus_for_each_resource(bus, res, i) {
if (!res || !res->flags || res->start > res->end || res->parent)
continue;
+
+ /* If the resource was left unset at this point, we clear it */
+ if (res->flags & IORESOURCE_UNSET)
+ goto clear_resource;
+
if (bus->parent == NULL)
pr = (res->flags & IORESOURCE_IO) ?
&ioport_resource : &iomem_resource;
else {
- /* Don't bother with non-root busses when
- * re-assigning all resources. We clear the
- * resource flags as if they were colliding
- * and as such ensure proper re-allocation
- * later.
- */
- if (ppc_pci_flags & PPC_PCI_REASSIGN_ALL_RSRC)
- goto clear_resource;
pr = pci_find_parent_resource(bus->self, res);
if (pr == res) {
/* this happens when the generic PCI
@@ -1305,10 +1200,17 @@ void pcibios_allocate_bus_resources(struct pci_bus *bus)
if (reparent_resources(pr, res) == 0)
continue;
}
- printk(KERN_WARNING "PCI: Cannot allocate resource region "
- "%d of PCI bridge %d, will remap\n", i, bus->number);
-clear_resource:
- res->start = res->end = 0;
+ pr_warning("PCI: Cannot allocate resource region "
+ "%d of PCI bridge %d, will remap\n", i, bus->number);
+ clear_resource:
+ /* The resource might be figured out when doing
+ * reassignment based on the resources required
+ * by the downstream PCI devices. Here we set
+ * the size of the resource to be 0 in order to
+ * save more space.
+ */
+ res->start = 0;
+ res->end = -1;
res->flags = 0;
}
@@ -1316,7 +1218,7 @@ clear_resource:
pcibios_allocate_bus_resources(b);
}
-static inline void __devinit alloc_resource(struct pci_dev *dev, int idx)
+static inline void alloc_resource(struct pci_dev *dev, int idx)
{
struct resource *pr, *r = &dev->resource[idx];
@@ -1420,10 +1322,9 @@ static void __init pcibios_reserve_legacy_regions(struct pci_bus *bus)
no_io:
/* Check for memory */
- offset = hose->pci_mem_offset;
- pr_debug("hose mem offset: %016llx\n", (unsigned long long)offset);
for (i = 0; i < 3; i++) {
pres = &hose->mem_resources[i];
+ offset = hose->mem_offset[i];
if (!(pres->flags & IORESOURCE_MEM))
continue;
pr_debug("hose mem res: %pR\n", pres);
@@ -1452,22 +1353,17 @@ void __init pcibios_resource_survey(void)
{
struct pci_bus *b;
- /* Allocate and assign resources. If we re-assign everything, then
- * we skip the allocate phase
- */
+ /* Allocate and assign resources */
list_for_each_entry(b, &pci_root_buses, node)
pcibios_allocate_bus_resources(b);
-
- if (!(ppc_pci_flags & PPC_PCI_REASSIGN_ALL_RSRC)) {
- pcibios_allocate_resources(0);
- pcibios_allocate_resources(1);
- }
+ pcibios_allocate_resources(0);
+ pcibios_allocate_resources(1);
/* Before we start assigning unassigned resource, we try to reserve
* the low IO area and the VGA memory area if they intersect the
* bus available resources to avoid allocating things on top of them
*/
- if (!(ppc_pci_flags & PPC_PCI_PROBE_ONLY)) {
+ if (!pci_has_flag(PCI_PROBE_ONLY)) {
list_for_each_entry(b, &pci_root_buses, node)
pcibios_reserve_legacy_regions(b);
}
@@ -1475,7 +1371,7 @@ void __init pcibios_resource_survey(void)
/* Now, if the platform didn't decide to blindly trust the firmware,
* we proceed to assigning things that were left unassigned
*/
- if (!(ppc_pci_flags & PPC_PCI_PROBE_ONLY)) {
+ if (!pci_has_flag(PCI_PROBE_ONLY)) {
pr_debug("PCI: Assigning unassigned resources...\n");
pci_assign_unassigned_resources();
}
@@ -1485,8 +1381,6 @@ void __init pcibios_resource_survey(void)
ppc_md.pcibios_fixup();
}
-#ifdef CONFIG_HOTPLUG
-
/* This is used by the PCI hotplug driver to allocate resource
* of newly plugged busses. We can try to consolidate with the
* rest of the code later, for now, keep it as-is as our main
@@ -1536,17 +1430,20 @@ void pcibios_finish_adding_to_bus(struct pci_bus *bus)
/* Allocate bus and devices resources */
pcibios_allocate_bus_resources(bus);
pcibios_claim_one_bus(bus);
+ if (!pci_has_flag(PCI_PROBE_ONLY))
+ pci_assign_unassigned_bus_resources(bus);
+
+ /* Fixup EEH */
+ eeh_add_device_tree_late(bus);
/* Add new devices to global lists. Register in proc, sysfs. */
pci_bus_add_devices(bus);
- /* Fixup EEH */
- eeh_add_device_tree_late(bus);
+ /* sysfs files should only be added after devices are added */
+ eeh_add_sysfs_files(bus);
}
EXPORT_SYMBOL_GPL(pcibios_finish_adding_to_bus);
-#endif /* CONFIG_HOTPLUG */
-
int pcibios_enable_device(struct pci_dev *dev, int mask)
{
if (ppc_md.pcibios_enable_device_hook)
@@ -1556,61 +1453,57 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
return pci_enable_resources(dev, mask);
}
-void __devinit pcibios_setup_phb_resources(struct pci_controller *hose)
+resource_size_t pcibios_io_space_offset(struct pci_controller *hose)
+{
+ return (unsigned long) hose->io_base_virt - _IO_BASE;
+}
+
+static void pcibios_setup_phb_resources(struct pci_controller *hose,
+ struct list_head *resources)
{
- struct pci_bus *bus = hose->bus;
struct resource *res;
+ resource_size_t offset;
int i;
/* Hookup PHB IO resource */
- bus->resource[0] = res = &hose->io_resource;
+ res = &hose->io_resource;
if (!res->flags) {
printk(KERN_WARNING "PCI: I/O resource not set for host"
" bridge %s (domain %d)\n",
hose->dn->full_name, hose->global_number);
-#ifdef CONFIG_PPC32
- /* Workaround for lack of IO resource only on 32-bit */
- res->start = (unsigned long)hose->io_base_virt - isa_io_base;
- res->end = res->start + IO_SPACE_LIMIT;
- res->flags = IORESOURCE_IO;
-#endif /* CONFIG_PPC32 */
- }
+ } else {
+ offset = pcibios_io_space_offset(hose);
- pr_debug("PCI: PHB IO resource = %016llx-%016llx [%lx]\n",
- (unsigned long long)res->start,
- (unsigned long long)res->end,
- (unsigned long)res->flags);
+ pr_debug("PCI: PHB IO resource = %08llx-%08llx [%lx] off 0x%08llx\n",
+ (unsigned long long)res->start,
+ (unsigned long long)res->end,
+ (unsigned long)res->flags,
+ (unsigned long long)offset);
+ pci_add_resource_offset(resources, res, offset);
+ }
/* Hookup PHB Memory resources */
for (i = 0; i < 3; ++i) {
res = &hose->mem_resources[i];
if (!res->flags) {
- if (i > 0)
- continue;
- printk(KERN_ERR "PCI: Memory resource 0 not set for "
- "host bridge %s (domain %d)\n",
- hose->dn->full_name, hose->global_number);
-#ifdef CONFIG_PPC32
- /* Workaround for lack of MEM resource only on 32-bit */
- res->start = hose->pci_mem_offset;
- res->end = (resource_size_t)-1LL;
- res->flags = IORESOURCE_MEM;
-#endif /* CONFIG_PPC32 */
+ if (i == 0)
+ printk(KERN_ERR "PCI: Memory resource 0 not set for "
+ "host bridge %s (domain %d)\n",
+ hose->dn->full_name, hose->global_number);
+ continue;
}
- bus->resource[i+1] = res;
+ offset = hose->mem_offset[i];
+
- pr_debug("PCI: PHB MEM resource %d = %016llx-%016llx [%lx]\n", i,
+ pr_debug("PCI: PHB MEM resource %d = %08llx-%08llx [%lx] off 0x%08llx\n", i,
(unsigned long long)res->start,
(unsigned long long)res->end,
- (unsigned long)res->flags);
- }
-
- pr_debug("PCI: PHB MEM offset = %016llx\n",
- (unsigned long long)hose->pci_mem_offset);
- pr_debug("PCI: PHB IO offset = %08lx\n",
- (unsigned long)hose->io_base_virt - _IO_BASE);
+ (unsigned long)res->flags,
+ (unsigned long long)offset);
+ pci_add_resource_offset(resources, res, offset);
+ }
}
/*
@@ -1653,7 +1546,7 @@ fake_pci_bus(struct pci_controller *hose, int busnr)
{
static struct pci_bus bus;
- if (hose == 0) {
+ if (hose == NULL) {
printk(KERN_ERR "Can't find hose for PCI bus %d!\n", busnr);
}
bus.number = busnr;
@@ -1684,51 +1577,105 @@ int early_find_capability(struct pci_controller *hose, int bus, int devfn,
return pci_bus_find_capability(fake_pci_bus(hose, bus), devfn, cap);
}
+struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+ struct pci_controller *hose = bus->sysdata;
+
+ return of_node_get(hose->dn);
+}
+
/**
* pci_scan_phb - Given a pci_controller, setup and scan the PCI bus
* @hose: Pointer to the PCI host controller instance structure
- * @sysdata: value to use for sysdata pointer. ppc32 and ppc64 differ here
- *
- * Note: the 'data' pointer is a temporary measure. As 32 and 64 bit
- * pci code gets merged, this parameter should become unnecessary because
- * both will use the same value.
*/
-void __devinit pcibios_scan_phb(struct pci_controller *hose, void *sysdata)
+void pcibios_scan_phb(struct pci_controller *hose)
{
+ LIST_HEAD(resources);
struct pci_bus *bus;
struct device_node *node = hose->dn;
int mode;
- pr_debug("PCI: Scanning PHB %s\n",
- node ? node->full_name : "<NO NAME>");
+ pr_debug("PCI: Scanning PHB %s\n", of_node_full_name(node));
+
+ /* Get some IO space for the new PHB */
+ pcibios_setup_phb_io_space(hose);
+
+ /* Wire up PHB bus resources */
+ pcibios_setup_phb_resources(hose, &resources);
+
+ hose->busn.start = hose->first_busno;
+ hose->busn.end = hose->last_busno;
+ hose->busn.flags = IORESOURCE_BUS;
+ pci_add_resource(&resources, &hose->busn);
/* Create an empty bus for the toplevel */
- bus = pci_create_bus(hose->parent, hose->first_busno, hose->ops,
- sysdata);
+ bus = pci_create_root_bus(hose->parent, hose->first_busno,
+ hose->ops, hose, &resources);
if (bus == NULL) {
pr_err("Failed to create bus for PCI domain %04x\n",
hose->global_number);
+ pci_free_resource_list(&resources);
return;
}
- bus->secondary = hose->first_busno;
hose->bus = bus;
- /* Get some IO space for the new PHB */
- pcibios_setup_phb_io_space(hose);
-
- /* Wire up PHB bus resources */
- pcibios_setup_phb_resources(hose);
-
/* Get probe mode and perform scan */
mode = PCI_PROBE_NORMAL;
if (node && ppc_md.pci_probe_mode)
mode = ppc_md.pci_probe_mode(bus);
pr_debug(" probe mode: %d\n", mode);
- if (mode == PCI_PROBE_DEVTREE) {
- bus->subordinate = hose->last_busno;
+ if (mode == PCI_PROBE_DEVTREE)
of_scan_bus(node, bus);
+
+ if (mode == PCI_PROBE_NORMAL) {
+ pci_bus_update_busn_res_end(bus, 255);
+ hose->last_busno = pci_scan_child_bus(bus);
+ pci_bus_update_busn_res_end(bus, hose->last_busno);
}
- if (mode == PCI_PROBE_NORMAL)
- hose->last_busno = bus->subordinate = pci_scan_child_bus(bus);
+ /* Platform gets a chance to do some global fixups before
+ * we proceed to resource allocation
+ */
+ if (ppc_md.pcibios_fixup_phb)
+ ppc_md.pcibios_fixup_phb(hose);
+
+ /* Configure PCI Express settings */
+ if (bus && !pci_has_flag(PCI_PROBE_ONLY)) {
+ struct pci_bus *child;
+ list_for_each_entry(child, &bus->children, node)
+ pcie_bus_configure_settings(child);
+ }
+}
+
+static void fixup_hide_host_resource_fsl(struct pci_dev *dev)
+{
+ int i, class = dev->class >> 8;
+ /* When configured as agent, programing interface = 1 */
+ int prog_if = dev->class & 0xf;
+
+ if ((class == PCI_CLASS_PROCESSOR_POWERPC ||
+ class == PCI_CLASS_BRIDGE_OTHER) &&
+ (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) &&
+ (prog_if == 0) &&
+ (dev->bus->parent == NULL)) {
+ for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+ dev->resource[i].start = 0;
+ dev->resource[i].end = 0;
+ dev->resource[i].flags = 0;
+ }
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MOTOROLA, PCI_ANY_ID, fixup_hide_host_resource_fsl);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, PCI_ANY_ID, fixup_hide_host_resource_fsl);
+
+static void fixup_vga(struct pci_dev *pdev)
+{
+ u16 cmd;
+
+ pci_read_config_word(pdev, PCI_COMMAND, &cmd);
+ if ((cmd & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) || !vga_default_device())
+ vga_set_default_device(pdev);
+
}
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
+ PCI_CLASS_DISPLAY_VGA, 8, fixup_vga);
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
new file mode 100644
index 00000000000..5b789177aa2
--- /dev/null
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -0,0 +1,109 @@
+/*
+ * Derived from "arch/powerpc/platforms/pseries/pci_dlpar.c"
+ *
+ * Copyright (C) 2003 Linda Xie <lxie@us.ibm.com>
+ * Copyright (C) 2005 International Business Machines
+ *
+ * Updates, 2005, John Rose <johnrose@austin.ibm.com>
+ * Updates, 2005, Linas Vepstas <linas@austin.ibm.com>
+ * Updates, 2013, Gavin Shan <shangw@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/pci.h>
+#include <linux/export.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/firmware.h>
+#include <asm/eeh.h>
+
+/**
+ * pcibios_release_device - release PCI device
+ * @dev: PCI device
+ *
+ * The function is called before releasing the indicated PCI device.
+ */
+void pcibios_release_device(struct pci_dev *dev)
+{
+ eeh_remove_device(dev);
+}
+
+/**
+ * pcibios_remove_pci_devices - remove all devices under this bus
+ * @bus: the indicated PCI bus
+ *
+ * Remove all of the PCI devices under this bus both from the
+ * linux pci device tree, and from the powerpc EEH address cache.
+ */
+void pcibios_remove_pci_devices(struct pci_bus *bus)
+{
+ struct pci_dev *dev, *tmp;
+ struct pci_bus *child_bus;
+
+ /* First go down child busses */
+ list_for_each_entry(child_bus, &bus->children, node)
+ pcibios_remove_pci_devices(child_bus);
+
+ pr_debug("PCI: Removing devices on bus %04x:%02x\n",
+ pci_domain_nr(bus), bus->number);
+ list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) {
+ pr_debug(" Removing %s...\n", pci_name(dev));
+ pci_stop_and_remove_bus_device(dev);
+ }
+}
+
+EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
+
+/**
+ * pcibios_add_pci_devices - adds new pci devices to bus
+ * @bus: the indicated PCI bus
+ *
+ * This routine will find and fixup new pci devices under
+ * the indicated bus. This routine presumes that there
+ * might already be some devices under this bridge, so
+ * it carefully tries to add only new devices. (And that
+ * is how this routine differs from other, similar pcibios
+ * routines.)
+ */
+void pcibios_add_pci_devices(struct pci_bus * bus)
+{
+ int slotno, mode, pass, max;
+ struct pci_dev *dev;
+ struct device_node *dn = pci_bus_to_OF_node(bus);
+
+ eeh_add_device_tree_early(dn);
+
+ mode = PCI_PROBE_NORMAL;
+ if (ppc_md.pci_probe_mode)
+ mode = ppc_md.pci_probe_mode(bus);
+
+ if (mode == PCI_PROBE_DEVTREE) {
+ /* use ofdt-based probe */
+ of_rescan_bus(dn, bus);
+ } else if (mode == PCI_PROBE_NORMAL) {
+ /*
+ * Use legacy probe. In the partial hotplug case, we
+ * probably have grandchildren devices unplugged. So
+ * we don't check the return value from pci_scan_slot() in
+ * order for fully rescan all the way down to pick them up.
+ * They can have been removed during partial hotplug.
+ */
+ slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
+ pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
+ pcibios_setup_bus_devices(bus);
+ max = bus->busn_res.start;
+ for (pass = 0; pass < 2; pass++) {
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ if (pci_is_bridge(dev))
+ max = pci_scan_bridge(bus, dev,
+ max, pass);
+ }
+ }
+ }
+ pcibios_finish_adding_to_bus(bus);
+}
+EXPORT_SYMBOL_GPL(pcibios_add_pci_devices);
diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c
index e7db5b48004..432459c817f 100644
--- a/arch/powerpc/kernel/pci_32.c
+++ b/arch/powerpc/kernel/pci_32.c
@@ -15,6 +15,7 @@
#include <linux/list.h>
#include <linux/of.h>
#include <linux/slab.h>
+#include <linux/export.h>
#include <asm/processor.h>
#include <asm/io.h>
@@ -51,25 +52,6 @@ struct pci_dev *isa_bridge_pcidev;
EXPORT_SYMBOL_GPL(isa_bridge_pcidev);
static void
-fixup_hide_host_resource_fsl(struct pci_dev *dev)
-{
- int i, class = dev->class >> 8;
-
- if ((class == PCI_CLASS_PROCESSOR_POWERPC ||
- class == PCI_CLASS_BRIDGE_OTHER) &&
- (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) &&
- (dev->bus->parent == NULL)) {
- for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
- dev->resource[i].start = 0;
- dev->resource[i].end = 0;
- dev->resource[i].flags = 0;
- }
- }
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MOTOROLA, PCI_ANY_ID, fixup_hide_host_resource_fsl);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, PCI_ANY_ID, fixup_hide_host_resource_fsl);
-
-static void
fixup_cpc710_pci64(struct pci_dev* dev)
{
/* Hide the PCI64 BARs from the kernel as their content doesn't
@@ -167,150 +149,26 @@ pcibios_make_OF_bus_map(void)
#endif
}
-typedef int (*pci_OF_scan_iterator)(struct device_node* node, void* data);
-
-static struct device_node*
-scan_OF_pci_childs(struct device_node *parent, pci_OF_scan_iterator filter, void* data)
-{
- struct device_node *node;
- struct device_node* sub_node;
-
- for_each_child_of_node(parent, node) {
- const unsigned int *class_code;
-
- if (filter(node, data)) {
- of_node_put(node);
- return node;
- }
-
- /* For PCI<->PCI bridges or CardBus bridges, we go down
- * Note: some OFs create a parent node "multifunc-device" as
- * a fake root for all functions of a multi-function device,
- * we go down them as well.
- */
- class_code = of_get_property(node, "class-code", NULL);
- if ((!class_code || ((*class_code >> 8) != PCI_CLASS_BRIDGE_PCI &&
- (*class_code >> 8) != PCI_CLASS_BRIDGE_CARDBUS)) &&
- strcmp(node->name, "multifunc-device"))
- continue;
- sub_node = scan_OF_pci_childs(node, filter, data);
- if (sub_node) {
- of_node_put(node);
- return sub_node;
- }
- }
- return NULL;
-}
-
-static struct device_node *scan_OF_for_pci_dev(struct device_node *parent,
- unsigned int devfn)
-{
- struct device_node *np, *cnp;
- const u32 *reg;
- unsigned int psize;
-
- for_each_child_of_node(parent, np) {
- reg = of_get_property(np, "reg", &psize);
- if (reg && psize >= 4 && ((reg[0] >> 8) & 0xff) == devfn)
- return np;
-
- /* Note: some OFs create a parent node "multifunc-device" as
- * a fake root for all functions of a multi-function device,
- * we go down them as well. */
- if (!strcmp(np->name, "multifunc-device")) {
- cnp = scan_OF_for_pci_dev(np, devfn);
- if (cnp)
- return cnp;
- }
- }
- return NULL;
-}
-
-
-static struct device_node *scan_OF_for_pci_bus(struct pci_bus *bus)
-{
- struct device_node *parent, *np;
-
- /* Are we a root bus ? */
- if (bus->self == NULL || bus->parent == NULL) {
- struct pci_controller *hose = pci_bus_to_host(bus);
- if (hose == NULL)
- return NULL;
- return of_node_get(hose->dn);
- }
-
- /* not a root bus, we need to get our parent */
- parent = scan_OF_for_pci_bus(bus->parent);
- if (parent == NULL)
- return NULL;
-
- /* now iterate for children for a match */
- np = scan_OF_for_pci_dev(parent, bus->self->devfn);
- of_node_put(parent);
-
- return np;
-}
-
-/*
- * Scans the OF tree for a device node matching a PCI device
- */
-struct device_node *
-pci_busdev_to_OF_node(struct pci_bus *bus, int devfn)
-{
- struct device_node *parent, *np;
-
- pr_debug("pci_busdev_to_OF_node(%d,0x%x)\n", bus->number, devfn);
- parent = scan_OF_for_pci_bus(bus);
- if (parent == NULL)
- return NULL;
- pr_debug(" parent is %s\n", parent ? parent->full_name : "<NULL>");
- np = scan_OF_for_pci_dev(parent, devfn);
- of_node_put(parent);
- pr_debug(" result is %s\n", np ? np->full_name : "<NULL>");
-
- /* XXX most callers don't release the returned node
- * mostly because ppc64 doesn't increase the refcount,
- * we need to fix that.
- */
- return np;
-}
-EXPORT_SYMBOL(pci_busdev_to_OF_node);
-
-struct device_node*
-pci_device_to_OF_node(struct pci_dev *dev)
-{
- return pci_busdev_to_OF_node(dev->bus, dev->devfn);
-}
-EXPORT_SYMBOL(pci_device_to_OF_node);
-
-static int
-find_OF_pci_device_filter(struct device_node* node, void* data)
-{
- return ((void *)node == data);
-}
/*
* Returns the PCI device matching a given OF node
*/
-int
-pci_device_from_OF_node(struct device_node* node, u8* bus, u8* devfn)
+int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn)
{
- const unsigned int *reg;
- struct pci_controller* hose;
- struct pci_dev* dev = NULL;
-
- /* Make sure it's really a PCI device */
- hose = pci_find_hose_for_OF_device(node);
- if (!hose || !hose->dn)
- return -ENODEV;
- if (!scan_OF_pci_childs(hose->dn,
- find_OF_pci_device_filter, (void *)node))
+ struct pci_dev *dev = NULL;
+ const __be32 *reg;
+ int size;
+
+ /* Check if it might have a chance to be a PCI device */
+ if (!pci_find_hose_for_OF_device(node))
return -ENODEV;
- reg = of_get_property(node, "reg", NULL);
- if (!reg)
+
+ reg = of_get_property(node, "reg", &size);
+ if (!reg || size < 5 * sizeof(u32))
return -ENODEV;
- *bus = (reg[0] >> 16) & 0xff;
- *devfn = ((reg[0] >> 8) & 0xff);
+
+ *bus = (be32_to_cpup(&reg[0]) >> 16) & 0xff;
+ *devfn = (be32_to_cpup(&reg[0]) >> 8) & 0xff;
/* Ok, here we need some tweak. If we have already renumbered
* all busses, we can't rely on the OF bus number any more.
@@ -350,20 +208,20 @@ pci_create_OF_bus_map(void)
of_prop->name = "pci-OF-bus-map";
of_prop->length = 256;
of_prop->value = &of_prop[1];
- prom_add_property(dn, of_prop);
+ of_add_property(dn, of_prop);
of_node_put(dn);
}
}
-void __devinit pcibios_setup_phb_io_space(struct pci_controller *hose)
+void pcibios_setup_phb_io_space(struct pci_controller *hose)
{
unsigned long io_offset;
struct resource *res = &hose->io_resource;
/* Fixup IO space offset */
- io_offset = (unsigned long)hose->io_base_virt - isa_io_base;
- res->start = (res->start + io_offset) & 0xffffffffu;
- res->end = (res->end + io_offset) & 0xffffffffu;
+ io_offset = pcibios_io_space_offset(hose);
+ res->start += io_offset;
+ res->end += io_offset;
}
static int __init pcibios_init(void)
@@ -373,7 +231,7 @@ static int __init pcibios_init(void)
printk(KERN_INFO "PCI: Probing PCI hardware\n");
- if (ppc_pci_flags & PPC_PCI_REASSIGN_ALL_BUS)
+ if (pci_has_flag(PCI_REASSIGN_ALL_BUS))
pci_assign_all_buses = 1;
/* Scan all of the recorded PCI controllers. */
@@ -381,7 +239,7 @@ static int __init pcibios_init(void)
if (pci_assign_all_buses)
hose->first_busno = next_busno;
hose->last_busno = 0xff;
- pcibios_scan_phb(hose, hose);
+ pcibios_scan_phb(hose);
pci_bus_add_devices(hose->bus);
if (pci_assign_all_buses || next_busno <= hose->last_busno)
next_busno = hose->last_busno + pcibios_assign_bus_offset;
@@ -437,7 +295,7 @@ long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn)
case IOBASE_BRIDGE_NUMBER:
return (long)hose->first_busno;
case IOBASE_MEMORY:
- return (long)hose->pci_mem_offset;
+ return (long)hose->mem_offset[0];
case IOBASE_IO:
return (long)hose->io_base_phys;
case IOBASE_ISA_IO:
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index d43fc65749c..155013da27e 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -18,6 +18,7 @@
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
+#include <linux/export.h>
#include <linux/mm.h>
#include <linux/list.h>
#include <linux/syscalls.h>
@@ -32,8 +33,6 @@
#include <asm/machdep.h>
#include <asm/ppc-pci.h>
-unsigned long pci_probe_only = 1;
-
/* pci_io_base -- the base address from which io bars are offsets.
* This is the lowest I/O base address (so bar values are always positive),
* and it *must* be the start of ISA space if an ISA bus exists because
@@ -54,17 +53,14 @@ static int __init pcibios_init(void)
*/
ppc_md.phys_mem_access_prot = pci_phys_mem_access_prot;
- if (pci_probe_only)
- ppc_pci_flags |= PPC_PCI_PROBE_ONLY;
-
/* On ppc64, we always enable PCI domains and we keep domain 0
* backward compatible in /proc for video cards
*/
- ppc_pci_flags |= PPC_PCI_ENABLE_PROC_DOMAINS | PPC_PCI_COMPAT_DOMAIN_0;
+ pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0);
/* Scan all of the recorded PCI controllers. */
list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
- pcibios_scan_phb(hose, hose->dn);
+ pcibios_scan_phb(hose);
pci_bus_add_devices(hose->bus);
}
@@ -78,8 +74,6 @@ static int __init pcibios_init(void)
subsys_initcall(pcibios_init);
-#ifdef CONFIG_HOTPLUG
-
int pcibios_unmap_io_space(struct pci_bus *bus)
{
struct pci_controller *hose;
@@ -115,7 +109,7 @@ int pcibios_unmap_io_space(struct pci_bus *bus)
hose = pci_bus_to_host(bus);
/* Check if we have IOs allocated */
- if (hose->io_base_alloc == 0)
+ if (hose->io_base_alloc == NULL)
return 0;
pr_debug("IO unmapping for PHB %s\n", hose->dn->full_name);
@@ -128,32 +122,13 @@ int pcibios_unmap_io_space(struct pci_bus *bus)
}
EXPORT_SYMBOL_GPL(pcibios_unmap_io_space);
-#endif /* CONFIG_HOTPLUG */
-
-int __devinit pcibios_map_io_space(struct pci_bus *bus)
+static int pcibios_map_phb_io_space(struct pci_controller *hose)
{
struct vm_struct *area;
unsigned long phys_page;
unsigned long size_page;
unsigned long io_virt_offset;
- struct pci_controller *hose;
- WARN_ON(bus == NULL);
-
- /* If this not a PHB, nothing to do, page tables still exist and
- * thus HPTEs will be faulted in when needed
- */
- if (bus->self) {
- pr_debug("IO mapping for PCI-PCI bridge %s\n",
- pci_name(bus->self));
- pr_debug(" virt=0x%016llx...0x%016llx\n",
- bus->resource[0]->start + _IO_BASE,
- bus->resource[0]->end + _IO_BASE);
- return 0;
- }
-
- /* Get the host bridge */
- hose = pci_bus_to_host(bus);
phys_page = _ALIGN_DOWN(hose->io_base_phys, PAGE_SIZE);
size_page = _ALIGN_UP(hose->pci_io_size, PAGE_SIZE);
@@ -189,20 +164,38 @@ int __devinit pcibios_map_io_space(struct pci_bus *bus)
return -ENOMEM;
/* Fixup hose IO resource */
- io_virt_offset = (unsigned long)hose->io_base_virt - _IO_BASE;
+ io_virt_offset = pcibios_io_space_offset(hose);
hose->io_resource.start += io_virt_offset;
hose->io_resource.end += io_virt_offset;
- pr_debug(" hose->io_resource=0x%016llx...0x%016llx\n",
- hose->io_resource.start, hose->io_resource.end);
+ pr_debug(" hose->io_resource=%pR\n", &hose->io_resource);
return 0;
}
+
+int pcibios_map_io_space(struct pci_bus *bus)
+{
+ WARN_ON(bus == NULL);
+
+ /* If this not a PHB, nothing to do, page tables still exist and
+ * thus HPTEs will be faulted in when needed
+ */
+ if (bus->self) {
+ pr_debug("IO mapping for PCI-PCI bridge %s\n",
+ pci_name(bus->self));
+ pr_debug(" virt=0x%016llx...0x%016llx\n",
+ bus->resource[0]->start + _IO_BASE,
+ bus->resource[0]->end + _IO_BASE);
+ return 0;
+ }
+
+ return pcibios_map_phb_io_space(pci_bus_to_host(bus));
+}
EXPORT_SYMBOL_GPL(pcibios_map_io_space);
-void __devinit pcibios_setup_phb_io_space(struct pci_controller *hose)
+void pcibios_setup_phb_io_space(struct pci_controller *hose)
{
- pcibios_map_io_space(hose->bus);
+ pcibios_map_phb_io_space(hose);
}
#define IOBASE_BRIDGE_NUMBER 0
@@ -215,8 +208,7 @@ long sys_pciconfig_iobase(long which, unsigned long in_bus,
unsigned long in_devfn)
{
struct pci_controller* hose;
- struct list_head *ln;
- struct pci_bus *bus = NULL;
+ struct pci_bus *tmp_bus, *bus = NULL;
struct device_node *hose_node;
/* Argh ! Please forgive me for that hack, but that's the
@@ -237,23 +229,24 @@ long sys_pciconfig_iobase(long which, unsigned long in_bus,
* used on pre-domains setup. We return the first match
*/
- for (ln = pci_root_buses.next; ln != &pci_root_buses; ln = ln->next) {
- bus = pci_bus_b(ln);
- if (in_bus >= bus->number && in_bus <= bus->subordinate)
+ list_for_each_entry(tmp_bus, &pci_root_buses, node) {
+ if (in_bus >= tmp_bus->number &&
+ in_bus <= tmp_bus->busn_res.end) {
+ bus = tmp_bus;
break;
- bus = NULL;
+ }
}
- if (bus == NULL || bus->sysdata == NULL)
+ if (bus == NULL || bus->dev.of_node == NULL)
return -ENODEV;
- hose_node = (struct device_node *)bus->sysdata;
+ hose_node = bus->dev.of_node;
hose = PCI_DN(hose_node)->phb;
switch (which) {
case IOBASE_BRIDGE_NUMBER:
return (long)hose->first_busno;
case IOBASE_MEMORY:
- return (long)hose->pci_mem_offset;
+ return (long)hose->mem_offset[0];
case IOBASE_IO:
return (long)hose->io_base_phys;
case IOBASE_ISA_IO:
@@ -273,3 +266,13 @@ int pcibus_to_node(struct pci_bus *bus)
}
EXPORT_SYMBOL(pcibus_to_node);
#endif
+
+static void quirk_radeon_32bit_msi(struct pci_dev *dev)
+{
+ struct pci_dn *pdn = pci_get_pdn(dev);
+
+ if (pdn)
+ pdn->force_32bit_msi = true;
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x68f2, quirk_radeon_32bit_msi);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0xaa68, quirk_radeon_32bit_msi);
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index d56b35ee7f7..1f61fab59d9 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -22,6 +22,7 @@
#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/string.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/gfp.h>
@@ -31,33 +32,44 @@
#include <asm/ppc-pci.h>
#include <asm/firmware.h>
+struct pci_dn *pci_get_pdn(struct pci_dev *pdev)
+{
+ struct device_node *dn = pci_device_to_OF_node(pdev);
+ if (!dn)
+ return NULL;
+ return PCI_DN(dn);
+}
+
/*
* Traverse_func that inits the PCI fields of the device node.
* NOTE: this *must* be done before read/write config to the device.
*/
-void * __devinit update_dn_pci_info(struct device_node *dn, void *data)
+void *update_dn_pci_info(struct device_node *dn, void *data)
{
struct pci_controller *phb = data;
- const int *type =
- of_get_property(dn, "ibm,pci-config-space-type", NULL);
- const u32 *regs;
+ const __be32 *type = of_get_property(dn, "ibm,pci-config-space-type", NULL);
+ const __be32 *regs;
struct pci_dn *pdn;
- pdn = alloc_maybe_bootmem(sizeof(*pdn), GFP_KERNEL);
+ pdn = zalloc_maybe_bootmem(sizeof(*pdn), GFP_KERNEL);
if (pdn == NULL)
return NULL;
- memset(pdn, 0, sizeof(*pdn));
dn->data = pdn;
pdn->node = dn;
pdn->phb = phb;
+#ifdef CONFIG_PPC_POWERNV
+ pdn->pe_number = IODA_INVALID_PE;
+#endif
regs = of_get_property(dn, "reg", NULL);
if (regs) {
+ u32 addr = of_read_number(regs, 1);
+
/* First register entry is addr (00BBSS00) */
- pdn->busno = (regs[0] >> 16) & 0xff;
- pdn->devfn = (regs[0] >> 8) & 0xff;
+ pdn->busno = (addr >> 16) & 0xff;
+ pdn->devfn = (addr >> 8) & 0xff;
}
- pdn->pci_ext_config_space = (type && *type == 1);
+ pdn->pci_ext_config_space = (type && of_read_number(type, 1) == 1);
return NULL;
}
@@ -87,12 +99,13 @@ void *traverse_pci_devices(struct device_node *start, traverse_func pre,
/* We started with a phb, iterate all childs */
for (dn = start->child; dn; dn = nextdn) {
- const u32 *classp;
- u32 class;
+ const __be32 *classp;
+ u32 class = 0;
nextdn = NULL;
classp = of_get_property(dn, "class-code", NULL);
- class = classp ? *classp : 0;
+ if (classp)
+ class = of_read_number(classp, 1);
if (pre && ((ret = pre(dn, data)) != NULL))
return ret;
@@ -126,7 +139,7 @@ void *traverse_pci_devices(struct device_node *start, traverse_func pre,
* subsystem is set up, before kmalloc is valid) and during the
* dynamic lpar operation of adding a PHB to a running system.
*/
-void __devinit pci_devs_phb_init_dynamic(struct pci_controller *phb)
+void pci_devs_phb_init_dynamic(struct pci_controller *phb)
{
struct device_node *dn = phb->dn;
struct pci_dn *pdn;
@@ -143,47 +156,6 @@ void __devinit pci_devs_phb_init_dynamic(struct pci_controller *phb)
traverse_pci_devices(dn, update_dn_pci_info, phb);
}
-/*
- * Traversal func that looks for a <busno,devfcn> value.
- * If found, the pci_dn is returned (thus terminating the traversal).
- */
-static void *is_devfn_node(struct device_node *dn, void *data)
-{
- int busno = ((unsigned long)data >> 8) & 0xff;
- int devfn = ((unsigned long)data) & 0xff;
- struct pci_dn *pci = dn->data;
-
- if (pci && (devfn == pci->devfn) && (busno == pci->busno))
- return dn;
- return NULL;
-}
-
-/*
- * This is the "slow" path for looking up a device_node from a
- * pci_dev. It will hunt for the device under its parent's
- * phb and then update sysdata for a future fastpath.
- *
- * It may also do fixups on the actual device since this happens
- * on the first read/write.
- *
- * Note that it also must deal with devices that don't exist.
- * In this case it may probe for real hardware ("just in case")
- * and add a device_node to the device tree if necessary.
- *
- */
-struct device_node *fetch_dev_dn(struct pci_dev *dev)
-{
- struct device_node *orig_dn = dev->sysdata;
- struct device_node *dn;
- unsigned long searchval = (dev->bus->number << 8) | dev->devfn;
-
- dn = traverse_pci_devices(orig_dn, is_devfn_node, (void *)searchval);
- if (dn)
- dev->sysdata = dn;
- return dn;
-}
-EXPORT_SYMBOL(fetch_dev_dn);
-
/**
* pci_devs_phb_init - Initialize phbs and pci devs under them.
*
diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c
index e751506323b..44562aa97f1 100644
--- a/arch/powerpc/kernel/pci_of_scan.c
+++ b/arch/powerpc/kernel/pci_of_scan.c
@@ -15,6 +15,7 @@
*/
#include <linux/pci.h>
+#include <linux/export.h>
#include <asm/pci-bridge.h>
#include <asm/prom.h>
@@ -23,12 +24,12 @@
*/
static u32 get_int_prop(struct device_node *np, const char *name, u32 def)
{
- const u32 *prop;
+ const __be32 *prop;
int len;
prop = of_get_property(np, name, &len);
if (prop && len >= 4)
- return *prop;
+ return of_read_number(prop, 1);
return def;
}
@@ -74,8 +75,9 @@ static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev)
{
u64 base, size;
unsigned int flags;
+ struct pci_bus_region region;
struct resource *res;
- const u32 *addrs;
+ const __be32 *addrs;
u32 i;
int proplen;
@@ -84,14 +86,14 @@ static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev)
return;
pr_debug(" parse addresses (%d bytes) @ %p\n", proplen, addrs);
for (; proplen >= 20; proplen -= 20, addrs += 5) {
- flags = pci_parse_of_flags(addrs[0], 0);
+ flags = pci_parse_of_flags(of_read_number(addrs, 1), 0);
if (!flags)
continue;
base = of_read_number(&addrs[1], 2);
size = of_read_number(&addrs[3], 2);
if (!size)
continue;
- i = addrs[0] & 0xff;
+ i = of_read_number(addrs, 1) & 0xff;
pr_debug(" base: %llx, size: %llx, i: %x\n",
(unsigned long long)base,
(unsigned long long)size, i);
@@ -105,10 +107,11 @@ static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev)
printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i);
continue;
}
- res->start = base;
- res->end = base + size - 1;
res->flags = flags;
res->name = pci_name(dev);
+ region.start = base;
+ region.end = base + size - 1;
+ pcibios_bus_to_resource(dev->bus, res, &region);
}
}
@@ -125,7 +128,7 @@ struct pci_dev *of_create_pci_dev(struct device_node *node,
const char *type;
struct pci_slot *slot;
- dev = alloc_pci_dev();
+ dev = pci_alloc_dev(bus);
if (!dev)
return NULL;
type = of_get_property(node, "device_type", NULL);
@@ -134,8 +137,7 @@ struct pci_dev *of_create_pci_dev(struct device_node *node,
pr_debug(" create device, devfn: %x, type: %s\n", devfn, type);
- dev->bus = bus;
- dev->sysdata = node;
+ dev->dev.of_node = of_node_get(node);
dev->dev.parent = bus->bridge;
dev->dev.bus = &pci_bus_type;
dev->devfn = devfn;
@@ -162,7 +164,7 @@ struct pci_dev *of_create_pci_dev(struct device_node *node,
pr_debug(" class: 0x%x\n", dev->class);
pr_debug(" revision: 0x%x\n", dev->revision);
- dev->current_state = 4; /* unknown power state */
+ dev->current_state = PCI_UNKNOWN; /* unknown power state */
dev->error_state = pci_channel_io_normal;
dev->dma_mask = 0xffffffff;
@@ -195,19 +197,19 @@ EXPORT_SYMBOL(of_create_pci_dev);
/**
* of_scan_pci_bridge - Set up a PCI bridge and scan for child nodes
- * @node: device tree node of bridge
* @dev: pci_dev structure for the bridge
*
* of_scan_bus() calls this routine for each PCI bridge that it finds, and
* this routine in turn call of_scan_bus() recusively to scan for more child
* devices.
*/
-void __devinit of_scan_pci_bridge(struct device_node *node,
- struct pci_dev *dev)
+void of_scan_pci_bridge(struct pci_dev *dev)
{
+ struct device_node *node = dev->dev.of_node;
struct pci_bus *bus;
- const u32 *busrange, *ranges;
+ const __be32 *busrange, *ranges;
int len, i, mode;
+ struct pci_bus_region region;
struct resource *res;
unsigned int flags;
u64 size;
@@ -228,17 +230,22 @@ void __devinit of_scan_pci_bridge(struct device_node *node,
return;
}
- bus = pci_add_new_bus(dev->bus, dev, busrange[0]);
+ bus = pci_find_bus(pci_domain_nr(dev->bus),
+ of_read_number(busrange, 1));
if (!bus) {
- printk(KERN_ERR "Failed to create pci bus for %s\n",
- node->full_name);
- return;
+ bus = pci_add_new_bus(dev->bus, dev,
+ of_read_number(busrange, 1));
+ if (!bus) {
+ printk(KERN_ERR "Failed to create pci bus for %s\n",
+ node->full_name);
+ return;
+ }
}
bus->primary = dev->bus->number;
- bus->subordinate = busrange[1];
+ pci_bus_insert_busn_res(bus, of_read_number(busrange, 1),
+ of_read_number(busrange+1, 1));
bus->bridge_ctl = 0;
- bus->sysdata = node;
/* parse ranges property */
/* PCI #address-cells == 3 and #size-cells == 2 always */
@@ -250,7 +257,7 @@ void __devinit of_scan_pci_bridge(struct device_node *node,
}
i = 1;
for (; len >= 32; len -= 32, ranges += 8) {
- flags = pci_parse_of_flags(ranges[0], 1);
+ flags = pci_parse_of_flags(of_read_number(ranges, 1), 1);
size = of_read_number(&ranges[6], 2);
if (flags == 0 || size == 0)
continue;
@@ -270,9 +277,10 @@ void __devinit of_scan_pci_bridge(struct device_node *node,
res = bus->resource[i];
++i;
}
- res->start = of_read_number(&ranges[1], 2);
- res->end = res->start + size - 1;
res->flags = flags;
+ region.start = of_read_number(&ranges[1], 2);
+ region.end = region.start + size - 1;
+ pcibios_bus_to_resource(dev->bus, res, &region);
}
sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus),
bus->number);
@@ -290,18 +298,57 @@ void __devinit of_scan_pci_bridge(struct device_node *node,
}
EXPORT_SYMBOL(of_scan_pci_bridge);
+static struct pci_dev *of_scan_pci_dev(struct pci_bus *bus,
+ struct device_node *dn)
+{
+ struct pci_dev *dev = NULL;
+ const __be32 *reg;
+ int reglen, devfn;
+#ifdef CONFIG_EEH
+ struct eeh_dev *edev = of_node_to_eeh_dev(dn);
+#endif
+
+ pr_debug(" * %s\n", dn->full_name);
+ if (!of_device_is_available(dn))
+ return NULL;
+
+ reg = of_get_property(dn, "reg", &reglen);
+ if (reg == NULL || reglen < 20)
+ return NULL;
+ devfn = (of_read_number(reg, 1) >> 8) & 0xff;
+
+ /* Check if the PCI device is already there */
+ dev = pci_get_slot(bus, devfn);
+ if (dev) {
+ pci_dev_put(dev);
+ return dev;
+ }
+
+ /* Device removed permanently ? */
+#ifdef CONFIG_EEH
+ if (edev && (edev->mode & EEH_DEV_REMOVED))
+ return NULL;
+#endif
+
+ /* create a new pci_dev for this device */
+ dev = of_create_pci_dev(dn, bus, devfn);
+ if (!dev)
+ return NULL;
+
+ pr_debug(" dev header type: %x\n", dev->hdr_type);
+ return dev;
+}
+
/**
* __of_scan_bus - given a PCI bus node, setup bus and scan for child devices
* @node: device tree node for the PCI bus
* @bus: pci_bus structure for the PCI bus
* @rescan_existing: Flag indicating bus has already been set up
*/
-static void __devinit __of_scan_bus(struct device_node *node,
- struct pci_bus *bus, int rescan_existing)
+static void __of_scan_bus(struct device_node *node, struct pci_bus *bus,
+ int rescan_existing)
{
struct device_node *child;
- const u32 *reg;
- int reglen, devfn;
struct pci_dev *dev;
pr_debug("of_scan_bus(%s) bus no %d...\n",
@@ -309,16 +356,7 @@ static void __devinit __of_scan_bus(struct device_node *node,
/* Scan direct children */
for_each_child_of_node(node, child) {
- pr_debug(" * %s\n", child->full_name);
- if (!of_device_is_available(child))
- continue;
- reg = of_get_property(child, "reg", &reglen);
- if (reg == NULL || reglen < 20)
- continue;
- devfn = (reg[0] >> 8) & 0xff;
-
- /* create a new pci_dev for this device */
- dev = of_create_pci_dev(child, bus, devfn);
+ dev = of_scan_pci_dev(bus, child);
if (!dev)
continue;
pr_debug(" dev header type: %x\n", dev->hdr_type);
@@ -333,11 +371,8 @@ static void __devinit __of_scan_bus(struct device_node *node,
/* Now scan child busses */
list_for_each_entry(dev, &bus->devices, bus_list) {
- if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
- dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) {
- struct device_node *child = pci_device_to_OF_node(dev);
- if (child)
- of_scan_pci_bridge(child, dev);
+ if (pci_is_bridge(dev)) {
+ of_scan_pci_bridge(dev);
}
}
}
@@ -347,8 +382,7 @@ static void __devinit __of_scan_bus(struct device_node *node,
* @node: device tree node for the PCI bus
* @bus: pci_bus structure for the PCI bus
*/
-void __devinit of_scan_bus(struct device_node *node,
- struct pci_bus *bus)
+void of_scan_bus(struct device_node *node, struct pci_bus *bus)
{
__of_scan_bus(node, bus, 0);
}
@@ -362,8 +396,7 @@ EXPORT_SYMBOL_GPL(of_scan_bus);
* Same as of_scan_bus, but for a pci_bus structure that has already been
* setup.
*/
-void __devinit of_rescan_bus(struct device_node *node,
- struct pci_bus *bus)
+void of_rescan_bus(struct device_node *node, struct pci_bus *bus)
{
__of_scan_bus(node, bus, 1);
}
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c
deleted file mode 100644
index d05ae4204bb..00000000000
--- a/arch/powerpc/kernel/perf_callchain.c
+++ /dev/null
@@ -1,478 +0,0 @@
-/*
- * Performance counter callchain support - powerpc architecture code
- *
- * Copyright © 2009 Paul Mackerras, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/perf_event.h>
-#include <linux/percpu.h>
-#include <linux/uaccess.h>
-#include <linux/mm.h>
-#include <asm/ptrace.h>
-#include <asm/pgtable.h>
-#include <asm/sigcontext.h>
-#include <asm/ucontext.h>
-#include <asm/vdso.h>
-#ifdef CONFIG_PPC64
-#include "ppc32.h"
-#endif
-
-
-/*
- * Is sp valid as the address of the next kernel stack frame after prev_sp?
- * The next frame may be in a different stack area but should not go
- * back down in the same stack area.
- */
-static int valid_next_sp(unsigned long sp, unsigned long prev_sp)
-{
- if (sp & 0xf)
- return 0; /* must be 16-byte aligned */
- if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
- return 0;
- if (sp >= prev_sp + STACK_FRAME_OVERHEAD)
- return 1;
- /*
- * sp could decrease when we jump off an interrupt stack
- * back to the regular process stack.
- */
- if ((sp & ~(THREAD_SIZE - 1)) != (prev_sp & ~(THREAD_SIZE - 1)))
- return 1;
- return 0;
-}
-
-void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
-{
- unsigned long sp, next_sp;
- unsigned long next_ip;
- unsigned long lr;
- long level = 0;
- unsigned long *fp;
-
- lr = regs->link;
- sp = regs->gpr[1];
- perf_callchain_store(entry, regs->nip);
-
- if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
- return;
-
- for (;;) {
- fp = (unsigned long *) sp;
- next_sp = fp[0];
-
- if (next_sp == sp + STACK_INT_FRAME_SIZE &&
- fp[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
- /*
- * This looks like an interrupt frame for an
- * interrupt that occurred in the kernel
- */
- regs = (struct pt_regs *)(sp + STACK_FRAME_OVERHEAD);
- next_ip = regs->nip;
- lr = regs->link;
- level = 0;
- perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
-
- } else {
- if (level == 0)
- next_ip = lr;
- else
- next_ip = fp[STACK_FRAME_LR_SAVE];
-
- /*
- * We can't tell which of the first two addresses
- * we get are valid, but we can filter out the
- * obviously bogus ones here. We replace them
- * with 0 rather than removing them entirely so
- * that userspace can tell which is which.
- */
- if ((level == 1 && next_ip == lr) ||
- (level <= 1 && !kernel_text_address(next_ip)))
- next_ip = 0;
-
- ++level;
- }
-
- perf_callchain_store(entry, next_ip);
- if (!valid_next_sp(next_sp, sp))
- return;
- sp = next_sp;
- }
-}
-
-#ifdef CONFIG_PPC64
-/*
- * On 64-bit we don't want to invoke hash_page on user addresses from
- * interrupt context, so if the access faults, we read the page tables
- * to find which page (if any) is mapped and access it directly.
- */
-static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
-{
- pgd_t *pgdir;
- pte_t *ptep, pte;
- unsigned shift;
- unsigned long addr = (unsigned long) ptr;
- unsigned long offset;
- unsigned long pfn;
- void *kaddr;
-
- pgdir = current->mm->pgd;
- if (!pgdir)
- return -EFAULT;
-
- ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
- if (!shift)
- shift = PAGE_SHIFT;
-
- /* align address to page boundary */
- offset = addr & ((1UL << shift) - 1);
- addr -= offset;
-
- if (ptep == NULL)
- return -EFAULT;
- pte = *ptep;
- if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER))
- return -EFAULT;
- pfn = pte_pfn(pte);
- if (!page_is_ram(pfn))
- return -EFAULT;
-
- /* no highmem to worry about here */
- kaddr = pfn_to_kaddr(pfn);
- memcpy(ret, kaddr + offset, nb);
- return 0;
-}
-
-static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
-{
- if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) ||
- ((unsigned long)ptr & 7))
- return -EFAULT;
-
- if (!__get_user_inatomic(*ret, ptr))
- return 0;
-
- return read_user_stack_slow(ptr, ret, 8);
-}
-
-static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
-{
- if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
- ((unsigned long)ptr & 3))
- return -EFAULT;
-
- if (!__get_user_inatomic(*ret, ptr))
- return 0;
-
- return read_user_stack_slow(ptr, ret, 4);
-}
-
-static inline int valid_user_sp(unsigned long sp, int is_64)
-{
- if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32)
- return 0;
- return 1;
-}
-
-/*
- * 64-bit user processes use the same stack frame for RT and non-RT signals.
- */
-struct signal_frame_64 {
- char dummy[__SIGNAL_FRAMESIZE];
- struct ucontext uc;
- unsigned long unused[2];
- unsigned int tramp[6];
- struct siginfo *pinfo;
- void *puc;
- struct siginfo info;
- char abigap[288];
-};
-
-static int is_sigreturn_64_address(unsigned long nip, unsigned long fp)
-{
- if (nip == fp + offsetof(struct signal_frame_64, tramp))
- return 1;
- if (vdso64_rt_sigtramp && current->mm->context.vdso_base &&
- nip == current->mm->context.vdso_base + vdso64_rt_sigtramp)
- return 1;
- return 0;
-}
-
-/*
- * Do some sanity checking on the signal frame pointed to by sp.
- * We check the pinfo and puc pointers in the frame.
- */
-static int sane_signal_64_frame(unsigned long sp)
-{
- struct signal_frame_64 __user *sf;
- unsigned long pinfo, puc;
-
- sf = (struct signal_frame_64 __user *) sp;
- if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) ||
- read_user_stack_64((unsigned long __user *) &sf->puc, &puc))
- return 0;
- return pinfo == (unsigned long) &sf->info &&
- puc == (unsigned long) &sf->uc;
-}
-
-static void perf_callchain_user_64(struct perf_callchain_entry *entry,
- struct pt_regs *regs)
-{
- unsigned long sp, next_sp;
- unsigned long next_ip;
- unsigned long lr;
- long level = 0;
- struct signal_frame_64 __user *sigframe;
- unsigned long __user *fp, *uregs;
-
- next_ip = regs->nip;
- lr = regs->link;
- sp = regs->gpr[1];
- perf_callchain_store(entry, next_ip);
-
- for (;;) {
- fp = (unsigned long __user *) sp;
- if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
- return;
- if (level > 0 && read_user_stack_64(&fp[2], &next_ip))
- return;
-
- /*
- * Note: the next_sp - sp >= signal frame size check
- * is true when next_sp < sp, which can happen when
- * transitioning from an alternate signal stack to the
- * normal stack.
- */
- if (next_sp - sp >= sizeof(struct signal_frame_64) &&
- (is_sigreturn_64_address(next_ip, sp) ||
- (level <= 1 && is_sigreturn_64_address(lr, sp))) &&
- sane_signal_64_frame(sp)) {
- /*
- * This looks like an signal frame
- */
- sigframe = (struct signal_frame_64 __user *) sp;
- uregs = sigframe->uc.uc_mcontext.gp_regs;
- if (read_user_stack_64(&uregs[PT_NIP], &next_ip) ||
- read_user_stack_64(&uregs[PT_LNK], &lr) ||
- read_user_stack_64(&uregs[PT_R1], &sp))
- return;
- level = 0;
- perf_callchain_store(entry, PERF_CONTEXT_USER);
- perf_callchain_store(entry, next_ip);
- continue;
- }
-
- if (level == 0)
- next_ip = lr;
- perf_callchain_store(entry, next_ip);
- ++level;
- sp = next_sp;
- }
-}
-
-static inline int current_is_64bit(void)
-{
- /*
- * We can't use test_thread_flag() here because we may be on an
- * interrupt stack, and the thread flags don't get copied over
- * from the thread_info on the main stack to the interrupt stack.
- */
- return !test_ti_thread_flag(task_thread_info(current), TIF_32BIT);
-}
-
-#else /* CONFIG_PPC64 */
-/*
- * On 32-bit we just access the address and let hash_page create a
- * HPTE if necessary, so there is no need to fall back to reading
- * the page tables. Since this is called at interrupt level,
- * do_page_fault() won't treat a DSI as a page fault.
- */
-static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
-{
- if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
- ((unsigned long)ptr & 3))
- return -EFAULT;
-
- return __get_user_inatomic(*ret, ptr);
-}
-
-static inline void perf_callchain_user_64(struct perf_callchain_entry *entry,
- struct pt_regs *regs)
-{
-}
-
-static inline int current_is_64bit(void)
-{
- return 0;
-}
-
-static inline int valid_user_sp(unsigned long sp, int is_64)
-{
- if (!sp || (sp & 7) || sp > TASK_SIZE - 32)
- return 0;
- return 1;
-}
-
-#define __SIGNAL_FRAMESIZE32 __SIGNAL_FRAMESIZE
-#define sigcontext32 sigcontext
-#define mcontext32 mcontext
-#define ucontext32 ucontext
-#define compat_siginfo_t struct siginfo
-
-#endif /* CONFIG_PPC64 */
-
-/*
- * Layout for non-RT signal frames
- */
-struct signal_frame_32 {
- char dummy[__SIGNAL_FRAMESIZE32];
- struct sigcontext32 sctx;
- struct mcontext32 mctx;
- int abigap[56];
-};
-
-/*
- * Layout for RT signal frames
- */
-struct rt_signal_frame_32 {
- char dummy[__SIGNAL_FRAMESIZE32 + 16];
- compat_siginfo_t info;
- struct ucontext32 uc;
- int abigap[56];
-};
-
-static int is_sigreturn_32_address(unsigned int nip, unsigned int fp)
-{
- if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad))
- return 1;
- if (vdso32_sigtramp && current->mm->context.vdso_base &&
- nip == current->mm->context.vdso_base + vdso32_sigtramp)
- return 1;
- return 0;
-}
-
-static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp)
-{
- if (nip == fp + offsetof(struct rt_signal_frame_32,
- uc.uc_mcontext.mc_pad))
- return 1;
- if (vdso32_rt_sigtramp && current->mm->context.vdso_base &&
- nip == current->mm->context.vdso_base + vdso32_rt_sigtramp)
- return 1;
- return 0;
-}
-
-static int sane_signal_32_frame(unsigned int sp)
-{
- struct signal_frame_32 __user *sf;
- unsigned int regs;
-
- sf = (struct signal_frame_32 __user *) (unsigned long) sp;
- if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, &regs))
- return 0;
- return regs == (unsigned long) &sf->mctx;
-}
-
-static int sane_rt_signal_32_frame(unsigned int sp)
-{
- struct rt_signal_frame_32 __user *sf;
- unsigned int regs;
-
- sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
- if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, &regs))
- return 0;
- return regs == (unsigned long) &sf->uc.uc_mcontext;
-}
-
-static unsigned int __user *signal_frame_32_regs(unsigned int sp,
- unsigned int next_sp, unsigned int next_ip)
-{
- struct mcontext32 __user *mctx = NULL;
- struct signal_frame_32 __user *sf;
- struct rt_signal_frame_32 __user *rt_sf;
-
- /*
- * Note: the next_sp - sp >= signal frame size check
- * is true when next_sp < sp, for example, when
- * transitioning from an alternate signal stack to the
- * normal stack.
- */
- if (next_sp - sp >= sizeof(struct signal_frame_32) &&
- is_sigreturn_32_address(next_ip, sp) &&
- sane_signal_32_frame(sp)) {
- sf = (struct signal_frame_32 __user *) (unsigned long) sp;
- mctx = &sf->mctx;
- }
-
- if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) &&
- is_rt_sigreturn_32_address(next_ip, sp) &&
- sane_rt_signal_32_frame(sp)) {
- rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
- mctx = &rt_sf->uc.uc_mcontext;
- }
-
- if (!mctx)
- return NULL;
- return mctx->mc_gregs;
-}
-
-static void perf_callchain_user_32(struct perf_callchain_entry *entry,
- struct pt_regs *regs)
-{
- unsigned int sp, next_sp;
- unsigned int next_ip;
- unsigned int lr;
- long level = 0;
- unsigned int __user *fp, *uregs;
-
- next_ip = regs->nip;
- lr = regs->link;
- sp = regs->gpr[1];
- perf_callchain_store(entry, next_ip);
-
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
- fp = (unsigned int __user *) (unsigned long) sp;
- if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
- return;
- if (level > 0 && read_user_stack_32(&fp[1], &next_ip))
- return;
-
- uregs = signal_frame_32_regs(sp, next_sp, next_ip);
- if (!uregs && level <= 1)
- uregs = signal_frame_32_regs(sp, next_sp, lr);
- if (uregs) {
- /*
- * This looks like an signal frame, so restart
- * the stack trace with the values in it.
- */
- if (read_user_stack_32(&uregs[PT_NIP], &next_ip) ||
- read_user_stack_32(&uregs[PT_LNK], &lr) ||
- read_user_stack_32(&uregs[PT_R1], &sp))
- return;
- level = 0;
- perf_callchain_store(entry, PERF_CONTEXT_USER);
- perf_callchain_store(entry, next_ip);
- continue;
- }
-
- if (level == 0)
- next_ip = lr;
- perf_callchain_store(entry, next_ip);
- ++level;
- sp = next_sp;
- }
-}
-
-void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
-{
- if (current_is_64bit())
- perf_callchain_user_64(entry, regs);
- else
- perf_callchain_user_32(entry, regs);
-}
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
deleted file mode 100644
index 3129c855933..00000000000
--- a/arch/powerpc/kernel/perf_event.c
+++ /dev/null
@@ -1,1386 +0,0 @@
-/*
- * Performance event support - powerpc architecture code
- *
- * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/perf_event.h>
-#include <linux/percpu.h>
-#include <linux/hardirq.h>
-#include <asm/reg.h>
-#include <asm/pmc.h>
-#include <asm/machdep.h>
-#include <asm/firmware.h>
-#include <asm/ptrace.h>
-
-struct cpu_hw_events {
- int n_events;
- int n_percpu;
- int disabled;
- int n_added;
- int n_limited;
- u8 pmcs_enabled;
- struct perf_event *event[MAX_HWEVENTS];
- u64 events[MAX_HWEVENTS];
- unsigned int flags[MAX_HWEVENTS];
- unsigned long mmcr[3];
- struct perf_event *limited_counter[MAX_LIMITED_HWCOUNTERS];
- u8 limited_hwidx[MAX_LIMITED_HWCOUNTERS];
- u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
- unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
- unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
-
- unsigned int group_flag;
- int n_txn_start;
-};
-DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
-
-struct power_pmu *ppmu;
-
-/*
- * Normally, to ignore kernel events we set the FCS (freeze counters
- * in supervisor mode) bit in MMCR0, but if the kernel runs with the
- * hypervisor bit set in the MSR, or if we are running on a processor
- * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
- * then we need to use the FCHV bit to ignore kernel events.
- */
-static unsigned int freeze_events_kernel = MMCR0_FCS;
-
-/*
- * 32-bit doesn't have MMCRA but does have an MMCR2,
- * and a few other names are different.
- */
-#ifdef CONFIG_PPC32
-
-#define MMCR0_FCHV 0
-#define MMCR0_PMCjCE MMCR0_PMCnCE
-
-#define SPRN_MMCRA SPRN_MMCR2
-#define MMCRA_SAMPLE_ENABLE 0
-
-static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
-{
- return 0;
-}
-static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
-static inline u32 perf_get_misc_flags(struct pt_regs *regs)
-{
- return 0;
-}
-static inline void perf_read_regs(struct pt_regs *regs) { }
-static inline int perf_intr_is_nmi(struct pt_regs *regs)
-{
- return 0;
-}
-
-#endif /* CONFIG_PPC32 */
-
-/*
- * Things that are specific to 64-bit implementations.
- */
-#ifdef CONFIG_PPC64
-
-static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
-{
- unsigned long mmcra = regs->dsisr;
-
- if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
- unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
- if (slot > 1)
- return 4 * (slot - 1);
- }
- return 0;
-}
-
-/*
- * The user wants a data address recorded.
- * If we're not doing instruction sampling, give them the SDAR
- * (sampled data address). If we are doing instruction sampling, then
- * only give them the SDAR if it corresponds to the instruction
- * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC
- * bit in MMCRA.
- */
-static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
-{
- unsigned long mmcra = regs->dsisr;
- unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
- POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
-
- if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
- *addrp = mfspr(SPRN_SDAR);
-}
-
-static inline u32 perf_get_misc_flags(struct pt_regs *regs)
-{
- unsigned long mmcra = regs->dsisr;
- unsigned long sihv = MMCRA_SIHV;
- unsigned long sipr = MMCRA_SIPR;
-
- if (TRAP(regs) != 0xf00)
- return 0; /* not a PMU interrupt */
-
- if (ppmu->flags & PPMU_ALT_SIPR) {
- sihv = POWER6_MMCRA_SIHV;
- sipr = POWER6_MMCRA_SIPR;
- }
-
- /* PR has priority over HV, so order below is important */
- if (mmcra & sipr)
- return PERF_RECORD_MISC_USER;
- if ((mmcra & sihv) && (freeze_events_kernel != MMCR0_FCHV))
- return PERF_RECORD_MISC_HYPERVISOR;
- return PERF_RECORD_MISC_KERNEL;
-}
-
-/*
- * Overload regs->dsisr to store MMCRA so we only need to read it once
- * on each interrupt.
- */
-static inline void perf_read_regs(struct pt_regs *regs)
-{
- regs->dsisr = mfspr(SPRN_MMCRA);
-}
-
-/*
- * If interrupts were soft-disabled when a PMU interrupt occurs, treat
- * it as an NMI.
- */
-static inline int perf_intr_is_nmi(struct pt_regs *regs)
-{
- return !regs->softe;
-}
-
-#endif /* CONFIG_PPC64 */
-
-static void perf_event_interrupt(struct pt_regs *regs);
-
-void perf_event_print_debug(void)
-{
-}
-
-/*
- * Read one performance monitor counter (PMC).
- */
-static unsigned long read_pmc(int idx)
-{
- unsigned long val;
-
- switch (idx) {
- case 1:
- val = mfspr(SPRN_PMC1);
- break;
- case 2:
- val = mfspr(SPRN_PMC2);
- break;
- case 3:
- val = mfspr(SPRN_PMC3);
- break;
- case 4:
- val = mfspr(SPRN_PMC4);
- break;
- case 5:
- val = mfspr(SPRN_PMC5);
- break;
- case 6:
- val = mfspr(SPRN_PMC6);
- break;
-#ifdef CONFIG_PPC64
- case 7:
- val = mfspr(SPRN_PMC7);
- break;
- case 8:
- val = mfspr(SPRN_PMC8);
- break;
-#endif /* CONFIG_PPC64 */
- default:
- printk(KERN_ERR "oops trying to read PMC%d\n", idx);
- val = 0;
- }
- return val;
-}
-
-/*
- * Write one PMC.
- */
-static void write_pmc(int idx, unsigned long val)
-{
- switch (idx) {
- case 1:
- mtspr(SPRN_PMC1, val);
- break;
- case 2:
- mtspr(SPRN_PMC2, val);
- break;
- case 3:
- mtspr(SPRN_PMC3, val);
- break;
- case 4:
- mtspr(SPRN_PMC4, val);
- break;
- case 5:
- mtspr(SPRN_PMC5, val);
- break;
- case 6:
- mtspr(SPRN_PMC6, val);
- break;
-#ifdef CONFIG_PPC64
- case 7:
- mtspr(SPRN_PMC7, val);
- break;
- case 8:
- mtspr(SPRN_PMC8, val);
- break;
-#endif /* CONFIG_PPC64 */
- default:
- printk(KERN_ERR "oops trying to write PMC%d\n", idx);
- }
-}
-
-/*
- * Check if a set of events can all go on the PMU at once.
- * If they can't, this will look at alternative codes for the events
- * and see if any combination of alternative codes is feasible.
- * The feasible set is returned in event_id[].
- */
-static int power_check_constraints(struct cpu_hw_events *cpuhw,
- u64 event_id[], unsigned int cflags[],
- int n_ev)
-{
- unsigned long mask, value, nv;
- unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS];
- int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS];
- int i, j;
- unsigned long addf = ppmu->add_fields;
- unsigned long tadd = ppmu->test_adder;
-
- if (n_ev > ppmu->n_counter)
- return -1;
-
- /* First see if the events will go on as-is */
- for (i = 0; i < n_ev; ++i) {
- if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
- && !ppmu->limited_pmc_event(event_id[i])) {
- ppmu->get_alternatives(event_id[i], cflags[i],
- cpuhw->alternatives[i]);
- event_id[i] = cpuhw->alternatives[i][0];
- }
- if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0],
- &cpuhw->avalues[i][0]))
- return -1;
- }
- value = mask = 0;
- for (i = 0; i < n_ev; ++i) {
- nv = (value | cpuhw->avalues[i][0]) +
- (value & cpuhw->avalues[i][0] & addf);
- if ((((nv + tadd) ^ value) & mask) != 0 ||
- (((nv + tadd) ^ cpuhw->avalues[i][0]) &
- cpuhw->amasks[i][0]) != 0)
- break;
- value = nv;
- mask |= cpuhw->amasks[i][0];
- }
- if (i == n_ev)
- return 0; /* all OK */
-
- /* doesn't work, gather alternatives... */
- if (!ppmu->get_alternatives)
- return -1;
- for (i = 0; i < n_ev; ++i) {
- choice[i] = 0;
- n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i],
- cpuhw->alternatives[i]);
- for (j = 1; j < n_alt[i]; ++j)
- ppmu->get_constraint(cpuhw->alternatives[i][j],
- &cpuhw->amasks[i][j],
- &cpuhw->avalues[i][j]);
- }
-
- /* enumerate all possibilities and see if any will work */
- i = 0;
- j = -1;
- value = mask = nv = 0;
- while (i < n_ev) {
- if (j >= 0) {
- /* we're backtracking, restore context */
- value = svalues[i];
- mask = smasks[i];
- j = choice[i];
- }
- /*
- * See if any alternative k for event_id i,
- * where k > j, will satisfy the constraints.
- */
- while (++j < n_alt[i]) {
- nv = (value | cpuhw->avalues[i][j]) +
- (value & cpuhw->avalues[i][j] & addf);
- if ((((nv + tadd) ^ value) & mask) == 0 &&
- (((nv + tadd) ^ cpuhw->avalues[i][j])
- & cpuhw->amasks[i][j]) == 0)
- break;
- }
- if (j >= n_alt[i]) {
- /*
- * No feasible alternative, backtrack
- * to event_id i-1 and continue enumerating its
- * alternatives from where we got up to.
- */
- if (--i < 0)
- return -1;
- } else {
- /*
- * Found a feasible alternative for event_id i,
- * remember where we got up to with this event_id,
- * go on to the next event_id, and start with
- * the first alternative for it.
- */
- choice[i] = j;
- svalues[i] = value;
- smasks[i] = mask;
- value = nv;
- mask |= cpuhw->amasks[i][j];
- ++i;
- j = -1;
- }
- }
-
- /* OK, we have a feasible combination, tell the caller the solution */
- for (i = 0; i < n_ev; ++i)
- event_id[i] = cpuhw->alternatives[i][choice[i]];
- return 0;
-}
-
-/*
- * Check if newly-added events have consistent settings for
- * exclude_{user,kernel,hv} with each other and any previously
- * added events.
- */
-static int check_excludes(struct perf_event **ctrs, unsigned int cflags[],
- int n_prev, int n_new)
-{
- int eu = 0, ek = 0, eh = 0;
- int i, n, first;
- struct perf_event *event;
-
- n = n_prev + n_new;
- if (n <= 1)
- return 0;
-
- first = 1;
- for (i = 0; i < n; ++i) {
- if (cflags[i] & PPMU_LIMITED_PMC_OK) {
- cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
- continue;
- }
- event = ctrs[i];
- if (first) {
- eu = event->attr.exclude_user;
- ek = event->attr.exclude_kernel;
- eh = event->attr.exclude_hv;
- first = 0;
- } else if (event->attr.exclude_user != eu ||
- event->attr.exclude_kernel != ek ||
- event->attr.exclude_hv != eh) {
- return -EAGAIN;
- }
- }
-
- if (eu || ek || eh)
- for (i = 0; i < n; ++i)
- if (cflags[i] & PPMU_LIMITED_PMC_OK)
- cflags[i] |= PPMU_LIMITED_PMC_REQD;
-
- return 0;
-}
-
-static void power_pmu_read(struct perf_event *event)
-{
- s64 val, delta, prev;
-
- if (event->hw.state & PERF_HES_STOPPED)
- return;
-
- if (!event->hw.idx)
- return;
- /*
- * Performance monitor interrupts come even when interrupts
- * are soft-disabled, as long as interrupts are hard-enabled.
- * Therefore we treat them like NMIs.
- */
- do {
- prev = local64_read(&event->hw.prev_count);
- barrier();
- val = read_pmc(event->hw.idx);
- } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
-
- /* The counters are only 32 bits wide */
- delta = (val - prev) & 0xfffffffful;
- local64_add(delta, &event->count);
- local64_sub(delta, &event->hw.period_left);
-}
-
-/*
- * On some machines, PMC5 and PMC6 can't be written, don't respect
- * the freeze conditions, and don't generate interrupts. This tells
- * us if `event' is using such a PMC.
- */
-static int is_limited_pmc(int pmcnum)
-{
- return (ppmu->flags & PPMU_LIMITED_PMC5_6)
- && (pmcnum == 5 || pmcnum == 6);
-}
-
-static void freeze_limited_counters(struct cpu_hw_events *cpuhw,
- unsigned long pmc5, unsigned long pmc6)
-{
- struct perf_event *event;
- u64 val, prev, delta;
- int i;
-
- for (i = 0; i < cpuhw->n_limited; ++i) {
- event = cpuhw->limited_counter[i];
- if (!event->hw.idx)
- continue;
- val = (event->hw.idx == 5) ? pmc5 : pmc6;
- prev = local64_read(&event->hw.prev_count);
- event->hw.idx = 0;
- delta = (val - prev) & 0xfffffffful;
- local64_add(delta, &event->count);
- }
-}
-
-static void thaw_limited_counters(struct cpu_hw_events *cpuhw,
- unsigned long pmc5, unsigned long pmc6)
-{
- struct perf_event *event;
- u64 val;
- int i;
-
- for (i = 0; i < cpuhw->n_limited; ++i) {
- event = cpuhw->limited_counter[i];
- event->hw.idx = cpuhw->limited_hwidx[i];
- val = (event->hw.idx == 5) ? pmc5 : pmc6;
- local64_set(&event->hw.prev_count, val);
- perf_event_update_userpage(event);
- }
-}
-
-/*
- * Since limited events don't respect the freeze conditions, we
- * have to read them immediately after freezing or unfreezing the
- * other events. We try to keep the values from the limited
- * events as consistent as possible by keeping the delay (in
- * cycles and instructions) between freezing/unfreezing and reading
- * the limited events as small and consistent as possible.
- * Therefore, if any limited events are in use, we read them
- * both, and always in the same order, to minimize variability,
- * and do it inside the same asm that writes MMCR0.
- */
-static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
-{
- unsigned long pmc5, pmc6;
-
- if (!cpuhw->n_limited) {
- mtspr(SPRN_MMCR0, mmcr0);
- return;
- }
-
- /*
- * Write MMCR0, then read PMC5 and PMC6 immediately.
- * To ensure we don't get a performance monitor interrupt
- * between writing MMCR0 and freezing/thawing the limited
- * events, we first write MMCR0 with the event overflow
- * interrupt enable bits turned off.
- */
- asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
- : "=&r" (pmc5), "=&r" (pmc6)
- : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
- "i" (SPRN_MMCR0),
- "i" (SPRN_PMC5), "i" (SPRN_PMC6));
-
- if (mmcr0 & MMCR0_FC)
- freeze_limited_counters(cpuhw, pmc5, pmc6);
- else
- thaw_limited_counters(cpuhw, pmc5, pmc6);
-
- /*
- * Write the full MMCR0 including the event overflow interrupt
- * enable bits, if necessary.
- */
- if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
- mtspr(SPRN_MMCR0, mmcr0);
-}
-
-/*
- * Disable all events to prevent PMU interrupts and to allow
- * events to be added or removed.
- */
-static void power_pmu_disable(struct pmu *pmu)
-{
- struct cpu_hw_events *cpuhw;
- unsigned long flags;
-
- if (!ppmu)
- return;
- local_irq_save(flags);
- cpuhw = &__get_cpu_var(cpu_hw_events);
-
- if (!cpuhw->disabled) {
- cpuhw->disabled = 1;
- cpuhw->n_added = 0;
-
- /*
- * Check if we ever enabled the PMU on this cpu.
- */
- if (!cpuhw->pmcs_enabled) {
- ppc_enable_pmcs();
- cpuhw->pmcs_enabled = 1;
- }
-
- /*
- * Disable instruction sampling if it was enabled
- */
- if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
- mtspr(SPRN_MMCRA,
- cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
- mb();
- }
-
- /*
- * Set the 'freeze counters' bit.
- * The barrier is to make sure the mtspr has been
- * executed and the PMU has frozen the events
- * before we return.
- */
- write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
- mb();
- }
- local_irq_restore(flags);
-}
-
-/*
- * Re-enable all events if disable == 0.
- * If we were previously disabled and events were added, then
- * put the new config on the PMU.
- */
-static void power_pmu_enable(struct pmu *pmu)
-{
- struct perf_event *event;
- struct cpu_hw_events *cpuhw;
- unsigned long flags;
- long i;
- unsigned long val;
- s64 left;
- unsigned int hwc_index[MAX_HWEVENTS];
- int n_lim;
- int idx;
-
- if (!ppmu)
- return;
- local_irq_save(flags);
- cpuhw = &__get_cpu_var(cpu_hw_events);
- if (!cpuhw->disabled) {
- local_irq_restore(flags);
- return;
- }
- cpuhw->disabled = 0;
-
- /*
- * If we didn't change anything, or only removed events,
- * no need to recalculate MMCR* settings and reset the PMCs.
- * Just reenable the PMU with the current MMCR* settings
- * (possibly updated for removal of events).
- */
- if (!cpuhw->n_added) {
- mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
- mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
- if (cpuhw->n_events == 0)
- ppc_set_pmu_inuse(0);
- goto out_enable;
- }
-
- /*
- * Compute MMCR* values for the new set of events
- */
- if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index,
- cpuhw->mmcr)) {
- /* shouldn't ever get here */
- printk(KERN_ERR "oops compute_mmcr failed\n");
- goto out;
- }
-
- /*
- * Add in MMCR0 freeze bits corresponding to the
- * attr.exclude_* bits for the first event.
- * We have already checked that all events have the
- * same values for these bits as the first event.
- */
- event = cpuhw->event[0];
- if (event->attr.exclude_user)
- cpuhw->mmcr[0] |= MMCR0_FCP;
- if (event->attr.exclude_kernel)
- cpuhw->mmcr[0] |= freeze_events_kernel;
- if (event->attr.exclude_hv)
- cpuhw->mmcr[0] |= MMCR0_FCHV;
-
- /*
- * Write the new configuration to MMCR* with the freeze
- * bit set and set the hardware events to their initial values.
- * Then unfreeze the events.
- */
- ppc_set_pmu_inuse(1);
- mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
- mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
- mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
- | MMCR0_FC);
-
- /*
- * Read off any pre-existing events that need to move
- * to another PMC.
- */
- for (i = 0; i < cpuhw->n_events; ++i) {
- event = cpuhw->event[i];
- if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) {
- power_pmu_read(event);
- write_pmc(event->hw.idx, 0);
- event->hw.idx = 0;
- }
- }
-
- /*
- * Initialize the PMCs for all the new and moved events.
- */
- cpuhw->n_limited = n_lim = 0;
- for (i = 0; i < cpuhw->n_events; ++i) {
- event = cpuhw->event[i];
- if (event->hw.idx)
- continue;
- idx = hwc_index[i] + 1;
- if (is_limited_pmc(idx)) {
- cpuhw->limited_counter[n_lim] = event;
- cpuhw->limited_hwidx[n_lim] = idx;
- ++n_lim;
- continue;
- }
- val = 0;
- if (event->hw.sample_period) {
- left = local64_read(&event->hw.period_left);
- if (left < 0x80000000L)
- val = 0x80000000L - left;
- }
- local64_set(&event->hw.prev_count, val);
- event->hw.idx = idx;
- if (event->hw.state & PERF_HES_STOPPED)
- val = 0;
- write_pmc(idx, val);
- perf_event_update_userpage(event);
- }
- cpuhw->n_limited = n_lim;
- cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
-
- out_enable:
- mb();
- write_mmcr0(cpuhw, cpuhw->mmcr[0]);
-
- /*
- * Enable instruction sampling if necessary
- */
- if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
- mb();
- mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
- }
-
- out:
- local_irq_restore(flags);
-}
-
-static int collect_events(struct perf_event *group, int max_count,
- struct perf_event *ctrs[], u64 *events,
- unsigned int *flags)
-{
- int n = 0;
- struct perf_event *event;
-
- if (!is_software_event(group)) {
- if (n >= max_count)
- return -1;
- ctrs[n] = group;
- flags[n] = group->hw.event_base;
- events[n++] = group->hw.config;
- }
- list_for_each_entry(event, &group->sibling_list, group_entry) {
- if (!is_software_event(event) &&
- event->state != PERF_EVENT_STATE_OFF) {
- if (n >= max_count)
- return -1;
- ctrs[n] = event;
- flags[n] = event->hw.event_base;
- events[n++] = event->hw.config;
- }
- }
- return n;
-}
-
-/*
- * Add a event to the PMU.
- * If all events are not already frozen, then we disable and
- * re-enable the PMU in order to get hw_perf_enable to do the
- * actual work of reconfiguring the PMU.
- */
-static int power_pmu_add(struct perf_event *event, int ef_flags)
-{
- struct cpu_hw_events *cpuhw;
- unsigned long flags;
- int n0;
- int ret = -EAGAIN;
-
- local_irq_save(flags);
- perf_pmu_disable(event->pmu);
-
- /*
- * Add the event to the list (if there is room)
- * and check whether the total set is still feasible.
- */
- cpuhw = &__get_cpu_var(cpu_hw_events);
- n0 = cpuhw->n_events;
- if (n0 >= ppmu->n_counter)
- goto out;
- cpuhw->event[n0] = event;
- cpuhw->events[n0] = event->hw.config;
- cpuhw->flags[n0] = event->hw.event_base;
-
- if (!(ef_flags & PERF_EF_START))
- event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
-
- /*
- * If group events scheduling transaction was started,
- * skip the schedulability test here, it will be peformed
- * at commit time(->commit_txn) as a whole
- */
- if (cpuhw->group_flag & PERF_EVENT_TXN)
- goto nocheck;
-
- if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
- goto out;
- if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1))
- goto out;
- event->hw.config = cpuhw->events[n0];
-
-nocheck:
- ++cpuhw->n_events;
- ++cpuhw->n_added;
-
- ret = 0;
- out:
- perf_pmu_enable(event->pmu);
- local_irq_restore(flags);
- return ret;
-}
-
-/*
- * Remove a event from the PMU.
- */
-static void power_pmu_del(struct perf_event *event, int ef_flags)
-{
- struct cpu_hw_events *cpuhw;
- long i;
- unsigned long flags;
-
- local_irq_save(flags);
- perf_pmu_disable(event->pmu);
-
- power_pmu_read(event);
-
- cpuhw = &__get_cpu_var(cpu_hw_events);
- for (i = 0; i < cpuhw->n_events; ++i) {
- if (event == cpuhw->event[i]) {
- while (++i < cpuhw->n_events) {
- cpuhw->event[i-1] = cpuhw->event[i];
- cpuhw->events[i-1] = cpuhw->events[i];
- cpuhw->flags[i-1] = cpuhw->flags[i];
- }
- --cpuhw->n_events;
- ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr);
- if (event->hw.idx) {
- write_pmc(event->hw.idx, 0);
- event->hw.idx = 0;
- }
- perf_event_update_userpage(event);
- break;
- }
- }
- for (i = 0; i < cpuhw->n_limited; ++i)
- if (event == cpuhw->limited_counter[i])
- break;
- if (i < cpuhw->n_limited) {
- while (++i < cpuhw->n_limited) {
- cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
- cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
- }
- --cpuhw->n_limited;
- }
- if (cpuhw->n_events == 0) {
- /* disable exceptions if no events are running */
- cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
- }
-
- perf_pmu_enable(event->pmu);
- local_irq_restore(flags);
-}
-
-/*
- * POWER-PMU does not support disabling individual counters, hence
- * program their cycle counter to their max value and ignore the interrupts.
- */
-
-static void power_pmu_start(struct perf_event *event, int ef_flags)
-{
- unsigned long flags;
- s64 left;
-
- if (!event->hw.idx || !event->hw.sample_period)
- return;
-
- if (!(event->hw.state & PERF_HES_STOPPED))
- return;
-
- if (ef_flags & PERF_EF_RELOAD)
- WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
-
- local_irq_save(flags);
- perf_pmu_disable(event->pmu);
-
- event->hw.state = 0;
- left = local64_read(&event->hw.period_left);
- write_pmc(event->hw.idx, left);
-
- perf_event_update_userpage(event);
- perf_pmu_enable(event->pmu);
- local_irq_restore(flags);
-}
-
-static void power_pmu_stop(struct perf_event *event, int ef_flags)
-{
- unsigned long flags;
-
- if (!event->hw.idx || !event->hw.sample_period)
- return;
-
- if (event->hw.state & PERF_HES_STOPPED)
- return;
-
- local_irq_save(flags);
- perf_pmu_disable(event->pmu);
-
- power_pmu_read(event);
- event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
- write_pmc(event->hw.idx, 0);
-
- perf_event_update_userpage(event);
- perf_pmu_enable(event->pmu);
- local_irq_restore(flags);
-}
-
-/*
- * Start group events scheduling transaction
- * Set the flag to make pmu::enable() not perform the
- * schedulability test, it will be performed at commit time
- */
-void power_pmu_start_txn(struct pmu *pmu)
-{
- struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
-
- perf_pmu_disable(pmu);
- cpuhw->group_flag |= PERF_EVENT_TXN;
- cpuhw->n_txn_start = cpuhw->n_events;
-}
-
-/*
- * Stop group events scheduling transaction
- * Clear the flag and pmu::enable() will perform the
- * schedulability test.
- */
-void power_pmu_cancel_txn(struct pmu *pmu)
-{
- struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
-
- cpuhw->group_flag &= ~PERF_EVENT_TXN;
- perf_pmu_enable(pmu);
-}
-
-/*
- * Commit group events scheduling transaction
- * Perform the group schedulability test as a whole
- * Return 0 if success
- */
-int power_pmu_commit_txn(struct pmu *pmu)
-{
- struct cpu_hw_events *cpuhw;
- long i, n;
-
- if (!ppmu)
- return -EAGAIN;
- cpuhw = &__get_cpu_var(cpu_hw_events);
- n = cpuhw->n_events;
- if (check_excludes(cpuhw->event, cpuhw->flags, 0, n))
- return -EAGAIN;
- i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n);
- if (i < 0)
- return -EAGAIN;
-
- for (i = cpuhw->n_txn_start; i < n; ++i)
- cpuhw->event[i]->hw.config = cpuhw->events[i];
-
- cpuhw->group_flag &= ~PERF_EVENT_TXN;
- perf_pmu_enable(pmu);
- return 0;
-}
-
-/*
- * Return 1 if we might be able to put event on a limited PMC,
- * or 0 if not.
- * A event can only go on a limited PMC if it counts something
- * that a limited PMC can count, doesn't require interrupts, and
- * doesn't exclude any processor mode.
- */
-static int can_go_on_limited_pmc(struct perf_event *event, u64 ev,
- unsigned int flags)
-{
- int n;
- u64 alt[MAX_EVENT_ALTERNATIVES];
-
- if (event->attr.exclude_user
- || event->attr.exclude_kernel
- || event->attr.exclude_hv
- || event->attr.sample_period)
- return 0;
-
- if (ppmu->limited_pmc_event(ev))
- return 1;
-
- /*
- * The requested event_id isn't on a limited PMC already;
- * see if any alternative code goes on a limited PMC.
- */
- if (!ppmu->get_alternatives)
- return 0;
-
- flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
- n = ppmu->get_alternatives(ev, flags, alt);
-
- return n > 0;
-}
-
-/*
- * Find an alternative event_id that goes on a normal PMC, if possible,
- * and return the event_id code, or 0 if there is no such alternative.
- * (Note: event_id code 0 is "don't count" on all machines.)
- */
-static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
-{
- u64 alt[MAX_EVENT_ALTERNATIVES];
- int n;
-
- flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
- n = ppmu->get_alternatives(ev, flags, alt);
- if (!n)
- return 0;
- return alt[0];
-}
-
-/* Number of perf_events counting hardware events */
-static atomic_t num_events;
-/* Used to avoid races in calling reserve/release_pmc_hardware */
-static DEFINE_MUTEX(pmc_reserve_mutex);
-
-/*
- * Release the PMU if this is the last perf_event.
- */
-static void hw_perf_event_destroy(struct perf_event *event)
-{
- if (!atomic_add_unless(&num_events, -1, 1)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_dec_return(&num_events) == 0)
- release_pmc_hardware();
- mutex_unlock(&pmc_reserve_mutex);
- }
-}
-
-/*
- * Translate a generic cache event_id config to a raw event_id code.
- */
-static int hw_perf_cache_event(u64 config, u64 *eventp)
-{
- unsigned long type, op, result;
- int ev;
-
- if (!ppmu->cache_events)
- return -EINVAL;
-
- /* unpack config */
- type = config & 0xff;
- op = (config >> 8) & 0xff;
- result = (config >> 16) & 0xff;
-
- if (type >= PERF_COUNT_HW_CACHE_MAX ||
- op >= PERF_COUNT_HW_CACHE_OP_MAX ||
- result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
- return -EINVAL;
-
- ev = (*ppmu->cache_events)[type][op][result];
- if (ev == 0)
- return -EOPNOTSUPP;
- if (ev == -1)
- return -EINVAL;
- *eventp = ev;
- return 0;
-}
-
-static int power_pmu_event_init(struct perf_event *event)
-{
- u64 ev;
- unsigned long flags;
- struct perf_event *ctrs[MAX_HWEVENTS];
- u64 events[MAX_HWEVENTS];
- unsigned int cflags[MAX_HWEVENTS];
- int n;
- int err;
- struct cpu_hw_events *cpuhw;
-
- if (!ppmu)
- return -ENOENT;
-
- switch (event->attr.type) {
- case PERF_TYPE_HARDWARE:
- ev = event->attr.config;
- if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
- return -EOPNOTSUPP;
- ev = ppmu->generic_events[ev];
- break;
- case PERF_TYPE_HW_CACHE:
- err = hw_perf_cache_event(event->attr.config, &ev);
- if (err)
- return err;
- break;
- case PERF_TYPE_RAW:
- ev = event->attr.config;
- break;
- default:
- return -ENOENT;
- }
-
- event->hw.config_base = ev;
- event->hw.idx = 0;
-
- /*
- * If we are not running on a hypervisor, force the
- * exclude_hv bit to 0 so that we don't care what
- * the user set it to.
- */
- if (!firmware_has_feature(FW_FEATURE_LPAR))
- event->attr.exclude_hv = 0;
-
- /*
- * If this is a per-task event, then we can use
- * PM_RUN_* events interchangeably with their non RUN_*
- * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
- * XXX we should check if the task is an idle task.
- */
- flags = 0;
- if (event->attach_state & PERF_ATTACH_TASK)
- flags |= PPMU_ONLY_COUNT_RUN;
-
- /*
- * If this machine has limited events, check whether this
- * event_id could go on a limited event.
- */
- if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
- if (can_go_on_limited_pmc(event, ev, flags)) {
- flags |= PPMU_LIMITED_PMC_OK;
- } else if (ppmu->limited_pmc_event(ev)) {
- /*
- * The requested event_id is on a limited PMC,
- * but we can't use a limited PMC; see if any
- * alternative goes on a normal PMC.
- */
- ev = normal_pmc_alternative(ev, flags);
- if (!ev)
- return -EINVAL;
- }
- }
-
- /*
- * If this is in a group, check if it can go on with all the
- * other hardware events in the group. We assume the event
- * hasn't been linked into its leader's sibling list at this point.
- */
- n = 0;
- if (event->group_leader != event) {
- n = collect_events(event->group_leader, ppmu->n_counter - 1,
- ctrs, events, cflags);
- if (n < 0)
- return -EINVAL;
- }
- events[n] = ev;
- ctrs[n] = event;
- cflags[n] = flags;
- if (check_excludes(ctrs, cflags, n, 1))
- return -EINVAL;
-
- cpuhw = &get_cpu_var(cpu_hw_events);
- err = power_check_constraints(cpuhw, events, cflags, n + 1);
- put_cpu_var(cpu_hw_events);
- if (err)
- return -EINVAL;
-
- event->hw.config = events[n];
- event->hw.event_base = cflags[n];
- event->hw.last_period = event->hw.sample_period;
- local64_set(&event->hw.period_left, event->hw.last_period);
-
- /*
- * See if we need to reserve the PMU.
- * If no events are currently in use, then we have to take a
- * mutex to ensure that we don't race with another task doing
- * reserve_pmc_hardware or release_pmc_hardware.
- */
- err = 0;
- if (!atomic_inc_not_zero(&num_events)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&num_events) == 0 &&
- reserve_pmc_hardware(perf_event_interrupt))
- err = -EBUSY;
- else
- atomic_inc(&num_events);
- mutex_unlock(&pmc_reserve_mutex);
- }
- event->destroy = hw_perf_event_destroy;
-
- return err;
-}
-
-struct pmu power_pmu = {
- .pmu_enable = power_pmu_enable,
- .pmu_disable = power_pmu_disable,
- .event_init = power_pmu_event_init,
- .add = power_pmu_add,
- .del = power_pmu_del,
- .start = power_pmu_start,
- .stop = power_pmu_stop,
- .read = power_pmu_read,
- .start_txn = power_pmu_start_txn,
- .cancel_txn = power_pmu_cancel_txn,
- .commit_txn = power_pmu_commit_txn,
-};
-
-/*
- * A counter has overflowed; update its count and record
- * things if requested. Note that interrupts are hard-disabled
- * here so there is no possibility of being interrupted.
- */
-static void record_and_restart(struct perf_event *event, unsigned long val,
- struct pt_regs *regs, int nmi)
-{
- u64 period = event->hw.sample_period;
- s64 prev, delta, left;
- int record = 0;
-
- if (event->hw.state & PERF_HES_STOPPED) {
- write_pmc(event->hw.idx, 0);
- return;
- }
-
- /* we don't have to worry about interrupts here */
- prev = local64_read(&event->hw.prev_count);
- delta = (val - prev) & 0xfffffffful;
- local64_add(delta, &event->count);
-
- /*
- * See if the total period for this event has expired,
- * and update for the next period.
- */
- val = 0;
- left = local64_read(&event->hw.period_left) - delta;
- if (period) {
- if (left <= 0) {
- left += period;
- if (left <= 0)
- left = period;
- record = 1;
- }
- if (left < 0x80000000LL)
- val = 0x80000000LL - left;
- }
-
- write_pmc(event->hw.idx, val);
- local64_set(&event->hw.prev_count, val);
- local64_set(&event->hw.period_left, left);
- perf_event_update_userpage(event);
-
- /*
- * Finally record data if requested.
- */
- if (record) {
- struct perf_sample_data data;
-
- perf_sample_data_init(&data, ~0ULL);
- data.period = event->hw.last_period;
-
- if (event->attr.sample_type & PERF_SAMPLE_ADDR)
- perf_get_data_addr(regs, &data.addr);
-
- if (perf_event_overflow(event, nmi, &data, regs))
- power_pmu_stop(event, 0);
- }
-}
-
-/*
- * Called from generic code to get the misc flags (i.e. processor mode)
- * for an event_id.
- */
-unsigned long perf_misc_flags(struct pt_regs *regs)
-{
- u32 flags = perf_get_misc_flags(regs);
-
- if (flags)
- return flags;
- return user_mode(regs) ? PERF_RECORD_MISC_USER :
- PERF_RECORD_MISC_KERNEL;
-}
-
-/*
- * Called from generic code to get the instruction pointer
- * for an event_id.
- */
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
-{
- unsigned long ip;
-
- if (TRAP(regs) != 0xf00)
- return regs->nip; /* not a PMU interrupt */
-
- ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs);
- return ip;
-}
-
-/*
- * Performance monitor interrupt stuff
- */
-static void perf_event_interrupt(struct pt_regs *regs)
-{
- int i;
- struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
- struct perf_event *event;
- unsigned long val;
- int found = 0;
- int nmi;
-
- if (cpuhw->n_limited)
- freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
- mfspr(SPRN_PMC6));
-
- perf_read_regs(regs);
-
- nmi = perf_intr_is_nmi(regs);
- if (nmi)
- nmi_enter();
- else
- irq_enter();
-
- for (i = 0; i < cpuhw->n_events; ++i) {
- event = cpuhw->event[i];
- if (!event->hw.idx || is_limited_pmc(event->hw.idx))
- continue;
- val = read_pmc(event->hw.idx);
- if ((int)val < 0) {
- /* event has overflowed */
- found = 1;
- record_and_restart(event, val, regs, nmi);
- }
- }
-
- /*
- * In case we didn't find and reset the event that caused
- * the interrupt, scan all events and reset any that are
- * negative, to avoid getting continual interrupts.
- * Any that we processed in the previous loop will not be negative.
- */
- if (!found) {
- for (i = 0; i < ppmu->n_counter; ++i) {
- if (is_limited_pmc(i + 1))
- continue;
- val = read_pmc(i + 1);
- if ((int)val < 0)
- write_pmc(i + 1, 0);
- }
- }
-
- /*
- * Reset MMCR0 to its normal value. This will set PMXE and
- * clear FC (freeze counters) and PMAO (perf mon alert occurred)
- * and thus allow interrupts to occur again.
- * XXX might want to use MSR.PM to keep the events frozen until
- * we get back out of this interrupt.
- */
- write_mmcr0(cpuhw, cpuhw->mmcr[0]);
-
- if (nmi)
- nmi_exit();
- else
- irq_exit();
-}
-
-static void power_pmu_setup(int cpu)
-{
- struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
-
- if (!ppmu)
- return;
- memset(cpuhw, 0, sizeof(*cpuhw));
- cpuhw->mmcr[0] = MMCR0_FC;
-}
-
-static int __cpuinit
-power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- power_pmu_setup(cpu);
- break;
-
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
-int register_power_pmu(struct power_pmu *pmu)
-{
- if (ppmu)
- return -EBUSY; /* something's already registered */
-
- ppmu = pmu;
- pr_info("%s performance monitor hardware support registered\n",
- pmu->name);
-
-#ifdef MSR_HV
- /*
- * Use FCHV to ignore kernel events if MSR.HV is set.
- */
- if (mfmsr() & MSR_HV)
- freeze_events_kernel = MMCR0_FCHV;
-#endif /* CONFIG_PPC64 */
-
- perf_pmu_register(&power_pmu);
- perf_cpu_notifier(power_pmu_notifier);
-
- return 0;
-}
diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c
deleted file mode 100644
index 7ecca59ddf7..00000000000
--- a/arch/powerpc/kernel/perf_event_fsl_emb.c
+++ /dev/null
@@ -1,687 +0,0 @@
-/*
- * Performance event support - Freescale Embedded Performance Monitor
- *
- * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
- * Copyright 2010 Freescale Semiconductor, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/perf_event.h>
-#include <linux/percpu.h>
-#include <linux/hardirq.h>
-#include <asm/reg_fsl_emb.h>
-#include <asm/pmc.h>
-#include <asm/machdep.h>
-#include <asm/firmware.h>
-#include <asm/ptrace.h>
-
-struct cpu_hw_events {
- int n_events;
- int disabled;
- u8 pmcs_enabled;
- struct perf_event *event[MAX_HWEVENTS];
-};
-static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
-
-static struct fsl_emb_pmu *ppmu;
-
-/* Number of perf_events counting hardware events */
-static atomic_t num_events;
-/* Used to avoid races in calling reserve/release_pmc_hardware */
-static DEFINE_MUTEX(pmc_reserve_mutex);
-
-/*
- * If interrupts were soft-disabled when a PMU interrupt occurs, treat
- * it as an NMI.
- */
-static inline int perf_intr_is_nmi(struct pt_regs *regs)
-{
-#ifdef __powerpc64__
- return !regs->softe;
-#else
- return 0;
-#endif
-}
-
-static void perf_event_interrupt(struct pt_regs *regs);
-
-/*
- * Read one performance monitor counter (PMC).
- */
-static unsigned long read_pmc(int idx)
-{
- unsigned long val;
-
- switch (idx) {
- case 0:
- val = mfpmr(PMRN_PMC0);
- break;
- case 1:
- val = mfpmr(PMRN_PMC1);
- break;
- case 2:
- val = mfpmr(PMRN_PMC2);
- break;
- case 3:
- val = mfpmr(PMRN_PMC3);
- break;
- default:
- printk(KERN_ERR "oops trying to read PMC%d\n", idx);
- val = 0;
- }
- return val;
-}
-
-/*
- * Write one PMC.
- */
-static void write_pmc(int idx, unsigned long val)
-{
- switch (idx) {
- case 0:
- mtpmr(PMRN_PMC0, val);
- break;
- case 1:
- mtpmr(PMRN_PMC1, val);
- break;
- case 2:
- mtpmr(PMRN_PMC2, val);
- break;
- case 3:
- mtpmr(PMRN_PMC3, val);
- break;
- default:
- printk(KERN_ERR "oops trying to write PMC%d\n", idx);
- }
-
- isync();
-}
-
-/*
- * Write one local control A register
- */
-static void write_pmlca(int idx, unsigned long val)
-{
- switch (idx) {
- case 0:
- mtpmr(PMRN_PMLCA0, val);
- break;
- case 1:
- mtpmr(PMRN_PMLCA1, val);
- break;
- case 2:
- mtpmr(PMRN_PMLCA2, val);
- break;
- case 3:
- mtpmr(PMRN_PMLCA3, val);
- break;
- default:
- printk(KERN_ERR "oops trying to write PMLCA%d\n", idx);
- }
-
- isync();
-}
-
-/*
- * Write one local control B register
- */
-static void write_pmlcb(int idx, unsigned long val)
-{
- switch (idx) {
- case 0:
- mtpmr(PMRN_PMLCB0, val);
- break;
- case 1:
- mtpmr(PMRN_PMLCB1, val);
- break;
- case 2:
- mtpmr(PMRN_PMLCB2, val);
- break;
- case 3:
- mtpmr(PMRN_PMLCB3, val);
- break;
- default:
- printk(KERN_ERR "oops trying to write PMLCB%d\n", idx);
- }
-
- isync();
-}
-
-static void fsl_emb_pmu_read(struct perf_event *event)
-{
- s64 val, delta, prev;
-
- if (event->hw.state & PERF_HES_STOPPED)
- return;
-
- /*
- * Performance monitor interrupts come even when interrupts
- * are soft-disabled, as long as interrupts are hard-enabled.
- * Therefore we treat them like NMIs.
- */
- do {
- prev = local64_read(&event->hw.prev_count);
- barrier();
- val = read_pmc(event->hw.idx);
- } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev);
-
- /* The counters are only 32 bits wide */
- delta = (val - prev) & 0xfffffffful;
- local64_add(delta, &event->count);
- local64_sub(delta, &event->hw.period_left);
-}
-
-/*
- * Disable all events to prevent PMU interrupts and to allow
- * events to be added or removed.
- */
-static void fsl_emb_pmu_disable(struct pmu *pmu)
-{
- struct cpu_hw_events *cpuhw;
- unsigned long flags;
-
- local_irq_save(flags);
- cpuhw = &__get_cpu_var(cpu_hw_events);
-
- if (!cpuhw->disabled) {
- cpuhw->disabled = 1;
-
- /*
- * Check if we ever enabled the PMU on this cpu.
- */
- if (!cpuhw->pmcs_enabled) {
- ppc_enable_pmcs();
- cpuhw->pmcs_enabled = 1;
- }
-
- if (atomic_read(&num_events)) {
- /*
- * Set the 'freeze all counters' bit, and disable
- * interrupts. The barrier is to make sure the
- * mtpmr has been executed and the PMU has frozen
- * the events before we return.
- */
-
- mtpmr(PMRN_PMGC0, PMGC0_FAC);
- isync();
- }
- }
- local_irq_restore(flags);
-}
-
-/*
- * Re-enable all events if disable == 0.
- * If we were previously disabled and events were added, then
- * put the new config on the PMU.
- */
-static void fsl_emb_pmu_enable(struct pmu *pmu)
-{
- struct cpu_hw_events *cpuhw;
- unsigned long flags;
-
- local_irq_save(flags);
- cpuhw = &__get_cpu_var(cpu_hw_events);
- if (!cpuhw->disabled)
- goto out;
-
- cpuhw->disabled = 0;
- ppc_set_pmu_inuse(cpuhw->n_events != 0);
-
- if (cpuhw->n_events > 0) {
- mtpmr(PMRN_PMGC0, PMGC0_PMIE | PMGC0_FCECE);
- isync();
- }
-
- out:
- local_irq_restore(flags);
-}
-
-static int collect_events(struct perf_event *group, int max_count,
- struct perf_event *ctrs[])
-{
- int n = 0;
- struct perf_event *event;
-
- if (!is_software_event(group)) {
- if (n >= max_count)
- return -1;
- ctrs[n] = group;
- n++;
- }
- list_for_each_entry(event, &group->sibling_list, group_entry) {
- if (!is_software_event(event) &&
- event->state != PERF_EVENT_STATE_OFF) {
- if (n >= max_count)
- return -1;
- ctrs[n] = event;
- n++;
- }
- }
- return n;
-}
-
-/* context locked on entry */
-static int fsl_emb_pmu_add(struct perf_event *event, int flags)
-{
- struct cpu_hw_events *cpuhw;
- int ret = -EAGAIN;
- int num_counters = ppmu->n_counter;
- u64 val;
- int i;
-
- perf_pmu_disable(event->pmu);
- cpuhw = &get_cpu_var(cpu_hw_events);
-
- if (event->hw.config & FSL_EMB_EVENT_RESTRICTED)
- num_counters = ppmu->n_restricted;
-
- /*
- * Allocate counters from top-down, so that restricted-capable
- * counters are kept free as long as possible.
- */
- for (i = num_counters - 1; i >= 0; i--) {
- if (cpuhw->event[i])
- continue;
-
- break;
- }
-
- if (i < 0)
- goto out;
-
- event->hw.idx = i;
- cpuhw->event[i] = event;
- ++cpuhw->n_events;
-
- val = 0;
- if (event->hw.sample_period) {
- s64 left = local64_read(&event->hw.period_left);
- if (left < 0x80000000L)
- val = 0x80000000L - left;
- }
- local64_set(&event->hw.prev_count, val);
-
- if (!(flags & PERF_EF_START)) {
- event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
- val = 0;
- }
-
- write_pmc(i, val);
- perf_event_update_userpage(event);
-
- write_pmlcb(i, event->hw.config >> 32);
- write_pmlca(i, event->hw.config_base);
-
- ret = 0;
- out:
- put_cpu_var(cpu_hw_events);
- perf_pmu_enable(event->pmu);
- return ret;
-}
-
-/* context locked on entry */
-static void fsl_emb_pmu_del(struct perf_event *event, int flags)
-{
- struct cpu_hw_events *cpuhw;
- int i = event->hw.idx;
-
- perf_pmu_disable(event->pmu);
- if (i < 0)
- goto out;
-
- fsl_emb_pmu_read(event);
-
- cpuhw = &get_cpu_var(cpu_hw_events);
-
- WARN_ON(event != cpuhw->event[event->hw.idx]);
-
- write_pmlca(i, 0);
- write_pmlcb(i, 0);
- write_pmc(i, 0);
-
- cpuhw->event[i] = NULL;
- event->hw.idx = -1;
-
- /*
- * TODO: if at least one restricted event exists, and we
- * just freed up a non-restricted-capable counter, and
- * there is a restricted-capable counter occupied by
- * a non-restricted event, migrate that event to the
- * vacated counter.
- */
-
- cpuhw->n_events--;
-
- out:
- perf_pmu_enable(event->pmu);
- put_cpu_var(cpu_hw_events);
-}
-
-static void fsl_emb_pmu_start(struct perf_event *event, int ef_flags)
-{
- unsigned long flags;
- s64 left;
-
- if (event->hw.idx < 0 || !event->hw.sample_period)
- return;
-
- if (!(event->hw.state & PERF_HES_STOPPED))
- return;
-
- if (ef_flags & PERF_EF_RELOAD)
- WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
-
- local_irq_save(flags);
- perf_pmu_disable(event->pmu);
-
- event->hw.state = 0;
- left = local64_read(&event->hw.period_left);
- write_pmc(event->hw.idx, left);
-
- perf_event_update_userpage(event);
- perf_pmu_enable(event->pmu);
- local_irq_restore(flags);
-}
-
-static void fsl_emb_pmu_stop(struct perf_event *event, int ef_flags)
-{
- unsigned long flags;
-
- if (event->hw.idx < 0 || !event->hw.sample_period)
- return;
-
- if (event->hw.state & PERF_HES_STOPPED)
- return;
-
- local_irq_save(flags);
- perf_pmu_disable(event->pmu);
-
- fsl_emb_pmu_read(event);
- event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
- write_pmc(event->hw.idx, 0);
-
- perf_event_update_userpage(event);
- perf_pmu_enable(event->pmu);
- local_irq_restore(flags);
-}
-
-/*
- * Release the PMU if this is the last perf_event.
- */
-static void hw_perf_event_destroy(struct perf_event *event)
-{
- if (!atomic_add_unless(&num_events, -1, 1)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_dec_return(&num_events) == 0)
- release_pmc_hardware();
- mutex_unlock(&pmc_reserve_mutex);
- }
-}
-
-/*
- * Translate a generic cache event_id config to a raw event_id code.
- */
-static int hw_perf_cache_event(u64 config, u64 *eventp)
-{
- unsigned long type, op, result;
- int ev;
-
- if (!ppmu->cache_events)
- return -EINVAL;
-
- /* unpack config */
- type = config & 0xff;
- op = (config >> 8) & 0xff;
- result = (config >> 16) & 0xff;
-
- if (type >= PERF_COUNT_HW_CACHE_MAX ||
- op >= PERF_COUNT_HW_CACHE_OP_MAX ||
- result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
- return -EINVAL;
-
- ev = (*ppmu->cache_events)[type][op][result];
- if (ev == 0)
- return -EOPNOTSUPP;
- if (ev == -1)
- return -EINVAL;
- *eventp = ev;
- return 0;
-}
-
-static int fsl_emb_pmu_event_init(struct perf_event *event)
-{
- u64 ev;
- struct perf_event *events[MAX_HWEVENTS];
- int n;
- int err;
- int num_restricted;
- int i;
-
- switch (event->attr.type) {
- case PERF_TYPE_HARDWARE:
- ev = event->attr.config;
- if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
- return -EOPNOTSUPP;
- ev = ppmu->generic_events[ev];
- break;
-
- case PERF_TYPE_HW_CACHE:
- err = hw_perf_cache_event(event->attr.config, &ev);
- if (err)
- return err;
- break;
-
- case PERF_TYPE_RAW:
- ev = event->attr.config;
- break;
-
- default:
- return -ENOENT;
- }
-
- event->hw.config = ppmu->xlate_event(ev);
- if (!(event->hw.config & FSL_EMB_EVENT_VALID))
- return -EINVAL;
-
- /*
- * If this is in a group, check if it can go on with all the
- * other hardware events in the group. We assume the event
- * hasn't been linked into its leader's sibling list at this point.
- */
- n = 0;
- if (event->group_leader != event) {
- n = collect_events(event->group_leader,
- ppmu->n_counter - 1, events);
- if (n < 0)
- return -EINVAL;
- }
-
- if (event->hw.config & FSL_EMB_EVENT_RESTRICTED) {
- num_restricted = 0;
- for (i = 0; i < n; i++) {
- if (events[i]->hw.config & FSL_EMB_EVENT_RESTRICTED)
- num_restricted++;
- }
-
- if (num_restricted >= ppmu->n_restricted)
- return -EINVAL;
- }
-
- event->hw.idx = -1;
-
- event->hw.config_base = PMLCA_CE | PMLCA_FCM1 |
- (u32)((ev << 16) & PMLCA_EVENT_MASK);
-
- if (event->attr.exclude_user)
- event->hw.config_base |= PMLCA_FCU;
- if (event->attr.exclude_kernel)
- event->hw.config_base |= PMLCA_FCS;
- if (event->attr.exclude_idle)
- return -ENOTSUPP;
-
- event->hw.last_period = event->hw.sample_period;
- local64_set(&event->hw.period_left, event->hw.last_period);
-
- /*
- * See if we need to reserve the PMU.
- * If no events are currently in use, then we have to take a
- * mutex to ensure that we don't race with another task doing
- * reserve_pmc_hardware or release_pmc_hardware.
- */
- err = 0;
- if (!atomic_inc_not_zero(&num_events)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&num_events) == 0 &&
- reserve_pmc_hardware(perf_event_interrupt))
- err = -EBUSY;
- else
- atomic_inc(&num_events);
- mutex_unlock(&pmc_reserve_mutex);
-
- mtpmr(PMRN_PMGC0, PMGC0_FAC);
- isync();
- }
- event->destroy = hw_perf_event_destroy;
-
- return err;
-}
-
-static struct pmu fsl_emb_pmu = {
- .pmu_enable = fsl_emb_pmu_enable,
- .pmu_disable = fsl_emb_pmu_disable,
- .event_init = fsl_emb_pmu_event_init,
- .add = fsl_emb_pmu_add,
- .del = fsl_emb_pmu_del,
- .start = fsl_emb_pmu_start,
- .stop = fsl_emb_pmu_stop,
- .read = fsl_emb_pmu_read,
-};
-
-/*
- * A counter has overflowed; update its count and record
- * things if requested. Note that interrupts are hard-disabled
- * here so there is no possibility of being interrupted.
- */
-static void record_and_restart(struct perf_event *event, unsigned long val,
- struct pt_regs *regs, int nmi)
-{
- u64 period = event->hw.sample_period;
- s64 prev, delta, left;
- int record = 0;
-
- if (event->hw.state & PERF_HES_STOPPED) {
- write_pmc(event->hw.idx, 0);
- return;
- }
-
- /* we don't have to worry about interrupts here */
- prev = local64_read(&event->hw.prev_count);
- delta = (val - prev) & 0xfffffffful;
- local64_add(delta, &event->count);
-
- /*
- * See if the total period for this event has expired,
- * and update for the next period.
- */
- val = 0;
- left = local64_read(&event->hw.period_left) - delta;
- if (period) {
- if (left <= 0) {
- left += period;
- if (left <= 0)
- left = period;
- record = 1;
- }
- if (left < 0x80000000LL)
- val = 0x80000000LL - left;
- }
-
- write_pmc(event->hw.idx, val);
- local64_set(&event->hw.prev_count, val);
- local64_set(&event->hw.period_left, left);
- perf_event_update_userpage(event);
-
- /*
- * Finally record data if requested.
- */
- if (record) {
- struct perf_sample_data data;
-
- perf_sample_data_init(&data, 0);
- data.period = event->hw.last_period;
-
- if (perf_event_overflow(event, nmi, &data, regs))
- fsl_emb_pmu_stop(event, 0);
- }
-}
-
-static void perf_event_interrupt(struct pt_regs *regs)
-{
- int i;
- struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
- struct perf_event *event;
- unsigned long val;
- int found = 0;
- int nmi;
-
- nmi = perf_intr_is_nmi(regs);
- if (nmi)
- nmi_enter();
- else
- irq_enter();
-
- for (i = 0; i < ppmu->n_counter; ++i) {
- event = cpuhw->event[i];
-
- val = read_pmc(i);
- if ((int)val < 0) {
- if (event) {
- /* event has overflowed */
- found = 1;
- record_and_restart(event, val, regs, nmi);
- } else {
- /*
- * Disabled counter is negative,
- * reset it just in case.
- */
- write_pmc(i, 0);
- }
- }
- }
-
- /* PMM will keep counters frozen until we return from the interrupt. */
- mtmsr(mfmsr() | MSR_PMM);
- mtpmr(PMRN_PMGC0, PMGC0_PMIE | PMGC0_FCECE);
- isync();
-
- if (nmi)
- nmi_exit();
- else
- irq_exit();
-}
-
-void hw_perf_event_setup(int cpu)
-{
- struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
-
- memset(cpuhw, 0, sizeof(*cpuhw));
-}
-
-int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu)
-{
- if (ppmu)
- return -EBUSY; /* something's already registered */
-
- ppmu = pmu;
- pr_info("%s performance monitor hardware support registered\n",
- pmu->name);
-
- perf_pmu_register(&fsl_emb_pmu);
-
- return 0;
-}
diff --git a/arch/powerpc/kernel/pmc.c b/arch/powerpc/kernel/pmc.c
index 461499b43cf..58eaa3ddf7b 100644
--- a/arch/powerpc/kernel/pmc.c
+++ b/arch/powerpc/kernel/pmc.c
@@ -13,8 +13,9 @@
*/
#include <linux/errno.h>
+#include <linux/bug.h>
#include <linux/spinlock.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <asm/processor.h>
#include <asm/cputable.h>
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
deleted file mode 100644
index 2a361cdda63..00000000000
--- a/arch/powerpc/kernel/power4-pmu.c
+++ /dev/null
@@ -1,616 +0,0 @@
-/*
- * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors.
- *
- * Copyright 2009 Paul Mackerras, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/kernel.h>
-#include <linux/perf_event.h>
-#include <linux/string.h>
-#include <asm/reg.h>
-#include <asm/cputable.h>
-
-/*
- * Bits in event code for POWER4
- */
-#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
-#define PM_PMC_MSK 0xf
-#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
-#define PM_UNIT_MSK 0xf
-#define PM_LOWER_SH 6
-#define PM_LOWER_MSK 1
-#define PM_LOWER_MSKS 0x40
-#define PM_BYTE_SH 4 /* Byte number of event bus to use */
-#define PM_BYTE_MSK 3
-#define PM_PMCSEL_MSK 7
-
-/*
- * Unit code values
- */
-#define PM_FPU 1
-#define PM_ISU1 2
-#define PM_IFU 3
-#define PM_IDU0 4
-#define PM_ISU1_ALT 6
-#define PM_ISU2 7
-#define PM_IFU_ALT 8
-#define PM_LSU0 9
-#define PM_LSU1 0xc
-#define PM_GPS 0xf
-
-/*
- * Bits in MMCR0 for POWER4
- */
-#define MMCR0_PMC1SEL_SH 8
-#define MMCR0_PMC2SEL_SH 1
-#define MMCR_PMCSEL_MSK 0x1f
-
-/*
- * Bits in MMCR1 for POWER4
- */
-#define MMCR1_TTM0SEL_SH 62
-#define MMCR1_TTC0SEL_SH 61
-#define MMCR1_TTM1SEL_SH 59
-#define MMCR1_TTC1SEL_SH 58
-#define MMCR1_TTM2SEL_SH 56
-#define MMCR1_TTC2SEL_SH 55
-#define MMCR1_TTM3SEL_SH 53
-#define MMCR1_TTC3SEL_SH 52
-#define MMCR1_TTMSEL_MSK 3
-#define MMCR1_TD_CP_DBG0SEL_SH 50
-#define MMCR1_TD_CP_DBG1SEL_SH 48
-#define MMCR1_TD_CP_DBG2SEL_SH 46
-#define MMCR1_TD_CP_DBG3SEL_SH 44
-#define MMCR1_DEBUG0SEL_SH 43
-#define MMCR1_DEBUG1SEL_SH 42
-#define MMCR1_DEBUG2SEL_SH 41
-#define MMCR1_DEBUG3SEL_SH 40
-#define MMCR1_PMC1_ADDER_SEL_SH 39
-#define MMCR1_PMC2_ADDER_SEL_SH 38
-#define MMCR1_PMC6_ADDER_SEL_SH 37
-#define MMCR1_PMC5_ADDER_SEL_SH 36
-#define MMCR1_PMC8_ADDER_SEL_SH 35
-#define MMCR1_PMC7_ADDER_SEL_SH 34
-#define MMCR1_PMC3_ADDER_SEL_SH 33
-#define MMCR1_PMC4_ADDER_SEL_SH 32
-#define MMCR1_PMC3SEL_SH 27
-#define MMCR1_PMC4SEL_SH 22
-#define MMCR1_PMC5SEL_SH 17
-#define MMCR1_PMC6SEL_SH 12
-#define MMCR1_PMC7SEL_SH 7
-#define MMCR1_PMC8SEL_SH 2 /* note bit 0 is in MMCRA for GP */
-
-static short mmcr1_adder_bits[8] = {
- MMCR1_PMC1_ADDER_SEL_SH,
- MMCR1_PMC2_ADDER_SEL_SH,
- MMCR1_PMC3_ADDER_SEL_SH,
- MMCR1_PMC4_ADDER_SEL_SH,
- MMCR1_PMC5_ADDER_SEL_SH,
- MMCR1_PMC6_ADDER_SEL_SH,
- MMCR1_PMC7_ADDER_SEL_SH,
- MMCR1_PMC8_ADDER_SEL_SH
-};
-
-/*
- * Bits in MMCRA
- */
-#define MMCRA_PMC8SEL0_SH 17 /* PMC8SEL bit 0 for GP */
-
-/*
- * Layout of constraint bits:
- * 6666555555555544444444443333333333222222222211111111110000000000
- * 3210987654321098765432109876543210987654321098765432109876543210
- * |[ >[ >[ >|||[ >[ >< >< >< >< ><><><><><><><><>
- * | UC1 UC2 UC3 ||| PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
- * \SMPL ||\TTC3SEL
- * |\TTC_IFU_SEL
- * \TTM2SEL0
- *
- * SMPL - SAMPLE_ENABLE constraint
- * 56: SAMPLE_ENABLE value 0x0100_0000_0000_0000
- *
- * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2
- * 55: UC1 error 0x0080_0000_0000_0000
- * 54: FPU events needed 0x0040_0000_0000_0000
- * 53: ISU1 events needed 0x0020_0000_0000_0000
- * 52: IDU0|ISU2 events needed 0x0010_0000_0000_0000
- *
- * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0
- * 51: UC2 error 0x0008_0000_0000_0000
- * 50: FPU events needed 0x0004_0000_0000_0000
- * 49: IFU events needed 0x0002_0000_0000_0000
- * 48: LSU0 events needed 0x0001_0000_0000_0000
- *
- * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1
- * 47: UC3 error 0x8000_0000_0000
- * 46: LSU0 events needed 0x4000_0000_0000
- * 45: IFU events needed 0x2000_0000_0000
- * 44: IDU0|ISU2 events needed 0x1000_0000_0000
- * 43: ISU1 events needed 0x0800_0000_0000
- *
- * TTM2SEL0
- * 42: 0 = IDU0 events needed
- * 1 = ISU2 events needed 0x0400_0000_0000
- *
- * TTC_IFU_SEL
- * 41: 0 = IFU.U events needed
- * 1 = IFU.L events needed 0x0200_0000_0000
- *
- * TTC3SEL
- * 40: 0 = LSU1.U events needed
- * 1 = LSU1.L events needed 0x0100_0000_0000
- *
- * PS1
- * 39: PS1 error 0x0080_0000_0000
- * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
- *
- * PS2
- * 35: PS2 error 0x0008_0000_0000
- * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
- *
- * B0
- * 28-31: Byte 0 event source 0xf000_0000
- * 1 = FPU
- * 2 = ISU1
- * 3 = IFU
- * 4 = IDU0
- * 7 = ISU2
- * 9 = LSU0
- * c = LSU1
- * f = GPS
- *
- * B1, B2, B3
- * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
- *
- * P8
- * 15: P8 error 0x8000
- * 14-15: Count of events needing PMC8
- *
- * P1..P7
- * 0-13: Count of events needing PMC1..PMC7
- *
- * Note: this doesn't allow events using IFU.U to be combined with events
- * using IFU.L, though that is feasible (using TTM0 and TTM2). However
- * there are no listed events for IFU.L (they are debug events not
- * verified for performance monitoring) so this shouldn't cause a
- * problem.
- */
-
-static struct unitinfo {
- unsigned long value, mask;
- int unit;
- int lowerbit;
-} p4_unitinfo[16] = {
- [PM_FPU] = { 0x44000000000000ul, 0x88000000000000ul, PM_FPU, 0 },
- [PM_ISU1] = { 0x20080000000000ul, 0x88000000000000ul, PM_ISU1, 0 },
- [PM_ISU1_ALT] =
- { 0x20080000000000ul, 0x88000000000000ul, PM_ISU1, 0 },
- [PM_IFU] = { 0x02200000000000ul, 0x08820000000000ul, PM_IFU, 41 },
- [PM_IFU_ALT] =
- { 0x02200000000000ul, 0x08820000000000ul, PM_IFU, 41 },
- [PM_IDU0] = { 0x10100000000000ul, 0x80840000000000ul, PM_IDU0, 1 },
- [PM_ISU2] = { 0x10140000000000ul, 0x80840000000000ul, PM_ISU2, 0 },
- [PM_LSU0] = { 0x01400000000000ul, 0x08800000000000ul, PM_LSU0, 0 },
- [PM_LSU1] = { 0x00000000000000ul, 0x00010000000000ul, PM_LSU1, 40 },
- [PM_GPS] = { 0x00000000000000ul, 0x00000000000000ul, PM_GPS, 0 }
-};
-
-static unsigned char direct_marked_event[8] = {
- (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
- (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
- (1<<3), /* PMC3: PM_MRK_ST_CMPL_INT */
- (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
- (1<<4) | (1<<5), /* PMC5: PM_MRK_GRP_TIMEO */
- (1<<3) | (1<<4) | (1<<5),
- /* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
- (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
- (1<<4), /* PMC8: PM_MRK_LSU_FIN */
-};
-
-/*
- * Returns 1 if event counts things relating to marked instructions
- * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
- */
-static int p4_marked_instr_event(u64 event)
-{
- int pmc, psel, unit, byte, bit;
- unsigned int mask;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- psel = event & PM_PMCSEL_MSK;
- if (pmc) {
- if (direct_marked_event[pmc - 1] & (1 << psel))
- return 1;
- if (psel == 0) /* add events */
- bit = (pmc <= 4)? pmc - 1: 8 - pmc;
- else if (psel == 6) /* decode events */
- bit = 4;
- else
- return 0;
- } else
- bit = psel;
-
- byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
- unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
- mask = 0;
- switch (unit) {
- case PM_LSU1:
- if (event & PM_LOWER_MSKS)
- mask = 1 << 28; /* byte 7 bit 4 */
- else
- mask = 6 << 24; /* byte 3 bits 1 and 2 */
- break;
- case PM_LSU0:
- /* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */
- mask = 0x083dff00;
- }
- return (mask >> (byte * 8 + bit)) & 1;
-}
-
-static int p4_get_constraint(u64 event, unsigned long *maskp,
- unsigned long *valp)
-{
- int pmc, byte, unit, lower, sh;
- unsigned long mask = 0, value = 0;
- int grp = -1;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc) {
- if (pmc > 8)
- return -1;
- sh = (pmc - 1) * 2;
- mask |= 2 << sh;
- value |= 1 << sh;
- grp = ((pmc - 1) >> 1) & 1;
- }
- unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
- byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
- if (unit) {
- lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK;
-
- /*
- * Bus events on bytes 0 and 2 can be counted
- * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
- */
- if (!pmc)
- grp = byte & 1;
-
- if (!p4_unitinfo[unit].unit)
- return -1;
- mask |= p4_unitinfo[unit].mask;
- value |= p4_unitinfo[unit].value;
- sh = p4_unitinfo[unit].lowerbit;
- if (sh > 1)
- value |= (unsigned long)lower << sh;
- else if (lower != sh)
- return -1;
- unit = p4_unitinfo[unit].unit;
-
- /* Set byte lane select field */
- mask |= 0xfULL << (28 - 4 * byte);
- value |= (unsigned long)unit << (28 - 4 * byte);
- }
- if (grp == 0) {
- /* increment PMC1/2/5/6 field */
- mask |= 0x8000000000ull;
- value |= 0x1000000000ull;
- } else {
- /* increment PMC3/4/7/8 field */
- mask |= 0x800000000ull;
- value |= 0x100000000ull;
- }
-
- /* Marked instruction events need sample_enable set */
- if (p4_marked_instr_event(event)) {
- mask |= 1ull << 56;
- value |= 1ull << 56;
- }
-
- /* PMCSEL=6 decode events on byte 2 need sample_enable clear */
- if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2)
- mask |= 1ull << 56;
-
- *maskp = mask;
- *valp = value;
- return 0;
-}
-
-static unsigned int ppc_inst_cmpl[] = {
- 0x1001, 0x4001, 0x6001, 0x7001, 0x8001
-};
-
-static int p4_get_alternatives(u64 event, unsigned int flags, u64 alt[])
-{
- int i, j, na;
-
- alt[0] = event;
- na = 1;
-
- /* 2 possibilities for PM_GRP_DISP_REJECT */
- if (event == 0x8003 || event == 0x0224) {
- alt[1] = event ^ (0x8003 ^ 0x0224);
- return 2;
- }
-
- /* 2 possibilities for PM_ST_MISS_L1 */
- if (event == 0x0c13 || event == 0x0c23) {
- alt[1] = event ^ (0x0c13 ^ 0x0c23);
- return 2;
- }
-
- /* several possibilities for PM_INST_CMPL */
- for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) {
- if (event == ppc_inst_cmpl[i]) {
- for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j)
- if (j != i)
- alt[na++] = ppc_inst_cmpl[j];
- break;
- }
- }
-
- return na;
-}
-
-static int p4_compute_mmcr(u64 event[], int n_ev,
- unsigned int hwc[], unsigned long mmcr[])
-{
- unsigned long mmcr0 = 0, mmcr1 = 0, mmcra = 0;
- unsigned int pmc, unit, byte, psel, lower;
- unsigned int ttm, grp;
- unsigned int pmc_inuse = 0;
- unsigned int pmc_grp_use[2];
- unsigned char busbyte[4];
- unsigned char unituse[16];
- unsigned int unitlower = 0;
- int i;
-
- if (n_ev > 8)
- return -1;
-
- /* First pass to count resource use */
- pmc_grp_use[0] = pmc_grp_use[1] = 0;
- memset(busbyte, 0, sizeof(busbyte));
- memset(unituse, 0, sizeof(unituse));
- for (i = 0; i < n_ev; ++i) {
- pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc) {
- if (pmc_inuse & (1 << (pmc - 1)))
- return -1;
- pmc_inuse |= 1 << (pmc - 1);
- /* count 1/2/5/6 vs 3/4/7/8 use */
- ++pmc_grp_use[((pmc - 1) >> 1) & 1];
- }
- unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
- byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
- lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK;
- if (unit) {
- if (!pmc)
- ++pmc_grp_use[byte & 1];
- if (unit == 6 || unit == 8)
- /* map alt ISU1/IFU codes: 6->2, 8->3 */
- unit = (unit >> 1) - 1;
- if (busbyte[byte] && busbyte[byte] != unit)
- return -1;
- busbyte[byte] = unit;
- lower <<= unit;
- if (unituse[unit] && lower != (unitlower & lower))
- return -1;
- unituse[unit] = 1;
- unitlower |= lower;
- }
- }
- if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
- return -1;
-
- /*
- * Assign resources and set multiplexer selects.
- *
- * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2.
- * Each TTMx can only select one unit, but since
- * units 2 and 6 are both ISU1, and 3 and 8 are both IFU,
- * we have some choices.
- */
- if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) {
- unituse[6] = 1; /* Move 2 to 6 */
- unituse[2] = 0;
- }
- if (unituse[3] & (unituse[1] | unituse[2])) {
- unituse[8] = 1; /* Move 3 to 8 */
- unituse[3] = 0;
- unitlower = (unitlower & ~8) | ((unitlower & 8) << 5);
- }
- /* Check only one unit per TTMx */
- if (unituse[1] + unituse[2] + unituse[3] > 1 ||
- unituse[4] + unituse[6] + unituse[7] > 1 ||
- unituse[8] + unituse[9] > 1 ||
- (unituse[5] | unituse[10] | unituse[11] |
- unituse[13] | unituse[14]))
- return -1;
-
- /* Set TTMxSEL fields. Note, units 1-3 => TTM0SEL codes 0-2 */
- mmcr1 |= (unsigned long)(unituse[3] * 2 + unituse[2])
- << MMCR1_TTM0SEL_SH;
- mmcr1 |= (unsigned long)(unituse[7] * 3 + unituse[6] * 2)
- << MMCR1_TTM1SEL_SH;
- mmcr1 |= (unsigned long)unituse[9] << MMCR1_TTM2SEL_SH;
-
- /* Set TTCxSEL fields. */
- if (unitlower & 0xe)
- mmcr1 |= 1ull << MMCR1_TTC0SEL_SH;
- if (unitlower & 0xf0)
- mmcr1 |= 1ull << MMCR1_TTC1SEL_SH;
- if (unitlower & 0xf00)
- mmcr1 |= 1ull << MMCR1_TTC2SEL_SH;
- if (unitlower & 0x7000)
- mmcr1 |= 1ull << MMCR1_TTC3SEL_SH;
-
- /* Set byte lane select fields. */
- for (byte = 0; byte < 4; ++byte) {
- unit = busbyte[byte];
- if (!unit)
- continue;
- if (unit == 0xf) {
- /* special case for GPS */
- mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte);
- } else {
- if (!unituse[unit])
- ttm = unit - 1; /* 2->1, 3->2 */
- else
- ttm = unit >> 2;
- mmcr1 |= (unsigned long)ttm
- << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
- }
- }
-
- /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
- for (i = 0; i < n_ev; ++i) {
- pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
- unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
- byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
- psel = event[i] & PM_PMCSEL_MSK;
- if (!pmc) {
- /* Bus event or 00xxx direct event (off or cycles) */
- if (unit)
- psel |= 0x10 | ((byte & 2) << 2);
- for (pmc = 0; pmc < 8; ++pmc) {
- if (pmc_inuse & (1 << pmc))
- continue;
- grp = (pmc >> 1) & 1;
- if (unit) {
- if (grp == (byte & 1))
- break;
- } else if (pmc_grp_use[grp] < 4) {
- ++pmc_grp_use[grp];
- break;
- }
- }
- pmc_inuse |= 1 << pmc;
- } else {
- /* Direct event */
- --pmc;
- if (psel == 0 && (byte & 2))
- /* add events on higher-numbered bus */
- mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
- else if (psel == 6 && byte == 3)
- /* seem to need to set sample_enable here */
- mmcra |= MMCRA_SAMPLE_ENABLE;
- psel |= 8;
- }
- if (pmc <= 1)
- mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc);
- else
- mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
- if (pmc == 7) /* PMC8 */
- mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH;
- hwc[i] = pmc;
- if (p4_marked_instr_event(event[i]))
- mmcra |= MMCRA_SAMPLE_ENABLE;
- }
-
- if (pmc_inuse & 1)
- mmcr0 |= MMCR0_PMC1CE;
- if (pmc_inuse & 0xfe)
- mmcr0 |= MMCR0_PMCjCE;
-
- mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
-
- /* Return MMCRx values */
- mmcr[0] = mmcr0;
- mmcr[1] = mmcr1;
- mmcr[2] = mmcra;
- return 0;
-}
-
-static void p4_disable_pmc(unsigned int pmc, unsigned long mmcr[])
-{
- /*
- * Setting the PMCxSEL field to 0 disables PMC x.
- * (Note that pmc is 0-based here, not 1-based.)
- */
- if (pmc <= 1) {
- mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc));
- } else {
- mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)));
- if (pmc == 7)
- mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH);
- }
-}
-
-static int p4_generic_events[] = {
- [PERF_COUNT_HW_CPU_CYCLES] = 7,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x1001,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */
- [PERF_COUNT_HW_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */
-};
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-/*
- * Table of generalized cache-related events.
- * 0 means not supported, -1 means nonsensical, other values
- * are event codes.
- */
-static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
- [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x8c10, 0x3c10 },
- [C(OP_WRITE)] = { 0x7c10, 0xc13 },
- [C(OP_PREFETCH)] = { 0xc35, 0 },
- },
- [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { 0, 0 },
- },
- [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0 },
- [C(OP_WRITE)] = { 0, 0 },
- [C(OP_PREFETCH)] = { 0xc34, 0 },
- },
- [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x904 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x900 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x330, 0x331 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
-};
-
-static struct power_pmu power4_pmu = {
- .name = "POWER4/4+",
- .n_counter = 8,
- .max_alternatives = 5,
- .add_fields = 0x0000001100005555ul,
- .test_adder = 0x0011083300000000ul,
- .compute_mmcr = p4_compute_mmcr,
- .get_constraint = p4_get_constraint,
- .get_alternatives = p4_get_alternatives,
- .disable_pmc = p4_disable_pmc,
- .n_generic = ARRAY_SIZE(p4_generic_events),
- .generic_events = p4_generic_events,
- .cache_events = &power4_cache_events,
-};
-
-static int init_power4_pmu(void)
-{
- if (!cur_cpu_spec->oprofile_cpu_type ||
- strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power4"))
- return -ENODEV;
-
- return register_power_pmu(&power4_pmu);
-}
-
-arch_initcall(init_power4_pmu);
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
deleted file mode 100644
index 199de527d41..00000000000
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ /dev/null
@@ -1,685 +0,0 @@
-/*
- * Performance counter support for POWER5+/++ (not POWER5) processors.
- *
- * Copyright 2009 Paul Mackerras, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/kernel.h>
-#include <linux/perf_event.h>
-#include <linux/string.h>
-#include <asm/reg.h>
-#include <asm/cputable.h>
-
-/*
- * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3)
- */
-#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
-#define PM_PMC_MSK 0xf
-#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
-#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */
-#define PM_UNIT_MSK 0xf
-#define PM_BYTE_SH 12 /* Byte number of event bus to use */
-#define PM_BYTE_MSK 7
-#define PM_GRS_SH 8 /* Storage subsystem mux select */
-#define PM_GRS_MSK 7
-#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */
-#define PM_PMCSEL_MSK 0x7f
-
-/* Values in PM_UNIT field */
-#define PM_FPU 0
-#define PM_ISU0 1
-#define PM_IFU 2
-#define PM_ISU1 3
-#define PM_IDU 4
-#define PM_ISU0_ALT 6
-#define PM_GRS 7
-#define PM_LSU0 8
-#define PM_LSU1 0xc
-#define PM_LASTUNIT 0xc
-
-/*
- * Bits in MMCR1 for POWER5+
- */
-#define MMCR1_TTM0SEL_SH 62
-#define MMCR1_TTM1SEL_SH 60
-#define MMCR1_TTM2SEL_SH 58
-#define MMCR1_TTM3SEL_SH 56
-#define MMCR1_TTMSEL_MSK 3
-#define MMCR1_TD_CP_DBG0SEL_SH 54
-#define MMCR1_TD_CP_DBG1SEL_SH 52
-#define MMCR1_TD_CP_DBG2SEL_SH 50
-#define MMCR1_TD_CP_DBG3SEL_SH 48
-#define MMCR1_GRS_L2SEL_SH 46
-#define MMCR1_GRS_L2SEL_MSK 3
-#define MMCR1_GRS_L3SEL_SH 44
-#define MMCR1_GRS_L3SEL_MSK 3
-#define MMCR1_GRS_MCSEL_SH 41
-#define MMCR1_GRS_MCSEL_MSK 7
-#define MMCR1_GRS_FABSEL_SH 39
-#define MMCR1_GRS_FABSEL_MSK 3
-#define MMCR1_PMC1_ADDER_SEL_SH 35
-#define MMCR1_PMC2_ADDER_SEL_SH 34
-#define MMCR1_PMC3_ADDER_SEL_SH 33
-#define MMCR1_PMC4_ADDER_SEL_SH 32
-#define MMCR1_PMC1SEL_SH 25
-#define MMCR1_PMC2SEL_SH 17
-#define MMCR1_PMC3SEL_SH 9
-#define MMCR1_PMC4SEL_SH 1
-#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
-#define MMCR1_PMCSEL_MSK 0x7f
-
-/*
- * Layout of constraint bits:
- * 6666555555555544444444443333333333222222222211111111110000000000
- * 3210987654321098765432109876543210987654321098765432109876543210
- * [ ><><>< ><> <><>[ > < >< >< >< ><><><><><><>
- * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P6P5P4P3P2P1
- *
- * NC - number of counters
- * 51: NC error 0x0008_0000_0000_0000
- * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
- *
- * G0..G3 - GRS mux constraints
- * 46-47: GRS_L2SEL value
- * 44-45: GRS_L3SEL value
- * 41-44: GRS_MCSEL value
- * 39-40: GRS_FABSEL value
- * Note that these match up with their bit positions in MMCR1
- *
- * T0 - TTM0 constraint
- * 36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000
- *
- * T1 - TTM1 constraint
- * 34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000
- *
- * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
- * 33: UC3 error 0x02_0000_0000
- * 32: FPU|IFU|ISU1 events needed 0x01_0000_0000
- * 31: ISU0 events needed 0x01_8000_0000
- * 30: IDU|GRS events needed 0x00_4000_0000
- *
- * B0
- * 24-27: Byte 0 event source 0x0f00_0000
- * Encoding as for the event code
- *
- * B1, B2, B3
- * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
- *
- * P6
- * 11: P6 error 0x800
- * 10-11: Count of events needing PMC6
- *
- * P1..P5
- * 0-9: Count of events needing PMC1..PMC5
- */
-
-static const int grsel_shift[8] = {
- MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
- MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
- MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
-};
-
-/* Masks and values for using events from the various units */
-static unsigned long unit_cons[PM_LASTUNIT+1][2] = {
- [PM_FPU] = { 0x3200000000ul, 0x0100000000ul },
- [PM_ISU0] = { 0x0200000000ul, 0x0080000000ul },
- [PM_ISU1] = { 0x3200000000ul, 0x3100000000ul },
- [PM_IFU] = { 0x3200000000ul, 0x2100000000ul },
- [PM_IDU] = { 0x0e00000000ul, 0x0040000000ul },
- [PM_GRS] = { 0x0e00000000ul, 0x0c40000000ul },
-};
-
-static int power5p_get_constraint(u64 event, unsigned long *maskp,
- unsigned long *valp)
-{
- int pmc, byte, unit, sh;
- int bit, fmask;
- unsigned long mask = 0, value = 0;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc) {
- if (pmc > 6)
- return -1;
- sh = (pmc - 1) * 2;
- mask |= 2 << sh;
- value |= 1 << sh;
- if (pmc >= 5 && !(event == 0x500009 || event == 0x600005))
- return -1;
- }
- if (event & PM_BUSEVENT_MSK) {
- unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
- if (unit > PM_LASTUNIT)
- return -1;
- if (unit == PM_ISU0_ALT)
- unit = PM_ISU0;
- mask |= unit_cons[unit][0];
- value |= unit_cons[unit][1];
- byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
- if (byte >= 4) {
- if (unit != PM_LSU1)
- return -1;
- /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
- ++unit;
- byte &= 3;
- }
- if (unit == PM_GRS) {
- bit = event & 7;
- fmask = (bit == 6)? 7: 3;
- sh = grsel_shift[bit];
- mask |= (unsigned long)fmask << sh;
- value |= (unsigned long)((event >> PM_GRS_SH) & fmask)
- << sh;
- }
- /* Set byte lane select field */
- mask |= 0xfUL << (24 - 4 * byte);
- value |= (unsigned long)unit << (24 - 4 * byte);
- }
- if (pmc < 5) {
- /* need a counter from PMC1-4 set */
- mask |= 0x8000000000000ul;
- value |= 0x1000000000000ul;
- }
- *maskp = mask;
- *valp = value;
- return 0;
-}
-
-static int power5p_limited_pmc_event(u64 event)
-{
- int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
-
- return pmc == 5 || pmc == 6;
-}
-
-#define MAX_ALT 3 /* at most 3 alternatives for any event */
-
-static const unsigned int event_alternatives[][MAX_ALT] = {
- { 0x100c0, 0x40001f }, /* PM_GCT_FULL_CYC */
- { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */
- { 0x230e2, 0x323087 }, /* PM_BR_PRED_CR */
- { 0x230e3, 0x223087, 0x3230a0 }, /* PM_BR_PRED_TA */
- { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */
- { 0x800c4, 0xc20e0 }, /* PM_DTLB_MISS */
- { 0xc50c6, 0xc60e0 }, /* PM_MRK_DTLB_MISS */
- { 0x100005, 0x600005 }, /* PM_RUN_CYC */
- { 0x100009, 0x200009 }, /* PM_INST_CMPL */
- { 0x200015, 0x300015 }, /* PM_LSU_LMQ_SRQ_EMPTY_CYC */
- { 0x300009, 0x400009 }, /* PM_INST_DISP */
-};
-
-/*
- * Scan the alternatives table for a match and return the
- * index into the alternatives table if found, else -1.
- */
-static int find_alternative(unsigned int event)
-{
- int i, j;
-
- for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
- if (event < event_alternatives[i][0])
- break;
- for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
- if (event == event_alternatives[i][j])
- return i;
- }
- return -1;
-}
-
-static const unsigned char bytedecode_alternatives[4][4] = {
- /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 },
- /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e },
- /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 },
- /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e }
-};
-
-/*
- * Some direct events for decodes of event bus byte 3 have alternative
- * PMCSEL values on other counters. This returns the alternative
- * event code for those that do, or -1 otherwise. This also handles
- * alternative PCMSEL values for add events.
- */
-static s64 find_alternative_bdecode(u64 event)
-{
- int pmc, altpmc, pp, j;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc == 0 || pmc > 4)
- return -1;
- altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */
- pp = event & PM_PMCSEL_MSK;
- for (j = 0; j < 4; ++j) {
- if (bytedecode_alternatives[pmc - 1][j] == pp) {
- return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
- (altpmc << PM_PMC_SH) |
- bytedecode_alternatives[altpmc - 1][j];
- }
- }
-
- /* new decode alternatives for power5+ */
- if (pmc == 1 && (pp == 0x0d || pp == 0x0e))
- return event + (2 << PM_PMC_SH) + (0x2e - 0x0d);
- if (pmc == 3 && (pp == 0x2e || pp == 0x2f))
- return event - (2 << PM_PMC_SH) - (0x2e - 0x0d);
-
- /* alternative add event encodings */
- if (pp == 0x10 || pp == 0x28)
- return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) |
- (altpmc << PM_PMC_SH);
-
- return -1;
-}
-
-static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[])
-{
- int i, j, nalt = 1;
- int nlim;
- s64 ae;
-
- alt[0] = event;
- nalt = 1;
- nlim = power5p_limited_pmc_event(event);
- i = find_alternative(event);
- if (i >= 0) {
- for (j = 0; j < MAX_ALT; ++j) {
- ae = event_alternatives[i][j];
- if (ae && ae != event)
- alt[nalt++] = ae;
- nlim += power5p_limited_pmc_event(ae);
- }
- } else {
- ae = find_alternative_bdecode(event);
- if (ae > 0)
- alt[nalt++] = ae;
- }
-
- if (flags & PPMU_ONLY_COUNT_RUN) {
- /*
- * We're only counting in RUN state,
- * so PM_CYC is equivalent to PM_RUN_CYC
- * and PM_INST_CMPL === PM_RUN_INST_CMPL.
- * This doesn't include alternatives that don't provide
- * any extra flexibility in assigning PMCs (e.g.
- * 0x100005 for PM_RUN_CYC vs. 0xf for PM_CYC).
- * Note that even with these additional alternatives
- * we never end up with more than 3 alternatives for any event.
- */
- j = nalt;
- for (i = 0; i < nalt; ++i) {
- switch (alt[i]) {
- case 0xf: /* PM_CYC */
- alt[j++] = 0x600005; /* PM_RUN_CYC */
- ++nlim;
- break;
- case 0x600005: /* PM_RUN_CYC */
- alt[j++] = 0xf;
- break;
- case 0x100009: /* PM_INST_CMPL */
- alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */
- ++nlim;
- break;
- case 0x500009: /* PM_RUN_INST_CMPL */
- alt[j++] = 0x100009; /* PM_INST_CMPL */
- alt[j++] = 0x200009;
- break;
- }
- }
- nalt = j;
- }
-
- if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
- /* remove the limited PMC events */
- j = 0;
- for (i = 0; i < nalt; ++i) {
- if (!power5p_limited_pmc_event(alt[i])) {
- alt[j] = alt[i];
- ++j;
- }
- }
- nalt = j;
- } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
- /* remove all but the limited PMC events */
- j = 0;
- for (i = 0; i < nalt; ++i) {
- if (power5p_limited_pmc_event(alt[i])) {
- alt[j] = alt[i];
- ++j;
- }
- }
- nalt = j;
- }
-
- return nalt;
-}
-
-/*
- * Map of which direct events on which PMCs are marked instruction events.
- * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
- * Bit 0 is set if it is marked for all PMCs.
- * The 0x80 bit indicates a byte decode PMCSEL value.
- */
-static unsigned char direct_event_is_marked[0x28] = {
- 0, /* 00 */
- 0x1f, /* 01 PM_IOPS_CMPL */
- 0x2, /* 02 PM_MRK_GRP_DISP */
- 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
- 0, /* 04 */
- 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
- 0x80, /* 06 */
- 0x80, /* 07 */
- 0, 0, 0,/* 08 - 0a */
- 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
- 0, /* 0c */
- 0x80, /* 0d */
- 0x80, /* 0e */
- 0, /* 0f */
- 0, /* 10 */
- 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
- 0, /* 12 */
- 0x10, /* 13 PM_MRK_GRP_CMPL */
- 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
- 0x2, /* 15 PM_MRK_GRP_ISSUED */
- 0x80, /* 16 */
- 0x80, /* 17 */
- 0, 0, 0, 0, 0,
- 0x80, /* 1d */
- 0x80, /* 1e */
- 0, /* 1f */
- 0x80, /* 20 */
- 0x80, /* 21 */
- 0x80, /* 22 */
- 0x80, /* 23 */
- 0x80, /* 24 */
- 0x80, /* 25 */
- 0x80, /* 26 */
- 0x80, /* 27 */
-};
-
-/*
- * Returns 1 if event counts things relating to marked instructions
- * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
- */
-static int power5p_marked_instr_event(u64 event)
-{
- int pmc, psel;
- int bit, byte, unit;
- u32 mask;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- psel = event & PM_PMCSEL_MSK;
- if (pmc >= 5)
- return 0;
-
- bit = -1;
- if (psel < sizeof(direct_event_is_marked)) {
- if (direct_event_is_marked[psel] & (1 << pmc))
- return 1;
- if (direct_event_is_marked[psel] & 0x80)
- bit = 4;
- else if (psel == 0x08)
- bit = pmc - 1;
- else if (psel == 0x10)
- bit = 4 - pmc;
- else if (psel == 0x1b && (pmc == 1 || pmc == 3))
- bit = 4;
- } else if ((psel & 0x48) == 0x40) {
- bit = psel & 7;
- } else if (psel == 0x28) {
- bit = pmc - 1;
- } else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) {
- bit = 4;
- }
-
- if (!(event & PM_BUSEVENT_MSK) || bit == -1)
- return 0;
-
- byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
- unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
- if (unit == PM_LSU0) {
- /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
- mask = 0x5dff00;
- } else if (unit == PM_LSU1 && byte >= 4) {
- byte -= 4;
- /* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */
- mask = 0x5f11c000;
- } else
- return 0;
-
- return (mask >> (byte * 8 + bit)) & 1;
-}
-
-static int power5p_compute_mmcr(u64 event[], int n_ev,
- unsigned int hwc[], unsigned long mmcr[])
-{
- unsigned long mmcr1 = 0;
- unsigned long mmcra = 0;
- unsigned int pmc, unit, byte, psel;
- unsigned int ttm;
- int i, isbus, bit, grsel;
- unsigned int pmc_inuse = 0;
- unsigned char busbyte[4];
- unsigned char unituse[16];
- int ttmuse;
-
- if (n_ev > 6)
- return -1;
-
- /* First pass to count resource use */
- memset(busbyte, 0, sizeof(busbyte));
- memset(unituse, 0, sizeof(unituse));
- for (i = 0; i < n_ev; ++i) {
- pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc) {
- if (pmc > 6)
- return -1;
- if (pmc_inuse & (1 << (pmc - 1)))
- return -1;
- pmc_inuse |= 1 << (pmc - 1);
- }
- if (event[i] & PM_BUSEVENT_MSK) {
- unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
- byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
- if (unit > PM_LASTUNIT)
- return -1;
- if (unit == PM_ISU0_ALT)
- unit = PM_ISU0;
- if (byte >= 4) {
- if (unit != PM_LSU1)
- return -1;
- ++unit;
- byte &= 3;
- }
- if (busbyte[byte] && busbyte[byte] != unit)
- return -1;
- busbyte[byte] = unit;
- unituse[unit] = 1;
- }
- }
-
- /*
- * Assign resources and set multiplexer selects.
- *
- * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
- * choice we have to deal with.
- */
- if (unituse[PM_ISU0] &
- (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
- unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */
- unituse[PM_ISU0] = 0;
- }
- /* Set TTM[01]SEL fields. */
- ttmuse = 0;
- for (i = PM_FPU; i <= PM_ISU1; ++i) {
- if (!unituse[i])
- continue;
- if (ttmuse++)
- return -1;
- mmcr1 |= (unsigned long)i << MMCR1_TTM0SEL_SH;
- }
- ttmuse = 0;
- for (; i <= PM_GRS; ++i) {
- if (!unituse[i])
- continue;
- if (ttmuse++)
- return -1;
- mmcr1 |= (unsigned long)(i & 3) << MMCR1_TTM1SEL_SH;
- }
- if (ttmuse > 1)
- return -1;
-
- /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
- for (byte = 0; byte < 4; ++byte) {
- unit = busbyte[byte];
- if (!unit)
- continue;
- if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
- /* get ISU0 through TTM1 rather than TTM0 */
- unit = PM_ISU0_ALT;
- } else if (unit == PM_LSU1 + 1) {
- /* select lower word of LSU1 for this byte */
- mmcr1 |= 1ul << (MMCR1_TTM3SEL_SH + 3 - byte);
- }
- ttm = unit >> 2;
- mmcr1 |= (unsigned long)ttm
- << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
- }
-
- /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
- for (i = 0; i < n_ev; ++i) {
- pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
- unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
- byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
- psel = event[i] & PM_PMCSEL_MSK;
- isbus = event[i] & PM_BUSEVENT_MSK;
- if (!pmc) {
- /* Bus event or any-PMC direct event */
- for (pmc = 0; pmc < 4; ++pmc) {
- if (!(pmc_inuse & (1 << pmc)))
- break;
- }
- if (pmc >= 4)
- return -1;
- pmc_inuse |= 1 << pmc;
- } else if (pmc <= 4) {
- /* Direct event */
- --pmc;
- if (isbus && (byte & 2) &&
- (psel == 8 || psel == 0x10 || psel == 0x28))
- /* add events on higher-numbered bus */
- mmcr1 |= 1ul << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
- } else {
- /* Instructions or run cycles on PMC5/6 */
- --pmc;
- }
- if (isbus && unit == PM_GRS) {
- bit = psel & 7;
- grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
- mmcr1 |= (unsigned long)grsel << grsel_shift[bit];
- }
- if (power5p_marked_instr_event(event[i]))
- mmcra |= MMCRA_SAMPLE_ENABLE;
- if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
- /* select alternate byte lane */
- psel |= 0x10;
- if (pmc <= 3)
- mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
- hwc[i] = pmc;
- }
-
- /* Return MMCRx values */
- mmcr[0] = 0;
- if (pmc_inuse & 1)
- mmcr[0] = MMCR0_PMC1CE;
- if (pmc_inuse & 0x3e)
- mmcr[0] |= MMCR0_PMCjCE;
- mmcr[1] = mmcr1;
- mmcr[2] = mmcra;
- return 0;
-}
-
-static void power5p_disable_pmc(unsigned int pmc, unsigned long mmcr[])
-{
- if (pmc <= 3)
- mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
-}
-
-static int power5p_generic_events[] = {
- [PERF_COUNT_HW_CPU_CYCLES] = 0xf,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x100009,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */
- [PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */
-};
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-/*
- * Table of generalized cache-related events.
- * 0 means not supported, -1 means nonsensical, other values
- * are event codes.
- */
-static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
- [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x1c10a8, 0x3c1088 },
- [C(OP_WRITE)] = { 0x2c10a8, 0xc10c3 },
- [C(OP_PREFETCH)] = { 0xc70e7, -1 },
- },
- [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { 0, 0 },
- },
- [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0 },
- [C(OP_WRITE)] = { 0, 0 },
- [C(OP_PREFETCH)] = { 0xc50c3, 0 },
- },
- [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0xc20e4, 0x800c4 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x800c0 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x230e4, 0x230e5 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
-};
-
-static struct power_pmu power5p_pmu = {
- .name = "POWER5+/++",
- .n_counter = 6,
- .max_alternatives = MAX_ALT,
- .add_fields = 0x7000000000055ul,
- .test_adder = 0x3000040000000ul,
- .compute_mmcr = power5p_compute_mmcr,
- .get_constraint = power5p_get_constraint,
- .get_alternatives = power5p_get_alternatives,
- .disable_pmc = power5p_disable_pmc,
- .limited_pmc_event = power5p_limited_pmc_event,
- .flags = PPMU_LIMITED_PMC5_6,
- .n_generic = ARRAY_SIZE(power5p_generic_events),
- .generic_events = power5p_generic_events,
- .cache_events = &power5p_cache_events,
-};
-
-static int init_power5p_pmu(void)
-{
- if (!cur_cpu_spec->oprofile_cpu_type ||
- (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5+")
- && strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5++")))
- return -ENODEV;
-
- return register_power_pmu(&power5p_pmu);
-}
-
-arch_initcall(init_power5p_pmu);
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
deleted file mode 100644
index 98b6a729a9d..00000000000
--- a/arch/powerpc/kernel/power5-pmu.c
+++ /dev/null
@@ -1,624 +0,0 @@
-/*
- * Performance counter support for POWER5 (not POWER5++) processors.
- *
- * Copyright 2009 Paul Mackerras, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/kernel.h>
-#include <linux/perf_event.h>
-#include <linux/string.h>
-#include <asm/reg.h>
-#include <asm/cputable.h>
-
-/*
- * Bits in event code for POWER5 (not POWER5++)
- */
-#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
-#define PM_PMC_MSK 0xf
-#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
-#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */
-#define PM_UNIT_MSK 0xf
-#define PM_BYTE_SH 12 /* Byte number of event bus to use */
-#define PM_BYTE_MSK 7
-#define PM_GRS_SH 8 /* Storage subsystem mux select */
-#define PM_GRS_MSK 7
-#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */
-#define PM_PMCSEL_MSK 0x7f
-
-/* Values in PM_UNIT field */
-#define PM_FPU 0
-#define PM_ISU0 1
-#define PM_IFU 2
-#define PM_ISU1 3
-#define PM_IDU 4
-#define PM_ISU0_ALT 6
-#define PM_GRS 7
-#define PM_LSU0 8
-#define PM_LSU1 0xc
-#define PM_LASTUNIT 0xc
-
-/*
- * Bits in MMCR1 for POWER5
- */
-#define MMCR1_TTM0SEL_SH 62
-#define MMCR1_TTM1SEL_SH 60
-#define MMCR1_TTM2SEL_SH 58
-#define MMCR1_TTM3SEL_SH 56
-#define MMCR1_TTMSEL_MSK 3
-#define MMCR1_TD_CP_DBG0SEL_SH 54
-#define MMCR1_TD_CP_DBG1SEL_SH 52
-#define MMCR1_TD_CP_DBG2SEL_SH 50
-#define MMCR1_TD_CP_DBG3SEL_SH 48
-#define MMCR1_GRS_L2SEL_SH 46
-#define MMCR1_GRS_L2SEL_MSK 3
-#define MMCR1_GRS_L3SEL_SH 44
-#define MMCR1_GRS_L3SEL_MSK 3
-#define MMCR1_GRS_MCSEL_SH 41
-#define MMCR1_GRS_MCSEL_MSK 7
-#define MMCR1_GRS_FABSEL_SH 39
-#define MMCR1_GRS_FABSEL_MSK 3
-#define MMCR1_PMC1_ADDER_SEL_SH 35
-#define MMCR1_PMC2_ADDER_SEL_SH 34
-#define MMCR1_PMC3_ADDER_SEL_SH 33
-#define MMCR1_PMC4_ADDER_SEL_SH 32
-#define MMCR1_PMC1SEL_SH 25
-#define MMCR1_PMC2SEL_SH 17
-#define MMCR1_PMC3SEL_SH 9
-#define MMCR1_PMC4SEL_SH 1
-#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
-#define MMCR1_PMCSEL_MSK 0x7f
-
-/*
- * Layout of constraint bits:
- * 6666555555555544444444443333333333222222222211111111110000000000
- * 3210987654321098765432109876543210987654321098765432109876543210
- * <><>[ ><><>< ><> [ >[ >[ >< >< >< >< ><><><><><><>
- * T0T1 NC G0G1G2 G3 UC PS1PS2 B0 B1 B2 B3 P6P5P4P3P2P1
- *
- * T0 - TTM0 constraint
- * 54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000
- *
- * T1 - TTM1 constraint
- * 52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000
- *
- * NC - number of counters
- * 51: NC error 0x0008_0000_0000_0000
- * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
- *
- * G0..G3 - GRS mux constraints
- * 46-47: GRS_L2SEL value
- * 44-45: GRS_L3SEL value
- * 41-44: GRS_MCSEL value
- * 39-40: GRS_FABSEL value
- * Note that these match up with their bit positions in MMCR1
- *
- * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
- * 37: UC3 error 0x20_0000_0000
- * 36: FPU|IFU|ISU1 events needed 0x10_0000_0000
- * 35: ISU0 events needed 0x08_0000_0000
- * 34: IDU|GRS events needed 0x04_0000_0000
- *
- * PS1
- * 33: PS1 error 0x2_0000_0000
- * 31-32: count of events needing PMC1/2 0x1_8000_0000
- *
- * PS2
- * 30: PS2 error 0x4000_0000
- * 28-29: count of events needing PMC3/4 0x3000_0000
- *
- * B0
- * 24-27: Byte 0 event source 0x0f00_0000
- * Encoding as for the event code
- *
- * B1, B2, B3
- * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
- *
- * P1..P6
- * 0-11: Count of events needing PMC1..PMC6
- */
-
-static const int grsel_shift[8] = {
- MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
- MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
- MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
-};
-
-/* Masks and values for using events from the various units */
-static unsigned long unit_cons[PM_LASTUNIT+1][2] = {
- [PM_FPU] = { 0xc0002000000000ul, 0x00001000000000ul },
- [PM_ISU0] = { 0x00002000000000ul, 0x00000800000000ul },
- [PM_ISU1] = { 0xc0002000000000ul, 0xc0001000000000ul },
- [PM_IFU] = { 0xc0002000000000ul, 0x80001000000000ul },
- [PM_IDU] = { 0x30002000000000ul, 0x00000400000000ul },
- [PM_GRS] = { 0x30002000000000ul, 0x30000400000000ul },
-};
-
-static int power5_get_constraint(u64 event, unsigned long *maskp,
- unsigned long *valp)
-{
- int pmc, byte, unit, sh;
- int bit, fmask;
- unsigned long mask = 0, value = 0;
- int grp = -1;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc) {
- if (pmc > 6)
- return -1;
- sh = (pmc - 1) * 2;
- mask |= 2 << sh;
- value |= 1 << sh;
- if (pmc <= 4)
- grp = (pmc - 1) >> 1;
- else if (event != 0x500009 && event != 0x600005)
- return -1;
- }
- if (event & PM_BUSEVENT_MSK) {
- unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
- if (unit > PM_LASTUNIT)
- return -1;
- if (unit == PM_ISU0_ALT)
- unit = PM_ISU0;
- mask |= unit_cons[unit][0];
- value |= unit_cons[unit][1];
- byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
- if (byte >= 4) {
- if (unit != PM_LSU1)
- return -1;
- /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
- ++unit;
- byte &= 3;
- }
- if (unit == PM_GRS) {
- bit = event & 7;
- fmask = (bit == 6)? 7: 3;
- sh = grsel_shift[bit];
- mask |= (unsigned long)fmask << sh;
- value |= (unsigned long)((event >> PM_GRS_SH) & fmask)
- << sh;
- }
- /*
- * Bus events on bytes 0 and 2 can be counted
- * on PMC1/2; bytes 1 and 3 on PMC3/4.
- */
- if (!pmc)
- grp = byte & 1;
- /* Set byte lane select field */
- mask |= 0xfUL << (24 - 4 * byte);
- value |= (unsigned long)unit << (24 - 4 * byte);
- }
- if (grp == 0) {
- /* increment PMC1/2 field */
- mask |= 0x200000000ul;
- value |= 0x080000000ul;
- } else if (grp == 1) {
- /* increment PMC3/4 field */
- mask |= 0x40000000ul;
- value |= 0x10000000ul;
- }
- if (pmc < 5) {
- /* need a counter from PMC1-4 set */
- mask |= 0x8000000000000ul;
- value |= 0x1000000000000ul;
- }
- *maskp = mask;
- *valp = value;
- return 0;
-}
-
-#define MAX_ALT 3 /* at most 3 alternatives for any event */
-
-static const unsigned int event_alternatives[][MAX_ALT] = {
- { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */
- { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */
- { 0x100005, 0x600005 }, /* PM_RUN_CYC */
- { 0x100009, 0x200009, 0x500009 }, /* PM_INST_CMPL */
- { 0x300009, 0x400009 }, /* PM_INST_DISP */
-};
-
-/*
- * Scan the alternatives table for a match and return the
- * index into the alternatives table if found, else -1.
- */
-static int find_alternative(u64 event)
-{
- int i, j;
-
- for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
- if (event < event_alternatives[i][0])
- break;
- for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
- if (event == event_alternatives[i][j])
- return i;
- }
- return -1;
-}
-
-static const unsigned char bytedecode_alternatives[4][4] = {
- /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 },
- /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e },
- /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 },
- /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e }
-};
-
-/*
- * Some direct events for decodes of event bus byte 3 have alternative
- * PMCSEL values on other counters. This returns the alternative
- * event code for those that do, or -1 otherwise.
- */
-static s64 find_alternative_bdecode(u64 event)
-{
- int pmc, altpmc, pp, j;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc == 0 || pmc > 4)
- return -1;
- altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */
- pp = event & PM_PMCSEL_MSK;
- for (j = 0; j < 4; ++j) {
- if (bytedecode_alternatives[pmc - 1][j] == pp) {
- return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
- (altpmc << PM_PMC_SH) |
- bytedecode_alternatives[altpmc - 1][j];
- }
- }
- return -1;
-}
-
-static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[])
-{
- int i, j, nalt = 1;
- s64 ae;
-
- alt[0] = event;
- nalt = 1;
- i = find_alternative(event);
- if (i >= 0) {
- for (j = 0; j < MAX_ALT; ++j) {
- ae = event_alternatives[i][j];
- if (ae && ae != event)
- alt[nalt++] = ae;
- }
- } else {
- ae = find_alternative_bdecode(event);
- if (ae > 0)
- alt[nalt++] = ae;
- }
- return nalt;
-}
-
-/*
- * Map of which direct events on which PMCs are marked instruction events.
- * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
- * Bit 0 is set if it is marked for all PMCs.
- * The 0x80 bit indicates a byte decode PMCSEL value.
- */
-static unsigned char direct_event_is_marked[0x28] = {
- 0, /* 00 */
- 0x1f, /* 01 PM_IOPS_CMPL */
- 0x2, /* 02 PM_MRK_GRP_DISP */
- 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
- 0, /* 04 */
- 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
- 0x80, /* 06 */
- 0x80, /* 07 */
- 0, 0, 0,/* 08 - 0a */
- 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
- 0, /* 0c */
- 0x80, /* 0d */
- 0x80, /* 0e */
- 0, /* 0f */
- 0, /* 10 */
- 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
- 0, /* 12 */
- 0x10, /* 13 PM_MRK_GRP_CMPL */
- 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
- 0x2, /* 15 PM_MRK_GRP_ISSUED */
- 0x80, /* 16 */
- 0x80, /* 17 */
- 0, 0, 0, 0, 0,
- 0x80, /* 1d */
- 0x80, /* 1e */
- 0, /* 1f */
- 0x80, /* 20 */
- 0x80, /* 21 */
- 0x80, /* 22 */
- 0x80, /* 23 */
- 0x80, /* 24 */
- 0x80, /* 25 */
- 0x80, /* 26 */
- 0x80, /* 27 */
-};
-
-/*
- * Returns 1 if event counts things relating to marked instructions
- * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
- */
-static int power5_marked_instr_event(u64 event)
-{
- int pmc, psel;
- int bit, byte, unit;
- u32 mask;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- psel = event & PM_PMCSEL_MSK;
- if (pmc >= 5)
- return 0;
-
- bit = -1;
- if (psel < sizeof(direct_event_is_marked)) {
- if (direct_event_is_marked[psel] & (1 << pmc))
- return 1;
- if (direct_event_is_marked[psel] & 0x80)
- bit = 4;
- else if (psel == 0x08)
- bit = pmc - 1;
- else if (psel == 0x10)
- bit = 4 - pmc;
- else if (psel == 0x1b && (pmc == 1 || pmc == 3))
- bit = 4;
- } else if ((psel & 0x58) == 0x40)
- bit = psel & 7;
-
- if (!(event & PM_BUSEVENT_MSK))
- return 0;
-
- byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
- unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
- if (unit == PM_LSU0) {
- /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
- mask = 0x5dff00;
- } else if (unit == PM_LSU1 && byte >= 4) {
- byte -= 4;
- /* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */
- mask = 0x5f00c0aa;
- } else
- return 0;
-
- return (mask >> (byte * 8 + bit)) & 1;
-}
-
-static int power5_compute_mmcr(u64 event[], int n_ev,
- unsigned int hwc[], unsigned long mmcr[])
-{
- unsigned long mmcr1 = 0;
- unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
- unsigned int pmc, unit, byte, psel;
- unsigned int ttm, grp;
- int i, isbus, bit, grsel;
- unsigned int pmc_inuse = 0;
- unsigned int pmc_grp_use[2];
- unsigned char busbyte[4];
- unsigned char unituse[16];
- int ttmuse;
-
- if (n_ev > 6)
- return -1;
-
- /* First pass to count resource use */
- pmc_grp_use[0] = pmc_grp_use[1] = 0;
- memset(busbyte, 0, sizeof(busbyte));
- memset(unituse, 0, sizeof(unituse));
- for (i = 0; i < n_ev; ++i) {
- pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc) {
- if (pmc > 6)
- return -1;
- if (pmc_inuse & (1 << (pmc - 1)))
- return -1;
- pmc_inuse |= 1 << (pmc - 1);
- /* count 1/2 vs 3/4 use */
- if (pmc <= 4)
- ++pmc_grp_use[(pmc - 1) >> 1];
- }
- if (event[i] & PM_BUSEVENT_MSK) {
- unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
- byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
- if (unit > PM_LASTUNIT)
- return -1;
- if (unit == PM_ISU0_ALT)
- unit = PM_ISU0;
- if (byte >= 4) {
- if (unit != PM_LSU1)
- return -1;
- ++unit;
- byte &= 3;
- }
- if (!pmc)
- ++pmc_grp_use[byte & 1];
- if (busbyte[byte] && busbyte[byte] != unit)
- return -1;
- busbyte[byte] = unit;
- unituse[unit] = 1;
- }
- }
- if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2)
- return -1;
-
- /*
- * Assign resources and set multiplexer selects.
- *
- * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
- * choice we have to deal with.
- */
- if (unituse[PM_ISU0] &
- (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
- unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */
- unituse[PM_ISU0] = 0;
- }
- /* Set TTM[01]SEL fields. */
- ttmuse = 0;
- for (i = PM_FPU; i <= PM_ISU1; ++i) {
- if (!unituse[i])
- continue;
- if (ttmuse++)
- return -1;
- mmcr1 |= (unsigned long)i << MMCR1_TTM0SEL_SH;
- }
- ttmuse = 0;
- for (; i <= PM_GRS; ++i) {
- if (!unituse[i])
- continue;
- if (ttmuse++)
- return -1;
- mmcr1 |= (unsigned long)(i & 3) << MMCR1_TTM1SEL_SH;
- }
- if (ttmuse > 1)
- return -1;
-
- /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
- for (byte = 0; byte < 4; ++byte) {
- unit = busbyte[byte];
- if (!unit)
- continue;
- if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
- /* get ISU0 through TTM1 rather than TTM0 */
- unit = PM_ISU0_ALT;
- } else if (unit == PM_LSU1 + 1) {
- /* select lower word of LSU1 for this byte */
- mmcr1 |= 1ul << (MMCR1_TTM3SEL_SH + 3 - byte);
- }
- ttm = unit >> 2;
- mmcr1 |= (unsigned long)ttm
- << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
- }
-
- /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
- for (i = 0; i < n_ev; ++i) {
- pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
- unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
- byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
- psel = event[i] & PM_PMCSEL_MSK;
- isbus = event[i] & PM_BUSEVENT_MSK;
- if (!pmc) {
- /* Bus event or any-PMC direct event */
- for (pmc = 0; pmc < 4; ++pmc) {
- if (pmc_inuse & (1 << pmc))
- continue;
- grp = (pmc >> 1) & 1;
- if (isbus) {
- if (grp == (byte & 1))
- break;
- } else if (pmc_grp_use[grp] < 2) {
- ++pmc_grp_use[grp];
- break;
- }
- }
- pmc_inuse |= 1 << pmc;
- } else if (pmc <= 4) {
- /* Direct event */
- --pmc;
- if ((psel == 8 || psel == 0x10) && isbus && (byte & 2))
- /* add events on higher-numbered bus */
- mmcr1 |= 1ul << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
- } else {
- /* Instructions or run cycles on PMC5/6 */
- --pmc;
- }
- if (isbus && unit == PM_GRS) {
- bit = psel & 7;
- grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
- mmcr1 |= (unsigned long)grsel << grsel_shift[bit];
- }
- if (power5_marked_instr_event(event[i]))
- mmcra |= MMCRA_SAMPLE_ENABLE;
- if (pmc <= 3)
- mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
- hwc[i] = pmc;
- }
-
- /* Return MMCRx values */
- mmcr[0] = 0;
- if (pmc_inuse & 1)
- mmcr[0] = MMCR0_PMC1CE;
- if (pmc_inuse & 0x3e)
- mmcr[0] |= MMCR0_PMCjCE;
- mmcr[1] = mmcr1;
- mmcr[2] = mmcra;
- return 0;
-}
-
-static void power5_disable_pmc(unsigned int pmc, unsigned long mmcr[])
-{
- if (pmc <= 3)
- mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
-}
-
-static int power5_generic_events[] = {
- [PERF_COUNT_HW_CPU_CYCLES] = 0xf,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x100009,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */
- [PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */
-};
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-/*
- * Table of generalized cache-related events.
- * 0 means not supported, -1 means nonsensical, other values
- * are event codes.
- */
-static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
- [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x4c1090, 0x3c1088 },
- [C(OP_WRITE)] = { 0x3c1090, 0xc10c3 },
- [C(OP_PREFETCH)] = { 0xc70e7, 0 },
- },
- [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { 0, 0 },
- },
- [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x3c309b },
- [C(OP_WRITE)] = { 0, 0 },
- [C(OP_PREFETCH)] = { 0xc50c3, 0 },
- },
- [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x2c4090, 0x800c4 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x800c0 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x230e4, 0x230e5 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
-};
-
-static struct power_pmu power5_pmu = {
- .name = "POWER5",
- .n_counter = 6,
- .max_alternatives = MAX_ALT,
- .add_fields = 0x7000090000555ul,
- .test_adder = 0x3000490000000ul,
- .compute_mmcr = power5_compute_mmcr,
- .get_constraint = power5_get_constraint,
- .get_alternatives = power5_get_alternatives,
- .disable_pmc = power5_disable_pmc,
- .n_generic = ARRAY_SIZE(power5_generic_events),
- .generic_events = power5_generic_events,
- .cache_events = &power5_cache_events,
-};
-
-static int init_power5_pmu(void)
-{
- if (!cur_cpu_spec->oprofile_cpu_type ||
- strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5"))
- return -ENODEV;
-
- return register_power_pmu(&power5_pmu);
-}
-
-arch_initcall(init_power5_pmu);
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
deleted file mode 100644
index 84a607bda8f..00000000000
--- a/arch/powerpc/kernel/power6-pmu.c
+++ /dev/null
@@ -1,547 +0,0 @@
-/*
- * Performance counter support for POWER6 processors.
- *
- * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/kernel.h>
-#include <linux/perf_event.h>
-#include <linux/string.h>
-#include <asm/reg.h>
-#include <asm/cputable.h>
-
-/*
- * Bits in event code for POWER6
- */
-#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
-#define PM_PMC_MSK 0x7
-#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
-#define PM_UNIT_SH 16 /* Unit event comes (TTMxSEL encoding) */
-#define PM_UNIT_MSK 0xf
-#define PM_UNIT_MSKS (PM_UNIT_MSK << PM_UNIT_SH)
-#define PM_LLAV 0x8000 /* Load lookahead match value */
-#define PM_LLA 0x4000 /* Load lookahead match enable */
-#define PM_BYTE_SH 12 /* Byte of event bus to use */
-#define PM_BYTE_MSK 3
-#define PM_SUBUNIT_SH 8 /* Subunit event comes from (NEST_SEL enc.) */
-#define PM_SUBUNIT_MSK 7
-#define PM_SUBUNIT_MSKS (PM_SUBUNIT_MSK << PM_SUBUNIT_SH)
-#define PM_PMCSEL_MSK 0xff /* PMCxSEL value */
-#define PM_BUSEVENT_MSK 0xf3700
-
-/*
- * Bits in MMCR1 for POWER6
- */
-#define MMCR1_TTM0SEL_SH 60
-#define MMCR1_TTMSEL_SH(n) (MMCR1_TTM0SEL_SH - (n) * 4)
-#define MMCR1_TTMSEL_MSK 0xf
-#define MMCR1_TTMSEL(m, n) (((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK)
-#define MMCR1_NESTSEL_SH 45
-#define MMCR1_NESTSEL_MSK 0x7
-#define MMCR1_NESTSEL(m) (((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)
-#define MMCR1_PMC1_LLA (1ul << 44)
-#define MMCR1_PMC1_LLA_VALUE (1ul << 39)
-#define MMCR1_PMC1_ADDR_SEL (1ul << 35)
-#define MMCR1_PMC1SEL_SH 24
-#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
-#define MMCR1_PMCSEL_MSK 0xff
-
-/*
- * Map of which direct events on which PMCs are marked instruction events.
- * Indexed by PMCSEL value >> 1.
- * Bottom 4 bits are a map of which PMCs are interesting,
- * top 4 bits say what sort of event:
- * 0 = direct marked event,
- * 1 = byte decode event,
- * 4 = add/and event (PMC1 -> bits 0 & 4),
- * 5 = add/and event (PMC1 -> bits 1 & 5),
- * 6 = add/and event (PMC1 -> bits 2 & 6),
- * 7 = add/and event (PMC1 -> bits 3 & 7).
- */
-static unsigned char direct_event_is_marked[0x60 >> 1] = {
- 0, /* 00 */
- 0, /* 02 */
- 0, /* 04 */
- 0x07, /* 06 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
- 0x04, /* 08 PM_MRK_DFU_FIN */
- 0x06, /* 0a PM_MRK_IFU_FIN, PM_MRK_INST_FIN */
- 0, /* 0c */
- 0, /* 0e */
- 0x02, /* 10 PM_MRK_INST_DISP */
- 0x08, /* 12 PM_MRK_LSU_DERAT_MISS */
- 0, /* 14 */
- 0, /* 16 */
- 0x0c, /* 18 PM_THRESH_TIMEO, PM_MRK_INST_FIN */
- 0x0f, /* 1a PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */
- 0x01, /* 1c PM_MRK_INST_ISSUED */
- 0, /* 1e */
- 0, /* 20 */
- 0, /* 22 */
- 0, /* 24 */
- 0, /* 26 */
- 0x15, /* 28 PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */
- 0, /* 2a */
- 0, /* 2c */
- 0, /* 2e */
- 0x4f, /* 30 */
- 0x7f, /* 32 */
- 0x4f, /* 34 */
- 0x5f, /* 36 */
- 0x6f, /* 38 */
- 0x4f, /* 3a */
- 0, /* 3c */
- 0x08, /* 3e PM_MRK_INST_TIMEO */
- 0x1f, /* 40 */
- 0x1f, /* 42 */
- 0x1f, /* 44 */
- 0x1f, /* 46 */
- 0x1f, /* 48 */
- 0x1f, /* 4a */
- 0x1f, /* 4c */
- 0x1f, /* 4e */
- 0, /* 50 */
- 0x05, /* 52 PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */
- 0x1c, /* 54 PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */
- 0x02, /* 56 PM_MRK_LD_MISS_L1 */
- 0, /* 58 */
- 0, /* 5a */
- 0, /* 5c */
- 0, /* 5e */
-};
-
-/*
- * Masks showing for each unit which bits are marked events.
- * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0.
- */
-static u32 marked_bus_events[16] = {
- 0x01000000, /* direct events set 1: byte 3 bit 0 */
- 0x00010000, /* direct events set 2: byte 2 bit 0 */
- 0, 0, 0, 0, /* IDU, IFU, nest: nothing */
- 0x00000088, /* VMX set 1: byte 0 bits 3, 7 */
- 0x000000c0, /* VMX set 2: byte 0 bits 4-7 */
- 0x04010000, /* LSU set 1: byte 2 bit 0, byte 3 bit 2 */
- 0xff010000u, /* LSU set 2: byte 2 bit 0, all of byte 3 */
- 0, /* LSU set 3 */
- 0x00000010, /* VMX set 3: byte 0 bit 4 */
- 0, /* BFP set 1 */
- 0x00000022, /* BFP set 2: byte 0 bits 1, 5 */
- 0, 0
-};
-
-/*
- * Returns 1 if event counts things relating to marked instructions
- * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
- */
-static int power6_marked_instr_event(u64 event)
-{
- int pmc, psel, ptype;
- int bit, byte, unit;
- u32 mask;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- psel = (event & PM_PMCSEL_MSK) >> 1; /* drop edge/level bit */
- if (pmc >= 5)
- return 0;
-
- bit = -1;
- if (psel < sizeof(direct_event_is_marked)) {
- ptype = direct_event_is_marked[psel];
- if (pmc == 0 || !(ptype & (1 << (pmc - 1))))
- return 0;
- ptype >>= 4;
- if (ptype == 0)
- return 1;
- if (ptype == 1)
- bit = 0;
- else
- bit = ptype ^ (pmc - 1);
- } else if ((psel & 0x48) == 0x40)
- bit = psel & 7;
-
- if (!(event & PM_BUSEVENT_MSK) || bit == -1)
- return 0;
-
- byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
- unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
- mask = marked_bus_events[unit];
- return (mask >> (byte * 8 + bit)) & 1;
-}
-
-/*
- * Assign PMC numbers and compute MMCR1 value for a set of events
- */
-static int p6_compute_mmcr(u64 event[], int n_ev,
- unsigned int hwc[], unsigned long mmcr[])
-{
- unsigned long mmcr1 = 0;
- unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
- int i;
- unsigned int pmc, ev, b, u, s, psel;
- unsigned int ttmset = 0;
- unsigned int pmc_inuse = 0;
-
- if (n_ev > 6)
- return -1;
- for (i = 0; i < n_ev; ++i) {
- pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc) {
- if (pmc_inuse & (1 << (pmc - 1)))
- return -1; /* collision! */
- pmc_inuse |= 1 << (pmc - 1);
- }
- }
- for (i = 0; i < n_ev; ++i) {
- ev = event[i];
- pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc) {
- --pmc;
- } else {
- /* can go on any PMC; find a free one */
- for (pmc = 0; pmc < 4; ++pmc)
- if (!(pmc_inuse & (1 << pmc)))
- break;
- if (pmc >= 4)
- return -1;
- pmc_inuse |= 1 << pmc;
- }
- hwc[i] = pmc;
- psel = ev & PM_PMCSEL_MSK;
- if (ev & PM_BUSEVENT_MSK) {
- /* this event uses the event bus */
- b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK;
- u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK;
- /* check for conflict on this byte of event bus */
- if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
- return -1;
- mmcr1 |= (unsigned long)u << MMCR1_TTMSEL_SH(b);
- ttmset |= 1 << b;
- if (u == 5) {
- /* Nest events have a further mux */
- s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
- if ((ttmset & 0x10) &&
- MMCR1_NESTSEL(mmcr1) != s)
- return -1;
- ttmset |= 0x10;
- mmcr1 |= (unsigned long)s << MMCR1_NESTSEL_SH;
- }
- if (0x30 <= psel && psel <= 0x3d) {
- /* these need the PMCx_ADDR_SEL bits */
- if (b >= 2)
- mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc;
- }
- /* bus select values are different for PMC3/4 */
- if (pmc >= 2 && (psel & 0x90) == 0x80)
- psel ^= 0x20;
- }
- if (ev & PM_LLA) {
- mmcr1 |= MMCR1_PMC1_LLA >> pmc;
- if (ev & PM_LLAV)
- mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
- }
- if (power6_marked_instr_event(event[i]))
- mmcra |= MMCRA_SAMPLE_ENABLE;
- if (pmc < 4)
- mmcr1 |= (unsigned long)psel << MMCR1_PMCSEL_SH(pmc);
- }
- mmcr[0] = 0;
- if (pmc_inuse & 1)
- mmcr[0] = MMCR0_PMC1CE;
- if (pmc_inuse & 0xe)
- mmcr[0] |= MMCR0_PMCjCE;
- mmcr[1] = mmcr1;
- mmcr[2] = mmcra;
- return 0;
-}
-
-/*
- * Layout of constraint bits:
- *
- * 0-1 add field: number of uses of PMC1 (max 1)
- * 2-3, 4-5, 6-7, 8-9, 10-11: ditto for PMC2, 3, 4, 5, 6
- * 12-15 add field: number of uses of PMC1-4 (max 4)
- * 16-19 select field: unit on byte 0 of event bus
- * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
- * 32-34 select field: nest (subunit) event selector
- */
-static int p6_get_constraint(u64 event, unsigned long *maskp,
- unsigned long *valp)
-{
- int pmc, byte, sh, subunit;
- unsigned long mask = 0, value = 0;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc) {
- if (pmc > 4 && !(event == 0x500009 || event == 0x600005))
- return -1;
- sh = (pmc - 1) * 2;
- mask |= 2 << sh;
- value |= 1 << sh;
- }
- if (event & PM_BUSEVENT_MSK) {
- byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
- sh = byte * 4 + (16 - PM_UNIT_SH);
- mask |= PM_UNIT_MSKS << sh;
- value |= (unsigned long)(event & PM_UNIT_MSKS) << sh;
- if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
- subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
- mask |= (unsigned long)PM_SUBUNIT_MSK << 32;
- value |= (unsigned long)subunit << 32;
- }
- }
- if (pmc <= 4) {
- mask |= 0x8000; /* add field for count of PMC1-4 uses */
- value |= 0x1000;
- }
- *maskp = mask;
- *valp = value;
- return 0;
-}
-
-static int p6_limited_pmc_event(u64 event)
-{
- int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
-
- return pmc == 5 || pmc == 6;
-}
-
-#define MAX_ALT 4 /* at most 4 alternatives for any event */
-
-static const unsigned int event_alternatives[][MAX_ALT] = {
- { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */
- { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
- { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */
- { 0x10000a, 0x2000f4, 0x600005 }, /* PM_RUN_CYC */
- { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */
- { 0x10000e, 0x400010 }, /* PM_PURR */
- { 0x100010, 0x4000f8 }, /* PM_FLUSH */
- { 0x10001a, 0x200010 }, /* PM_MRK_INST_DISP */
- { 0x100026, 0x3000f8 }, /* PM_TB_BIT_TRANS */
- { 0x100054, 0x2000f0 }, /* PM_ST_FIN */
- { 0x100056, 0x2000fc }, /* PM_L1_ICACHE_MISS */
- { 0x1000f0, 0x40000a }, /* PM_INST_IMC_MATCH_CMPL */
- { 0x1000f8, 0x200008 }, /* PM_GCT_EMPTY_CYC */
- { 0x1000fc, 0x400006 }, /* PM_LSU_DERAT_MISS_CYC */
- { 0x20000e, 0x400007 }, /* PM_LSU_DERAT_MISS */
- { 0x200012, 0x300012 }, /* PM_INST_DISP */
- { 0x2000f2, 0x3000f2 }, /* PM_INST_DISP */
- { 0x2000f8, 0x300010 }, /* PM_EXT_INT */
- { 0x2000fe, 0x300056 }, /* PM_DATA_FROM_L2MISS */
- { 0x2d0030, 0x30001a }, /* PM_MRK_FPU_FIN */
- { 0x30000a, 0x400018 }, /* PM_MRK_INST_FIN */
- { 0x3000f6, 0x40000e }, /* PM_L1_DCACHE_RELOAD_VALID */
- { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */
-};
-
-/*
- * This could be made more efficient with a binary search on
- * a presorted list, if necessary
- */
-static int find_alternatives_list(u64 event)
-{
- int i, j;
- unsigned int alt;
-
- for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
- if (event < event_alternatives[i][0])
- return -1;
- for (j = 0; j < MAX_ALT; ++j) {
- alt = event_alternatives[i][j];
- if (!alt || event < alt)
- break;
- if (event == alt)
- return i;
- }
- }
- return -1;
-}
-
-static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[])
-{
- int i, j, nlim;
- unsigned int psel, pmc;
- unsigned int nalt = 1;
- u64 aevent;
-
- alt[0] = event;
- nlim = p6_limited_pmc_event(event);
-
- /* check the alternatives table */
- i = find_alternatives_list(event);
- if (i >= 0) {
- /* copy out alternatives from list */
- for (j = 0; j < MAX_ALT; ++j) {
- aevent = event_alternatives[i][j];
- if (!aevent)
- break;
- if (aevent != event)
- alt[nalt++] = aevent;
- nlim += p6_limited_pmc_event(aevent);
- }
-
- } else {
- /* Check for alternative ways of computing sum events */
- /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */
- psel = event & (PM_PMCSEL_MSK & ~1); /* ignore edge bit */
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc && (psel == 0x32 || psel == 0x34))
- alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) |
- ((5 - pmc) << PM_PMC_SH);
-
- /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */
- if (pmc && (psel == 0x38 || psel == 0x3a))
- alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) |
- ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
- }
-
- if (flags & PPMU_ONLY_COUNT_RUN) {
- /*
- * We're only counting in RUN state,
- * so PM_CYC is equivalent to PM_RUN_CYC,
- * PM_INST_CMPL === PM_RUN_INST_CMPL, PM_PURR === PM_RUN_PURR.
- * This doesn't include alternatives that don't provide
- * any extra flexibility in assigning PMCs (e.g.
- * 0x10000a for PM_RUN_CYC vs. 0x1e for PM_CYC).
- * Note that even with these additional alternatives
- * we never end up with more than 4 alternatives for any event.
- */
- j = nalt;
- for (i = 0; i < nalt; ++i) {
- switch (alt[i]) {
- case 0x1e: /* PM_CYC */
- alt[j++] = 0x600005; /* PM_RUN_CYC */
- ++nlim;
- break;
- case 0x10000a: /* PM_RUN_CYC */
- alt[j++] = 0x1e; /* PM_CYC */
- break;
- case 2: /* PM_INST_CMPL */
- alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */
- ++nlim;
- break;
- case 0x500009: /* PM_RUN_INST_CMPL */
- alt[j++] = 2; /* PM_INST_CMPL */
- break;
- case 0x10000e: /* PM_PURR */
- alt[j++] = 0x4000f4; /* PM_RUN_PURR */
- break;
- case 0x4000f4: /* PM_RUN_PURR */
- alt[j++] = 0x10000e; /* PM_PURR */
- break;
- }
- }
- nalt = j;
- }
-
- if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
- /* remove the limited PMC events */
- j = 0;
- for (i = 0; i < nalt; ++i) {
- if (!p6_limited_pmc_event(alt[i])) {
- alt[j] = alt[i];
- ++j;
- }
- }
- nalt = j;
- } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
- /* remove all but the limited PMC events */
- j = 0;
- for (i = 0; i < nalt; ++i) {
- if (p6_limited_pmc_event(alt[i])) {
- alt[j] = alt[i];
- ++j;
- }
- }
- nalt = j;
- }
-
- return nalt;
-}
-
-static void p6_disable_pmc(unsigned int pmc, unsigned long mmcr[])
-{
- /* Set PMCxSEL to 0 to disable PMCx */
- if (pmc <= 3)
- mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
-}
-
-static int power6_generic_events[] = {
- [PERF_COUNT_HW_CPU_CYCLES] = 0x1e,
- [PERF_COUNT_HW_INSTRUCTIONS] = 2,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */
- [PERF_COUNT_HW_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x400052, /* BR_MPRED */
-};
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-/*
- * Table of generalized cache-related events.
- * 0 means not supported, -1 means nonsensical, other values
- * are event codes.
- * The "DTLB" and "ITLB" events relate to the DERAT and IERAT.
- */
-static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
- [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x80082, 0x80080 },
- [C(OP_WRITE)] = { 0x80086, 0x80088 },
- [C(OP_PREFETCH)] = { 0x810a4, 0 },
- },
- [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x100056 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { 0x4008c, 0 },
- },
- [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x150730, 0x250532 },
- [C(OP_WRITE)] = { 0x250432, 0x150432 },
- [C(OP_PREFETCH)] = { 0x810a6, 0 },
- },
- [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x20000e },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x420ce },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x430e6, 0x400052 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
-};
-
-static struct power_pmu power6_pmu = {
- .name = "POWER6",
- .n_counter = 6,
- .max_alternatives = MAX_ALT,
- .add_fields = 0x1555,
- .test_adder = 0x3000,
- .compute_mmcr = p6_compute_mmcr,
- .get_constraint = p6_get_constraint,
- .get_alternatives = p6_get_alternatives,
- .disable_pmc = p6_disable_pmc,
- .limited_pmc_event = p6_limited_pmc_event,
- .flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
- .n_generic = ARRAY_SIZE(power6_generic_events),
- .generic_events = power6_generic_events,
- .cache_events = &power6_cache_events,
-};
-
-static int init_power6_pmu(void)
-{
- if (!cur_cpu_spec->oprofile_cpu_type ||
- strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power6"))
- return -ENODEV;
-
- return register_power_pmu(&power6_pmu);
-}
-
-arch_initcall(init_power6_pmu);
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
deleted file mode 100644
index 852f7b7f6b4..00000000000
--- a/arch/powerpc/kernel/power7-pmu.c
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Performance counter support for POWER7 processors.
- *
- * Copyright 2009 Paul Mackerras, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/kernel.h>
-#include <linux/perf_event.h>
-#include <linux/string.h>
-#include <asm/reg.h>
-#include <asm/cputable.h>
-
-/*
- * Bits in event code for POWER7
- */
-#define PM_PMC_SH 16 /* PMC number (1-based) for direct events */
-#define PM_PMC_MSK 0xf
-#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
-#define PM_UNIT_SH 12 /* TTMMUX number and setting - unit select */
-#define PM_UNIT_MSK 0xf
-#define PM_COMBINE_SH 11 /* Combined event bit */
-#define PM_COMBINE_MSK 1
-#define PM_COMBINE_MSKS 0x800
-#define PM_L2SEL_SH 8 /* L2 event select */
-#define PM_L2SEL_MSK 7
-#define PM_PMCSEL_MSK 0xff
-
-/*
- * Bits in MMCR1 for POWER7
- */
-#define MMCR1_TTM0SEL_SH 60
-#define MMCR1_TTM1SEL_SH 56
-#define MMCR1_TTM2SEL_SH 52
-#define MMCR1_TTM3SEL_SH 48
-#define MMCR1_TTMSEL_MSK 0xf
-#define MMCR1_L2SEL_SH 45
-#define MMCR1_L2SEL_MSK 7
-#define MMCR1_PMC1_COMBINE_SH 35
-#define MMCR1_PMC2_COMBINE_SH 34
-#define MMCR1_PMC3_COMBINE_SH 33
-#define MMCR1_PMC4_COMBINE_SH 32
-#define MMCR1_PMC1SEL_SH 24
-#define MMCR1_PMC2SEL_SH 16
-#define MMCR1_PMC3SEL_SH 8
-#define MMCR1_PMC4SEL_SH 0
-#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
-#define MMCR1_PMCSEL_MSK 0xff
-
-/*
- * Layout of constraint bits:
- * 6666555555555544444444443333333333222222222211111111110000000000
- * 3210987654321098765432109876543210987654321098765432109876543210
- * [ ><><><><><><>
- * NC P6P5P4P3P2P1
- *
- * NC - number of counters
- * 15: NC error 0x8000
- * 12-14: number of events needing PMC1-4 0x7000
- *
- * P6
- * 11: P6 error 0x800
- * 10-11: Count of events needing PMC6
- *
- * P1..P5
- * 0-9: Count of events needing PMC1..PMC5
- */
-
-static int power7_get_constraint(u64 event, unsigned long *maskp,
- unsigned long *valp)
-{
- int pmc, sh;
- unsigned long mask = 0, value = 0;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc) {
- if (pmc > 6)
- return -1;
- sh = (pmc - 1) * 2;
- mask |= 2 << sh;
- value |= 1 << sh;
- if (pmc >= 5 && !(event == 0x500fa || event == 0x600f4))
- return -1;
- }
- if (pmc < 5) {
- /* need a counter from PMC1-4 set */
- mask |= 0x8000;
- value |= 0x1000;
- }
- *maskp = mask;
- *valp = value;
- return 0;
-}
-
-#define MAX_ALT 2 /* at most 2 alternatives for any event */
-
-static const unsigned int event_alternatives[][MAX_ALT] = {
- { 0x200f2, 0x300f2 }, /* PM_INST_DISP */
- { 0x200f4, 0x600f4 }, /* PM_RUN_CYC */
- { 0x400fa, 0x500fa }, /* PM_RUN_INST_CMPL */
-};
-
-/*
- * Scan the alternatives table for a match and return the
- * index into the alternatives table if found, else -1.
- */
-static int find_alternative(u64 event)
-{
- int i, j;
-
- for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
- if (event < event_alternatives[i][0])
- break;
- for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
- if (event == event_alternatives[i][j])
- return i;
- }
- return -1;
-}
-
-static s64 find_alternative_decode(u64 event)
-{
- int pmc, psel;
-
- /* this only handles the 4x decode events */
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- psel = event & PM_PMCSEL_MSK;
- if ((pmc == 2 || pmc == 4) && (psel & ~7) == 0x40)
- return event - (1 << PM_PMC_SH) + 8;
- if ((pmc == 1 || pmc == 3) && (psel & ~7) == 0x48)
- return event + (1 << PM_PMC_SH) - 8;
- return -1;
-}
-
-static int power7_get_alternatives(u64 event, unsigned int flags, u64 alt[])
-{
- int i, j, nalt = 1;
- s64 ae;
-
- alt[0] = event;
- nalt = 1;
- i = find_alternative(event);
- if (i >= 0) {
- for (j = 0; j < MAX_ALT; ++j) {
- ae = event_alternatives[i][j];
- if (ae && ae != event)
- alt[nalt++] = ae;
- }
- } else {
- ae = find_alternative_decode(event);
- if (ae > 0)
- alt[nalt++] = ae;
- }
-
- if (flags & PPMU_ONLY_COUNT_RUN) {
- /*
- * We're only counting in RUN state,
- * so PM_CYC is equivalent to PM_RUN_CYC
- * and PM_INST_CMPL === PM_RUN_INST_CMPL.
- * This doesn't include alternatives that don't provide
- * any extra flexibility in assigning PMCs.
- */
- j = nalt;
- for (i = 0; i < nalt; ++i) {
- switch (alt[i]) {
- case 0x1e: /* PM_CYC */
- alt[j++] = 0x600f4; /* PM_RUN_CYC */
- break;
- case 0x600f4: /* PM_RUN_CYC */
- alt[j++] = 0x1e;
- break;
- case 0x2: /* PM_PPC_CMPL */
- alt[j++] = 0x500fa; /* PM_RUN_INST_CMPL */
- break;
- case 0x500fa: /* PM_RUN_INST_CMPL */
- alt[j++] = 0x2; /* PM_PPC_CMPL */
- break;
- }
- }
- nalt = j;
- }
-
- return nalt;
-}
-
-/*
- * Returns 1 if event counts things relating to marked instructions
- * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
- */
-static int power7_marked_instr_event(u64 event)
-{
- int pmc, psel;
- int unit;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
- psel = event & PM_PMCSEL_MSK & ~1; /* trim off edge/level bit */
- if (pmc >= 5)
- return 0;
-
- switch (psel >> 4) {
- case 2:
- return pmc == 2 || pmc == 4;
- case 3:
- if (psel == 0x3c)
- return pmc == 1;
- if (psel == 0x3e)
- return pmc != 2;
- return 1;
- case 4:
- case 5:
- return unit == 0xd;
- case 6:
- if (psel == 0x64)
- return pmc >= 3;
- case 8:
- return unit == 0xd;
- }
- return 0;
-}
-
-static int power7_compute_mmcr(u64 event[], int n_ev,
- unsigned int hwc[], unsigned long mmcr[])
-{
- unsigned long mmcr1 = 0;
- unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
- unsigned int pmc, unit, combine, l2sel, psel;
- unsigned int pmc_inuse = 0;
- int i;
-
- /* First pass to count resource use */
- for (i = 0; i < n_ev; ++i) {
- pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc) {
- if (pmc > 6)
- return -1;
- if (pmc_inuse & (1 << (pmc - 1)))
- return -1;
- pmc_inuse |= 1 << (pmc - 1);
- }
- }
-
- /* Second pass: assign PMCs, set all MMCR1 fields */
- for (i = 0; i < n_ev; ++i) {
- pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
- unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
- combine = (event[i] >> PM_COMBINE_SH) & PM_COMBINE_MSK;
- l2sel = (event[i] >> PM_L2SEL_SH) & PM_L2SEL_MSK;
- psel = event[i] & PM_PMCSEL_MSK;
- if (!pmc) {
- /* Bus event or any-PMC direct event */
- for (pmc = 0; pmc < 4; ++pmc) {
- if (!(pmc_inuse & (1 << pmc)))
- break;
- }
- if (pmc >= 4)
- return -1;
- pmc_inuse |= 1 << pmc;
- } else {
- /* Direct or decoded event */
- --pmc;
- }
- if (pmc <= 3) {
- mmcr1 |= (unsigned long) unit
- << (MMCR1_TTM0SEL_SH - 4 * pmc);
- mmcr1 |= (unsigned long) combine
- << (MMCR1_PMC1_COMBINE_SH - pmc);
- mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
- if (unit == 6) /* L2 events */
- mmcr1 |= (unsigned long) l2sel
- << MMCR1_L2SEL_SH;
- }
- if (power7_marked_instr_event(event[i]))
- mmcra |= MMCRA_SAMPLE_ENABLE;
- hwc[i] = pmc;
- }
-
- /* Return MMCRx values */
- mmcr[0] = 0;
- if (pmc_inuse & 1)
- mmcr[0] = MMCR0_PMC1CE;
- if (pmc_inuse & 0x3e)
- mmcr[0] |= MMCR0_PMCjCE;
- mmcr[1] = mmcr1;
- mmcr[2] = mmcra;
- return 0;
-}
-
-static void power7_disable_pmc(unsigned int pmc, unsigned long mmcr[])
-{
- if (pmc <= 3)
- mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
-}
-
-static int power7_generic_events[] = {
- [PERF_COUNT_HW_CPU_CYCLES] = 0x1e,
- [PERF_COUNT_HW_INSTRUCTIONS] = 2,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0xc880, /* LD_REF_L1_LSU*/
- [PERF_COUNT_HW_CACHE_MISSES] = 0x400f0, /* LD_MISS_L1 */
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x10068, /* BRU_FIN */
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x400f6, /* BR_MPRED */
-};
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-/*
- * Table of generalized cache-related events.
- * 0 means not supported, -1 means nonsensical, other values
- * are event codes.
- */
-static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
- [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0xc880, 0x400f0 },
- [C(OP_WRITE)] = { 0, 0x300f0 },
- [C(OP_PREFETCH)] = { 0xd8b8, 0 },
- },
- [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x200fc },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { 0x408a, 0 },
- },
- [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x16080, 0x26080 },
- [C(OP_WRITE)] = { 0x16082, 0x26082 },
- [C(OP_PREFETCH)] = { 0, 0 },
- },
- [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x300fc },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x400fc },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x10068, 0x400f6 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
-};
-
-static struct power_pmu power7_pmu = {
- .name = "POWER7",
- .n_counter = 6,
- .max_alternatives = MAX_ALT + 1,
- .add_fields = 0x1555ul,
- .test_adder = 0x3000ul,
- .compute_mmcr = power7_compute_mmcr,
- .get_constraint = power7_get_constraint,
- .get_alternatives = power7_get_alternatives,
- .disable_pmc = power7_disable_pmc,
- .flags = PPMU_ALT_SIPR,
- .n_generic = ARRAY_SIZE(power7_generic_events),
- .generic_events = power7_generic_events,
- .cache_events = &power7_cache_events,
-};
-
-static int init_power7_pmu(void)
-{
- if (!cur_cpu_spec->oprofile_cpu_type ||
- strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power7"))
- return -ENODEV;
-
- return register_power_pmu(&power7_pmu);
-}
-
-arch_initcall(init_power7_pmu);
diff --git a/arch/powerpc/kernel/ppc32.h b/arch/powerpc/kernel/ppc32.h
index dc16aefe1dd..a27c914d580 100644
--- a/arch/powerpc/kernel/ppc32.h
+++ b/arch/powerpc/kernel/ppc32.h
@@ -16,81 +16,6 @@
/* These are here to support 32-bit syscalls on a 64-bit kernel. */
-typedef struct compat_siginfo {
- int si_signo;
- int si_errno;
- int si_code;
-
- union {
- int _pad[SI_PAD_SIZE32];
-
- /* kill() */
- struct {
- compat_pid_t _pid; /* sender's pid */
- compat_uid_t _uid; /* sender's uid */
- } _kill;
-
- /* POSIX.1b timers */
- struct {
- compat_timer_t _tid; /* timer id */
- int _overrun; /* overrun count */
- compat_sigval_t _sigval; /* same as below */
- int _sys_private; /* not to be passed to user */
- } _timer;
-
- /* POSIX.1b signals */
- struct {
- compat_pid_t _pid; /* sender's pid */
- compat_uid_t _uid; /* sender's uid */
- compat_sigval_t _sigval;
- } _rt;
-
- /* SIGCHLD */
- struct {
- compat_pid_t _pid; /* which child */
- compat_uid_t _uid; /* sender's uid */
- int _status; /* exit code */
- compat_clock_t _utime;
- compat_clock_t _stime;
- } _sigchld;
-
- /* SIGILL, SIGFPE, SIGSEGV, SIGBUS, SIGEMT */
- struct {
- unsigned int _addr; /* faulting insn/memory ref. */
- } _sigfault;
-
- /* SIGPOLL */
- struct {
- int _band; /* POLL_IN, POLL_OUT, POLL_MSG */
- int _fd;
- } _sigpoll;
- } _sifields;
-} compat_siginfo_t;
-
-#define __old_sigaction32 old_sigaction32
-
-struct __old_sigaction32 {
- compat_uptr_t sa_handler;
- compat_old_sigset_t sa_mask;
- unsigned int sa_flags;
- compat_uptr_t sa_restorer; /* not used by Linux/SPARC yet */
-};
-
-
-
-struct sigaction32 {
- compat_uptr_t sa_handler; /* Really a pointer, but need to deal with 32 bits */
- unsigned int sa_flags;
- compat_uptr_t sa_restorer; /* Another 32 bit pointer */
- compat_sigset_t sa_mask; /* A 32 bit mask */
-};
-
-typedef struct sigaltstack_32 {
- unsigned int ss_sp;
- int ss_flags;
- compat_size_t ss_size;
-} stack_32_t;
-
struct pt_regs32 {
unsigned int gpr[32];
unsigned int nip;
@@ -126,7 +51,7 @@ struct mcontext32 {
struct ucontext32 {
unsigned int uc_flags;
unsigned int uc_link;
- stack_32_t uc_stack;
+ compat_stack_t uc_stack;
int uc_pad[7];
compat_uptr_t uc_regs; /* points to uc_mcontext field */
compat_sigset_t uc_sigmask; /* mask last for extensibility */
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
deleted file mode 100644
index 3fee685de4d..00000000000
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ /dev/null
@@ -1,497 +0,0 @@
-/*
- * Performance counter support for PPC970-family processors.
- *
- * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/string.h>
-#include <linux/perf_event.h>
-#include <asm/reg.h>
-#include <asm/cputable.h>
-
-/*
- * Bits in event code for PPC970
- */
-#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
-#define PM_PMC_MSK 0xf
-#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
-#define PM_UNIT_MSK 0xf
-#define PM_SPCSEL_SH 6
-#define PM_SPCSEL_MSK 3
-#define PM_BYTE_SH 4 /* Byte number of event bus to use */
-#define PM_BYTE_MSK 3
-#define PM_PMCSEL_MSK 0xf
-
-/* Values in PM_UNIT field */
-#define PM_NONE 0
-#define PM_FPU 1
-#define PM_VPU 2
-#define PM_ISU 3
-#define PM_IFU 4
-#define PM_IDU 5
-#define PM_STS 6
-#define PM_LSU0 7
-#define PM_LSU1U 8
-#define PM_LSU1L 9
-#define PM_LASTUNIT 9
-
-/*
- * Bits in MMCR0 for PPC970
- */
-#define MMCR0_PMC1SEL_SH 8
-#define MMCR0_PMC2SEL_SH 1
-#define MMCR_PMCSEL_MSK 0x1f
-
-/*
- * Bits in MMCR1 for PPC970
- */
-#define MMCR1_TTM0SEL_SH 62
-#define MMCR1_TTM1SEL_SH 59
-#define MMCR1_TTM3SEL_SH 53
-#define MMCR1_TTMSEL_MSK 3
-#define MMCR1_TD_CP_DBG0SEL_SH 50
-#define MMCR1_TD_CP_DBG1SEL_SH 48
-#define MMCR1_TD_CP_DBG2SEL_SH 46
-#define MMCR1_TD_CP_DBG3SEL_SH 44
-#define MMCR1_PMC1_ADDER_SEL_SH 39
-#define MMCR1_PMC2_ADDER_SEL_SH 38
-#define MMCR1_PMC6_ADDER_SEL_SH 37
-#define MMCR1_PMC5_ADDER_SEL_SH 36
-#define MMCR1_PMC8_ADDER_SEL_SH 35
-#define MMCR1_PMC7_ADDER_SEL_SH 34
-#define MMCR1_PMC3_ADDER_SEL_SH 33
-#define MMCR1_PMC4_ADDER_SEL_SH 32
-#define MMCR1_PMC3SEL_SH 27
-#define MMCR1_PMC4SEL_SH 22
-#define MMCR1_PMC5SEL_SH 17
-#define MMCR1_PMC6SEL_SH 12
-#define MMCR1_PMC7SEL_SH 7
-#define MMCR1_PMC8SEL_SH 2
-
-static short mmcr1_adder_bits[8] = {
- MMCR1_PMC1_ADDER_SEL_SH,
- MMCR1_PMC2_ADDER_SEL_SH,
- MMCR1_PMC3_ADDER_SEL_SH,
- MMCR1_PMC4_ADDER_SEL_SH,
- MMCR1_PMC5_ADDER_SEL_SH,
- MMCR1_PMC6_ADDER_SEL_SH,
- MMCR1_PMC7_ADDER_SEL_SH,
- MMCR1_PMC8_ADDER_SEL_SH
-};
-
-/*
- * Layout of constraint bits:
- * 6666555555555544444444443333333333222222222211111111110000000000
- * 3210987654321098765432109876543210987654321098765432109876543210
- * <><><>[ >[ >[ >< >< >< >< ><><><><><><><><>
- * SPT0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
- *
- * SP - SPCSEL constraint
- * 48-49: SPCSEL value 0x3_0000_0000_0000
- *
- * T0 - TTM0 constraint
- * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
- *
- * T1 - TTM1 constraint
- * 44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000
- *
- * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS
- * 43: UC3 error 0x0800_0000_0000
- * 42: FPU|IFU|VPU events needed 0x0400_0000_0000
- * 41: ISU events needed 0x0200_0000_0000
- * 40: IDU|STS events needed 0x0100_0000_0000
- *
- * PS1
- * 39: PS1 error 0x0080_0000_0000
- * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
- *
- * PS2
- * 35: PS2 error 0x0008_0000_0000
- * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
- *
- * B0
- * 28-31: Byte 0 event source 0xf000_0000
- * Encoding as for the event code
- *
- * B1, B2, B3
- * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
- *
- * P1
- * 15: P1 error 0x8000
- * 14-15: Count of events needing PMC1
- *
- * P2..P8
- * 0-13: Count of events needing PMC2..PMC8
- */
-
-static unsigned char direct_marked_event[8] = {
- (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
- (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
- (1<<3) | (1<<5), /* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */
- (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
- (1<<4) | (1<<5), /* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */
- (1<<3) | (1<<4) | (1<<5),
- /* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
- (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
- (1<<4) /* PMC8: PM_MRK_LSU_FIN */
-};
-
-/*
- * Returns 1 if event counts things relating to marked instructions
- * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
- */
-static int p970_marked_instr_event(u64 event)
-{
- int pmc, psel, unit, byte, bit;
- unsigned int mask;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- psel = event & PM_PMCSEL_MSK;
- if (pmc) {
- if (direct_marked_event[pmc - 1] & (1 << psel))
- return 1;
- if (psel == 0) /* add events */
- bit = (pmc <= 4)? pmc - 1: 8 - pmc;
- else if (psel == 7 || psel == 13) /* decode events */
- bit = 4;
- else
- return 0;
- } else
- bit = psel;
-
- byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
- unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
- mask = 0;
- switch (unit) {
- case PM_VPU:
- mask = 0x4c; /* byte 0 bits 2,3,6 */
- break;
- case PM_LSU0:
- /* byte 2 bits 0,2,3,4,6; all of byte 1 */
- mask = 0x085dff00;
- break;
- case PM_LSU1L:
- mask = 0x50 << 24; /* byte 3 bits 4,6 */
- break;
- }
- return (mask >> (byte * 8 + bit)) & 1;
-}
-
-/* Masks and values for using events from the various units */
-static unsigned long unit_cons[PM_LASTUNIT+1][2] = {
- [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull },
- [PM_VPU] = { 0xc80000000000ull, 0xc40000000000ull },
- [PM_ISU] = { 0x080000000000ull, 0x020000000000ull },
- [PM_IFU] = { 0xc80000000000ull, 0x840000000000ull },
- [PM_IDU] = { 0x380000000000ull, 0x010000000000ull },
- [PM_STS] = { 0x380000000000ull, 0x310000000000ull },
-};
-
-static int p970_get_constraint(u64 event, unsigned long *maskp,
- unsigned long *valp)
-{
- int pmc, byte, unit, sh, spcsel;
- unsigned long mask = 0, value = 0;
- int grp = -1;
-
- pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc) {
- if (pmc > 8)
- return -1;
- sh = (pmc - 1) * 2;
- mask |= 2 << sh;
- value |= 1 << sh;
- grp = ((pmc - 1) >> 1) & 1;
- }
- unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
- if (unit) {
- if (unit > PM_LASTUNIT)
- return -1;
- mask |= unit_cons[unit][0];
- value |= unit_cons[unit][1];
- byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
- /*
- * Bus events on bytes 0 and 2 can be counted
- * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
- */
- if (!pmc)
- grp = byte & 1;
- /* Set byte lane select field */
- mask |= 0xfULL << (28 - 4 * byte);
- value |= (unsigned long)unit << (28 - 4 * byte);
- }
- if (grp == 0) {
- /* increment PMC1/2/5/6 field */
- mask |= 0x8000000000ull;
- value |= 0x1000000000ull;
- } else if (grp == 1) {
- /* increment PMC3/4/7/8 field */
- mask |= 0x800000000ull;
- value |= 0x100000000ull;
- }
- spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
- if (spcsel) {
- mask |= 3ull << 48;
- value |= (unsigned long)spcsel << 48;
- }
- *maskp = mask;
- *valp = value;
- return 0;
-}
-
-static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[])
-{
- alt[0] = event;
-
- /* 2 alternatives for LSU empty */
- if (event == 0x2002 || event == 0x3002) {
- alt[1] = event ^ 0x1000;
- return 2;
- }
-
- return 1;
-}
-
-static int p970_compute_mmcr(u64 event[], int n_ev,
- unsigned int hwc[], unsigned long mmcr[])
-{
- unsigned long mmcr0 = 0, mmcr1 = 0, mmcra = 0;
- unsigned int pmc, unit, byte, psel;
- unsigned int ttm, grp;
- unsigned int pmc_inuse = 0;
- unsigned int pmc_grp_use[2];
- unsigned char busbyte[4];
- unsigned char unituse[16];
- unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 };
- unsigned char ttmuse[2];
- unsigned char pmcsel[8];
- int i;
- int spcsel;
-
- if (n_ev > 8)
- return -1;
-
- /* First pass to count resource use */
- pmc_grp_use[0] = pmc_grp_use[1] = 0;
- memset(busbyte, 0, sizeof(busbyte));
- memset(unituse, 0, sizeof(unituse));
- for (i = 0; i < n_ev; ++i) {
- pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
- if (pmc) {
- if (pmc_inuse & (1 << (pmc - 1)))
- return -1;
- pmc_inuse |= 1 << (pmc - 1);
- /* count 1/2/5/6 vs 3/4/7/8 use */
- ++pmc_grp_use[((pmc - 1) >> 1) & 1];
- }
- unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
- byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
- if (unit) {
- if (unit > PM_LASTUNIT)
- return -1;
- if (!pmc)
- ++pmc_grp_use[byte & 1];
- if (busbyte[byte] && busbyte[byte] != unit)
- return -1;
- busbyte[byte] = unit;
- unituse[unit] = 1;
- }
- }
- if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
- return -1;
-
- /*
- * Assign resources and set multiplexer selects.
- *
- * PM_ISU can go either on TTM0 or TTM1, but that's the only
- * choice we have to deal with.
- */
- if (unituse[PM_ISU] &
- (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU]))
- unitmap[PM_ISU] = 2 | 4; /* move ISU to TTM1 */
- /* Set TTM[01]SEL fields. */
- ttmuse[0] = ttmuse[1] = 0;
- for (i = PM_FPU; i <= PM_STS; ++i) {
- if (!unituse[i])
- continue;
- ttm = unitmap[i];
- ++ttmuse[(ttm >> 2) & 1];
- mmcr1 |= (unsigned long)(ttm & ~4) << MMCR1_TTM1SEL_SH;
- }
- /* Check only one unit per TTMx */
- if (ttmuse[0] > 1 || ttmuse[1] > 1)
- return -1;
-
- /* Set byte lane select fields and TTM3SEL. */
- for (byte = 0; byte < 4; ++byte) {
- unit = busbyte[byte];
- if (!unit)
- continue;
- if (unit <= PM_STS)
- ttm = (unitmap[unit] >> 2) & 1;
- else if (unit == PM_LSU0)
- ttm = 2;
- else {
- ttm = 3;
- if (unit == PM_LSU1L && byte >= 2)
- mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
- }
- mmcr1 |= (unsigned long)ttm
- << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
- }
-
- /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
- memset(pmcsel, 0x8, sizeof(pmcsel)); /* 8 means don't count */
- for (i = 0; i < n_ev; ++i) {
- pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
- unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
- byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
- psel = event[i] & PM_PMCSEL_MSK;
- if (!pmc) {
- /* Bus event or any-PMC direct event */
- if (unit)
- psel |= 0x10 | ((byte & 2) << 2);
- else
- psel |= 8;
- for (pmc = 0; pmc < 8; ++pmc) {
- if (pmc_inuse & (1 << pmc))
- continue;
- grp = (pmc >> 1) & 1;
- if (unit) {
- if (grp == (byte & 1))
- break;
- } else if (pmc_grp_use[grp] < 4) {
- ++pmc_grp_use[grp];
- break;
- }
- }
- pmc_inuse |= 1 << pmc;
- } else {
- /* Direct event */
- --pmc;
- if (psel == 0 && (byte & 2))
- /* add events on higher-numbered bus */
- mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
- }
- pmcsel[pmc] = psel;
- hwc[i] = pmc;
- spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
- mmcr1 |= spcsel;
- if (p970_marked_instr_event(event[i]))
- mmcra |= MMCRA_SAMPLE_ENABLE;
- }
- for (pmc = 0; pmc < 2; ++pmc)
- mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
- for (; pmc < 8; ++pmc)
- mmcr1 |= (unsigned long)pmcsel[pmc]
- << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
- if (pmc_inuse & 1)
- mmcr0 |= MMCR0_PMC1CE;
- if (pmc_inuse & 0xfe)
- mmcr0 |= MMCR0_PMCjCE;
-
- mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
-
- /* Return MMCRx values */
- mmcr[0] = mmcr0;
- mmcr[1] = mmcr1;
- mmcr[2] = mmcra;
- return 0;
-}
-
-static void p970_disable_pmc(unsigned int pmc, unsigned long mmcr[])
-{
- int shift, i;
-
- if (pmc <= 1) {
- shift = MMCR0_PMC1SEL_SH - 7 * pmc;
- i = 0;
- } else {
- shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
- i = 1;
- }
- /*
- * Setting the PMCxSEL field to 0x08 disables PMC x.
- */
- mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
-}
-
-static int ppc970_generic_events[] = {
- [PERF_COUNT_HW_CPU_CYCLES] = 7,
- [PERF_COUNT_HW_INSTRUCTIONS] = 1,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */
- [PERF_COUNT_HW_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */
-};
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-/*
- * Table of generalized cache-related events.
- * 0 means not supported, -1 means nonsensical, other values
- * are event codes.
- */
-static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
- [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x8810, 0x3810 },
- [C(OP_WRITE)] = { 0x7810, 0x813 },
- [C(OP_PREFETCH)] = { 0x731, 0 },
- },
- [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { 0, 0 },
- },
- [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0 },
- [C(OP_WRITE)] = { 0, 0 },
- [C(OP_PREFETCH)] = { 0x733, 0 },
- },
- [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x704 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0, 0x700 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
- [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
- [C(OP_READ)] = { 0x431, 0x327 },
- [C(OP_WRITE)] = { -1, -1 },
- [C(OP_PREFETCH)] = { -1, -1 },
- },
-};
-
-static struct power_pmu ppc970_pmu = {
- .name = "PPC970/FX/MP",
- .n_counter = 8,
- .max_alternatives = 2,
- .add_fields = 0x001100005555ull,
- .test_adder = 0x013300000000ull,
- .compute_mmcr = p970_compute_mmcr,
- .get_constraint = p970_get_constraint,
- .get_alternatives = p970_get_alternatives,
- .disable_pmc = p970_disable_pmc,
- .n_generic = ARRAY_SIZE(ppc970_generic_events),
- .generic_events = ppc970_generic_events,
- .cache_events = &ppc970_cache_events,
-};
-
-static int init_ppc970_pmu(void)
-{
- if (!cur_cpu_spec->oprofile_cpu_type ||
- (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/970")
- && strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/970MP")))
- return -ENODEV;
-
- return register_power_pmu(&ppc970_pmu);
-}
-
-arch_initcall(init_ppc970_pmu);
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index ab3e392ac63..48d17d6fca5 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -1,4 +1,4 @@
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/threads.h>
#include <linux/smp.h>
#include <linux/sched.h>
@@ -18,7 +18,7 @@
#include <asm/cacheflush.h>
#include <asm/uaccess.h>
#include <asm/io.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/checksum.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -26,7 +26,6 @@
#include <linux/cuda.h>
#include <linux/pmu.h>
#include <asm/prom.h>
-#include <asm/system.h>
#include <asm/pci-bridge.h>
#include <asm/irq.h>
#include <asm/pmac_feature.h>
@@ -43,6 +42,8 @@
#include <asm/signal.h>
#include <asm/dcr.h>
#include <asm/ftrace.h>
+#include <asm/switch_to.h>
+#include <asm/epapr_hcalls.h>
#ifdef CONFIG_PPC32
extern void transfer_to_handler(void);
@@ -54,7 +55,6 @@ extern void single_step_exception(struct pt_regs *regs);
extern int sys_sigreturn(struct pt_regs *regs);
EXPORT_SYMBOL(clear_pages);
-EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(ISA_DMA_THRESHOLD);
EXPORT_SYMBOL(DMA_MODE_READ);
EXPORT_SYMBOL(DMA_MODE_WRITE);
@@ -79,18 +79,16 @@ EXPORT_SYMBOL(strlen);
EXPORT_SYMBOL(strcmp);
EXPORT_SYMBOL(strncmp);
+#ifndef CONFIG_GENERIC_CSUM
EXPORT_SYMBOL(csum_partial);
EXPORT_SYMBOL(csum_partial_copy_generic);
EXPORT_SYMBOL(ip_fast_csum);
EXPORT_SYMBOL(csum_tcpudp_magic);
+#endif
EXPORT_SYMBOL(__copy_tofrom_user);
EXPORT_SYMBOL(__clear_user);
-EXPORT_SYMBOL(__strncpy_from_user);
-EXPORT_SYMBOL(__strnlen_user);
-#ifdef CONFIG_PPC64
-EXPORT_SYMBOL(copy_4K_page);
-#endif
+EXPORT_SYMBOL(copy_page);
#if defined(CONFIG_PCI) && defined(CONFIG_PPC32)
EXPORT_SYMBOL(isa_io_base);
@@ -99,11 +97,16 @@ EXPORT_SYMBOL(pci_dram_offset);
#endif /* CONFIG_PCI */
EXPORT_SYMBOL(start_thread);
-EXPORT_SYMBOL(kernel_thread);
+#ifdef CONFIG_PPC_FPU
EXPORT_SYMBOL(giveup_fpu);
+EXPORT_SYMBOL(load_fp_state);
+EXPORT_SYMBOL(store_fp_state);
+#endif
#ifdef CONFIG_ALTIVEC
EXPORT_SYMBOL(giveup_altivec);
+EXPORT_SYMBOL(load_vr_state);
+EXPORT_SYMBOL(store_vr_state);
#endif /* CONFIG_ALTIVEC */
#ifdef CONFIG_VSX
EXPORT_SYMBOL(giveup_vsx);
@@ -116,8 +119,8 @@ EXPORT_SYMBOL(giveup_spe);
#ifndef CONFIG_PPC64
EXPORT_SYMBOL(flush_instruction_cache);
#endif
-EXPORT_SYMBOL(__flush_icache_range);
EXPORT_SYMBOL(flush_dcache_range);
+EXPORT_SYMBOL(flush_icache_range);
#ifdef CONFIG_SMP
#ifdef CONFIG_PPC32
@@ -147,8 +150,11 @@ EXPORT_SYMBOL(__ashldi3);
EXPORT_SYMBOL(__lshrdi3);
int __ucmpdi2(unsigned long long, unsigned long long);
EXPORT_SYMBOL(__ucmpdi2);
+int __cmpdi2(long long, long long);
+EXPORT_SYMBOL(__cmpdi2);
#endif
-
+long long __bswapdi2(long long);
+EXPORT_SYMBOL(__bswapdi2);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(memmove);
@@ -186,3 +192,18 @@ EXPORT_SYMBOL(__mtdcr);
EXPORT_SYMBOL(__mfdcr);
#endif
EXPORT_SYMBOL(empty_zero_page);
+
+#ifdef CONFIG_PPC64
+EXPORT_SYMBOL(__arch_hweight8);
+EXPORT_SYMBOL(__arch_hweight16);
+EXPORT_SYMBOL(__arch_hweight32);
+EXPORT_SYMBOL(__arch_hweight64);
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+EXPORT_SYMBOL_GPL(mmu_psize_defs);
+#endif
+
+#ifdef CONFIG_EPAPR_PARAVIRT
+EXPORT_SYMBOL(epapr_hypercall_start);
+#endif
diff --git a/arch/powerpc/kernel/ppc_save_regs.S b/arch/powerpc/kernel/ppc_save_regs.S
index 5113bd2285e..1b1787d5289 100644
--- a/arch/powerpc/kernel/ppc_save_regs.S
+++ b/arch/powerpc/kernel/ppc_save_regs.S
@@ -11,10 +11,11 @@
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
/*
* Grab the register values as they are now.
- * This won't do a particularily good job because we really
+ * This won't do a particularly good job because we really
* want our caller's caller's registers, and our caller has
* already executed its prologue.
* ToDo: We could reach back into the caller's save area to do
diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c
index c8ae3714e79..c30612aad68 100644
--- a/arch/powerpc/kernel/proc_powerpc.c
+++ b/arch/powerpc/kernel/proc_powerpc.c
@@ -29,45 +29,26 @@
#ifdef CONFIG_PPC64
-static loff_t page_map_seek( struct file *file, loff_t off, int whence)
+static loff_t page_map_seek(struct file *file, loff_t off, int whence)
{
- loff_t new;
- struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
-
- switch(whence) {
- case 0:
- new = off;
- break;
- case 1:
- new = file->f_pos + off;
- break;
- case 2:
- new = dp->size + off;
- break;
- default:
- return -EINVAL;
- }
- if ( new < 0 || new > dp->size )
- return -EINVAL;
- return (file->f_pos = new);
+ return fixed_size_llseek(file, off, whence, PAGE_SIZE);
}
static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes,
loff_t *ppos)
{
- struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
- return simple_read_from_buffer(buf, nbytes, ppos, dp->data, dp->size);
+ return simple_read_from_buffer(buf, nbytes, ppos,
+ PDE_DATA(file_inode(file)), PAGE_SIZE);
}
static int page_map_mmap( struct file *file, struct vm_area_struct *vma )
{
- struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
-
- if ((vma->vm_end - vma->vm_start) > dp->size)
+ if ((vma->vm_end - vma->vm_start) > PAGE_SIZE)
return -EINVAL;
- remap_pfn_range(vma, vma->vm_start, __pa(dp->data) >> PAGE_SHIFT,
- dp->size, vma->vm_page_prot);
+ remap_pfn_range(vma, vma->vm_start,
+ __pa(PDE_DATA(file_inode(file))) >> PAGE_SHIFT,
+ PAGE_SIZE, vma->vm_page_prot);
return 0;
}
@@ -86,7 +67,7 @@ static int __init proc_ppc64_init(void)
&page_map_fops, vdso_data);
if (!pde)
return 1;
- pde->size = PAGE_SIZE;
+ proc_set_size(pde, PAGE_SIZE);
return 0;
}
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 84906d3fc86..be99774d3f4 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -25,10 +25,9 @@
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/elf.h>
-#include <linux/init.h>
#include <linux/prctl.h>
#include <linux/init_task.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kallsyms.h>
#include <linux/mqueue.h>
#include <linux/hardirq.h>
@@ -41,20 +40,31 @@
#include <asm/pgtable.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/mmu.h>
#include <asm/prom.h>
#include <asm/machdep.h>
#include <asm/time.h>
+#include <asm/runlatch.h>
#include <asm/syscalls.h>
+#include <asm/switch_to.h>
+#include <asm/tm.h>
+#include <asm/debug.h>
#ifdef CONFIG_PPC64
#include <asm/firmware.h>
#endif
+#include <asm/code-patching.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
+/* Transactional Memory debug */
+#ifdef TM_DEBUG_SW
+#define TM_DEBUG(x...) printk(KERN_INFO x)
+#else
+#define TM_DEBUG(x...) do { } while(0)
+#endif
+
extern unsigned long _get_SP(void);
#ifndef CONFIG_SMP
@@ -64,6 +74,49 @@ struct task_struct *last_task_used_vsx = NULL;
struct task_struct *last_task_used_spe = NULL;
#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void giveup_fpu_maybe_transactional(struct task_struct *tsk)
+{
+ /*
+ * If we are saving the current thread's registers, and the
+ * thread is in a transactional state, set the TIF_RESTORE_TM
+ * bit so that we know to restore the registers before
+ * returning to userspace.
+ */
+ if (tsk == current && tsk->thread.regs &&
+ MSR_TM_ACTIVE(tsk->thread.regs->msr) &&
+ !test_thread_flag(TIF_RESTORE_TM)) {
+ tsk->thread.tm_orig_msr = tsk->thread.regs->msr;
+ set_thread_flag(TIF_RESTORE_TM);
+ }
+
+ giveup_fpu(tsk);
+}
+
+void giveup_altivec_maybe_transactional(struct task_struct *tsk)
+{
+ /*
+ * If we are saving the current thread's registers, and the
+ * thread is in a transactional state, set the TIF_RESTORE_TM
+ * bit so that we know to restore the registers before
+ * returning to userspace.
+ */
+ if (tsk == current && tsk->thread.regs &&
+ MSR_TM_ACTIVE(tsk->thread.regs->msr) &&
+ !test_thread_flag(TIF_RESTORE_TM)) {
+ tsk->thread.tm_orig_msr = tsk->thread.regs->msr;
+ set_thread_flag(TIF_RESTORE_TM);
+ }
+
+ giveup_altivec(tsk);
+}
+
+#else
+#define giveup_fpu_maybe_transactional(tsk) giveup_fpu(tsk)
+#define giveup_altivec_maybe_transactional(tsk) giveup_altivec(tsk)
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+#ifdef CONFIG_PPC_FPU
/*
* Make sure the floating-point register state in the
* the thread_struct is up to date for task tsk.
@@ -91,11 +144,13 @@ void flush_fp_to_thread(struct task_struct *tsk)
*/
BUG_ON(tsk != current);
#endif
- giveup_fpu(tsk);
+ giveup_fpu_maybe_transactional(tsk);
}
preempt_enable();
}
}
+EXPORT_SYMBOL_GPL(flush_fp_to_thread);
+#endif /* CONFIG_PPC_FPU */
void enable_kernel_fp(void)
{
@@ -103,11 +158,11 @@ void enable_kernel_fp(void)
#ifdef CONFIG_SMP
if (current->thread.regs && (current->thread.regs->msr & MSR_FP))
- giveup_fpu(current);
+ giveup_fpu_maybe_transactional(current);
else
giveup_fpu(NULL); /* just enables FP for kernel */
#else
- giveup_fpu(last_task_used_math);
+ giveup_fpu_maybe_transactional(last_task_used_math);
#endif /* CONFIG_SMP */
}
EXPORT_SYMBOL(enable_kernel_fp);
@@ -119,11 +174,11 @@ void enable_kernel_altivec(void)
#ifdef CONFIG_SMP
if (current->thread.regs && (current->thread.regs->msr & MSR_VEC))
- giveup_altivec(current);
+ giveup_altivec_maybe_transactional(current);
else
- giveup_altivec(NULL); /* just enable AltiVec for kernel - force */
+ giveup_altivec_notask();
#else
- giveup_altivec(last_task_used_altivec);
+ giveup_altivec_maybe_transactional(last_task_used_altivec);
#endif /* CONFIG_SMP */
}
EXPORT_SYMBOL(enable_kernel_altivec);
@@ -140,11 +195,12 @@ void flush_altivec_to_thread(struct task_struct *tsk)
#ifdef CONFIG_SMP
BUG_ON(tsk != current);
#endif
- giveup_altivec(tsk);
+ giveup_altivec_maybe_transactional(tsk);
}
preempt_enable();
}
}
+EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
#endif /* CONFIG_ALTIVEC */
#ifdef CONFIG_VSX
@@ -168,8 +224,8 @@ EXPORT_SYMBOL(enable_kernel_vsx);
void giveup_vsx(struct task_struct *tsk)
{
- giveup_fpu(tsk);
- giveup_altivec(tsk);
+ giveup_fpu_maybe_transactional(tsk);
+ giveup_altivec_maybe_transactional(tsk);
__giveup_vsx(tsk);
}
@@ -186,6 +242,7 @@ void flush_vsx_to_thread(struct task_struct *tsk)
preempt_enable();
}
}
+EXPORT_SYMBOL_GPL(flush_vsx_to_thread);
#endif /* CONFIG_VSX */
#ifdef CONFIG_SPE
@@ -213,6 +270,7 @@ void flush_spe_to_thread(struct task_struct *tsk)
#ifdef CONFIG_SMP
BUG_ON(tsk != current);
#endif
+ tsk->thread.spefscr = mfspr(SPRN_SPEFSCR);
giveup_spe(tsk);
}
preempt_enable();
@@ -252,6 +310,7 @@ void do_send_trap(struct pt_regs *regs, unsigned long address,
{
siginfo_t info;
+ current->thread.trap_nr = signal_code;
if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
11, SIGSEGV) == NOTIFY_STOP)
return;
@@ -264,20 +323,21 @@ void do_send_trap(struct pt_regs *regs, unsigned long address,
force_sig_info(SIGTRAP, &info, current);
}
#else /* !CONFIG_PPC_ADV_DEBUG_REGS */
-void do_dabr(struct pt_regs *regs, unsigned long address,
+void do_break (struct pt_regs *regs, unsigned long address,
unsigned long error_code)
{
siginfo_t info;
+ current->thread.trap_nr = TRAP_HWBKPT;
if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
11, SIGSEGV) == NOTIFY_STOP)
return;
- if (debugger_dabr_match(regs))
+ if (debugger_break_match(regs))
return;
- /* Clear the DABR */
- set_dabr(0);
+ /* Clear the breakpoint */
+ hw_breakpoint_disable();
/* Deliver the signal to userspace */
info.si_signo = SIGTRAP;
@@ -288,7 +348,7 @@ void do_dabr(struct pt_regs *regs, unsigned long address,
}
#endif /* CONFIG_PPC_ADV_DEBUG_REGS */
-static DEFINE_PER_CPU(unsigned long, current_dabr);
+static DEFINE_PER_CPU(struct arch_hw_breakpoint, current_brk);
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
/*
@@ -296,49 +356,56 @@ static DEFINE_PER_CPU(unsigned long, current_dabr);
*/
static void set_debug_reg_defaults(struct thread_struct *thread)
{
- thread->iac1 = thread->iac2 = 0;
+ thread->debug.iac1 = thread->debug.iac2 = 0;
#if CONFIG_PPC_ADV_DEBUG_IACS > 2
- thread->iac3 = thread->iac4 = 0;
+ thread->debug.iac3 = thread->debug.iac4 = 0;
#endif
- thread->dac1 = thread->dac2 = 0;
+ thread->debug.dac1 = thread->debug.dac2 = 0;
#if CONFIG_PPC_ADV_DEBUG_DVCS > 0
- thread->dvc1 = thread->dvc2 = 0;
+ thread->debug.dvc1 = thread->debug.dvc2 = 0;
#endif
- thread->dbcr0 = 0;
+ thread->debug.dbcr0 = 0;
#ifdef CONFIG_BOOKE
/*
* Force User/Supervisor bits to b11 (user-only MSR[PR]=1)
*/
- thread->dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US | \
+ thread->debug.dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US |
DBCR1_IAC3US | DBCR1_IAC4US;
/*
* Force Data Address Compare User/Supervisor bits to be User-only
* (0b11 MSR[PR]=1) and set all other bits in DBCR2 register to be 0.
*/
- thread->dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US;
+ thread->debug.dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US;
#else
- thread->dbcr1 = 0;
+ thread->debug.dbcr1 = 0;
#endif
}
-static void prime_debug_regs(struct thread_struct *thread)
+static void prime_debug_regs(struct debug_reg *debug)
{
- mtspr(SPRN_IAC1, thread->iac1);
- mtspr(SPRN_IAC2, thread->iac2);
+ /*
+ * We could have inherited MSR_DE from userspace, since
+ * it doesn't get cleared on exception entry. Make sure
+ * MSR_DE is clear before we enable any debug events.
+ */
+ mtmsr(mfmsr() & ~MSR_DE);
+
+ mtspr(SPRN_IAC1, debug->iac1);
+ mtspr(SPRN_IAC2, debug->iac2);
#if CONFIG_PPC_ADV_DEBUG_IACS > 2
- mtspr(SPRN_IAC3, thread->iac3);
- mtspr(SPRN_IAC4, thread->iac4);
+ mtspr(SPRN_IAC3, debug->iac3);
+ mtspr(SPRN_IAC4, debug->iac4);
#endif
- mtspr(SPRN_DAC1, thread->dac1);
- mtspr(SPRN_DAC2, thread->dac2);
+ mtspr(SPRN_DAC1, debug->dac1);
+ mtspr(SPRN_DAC2, debug->dac2);
#if CONFIG_PPC_ADV_DEBUG_DVCS > 0
- mtspr(SPRN_DVC1, thread->dvc1);
- mtspr(SPRN_DVC2, thread->dvc2);
+ mtspr(SPRN_DVC1, debug->dvc1);
+ mtspr(SPRN_DVC2, debug->dvc2);
#endif
- mtspr(SPRN_DBCR0, thread->dbcr0);
- mtspr(SPRN_DBCR1, thread->dbcr1);
+ mtspr(SPRN_DBCR0, debug->dbcr0);
+ mtspr(SPRN_DBCR1, debug->dbcr1);
#ifdef CONFIG_BOOKE
- mtspr(SPRN_DBCR2, thread->dbcr2);
+ mtspr(SPRN_DBCR2, debug->dbcr2);
#endif
}
/*
@@ -346,53 +413,359 @@ static void prime_debug_regs(struct thread_struct *thread)
* debug registers, set the debug registers from the values
* stored in the new thread.
*/
-static void switch_booke_debug_regs(struct thread_struct *new_thread)
+void switch_booke_debug_regs(struct debug_reg *new_debug)
{
- if ((current->thread.dbcr0 & DBCR0_IDM)
- || (new_thread->dbcr0 & DBCR0_IDM))
- prime_debug_regs(new_thread);
+ if ((current->thread.debug.dbcr0 & DBCR0_IDM)
+ || (new_debug->dbcr0 & DBCR0_IDM))
+ prime_debug_regs(new_debug);
}
+EXPORT_SYMBOL_GPL(switch_booke_debug_regs);
#else /* !CONFIG_PPC_ADV_DEBUG_REGS */
+#ifndef CONFIG_HAVE_HW_BREAKPOINT
static void set_debug_reg_defaults(struct thread_struct *thread)
{
- if (thread->dabr) {
- thread->dabr = 0;
- set_dabr(0);
- }
+ thread->hw_brk.address = 0;
+ thread->hw_brk.type = 0;
+ set_breakpoint(&thread->hw_brk);
}
+#endif /* !CONFIG_HAVE_HW_BREAKPOINT */
#endif /* CONFIG_PPC_ADV_DEBUG_REGS */
-int set_dabr(unsigned long dabr)
-{
- __get_cpu_var(current_dabr) = dabr;
-
- if (ppc_md.set_dabr)
- return ppc_md.set_dabr(dabr);
-
- /* XXX should we have a CPU_FTR_HAS_DABR ? */
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+static inline int __set_dabr(unsigned long dabr, unsigned long dabrx)
+{
mtspr(SPRN_DAC1, dabr);
#ifdef CONFIG_PPC_47x
isync();
#endif
+ return 0;
+}
#elif defined(CONFIG_PPC_BOOK3S)
+static inline int __set_dabr(unsigned long dabr, unsigned long dabrx)
+{
mtspr(SPRN_DABR, dabr);
+ if (cpu_has_feature(CPU_FTR_DABRX))
+ mtspr(SPRN_DABRX, dabrx);
+ return 0;
+}
+#else
+static inline int __set_dabr(unsigned long dabr, unsigned long dabrx)
+{
+ return -EINVAL;
+}
#endif
+static inline int set_dabr(struct arch_hw_breakpoint *brk)
+{
+ unsigned long dabr, dabrx;
+
+ dabr = brk->address | (brk->type & HW_BRK_TYPE_DABR);
+ dabrx = ((brk->type >> 3) & 0x7);
+ if (ppc_md.set_dabr)
+ return ppc_md.set_dabr(dabr, dabrx);
+
+ return __set_dabr(dabr, dabrx);
+}
+
+static inline int set_dawr(struct arch_hw_breakpoint *brk)
+{
+ unsigned long dawr, dawrx, mrd;
+
+ dawr = brk->address;
+
+ dawrx = (brk->type & (HW_BRK_TYPE_READ | HW_BRK_TYPE_WRITE)) \
+ << (63 - 58); //* read/write bits */
+ dawrx |= ((brk->type & (HW_BRK_TYPE_TRANSLATE)) >> 2) \
+ << (63 - 59); //* translate */
+ dawrx |= (brk->type & (HW_BRK_TYPE_PRIV_ALL)) \
+ >> 3; //* PRIM bits */
+ /* dawr length is stored in field MDR bits 48:53. Matches range in
+ doublewords (64 bits) baised by -1 eg. 0b000000=1DW and
+ 0b111111=64DW.
+ brk->len is in bytes.
+ This aligns up to double word size, shifts and does the bias.
+ */
+ mrd = ((brk->len + 7) >> 3) - 1;
+ dawrx |= (mrd & 0x3f) << (63 - 53);
+
+ if (ppc_md.set_dawr)
+ return ppc_md.set_dawr(dawr, dawrx);
+ mtspr(SPRN_DAWR, dawr);
+ mtspr(SPRN_DAWRX, dawrx);
return 0;
}
+void __set_breakpoint(struct arch_hw_breakpoint *brk)
+{
+ __get_cpu_var(current_brk) = *brk;
+
+ if (cpu_has_feature(CPU_FTR_DAWR))
+ set_dawr(brk);
+ else
+ set_dabr(brk);
+}
+
+void set_breakpoint(struct arch_hw_breakpoint *brk)
+{
+ preempt_disable();
+ __set_breakpoint(brk);
+ preempt_enable();
+}
+
#ifdef CONFIG_PPC64
DEFINE_PER_CPU(struct cpu_usage, cpu_usage_array);
#endif
+static inline bool hw_brk_match(struct arch_hw_breakpoint *a,
+ struct arch_hw_breakpoint *b)
+{
+ if (a->address != b->address)
+ return false;
+ if (a->type != b->type)
+ return false;
+ if (a->len != b->len)
+ return false;
+ return true;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static void tm_reclaim_thread(struct thread_struct *thr,
+ struct thread_info *ti, uint8_t cause)
+{
+ unsigned long msr_diff = 0;
+
+ /*
+ * If FP/VSX registers have been already saved to the
+ * thread_struct, move them to the transact_fp array.
+ * We clear the TIF_RESTORE_TM bit since after the reclaim
+ * the thread will no longer be transactional.
+ */
+ if (test_ti_thread_flag(ti, TIF_RESTORE_TM)) {
+ msr_diff = thr->tm_orig_msr & ~thr->regs->msr;
+ if (msr_diff & MSR_FP)
+ memcpy(&thr->transact_fp, &thr->fp_state,
+ sizeof(struct thread_fp_state));
+ if (msr_diff & MSR_VEC)
+ memcpy(&thr->transact_vr, &thr->vr_state,
+ sizeof(struct thread_vr_state));
+ clear_ti_thread_flag(ti, TIF_RESTORE_TM);
+ msr_diff &= MSR_FP | MSR_VEC | MSR_VSX | MSR_FE0 | MSR_FE1;
+ }
+
+ tm_reclaim(thr, thr->regs->msr, cause);
+
+ /* Having done the reclaim, we now have the checkpointed
+ * FP/VSX values in the registers. These might be valid
+ * even if we have previously called enable_kernel_fp() or
+ * flush_fp_to_thread(), so update thr->regs->msr to
+ * indicate their current validity.
+ */
+ thr->regs->msr |= msr_diff;
+}
+
+void tm_reclaim_current(uint8_t cause)
+{
+ tm_enable();
+ tm_reclaim_thread(&current->thread, current_thread_info(), cause);
+}
+
+static inline void tm_reclaim_task(struct task_struct *tsk)
+{
+ /* We have to work out if we're switching from/to a task that's in the
+ * middle of a transaction.
+ *
+ * In switching we need to maintain a 2nd register state as
+ * oldtask->thread.ckpt_regs. We tm_reclaim(oldproc); this saves the
+ * checkpointed (tbegin) state in ckpt_regs and saves the transactional
+ * (current) FPRs into oldtask->thread.transact_fpr[].
+ *
+ * We also context switch (save) TFHAR/TEXASR/TFIAR in here.
+ */
+ struct thread_struct *thr = &tsk->thread;
+
+ if (!thr->regs)
+ return;
+
+ if (!MSR_TM_ACTIVE(thr->regs->msr))
+ goto out_and_saveregs;
+
+ /* Stash the original thread MSR, as giveup_fpu et al will
+ * modify it. We hold onto it to see whether the task used
+ * FP & vector regs. If the TIF_RESTORE_TM flag is set,
+ * tm_orig_msr is already set.
+ */
+ if (!test_ti_thread_flag(task_thread_info(tsk), TIF_RESTORE_TM))
+ thr->tm_orig_msr = thr->regs->msr;
+
+ TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, "
+ "ccr=%lx, msr=%lx, trap=%lx)\n",
+ tsk->pid, thr->regs->nip,
+ thr->regs->ccr, thr->regs->msr,
+ thr->regs->trap);
+
+ tm_reclaim_thread(thr, task_thread_info(tsk), TM_CAUSE_RESCHED);
+
+ TM_DEBUG("--- tm_reclaim on pid %d complete\n",
+ tsk->pid);
+
+out_and_saveregs:
+ /* Always save the regs here, even if a transaction's not active.
+ * This context-switches a thread's TM info SPRs. We do it here to
+ * be consistent with the restore path (in recheckpoint) which
+ * cannot happen later in _switch().
+ */
+ tm_save_sprs(thr);
+}
+
+extern void __tm_recheckpoint(struct thread_struct *thread,
+ unsigned long orig_msr);
+
+void tm_recheckpoint(struct thread_struct *thread,
+ unsigned long orig_msr)
+{
+ unsigned long flags;
+
+ /* We really can't be interrupted here as the TEXASR registers can't
+ * change and later in the trecheckpoint code, we have a userspace R1.
+ * So let's hard disable over this region.
+ */
+ local_irq_save(flags);
+ hard_irq_disable();
+
+ /* The TM SPRs are restored here, so that TEXASR.FS can be set
+ * before the trecheckpoint and no explosion occurs.
+ */
+ tm_restore_sprs(thread);
+
+ __tm_recheckpoint(thread, orig_msr);
+
+ local_irq_restore(flags);
+}
+
+static inline void tm_recheckpoint_new_task(struct task_struct *new)
+{
+ unsigned long msr;
+
+ if (!cpu_has_feature(CPU_FTR_TM))
+ return;
+
+ /* Recheckpoint the registers of the thread we're about to switch to.
+ *
+ * If the task was using FP, we non-lazily reload both the original and
+ * the speculative FP register states. This is because the kernel
+ * doesn't see if/when a TM rollback occurs, so if we take an FP
+ * unavoidable later, we are unable to determine which set of FP regs
+ * need to be restored.
+ */
+ if (!new->thread.regs)
+ return;
+
+ if (!MSR_TM_ACTIVE(new->thread.regs->msr)){
+ tm_restore_sprs(&new->thread);
+ return;
+ }
+ msr = new->thread.tm_orig_msr;
+ /* Recheckpoint to restore original checkpointed register state. */
+ TM_DEBUG("*** tm_recheckpoint of pid %d "
+ "(new->msr 0x%lx, new->origmsr 0x%lx)\n",
+ new->pid, new->thread.regs->msr, msr);
+
+ /* This loads the checkpointed FP/VEC state, if used */
+ tm_recheckpoint(&new->thread, msr);
+
+ /* This loads the speculative FP/VEC state, if used */
+ if (msr & MSR_FP) {
+ do_load_up_transact_fpu(&new->thread);
+ new->thread.regs->msr |=
+ (MSR_FP | new->thread.fpexc_mode);
+ }
+#ifdef CONFIG_ALTIVEC
+ if (msr & MSR_VEC) {
+ do_load_up_transact_altivec(&new->thread);
+ new->thread.regs->msr |= MSR_VEC;
+ }
+#endif
+ /* We may as well turn on VSX too since all the state is restored now */
+ if (msr & MSR_VSX)
+ new->thread.regs->msr |= MSR_VSX;
+
+ TM_DEBUG("*** tm_recheckpoint of pid %d complete "
+ "(kernel msr 0x%lx)\n",
+ new->pid, mfmsr());
+}
+
+static inline void __switch_to_tm(struct task_struct *prev)
+{
+ if (cpu_has_feature(CPU_FTR_TM)) {
+ tm_enable();
+ tm_reclaim_task(prev);
+ }
+}
+
+/*
+ * This is called if we are on the way out to userspace and the
+ * TIF_RESTORE_TM flag is set. It checks if we need to reload
+ * FP and/or vector state and does so if necessary.
+ * If userspace is inside a transaction (whether active or
+ * suspended) and FP/VMX/VSX instructions have ever been enabled
+ * inside that transaction, then we have to keep them enabled
+ * and keep the FP/VMX/VSX state loaded while ever the transaction
+ * continues. The reason is that if we didn't, and subsequently
+ * got a FP/VMX/VSX unavailable interrupt inside a transaction,
+ * we don't know whether it's the same transaction, and thus we
+ * don't know which of the checkpointed state and the transactional
+ * state to use.
+ */
+void restore_tm_state(struct pt_regs *regs)
+{
+ unsigned long msr_diff;
+
+ clear_thread_flag(TIF_RESTORE_TM);
+ if (!MSR_TM_ACTIVE(regs->msr))
+ return;
+
+ msr_diff = current->thread.tm_orig_msr & ~regs->msr;
+ msr_diff &= MSR_FP | MSR_VEC | MSR_VSX;
+ if (msr_diff & MSR_FP) {
+ fp_enable();
+ load_fp_state(&current->thread.fp_state);
+ regs->msr |= current->thread.fpexc_mode;
+ }
+ if (msr_diff & MSR_VEC) {
+ vec_enable();
+ load_vr_state(&current->thread.vr_state);
+ }
+ regs->msr |= msr_diff;
+}
+
+#else
+#define tm_recheckpoint_new_task(new)
+#define __switch_to_tm(prev)
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *new)
{
struct thread_struct *new_thread, *old_thread;
- unsigned long flags;
struct task_struct *last;
+#ifdef CONFIG_PPC_BOOK3S_64
+ struct ppc64_tlb_batch *batch;
+#endif
+
+ WARN_ON(!irqs_disabled());
+
+ /* Back up the TAR and DSCR across context switches.
+ * Note that the TAR is not available for use in the kernel. (To
+ * provide this, the TAR should be backed up/restored on exception
+ * entry/exit instead, and be in pt_regs. FIXME, this should be in
+ * pt_regs anyway (for debug).)
+ * Save the TAR and DSCR here before we do treclaim/trecheckpoint as
+ * these will change them.
+ */
+ save_early_sprs(&prev->thread);
+
+ __switch_to_tm(prev);
#ifdef CONFIG_SMP
/* avoid complexity of lazy save/restore of fpu
@@ -461,15 +834,15 @@ struct task_struct *__switch_to(struct task_struct *prev,
#endif /* CONFIG_SMP */
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
- switch_booke_debug_regs(&new->thread);
+ switch_booke_debug_regs(&new->thread.debug);
#else
/*
* For PPC_BOOK3S_64, we use the hw-breakpoint interfaces that would
* schedule DABR
*/
#ifndef CONFIG_HAVE_HW_BREAKPOINT
- if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr))
- set_dabr(new->thread.dabr);
+ if (unlikely(!hw_brk_match(&__get_cpu_var(current_brk), &new->thread.hw_brk)))
+ __set_breakpoint(&new->thread.hw_brk);
#endif /* CONFIG_HAVE_HW_BREAKPOINT */
#endif
@@ -477,28 +850,6 @@ struct task_struct *__switch_to(struct task_struct *prev,
new_thread = &new->thread;
old_thread = &current->thread;
-#if defined(CONFIG_PPC_BOOK3E_64)
- /* XXX Current Book3E code doesn't deal with kernel side DBCR0,
- * we always hold the user values, so we set it now.
- *
- * However, we ensure the kernel MSR:DE is appropriately cleared too
- * to avoid spurrious single step exceptions in the kernel.
- *
- * This will have to change to merge with the ppc32 code at some point,
- * but I don't like much what ppc32 is doing today so there's some
- * thinking needed there
- */
- if ((new_thread->dbcr0 | old_thread->dbcr0) & DBCR0_IDM) {
- u32 dbcr0;
-
- mtmsr(mfmsr() & ~MSR_DE);
- isync();
- dbcr0 = mfspr(SPRN_DBCR0);
- dbcr0 = (dbcr0 & DBCR0_EDM) | new_thread->dbcr0;
- mtspr(SPRN_DBCR0, dbcr0);
- }
-#endif /* CONFIG_PPC64_BOOK3E */
-
#ifdef CONFIG_PPC64
/*
* Collect processor utilization data per process
@@ -511,12 +862,17 @@ struct task_struct *__switch_to(struct task_struct *prev,
old_thread->accum_tb += (current_tb - start_tb);
new_thread->start_tb = current_tb;
}
-#endif
-
- local_irq_save(flags);
-
- account_system_vtime(current);
- account_process_vtime(current);
+#endif /* CONFIG_PPC64 */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ batch = &__get_cpu_var(ppc64_tlb_batch);
+ if (batch->active) {
+ current_thread_info()->local_flags |= _TLF_LAZY_MMU;
+ if (batch->index)
+ __flush_tlb_pending(batch);
+ batch->active = 0;
+ }
+#endif /* CONFIG_PPC_BOOK3S_64 */
/*
* We can't take a PMU exception inside _switch() since there is a
@@ -524,9 +880,18 @@ struct task_struct *__switch_to(struct task_struct *prev,
* of sync. Hard disable here.
*/
hard_irq_disable();
+
+ tm_recheckpoint_new_task(new);
+
last = _switch(old_thread, new_thread);
- local_irq_restore(flags);
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (current_thread_info()->local_flags & _TLF_LAZY_MMU) {
+ current_thread_info()->local_flags &= ~_TLF_LAZY_MMU;
+ batch = &__get_cpu_var(ppc64_tlb_batch);
+ batch->active = 1;
+ }
+#endif /* CONFIG_PPC_BOOK3S_64 */
return last;
}
@@ -561,12 +926,12 @@ static void show_instructions(struct pt_regs *regs)
*/
if (!__kernel_text_address(pc) ||
__get_user(instr, (unsigned int __user *)pc)) {
- printk("XXXXXXXX ");
+ printk(KERN_CONT "XXXXXXXX ");
} else {
if (regs->nip == pc)
- printk("<%08x> ", instr);
+ printk(KERN_CONT "<%08x> ", instr);
else
- printk("%08x ", instr);
+ printk(KERN_CONT "%08x ", instr);
}
pc += sizeof(int);
@@ -579,16 +944,32 @@ static struct regbit {
unsigned long bit;
const char *name;
} msr_bits[] = {
+#if defined(CONFIG_PPC64) && !defined(CONFIG_BOOKE)
+ {MSR_SF, "SF"},
+ {MSR_HV, "HV"},
+#endif
+ {MSR_VEC, "VEC"},
+ {MSR_VSX, "VSX"},
+#ifdef CONFIG_BOOKE
+ {MSR_CE, "CE"},
+#endif
{MSR_EE, "EE"},
{MSR_PR, "PR"},
{MSR_FP, "FP"},
- {MSR_VEC, "VEC"},
- {MSR_VSX, "VSX"},
{MSR_ME, "ME"},
- {MSR_CE, "CE"},
+#ifdef CONFIG_BOOKE
{MSR_DE, "DE"},
+#else
+ {MSR_SE, "SE"},
+ {MSR_BE, "BE"},
+#endif
{MSR_IR, "IR"},
{MSR_DR, "DR"},
+ {MSR_PMM, "PMM"},
+#ifndef CONFIG_BOOKE
+ {MSR_RI, "RI"},
+ {MSR_LE, "LE"},
+#endif
{0, NULL}
};
@@ -619,6 +1000,8 @@ void show_regs(struct pt_regs * regs)
{
int i, trap;
+ show_regs_print_info(KERN_DEFAULT);
+
printk("NIP: "REG" LR: "REG" CTR: "REG"\n",
regs->nip, regs->link, regs->ctr);
printk("REGS: %p TRAP: %04lx %s (%s)\n",
@@ -627,18 +1010,21 @@ void show_regs(struct pt_regs * regs)
printbits(regs->msr, msr_bits);
printk(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer);
trap = TRAP(regs);
- if (trap == 0x300 || trap == 0x600)
-#ifdef CONFIG_PPC_ADV_DEBUG_REGS
- printk("DEAR: "REG", ESR: "REG"\n", regs->dar, regs->dsisr);
+ if ((regs->trap != 0xc00) && cpu_has_feature(CPU_FTR_CFAR))
+ printk("CFAR: "REG" ", regs->orig_gpr3);
+ if (trap == 0x200 || trap == 0x300 || trap == 0x600)
+#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+ printk("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr);
#else
- printk("DAR: "REG", DSISR: "REG"\n", regs->dar, regs->dsisr);
+ printk("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr);
+#endif
+#ifdef CONFIG_PPC64
+ printk("SOFTE: %ld ", regs->softe);
+#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ if (MSR_TM_ACTIVE(regs->msr))
+ printk("\nPACATMSCRATCH: %016llx ", get_paca()->tm_scratch);
#endif
- printk("TASK = %p[%d] '%s' THREAD: %p",
- current, task_pid_nr(current), current->comm, task_thread_info(current));
-
-#ifdef CONFIG_SMP
- printk(" CPU: %d", raw_smp_processor_id());
-#endif /* CONFIG_SMP */
for (i = 0; i < 32; i++) {
if ((i % REGS_PER_LINE) == 0)
@@ -670,11 +1056,11 @@ void flush_thread(void)
{
discard_lazy_cpu_state();
-#ifdef CONFIG_HAVE_HW_BREAKPOINTS
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
flush_ptrace_hw_breakpoint(current);
-#else /* CONFIG_HAVE_HW_BREAKPOINTS */
+#else /* CONFIG_HAVE_HW_BREAKPOINT */
set_debug_reg_defaults(&current->thread);
-#endif /* CONFIG_HAVE_HW_BREAKPOINTS */
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
}
void
@@ -683,48 +1069,72 @@ release_thread(struct task_struct *t)
}
/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
+ * this gets called so that we can store coprocessor state into memory and
+ * copy the current task into the new thread.
*/
-void prepare_to_copy(struct task_struct *tsk)
+int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- flush_fp_to_thread(current);
- flush_altivec_to_thread(current);
- flush_vsx_to_thread(current);
- flush_spe_to_thread(current);
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
- flush_ptrace_hw_breakpoint(tsk);
-#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+ flush_fp_to_thread(src);
+ flush_altivec_to_thread(src);
+ flush_vsx_to_thread(src);
+ flush_spe_to_thread(src);
+ /*
+ * Flush TM state out so we can copy it. __switch_to_tm() does this
+ * flush but it removes the checkpointed state from the current CPU and
+ * transitions the CPU out of TM mode. Hence we need to call
+ * tm_recheckpoint_new_task() (on the same task) to restore the
+ * checkpointed state back and the TM mode.
+ */
+ __switch_to_tm(src);
+ tm_recheckpoint_new_task(src);
+
+ *dst = *src;
+
+ clear_task_ebb(dst);
+
+ return 0;
}
/*
* Copy a thread..
*/
+extern unsigned long dscr_default; /* defined in arch/powerpc/kernel/sysfs.c */
+
int copy_thread(unsigned long clone_flags, unsigned long usp,
- unsigned long unused, struct task_struct *p,
- struct pt_regs *regs)
+ unsigned long arg, struct task_struct *p)
{
struct pt_regs *childregs, *kregs;
extern void ret_from_fork(void);
+ extern void ret_from_kernel_thread(void);
+ void (*f)(void);
unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE;
- CHECK_FULL_REGS(regs);
/* Copy registers */
sp -= sizeof(struct pt_regs);
childregs = (struct pt_regs *) sp;
- *childregs = *regs;
- if ((childregs->msr & MSR_PR) == 0) {
- /* for kernel thread, set `current' and stackptr in new task */
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ struct thread_info *ti = (void *)task_stack_page(p);
+ memset(childregs, 0, sizeof(struct pt_regs));
childregs->gpr[1] = sp + sizeof(struct pt_regs);
-#ifdef CONFIG_PPC32
- childregs->gpr[2] = (unsigned long) p;
-#else
+ /* function */
+ if (usp)
+ childregs->gpr[14] = ppc_function_entry((void *)usp);
+#ifdef CONFIG_PPC64
clear_tsk_thread_flag(p, TIF_32BIT);
+ childregs->softe = 1;
#endif
+ childregs->gpr[15] = arg;
p->thread.regs = NULL; /* no user register state */
+ ti->flags |= _TIF_RESTOREALL;
+ f = ret_from_kernel_thread;
} else {
- childregs->gpr[1] = usp;
+ struct pt_regs *regs = current_pt_regs();
+ CHECK_FULL_REGS(regs);
+ *childregs = *regs;
+ if (usp)
+ childregs->gpr[1] = usp;
p->thread.regs = childregs;
+ childregs->gpr[3] = 0; /* Result from fork() */
if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_PPC64
if (!is_32bit_task())
@@ -733,8 +1143,9 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
#endif
childregs->gpr[2] = childregs->gpr[6];
}
+
+ f = ret_from_fork;
}
- childregs->gpr[3] = 0; /* Result from fork() */
sp -= STACK_FRAME_OVERHEAD;
/*
@@ -745,19 +1156,30 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
* do some house keeping and then return from the fork or clone
* system call, using the stack frame created above.
*/
+ ((unsigned long *)sp)[0] = 0;
sp -= sizeof(struct pt_regs);
kregs = (struct pt_regs *) sp;
sp -= STACK_FRAME_OVERHEAD;
p->thread.ksp = sp;
+#ifdef CONFIG_PPC32
p->thread.ksp_limit = (unsigned long)task_stack_page(p) +
_ALIGN_UP(sizeof(struct thread_info), 16);
+#endif
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+ p->thread.ptrace_bps[0] = NULL;
+#endif
+
+ p->thread.fp_save_area = NULL;
+#ifdef CONFIG_ALTIVEC
+ p->thread.vr_save_area = NULL;
+#endif
#ifdef CONFIG_PPC_STD_MMU_64
- if (cpu_has_feature(CPU_FTR_SLB)) {
+ if (mmu_has_feature(MMU_FTR_SLB)) {
unsigned long sp_vsid;
unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp;
- if (cpu_has_feature(CPU_FTR_1T_SEGMENT))
+ if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
sp_vsid = get_kernel_vsid(sp, MMU_SEGSIZE_1T)
<< SLB_VSID_SHIFT_1T;
else
@@ -767,19 +1189,15 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
p->thread.ksp_vsid = sp_vsid;
}
#endif /* CONFIG_PPC_STD_MMU_64 */
-
- /*
- * The PPC64 ABI makes use of a TOC to contain function
- * pointers. The function (ret_from_except) is actually a pointer
- * to the TOC entry. The first entry is a pointer to the actual
- * function.
- */
-#ifdef CONFIG_PPC64
- kregs->nip = *((unsigned long *)ret_from_fork);
-#else
- kregs->nip = (unsigned long)ret_from_fork;
+#ifdef CONFIG_PPC64
+ if (cpu_has_feature(CPU_FTR_DSCR)) {
+ p->thread.dscr_inherit = current->thread.dscr_inherit;
+ p->thread.dscr = current->thread.dscr;
+ }
+ if (cpu_has_feature(CPU_FTR_HAS_PPR))
+ p->thread.ppr = INIT_PPR;
#endif
-
+ kregs->nip = ppc_function_entry(f);
return 0;
}
@@ -792,8 +1210,6 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */
#endif
- set_fs(USER_DS);
-
/*
* If we exec out of a kernel thread then thread.regs will not be
* set. Do it now.
@@ -823,25 +1239,45 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
regs->msr = MSR_USER;
#else
if (!is_32bit_task()) {
- unsigned long entry, toc;
+ unsigned long entry;
- /* start is a relocated pointer to the function descriptor for
- * the elf _start routine. The first entry in the function
- * descriptor is the entry address of _start and the second
- * entry is the TOC value we need to use.
- */
- __get_user(entry, (unsigned long __user *)start);
- __get_user(toc, (unsigned long __user *)start+1);
+ if (is_elf2_task()) {
+ /* Look ma, no function descriptors! */
+ entry = start;
- /* Check whether the e_entry function descriptor entries
- * need to be relocated before we can use them.
- */
- if (load_addr != 0) {
- entry += load_addr;
- toc += load_addr;
+ /*
+ * Ulrich says:
+ * The latest iteration of the ABI requires that when
+ * calling a function (at its global entry point),
+ * the caller must ensure r12 holds the entry point
+ * address (so that the function can quickly
+ * establish addressability).
+ */
+ regs->gpr[12] = start;
+ /* Make sure that's restored on entry to userspace. */
+ set_thread_flag(TIF_RESTOREALL);
+ } else {
+ unsigned long toc;
+
+ /* start is a relocated pointer to the function
+ * descriptor for the elf _start routine. The first
+ * entry in the function descriptor is the entry
+ * address of _start and the second entry is the TOC
+ * value we need to use.
+ */
+ __get_user(entry, (unsigned long __user *)start);
+ __get_user(toc, (unsigned long __user *)start+1);
+
+ /* Check whether the e_entry function descriptor entries
+ * need to be relocated before we can use them.
+ */
+ if (load_addr != 0) {
+ entry += load_addr;
+ toc += load_addr;
+ }
+ regs->gpr[2] = toc;
}
regs->nip = entry;
- regs->gpr[2] = toc;
regs->msr = MSR_USER64;
} else {
regs->nip = start;
@@ -849,17 +1285,16 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
regs->msr = MSR_USER32;
}
#endif
-
discard_lazy_cpu_state();
#ifdef CONFIG_VSX
current->thread.used_vsr = 0;
#endif
- memset(current->thread.fpr, 0, sizeof(current->thread.fpr));
- current->thread.fpscr.val = 0;
+ memset(&current->thread.fp_state, 0, sizeof(current->thread.fp_state));
+ current->thread.fp_save_area = NULL;
#ifdef CONFIG_ALTIVEC
- memset(current->thread.vr, 0, sizeof(current->thread.vr));
- memset(&current->thread.vscr, 0, sizeof(current->thread.vscr));
- current->thread.vscr.u[3] = 0x00010000; /* Java mode disabled */
+ memset(&current->thread.vr_state, 0, sizeof(current->thread.vr_state));
+ current->thread.vr_state.vscr.u[3] = 0x00010000; /* Java mode disabled */
+ current->thread.vr_save_area = NULL;
current->thread.vrsave = 0;
current->thread.used_vr = 0;
#endif /* CONFIG_ALTIVEC */
@@ -869,6 +1304,13 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
current->thread.spefscr = 0;
current->thread.used_spe = 0;
#endif /* CONFIG_SPE */
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ if (cpu_has_feature(CPU_FTR_TM))
+ regs->msr |= MSR_TM;
+ current->thread.tm_tfhar = 0;
+ current->thread.tm_texasr = 0;
+ current->thread.tm_tfiar = 0;
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
}
#define PR_FP_ALL_EXCEPT (PR_FP_EXC_DIV | PR_FP_EXC_OVF | PR_FP_EXC_UND \
@@ -885,6 +1327,19 @@ int set_fpexc_mode(struct task_struct *tsk, unsigned int val)
if (val & PR_FP_EXC_SW_ENABLE) {
#ifdef CONFIG_SPE
if (cpu_has_feature(CPU_FTR_SPE)) {
+ /*
+ * When the sticky exception bits are set
+ * directly by userspace, it must call prctl
+ * with PR_GET_FPEXC (with PR_FP_EXC_SW_ENABLE
+ * in the existing prctl settings) or
+ * PR_SET_FPEXC (with PR_FP_EXC_SW_ENABLE in
+ * the bits being set). <fenv.h> functions
+ * saving and restoring the whole
+ * floating-point environment need to do so
+ * anyway to restore the prctl settings from
+ * the saved environment.
+ */
+ tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR);
tsk->thread.fpexc_mode = val &
(PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT);
return 0;
@@ -916,9 +1371,22 @@ int get_fpexc_mode(struct task_struct *tsk, unsigned long adr)
if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE)
#ifdef CONFIG_SPE
- if (cpu_has_feature(CPU_FTR_SPE))
+ if (cpu_has_feature(CPU_FTR_SPE)) {
+ /*
+ * When the sticky exception bits are set
+ * directly by userspace, it must call prctl
+ * with PR_GET_FPEXC (with PR_FP_EXC_SW_ENABLE
+ * in the existing prctl settings) or
+ * PR_SET_FPEXC (with PR_FP_EXC_SW_ENABLE in
+ * the bits being set). <fenv.h> functions
+ * saving and restoring the whole
+ * floating-point environment need to do so
+ * anyway to restore the prctl settings from
+ * the saved environment.
+ */
+ tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR);
val = tsk->thread.fpexc_mode;
- else
+ } else
return -EINVAL;
#else
return -EINVAL;
@@ -983,64 +1451,6 @@ int get_unalign_ctl(struct task_struct *tsk, unsigned long adr)
return put_user(tsk->thread.align_ctl, (unsigned int __user *)adr);
}
-#define TRUNC_PTR(x) ((typeof(x))(((unsigned long)(x)) & 0xffffffff))
-
-int sys_clone(unsigned long clone_flags, unsigned long usp,
- int __user *parent_tidp, void __user *child_threadptr,
- int __user *child_tidp, int p6,
- struct pt_regs *regs)
-{
- CHECK_FULL_REGS(regs);
- if (usp == 0)
- usp = regs->gpr[1]; /* stack pointer for child */
-#ifdef CONFIG_PPC64
- if (is_32bit_task()) {
- parent_tidp = TRUNC_PTR(parent_tidp);
- child_tidp = TRUNC_PTR(child_tidp);
- }
-#endif
- return do_fork(clone_flags, usp, regs, 0, parent_tidp, child_tidp);
-}
-
-int sys_fork(unsigned long p1, unsigned long p2, unsigned long p3,
- unsigned long p4, unsigned long p5, unsigned long p6,
- struct pt_regs *regs)
-{
- CHECK_FULL_REGS(regs);
- return do_fork(SIGCHLD, regs->gpr[1], regs, 0, NULL, NULL);
-}
-
-int sys_vfork(unsigned long p1, unsigned long p2, unsigned long p3,
- unsigned long p4, unsigned long p5, unsigned long p6,
- struct pt_regs *regs)
-{
- CHECK_FULL_REGS(regs);
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->gpr[1],
- regs, 0, NULL, NULL);
-}
-
-int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2,
- unsigned long a3, unsigned long a4, unsigned long a5,
- struct pt_regs *regs)
-{
- int error;
- char *filename;
-
- filename = getname((const char __user *) a0);
- error = PTR_ERR(filename);
- if (IS_ERR(filename))
- goto out;
- flush_fp_to_thread(current);
- flush_altivec_to_thread(current);
- flush_spe_to_thread(current);
- error = do_execve(filename,
- (const char __user *const __user *) a1,
- (const char __user *const __user *) a2, regs);
- putname(filename);
-out:
- return error;
-}
-
static inline int valid_irq_stack(unsigned long sp, struct task_struct *p,
unsigned long nbytes)
{
@@ -1176,72 +1586,33 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
} while (count++ < kstack_depth_to_print);
}
-void dump_stack(void)
-{
- show_stack(current, NULL);
-}
-EXPORT_SYMBOL(dump_stack);
-
#ifdef CONFIG_PPC64
-void ppc64_runlatch_on(void)
+/* Called with hard IRQs off */
+void notrace __ppc64_runlatch_on(void)
{
+ struct thread_info *ti = current_thread_info();
unsigned long ctrl;
- if (cpu_has_feature(CPU_FTR_CTRL) && !test_thread_flag(TIF_RUNLATCH)) {
- HMT_medium();
-
- ctrl = mfspr(SPRN_CTRLF);
- ctrl |= CTRL_RUNLATCH;
- mtspr(SPRN_CTRLT, ctrl);
+ ctrl = mfspr(SPRN_CTRLF);
+ ctrl |= CTRL_RUNLATCH;
+ mtspr(SPRN_CTRLT, ctrl);
- set_thread_flag(TIF_RUNLATCH);
- }
+ ti->local_flags |= _TLF_RUNLATCH;
}
-void __ppc64_runlatch_off(void)
+/* Called with hard IRQs off */
+void notrace __ppc64_runlatch_off(void)
{
+ struct thread_info *ti = current_thread_info();
unsigned long ctrl;
- HMT_medium();
-
- clear_thread_flag(TIF_RUNLATCH);
+ ti->local_flags &= ~_TLF_RUNLATCH;
ctrl = mfspr(SPRN_CTRLF);
ctrl &= ~CTRL_RUNLATCH;
mtspr(SPRN_CTRLT, ctrl);
}
-#endif
-
-#if THREAD_SHIFT < PAGE_SHIFT
-
-static struct kmem_cache *thread_info_cache;
-
-struct thread_info *alloc_thread_info(struct task_struct *tsk)
-{
- struct thread_info *ti;
-
- ti = kmem_cache_alloc(thread_info_cache, GFP_KERNEL);
- if (unlikely(ti == NULL))
- return NULL;
-#ifdef CONFIG_DEBUG_STACK_USAGE
- memset(ti, 0, THREAD_SIZE);
-#endif
- return ti;
-}
-
-void free_thread_info(struct thread_info *ti)
-{
- kmem_cache_free(thread_info_cache, ti);
-}
-
-void thread_info_cache_init(void)
-{
- thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
- THREAD_SIZE, 0, NULL);
- BUG_ON(thread_info_cache == NULL);
-}
-
-#endif /* THREAD_SHIFT < PAGE_SHIFT */
+#endif /* CONFIG_PPC64 */
unsigned long arch_align_stack(unsigned long sp)
{
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 9e3132db718..b694b073097 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -27,11 +27,13 @@
#include <linux/delay.h>
#include <linux/initrd.h>
#include <linux/bitops.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kexec.h>
-#include <linux/debugfs.h>
#include <linux/irq.h>
#include <linux/memblock.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
#include <asm/prom.h>
#include <asm/rtas.h>
@@ -41,7 +43,6 @@
#include <asm/io.h>
#include <asm/kdump.h>
#include <asm/smp.h>
-#include <asm/system.h>
#include <asm/mmu.h>
#include <asm/paca.h>
#include <asm/pgtable.h>
@@ -50,10 +51,12 @@
#include <asm/btext.h>
#include <asm/sections.h>
#include <asm/machdep.h>
-#include <asm/pSeries_reconfig.h>
#include <asm/pci-bridge.h>
-#include <asm/phyp_dump.h>
#include <asm/kexec.h>
+#include <asm/opal.h>
+#include <asm/fadump.h>
+#include <asm/debug.h>
+
#include <mm/mmu_decl.h>
#ifdef DEBUG
@@ -68,6 +71,8 @@ int __initdata iommu_force_on;
unsigned long tce_alloc_start, tce_alloc_end;
u64 ppc64_rma_size;
#endif
+static phys_addr_t first_memblock_size;
+static int __initdata boot_cpu_count;
static int __init early_parse_mem(char *p)
{
@@ -75,17 +80,35 @@ static int __init early_parse_mem(char *p)
return 1;
memory_limit = PAGE_ALIGN(memparse(p, &p));
- DBG("memory limit = 0x%llx\n", (unsigned long long)memory_limit);
+ DBG("memory limit = 0x%llx\n", memory_limit);
return 0;
}
early_param("mem", early_parse_mem);
+/*
+ * overlaps_initrd - check for overlap with page aligned extension of
+ * initrd.
+ */
+static inline int overlaps_initrd(unsigned long start, unsigned long size)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+ if (!initrd_start)
+ return 0;
+
+ return (start + size) > _ALIGN_DOWN(initrd_start, PAGE_SIZE) &&
+ start <= _ALIGN_UP(initrd_end, PAGE_SIZE);
+#else
+ return 0;
+#endif
+}
+
/**
* move_device_tree - move tree to an unused area, if needed.
*
* The device tree may be allocated beyond our memory limit, or inside the
- * crash kernel region for kdump. If so, move it out of the way.
+ * crash kernel region for kdump, or within the page aligned range of initrd.
+ * If so, move it out of the way.
*/
static void __init move_device_tree(void)
{
@@ -95,13 +118,14 @@ static void __init move_device_tree(void)
DBG("-> move_device_tree\n");
start = __pa(initial_boot_params);
- size = be32_to_cpu(initial_boot_params->totalsize);
+ size = fdt_totalsize(initial_boot_params);
- if ((memory_limit && (start + size) > memory_limit) ||
- overlaps_crashkernel(start, size)) {
+ if ((memory_limit && (start + size) > PHYSICAL_START + memory_limit) ||
+ overlaps_crashkernel(start, size) ||
+ overlaps_initrd(start, size)) {
p = __va(memblock_alloc(size, PAGE_SIZE));
memcpy(p, initial_boot_params, size);
- initial_boot_params = (struct boot_param_header *)p;
+ initial_boot_params = p;
DBG("Moved device tree to 0x%p\n", p);
}
@@ -123,22 +147,23 @@ static void __init move_device_tree(void)
*/
static struct ibm_pa_feature {
unsigned long cpu_features; /* CPU_FTR_xxx bit */
+ unsigned long mmu_features; /* MMU_FTR_xxx bit */
unsigned int cpu_user_ftrs; /* PPC_FEATURE_xxx bit */
unsigned char pabyte; /* byte number in ibm,pa-features */
unsigned char pabit; /* bit number (big-endian) */
unsigned char invert; /* if 1, pa bit set => clear feature */
} ibm_pa_features[] __initdata = {
- {0, PPC_FEATURE_HAS_MMU, 0, 0, 0},
- {0, PPC_FEATURE_HAS_FPU, 0, 1, 0},
- {CPU_FTR_SLB, 0, 0, 2, 0},
- {CPU_FTR_CTRL, 0, 0, 3, 0},
- {CPU_FTR_NOEXECUTE, 0, 0, 6, 0},
- {CPU_FTR_NODSISRALIGN, 0, 1, 1, 1},
- {CPU_FTR_CI_LARGE_PAGE, 0, 1, 2, 0},
+ {0, 0, PPC_FEATURE_HAS_MMU, 0, 0, 0},
+ {0, 0, PPC_FEATURE_HAS_FPU, 0, 1, 0},
+ {0, MMU_FTR_SLB, 0, 0, 2, 0},
+ {CPU_FTR_CTRL, 0, 0, 0, 3, 0},
+ {CPU_FTR_NOEXECUTE, 0, 0, 0, 6, 0},
+ {CPU_FTR_NODSISRALIGN, 0, 0, 1, 1, 1},
+ {0, MMU_FTR_CI_LARGE_PAGE, 0, 1, 2, 0},
{CPU_FTR_REAL_LE, PPC_FEATURE_TRUE_LE, 5, 0, 0},
};
-static void __init scan_features(unsigned long node, unsigned char *ftrs,
+static void __init scan_features(unsigned long node, const unsigned char *ftrs,
unsigned long tablelen,
struct ibm_pa_feature *fp,
unsigned long ft_size)
@@ -166,17 +191,19 @@ static void __init scan_features(unsigned long node, unsigned char *ftrs,
if (bit ^ fp->invert) {
cur_cpu_spec->cpu_features |= fp->cpu_features;
cur_cpu_spec->cpu_user_features |= fp->cpu_user_ftrs;
+ cur_cpu_spec->mmu_features |= fp->mmu_features;
} else {
cur_cpu_spec->cpu_features &= ~fp->cpu_features;
cur_cpu_spec->cpu_user_features &= ~fp->cpu_user_ftrs;
+ cur_cpu_spec->mmu_features &= ~fp->mmu_features;
}
}
}
static void __init check_cpu_pa_features(unsigned long node)
{
- unsigned char *pa_ftrs;
- unsigned long tablelen;
+ const unsigned char *pa_ftrs;
+ int tablelen;
pa_ftrs = of_get_flat_dt_prop(node, "ibm,pa-features", &tablelen);
if (pa_ftrs == NULL)
@@ -189,16 +216,16 @@ static void __init check_cpu_pa_features(unsigned long node)
#ifdef CONFIG_PPC_STD_MMU_64
static void __init check_cpu_slb_size(unsigned long node)
{
- u32 *slb_size_ptr;
+ const __be32 *slb_size_ptr;
slb_size_ptr = of_get_flat_dt_prop(node, "slb-size", NULL);
if (slb_size_ptr != NULL) {
- mmu_slb_size = *slb_size_ptr;
+ mmu_slb_size = be32_to_cpup(slb_size_ptr);
return;
}
slb_size_ptr = of_get_flat_dt_prop(node, "ibm,slb-size", NULL);
if (slb_size_ptr != NULL) {
- mmu_slb_size = *slb_size_ptr;
+ mmu_slb_size = be32_to_cpup(slb_size_ptr);
}
}
#else
@@ -230,7 +257,7 @@ static struct feature_property {
static inline void identical_pvr_fixup(unsigned long node)
{
unsigned int pvr;
- char *model = of_get_flat_dt_prop(node, "model", NULL);
+ const char *model = of_get_flat_dt_prop(node, "model", NULL);
/*
* Since 440GR(x)/440EP(x) processors have the same pvr,
@@ -253,11 +280,11 @@ static void __init check_cpu_feature_properties(unsigned long node)
{
unsigned long i;
struct feature_property *fp = feature_properties;
- const u32 *prop;
+ const __be32 *prop;
for (i = 0; i < ARRAY_SIZE(feature_properties); ++i, ++fp) {
prop = of_get_flat_dt_prop(node, fp->name, NULL);
- if (prop && *prop >= fp->min_value) {
+ if (prop && be32_to_cpup(prop) >= fp->min_value) {
cur_cpu_spec->cpu_features |= fp->cpu_feature;
cur_cpu_spec->cpu_user_features |= fp->cpu_user_ftr;
}
@@ -268,13 +295,13 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
const char *uname, int depth,
void *data)
{
- static int logical_cpuid = 0;
- char *type = of_get_flat_dt_prop(node, "device_type", NULL);
- const u32 *prop;
- const u32 *intserv;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+ const __be32 *intserv;
int i, nthreads;
- unsigned long len;
- int found = 0;
+ int len;
+ int found = -1;
+ int found_thread = 0;
/* We are scanning "cpu" nodes only */
if (type == NULL || strcmp(type, "cpu") != 0)
@@ -298,11 +325,11 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
* version 2 of the kexec param format adds the phys cpuid of
* booted proc.
*/
- if (initial_boot_params && initial_boot_params->version >= 2) {
- if (intserv[i] ==
- initial_boot_params->boot_cpuid_phys) {
- found = 1;
- break;
+ if (fdt_version(initial_boot_params) >= 2) {
+ if (be32_to_cpu(intserv[i]) ==
+ fdt_boot_cpuid_phys(initial_boot_params)) {
+ found = boot_cpu_count;
+ found_thread = i;
}
} else {
/*
@@ -311,63 +338,61 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
* off secondary threads.
*/
if (of_get_flat_dt_prop(node,
- "linux,boot-cpu", NULL) != NULL) {
- found = 1;
- break;
- }
+ "linux,boot-cpu", NULL) != NULL)
+ found = boot_cpu_count;
}
-
#ifdef CONFIG_SMP
/* logical cpu id is always 0 on UP kernels */
- logical_cpuid++;
+ boot_cpu_count++;
#endif
}
- if (found) {
- DBG("boot cpu: logical %d physical %d\n", logical_cpuid,
- intserv[i]);
- boot_cpuid = logical_cpuid;
- set_hard_smp_processor_id(boot_cpuid, intserv[i]);
+ /* Not the boot CPU */
+ if (found < 0)
+ return 0;
- /*
- * PAPR defines "logical" PVR values for cpus that
- * meet various levels of the architecture:
- * 0x0f000001 Architecture version 2.04
- * 0x0f000002 Architecture version 2.05
- * If the cpu-version property in the cpu node contains
- * such a value, we call identify_cpu again with the
- * logical PVR value in order to use the cpu feature
- * bits appropriate for the architecture level.
- *
- * A POWER6 partition in "POWER6 architected" mode
- * uses the 0x0f000002 PVR value; in POWER5+ mode
- * it uses 0x0f000001.
- */
- prop = of_get_flat_dt_prop(node, "cpu-version", NULL);
- if (prop && (*prop & 0xff000000) == 0x0f000000)
- identify_cpu(0, *prop);
+ DBG("boot cpu: logical %d physical %d\n", found,
+ be32_to_cpu(intserv[found_thread]));
+ boot_cpuid = found;
+ set_hard_smp_processor_id(found, be32_to_cpu(intserv[found_thread]));
- identical_pvr_fixup(node);
- }
+ /*
+ * PAPR defines "logical" PVR values for cpus that
+ * meet various levels of the architecture:
+ * 0x0f000001 Architecture version 2.04
+ * 0x0f000002 Architecture version 2.05
+ * If the cpu-version property in the cpu node contains
+ * such a value, we call identify_cpu again with the
+ * logical PVR value in order to use the cpu feature
+ * bits appropriate for the architecture level.
+ *
+ * A POWER6 partition in "POWER6 architected" mode
+ * uses the 0x0f000002 PVR value; in POWER5+ mode
+ * it uses 0x0f000001.
+ */
+ prop = of_get_flat_dt_prop(node, "cpu-version", NULL);
+ if (prop && (be32_to_cpup(prop) & 0xff000000) == 0x0f000000)
+ identify_cpu(0, be32_to_cpup(prop));
+
+ identical_pvr_fixup(node);
check_cpu_feature_properties(node);
check_cpu_pa_features(node);
check_cpu_slb_size(node);
-#ifdef CONFIG_PPC_PSERIES
+#ifdef CONFIG_PPC64
if (nthreads > 1)
cur_cpu_spec->cpu_features |= CPU_FTR_SMT;
else
cur_cpu_spec->cpu_features &= ~CPU_FTR_SMT;
#endif
-
return 0;
}
int __init early_init_dt_scan_chosen_ppc(unsigned long node, const char *uname,
int depth, void *data)
{
- unsigned long *lprop;
+ const unsigned long *lprop; /* All these set by kernel, so no need to convert endian */
/* Use common scan routine to determine if this is the chosen node */
if (early_init_dt_scan_chosen(node, uname, depth, data) == 0)
@@ -418,8 +443,9 @@ int __init early_init_dt_scan_chosen_ppc(unsigned long node, const char *uname,
*/
static int __init early_init_dt_scan_drconf_memory(unsigned long node)
{
- __be32 *dm, *ls, *usm;
- unsigned long l, n, flags;
+ const __be32 *dm, *ls, *usm;
+ int l;
+ unsigned long n, flags;
u64 base, size, memblock_size;
unsigned int is_kexec_kdump = 0, rngs;
@@ -432,7 +458,7 @@ static int __init early_init_dt_scan_drconf_memory(unsigned long node)
if (dm == NULL || l < sizeof(__be32))
return 0;
- n = *dm++; /* number of entries */
+ n = of_read_number(dm++, 1); /* number of entries */
if (l < (n * (dt_root_addr_cells + 4) + 1) * sizeof(__be32))
return 0;
@@ -444,7 +470,7 @@ static int __init early_init_dt_scan_drconf_memory(unsigned long node)
for (; n != 0; --n) {
base = dt_mem_next_cell(dt_root_addr_cells, &dm);
- flags = dm[3];
+ flags = of_read_number(&dm[3], 1);
/* skip DRC index, pad, assoc. list index, flags */
dm += 4;
/* skip this block if the reserved bit is set in flags (0x80)
@@ -499,6 +525,20 @@ static int __init early_init_dt_scan_memory_ppc(unsigned long node,
return early_init_dt_scan_memory(node, uname, depth, data);
}
+/*
+ * For a relocatable kernel, we need to get the memstart_addr first,
+ * then use it to calculate the virtual kernel start address. This has
+ * to happen at a very early stage (before machine_init). In this case,
+ * we just want to get the memstart_address and would not like to mess the
+ * memblock at this stage. So introduce a variable to skip the memblock_add()
+ * for this reason.
+ */
+#ifdef CONFIG_RELOCATABLE
+static int add_mem_to_memblock = 1;
+#else
+#define add_mem_to_memblock 1
+#endif
+
void __init early_init_dt_add_memory_arch(u64 base, u64 size)
{
#ifdef CONFIG_PPC64
@@ -509,50 +549,69 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
size = 0x80000000ul - base;
}
#endif
-
- /* First MEMBLOCK added, do some special initializations */
- if (memstart_addr == ~(phys_addr_t)0)
- setup_initial_memory_limit(base, size);
- memstart_addr = min((u64)memstart_addr, base);
+ /* Keep track of the beginning of memory -and- the size of
+ * the very first block in the device-tree as it represents
+ * the RMA on ppc64 server
+ */
+ if (base < memstart_addr) {
+ memstart_addr = base;
+ first_memblock_size = size;
+ }
/* Add the chunk to the MEMBLOCK list */
- memblock_add(base, size);
+ if (add_mem_to_memblock)
+ memblock_add(base, size);
}
-u64 __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
+static void __init early_reserve_mem_dt(void)
{
- return memblock_alloc(size, align);
-}
+ unsigned long i, dt_root;
+ int len;
+ const __be32 *prop;
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init early_init_dt_setup_initrd_arch(unsigned long start,
- unsigned long end)
-{
- initrd_start = (unsigned long)__va(start);
- initrd_end = (unsigned long)__va(end);
- initrd_below_start_ok = 1;
+ early_init_fdt_scan_reserved_mem();
+
+ dt_root = of_get_flat_dt_root();
+
+ prop = of_get_flat_dt_prop(dt_root, "reserved-ranges", &len);
+
+ if (!prop)
+ return;
+
+ DBG("Found new-style reserved-ranges\n");
+
+ /* Each reserved range is an (address,size) pair, 2 cells each,
+ * totalling 4 cells per range. */
+ for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
+ u64 base, size;
+
+ base = of_read_number(prop + (i * 4) + 0, 2);
+ size = of_read_number(prop + (i * 4) + 2, 2);
+
+ if (size) {
+ DBG("reserving: %llx -> %llx\n", base, size);
+ memblock_reserve(base, size);
+ }
+ }
}
-#endif
static void __init early_reserve_mem(void)
{
- u64 base, size;
- u64 *reserve_map;
- unsigned long self_base;
- unsigned long self_size;
+ __be64 *reserve_map;
- reserve_map = (u64 *)(((unsigned long)initial_boot_params) +
- initial_boot_params->off_mem_rsvmap);
+ reserve_map = (__be64 *)(((unsigned long)initial_boot_params) +
+ fdt_off_mem_rsvmap(initial_boot_params));
- /* before we do anything, lets reserve the dt blob */
- self_base = __pa((unsigned long)initial_boot_params);
- self_size = initial_boot_params->totalsize;
- memblock_reserve(self_base, self_size);
+ /* Look for the new "reserved-regions" property in the DT */
+ early_reserve_mem_dt();
#ifdef CONFIG_BLK_DEV_INITRD
- /* then reserve the initrd, if any */
- if (initrd_start && (initrd_end > initrd_start))
- memblock_reserve(__pa(initrd_start), initrd_end - initrd_start);
+ /* Then reserve the initrd, if any */
+ if (initrd_start && (initrd_end > initrd_start)) {
+ memblock_reserve(_ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE),
+ _ALIGN_UP(initrd_end, PAGE_SIZE) -
+ _ALIGN_DOWN(initrd_start, PAGE_SIZE));
+ }
#endif /* CONFIG_BLK_DEV_INITRD */
#ifdef CONFIG_PPC32
@@ -560,113 +619,24 @@ static void __init early_reserve_mem(void)
* Handle the case where we might be booting from an old kexec
* image that setup the mem_rsvmap as pairs of 32-bit values
*/
- if (*reserve_map > 0xffffffffull) {
+ if (be64_to_cpup(reserve_map) > 0xffffffffull) {
u32 base_32, size_32;
- u32 *reserve_map_32 = (u32 *)reserve_map;
+ __be32 *reserve_map_32 = (__be32 *)reserve_map;
+
+ DBG("Found old 32-bit reserve map\n");
while (1) {
- base_32 = *(reserve_map_32++);
- size_32 = *(reserve_map_32++);
+ base_32 = be32_to_cpup(reserve_map_32++);
+ size_32 = be32_to_cpup(reserve_map_32++);
if (size_32 == 0)
break;
- /* skip if the reservation is for the blob */
- if (base_32 == self_base && size_32 == self_size)
- continue;
DBG("reserving: %x -> %x\n", base_32, size_32);
memblock_reserve(base_32, size_32);
}
return;
}
#endif
- while (1) {
- base = *(reserve_map++);
- size = *(reserve_map++);
- if (size == 0)
- break;
- DBG("reserving: %llx -> %llx\n", base, size);
- memblock_reserve(base, size);
- }
-}
-
-#ifdef CONFIG_PHYP_DUMP
-/**
- * phyp_dump_calculate_reserve_size() - reserve variable boot area 5% or arg
- *
- * Function to find the largest size we need to reserve
- * during early boot process.
- *
- * It either looks for boot param and returns that OR
- * returns larger of 256 or 5% rounded down to multiples of 256MB.
- *
- */
-static inline unsigned long phyp_dump_calculate_reserve_size(void)
-{
- unsigned long tmp;
-
- if (phyp_dump_info->reserve_bootvar)
- return phyp_dump_info->reserve_bootvar;
-
- /* divide by 20 to get 5% of value */
- tmp = memblock_end_of_DRAM();
- do_div(tmp, 20);
-
- /* round it down in multiples of 256 */
- tmp = tmp & ~0x0FFFFFFFUL;
-
- return (tmp > PHYP_DUMP_RMR_END ? tmp : PHYP_DUMP_RMR_END);
-}
-
-/**
- * phyp_dump_reserve_mem() - reserve all not-yet-dumped mmemory
- *
- * This routine may reserve memory regions in the kernel only
- * if the system is supported and a dump was taken in last
- * boot instance or if the hardware is supported and the
- * scratch area needs to be setup. In other instances it returns
- * without reserving anything. The memory in case of dump being
- * active is freed when the dump is collected (by userland tools).
- */
-static void __init phyp_dump_reserve_mem(void)
-{
- unsigned long base, size;
- unsigned long variable_reserve_size;
-
- if (!phyp_dump_info->phyp_dump_configured) {
- printk(KERN_ERR "Phyp-dump not supported on this hardware\n");
- return;
- }
-
- if (!phyp_dump_info->phyp_dump_at_boot) {
- printk(KERN_INFO "Phyp-dump disabled at boot time\n");
- return;
- }
-
- variable_reserve_size = phyp_dump_calculate_reserve_size();
-
- if (phyp_dump_info->phyp_dump_is_active) {
- /* Reserve *everything* above RMR.Area freed by userland tools*/
- base = variable_reserve_size;
- size = memblock_end_of_DRAM() - base;
-
- /* XXX crashed_ram_end is wrong, since it may be beyond
- * the memory_limit, it will need to be adjusted. */
- memblock_reserve(base, size);
-
- phyp_dump_info->init_reserve_start = base;
- phyp_dump_info->init_reserve_size = size;
- } else {
- size = phyp_dump_info->cpu_state_size +
- phyp_dump_info->hpte_region_size +
- variable_reserve_size;
- base = memblock_end_of_DRAM() - size;
- memblock_reserve(base, size);
- phyp_dump_info->init_reserve_start = base;
- phyp_dump_info->init_reserve_size = size;
- }
}
-#else
-static inline void __init phyp_dump_reserve_mem(void) {}
-#endif /* CONFIG_PHYP_DUMP && CONFIG_PPC_RTAS */
void __init early_init_devtree(void *params)
{
@@ -682,20 +652,23 @@ void __init early_init_devtree(void *params)
of_scan_flat_dt(early_init_dt_scan_rtas, NULL);
#endif
-#ifdef CONFIG_PHYP_DUMP
- /* scan tree to see if dump occured during last boot */
- of_scan_flat_dt(early_init_dt_scan_phyp_dump, NULL);
+#ifdef CONFIG_PPC_POWERNV
+ /* Some machines might need OPAL info for debugging, grab it now. */
+ of_scan_flat_dt(early_init_dt_scan_opal, NULL);
+#endif
+
+#ifdef CONFIG_FA_DUMP
+ /* scan tree to see if dump is active during last boot */
+ of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL);
#endif
/* Retrieve various informations from the /chosen node of the
* device-tree, including the platform type, initrd location and
* size, TCE reserve, and more ...
*/
- of_scan_flat_dt(early_init_dt_scan_chosen_ppc, NULL);
+ of_scan_flat_dt(early_init_dt_scan_chosen_ppc, cmd_line);
/* Scan memory nodes and rebuild MEMBLOCKs */
- memblock_init();
-
of_scan_flat_dt(early_init_dt_scan_root, NULL);
of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL);
@@ -703,30 +676,34 @@ void __init early_init_devtree(void *params)
strlcpy(boot_command_line, cmd_line, COMMAND_LINE_SIZE);
parse_early_param();
+ /* make sure we've parsed cmdline for mem= before this */
+ if (memory_limit)
+ first_memblock_size = min_t(u64, first_memblock_size, memory_limit);
+ setup_initial_memory_limit(memstart_addr, first_memblock_size);
/* Reserve MEMBLOCK regions used by kernel, initrd, dt, etc... */
memblock_reserve(PHYSICAL_START, __pa(klimit) - PHYSICAL_START);
/* If relocatable, reserve first 32k for interrupt vectors etc. */
if (PHYSICAL_START > MEMORY_START)
memblock_reserve(MEMORY_START, 0x8000);
reserve_kdump_trampoline();
- reserve_crashkernel();
+#ifdef CONFIG_FA_DUMP
+ /*
+ * If we fail to reserve memory for firmware-assisted dump then
+ * fallback to kexec based kdump.
+ */
+ if (fadump_reserve_mem() == 0)
+#endif
+ reserve_crashkernel();
early_reserve_mem();
- phyp_dump_reserve_mem();
-
- limit = memory_limit;
- if (! limit) {
- phys_addr_t memsize;
-
- /* Ensure that total memory size is page-aligned, because
- * otherwise mark_bootmem() gets upset. */
- memblock_analyze();
- memsize = memblock_phys_mem_size();
- if ((memsize & PAGE_MASK) != memsize)
- limit = memsize & PAGE_MASK;
- }
+
+ /*
+ * Ensure that total memory size is page-aligned, because otherwise
+ * mark_bootmem() gets upset.
+ */
+ limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE);
memblock_enforce_memory_limit(limit);
- memblock_analyze();
+ memblock_allow_resize();
memblock_dump_all();
DBG("Phys. mem: %llx\n", memblock_phys_mem_size());
@@ -739,14 +716,54 @@ void __init early_init_devtree(void *params)
DBG("Scanning CPUs ...\n");
- /* Retreive CPU related informations from the flat tree
+ /* Retrieve CPU related informations from the flat tree
* (altivec support, boot CPU ID, ...)
*/
of_scan_flat_dt(early_init_dt_scan_cpus, NULL);
+ if (boot_cpuid < 0) {
+ printk("Failed to indentify boot CPU !\n");
+ BUG();
+ }
+
+#if defined(CONFIG_SMP) && defined(CONFIG_PPC64)
+ /* We'll later wait for secondaries to check in; there are
+ * NCPUS-1 non-boot CPUs :-)
+ */
+ spinning_secondaries = boot_cpu_count - 1;
+#endif
+
+#ifdef CONFIG_PPC_POWERNV
+ /* Scan and build the list of machine check recoverable ranges */
+ of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL);
+#endif
DBG(" <- early_init_devtree()\n");
}
+#ifdef CONFIG_RELOCATABLE
+/*
+ * This function run before early_init_devtree, so we have to init
+ * initial_boot_params.
+ */
+void __init early_get_first_memblock_info(void *params, phys_addr_t *size)
+{
+ /* Setup flat device-tree pointer */
+ initial_boot_params = params;
+
+ /*
+ * Scan the memory nodes and set add_mem_to_memblock to 0 to avoid
+ * mess the memblock.
+ */
+ add_mem_to_memblock = 0;
+ of_scan_flat_dt(early_init_dt_scan_root, NULL);
+ of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL);
+ add_mem_to_memblock = 1;
+
+ if (size)
+ *size = first_memblock_size;
+}
+#endif
+
/*******
*
* New implementation of the OF "find" APIs, return a refcounted
@@ -759,35 +776,50 @@ void __init early_init_devtree(void *params)
*******/
/**
- * of_find_next_cache_node - Find a node's subsidiary cache
- * @np: node of type "cpu" or "cache"
+ * of_get_ibm_chip_id - Returns the IBM "chip-id" of a device
+ * @np: device node of the device
*
- * Returns a node pointer with refcount incremented, use
- * of_node_put() on it when done. Caller should hold a reference
- * to np.
+ * This looks for a property "ibm,chip-id" in the node or any
+ * of its parents and returns its content, or -1 if it cannot
+ * be found.
*/
-struct device_node *of_find_next_cache_node(struct device_node *np)
+int of_get_ibm_chip_id(struct device_node *np)
{
- struct device_node *child;
- const phandle *handle;
-
- handle = of_get_property(np, "l2-cache", NULL);
- if (!handle)
- handle = of_get_property(np, "next-level-cache", NULL);
+ of_node_get(np);
+ while(np) {
+ struct device_node *old = np;
+ const __be32 *prop;
+
+ prop = of_get_property(np, "ibm,chip-id", NULL);
+ if (prop) {
+ of_node_put(np);
+ return be32_to_cpup(prop);
+ }
+ np = of_get_parent(np);
+ of_node_put(old);
+ }
+ return -1;
+}
- if (handle)
- return of_find_node_by_phandle(*handle);
+/**
+ * cpu_to_chip_id - Return the cpus chip-id
+ * @cpu: The logical cpu number.
+ *
+ * Return the value of the ibm,chip-id property corresponding to the given
+ * logical cpu number. If the chip-id can not be found, returns -1.
+ */
+int cpu_to_chip_id(int cpu)
+{
+ struct device_node *np;
- /* OF on pmac has nodes instead of properties named "l2-cache"
- * beneath CPU nodes.
- */
- if (!strcmp(np->type, "cpu"))
- for_each_child_of_node(np, child)
- if (!strcmp(child->type, "cache"))
- return child;
+ np = of_get_cpu_node(cpu, NULL);
+ if (!np)
+ return -1;
- return NULL;
+ of_node_put(np);
+ return of_get_ibm_chip_id(np);
}
+EXPORT_SYMBOL(cpu_to_chip_id);
#ifdef CONFIG_PPC_PSERIES
/*
@@ -835,18 +867,16 @@ static int prom_reconfig_notifier(struct notifier_block *nb,
int err;
switch (action) {
- case PSERIES_RECONFIG_ADD:
+ case OF_RECONFIG_ATTACH_NODE:
err = of_finish_dynamic_node(node);
- if (err < 0) {
+ if (err < 0)
printk(KERN_ERR "finish_node returned %d\n", err);
- err = NOTIFY_BAD;
- }
break;
default:
- err = NOTIFY_DONE;
+ err = 0;
break;
}
- return err;
+ return notifier_from_errno(err);
}
static struct notifier_block prom_reconfig_nb = {
@@ -856,71 +886,12 @@ static struct notifier_block prom_reconfig_nb = {
static int __init prom_reconfig_setup(void)
{
- return pSeries_reconfig_notifier_register(&prom_reconfig_nb);
+ return of_reconfig_notifier_register(&prom_reconfig_nb);
}
__initcall(prom_reconfig_setup);
#endif
-/* Find the device node for a given logical cpu number, also returns the cpu
- * local thread number (index in ibm,interrupt-server#s) if relevant and
- * asked for (non NULL)
- */
-struct device_node *of_get_cpu_node(int cpu, unsigned int *thread)
+bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
{
- int hardid;
- struct device_node *np;
-
- hardid = get_hard_smp_processor_id(cpu);
-
- for_each_node_by_type(np, "cpu") {
- const u32 *intserv;
- unsigned int plen, t;
-
- /* Check for ibm,ppc-interrupt-server#s. If it doesn't exist
- * fallback to "reg" property and assume no threads
- */
- intserv = of_get_property(np, "ibm,ppc-interrupt-server#s",
- &plen);
- if (intserv == NULL) {
- const u32 *reg = of_get_property(np, "reg", NULL);
- if (reg == NULL)
- continue;
- if (*reg == hardid) {
- if (thread)
- *thread = 0;
- return np;
- }
- } else {
- plen /= sizeof(u32);
- for (t = 0; t < plen; t++) {
- if (hardid == intserv[t]) {
- if (thread)
- *thread = t;
- return np;
- }
- }
- }
- }
- return NULL;
-}
-EXPORT_SYMBOL(of_get_cpu_node);
-
-#if defined(CONFIG_DEBUG_FS) && defined(DEBUG)
-static struct debugfs_blob_wrapper flat_dt_blob;
-
-static int __init export_flat_device_tree(void)
-{
- struct dentry *d;
-
- flat_dt_blob.data = initial_boot_params;
- flat_dt_blob.size = initial_boot_params->totalsize;
-
- d = debugfs_create_blob("flat-device-tree", S_IFREG | S_IRUSR,
- powerpc_debugfs_root, &flat_dt_blob);
- if (!d)
- return 1;
-
- return 0;
+ return (int)phys_id == get_hard_smp_processor_id(cpu);
}
-__initcall(export_flat_device_tree);
-#endif
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 941ff4dbc56..1a85d8f9673 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -35,7 +35,6 @@
#include <asm/irq.h>
#include <asm/io.h>
#include <asm/smp.h>
-#include <asm/system.h>
#include <asm/mmu.h>
#include <asm/pgtable.h>
#include <asm/pci.h>
@@ -43,18 +42,11 @@
#include <asm/btext.h>
#include <asm/sections.h>
#include <asm/machdep.h>
+#include <asm/opal.h>
#include <linux/linux_logo.h>
/*
- * Properties whose value is longer than this get excluded from our
- * copy of the device tree. This value does need to be big enough to
- * ensure that we don't lose things like the interrupt-map property
- * on a PCI-PCI bridge.
- */
-#define MAX_PROPERTY_LENGTH (1UL * 1024 * 1024)
-
-/*
* Eventually bump that one up
*/
#define DEVTREE_CHUNK_SIZE 0x100000
@@ -74,8 +66,8 @@
* is running at whatever address it has been loaded at.
* On ppc32 we compile with -mrelocatable, which means that references
* to extern and static variables get relocated automatically.
- * On ppc64 we have to relocate the references explicitly with
- * RELOC. (Note that strings count as static variables.)
+ * ppc64 objects are always relocatable, we just need to relocate the
+ * TOC.
*
* Because OF may have mapped I/O devices into the area starting at
* KERNELBASE, particularly on CHRP machines, we can't safely call
@@ -87,13 +79,11 @@
* On ppc64, 64 bit values are truncated to 32 bits (and
* fortunately don't get interpreted as two arguments).
*/
+#define ADDR(x) (u32)(unsigned long)(x)
+
#ifdef CONFIG_PPC64
-#define RELOC(x) (*PTRRELOC(&(x)))
-#define ADDR(x) (u32) add_reloc_offset((unsigned long)(x))
#define OF_WORKAROUNDS 0
#else
-#define RELOC(x) (x)
-#define ADDR(x) (u32) (x)
#define OF_WORKAROUNDS of_workarounds
int of_workarounds;
#endif
@@ -103,7 +93,7 @@ int of_workarounds;
#define PROM_BUG() do { \
prom_printf("kernel BUG at %s line 0x%x!\n", \
- RELOC(__FILE__), __LINE__); \
+ __FILE__, __LINE__); \
__asm__ __volatile__(".long " BUG_ILLEGAL_INSTR); \
} while (0)
@@ -117,10 +107,10 @@ int of_workarounds;
typedef u32 prom_arg_t;
struct prom_args {
- u32 service;
- u32 nargs;
- u32 nret;
- prom_arg_t args[10];
+ __be32 service;
+ __be32 nargs;
+ __be32 nret;
+ __be32 args[10];
};
struct prom_t {
@@ -133,13 +123,15 @@ struct prom_t {
};
struct mem_map_entry {
- u64 base;
- u64 size;
+ __be64 base;
+ __be64 size;
};
-typedef u32 cell_t;
+typedef __be32 cell_t;
-extern void __start(unsigned long r3, unsigned long r4, unsigned long r5);
+extern void __start(unsigned long r3, unsigned long r4, unsigned long r5,
+ unsigned long r6, unsigned long r7, unsigned long r8,
+ unsigned long r9);
#ifdef CONFIG_PPC64
extern int enter_prom(struct prom_args *args, unsigned long entry);
@@ -185,6 +177,7 @@ static unsigned long __initdata prom_tce_alloc_end;
#define PLATFORM_LPAR 0x0001
#define PLATFORM_POWERMAC 0x0400
#define PLATFORM_GENERIC 0x0500
+#define PLATFORM_OPAL 0x0600
static int __initdata of_platform;
@@ -203,6 +196,8 @@ static int __initdata mem_reserve_cnt;
static cell_t __initdata regbuf[1024];
+static bool rtas_has_query_cpu_stopped;
+
/*
* Error results ... some OF calls will return "-1" on error, some
@@ -226,22 +221,22 @@ static int __init call_prom(const char *service, int nargs, int nret, ...)
struct prom_args args;
va_list list;
- args.service = ADDR(service);
- args.nargs = nargs;
- args.nret = nret;
+ args.service = cpu_to_be32(ADDR(service));
+ args.nargs = cpu_to_be32(nargs);
+ args.nret = cpu_to_be32(nret);
va_start(list, nret);
for (i = 0; i < nargs; i++)
- args.args[i] = va_arg(list, prom_arg_t);
+ args.args[i] = cpu_to_be32(va_arg(list, prom_arg_t));
va_end(list);
for (i = 0; i < nret; i++)
args.args[nargs+i] = 0;
- if (enter_prom(&args, RELOC(prom_entry)) < 0)
+ if (enter_prom(&args, prom_entry) < 0)
return PROM_ERROR;
- return (nret > 0) ? args.args[nargs] : 0;
+ return (nret > 0) ? be32_to_cpu(args.args[nargs]) : 0;
}
static int __init call_prom_ret(const char *service, int nargs, int nret,
@@ -251,46 +246,45 @@ static int __init call_prom_ret(const char *service, int nargs, int nret,
struct prom_args args;
va_list list;
- args.service = ADDR(service);
- args.nargs = nargs;
- args.nret = nret;
+ args.service = cpu_to_be32(ADDR(service));
+ args.nargs = cpu_to_be32(nargs);
+ args.nret = cpu_to_be32(nret);
va_start(list, rets);
for (i = 0; i < nargs; i++)
- args.args[i] = va_arg(list, prom_arg_t);
+ args.args[i] = cpu_to_be32(va_arg(list, prom_arg_t));
va_end(list);
for (i = 0; i < nret; i++)
args.args[nargs+i] = 0;
- if (enter_prom(&args, RELOC(prom_entry)) < 0)
+ if (enter_prom(&args, prom_entry) < 0)
return PROM_ERROR;
if (rets != NULL)
for (i = 1; i < nret; ++i)
- rets[i-1] = args.args[nargs+i];
+ rets[i-1] = be32_to_cpu(args.args[nargs+i]);
- return (nret > 0) ? args.args[nargs] : 0;
+ return (nret > 0) ? be32_to_cpu(args.args[nargs]) : 0;
}
static void __init prom_print(const char *msg)
{
const char *p, *q;
- struct prom_t *_prom = &RELOC(prom);
- if (_prom->stdout == 0)
+ if (prom.stdout == 0)
return;
for (p = msg; *p != 0; p = q) {
for (q = p; *q != 0 && *q != '\n'; ++q)
;
if (q > p)
- call_prom("write", 3, 1, _prom->stdout, p, q - p);
+ call_prom("write", 3, 1, prom.stdout, p, q - p);
if (*q == 0)
break;
++q;
- call_prom("write", 3, 1, _prom->stdout, ADDR("\r\n"), 2);
+ call_prom("write", 3, 1, prom.stdout, ADDR("\r\n"), 2);
}
}
@@ -299,7 +293,6 @@ static void __init prom_print_hex(unsigned long val)
{
int i, nibbles = sizeof(val)*2;
char buf[sizeof(val)*2+1];
- struct prom_t *_prom = &RELOC(prom);
for (i = nibbles-1; i >= 0; i--) {
buf[i] = (val & 0xf) + '0';
@@ -308,7 +301,7 @@ static void __init prom_print_hex(unsigned long val)
val >>= 4;
}
buf[nibbles] = '\0';
- call_prom("write", 3, 1, _prom->stdout, buf, nibbles);
+ call_prom("write", 3, 1, prom.stdout, buf, nibbles);
}
/* max number of decimal digits in an unsigned long */
@@ -317,7 +310,6 @@ static void __init prom_print_dec(unsigned long val)
{
int i, size;
char buf[UL_DIGITS+1];
- struct prom_t *_prom = &RELOC(prom);
for (i = UL_DIGITS-1; i >= 0; i--) {
buf[i] = (val % 10) + '0';
@@ -327,7 +319,7 @@ static void __init prom_print_dec(unsigned long val)
}
/* shift stuff down */
size = UL_DIGITS - i;
- call_prom("write", 3, 1, _prom->stdout, buf+i, size);
+ call_prom("write", 3, 1, prom.stdout, buf+i, size);
}
static void __init prom_printf(const char *format, ...)
@@ -335,22 +327,19 @@ static void __init prom_printf(const char *format, ...)
const char *p, *q, *s;
va_list args;
unsigned long v;
- struct prom_t *_prom = &RELOC(prom);
+ long vs;
va_start(args, format);
-#ifdef CONFIG_PPC64
- format = PTRRELOC(format);
-#endif
for (p = format; *p != 0; p = q) {
for (q = p; *q != 0 && *q != '\n' && *q != '%'; ++q)
;
if (q > p)
- call_prom("write", 3, 1, _prom->stdout, p, q - p);
+ call_prom("write", 3, 1, prom.stdout, p, q - p);
if (*q == 0)
break;
if (*q == '\n') {
++q;
- call_prom("write", 3, 1, _prom->stdout,
+ call_prom("write", 3, 1, prom.stdout,
ADDR("\r\n"), 2);
continue;
}
@@ -368,12 +357,35 @@ static void __init prom_printf(const char *format, ...)
v = va_arg(args, unsigned long);
prom_print_hex(v);
break;
+ case 'd':
+ ++q;
+ vs = va_arg(args, int);
+ if (vs < 0) {
+ prom_print("-");
+ vs = -vs;
+ }
+ prom_print_dec(vs);
+ break;
case 'l':
++q;
- if (*q == 'u') { /* '%lu' */
+ if (*q == 0)
+ break;
+ else if (*q == 'x') {
+ ++q;
+ v = va_arg(args, unsigned long);
+ prom_print_hex(v);
+ } else if (*q == 'u') { /* '%lu' */
++q;
v = va_arg(args, unsigned long);
prom_print_dec(v);
+ } else if (*q == 'd') { /* %ld */
+ ++q;
+ vs = va_arg(args, long);
+ if (vs < 0) {
+ prom_print("-");
+ vs = -vs;
+ }
+ prom_print_dec(vs);
}
break;
}
@@ -384,7 +396,6 @@ static void __init prom_printf(const char *format, ...)
static unsigned int __init prom_claim(unsigned long virt, unsigned long size,
unsigned long align)
{
- struct prom_t *_prom = &RELOC(prom);
if (align == 0 && (OF_WORKAROUNDS & OF_WA_CLAIM)) {
/*
@@ -395,21 +406,21 @@ static unsigned int __init prom_claim(unsigned long virt, unsigned long size,
prom_arg_t result;
ret = call_prom_ret("call-method", 5, 2, &result,
- ADDR("claim"), _prom->memory,
+ ADDR("claim"), prom.memory,
align, size, virt);
if (ret != 0 || result == -1)
return -1;
ret = call_prom_ret("call-method", 5, 2, &result,
- ADDR("claim"), _prom->mmumap,
+ ADDR("claim"), prom.mmumap,
align, size, virt);
if (ret != 0) {
call_prom("call-method", 4, 1, ADDR("release"),
- _prom->memory, size, virt);
+ prom.memory, size, virt);
return -1;
}
/* the 0x12 is M (coherence) + PP == read/write */
call_prom("call-method", 6, 1,
- ADDR("map"), _prom->mmumap, 0x12, size, virt, virt);
+ ADDR("map"), prom.mmumap, 0x12, size, virt, virt);
return virt;
}
return call_prom("claim", 3, 1, (prom_arg_t)virt, (prom_arg_t)size,
@@ -418,16 +429,13 @@ static unsigned int __init prom_claim(unsigned long virt, unsigned long size,
static void __init __attribute__((noreturn)) prom_panic(const char *reason)
{
-#ifdef CONFIG_PPC64
- reason = PTRRELOC(reason);
-#endif
prom_print(reason);
/* Do not call exit because it clears the screen on pmac
* it also causes some sort of double-fault on early pmacs */
- if (RELOC(of_platform) == PLATFORM_POWERMAC)
+ if (of_platform == PLATFORM_POWERMAC)
asm("trap\n");
- /* ToDo: should put up an SRC here on p/iSeries */
+ /* ToDo: should put up an SRC here on pSeries */
call_prom("exit", 0, 0);
for (;;) /* should never get here */
@@ -506,13 +514,13 @@ static int __init prom_setprop(phandle node, const char *nodename,
add_string(&p, tohex((u32)(unsigned long) value));
add_string(&p, tohex(valuelen));
add_string(&p, tohex(ADDR(pname)));
- add_string(&p, tohex(strlen(RELOC(pname))));
+ add_string(&p, tohex(strlen(pname)));
add_string(&p, "property");
*p = 0;
return call_prom("interpret", 1, 1, (u32)(unsigned long) cmd);
}
-/* We can't use the standard versions because of RELOC headaches. */
+/* We can't use the standard versions because of relocation headaches. */
#define isxdigit(c) (('0' <= (c) && (c) <= '9') \
|| ('a' <= (c) && (c) <= 'f') \
|| ('A' <= (c) && (c) <= 'F'))
@@ -521,7 +529,7 @@ static int __init prom_setprop(phandle node, const char *nodename,
#define islower(c) ('a' <= (c) && (c) <= 'z')
#define toupper(c) (islower(c) ? ((c) - 'a' + 'A') : (c))
-unsigned long prom_strtoul(const char *cp, const char **endp)
+static unsigned long prom_strtoul(const char *cp, const char **endp)
{
unsigned long result = 0, base = 10, value;
@@ -546,7 +554,7 @@ unsigned long prom_strtoul(const char *cp, const char **endp)
return result;
}
-unsigned long prom_memparse(const char *ptr, const char **retptr)
+static unsigned long prom_memparse(const char *ptr, const char **retptr)
{
unsigned long ret = prom_strtoul(ptr, retptr);
int shift = 0;
@@ -579,59 +587,53 @@ unsigned long prom_memparse(const char *ptr, const char **retptr)
*/
static void __init early_cmdline_parse(void)
{
- struct prom_t *_prom = &RELOC(prom);
const char *opt;
char *p;
int l = 0;
- RELOC(prom_cmd_line[0]) = 0;
- p = RELOC(prom_cmd_line);
- if ((long)_prom->chosen > 0)
- l = prom_getprop(_prom->chosen, "bootargs", p, COMMAND_LINE_SIZE-1);
+ prom_cmd_line[0] = 0;
+ p = prom_cmd_line;
+ if ((long)prom.chosen > 0)
+ l = prom_getprop(prom.chosen, "bootargs", p, COMMAND_LINE_SIZE-1);
#ifdef CONFIG_CMDLINE
if (l <= 0 || p[0] == '\0') /* dbl check */
- strlcpy(RELOC(prom_cmd_line),
- RELOC(CONFIG_CMDLINE), sizeof(prom_cmd_line));
+ strlcpy(prom_cmd_line,
+ CONFIG_CMDLINE, sizeof(prom_cmd_line));
#endif /* CONFIG_CMDLINE */
- prom_printf("command line: %s\n", RELOC(prom_cmd_line));
+ prom_printf("command line: %s\n", prom_cmd_line);
#ifdef CONFIG_PPC64
- opt = strstr(RELOC(prom_cmd_line), RELOC("iommu="));
+ opt = strstr(prom_cmd_line, "iommu=");
if (opt) {
prom_printf("iommu opt is: %s\n", opt);
opt += 6;
while (*opt && *opt == ' ')
opt++;
- if (!strncmp(opt, RELOC("off"), 3))
- RELOC(prom_iommu_off) = 1;
- else if (!strncmp(opt, RELOC("force"), 5))
- RELOC(prom_iommu_force_on) = 1;
+ if (!strncmp(opt, "off", 3))
+ prom_iommu_off = 1;
+ else if (!strncmp(opt, "force", 5))
+ prom_iommu_force_on = 1;
}
#endif
- opt = strstr(RELOC(prom_cmd_line), RELOC("mem="));
+ opt = strstr(prom_cmd_line, "mem=");
if (opt) {
opt += 4;
- RELOC(prom_memory_limit) = prom_memparse(opt, (const char **)&opt);
+ prom_memory_limit = prom_memparse(opt, (const char **)&opt);
#ifdef CONFIG_PPC64
/* Align to 16 MB == size of ppc64 large page */
- RELOC(prom_memory_limit) = ALIGN(RELOC(prom_memory_limit), 0x1000000);
+ prom_memory_limit = ALIGN(prom_memory_limit, 0x1000000);
#endif
}
}
-#ifdef CONFIG_PPC_PSERIES
+#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
/*
- * There are two methods for telling firmware what our capabilities are.
- * Newer machines have an "ibm,client-architecture-support" method on the
- * root node. For older machines, we have to call the "process-elf-header"
- * method in the /packages/elf-loader node, passing it a fake 32-bit
- * ELF header containing a couple of PT_NOTE sections that contain
- * structures that contain various information.
- */
-
-/*
- * New method - extensible architecture description vector.
+ * The architecture vector has an array of PVR mask/value pairs,
+ * followed by # option vectors - 1, followed by the option vectors.
+ *
+ * See prom.h for the definition of the bits specified in the
+ * architecture vector.
*
* Because the description vector contains a mix of byte and word
* values, we declare it as an unsigned char array, and use this
@@ -640,58 +642,13 @@ static void __init early_cmdline_parse(void)
#define W(x) ((x) >> 24) & 0xff, ((x) >> 16) & 0xff, \
((x) >> 8) & 0xff, (x) & 0xff
-/* Option vector bits - generic bits in byte 1 */
-#define OV_IGNORE 0x80 /* ignore this vector */
-#define OV_CESSATION_POLICY 0x40 /* halt if unsupported option present*/
-
-/* Option vector 1: processor architectures supported */
-#define OV1_PPC_2_00 0x80 /* set if we support PowerPC 2.00 */
-#define OV1_PPC_2_01 0x40 /* set if we support PowerPC 2.01 */
-#define OV1_PPC_2_02 0x20 /* set if we support PowerPC 2.02 */
-#define OV1_PPC_2_03 0x10 /* set if we support PowerPC 2.03 */
-#define OV1_PPC_2_04 0x08 /* set if we support PowerPC 2.04 */
-#define OV1_PPC_2_05 0x04 /* set if we support PowerPC 2.05 */
-#define OV1_PPC_2_06 0x02 /* set if we support PowerPC 2.06 */
-
-/* Option vector 2: Open Firmware options supported */
-#define OV2_REAL_MODE 0x20 /* set if we want OF in real mode */
-
-/* Option vector 3: processor options supported */
-#define OV3_FP 0x80 /* floating point */
-#define OV3_VMX 0x40 /* VMX/Altivec */
-#define OV3_DFP 0x20 /* decimal FP */
-
-/* Option vector 5: PAPR/OF options supported */
-#define OV5_LPAR 0x80 /* logical partitioning supported */
-#define OV5_SPLPAR 0x40 /* shared-processor LPAR supported */
-/* ibm,dynamic-reconfiguration-memory property supported */
-#define OV5_DRCONF_MEMORY 0x20
-#define OV5_LARGE_PAGES 0x10 /* large pages supported */
-#define OV5_DONATE_DEDICATE_CPU 0x02 /* donate dedicated CPU support */
-/* PCIe/MSI support. Without MSI full PCIe is not supported */
-#ifdef CONFIG_PCI_MSI
-#define OV5_MSI 0x01 /* PCIe/MSI support */
-#else
-#define OV5_MSI 0x00
-#endif /* CONFIG_PCI_MSI */
-#ifdef CONFIG_PPC_SMLPAR
-#define OV5_CMO 0x80 /* Cooperative Memory Overcommitment */
-#else
-#define OV5_CMO 0x00
-#endif
-#define OV5_TYPE1_AFFINITY 0x80 /* Type 1 NUMA affinity */
-
-/* Option Vector 6: IBM PAPR hints */
-#define OV6_LINUX 0x02 /* Linux is our OS */
-
-/*
- * The architecture vector has an array of PVR mask/value pairs,
- * followed by # option vectors - 1, followed by the option vectors.
- */
-static unsigned char ibm_architecture_vec[] = {
+unsigned char ibm_architecture_vec[] = {
W(0xfffe0000), W(0x003a0000), /* POWER5/POWER5+ */
W(0xffff0000), W(0x003e0000), /* POWER6 */
W(0xffff0000), W(0x003f0000), /* POWER7 */
+ W(0xffff0000), W(0x004b0000), /* POWER8E */
+ W(0xffff0000), W(0x004d0000), /* POWER8 */
+ W(0xffffffff), W(0x0f000004), /* all 2.07-compliant */
W(0xffffffff), W(0x0f000003), /* all 2.06-compliant */
W(0xffffffff), W(0x0f000002), /* all 2.05-compliant */
W(0xfffffffe), W(0x0f000001), /* all 2.04-compliant and earlier */
@@ -701,7 +658,7 @@ static unsigned char ibm_architecture_vec[] = {
3 - 2, /* length */
0, /* don't ignore, don't halt */
OV1_PPC_2_00 | OV1_PPC_2_01 | OV1_PPC_2_02 | OV1_PPC_2_03 |
- OV1_PPC_2_04 | OV1_PPC_2_05 | OV1_PPC_2_06,
+ OV1_PPC_2_04 | OV1_PPC_2_05 | OV1_PPC_2_06 | OV1_PPC_2_07,
/* option vector 2: Open Firmware options supported */
34 - 2, /* length */
@@ -712,7 +669,7 @@ static unsigned char ibm_architecture_vec[] = {
W(0xffffffff), /* virt_base */
W(0xffffffff), /* virt_size */
W(0xffffffff), /* load_base */
- W(64), /* 64MB min RMA */
+ W(256), /* 256MB min RMA */
W(0xffffffff), /* full client load */
0, /* min RMA percentage of total RAM */
48, /* max log_2(hash table size) */
@@ -723,17 +680,28 @@ static unsigned char ibm_architecture_vec[] = {
OV3_FP | OV3_VMX | OV3_DFP,
/* option vector 4: IBM PAPR implementation */
- 2 - 2, /* length */
+ 3 - 2, /* length */
0, /* don't halt */
+ OV4_MIN_ENT_CAP, /* minimum VP entitled capacity */
/* option vector 5: PAPR/OF options */
- 13 - 2, /* length */
+ 19 - 2, /* length */
0, /* don't ignore, don't halt */
- OV5_LPAR | OV5_SPLPAR | OV5_LARGE_PAGES | OV5_DRCONF_MEMORY |
- OV5_DONATE_DEDICATE_CPU | OV5_MSI,
+ OV5_FEAT(OV5_LPAR) | OV5_FEAT(OV5_SPLPAR) | OV5_FEAT(OV5_LARGE_PAGES) |
+ OV5_FEAT(OV5_DRCONF_MEMORY) | OV5_FEAT(OV5_DONATE_DEDICATE_CPU) |
+#ifdef CONFIG_PCI_MSI
+ /* PCIe/MSI support. Without MSI full PCIe is not supported */
+ OV5_FEAT(OV5_MSI),
+#else
0,
- OV5_CMO,
- OV5_TYPE1_AFFINITY,
+#endif
+ 0,
+#ifdef CONFIG_PPC_SMLPAR
+ OV5_FEAT(OV5_CMO) | OV5_FEAT(OV5_XCMO),
+#else
+ 0,
+#endif
+ OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN),
0,
0,
0,
@@ -741,9 +709,15 @@ static unsigned char ibm_architecture_vec[] = {
* must match by the macro below. Update the definition if
* the structure layout changes.
*/
-#define IBM_ARCH_VEC_NRCORES_OFFSET 100
+#define IBM_ARCH_VEC_NRCORES_OFFSET 125
W(NR_CPUS), /* number of cores supported */
-
+ 0,
+ 0,
+ 0,
+ 0,
+ OV5_FEAT(OV5_PFO_HW_RNG) | OV5_FEAT(OV5_PFO_HW_ENCR) |
+ OV5_FEAT(OV5_PFO_HW_842),
+ OV5_FEAT(OV5_SUB_PROCESSORS),
/* option vector 6: IBM PAPR hints */
4 - 2, /* length */
0,
@@ -752,7 +726,8 @@ static unsigned char ibm_architecture_vec[] = {
};
-/* Old method - ELF header with PT_NOTE sections */
+/* Old method - ELF header with PT_NOTE sections only works on BE */
+#ifdef __BIG_ENDIAN__
static struct fake_elf {
Elf32_Ehdr elfhdr;
Elf32_Phdr phdr[2];
@@ -838,6 +813,7 @@ static struct fake_elf {
}
}
};
+#endif /* __BIG_ENDIAN__ */
static int __init prom_count_smt_threads(void)
{
@@ -850,7 +826,7 @@ static int __init prom_count_smt_threads(void)
type[0] = 0;
prom_getprop(node, "device_type", type, sizeof(type));
- if (strcmp(type, RELOC("cpu")))
+ if (strcmp(type, "cpu"))
continue;
/*
* There is an entry for each smt thread, each entry being
@@ -880,9 +856,10 @@ static int __init prom_count_smt_threads(void)
static void __init prom_send_capabilities(void)
{
- ihandle elfloader, root;
+ ihandle root;
prom_arg_t ret;
- u32 *cores;
+ u32 cores;
+ unsigned char *ptcores;
root = call_prom("open", 1, 1, ADDR("/"));
if (root != 0) {
@@ -892,15 +869,30 @@ static void __init prom_send_capabilities(void)
* (we assume this is the same for all cores) and use it to
* divide NR_CPUS.
*/
- cores = (u32 *)PTRRELOC(&ibm_architecture_vec[IBM_ARCH_VEC_NRCORES_OFFSET]);
- if (*cores != NR_CPUS) {
+
+ /* The core value may start at an odd address. If such a word
+ * access is made at a cache line boundary, this leads to an
+ * exception which may not be handled at this time.
+ * Forcing a per byte access to avoid exception.
+ */
+ ptcores = &ibm_architecture_vec[IBM_ARCH_VEC_NRCORES_OFFSET];
+ cores = 0;
+ cores |= ptcores[0] << 24;
+ cores |= ptcores[1] << 16;
+ cores |= ptcores[2] << 8;
+ cores |= ptcores[3];
+ if (cores != NR_CPUS) {
prom_printf("WARNING ! "
"ibm_architecture_vec structure inconsistent: %lu!\n",
- *cores);
+ cores);
} else {
- *cores = DIV_ROUND_UP(NR_CPUS, prom_count_smt_threads());
+ cores = DIV_ROUND_UP(NR_CPUS, prom_count_smt_threads());
prom_printf("Max number of cores passed to firmware: %lu (NR_CPUS = %lu)\n",
- *cores, NR_CPUS);
+ cores, NR_CPUS);
+ ptcores[0] = (cores >> 24) & 0xff;
+ ptcores[1] = (cores >> 16) & 0xff;
+ ptcores[2] = (cores >> 8) & 0xff;
+ ptcores[3] = cores & 0xff;
}
/* try calling the ibm,client-architecture-support method */
@@ -921,17 +913,24 @@ static void __init prom_send_capabilities(void)
prom_printf(" not implemented\n");
}
- /* no ibm,client-architecture-support call, try the old way */
- elfloader = call_prom("open", 1, 1, ADDR("/packages/elf-loader"));
- if (elfloader == 0) {
- prom_printf("couldn't open /packages/elf-loader\n");
- return;
+#ifdef __BIG_ENDIAN__
+ {
+ ihandle elfloader;
+
+ /* no ibm,client-architecture-support call, try the old way */
+ elfloader = call_prom("open", 1, 1,
+ ADDR("/packages/elf-loader"));
+ if (elfloader == 0) {
+ prom_printf("couldn't open /packages/elf-loader\n");
+ return;
+ }
+ call_prom("call-method", 3, 1, ADDR("process-elf-header"),
+ elfloader, ADDR(&fake_elf));
+ call_prom("close", 1, 0, elfloader);
}
- call_prom("call-method", 3, 1, ADDR("process-elf-header"),
- elfloader, ADDR(&fake_elf));
- call_prom("close", 1, 0, elfloader);
+#endif /* __BIG_ENDIAN__ */
}
-#endif
+#endif /* #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
/*
* Memory allocation strategy... our layout is normally:
@@ -968,21 +967,21 @@ static void __init prom_send_capabilities(void)
*/
static unsigned long __init alloc_up(unsigned long size, unsigned long align)
{
- unsigned long base = RELOC(alloc_bottom);
+ unsigned long base = alloc_bottom;
unsigned long addr = 0;
if (align)
base = _ALIGN_UP(base, align);
prom_debug("alloc_up(%x, %x)\n", size, align);
- if (RELOC(ram_top) == 0)
+ if (ram_top == 0)
prom_panic("alloc_up() called with mem not initialized\n");
if (align)
- base = _ALIGN_UP(RELOC(alloc_bottom), align);
+ base = _ALIGN_UP(alloc_bottom, align);
else
- base = RELOC(alloc_bottom);
+ base = alloc_bottom;
- for(; (base + size) <= RELOC(alloc_top);
+ for(; (base + size) <= alloc_top;
base = _ALIGN_UP(base + 0x100000, align)) {
prom_debug(" trying: 0x%x\n\r", base);
addr = (unsigned long)prom_claim(base, size, 0);
@@ -994,14 +993,14 @@ static unsigned long __init alloc_up(unsigned long size, unsigned long align)
}
if (addr == 0)
return 0;
- RELOC(alloc_bottom) = addr;
+ alloc_bottom = addr + size;
prom_debug(" -> %x\n", addr);
- prom_debug(" alloc_bottom : %x\n", RELOC(alloc_bottom));
- prom_debug(" alloc_top : %x\n", RELOC(alloc_top));
- prom_debug(" alloc_top_hi : %x\n", RELOC(alloc_top_high));
- prom_debug(" rmo_top : %x\n", RELOC(rmo_top));
- prom_debug(" ram_top : %x\n", RELOC(ram_top));
+ prom_debug(" alloc_bottom : %x\n", alloc_bottom);
+ prom_debug(" alloc_top : %x\n", alloc_top);
+ prom_debug(" alloc_top_hi : %x\n", alloc_top_high);
+ prom_debug(" rmo_top : %x\n", rmo_top);
+ prom_debug(" ram_top : %x\n", ram_top);
return addr;
}
@@ -1017,32 +1016,32 @@ static unsigned long __init alloc_down(unsigned long size, unsigned long align,
unsigned long base, addr = 0;
prom_debug("alloc_down(%x, %x, %s)\n", size, align,
- highmem ? RELOC("(high)") : RELOC("(low)"));
- if (RELOC(ram_top) == 0)
+ highmem ? "(high)" : "(low)");
+ if (ram_top == 0)
prom_panic("alloc_down() called with mem not initialized\n");
if (highmem) {
/* Carve out storage for the TCE table. */
- addr = _ALIGN_DOWN(RELOC(alloc_top_high) - size, align);
- if (addr <= RELOC(alloc_bottom))
+ addr = _ALIGN_DOWN(alloc_top_high - size, align);
+ if (addr <= alloc_bottom)
return 0;
/* Will we bump into the RMO ? If yes, check out that we
* didn't overlap existing allocations there, if we did,
* we are dead, we must be the first in town !
*/
- if (addr < RELOC(rmo_top)) {
+ if (addr < rmo_top) {
/* Good, we are first */
- if (RELOC(alloc_top) == RELOC(rmo_top))
- RELOC(alloc_top) = RELOC(rmo_top) = addr;
+ if (alloc_top == rmo_top)
+ alloc_top = rmo_top = addr;
else
return 0;
}
- RELOC(alloc_top_high) = addr;
+ alloc_top_high = addr;
goto bail;
}
- base = _ALIGN_DOWN(RELOC(alloc_top) - size, align);
- for (; base > RELOC(alloc_bottom);
+ base = _ALIGN_DOWN(alloc_top - size, align);
+ for (; base > alloc_bottom;
base = _ALIGN_DOWN(base - 0x100000, align)) {
prom_debug(" trying: 0x%x\n\r", base);
addr = (unsigned long)prom_claim(base, size, 0);
@@ -1052,15 +1051,15 @@ static unsigned long __init alloc_down(unsigned long size, unsigned long align,
}
if (addr == 0)
return 0;
- RELOC(alloc_top) = addr;
+ alloc_top = addr;
bail:
prom_debug(" -> %x\n", addr);
- prom_debug(" alloc_bottom : %x\n", RELOC(alloc_bottom));
- prom_debug(" alloc_top : %x\n", RELOC(alloc_top));
- prom_debug(" alloc_top_hi : %x\n", RELOC(alloc_top_high));
- prom_debug(" rmo_top : %x\n", RELOC(rmo_top));
- prom_debug(" ram_top : %x\n", RELOC(ram_top));
+ prom_debug(" alloc_bottom : %x\n", alloc_bottom);
+ prom_debug(" alloc_top : %x\n", alloc_top);
+ prom_debug(" alloc_top_hi : %x\n", alloc_top_high);
+ prom_debug(" rmo_top : %x\n", rmo_top);
+ prom_debug(" ram_top : %x\n", ram_top);
return addr;
}
@@ -1078,11 +1077,11 @@ static unsigned long __init prom_next_cell(int s, cell_t **cellp)
p++;
s--;
}
- r = *p++;
+ r = be32_to_cpu(*p++);
#ifdef CONFIG_PPC64
if (s > 1) {
r <<= 32;
- r |= *(p++);
+ r |= be32_to_cpu(*(p++));
}
#endif
*cellp = p;
@@ -1100,7 +1099,7 @@ static unsigned long __init prom_next_cell(int s, cell_t **cellp)
static void __init reserve_mem(u64 base, u64 size)
{
u64 top = base + size;
- unsigned long cnt = RELOC(mem_reserve_cnt);
+ unsigned long cnt = mem_reserve_cnt;
if (size == 0)
return;
@@ -1115,9 +1114,9 @@ static void __init reserve_mem(u64 base, u64 size)
if (cnt >= (MEM_RESERVE_MAP_SIZE - 1))
prom_panic("Memory reserve map exhausted !\n");
- RELOC(mem_reserve_map)[cnt].base = base;
- RELOC(mem_reserve_map)[cnt].size = size;
- RELOC(mem_reserve_cnt) = cnt + 1;
+ mem_reserve_map[cnt].base = cpu_to_be64(base);
+ mem_reserve_map[cnt].size = cpu_to_be64(size);
+ mem_reserve_cnt = cnt + 1;
}
/*
@@ -1130,7 +1129,7 @@ static void __init prom_init_mem(void)
char *path, type[64];
unsigned int plen;
cell_t *p, *endp;
- struct prom_t *_prom = &RELOC(prom);
+ __be32 val;
u32 rac, rsc;
/*
@@ -1138,15 +1137,17 @@ static void __init prom_init_mem(void)
* 1) top of RMO (first node)
* 2) top of memory
*/
- rac = 2;
- prom_getprop(_prom->root, "#address-cells", &rac, sizeof(rac));
- rsc = 1;
- prom_getprop(_prom->root, "#size-cells", &rsc, sizeof(rsc));
- prom_debug("root_addr_cells: %x\n", (unsigned long) rac);
- prom_debug("root_size_cells: %x\n", (unsigned long) rsc);
+ val = cpu_to_be32(2);
+ prom_getprop(prom.root, "#address-cells", &val, sizeof(val));
+ rac = be32_to_cpu(val);
+ val = cpu_to_be32(1);
+ prom_getprop(prom.root, "#size-cells", &val, sizeof(rsc));
+ rsc = be32_to_cpu(val);
+ prom_debug("root_addr_cells: %x\n", rac);
+ prom_debug("root_size_cells: %x\n", rsc);
prom_debug("scanning memory:\n");
- path = RELOC(prom_scratch);
+ path = prom_scratch;
for (node = 0; prom_next_node(&node); ) {
type[0] = 0;
@@ -1159,15 +1160,15 @@ static void __init prom_init_mem(void)
*/
prom_getprop(node, "name", type, sizeof(type));
}
- if (strcmp(type, RELOC("memory")))
+ if (strcmp(type, "memory"))
continue;
- plen = prom_getprop(node, "reg", RELOC(regbuf), sizeof(regbuf));
+ plen = prom_getprop(node, "reg", regbuf, sizeof(regbuf));
if (plen > sizeof(regbuf)) {
prom_printf("memory node too large for buffer !\n");
plen = sizeof(regbuf);
}
- p = RELOC(regbuf);
+ p = regbuf;
endp = p + (plen / sizeof(cell_t));
#ifdef DEBUG_PROM
@@ -1185,22 +1186,14 @@ static void __init prom_init_mem(void)
if (size == 0)
continue;
prom_debug(" %x %x\n", base, size);
- if (base == 0 && (RELOC(of_platform) & PLATFORM_LPAR))
- RELOC(rmo_top) = size;
- if ((base + size) > RELOC(ram_top))
- RELOC(ram_top) = base + size;
+ if (base == 0 && (of_platform & PLATFORM_LPAR))
+ rmo_top = size;
+ if ((base + size) > ram_top)
+ ram_top = base + size;
}
}
- RELOC(alloc_bottom) = PAGE_ALIGN((unsigned long)&RELOC(_end) + 0x4000);
-
- /* Check if we have an initrd after the kernel, if we do move our bottom
- * point to after it
- */
- if (RELOC(prom_initrd_start)) {
- if (RELOC(prom_initrd_end) > RELOC(alloc_bottom))
- RELOC(alloc_bottom) = PAGE_ALIGN(RELOC(prom_initrd_end));
- }
+ alloc_bottom = PAGE_ALIGN((unsigned long)&_end + 0x4000);
/*
* If prom_memory_limit is set we reduce the upper limits *except* for
@@ -1208,20 +1201,20 @@ static void __init prom_init_mem(void)
* TCE's up there.
*/
- RELOC(alloc_top_high) = RELOC(ram_top);
+ alloc_top_high = ram_top;
- if (RELOC(prom_memory_limit)) {
- if (RELOC(prom_memory_limit) <= RELOC(alloc_bottom)) {
+ if (prom_memory_limit) {
+ if (prom_memory_limit <= alloc_bottom) {
prom_printf("Ignoring mem=%x <= alloc_bottom.\n",
- RELOC(prom_memory_limit));
- RELOC(prom_memory_limit) = 0;
- } else if (RELOC(prom_memory_limit) >= RELOC(ram_top)) {
+ prom_memory_limit);
+ prom_memory_limit = 0;
+ } else if (prom_memory_limit >= ram_top) {
prom_printf("Ignoring mem=%x >= ram_top.\n",
- RELOC(prom_memory_limit));
- RELOC(prom_memory_limit) = 0;
+ prom_memory_limit);
+ prom_memory_limit = 0;
} else {
- RELOC(ram_top) = RELOC(prom_memory_limit);
- RELOC(rmo_top) = min(RELOC(rmo_top), RELOC(prom_memory_limit));
+ ram_top = prom_memory_limit;
+ rmo_top = min(rmo_top, prom_memory_limit);
}
}
@@ -1233,21 +1226,122 @@ static void __init prom_init_mem(void)
* Since 768MB is plenty of room, and we need to cap to something
* reasonable on 32-bit, cap at 768MB on all machines.
*/
- if (!RELOC(rmo_top))
- RELOC(rmo_top) = RELOC(ram_top);
- RELOC(rmo_top) = min(0x30000000ul, RELOC(rmo_top));
- RELOC(alloc_top) = RELOC(rmo_top);
- RELOC(alloc_top_high) = RELOC(ram_top);
+ if (!rmo_top)
+ rmo_top = ram_top;
+ rmo_top = min(0x30000000ul, rmo_top);
+ alloc_top = rmo_top;
+ alloc_top_high = ram_top;
+
+ /*
+ * Check if we have an initrd after the kernel but still inside
+ * the RMO. If we do move our bottom point to after it.
+ */
+ if (prom_initrd_start &&
+ prom_initrd_start < rmo_top &&
+ prom_initrd_end > alloc_bottom)
+ alloc_bottom = PAGE_ALIGN(prom_initrd_end);
prom_printf("memory layout at init:\n");
- prom_printf(" memory_limit : %x (16 MB aligned)\n", RELOC(prom_memory_limit));
- prom_printf(" alloc_bottom : %x\n", RELOC(alloc_bottom));
- prom_printf(" alloc_top : %x\n", RELOC(alloc_top));
- prom_printf(" alloc_top_hi : %x\n", RELOC(alloc_top_high));
- prom_printf(" rmo_top : %x\n", RELOC(rmo_top));
- prom_printf(" ram_top : %x\n", RELOC(ram_top));
+ prom_printf(" memory_limit : %x (16 MB aligned)\n", prom_memory_limit);
+ prom_printf(" alloc_bottom : %x\n", alloc_bottom);
+ prom_printf(" alloc_top : %x\n", alloc_top);
+ prom_printf(" alloc_top_hi : %x\n", alloc_top_high);
+ prom_printf(" rmo_top : %x\n", rmo_top);
+ prom_printf(" ram_top : %x\n", ram_top);
+}
+
+static void __init prom_close_stdin(void)
+{
+ __be32 val;
+ ihandle stdin;
+
+ if (prom_getprop(prom.chosen, "stdin", &val, sizeof(val)) > 0) {
+ stdin = be32_to_cpu(val);
+ call_prom("close", 1, 0, stdin);
+ }
}
+#ifdef CONFIG_PPC_POWERNV
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL
+static u64 __initdata prom_opal_base;
+static u64 __initdata prom_opal_entry;
+#endif
+
+/*
+ * Allocate room for and instantiate OPAL
+ */
+static void __init prom_instantiate_opal(void)
+{
+ phandle opal_node;
+ ihandle opal_inst;
+ u64 base, entry;
+ u64 size = 0, align = 0x10000;
+ __be64 val64;
+ u32 rets[2];
+
+ prom_debug("prom_instantiate_opal: start...\n");
+
+ opal_node = call_prom("finddevice", 1, 1, ADDR("/ibm,opal"));
+ prom_debug("opal_node: %x\n", opal_node);
+ if (!PHANDLE_VALID(opal_node))
+ return;
+
+ val64 = 0;
+ prom_getprop(opal_node, "opal-runtime-size", &val64, sizeof(val64));
+ size = be64_to_cpu(val64);
+ if (size == 0)
+ return;
+ val64 = 0;
+ prom_getprop(opal_node, "opal-runtime-alignment", &val64,sizeof(val64));
+ align = be64_to_cpu(val64);
+
+ base = alloc_down(size, align, 0);
+ if (base == 0) {
+ prom_printf("OPAL allocation failed !\n");
+ return;
+ }
+
+ opal_inst = call_prom("open", 1, 1, ADDR("/ibm,opal"));
+ if (!IHANDLE_VALID(opal_inst)) {
+ prom_printf("opening opal package failed (%x)\n", opal_inst);
+ return;
+ }
+
+ prom_printf("instantiating opal at 0x%x...", base);
+
+ if (call_prom_ret("call-method", 4, 3, rets,
+ ADDR("load-opal-runtime"),
+ opal_inst,
+ base >> 32, base & 0xffffffff) != 0
+ || (rets[0] == 0 && rets[1] == 0)) {
+ prom_printf(" failed\n");
+ return;
+ }
+ entry = (((u64)rets[0]) << 32) | rets[1];
+
+ prom_printf(" done\n");
+
+ reserve_mem(base, size);
+
+ prom_debug("opal base = 0x%x\n", base);
+ prom_debug("opal align = 0x%x\n", align);
+ prom_debug("opal entry = 0x%x\n", entry);
+ prom_debug("opal size = 0x%x\n", (long)size);
+
+ prom_setprop(opal_node, "/ibm,opal", "opal-base-address",
+ &base, sizeof(base));
+ prom_setprop(opal_node, "/ibm,opal", "opal-entry-address",
+ &entry, sizeof(entry));
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL
+ prom_opal_base = base;
+ prom_opal_entry = entry;
+#endif
+ prom_debug("prom_instantiate_opal: end...\n");
+}
+
+#endif /* CONFIG_PPC_POWERNV */
/*
* Allocate room for and instantiate RTAS
@@ -1257,6 +1351,7 @@ static void __init prom_instantiate_rtas(void)
phandle rtas_node;
ihandle rtas_inst;
u32 base, entry = 0;
+ __be32 val;
u32 size = 0;
prom_debug("prom_instantiate_rtas: start...\n");
@@ -1266,15 +1361,15 @@ static void __init prom_instantiate_rtas(void)
if (!PHANDLE_VALID(rtas_node))
return;
- prom_getprop(rtas_node, "rtas-size", &size, sizeof(size));
+ val = 0;
+ prom_getprop(rtas_node, "rtas-size", &val, sizeof(size));
+ size = be32_to_cpu(val);
if (size == 0)
return;
base = alloc_down(size, PAGE_SIZE, 0);
- if (base == 0) {
- prom_printf("RTAS allocation failed !\n");
- return;
- }
+ if (base == 0)
+ prom_panic("Could not allocate memory for RTAS\n");
rtas_inst = call_prom("open", 1, 1, ADDR("/rtas"));
if (!IHANDLE_VALID(rtas_inst)) {
@@ -1295,10 +1390,17 @@ static void __init prom_instantiate_rtas(void)
reserve_mem(base, size);
+ val = cpu_to_be32(base);
prom_setprop(rtas_node, "/rtas", "linux,rtas-base",
- &base, sizeof(base));
+ &val, sizeof(val));
+ val = cpu_to_be32(entry);
prom_setprop(rtas_node, "/rtas", "linux,rtas-entry",
- &entry, sizeof(entry));
+ &val, sizeof(val));
+
+ /* Check if it supports "query-cpu-stopped-state" */
+ if (prom_getprop(rtas_node, "query-cpu-stopped-state",
+ &val, sizeof(val)) != PROM_ERROR)
+ rtas_has_query_cpu_stopped = true;
prom_debug("rtas base = 0x%x\n", base);
prom_debug("rtas entry = 0x%x\n", entry);
@@ -1309,27 +1411,85 @@ static void __init prom_instantiate_rtas(void)
#ifdef CONFIG_PPC64
/*
+ * Allocate room for and instantiate Stored Measurement Log (SML)
+ */
+static void __init prom_instantiate_sml(void)
+{
+ phandle ibmvtpm_node;
+ ihandle ibmvtpm_inst;
+ u32 entry = 0, size = 0;
+ u64 base;
+
+ prom_debug("prom_instantiate_sml: start...\n");
+
+ ibmvtpm_node = call_prom("finddevice", 1, 1, ADDR("/ibm,vtpm"));
+ prom_debug("ibmvtpm_node: %x\n", ibmvtpm_node);
+ if (!PHANDLE_VALID(ibmvtpm_node))
+ return;
+
+ ibmvtpm_inst = call_prom("open", 1, 1, ADDR("/ibm,vtpm"));
+ if (!IHANDLE_VALID(ibmvtpm_inst)) {
+ prom_printf("opening vtpm package failed (%x)\n", ibmvtpm_inst);
+ return;
+ }
+
+ if (call_prom_ret("call-method", 2, 2, &size,
+ ADDR("sml-get-handover-size"),
+ ibmvtpm_inst) != 0 || size == 0) {
+ prom_printf("SML get handover size failed\n");
+ return;
+ }
+
+ base = alloc_down(size, PAGE_SIZE, 0);
+ if (base == 0)
+ prom_panic("Could not allocate memory for sml\n");
+
+ prom_printf("instantiating sml at 0x%x...", base);
+
+ if (call_prom_ret("call-method", 4, 2, &entry,
+ ADDR("sml-handover"),
+ ibmvtpm_inst, size, base) != 0 || entry == 0) {
+ prom_printf("SML handover failed\n");
+ return;
+ }
+ prom_printf(" done\n");
+
+ reserve_mem(base, size);
+
+ prom_setprop(ibmvtpm_node, "/ibm,vtpm", "linux,sml-base",
+ &base, sizeof(base));
+ prom_setprop(ibmvtpm_node, "/ibm,vtpm", "linux,sml-size",
+ &size, sizeof(size));
+
+ prom_debug("sml base = 0x%x\n", base);
+ prom_debug("sml size = 0x%x\n", (long)size);
+
+ prom_debug("prom_instantiate_sml: end...\n");
+}
+
+/*
* Allocate room for and initialize TCE tables
*/
+#ifdef __BIG_ENDIAN__
static void __init prom_initialize_tce_table(void)
{
phandle node;
ihandle phb_node;
char compatible[64], type[64], model[64];
- char *path = RELOC(prom_scratch);
+ char *path = prom_scratch;
u64 base, align;
u32 minalign, minsize;
u64 tce_entry, *tce_entryp;
u64 local_alloc_top, local_alloc_bottom;
u64 i;
- if (RELOC(prom_iommu_off))
+ if (prom_iommu_off)
return;
prom_debug("starting prom_initialize_tce_table\n");
/* Cache current top of allocs so we reserve a single block */
- local_alloc_top = RELOC(alloc_top_high);
+ local_alloc_top = alloc_top_high;
local_alloc_bottom = local_alloc_top;
/* Search all nodes looking for PHBs. */
@@ -1342,19 +1502,19 @@ static void __init prom_initialize_tce_table(void)
prom_getprop(node, "device_type", type, sizeof(type));
prom_getprop(node, "model", model, sizeof(model));
- if ((type[0] == 0) || (strstr(type, RELOC("pci")) == NULL))
+ if ((type[0] == 0) || (strstr(type, "pci") == NULL))
continue;
/* Keep the old logic intact to avoid regression. */
if (compatible[0] != 0) {
- if ((strstr(compatible, RELOC("python")) == NULL) &&
- (strstr(compatible, RELOC("Speedwagon")) == NULL) &&
- (strstr(compatible, RELOC("Winnipeg")) == NULL))
+ if ((strstr(compatible, "python") == NULL) &&
+ (strstr(compatible, "Speedwagon") == NULL) &&
+ (strstr(compatible, "Winnipeg") == NULL))
continue;
} else if (model[0] != 0) {
- if ((strstr(model, RELOC("ython")) == NULL) &&
- (strstr(model, RELOC("peedwagon")) == NULL) &&
- (strstr(model, RELOC("innipeg")) == NULL))
+ if ((strstr(model, "ython") == NULL) &&
+ (strstr(model, "peedwagon") == NULL) &&
+ (strstr(model, "innipeg") == NULL))
continue;
}
@@ -1376,7 +1536,7 @@ static void __init prom_initialize_tce_table(void)
* else will impact performance, so we always allocate 8MB.
* Anton
*/
- if (__is_processor(PV_POWER4) || __is_processor(PV_POWER4p))
+ if (pvr_version_is(PVR_POWER4) || pvr_version_is(PVR_POWER4p))
minsize = 8UL << 20;
else
minsize = 4UL << 20;
@@ -1433,13 +1593,14 @@ static void __init prom_initialize_tce_table(void)
/* These are only really needed if there is a memory limit in
* effect, but we don't know so export them always. */
- RELOC(prom_tce_alloc_start) = local_alloc_bottom;
- RELOC(prom_tce_alloc_end) = local_alloc_top;
+ prom_tce_alloc_start = local_alloc_bottom;
+ prom_tce_alloc_end = local_alloc_top;
/* Flag the first invalid entry */
prom_debug("ending prom_initialize_tce_table\n");
}
-#endif
+#endif /* __BIG_ENDIAN__ */
+#endif /* CONFIG_PPC64 */
/*
* With CHRP SMP we need to use the OF to start the other processors.
@@ -1468,16 +1629,26 @@ static void __init prom_initialize_tce_table(void)
static void __init prom_hold_cpus(void)
{
unsigned long i;
- unsigned int reg;
phandle node;
char type[64];
- struct prom_t *_prom = &RELOC(prom);
unsigned long *spinloop
= (void *) LOW_ADDR(__secondary_hold_spinloop);
unsigned long *acknowledge
= (void *) LOW_ADDR(__secondary_hold_acknowledge);
unsigned long secondary_hold = LOW_ADDR(__secondary_hold);
+ /*
+ * On pseries, if RTAS supports "query-cpu-stopped-state",
+ * we skip this stage, the CPUs will be started by the
+ * kernel using RTAS.
+ */
+ if ((of_platform == PLATFORM_PSERIES ||
+ of_platform == PLATFORM_PSERIES_LPAR) &&
+ rtas_has_query_cpu_stopped) {
+ prom_printf("prom_hold_cpus: skipped\n");
+ return;
+ }
+
prom_debug("prom_hold_cpus: start...\n");
prom_debug(" 1) spinloop = 0x%x\n", (unsigned long)spinloop);
prom_debug(" 1) *spinloop = 0x%x\n", *spinloop);
@@ -1495,20 +1666,24 @@ static void __init prom_hold_cpus(void)
/* look for cpus */
for (node = 0; prom_next_node(&node); ) {
+ unsigned int cpu_no;
+ __be32 reg;
+
type[0] = 0;
prom_getprop(node, "device_type", type, sizeof(type));
- if (strcmp(type, RELOC("cpu")) != 0)
+ if (strcmp(type, "cpu") != 0)
continue;
/* Skip non-configured cpus. */
if (prom_getprop(node, "status", type, sizeof(type)) > 0)
- if (strcmp(type, RELOC("okay")) != 0)
+ if (strcmp(type, "okay") != 0)
continue;
- reg = -1;
+ reg = cpu_to_be32(-1); /* make sparse happy */
prom_getprop(node, "reg", &reg, sizeof(reg));
+ cpu_no = be32_to_cpu(reg);
- prom_debug("cpu hw idx = %lu\n", reg);
+ prom_debug("cpu hw idx = %lu\n", cpu_no);
/* Init the acknowledge var which will be reset by
* the secondary cpu when it awakens from its OF
@@ -1516,24 +1691,24 @@ static void __init prom_hold_cpus(void)
*/
*acknowledge = (unsigned long)-1;
- if (reg != _prom->cpu) {
- /* Primary Thread of non-boot cpu */
- prom_printf("starting cpu hw idx %lu... ", reg);
+ if (cpu_no != prom.cpu) {
+ /* Primary Thread of non-boot cpu or any thread */
+ prom_printf("starting cpu hw idx %lu... ", cpu_no);
call_prom("start-cpu", 3, 0, node,
- secondary_hold, reg);
+ secondary_hold, cpu_no);
for (i = 0; (i < 100000000) &&
(*acknowledge == ((unsigned long)-1)); i++ )
mb();
- if (*acknowledge == reg)
+ if (*acknowledge == cpu_no)
prom_printf("done\n");
else
prom_printf("failed: %x\n", *acknowledge);
}
#ifdef CONFIG_SMP
else
- prom_printf("boot cpu hw idx %lu\n", reg);
+ prom_printf("boot cpu hw idx %lu\n", cpu_no);
#endif /* CONFIG_SMP */
}
@@ -1543,22 +1718,20 @@ static void __init prom_hold_cpus(void)
static void __init prom_init_client_services(unsigned long pp)
{
- struct prom_t *_prom = &RELOC(prom);
-
/* Get a handle to the prom entry point before anything else */
- RELOC(prom_entry) = pp;
+ prom_entry = pp;
/* get a handle for the stdout device */
- _prom->chosen = call_prom("finddevice", 1, 1, ADDR("/chosen"));
- if (!PHANDLE_VALID(_prom->chosen))
+ prom.chosen = call_prom("finddevice", 1, 1, ADDR("/chosen"));
+ if (!PHANDLE_VALID(prom.chosen))
prom_panic("cannot find chosen"); /* msg won't be printed :( */
/* get device tree root */
- _prom->root = call_prom("finddevice", 1, 1, ADDR("/"));
- if (!PHANDLE_VALID(_prom->root))
+ prom.root = call_prom("finddevice", 1, 1, ADDR("/"));
+ if (!PHANDLE_VALID(prom.root))
prom_panic("cannot find device tree root"); /* msg won't be printed :( */
- _prom->mmumap = 0;
+ prom.mmumap = 0;
}
#ifdef CONFIG_PPC32
@@ -1569,7 +1742,6 @@ static void __init prom_init_client_services(unsigned long pp)
*/
static void __init prom_find_mmu(void)
{
- struct prom_t *_prom = &RELOC(prom);
phandle oprom;
char version[64];
@@ -1587,10 +1759,11 @@ static void __init prom_find_mmu(void)
call_prom("interpret", 1, 1, "dev /memory 0 to allow-reclaim");
} else
return;
- _prom->memory = call_prom("open", 1, 1, ADDR("/memory"));
- prom_getprop(_prom->chosen, "mmu", &_prom->mmumap,
- sizeof(_prom->mmumap));
- if (!IHANDLE_VALID(_prom->memory) || !IHANDLE_VALID(_prom->mmumap))
+ prom.memory = call_prom("open", 1, 1, ADDR("/memory"));
+ prom_getprop(prom.chosen, "mmu", &prom.mmumap,
+ sizeof(prom.mmumap));
+ prom.mmumap = be32_to_cpu(prom.mmumap);
+ if (!IHANDLE_VALID(prom.memory) || !IHANDLE_VALID(prom.mmumap))
of_workarounds &= ~OF_WA_CLAIM; /* hmmm */
}
#else
@@ -1599,45 +1772,40 @@ static void __init prom_find_mmu(void)
static void __init prom_init_stdout(void)
{
- struct prom_t *_prom = &RELOC(prom);
- char *path = RELOC(of_stdout_device);
+ char *path = of_stdout_device;
char type[16];
- u32 val;
+ phandle stdout_node;
+ __be32 val;
- if (prom_getprop(_prom->chosen, "stdout", &val, sizeof(val)) <= 0)
+ if (prom_getprop(prom.chosen, "stdout", &val, sizeof(val)) <= 0)
prom_panic("cannot find stdout");
- _prom->stdout = val;
+ prom.stdout = be32_to_cpu(val);
/* Get the full OF pathname of the stdout device */
memset(path, 0, 256);
- call_prom("instance-to-path", 3, 1, _prom->stdout, path, 255);
- val = call_prom("instance-to-package", 1, 1, _prom->stdout);
- prom_setprop(_prom->chosen, "/chosen", "linux,stdout-package",
- &val, sizeof(val));
- prom_printf("OF stdout device is: %s\n", RELOC(of_stdout_device));
- prom_setprop(_prom->chosen, "/chosen", "linux,stdout-path",
+ call_prom("instance-to-path", 3, 1, prom.stdout, path, 255);
+ prom_printf("OF stdout device is: %s\n", of_stdout_device);
+ prom_setprop(prom.chosen, "/chosen", "linux,stdout-path",
path, strlen(path) + 1);
- /* If it's a display, note it */
- memset(type, 0, sizeof(type));
- prom_getprop(val, "device_type", type, sizeof(type));
- if (strcmp(type, RELOC("display")) == 0)
- prom_setprop(val, path, "linux,boot-display", NULL, 0);
-}
-
-static void __init prom_close_stdin(void)
-{
- struct prom_t *_prom = &RELOC(prom);
- ihandle val;
+ /* instance-to-package fails on PA-Semi */
+ stdout_node = call_prom("instance-to-package", 1, 1, prom.stdout);
+ if (stdout_node != PROM_ERROR) {
+ val = cpu_to_be32(stdout_node);
+ prom_setprop(prom.chosen, "/chosen", "linux,stdout-package",
+ &val, sizeof(val));
- if (prom_getprop(_prom->chosen, "stdin", &val, sizeof(val)) > 0)
- call_prom("close", 1, 0, val);
+ /* If it's a display, note it */
+ memset(type, 0, sizeof(type));
+ prom_getprop(stdout_node, "device_type", type, sizeof(type));
+ if (strcmp(type, "display") == 0)
+ prom_setprop(stdout_node, path, "linux,boot-display", NULL, 0);
+ }
}
static int __init prom_find_machine_type(void)
{
- struct prom_t *_prom = &RELOC(prom);
char compat[256];
int len, i = 0;
#ifdef CONFIG_PPC64
@@ -1645,8 +1813,8 @@ static int __init prom_find_machine_type(void)
int x;
#endif
- /* Look for a PowerMac */
- len = prom_getprop(_prom->root, "compatible",
+ /* Look for a PowerMac or a Cell */
+ len = prom_getprop(prom.root, "compatible",
compat, sizeof(compat)-1);
if (len > 0) {
compat[len] = 0;
@@ -1655,33 +1823,37 @@ static int __init prom_find_machine_type(void)
int sl = strlen(p);
if (sl == 0)
break;
- if (strstr(p, RELOC("Power Macintosh")) ||
- strstr(p, RELOC("MacRISC")))
+ if (strstr(p, "Power Macintosh") ||
+ strstr(p, "MacRISC"))
return PLATFORM_POWERMAC;
#ifdef CONFIG_PPC64
/* We must make sure we don't detect the IBM Cell
* blades as pSeries due to some firmware issues,
* so we do it here.
*/
- if (strstr(p, RELOC("IBM,CBEA")) ||
- strstr(p, RELOC("IBM,CPBW-1.0")))
+ if (strstr(p, "IBM,CBEA") ||
+ strstr(p, "IBM,CPBW-1.0"))
return PLATFORM_GENERIC;
#endif /* CONFIG_PPC64 */
i += sl + 1;
}
}
#ifdef CONFIG_PPC64
- /* If not a mac, try to figure out if it's an IBM pSeries or any other
+ /* Try to detect OPAL */
+ if (PHANDLE_VALID(call_prom("finddevice", 1, 1, ADDR("/ibm,opal"))))
+ return PLATFORM_OPAL;
+
+ /* Try to figure out if it's an IBM pSeries or any other
* PAPR compliant platform. We assume it is if :
* - /device_type is "chrp" (please, do NOT use that for future
* non-IBM designs !
* - it has /rtas
*/
- len = prom_getprop(_prom->root, "device_type",
+ len = prom_getprop(prom.root, "device_type",
compat, sizeof(compat)-1);
if (len <= 0)
return PLATFORM_GENERIC;
- if (strcmp(compat, RELOC("chrp")))
+ if (strcmp(compat, "chrp"))
return PLATFORM_GENERIC;
/* Default to pSeries. We need to know if we are running LPAR */
@@ -1743,11 +1915,11 @@ static void __init prom_check_displays(void)
for (node = 0; prom_next_node(&node); ) {
memset(type, 0, sizeof(type));
prom_getprop(node, "device_type", type, sizeof(type));
- if (strcmp(type, RELOC("display")) != 0)
+ if (strcmp(type, "display") != 0)
continue;
/* It seems OF doesn't null-terminate the path :-( */
- path = RELOC(prom_scratch);
+ path = prom_scratch;
memset(path, 0, PROM_SCRATCH_SIZE);
/*
@@ -1771,19 +1943,35 @@ static void __init prom_check_displays(void)
/* Setup a usable color table when the appropriate
* method is available. Should update this to set-colors */
- clut = RELOC(default_colors);
- for (i = 0; i < 32; i++, clut += 3)
+ clut = default_colors;
+ for (i = 0; i < 16; i++, clut += 3)
if (prom_set_color(ih, i, clut[0], clut[1],
clut[2]) != 0)
break;
#ifdef CONFIG_LOGO_LINUX_CLUT224
- clut = PTRRELOC(RELOC(logo_linux_clut224.clut));
- for (i = 0; i < RELOC(logo_linux_clut224.clutsize); i++, clut += 3)
+ clut = PTRRELOC(logo_linux_clut224.clut);
+ for (i = 0; i < logo_linux_clut224.clutsize; i++, clut += 3)
if (prom_set_color(ih, i + 32, clut[0], clut[1],
clut[2]) != 0)
break;
#endif /* CONFIG_LOGO_LINUX_CLUT224 */
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX
+ if (prom_getprop(node, "linux,boot-display", NULL, 0) !=
+ PROM_ERROR) {
+ u32 width, height, pitch, addr;
+
+ prom_printf("Setting btext !\n");
+ prom_getprop(node, "width", &width, 4);
+ prom_getprop(node, "height", &height, 4);
+ prom_getprop(node, "linebytes", &pitch, 4);
+ prom_getprop(node, "address", &addr, 4);
+ prom_printf("W=%d H=%d LB=%d addr=0x%x\n",
+ width, height, pitch, addr);
+ btext_setup_display(width, height, 8, pitch, addr);
+ }
+#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */
}
}
@@ -1799,16 +1987,18 @@ static void __init *make_room(unsigned long *mem_start, unsigned long *mem_end,
unsigned long room, chunk;
prom_debug("Chunk exhausted, claiming more at %x...\n",
- RELOC(alloc_bottom));
- room = RELOC(alloc_top) - RELOC(alloc_bottom);
+ alloc_bottom);
+ room = alloc_top - alloc_bottom;
if (room > DEVTREE_CHUNK_SIZE)
room = DEVTREE_CHUNK_SIZE;
if (room < PAGE_SIZE)
- prom_panic("No memory for flatten_device_tree (no room)");
+ prom_panic("No memory for flatten_device_tree "
+ "(no room)\n");
chunk = alloc_up(room, 0);
if (chunk == 0)
- prom_panic("No memory for flatten_device_tree (claim failed)");
- *mem_end = RELOC(alloc_top);
+ prom_panic("No memory for flatten_device_tree "
+ "(claim failed)\n");
+ *mem_end = chunk + room;
}
ret = (void *)*mem_start;
@@ -1817,16 +2007,18 @@ static void __init *make_room(unsigned long *mem_start, unsigned long *mem_end,
return ret;
}
-#define dt_push_token(token, mem_start, mem_end) \
- do { *((u32 *)make_room(mem_start, mem_end, 4, 4)) = token; } while(0)
+#define dt_push_token(token, mem_start, mem_end) do { \
+ void *room = make_room(mem_start, mem_end, 4, 4); \
+ *(__be32 *)room = cpu_to_be32(token); \
+ } while(0)
static unsigned long __init dt_find_string(char *str)
{
char *s, *os;
- s = os = (char *)RELOC(dt_string_start);
+ s = os = (char *)dt_string_start;
s += 4;
- while (s < (char *)RELOC(dt_string_end)) {
+ while (s < (char *)dt_string_end) {
if (strcmp(s, str) == 0)
return s - os;
s += strlen(s) + 1;
@@ -1848,10 +2040,10 @@ static void __init scan_dt_build_strings(phandle node,
unsigned long soff;
phandle child;
- sstart = (char *)RELOC(dt_string_start);
+ sstart = (char *)dt_string_start;
/* get and store all property names */
- prev_name = RELOC("");
+ prev_name = "";
for (;;) {
/* 64 is max len of name including nul. */
namep = make_room(mem_start, mem_end, MAX_PROPERTY_NAME, 1);
@@ -1862,9 +2054,9 @@ static void __init scan_dt_build_strings(phandle node,
}
/* skip "name" */
- if (strcmp(namep, RELOC("name")) == 0) {
+ if (strcmp(namep, "name") == 0) {
*mem_start = (unsigned long)namep;
- prev_name = RELOC("name");
+ prev_name = "name";
continue;
}
/* get/create string entry */
@@ -1875,7 +2067,7 @@ static void __init scan_dt_build_strings(phandle node,
} else {
/* Trim off some if we can */
*mem_start = (unsigned long)namep + strlen(namep) + 1;
- RELOC(dt_string_end) = *mem_start;
+ dt_string_end = *mem_start;
}
prev_name = namep;
}
@@ -1896,7 +2088,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start,
unsigned long soff;
unsigned char *valp;
static char pname[MAX_PROPERTY_NAME];
- int l, room;
+ int l, room, has_phandle = 0;
dt_push_token(OF_DT_BEGIN_NODE, mem_start, mem_end);
@@ -1930,46 +2122,39 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start,
}
/* get it again for debugging */
- path = RELOC(prom_scratch);
+ path = prom_scratch;
memset(path, 0, PROM_SCRATCH_SIZE);
call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1);
/* get and store all properties */
- prev_name = RELOC("");
- sstart = (char *)RELOC(dt_string_start);
+ prev_name = "";
+ sstart = (char *)dt_string_start;
for (;;) {
if (call_prom("nextprop", 3, 1, node, prev_name,
- RELOC(pname)) != 1)
+ pname) != 1)
break;
/* skip "name" */
- if (strcmp(RELOC(pname), RELOC("name")) == 0) {
- prev_name = RELOC("name");
+ if (strcmp(pname, "name") == 0) {
+ prev_name = "name";
continue;
}
/* find string offset */
- soff = dt_find_string(RELOC(pname));
+ soff = dt_find_string(pname);
if (soff == 0) {
prom_printf("WARNING: Can't find string index for"
- " <%s>, node %s\n", RELOC(pname), path);
+ " <%s>, node %s\n", pname, path);
break;
}
prev_name = sstart + soff;
/* get length */
- l = call_prom("getproplen", 2, 1, node, RELOC(pname));
+ l = call_prom("getproplen", 2, 1, node, pname);
/* sanity checks */
if (l == PROM_ERROR)
continue;
- if (l > MAX_PROPERTY_LENGTH) {
- prom_printf("WARNING: ignoring large property ");
- /* It seems OF doesn't null-terminate the path :-( */
- prom_printf("[%s] ", path);
- prom_printf("%s length 0x%x\n", RELOC(pname), l);
- continue;
- }
/* push property head */
dt_push_token(OF_DT_PROP, mem_start, mem_end);
@@ -1978,21 +2163,28 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start,
/* push property content */
valp = make_room(mem_start, mem_end, l, 4);
- call_prom("getprop", 4, 1, node, RELOC(pname), valp, l);
+ call_prom("getprop", 4, 1, node, pname, valp, l);
*mem_start = _ALIGN(*mem_start, 4);
+
+ if (!strcmp(pname, "phandle"))
+ has_phandle = 1;
}
- /* Add a "linux,phandle" property. */
- soff = dt_find_string(RELOC("linux,phandle"));
- if (soff == 0)
- prom_printf("WARNING: Can't find string index for"
- " <linux-phandle> node %s\n", path);
- else {
- dt_push_token(OF_DT_PROP, mem_start, mem_end);
- dt_push_token(4, mem_start, mem_end);
- dt_push_token(soff, mem_start, mem_end);
- valp = make_room(mem_start, mem_end, 4, 4);
- *(u32 *)valp = node;
+ /* Add a "linux,phandle" property if no "phandle" property already
+ * existed (can happen with OPAL)
+ */
+ if (!has_phandle) {
+ soff = dt_find_string("linux,phandle");
+ if (soff == 0)
+ prom_printf("WARNING: Can't find string index for"
+ " <linux-phandle> node %s\n", path);
+ else {
+ dt_push_token(OF_DT_PROP, mem_start, mem_end);
+ dt_push_token(4, mem_start, mem_end);
+ dt_push_token(soff, mem_start, mem_end);
+ valp = make_room(mem_start, mem_end, 4, 4);
+ *(__be32 *)valp = cpu_to_be32(node);
+ }
}
/* do all our children */
@@ -2010,24 +2202,23 @@ static void __init flatten_device_tree(void)
phandle root;
unsigned long mem_start, mem_end, room;
struct boot_param_header *hdr;
- struct prom_t *_prom = &RELOC(prom);
char *namep;
u64 *rsvmap;
/*
* Check how much room we have between alloc top & bottom (+/- a
- * few pages), crop to 4Mb, as this is our "chuck" size
+ * few pages), crop to 1MB, as this is our "chunk" size
*/
- room = RELOC(alloc_top) - RELOC(alloc_bottom) - 0x4000;
+ room = alloc_top - alloc_bottom - 0x4000;
if (room > DEVTREE_CHUNK_SIZE)
room = DEVTREE_CHUNK_SIZE;
- prom_debug("starting device tree allocs at %x\n", RELOC(alloc_bottom));
+ prom_debug("starting device tree allocs at %x\n", alloc_bottom);
/* Now try to claim that */
mem_start = (unsigned long)alloc_up(room, PAGE_SIZE);
if (mem_start == 0)
prom_panic("Can't allocate initial device-tree chunk\n");
- mem_end = RELOC(alloc_top);
+ mem_end = mem_start + room;
/* Get root of tree */
root = call_prom("peer", 1, 1, (phandle)0);
@@ -2038,67 +2229,66 @@ static void __init flatten_device_tree(void)
mem_start = _ALIGN(mem_start, 4);
hdr = make_room(&mem_start, &mem_end,
sizeof(struct boot_param_header), 4);
- RELOC(dt_header_start) = (unsigned long)hdr;
+ dt_header_start = (unsigned long)hdr;
rsvmap = make_room(&mem_start, &mem_end, sizeof(mem_reserve_map), 8);
/* Start of strings */
mem_start = PAGE_ALIGN(mem_start);
- RELOC(dt_string_start) = mem_start;
+ dt_string_start = mem_start;
mem_start += 4; /* hole */
/* Add "linux,phandle" in there, we'll need it */
namep = make_room(&mem_start, &mem_end, 16, 1);
- strcpy(namep, RELOC("linux,phandle"));
+ strcpy(namep, "linux,phandle");
mem_start = (unsigned long)namep + strlen(namep) + 1;
/* Build string array */
prom_printf("Building dt strings...\n");
scan_dt_build_strings(root, &mem_start, &mem_end);
- RELOC(dt_string_end) = mem_start;
+ dt_string_end = mem_start;
/* Build structure */
mem_start = PAGE_ALIGN(mem_start);
- RELOC(dt_struct_start) = mem_start;
+ dt_struct_start = mem_start;
prom_printf("Building dt structure...\n");
scan_dt_build_struct(root, &mem_start, &mem_end);
dt_push_token(OF_DT_END, &mem_start, &mem_end);
- RELOC(dt_struct_end) = PAGE_ALIGN(mem_start);
+ dt_struct_end = PAGE_ALIGN(mem_start);
/* Finish header */
- hdr->boot_cpuid_phys = _prom->cpu;
- hdr->magic = OF_DT_HEADER;
- hdr->totalsize = RELOC(dt_struct_end) - RELOC(dt_header_start);
- hdr->off_dt_struct = RELOC(dt_struct_start) - RELOC(dt_header_start);
- hdr->off_dt_strings = RELOC(dt_string_start) - RELOC(dt_header_start);
- hdr->dt_strings_size = RELOC(dt_string_end) - RELOC(dt_string_start);
- hdr->off_mem_rsvmap = ((unsigned long)rsvmap) - RELOC(dt_header_start);
- hdr->version = OF_DT_VERSION;
+ hdr->boot_cpuid_phys = cpu_to_be32(prom.cpu);
+ hdr->magic = cpu_to_be32(OF_DT_HEADER);
+ hdr->totalsize = cpu_to_be32(dt_struct_end - dt_header_start);
+ hdr->off_dt_struct = cpu_to_be32(dt_struct_start - dt_header_start);
+ hdr->off_dt_strings = cpu_to_be32(dt_string_start - dt_header_start);
+ hdr->dt_strings_size = cpu_to_be32(dt_string_end - dt_string_start);
+ hdr->off_mem_rsvmap = cpu_to_be32(((unsigned long)rsvmap) - dt_header_start);
+ hdr->version = cpu_to_be32(OF_DT_VERSION);
/* Version 16 is not backward compatible */
- hdr->last_comp_version = 0x10;
+ hdr->last_comp_version = cpu_to_be32(0x10);
/* Copy the reserve map in */
- memcpy(rsvmap, RELOC(mem_reserve_map), sizeof(mem_reserve_map));
+ memcpy(rsvmap, mem_reserve_map, sizeof(mem_reserve_map));
#ifdef DEBUG_PROM
{
int i;
prom_printf("reserved memory map:\n");
- for (i = 0; i < RELOC(mem_reserve_cnt); i++)
+ for (i = 0; i < mem_reserve_cnt; i++)
prom_printf(" %x - %x\n",
- RELOC(mem_reserve_map)[i].base,
- RELOC(mem_reserve_map)[i].size);
+ be64_to_cpu(mem_reserve_map[i].base),
+ be64_to_cpu(mem_reserve_map[i].size));
}
#endif
/* Bump mem_reserve_cnt to cause further reservations to fail
* since it's too late.
*/
- RELOC(mem_reserve_cnt) = MEM_RESERVE_MAP_SIZE;
+ mem_reserve_cnt = MEM_RESERVE_MAP_SIZE;
prom_printf("Device tree strings 0x%x -> 0x%x\n",
- RELOC(dt_string_start), RELOC(dt_string_end));
+ dt_string_start, dt_string_end);
prom_printf("Device tree struct 0x%x -> 0x%x\n",
- RELOC(dt_struct_start), RELOC(dt_struct_end));
-
+ dt_struct_start, dt_struct_end);
}
#ifdef CONFIG_PPC_MAPLE
@@ -2152,7 +2342,6 @@ static void __init fixup_device_tree_maple_memory_controller(void)
phandle mc;
u32 mc_reg[4];
char *name = "/hostbridge@f8000000";
- struct prom_t *_prom = &RELOC(prom);
u32 ac, sc;
mc = call_prom("finddevice", 1, 1, ADDR(name));
@@ -2162,8 +2351,8 @@ static void __init fixup_device_tree_maple_memory_controller(void)
if (prom_getproplen(mc, "reg") != 8)
return;
- prom_getprop(_prom->root, "#address-cells", &ac, sizeof(ac));
- prom_getprop(_prom->root, "#size-cells", &sc, sizeof(sc));
+ prom_getprop(prom.root, "#address-cells", &ac, sizeof(ac));
+ prom_getprop(prom.root, "#size-cells", &sc, sizeof(sc));
if ((ac != 2) || (sc != 2))
return;
@@ -2432,50 +2621,96 @@ static void __init fixup_device_tree(void)
static void __init prom_find_boot_cpu(void)
{
- struct prom_t *_prom = &RELOC(prom);
- u32 getprop_rval;
+ __be32 rval;
ihandle prom_cpu;
phandle cpu_pkg;
- _prom->cpu = 0;
- if (prom_getprop(_prom->chosen, "cpu", &prom_cpu, sizeof(prom_cpu)) <= 0)
+ rval = 0;
+ if (prom_getprop(prom.chosen, "cpu", &rval, sizeof(rval)) <= 0)
return;
+ prom_cpu = be32_to_cpu(rval);
cpu_pkg = call_prom("instance-to-package", 1, 1, prom_cpu);
- prom_getprop(cpu_pkg, "reg", &getprop_rval, sizeof(getprop_rval));
- _prom->cpu = getprop_rval;
+ prom_getprop(cpu_pkg, "reg", &rval, sizeof(rval));
+ prom.cpu = be32_to_cpu(rval);
- prom_debug("Booting CPU hw index = %lu\n", _prom->cpu);
+ prom_debug("Booting CPU hw index = %lu\n", prom.cpu);
}
static void __init prom_check_initrd(unsigned long r3, unsigned long r4)
{
#ifdef CONFIG_BLK_DEV_INITRD
- struct prom_t *_prom = &RELOC(prom);
-
if (r3 && r4 && r4 != 0xdeadbeef) {
- unsigned long val;
+ __be64 val;
- RELOC(prom_initrd_start) = is_kernel_addr(r3) ? __pa(r3) : r3;
- RELOC(prom_initrd_end) = RELOC(prom_initrd_start) + r4;
+ prom_initrd_start = is_kernel_addr(r3) ? __pa(r3) : r3;
+ prom_initrd_end = prom_initrd_start + r4;
- val = RELOC(prom_initrd_start);
- prom_setprop(_prom->chosen, "/chosen", "linux,initrd-start",
+ val = cpu_to_be64(prom_initrd_start);
+ prom_setprop(prom.chosen, "/chosen", "linux,initrd-start",
&val, sizeof(val));
- val = RELOC(prom_initrd_end);
- prom_setprop(_prom->chosen, "/chosen", "linux,initrd-end",
+ val = cpu_to_be64(prom_initrd_end);
+ prom_setprop(prom.chosen, "/chosen", "linux,initrd-end",
&val, sizeof(val));
- reserve_mem(RELOC(prom_initrd_start),
- RELOC(prom_initrd_end) - RELOC(prom_initrd_start));
+ reserve_mem(prom_initrd_start,
+ prom_initrd_end - prom_initrd_start);
- prom_debug("initrd_start=0x%x\n", RELOC(prom_initrd_start));
- prom_debug("initrd_end=0x%x\n", RELOC(prom_initrd_end));
+ prom_debug("initrd_start=0x%x\n", prom_initrd_start);
+ prom_debug("initrd_end=0x%x\n", prom_initrd_end);
}
#endif /* CONFIG_BLK_DEV_INITRD */
}
+#ifdef CONFIG_PPC64
+#ifdef CONFIG_RELOCATABLE
+static void reloc_toc(void)
+{
+}
+
+static void unreloc_toc(void)
+{
+}
+#else
+static void __reloc_toc(unsigned long offset, unsigned long nr_entries)
+{
+ unsigned long i;
+ unsigned long *toc_entry;
+
+ /* Get the start of the TOC by using r2 directly. */
+ asm volatile("addi %0,2,-0x8000" : "=b" (toc_entry));
+
+ for (i = 0; i < nr_entries; i++) {
+ *toc_entry = *toc_entry + offset;
+ toc_entry++;
+ }
+}
+
+static void reloc_toc(void)
+{
+ unsigned long offset = reloc_offset();
+ unsigned long nr_entries =
+ (__prom_init_toc_end - __prom_init_toc_start) / sizeof(long);
+
+ __reloc_toc(offset, nr_entries);
+
+ mb();
+}
+
+static void unreloc_toc(void)
+{
+ unsigned long offset = reloc_offset();
+ unsigned long nr_entries =
+ (__prom_init_toc_end - __prom_init_toc_start) / sizeof(long);
+
+ mb();
+
+ __reloc_toc(-offset, nr_entries);
+}
+#endif
+#endif
+
/*
* We enter here early on, when the Open Firmware prom is still
* handling exceptions and the MMU hash table for us.
@@ -2486,20 +2721,19 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
unsigned long r6, unsigned long r7,
unsigned long kbase)
{
- struct prom_t *_prom;
unsigned long hdr;
#ifdef CONFIG_PPC32
unsigned long offset = reloc_offset();
reloc_got2(offset);
+#else
+ reloc_toc();
#endif
- _prom = &RELOC(prom);
-
/*
* First zero the BSS
*/
- memset(&RELOC(__bss_start), 0, __bss_stop - __bss_start);
+ memset(&__bss_start, 0, __bss_stop - __bss_start);
/*
* Init interface to Open Firmware, get some node references,
@@ -2518,15 +2752,16 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
*/
prom_init_stdout();
- prom_printf("Preparing to boot %s", RELOC(linux_banner));
+ prom_printf("Preparing to boot %s", linux_banner);
/*
* Get default machine type. At this point, we do not differentiate
* between pSeries SMP and pSeries LPAR
*/
- RELOC(of_platform) = prom_find_machine_type();
+ of_platform = prom_find_machine_type();
+ prom_printf("Detected machine type: %x\n", of_platform);
-#ifndef CONFIG_RELOCATABLE
+#ifndef CONFIG_NONSTATIC_KERNEL
/* Bail if this is a kdump kernel. */
if (PHYSICAL_START > 0)
prom_panic("Error: You can't boot a kdump kernel from OF!\n");
@@ -2537,19 +2772,19 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
*/
prom_check_initrd(r3, r4);
-#ifdef CONFIG_PPC_PSERIES
+#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
/*
* On pSeries, inform the firmware about our capabilities
*/
- if (RELOC(of_platform) == PLATFORM_PSERIES ||
- RELOC(of_platform) == PLATFORM_PSERIES_LPAR)
+ if (of_platform == PLATFORM_PSERIES ||
+ of_platform == PLATFORM_PSERIES_LPAR)
prom_send_capabilities();
#endif
/*
* Copy the CPU hold code
*/
- if (RELOC(of_platform) != PLATFORM_POWERMAC)
+ if (of_platform != PLATFORM_POWERMAC)
copy_and_flush(0, kbase, 0x100, 0);
/*
@@ -2572,48 +2807,68 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
*/
prom_check_displays();
-#ifdef CONFIG_PPC64
+#if defined(CONFIG_PPC64) && defined(__BIG_ENDIAN__)
/*
* Initialize IOMMU (TCE tables) on pSeries. Do that before anything else
* that uses the allocator, we need to make sure we get the top of memory
* available for us here...
*/
- if (RELOC(of_platform) == PLATFORM_PSERIES)
+ if (of_platform == PLATFORM_PSERIES)
prom_initialize_tce_table();
#endif
/*
- * On non-powermacs, try to instantiate RTAS and puts all CPUs
- * in spin-loops. PowerMacs don't have a working RTAS and use
- * a different way to spin CPUs
+ * On non-powermacs, try to instantiate RTAS. PowerMacs don't
+ * have a usable RTAS implementation.
*/
- if (RELOC(of_platform) != PLATFORM_POWERMAC) {
+ if (of_platform != PLATFORM_POWERMAC &&
+ of_platform != PLATFORM_OPAL)
prom_instantiate_rtas();
+
+#ifdef CONFIG_PPC_POWERNV
+ if (of_platform == PLATFORM_OPAL)
+ prom_instantiate_opal();
+#endif /* CONFIG_PPC_POWERNV */
+
+#ifdef CONFIG_PPC64
+ /* instantiate sml */
+ prom_instantiate_sml();
+#endif
+
+ /*
+ * On non-powermacs, put all CPUs in spin-loops.
+ *
+ * PowerMacs use a different mechanism to spin CPUs
+ *
+ * (This must be done after instanciating RTAS)
+ */
+ if (of_platform != PLATFORM_POWERMAC &&
+ of_platform != PLATFORM_OPAL)
prom_hold_cpus();
- }
/*
* Fill in some infos for use by the kernel later on
*/
- if (RELOC(prom_memory_limit))
- prom_setprop(_prom->chosen, "/chosen", "linux,memory-limit",
- &RELOC(prom_memory_limit),
- sizeof(prom_memory_limit));
+ if (prom_memory_limit) {
+ __be64 val = cpu_to_be64(prom_memory_limit);
+ prom_setprop(prom.chosen, "/chosen", "linux,memory-limit",
+ &val, sizeof(val));
+ }
#ifdef CONFIG_PPC64
- if (RELOC(prom_iommu_off))
- prom_setprop(_prom->chosen, "/chosen", "linux,iommu-off",
+ if (prom_iommu_off)
+ prom_setprop(prom.chosen, "/chosen", "linux,iommu-off",
NULL, 0);
- if (RELOC(prom_iommu_force_on))
- prom_setprop(_prom->chosen, "/chosen", "linux,iommu-force-on",
+ if (prom_iommu_force_on)
+ prom_setprop(prom.chosen, "/chosen", "linux,iommu-force-on",
NULL, 0);
- if (RELOC(prom_tce_alloc_start)) {
- prom_setprop(_prom->chosen, "/chosen", "linux,tce-alloc-start",
- &RELOC(prom_tce_alloc_start),
+ if (prom_tce_alloc_start) {
+ prom_setprop(prom.chosen, "/chosen", "linux,tce-alloc-start",
+ &prom_tce_alloc_start,
sizeof(prom_tce_alloc_start));
- prom_setprop(_prom->chosen, "/chosen", "linux,tce-alloc-end",
- &RELOC(prom_tce_alloc_end),
+ prom_setprop(prom.chosen, "/chosen", "linux,tce-alloc-end",
+ &prom_tce_alloc_end,
sizeof(prom_tce_alloc_end));
}
#endif
@@ -2632,9 +2887,11 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
/*
* in case stdin is USB and still active on IBM machines...
* Unfortunately quiesce crashes on some powermacs if we have
- * closed stdin already (in particular the powerbook 101).
+ * closed stdin already (in particular the powerbook 101). It
+ * appears that the OPAL version of OFW doesn't like it either.
*/
- if (RELOC(of_platform) != PLATFORM_POWERMAC)
+ if (of_platform != PLATFORM_POWERMAC &&
+ of_platform != PLATFORM_OPAL)
prom_close_stdin();
/*
@@ -2649,15 +2906,27 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
* tree and NULL as r5, thus triggering the new entry point which
* is common to us and kexec
*/
- hdr = RELOC(dt_header_start);
- prom_printf("returning from prom_init\n");
- prom_debug("->dt_header_start=0x%x\n", hdr);
+ hdr = dt_header_start;
+
+ /* Don't print anything after quiesce under OPAL, it crashes OFW */
+ if (of_platform != PLATFORM_OPAL) {
+ prom_printf("returning from prom_init\n");
+ prom_debug("->dt_header_start=0x%x\n", hdr);
+ }
#ifdef CONFIG_PPC32
reloc_got2(-offset);
+#else
+ unreloc_toc();
#endif
- __start(hdr, kbase, 0);
+#ifdef CONFIG_PPC_EARLY_DEBUG_OPAL
+ /* OPAL early debug gets the OPAL base & entry in r8 and r9 */
+ __start(hdr, kbase, 0, 0, 0,
+ prom_opal_base, prom_opal_entry);
+#else
+ __start(hdr, kbase, 0, 0, 0, 0, 0);
+#endif
return 0;
}
diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh
index 9f82f493789..fe8e54b9ef7 100644
--- a/arch/powerpc/kernel/prom_init_check.sh
+++ b/arch/powerpc/kernel/prom_init_check.sh
@@ -20,7 +20,8 @@ WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush
_end enter_prom memcpy memset reloc_offset __secondary_hold
__secondary_hold_acknowledge __secondary_hold_spinloop __start
strcmp strcpy strlcpy strlen strncmp strstr logo_linux_clut224
-reloc_got2 kernstart_addr memstart_addr linux_banner"
+reloc_got2 kernstart_addr memstart_addr linux_banner _stext
+__prom_init_toc_start __prom_init_toc_end btext_setup_display TOC."
NM="$1"
OBJ="$2"
diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c
index 88334af038e..6295e646f78 100644
--- a/arch/powerpc/kernel/prom_parse.c
+++ b/arch/powerpc/kernel/prom_parse.c
@@ -2,156 +2,32 @@
#include <linux/kernel.h>
#include <linux/string.h>
-#include <linux/pci_regs.h>
-#include <linux/module.h>
#include <linux/ioport.h>
#include <linux/etherdevice.h>
#include <linux/of_address.h>
#include <asm/prom.h>
-#include <asm/pci-bridge.h>
-#ifdef CONFIG_PCI
-int of_irq_map_pci(struct pci_dev *pdev, struct of_irq *out_irq)
+void of_parse_dma_window(struct device_node *dn, const __be32 *dma_window,
+ unsigned long *busno, unsigned long *phys,
+ unsigned long *size)
{
- struct device_node *dn, *ppnode;
- struct pci_dev *ppdev;
- u32 lspec;
- u32 laddr[3];
- u8 pin;
- int rc;
-
- /* Check if we have a device node, if yes, fallback to standard OF
- * parsing
- */
- dn = pci_device_to_OF_node(pdev);
- if (dn) {
- rc = of_irq_map_one(dn, 0, out_irq);
- if (!rc)
- return rc;
- }
-
- /* Ok, we don't, time to have fun. Let's start by building up an
- * interrupt spec. we assume #interrupt-cells is 1, which is standard
- * for PCI. If you do different, then don't use that routine.
- */
- rc = pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
- if (rc != 0)
- return rc;
- /* No pin, exit */
- if (pin == 0)
- return -ENODEV;
-
- /* Now we walk up the PCI tree */
- lspec = pin;
- for (;;) {
- /* Get the pci_dev of our parent */
- ppdev = pdev->bus->self;
-
- /* Ouch, it's a host bridge... */
- if (ppdev == NULL) {
-#ifdef CONFIG_PPC64
- ppnode = pci_bus_to_OF_node(pdev->bus);
-#else
- struct pci_controller *host;
- host = pci_bus_to_host(pdev->bus);
- ppnode = host ? host->dn : NULL;
-#endif
- /* No node for host bridge ? give up */
- if (ppnode == NULL)
- return -EINVAL;
- } else
- /* We found a P2P bridge, check if it has a node */
- ppnode = pci_device_to_OF_node(ppdev);
-
- /* Ok, we have found a parent with a device-node, hand over to
- * the OF parsing code.
- * We build a unit address from the linux device to be used for
- * resolution. Note that we use the linux bus number which may
- * not match your firmware bus numbering.
- * Fortunately, in most cases, interrupt-map-mask doesn't include
- * the bus number as part of the matching.
- * You should still be careful about that though if you intend
- * to rely on this function (you ship a firmware that doesn't
- * create device nodes for all PCI devices).
- */
- if (ppnode)
- break;
-
- /* We can only get here if we hit a P2P bridge with no node,
- * let's do standard swizzling and try again
- */
- lspec = pci_swizzle_interrupt_pin(pdev, lspec);
- pdev = ppdev;
- }
-
- laddr[0] = (pdev->bus->number << 16)
- | (pdev->devfn << 8);
- laddr[1] = laddr[2] = 0;
- return of_irq_map_raw(ppnode, &lspec, 1, laddr, out_irq);
-}
-EXPORT_SYMBOL_GPL(of_irq_map_pci);
-#endif /* CONFIG_PCI */
-
-void of_parse_dma_window(struct device_node *dn, const void *dma_window_prop,
- unsigned long *busno, unsigned long *phys, unsigned long *size)
-{
- const u32 *dma_window;
u32 cells;
- const unsigned char *prop;
-
- dma_window = dma_window_prop;
+ const __be32 *prop;
/* busno is always one cell */
- *busno = *(dma_window++);
+ *busno = of_read_number(dma_window, 1);
+ dma_window++;
prop = of_get_property(dn, "ibm,#dma-address-cells", NULL);
if (!prop)
prop = of_get_property(dn, "#address-cells", NULL);
- cells = prop ? *(u32 *)prop : of_n_addr_cells(dn);
+ cells = prop ? of_read_number(prop, 1) : of_n_addr_cells(dn);
*phys = of_read_number(dma_window, cells);
dma_window += cells;
prop = of_get_property(dn, "ibm,#dma-size-cells", NULL);
- cells = prop ? *(u32 *)prop : of_n_size_cells(dn);
+ cells = prop ? of_read_number(prop, 1) : of_n_size_cells(dn);
*size = of_read_number(dma_window, cells);
}
-
-/**
- * Search the device tree for the best MAC address to use. 'mac-address' is
- * checked first, because that is supposed to contain to "most recent" MAC
- * address. If that isn't set, then 'local-mac-address' is checked next,
- * because that is the default address. If that isn't set, then the obsolete
- * 'address' is checked, just in case we're using an old device tree.
- *
- * Note that the 'address' property is supposed to contain a virtual address of
- * the register set, but some DTS files have redefined that property to be the
- * MAC address.
- *
- * All-zero MAC addresses are rejected, because those could be properties that
- * exist in the device tree, but were not set by U-Boot. For example, the
- * DTS could define 'mac-address' and 'local-mac-address', with zero MAC
- * addresses. Some older U-Boots only initialized 'local-mac-address'. In
- * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists
- * but is all zeros.
-*/
-const void *of_get_mac_address(struct device_node *np)
-{
- struct property *pp;
-
- pp = of_find_property(np, "mac-address", NULL);
- if (pp && (pp->length == 6) && is_valid_ether_addr(pp->value))
- return pp->value;
-
- pp = of_find_property(np, "local-mac-address", NULL);
- if (pp && (pp->length == 6) && is_valid_ether_addr(pp->value))
- return pp->value;
-
- pp = of_find_property(np, "address", NULL);
- if (pp && (pp->length == 6) && is_valid_ether_addr(pp->value))
- return pp->value;
-
- return NULL;
-}
-EXPORT_SYMBOL(of_get_mac_address);
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index a9b32967cff..2e3d2bf536c 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -29,16 +29,18 @@
#include <linux/signal.h>
#include <linux/seccomp.h>
#include <linux/audit.h>
-#ifdef CONFIG_PPC32
-#include <linux/module.h>
-#endif
+#include <trace/syscall.h>
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
+#include <linux/context_tracking.h>
#include <asm/uaccess.h>
#include <asm/page.h>
#include <asm/pgtable.h>
-#include <asm/system.h>
+#include <asm/switch_to.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
/*
* The parameter save area on the stack is used to store arguments being passed
@@ -178,6 +180,31 @@ static int set_user_msr(struct task_struct *task, unsigned long msr)
return 0;
}
+#ifdef CONFIG_PPC64
+static int get_user_dscr(struct task_struct *task, unsigned long *data)
+{
+ *data = task->thread.dscr;
+ return 0;
+}
+
+static int set_user_dscr(struct task_struct *task, unsigned long dscr)
+{
+ task->thread.dscr = dscr;
+ task->thread.dscr_inherit = 1;
+ return 0;
+}
+#else
+static int get_user_dscr(struct task_struct *task, unsigned long *data)
+{
+ return -EIO;
+}
+
+static int set_user_dscr(struct task_struct *task, unsigned long dscr)
+{
+ return -EIO;
+}
+#endif
+
/*
* We prevent mucking around with the reserved area of trap
* which are used internally by the kernel.
@@ -191,16 +218,23 @@ static int set_user_trap(struct task_struct *task, unsigned long trap)
/*
* Get contents of register REGNO in task TASK.
*/
-unsigned long ptrace_get_reg(struct task_struct *task, int regno)
+int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data)
{
- if (task->thread.regs == NULL)
+ if ((task->thread.regs == NULL) || !data)
return -EIO;
- if (regno == PT_MSR)
- return get_user_msr(task);
+ if (regno == PT_MSR) {
+ *data = get_user_msr(task);
+ return 0;
+ }
+
+ if (regno == PT_DSCR)
+ return get_user_dscr(task, data);
- if (regno < (sizeof(struct pt_regs) / sizeof(unsigned long)))
- return ((unsigned long *)task->thread.regs)[regno];
+ if (regno < (sizeof(struct pt_regs) / sizeof(unsigned long))) {
+ *data = ((unsigned long *)task->thread.regs)[regno];
+ return 0;
+ }
return -EIO;
}
@@ -217,6 +251,8 @@ int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data)
return set_user_msr(task, data);
if (regno == PT_TRAP)
return set_user_trap(task, data);
+ if (regno == PT_DSCR)
+ return set_user_dscr(task, data);
if (regno <= PT_MAX_PUT_REG) {
((unsigned long *)task->thread.regs)[regno] = data;
@@ -229,12 +265,16 @@ static int gpr_get(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf)
{
- int ret;
+ int i, ret;
if (target->thread.regs == NULL)
return -EIO;
- CHECK_FULL_REGS(target->thread.regs);
+ if (!FULL_REGS(target->thread.regs)) {
+ /* We have a partial register set. Fill 14-31 with bogus values */
+ for (i = 14; i < 32; i++)
+ target->thread.regs->gpr[i] = NV_REG_POISON;
+ }
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
target->thread.regs,
@@ -322,7 +362,7 @@ static int fpr_get(struct task_struct *target, const struct user_regset *regset,
void *kbuf, void __user *ubuf)
{
#ifdef CONFIG_VSX
- double buf[33];
+ u64 buf[33];
int i;
#endif
flush_fp_to_thread(target);
@@ -331,15 +371,15 @@ static int fpr_get(struct task_struct *target, const struct user_regset *regset,
/* copy to local buffer then write that out */
for (i = 0; i < 32 ; i++)
buf[i] = target->thread.TS_FPR(i);
- memcpy(&buf[32], &target->thread.fpscr, sizeof(double));
+ buf[32] = target->thread.fp_state.fpscr;
return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
#else
- BUILD_BUG_ON(offsetof(struct thread_struct, fpscr) !=
- offsetof(struct thread_struct, TS_FPR(32)));
+ BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) !=
+ offsetof(struct thread_fp_state, fpr[32][0]));
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.fpr, 0, -1);
+ &target->thread.fp_state, 0, -1);
#endif
}
@@ -348,7 +388,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset,
const void *kbuf, const void __user *ubuf)
{
#ifdef CONFIG_VSX
- double buf[33];
+ u64 buf[33];
int i;
#endif
flush_fp_to_thread(target);
@@ -360,14 +400,14 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset,
return i;
for (i = 0; i < 32 ; i++)
target->thread.TS_FPR(i) = buf[i];
- memcpy(&target->thread.fpscr, &buf[32], sizeof(double));
+ target->thread.fp_state.fpscr = buf[32];
return 0;
#else
- BUILD_BUG_ON(offsetof(struct thread_struct, fpscr) !=
- offsetof(struct thread_struct, TS_FPR(32)));
+ BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) !=
+ offsetof(struct thread_fp_state, fpr[32][0]));
return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.fpr, 0, -1);
+ &target->thread.fp_state, 0, -1);
#endif
}
@@ -400,11 +440,11 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset,
flush_altivec_to_thread(target);
- BUILD_BUG_ON(offsetof(struct thread_struct, vscr) !=
- offsetof(struct thread_struct, vr[32]));
+ BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) !=
+ offsetof(struct thread_vr_state, vr[32]));
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.vr, 0,
+ &target->thread.vr_state, 0,
33 * sizeof(vector128));
if (!ret) {
/*
@@ -431,11 +471,12 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset,
flush_altivec_to_thread(target);
- BUILD_BUG_ON(offsetof(struct thread_struct, vscr) !=
- offsetof(struct thread_struct, vr[32]));
+ BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) !=
+ offsetof(struct thread_vr_state, vr[32]));
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.vr, 0, 33 * sizeof(vector128));
+ &target->thread.vr_state, 0,
+ 33 * sizeof(vector128));
if (!ret && count > 0) {
/*
* We use only the first word of vrsave.
@@ -459,7 +500,7 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset,
#ifdef CONFIG_VSX
/*
* Currently to set and and get all the vsx state, you need to call
- * the fp and VMX calls aswell. This only get/sets the lower 32
+ * the fp and VMX calls as well. This only get/sets the lower 32
* 128bit VSX registers.
*/
@@ -474,13 +515,13 @@ static int vsr_get(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf)
{
- double buf[32];
+ u64 buf[32];
int ret, i;
flush_vsx_to_thread(target);
for (i = 0; i < 32 ; i++)
- buf[i] = target->thread.fpr[i][TS_VSRLOWOFFSET];
+ buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET];
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
buf, 0, 32 * sizeof(double));
@@ -491,7 +532,7 @@ static int vsr_set(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
- double buf[32];
+ u64 buf[32];
int ret,i;
flush_vsx_to_thread(target);
@@ -499,7 +540,7 @@ static int vsr_set(struct task_struct *target, const struct user_regset *regset,
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
buf, 0, 32 * sizeof(double));
for (i = 0; i < 32 ; i++)
- target->thread.fpr[i][TS_VSRLOWOFFSET] = buf[i];
+ target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
return ret;
@@ -617,7 +658,7 @@ static const struct user_regset native_regsets[] = {
#endif
#ifdef CONFIG_SPE
[REGSET_SPE] = {
- .n = 35,
+ .core_note_type = NT_PPC_SPE, .n = 35,
.size = sizeof(u32), .align = sizeof(u32),
.active = evr_active, .get = evr_get, .set = evr_set
},
@@ -641,11 +682,16 @@ static int gpr32_get(struct task_struct *target,
compat_ulong_t *k = kbuf;
compat_ulong_t __user *u = ubuf;
compat_ulong_t reg;
+ int i;
if (target->thread.regs == NULL)
return -EIO;
- CHECK_FULL_REGS(target->thread.regs);
+ if (!FULL_REGS(target->thread.regs)) {
+ /* We have a partial register set. Fill 14-31 with bogus values */
+ for (i = 14; i < 32; i++)
+ target->thread.regs->gpr[i] = NV_REG_POISON;
+ }
pos /= sizeof(reg);
count /= sizeof(reg);
@@ -809,8 +855,8 @@ void user_enable_single_step(struct task_struct *task)
if (regs != NULL) {
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
- task->thread.dbcr0 &= ~DBCR0_BT;
- task->thread.dbcr0 |= DBCR0_IDM | DBCR0_IC;
+ task->thread.debug.dbcr0 &= ~DBCR0_BT;
+ task->thread.debug.dbcr0 |= DBCR0_IDM | DBCR0_IC;
regs->msr |= MSR_DE;
#else
regs->msr &= ~MSR_BE;
@@ -826,8 +872,8 @@ void user_enable_block_step(struct task_struct *task)
if (regs != NULL) {
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
- task->thread.dbcr0 &= ~DBCR0_IC;
- task->thread.dbcr0 = DBCR0_IDM | DBCR0_BT;
+ task->thread.debug.dbcr0 &= ~DBCR0_IC;
+ task->thread.debug.dbcr0 = DBCR0_IDM | DBCR0_BT;
regs->msr |= MSR_DE;
#else
regs->msr &= ~MSR_SE;
@@ -849,16 +895,16 @@ void user_disable_single_step(struct task_struct *task)
* And, after doing so, if all debug flags are off, turn
* off DBCR0(IDM) and MSR(DE) .... Torez
*/
- task->thread.dbcr0 &= ~DBCR0_IC;
+ task->thread.debug.dbcr0 &= ~(DBCR0_IC|DBCR0_BT);
/*
* Test to see if any of the DBCR_ACTIVE_EVENTS bits are set.
*/
- if (!DBCR_ACTIVE_EVENTS(task->thread.dbcr0,
- task->thread.dbcr1)) {
+ if (!DBCR_ACTIVE_EVENTS(task->thread.debug.dbcr0,
+ task->thread.debug.dbcr1)) {
/*
* All debug events were off.....
*/
- task->thread.dbcr0 &= ~DBCR0_IDM;
+ task->thread.debug.dbcr0 &= ~DBCR0_IDM;
regs->msr &= ~MSR_DE;
}
#else
@@ -869,7 +915,7 @@ void user_disable_single_step(struct task_struct *task)
}
#ifdef CONFIG_HAVE_HW_BREAKPOINT
-void ptrace_triggered(struct perf_event *bp, int nmi,
+void ptrace_triggered(struct perf_event *bp,
struct perf_sample_data *data, struct pt_regs *regs)
{
struct perf_event_attr attr;
@@ -895,6 +941,9 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
struct perf_event *bp;
struct perf_event_attr attr;
#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+#ifndef CONFIG_PPC_ADV_DEBUG_REGS
+ struct arch_hw_breakpoint hw_brk;
+#endif
/* For ppc64 we support one DABR and no IABR's at the moment (ppc64).
* For embedded processors we support one DAC and no IAC's at the
@@ -921,11 +970,14 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
*/
/* Ensure breakpoint translation bit is set */
- if (data && !(data & DABR_TRANSLATION))
+ if (data && !(data & HW_BRK_TYPE_TRANSLATE))
return -EIO;
+ hw_brk.address = data & (~HW_BRK_TYPE_DABR);
+ hw_brk.type = (data & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL;
+ hw_brk.len = 8;
#ifdef CONFIG_HAVE_HW_BREAKPOINT
bp = thread->ptrace_bps[0];
- if ((!data) || !(data & (DABR_DATA_WRITE | DABR_DATA_READ))) {
+ if ((!data) || !(hw_brk.type & HW_BRK_TYPE_RDWR)) {
if (bp) {
unregister_hw_breakpoint(bp);
thread->ptrace_bps[0] = NULL;
@@ -934,35 +986,36 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
}
if (bp) {
attr = bp->attr;
- attr.bp_addr = data & ~HW_BREAKPOINT_ALIGN;
- arch_bp_generic_fields(data &
- (DABR_DATA_WRITE | DABR_DATA_READ),
- &attr.bp_type);
+ attr.bp_addr = hw_brk.address;
+ arch_bp_generic_fields(hw_brk.type, &attr.bp_type);
+
+ /* Enable breakpoint */
+ attr.disabled = false;
+
ret = modify_user_hw_breakpoint(bp, &attr);
- if (ret)
+ if (ret) {
return ret;
+ }
thread->ptrace_bps[0] = bp;
- thread->dabr = data;
+ thread->hw_brk = hw_brk;
return 0;
}
/* Create a new breakpoint request if one doesn't exist already */
hw_breakpoint_init(&attr);
- attr.bp_addr = data & ~HW_BREAKPOINT_ALIGN;
- arch_bp_generic_fields(data & (DABR_DATA_WRITE | DABR_DATA_READ),
- &attr.bp_type);
+ attr.bp_addr = hw_brk.address;
+ arch_bp_generic_fields(hw_brk.type,
+ &attr.bp_type);
thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr,
- ptrace_triggered, task);
+ ptrace_triggered, NULL, task);
if (IS_ERR(bp)) {
thread->ptrace_bps[0] = NULL;
return PTR_ERR(bp);
}
#endif /* CONFIG_HAVE_HW_BREAKPOINT */
-
- /* Move contents to the DABR register */
- task->thread.dabr = data;
+ task->thread.hw_brk = hw_brk;
#else /* CONFIG_PPC_ADV_DEBUG_REGS */
/* As described above, it was assumed 3 bits were passed with the data
* address, but we will assume only the mode bits will be passed
@@ -970,14 +1023,14 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
*/
/* DAC's hold the whole address without any mode flags */
- task->thread.dac1 = data & ~0x3UL;
+ task->thread.debug.dac1 = data & ~0x3UL;
- if (task->thread.dac1 == 0) {
+ if (task->thread.debug.dac1 == 0) {
dbcr_dac(task) &= ~(DBCR_DAC1R | DBCR_DAC1W);
- if (!DBCR_ACTIVE_EVENTS(task->thread.dbcr0,
- task->thread.dbcr1)) {
+ if (!DBCR_ACTIVE_EVENTS(task->thread.debug.dbcr0,
+ task->thread.debug.dbcr1)) {
task->thread.regs->msr &= ~MSR_DE;
- task->thread.dbcr0 &= ~DBCR0_IDM;
+ task->thread.debug.dbcr0 &= ~DBCR0_IDM;
}
return 0;
}
@@ -989,7 +1042,7 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr,
/* Set the Internal Debugging flag (IDM bit 1) for the DBCR0
register */
- task->thread.dbcr0 |= DBCR0_IDM;
+ task->thread.debug.dbcr0 |= DBCR0_IDM;
/* Check for write and read flags and set DBCR0
accordingly */
@@ -1015,14 +1068,14 @@ void ptrace_disable(struct task_struct *child)
}
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
-static long set_intruction_bp(struct task_struct *child,
+static long set_instruction_bp(struct task_struct *child,
struct ppc_hw_breakpoint *bp_info)
{
int slot;
- int slot1_in_use = ((child->thread.dbcr0 & DBCR0_IAC1) != 0);
- int slot2_in_use = ((child->thread.dbcr0 & DBCR0_IAC2) != 0);
- int slot3_in_use = ((child->thread.dbcr0 & DBCR0_IAC3) != 0);
- int slot4_in_use = ((child->thread.dbcr0 & DBCR0_IAC4) != 0);
+ int slot1_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC1) != 0);
+ int slot2_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC2) != 0);
+ int slot3_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC3) != 0);
+ int slot4_in_use = ((child->thread.debug.dbcr0 & DBCR0_IAC4) != 0);
if (dbcr_iac_range(child) & DBCR_IAC12MODE)
slot2_in_use = 1;
@@ -1041,9 +1094,9 @@ static long set_intruction_bp(struct task_struct *child,
/* We need a pair of IAC regsisters */
if ((!slot1_in_use) && (!slot2_in_use)) {
slot = 1;
- child->thread.iac1 = bp_info->addr;
- child->thread.iac2 = bp_info->addr2;
- child->thread.dbcr0 |= DBCR0_IAC1;
+ child->thread.debug.iac1 = bp_info->addr;
+ child->thread.debug.iac2 = bp_info->addr2;
+ child->thread.debug.dbcr0 |= DBCR0_IAC1;
if (bp_info->addr_mode ==
PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE)
dbcr_iac_range(child) |= DBCR_IAC12X;
@@ -1052,9 +1105,9 @@ static long set_intruction_bp(struct task_struct *child,
#if CONFIG_PPC_ADV_DEBUG_IACS > 2
} else if ((!slot3_in_use) && (!slot4_in_use)) {
slot = 3;
- child->thread.iac3 = bp_info->addr;
- child->thread.iac4 = bp_info->addr2;
- child->thread.dbcr0 |= DBCR0_IAC3;
+ child->thread.debug.iac3 = bp_info->addr;
+ child->thread.debug.iac4 = bp_info->addr2;
+ child->thread.debug.dbcr0 |= DBCR0_IAC3;
if (bp_info->addr_mode ==
PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE)
dbcr_iac_range(child) |= DBCR_IAC34X;
@@ -1074,30 +1127,30 @@ static long set_intruction_bp(struct task_struct *child,
*/
if (slot2_in_use || (slot3_in_use == slot4_in_use)) {
slot = 1;
- child->thread.iac1 = bp_info->addr;
- child->thread.dbcr0 |= DBCR0_IAC1;
+ child->thread.debug.iac1 = bp_info->addr;
+ child->thread.debug.dbcr0 |= DBCR0_IAC1;
goto out;
}
}
if (!slot2_in_use) {
slot = 2;
- child->thread.iac2 = bp_info->addr;
- child->thread.dbcr0 |= DBCR0_IAC2;
+ child->thread.debug.iac2 = bp_info->addr;
+ child->thread.debug.dbcr0 |= DBCR0_IAC2;
#if CONFIG_PPC_ADV_DEBUG_IACS > 2
} else if (!slot3_in_use) {
slot = 3;
- child->thread.iac3 = bp_info->addr;
- child->thread.dbcr0 |= DBCR0_IAC3;
+ child->thread.debug.iac3 = bp_info->addr;
+ child->thread.debug.dbcr0 |= DBCR0_IAC3;
} else if (!slot4_in_use) {
slot = 4;
- child->thread.iac4 = bp_info->addr;
- child->thread.dbcr0 |= DBCR0_IAC4;
+ child->thread.debug.iac4 = bp_info->addr;
+ child->thread.debug.dbcr0 |= DBCR0_IAC4;
#endif
} else
return -ENOSPC;
}
out:
- child->thread.dbcr0 |= DBCR0_IDM;
+ child->thread.debug.dbcr0 |= DBCR0_IDM;
child->thread.regs->msr |= MSR_DE;
return slot;
@@ -1107,49 +1160,49 @@ static int del_instruction_bp(struct task_struct *child, int slot)
{
switch (slot) {
case 1:
- if ((child->thread.dbcr0 & DBCR0_IAC1) == 0)
+ if ((child->thread.debug.dbcr0 & DBCR0_IAC1) == 0)
return -ENOENT;
if (dbcr_iac_range(child) & DBCR_IAC12MODE) {
/* address range - clear slots 1 & 2 */
- child->thread.iac2 = 0;
+ child->thread.debug.iac2 = 0;
dbcr_iac_range(child) &= ~DBCR_IAC12MODE;
}
- child->thread.iac1 = 0;
- child->thread.dbcr0 &= ~DBCR0_IAC1;
+ child->thread.debug.iac1 = 0;
+ child->thread.debug.dbcr0 &= ~DBCR0_IAC1;
break;
case 2:
- if ((child->thread.dbcr0 & DBCR0_IAC2) == 0)
+ if ((child->thread.debug.dbcr0 & DBCR0_IAC2) == 0)
return -ENOENT;
if (dbcr_iac_range(child) & DBCR_IAC12MODE)
/* used in a range */
return -EINVAL;
- child->thread.iac2 = 0;
- child->thread.dbcr0 &= ~DBCR0_IAC2;
+ child->thread.debug.iac2 = 0;
+ child->thread.debug.dbcr0 &= ~DBCR0_IAC2;
break;
#if CONFIG_PPC_ADV_DEBUG_IACS > 2
case 3:
- if ((child->thread.dbcr0 & DBCR0_IAC3) == 0)
+ if ((child->thread.debug.dbcr0 & DBCR0_IAC3) == 0)
return -ENOENT;
if (dbcr_iac_range(child) & DBCR_IAC34MODE) {
/* address range - clear slots 3 & 4 */
- child->thread.iac4 = 0;
+ child->thread.debug.iac4 = 0;
dbcr_iac_range(child) &= ~DBCR_IAC34MODE;
}
- child->thread.iac3 = 0;
- child->thread.dbcr0 &= ~DBCR0_IAC3;
+ child->thread.debug.iac3 = 0;
+ child->thread.debug.dbcr0 &= ~DBCR0_IAC3;
break;
case 4:
- if ((child->thread.dbcr0 & DBCR0_IAC4) == 0)
+ if ((child->thread.debug.dbcr0 & DBCR0_IAC4) == 0)
return -ENOENT;
if (dbcr_iac_range(child) & DBCR_IAC34MODE)
/* Used in a range */
return -EINVAL;
- child->thread.iac4 = 0;
- child->thread.dbcr0 &= ~DBCR0_IAC4;
+ child->thread.debug.iac4 = 0;
+ child->thread.debug.dbcr0 &= ~DBCR0_IAC4;
break;
#endif
default:
@@ -1179,18 +1232,18 @@ static int set_dac(struct task_struct *child, struct ppc_hw_breakpoint *bp_info)
dbcr_dac(child) |= DBCR_DAC1R;
if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE)
dbcr_dac(child) |= DBCR_DAC1W;
- child->thread.dac1 = (unsigned long)bp_info->addr;
+ child->thread.debug.dac1 = (unsigned long)bp_info->addr;
#if CONFIG_PPC_ADV_DEBUG_DVCS > 0
if (byte_enable) {
- child->thread.dvc1 =
+ child->thread.debug.dvc1 =
(unsigned long)bp_info->condition_value;
- child->thread.dbcr2 |=
+ child->thread.debug.dbcr2 |=
((byte_enable << DBCR2_DVC1BE_SHIFT) |
(condition_mode << DBCR2_DVC1M_SHIFT));
}
#endif
#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE
- } else if (child->thread.dbcr2 & DBCR2_DAC12MODE) {
+ } else if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) {
/* Both dac1 and dac2 are part of a range */
return -ENOSPC;
#endif
@@ -1200,19 +1253,19 @@ static int set_dac(struct task_struct *child, struct ppc_hw_breakpoint *bp_info)
dbcr_dac(child) |= DBCR_DAC2R;
if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE)
dbcr_dac(child) |= DBCR_DAC2W;
- child->thread.dac2 = (unsigned long)bp_info->addr;
+ child->thread.debug.dac2 = (unsigned long)bp_info->addr;
#if CONFIG_PPC_ADV_DEBUG_DVCS > 0
if (byte_enable) {
- child->thread.dvc2 =
+ child->thread.debug.dvc2 =
(unsigned long)bp_info->condition_value;
- child->thread.dbcr2 |=
+ child->thread.debug.dbcr2 |=
((byte_enable << DBCR2_DVC2BE_SHIFT) |
(condition_mode << DBCR2_DVC2M_SHIFT));
}
#endif
} else
return -ENOSPC;
- child->thread.dbcr0 |= DBCR0_IDM;
+ child->thread.debug.dbcr0 |= DBCR0_IDM;
child->thread.regs->msr |= MSR_DE;
return slot + 4;
@@ -1224,32 +1277,32 @@ static int del_dac(struct task_struct *child, int slot)
if ((dbcr_dac(child) & (DBCR_DAC1R | DBCR_DAC1W)) == 0)
return -ENOENT;
- child->thread.dac1 = 0;
+ child->thread.debug.dac1 = 0;
dbcr_dac(child) &= ~(DBCR_DAC1R | DBCR_DAC1W);
#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE
- if (child->thread.dbcr2 & DBCR2_DAC12MODE) {
- child->thread.dac2 = 0;
- child->thread.dbcr2 &= ~DBCR2_DAC12MODE;
+ if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE) {
+ child->thread.debug.dac2 = 0;
+ child->thread.debug.dbcr2 &= ~DBCR2_DAC12MODE;
}
- child->thread.dbcr2 &= ~(DBCR2_DVC1M | DBCR2_DVC1BE);
+ child->thread.debug.dbcr2 &= ~(DBCR2_DVC1M | DBCR2_DVC1BE);
#endif
#if CONFIG_PPC_ADV_DEBUG_DVCS > 0
- child->thread.dvc1 = 0;
+ child->thread.debug.dvc1 = 0;
#endif
} else if (slot == 2) {
if ((dbcr_dac(child) & (DBCR_DAC2R | DBCR_DAC2W)) == 0)
return -ENOENT;
#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE
- if (child->thread.dbcr2 & DBCR2_DAC12MODE)
+ if (child->thread.debug.dbcr2 & DBCR2_DAC12MODE)
/* Part of a range */
return -EINVAL;
- child->thread.dbcr2 &= ~(DBCR2_DVC2M | DBCR2_DVC2BE);
+ child->thread.debug.dbcr2 &= ~(DBCR2_DVC2M | DBCR2_DVC2BE);
#endif
#if CONFIG_PPC_ADV_DEBUG_DVCS > 0
- child->thread.dvc2 = 0;
+ child->thread.debug.dvc2 = 0;
#endif
- child->thread.dac2 = 0;
+ child->thread.debug.dac2 = 0;
dbcr_dac(child) &= ~(DBCR_DAC2R | DBCR_DAC2W);
} else
return -EINVAL;
@@ -1291,22 +1344,22 @@ static int set_dac_range(struct task_struct *child,
return -EIO;
}
- if (child->thread.dbcr0 &
+ if (child->thread.debug.dbcr0 &
(DBCR0_DAC1R | DBCR0_DAC1W | DBCR0_DAC2R | DBCR0_DAC2W))
return -ENOSPC;
if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ)
- child->thread.dbcr0 |= (DBCR0_DAC1R | DBCR0_IDM);
+ child->thread.debug.dbcr0 |= (DBCR0_DAC1R | DBCR0_IDM);
if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE)
- child->thread.dbcr0 |= (DBCR0_DAC1W | DBCR0_IDM);
- child->thread.dac1 = bp_info->addr;
- child->thread.dac2 = bp_info->addr2;
+ child->thread.debug.dbcr0 |= (DBCR0_DAC1W | DBCR0_IDM);
+ child->thread.debug.dac1 = bp_info->addr;
+ child->thread.debug.dac2 = bp_info->addr2;
if (mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE)
- child->thread.dbcr2 |= DBCR2_DAC12M;
+ child->thread.debug.dbcr2 |= DBCR2_DAC12M;
else if (mode == PPC_BREAKPOINT_MODE_RANGE_EXCLUSIVE)
- child->thread.dbcr2 |= DBCR2_DAC12MX;
+ child->thread.debug.dbcr2 |= DBCR2_DAC12MX;
else /* PPC_BREAKPOINT_MODE_MASK */
- child->thread.dbcr2 |= DBCR2_DAC12MM;
+ child->thread.debug.dbcr2 |= DBCR2_DAC12MM;
child->thread.regs->msr |= MSR_DE;
return 5;
@@ -1316,6 +1369,16 @@ static int set_dac_range(struct task_struct *child,
static long ppc_set_hwdebug(struct task_struct *child,
struct ppc_hw_breakpoint *bp_info)
{
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+ int len = 0;
+ struct thread_struct *thread = &(child->thread);
+ struct perf_event *bp;
+ struct perf_event_attr attr;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+#ifndef CONFIG_PPC_ADV_DEBUG_REGS
+ struct arch_hw_breakpoint brk;
+#endif
+
if (bp_info->version != 1)
return -ENOTSUPP;
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
@@ -1339,7 +1402,7 @@ static long ppc_set_hwdebug(struct task_struct *child,
if ((bp_info->trigger_type != PPC_BREAKPOINT_TRIGGER_EXECUTE) ||
(bp_info->condition_mode != PPC_BREAKPOINT_CONDITION_NONE))
return -EINVAL;
- return set_intruction_bp(child, bp_info);
+ return set_instruction_bp(child, bp_info);
}
if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT)
return set_dac(child, bp_info);
@@ -1353,27 +1416,71 @@ static long ppc_set_hwdebug(struct task_struct *child,
/*
* We only support one data breakpoint
*/
- if (((bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_RW) == 0) ||
- ((bp_info->trigger_type & ~PPC_BREAKPOINT_TRIGGER_RW) != 0) ||
- (bp_info->trigger_type != PPC_BREAKPOINT_TRIGGER_WRITE) ||
- (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) ||
- (bp_info->condition_mode != PPC_BREAKPOINT_CONDITION_NONE))
+ if ((bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_RW) == 0 ||
+ (bp_info->trigger_type & ~PPC_BREAKPOINT_TRIGGER_RW) != 0 ||
+ bp_info->condition_mode != PPC_BREAKPOINT_CONDITION_NONE)
return -EINVAL;
- if (child->thread.dabr)
- return -ENOSPC;
-
if ((unsigned long)bp_info->addr >= TASK_SIZE)
return -EIO;
- child->thread.dabr = (unsigned long)bp_info->addr;
+ brk.address = bp_info->addr & ~7UL;
+ brk.type = HW_BRK_TYPE_TRANSLATE;
+ brk.len = 8;
+ if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ)
+ brk.type |= HW_BRK_TYPE_READ;
+ if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE)
+ brk.type |= HW_BRK_TYPE_WRITE;
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+ /*
+ * Check if the request is for 'range' breakpoints. We can
+ * support it if range < 8 bytes.
+ */
+ if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE)
+ len = bp_info->addr2 - bp_info->addr;
+ else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT)
+ len = 1;
+ else
+ return -EINVAL;
+ bp = thread->ptrace_bps[0];
+ if (bp)
+ return -ENOSPC;
+
+ /* Create a new breakpoint request if one doesn't exist already */
+ hw_breakpoint_init(&attr);
+ attr.bp_addr = (unsigned long)bp_info->addr & ~HW_BREAKPOINT_ALIGN;
+ attr.bp_len = len;
+ arch_bp_generic_fields(brk.type, &attr.bp_type);
+
+ thread->ptrace_bps[0] = bp = register_user_hw_breakpoint(&attr,
+ ptrace_triggered, NULL, child);
+ if (IS_ERR(bp)) {
+ thread->ptrace_bps[0] = NULL;
+ return PTR_ERR(bp);
+ }
+
+ return 1;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+
+ if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT)
+ return -EINVAL;
+
+ if (child->thread.hw_brk.address)
+ return -ENOSPC;
+
+ child->thread.hw_brk = brk;
return 1;
#endif /* !CONFIG_PPC_ADV_DEBUG_DVCS */
}
-static long ppc_del_hwdebug(struct task_struct *child, long addr, long data)
+static long ppc_del_hwdebug(struct task_struct *child, long data)
{
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+ int ret = 0;
+ struct thread_struct *thread = &(child->thread);
+ struct perf_event *bp;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
int rc;
@@ -1383,9 +1490,9 @@ static long ppc_del_hwdebug(struct task_struct *child, long addr, long data)
rc = del_dac(child, (int)data - 4);
if (!rc) {
- if (!DBCR_ACTIVE_EVENTS(child->thread.dbcr0,
- child->thread.dbcr1)) {
- child->thread.dbcr0 &= ~DBCR0_IDM;
+ if (!DBCR_ACTIVE_EVENTS(child->thread.debug.dbcr0,
+ child->thread.debug.dbcr1)) {
+ child->thread.debug.dbcr0 &= ~DBCR0_IDM;
child->thread.regs->msr &= ~MSR_DE;
}
}
@@ -1393,49 +1500,27 @@ static long ppc_del_hwdebug(struct task_struct *child, long addr, long data)
#else
if (data != 1)
return -EINVAL;
- if (child->thread.dabr == 0)
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+ bp = thread->ptrace_bps[0];
+ if (bp) {
+ unregister_hw_breakpoint(bp);
+ thread->ptrace_bps[0] = NULL;
+ } else
+ ret = -ENOENT;
+ return ret;
+#else /* CONFIG_HAVE_HW_BREAKPOINT */
+ if (child->thread.hw_brk.address == 0)
return -ENOENT;
- child->thread.dabr = 0;
+ child->thread.hw_brk.address = 0;
+ child->thread.hw_brk.type = 0;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
return 0;
#endif
}
-/*
- * Here are the old "legacy" powerpc specific getregs/setregs ptrace calls,
- * we mark them as obsolete now, they will be removed in a future version
- */
-static long arch_ptrace_old(struct task_struct *child, long request,
- unsigned long addr, unsigned long data)
-{
- void __user *datavp = (void __user *) data;
-
- switch (request) {
- case PPC_PTRACE_GETREGS: /* Get GPRs 0 - 31. */
- return copy_regset_to_user(child, &user_ppc_native_view,
- REGSET_GPR, 0, 32 * sizeof(long),
- datavp);
-
- case PPC_PTRACE_SETREGS: /* Set GPRs 0 - 31. */
- return copy_regset_from_user(child, &user_ppc_native_view,
- REGSET_GPR, 0, 32 * sizeof(long),
- datavp);
-
- case PPC_PTRACE_GETFPREGS: /* Get FPRs 0 - 31. */
- return copy_regset_to_user(child, &user_ppc_native_view,
- REGSET_FPR, 0, 32 * sizeof(double),
- datavp);
-
- case PPC_PTRACE_SETFPREGS: /* Set FPRs 0 - 31. */
- return copy_regset_from_user(child, &user_ppc_native_view,
- REGSET_FPR, 0, 32 * sizeof(double),
- datavp);
- }
-
- return -EPERM;
-}
-
long arch_ptrace(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
{
@@ -1462,11 +1547,18 @@ long arch_ptrace(struct task_struct *child, long request,
CHECK_FULL_REGS(child->thread.regs);
if (index < PT_FPR0) {
- tmp = ptrace_get_reg(child, (int) index);
+ ret = ptrace_get_reg(child, (int) index, &tmp);
+ if (ret)
+ break;
} else {
+ unsigned int fpidx = index - PT_FPR0;
+
flush_fp_to_thread(child);
- tmp = ((unsigned long *)child->thread.fpr)
- [TS_FPRWIDTH * (index - PT_FPR0)];
+ if (fpidx < (PT_FPSCR - PT_FPR0))
+ memcpy(&tmp, &child->thread.TS_FPR(fpidx),
+ sizeof(long));
+ else
+ tmp = child->thread.fp_state.fpscr;
}
ret = put_user(tmp, datalp);
break;
@@ -1492,9 +1584,14 @@ long arch_ptrace(struct task_struct *child, long request,
if (index < PT_FPR0) {
ret = ptrace_put_reg(child, index, data);
} else {
+ unsigned int fpidx = index - PT_FPR0;
+
flush_fp_to_thread(child);
- ((unsigned long *)child->thread.fpr)
- [TS_FPRWIDTH * (index - PT_FPR0)] = data;
+ if (fpidx < (PT_FPSCR - PT_FPR0))
+ memcpy(&child->thread.TS_FPR(fpidx), &data,
+ sizeof(long));
+ else
+ child->thread.fp_state.fpscr = data;
ret = 0;
}
break;
@@ -1527,7 +1624,13 @@ long arch_ptrace(struct task_struct *child, long request,
dbginfo.data_bp_alignment = 4;
#endif
dbginfo.sizeof_condition = 0;
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+ dbginfo.features = PPC_DEBUG_FEATURE_DATA_BP_RANGE;
+ if (cpu_has_feature(CPU_FTR_DAWR))
+ dbginfo.features |= PPC_DEBUG_FEATURE_DATA_BP_DAWR;
+#else
dbginfo.features = 0;
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
#endif /* CONFIG_PPC_ADV_DEBUG_REGS */
if (!access_ok(VERIFY_WRITE, datavp,
@@ -1554,19 +1657,24 @@ long arch_ptrace(struct task_struct *child, long request,
}
case PPC_PTRACE_DELHWDEBUG: {
- ret = ppc_del_hwdebug(child, addr, data);
+ ret = ppc_del_hwdebug(child, data);
break;
}
case PTRACE_GET_DEBUGREG: {
+#ifndef CONFIG_PPC_ADV_DEBUG_REGS
+ unsigned long dabr_fake;
+#endif
ret = -EINVAL;
/* We only support one DABR and no IABRS at the moment */
if (addr > 0)
break;
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
- ret = put_user(child->thread.dac1, datalp);
+ ret = put_user(child->thread.debug.dac1, datalp);
#else
- ret = put_user(child->thread.dabr, datalp);
+ dabr_fake = ((child->thread.hw_brk.address & (~HW_BRK_TYPE_DABR)) |
+ (child->thread.hw_brk.type & HW_BRK_TYPE_DABR));
+ ret = put_user(dabr_fake, datalp);
#endif
break;
}
@@ -1647,14 +1755,6 @@ long arch_ptrace(struct task_struct *child, long request,
datavp);
#endif
- /* Old reverse args ptrace callss */
- case PPC_PTRACE_GETREGS: /* Get GPRs 0 - 31. */
- case PPC_PTRACE_SETREGS: /* Set GPRs 0 - 31. */
- case PPC_PTRACE_GETFPREGS: /* Get FPRs 0 - 31. */
- case PPC_PTRACE_SETFPREGS: /* Get FPRs 0 - 31. */
- ret = arch_ptrace_old(child, request, addr, data);
- break;
-
default:
ret = ptrace_request(child, request, addr, data);
break;
@@ -1670,7 +1770,9 @@ long do_syscall_trace_enter(struct pt_regs *regs)
{
long ret = 0;
- secure_computing(regs->gpr[0]);
+ user_exit();
+
+ secure_computing_strict(regs->gpr[0]);
if (test_thread_flag(TIF_SYSCALL_TRACE) &&
tracehook_report_syscall_entry(regs))
@@ -1681,22 +1783,23 @@ long do_syscall_trace_enter(struct pt_regs *regs)
*/
ret = -1L;
- if (unlikely(current->audit_context)) {
+ if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+ trace_sys_enter(regs, regs->gpr[0]);
+
#ifdef CONFIG_PPC64
- if (!is_32bit_task())
- audit_syscall_entry(AUDIT_ARCH_PPC64,
- regs->gpr[0],
- regs->gpr[3], regs->gpr[4],
- regs->gpr[5], regs->gpr[6]);
- else
+ if (!is_32bit_task())
+ audit_syscall_entry(AUDIT_ARCH_PPC64,
+ regs->gpr[0],
+ regs->gpr[3], regs->gpr[4],
+ regs->gpr[5], regs->gpr[6]);
+ else
#endif
- audit_syscall_entry(AUDIT_ARCH_PPC,
- regs->gpr[0],
- regs->gpr[3] & 0xffffffff,
- regs->gpr[4] & 0xffffffff,
- regs->gpr[5] & 0xffffffff,
- regs->gpr[6] & 0xffffffff);
- }
+ audit_syscall_entry(AUDIT_ARCH_PPC,
+ regs->gpr[0],
+ regs->gpr[3] & 0xffffffff,
+ regs->gpr[4] & 0xffffffff,
+ regs->gpr[5] & 0xffffffff,
+ regs->gpr[6] & 0xffffffff);
return ret ?: regs->gpr[0];
}
@@ -1705,11 +1808,14 @@ void do_syscall_trace_leave(struct pt_regs *regs)
{
int step;
- if (unlikely(current->audit_context))
- audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS,
- regs->result);
+ audit_syscall_exit(regs);
+
+ if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+ trace_sys_exit(regs, regs->result);
step = test_thread_flag(TIF_SINGLESTEP);
if (step || test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall_exit(regs, step);
+
+ user_enter();
}
diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c
index 8a6daf4129f..f52b7db327c 100644
--- a/arch/powerpc/kernel/ptrace32.c
+++ b/arch/powerpc/kernel/ptrace32.c
@@ -32,42 +32,17 @@
#include <asm/uaccess.h>
#include <asm/page.h>
#include <asm/pgtable.h>
-#include <asm/system.h>
+#include <asm/switch_to.h>
/*
* does not yet catch signals sent when the child dies.
* in exit.c or in signal.c.
*/
-/*
- * Here are the old "legacy" powerpc specific getregs/setregs ptrace calls,
- * we mark them as obsolete now, they will be removed in a future version
- */
-static long compat_ptrace_old(struct task_struct *child, long request,
- long addr, long data)
-{
- switch (request) {
- case PPC_PTRACE_GETREGS: /* Get GPRs 0 - 31. */
- return copy_regset_to_user(child,
- task_user_regset_view(current), 0,
- 0, 32 * sizeof(compat_long_t),
- compat_ptr(data));
-
- case PPC_PTRACE_SETREGS: /* Set GPRs 0 - 31. */
- return copy_regset_from_user(child,
- task_user_regset_view(current), 0,
- 0, 32 * sizeof(compat_long_t),
- compat_ptr(data));
- }
-
- return -EPERM;
-}
-
/* Macros to workout the correct index for the FPR in the thread struct */
#define FPRNUMBER(i) (((i) - PT_FPR0) >> 1)
#define FPRHALF(i) (((i) - PT_FPR0) & 1)
#define FPRINDEX(i) TS_FPRWIDTH * FPRNUMBER(i) * 2 + FPRHALF(i)
-#define FPRINDEX_3264(i) (TS_FPRWIDTH * ((i) - PT_FPR0))
long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
compat_ulong_t caddr, compat_ulong_t cdata)
@@ -119,7 +94,9 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
CHECK_FULL_REGS(child->thread.regs);
if (index < PT_FPR0) {
- tmp = ptrace_get_reg(child, index);
+ ret = ptrace_get_reg(child, index, &tmp);
+ if (ret)
+ break;
} else {
flush_fp_to_thread(child);
/*
@@ -127,7 +104,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
* to be an array of unsigned int (32 bits) - the
* index passed in is based on this assumption.
*/
- tmp = ((unsigned int *)child->thread.fpr)
+ tmp = ((unsigned int *)child->thread.fp_state.fpr)
[FPRINDEX(index)];
}
ret = put_user((unsigned int)tmp, (u32 __user *)data);
@@ -169,10 +146,13 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
if (numReg >= PT_FPR0) {
flush_fp_to_thread(child);
/* get 64 bit FPR */
- tmp = ((u64 *)child->thread.fpr)
- [FPRINDEX_3264(numReg)];
+ tmp = child->thread.fp_state.fpr[numReg - PT_FPR0][0];
} else { /* register within PT_REGS struct */
- tmp = ptrace_get_reg(child, numReg);
+ unsigned long tmp2;
+ ret = ptrace_get_reg(child, numReg, &tmp2);
+ if (ret)
+ break;
+ tmp = tmp2;
}
reg32bits = ((u32*)&tmp)[part];
ret = put_user(reg32bits, (u32 __user *)data);
@@ -225,7 +205,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
* to be an array of unsigned int (32 bits) - the
* index passed in is based on this assumption.
*/
- ((unsigned int *)child->thread.fpr)
+ ((unsigned int *)child->thread.fp_state.fpr)
[FPRINDEX(index)] = data;
ret = 0;
}
@@ -256,7 +236,10 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
break;
CHECK_FULL_REGS(child->thread.regs);
if (numReg < PT_FPR0) {
- unsigned long freg = ptrace_get_reg(child, numReg);
+ unsigned long freg;
+ ret = ptrace_get_reg(child, numReg, &freg);
+ if (ret)
+ break;
if (index % 2)
freg = (freg & ~0xfffffffful) | (data & 0xfffffffful);
else
@@ -266,8 +249,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
u64 *tmp;
flush_fp_to_thread(child);
/* get 64 bit FPR ... */
- tmp = &(((u64 *)child->thread.fpr)
- [FPRINDEX_3264(numReg)]);
+ tmp = &child->thread.fp_state.fpr[numReg - PT_FPR0][0];
/* ... write the 32 bit part we want */
((u32 *)tmp)[index % 2] = data;
ret = 0;
@@ -276,11 +258,21 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
}
case PTRACE_GET_DEBUGREG: {
+#ifndef CONFIG_PPC_ADV_DEBUG_REGS
+ unsigned long dabr_fake;
+#endif
ret = -EINVAL;
/* We only support one DABR and no IABRS at the moment */
if (addr > 0)
break;
- ret = put_user(child->thread.dabr, (u32 __user *)data);
+#ifdef CONFIG_PPC_ADV_DEBUG_REGS
+ ret = put_user(child->thread.debug.dac1, (u32 __user *)data);
+#else
+ dabr_fake = (
+ (child->thread.hw_brk.address & (~HW_BRK_TYPE_DABR)) |
+ (child->thread.hw_brk.type & HW_BRK_TYPE_DABR));
+ ret = put_user(dabr_fake, (u32 __user *)data);
+#endif
break;
}
@@ -304,23 +296,18 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
case PTRACE_SETVSRREGS:
case PTRACE_GETREGS64:
case PTRACE_SETREGS64:
- case PPC_PTRACE_GETFPREGS:
- case PPC_PTRACE_SETFPREGS:
case PTRACE_KILL:
case PTRACE_SINGLESTEP:
case PTRACE_DETACH:
case PTRACE_SET_DEBUGREG:
case PTRACE_SYSCALL:
case PTRACE_CONT:
+ case PPC_PTRACE_GETHWDBGINFO:
+ case PPC_PTRACE_SETHWDEBUG:
+ case PPC_PTRACE_DELHWDEBUG:
ret = arch_ptrace(child, request, addr, data);
break;
- /* Old reverse args ptrace callss */
- case PPC_PTRACE_GETREGS: /* Get GPRs 0 - 31. */
- case PPC_PTRACE_SETREGS: /* Set GPRs 0 - 31. */
- ret = compat_ptrace_old(child, request, addr, data);
- break;
-
default:
ret = compat_ptrace_request(child, request, addr, data);
break;
diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S
new file mode 100644
index 00000000000..f366fedb087
--- /dev/null
+++ b/arch/powerpc/kernel/reloc_32.S
@@ -0,0 +1,209 @@
+/*
+ * Code to process dynamic relocations for PPC32.
+ *
+ * Copyrights (C) IBM Corporation, 2011.
+ * Author: Suzuki Poulose <suzuki@in.ibm.com>
+ *
+ * - Based on ppc64 code - reloc_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/ppc_asm.h>
+
+/* Dynamic section table entry tags */
+DT_RELA = 7 /* Tag for Elf32_Rela section */
+DT_RELASZ = 8 /* Size of the Rela relocs */
+DT_RELAENT = 9 /* Size of one Rela reloc entry */
+
+STN_UNDEF = 0 /* Undefined symbol index */
+STB_LOCAL = 0 /* Local binding for the symbol */
+
+R_PPC_ADDR16_LO = 4 /* Lower half of (S+A) */
+R_PPC_ADDR16_HI = 5 /* Upper half of (S+A) */
+R_PPC_ADDR16_HA = 6 /* High Adjusted (S+A) */
+R_PPC_RELATIVE = 22
+
+/*
+ * r3 = desired final address
+ */
+
+_GLOBAL(relocate)
+
+ mflr r0 /* Save our LR */
+ bl 0f /* Find our current runtime address */
+0: mflr r12 /* Make it accessible */
+ mtlr r0
+
+ lwz r11, (p_dyn - 0b)(r12)
+ add r11, r11, r12 /* runtime address of .dynamic section */
+ lwz r9, (p_rela - 0b)(r12)
+ add r9, r9, r12 /* runtime address of .rela.dyn section */
+ lwz r10, (p_st - 0b)(r12)
+ add r10, r10, r12 /* runtime address of _stext section */
+ lwz r13, (p_sym - 0b)(r12)
+ add r13, r13, r12 /* runtime address of .dynsym section */
+
+ /*
+ * Scan the dynamic section for RELA, RELASZ entries
+ */
+ li r6, 0
+ li r7, 0
+ li r8, 0
+1: lwz r5, 0(r11) /* ELF_Dyn.d_tag */
+ cmpwi r5, 0 /* End of ELF_Dyn[] */
+ beq eodyn
+ cmpwi r5, DT_RELA
+ bne relasz
+ lwz r7, 4(r11) /* r7 = rela.link */
+ b skip
+relasz:
+ cmpwi r5, DT_RELASZ
+ bne relaent
+ lwz r8, 4(r11) /* r8 = Total Rela relocs size */
+ b skip
+relaent:
+ cmpwi r5, DT_RELAENT
+ bne skip
+ lwz r6, 4(r11) /* r6 = Size of one Rela reloc */
+skip:
+ addi r11, r11, 8
+ b 1b
+eodyn: /* End of Dyn Table scan */
+
+ /* Check if we have found all the entries */
+ cmpwi r7, 0
+ beq done
+ cmpwi r8, 0
+ beq done
+ cmpwi r6, 0
+ beq done
+
+
+ /*
+ * Work out the current offset from the link time address of .rela
+ * section.
+ * cur_offset[r7] = rela.run[r9] - rela.link [r7]
+ * _stext.link[r12] = _stext.run[r10] - cur_offset[r7]
+ * final_offset[r3] = _stext.final[r3] - _stext.link[r12]
+ */
+ subf r7, r7, r9 /* cur_offset */
+ subf r12, r7, r10
+ subf r3, r12, r3 /* final_offset */
+
+ subf r8, r6, r8 /* relaz -= relaent */
+ /*
+ * Scan through the .rela table and process each entry
+ * r9 - points to the current .rela table entry
+ * r13 - points to the symbol table
+ */
+
+ /*
+ * Check if we have a relocation based on symbol
+ * r5 will hold the value of the symbol.
+ */
+applyrela:
+ lwz r4, 4(r9) /* r4 = rela.r_info */
+ srwi r5, r4, 8 /* ELF32_R_SYM(r_info) */
+ cmpwi r5, STN_UNDEF /* sym == STN_UNDEF ? */
+ beq get_type /* value = 0 */
+ /* Find the value of the symbol at index(r5) */
+ slwi r5, r5, 4 /* r5 = r5 * sizeof(Elf32_Sym) */
+ add r12, r13, r5 /* r12 = &__dyn_sym[Index] */
+
+ /*
+ * GNU ld has a bug, where dynamic relocs based on
+ * STB_LOCAL symbols, the value should be assumed
+ * to be zero. - Alan Modra
+ */
+ /* XXX: Do we need to check if we are using GNU ld ? */
+ lbz r5, 12(r12) /* r5 = dyn_sym[Index].st_info */
+ extrwi r5, r5, 4, 24 /* r5 = ELF32_ST_BIND(r5) */
+ cmpwi r5, STB_LOCAL /* st_value = 0, ld bug */
+ beq get_type /* We have r5 = 0 */
+ lwz r5, 4(r12) /* r5 = __dyn_sym[Index].st_value */
+
+get_type:
+ /* Load the relocation type to r4 */
+ extrwi r4, r4, 8, 24 /* r4 = ELF32_R_TYPE(r_info) = ((char*)r4)[3] */
+
+ /* R_PPC_RELATIVE */
+ cmpwi r4, R_PPC_RELATIVE
+ bne hi16
+ lwz r4, 0(r9) /* r_offset */
+ lwz r0, 8(r9) /* r_addend */
+ add r0, r0, r3 /* final addend */
+ stwx r0, r4, r7 /* memory[r4+r7]) = (u32)r0 */
+ b nxtrela /* continue */
+
+ /* R_PPC_ADDR16_HI */
+hi16:
+ cmpwi r4, R_PPC_ADDR16_HI
+ bne ha16
+ lwz r4, 0(r9) /* r_offset */
+ lwz r0, 8(r9) /* r_addend */
+ add r0, r0, r3
+ add r0, r0, r5 /* r0 = (S+A+Offset) */
+ extrwi r0, r0, 16, 0 /* r0 = (r0 >> 16) */
+ b store_half
+
+ /* R_PPC_ADDR16_HA */
+ha16:
+ cmpwi r4, R_PPC_ADDR16_HA
+ bne lo16
+ lwz r4, 0(r9) /* r_offset */
+ lwz r0, 8(r9) /* r_addend */
+ add r0, r0, r3
+ add r0, r0, r5 /* r0 = (S+A+Offset) */
+ extrwi r5, r0, 1, 16 /* Extract bit 16 */
+ extrwi r0, r0, 16, 0 /* r0 = (r0 >> 16) */
+ add r0, r0, r5 /* Add it to r0 */
+ b store_half
+
+ /* R_PPC_ADDR16_LO */
+lo16:
+ cmpwi r4, R_PPC_ADDR16_LO
+ bne unknown_type
+ lwz r4, 0(r9) /* r_offset */
+ lwz r0, 8(r9) /* r_addend */
+ add r0, r0, r3
+ add r0, r0, r5 /* r0 = (S+A+Offset) */
+ extrwi r0, r0, 16, 16 /* r0 &= 0xffff */
+ /* Fall through to */
+
+ /* Store half word */
+store_half:
+ sthx r0, r4, r7 /* memory[r4+r7] = (u16)r0 */
+
+nxtrela:
+ /*
+ * We have to flush the modified instructions to the
+ * main storage from the d-cache. And also, invalidate the
+ * cached instructions in i-cache which has been modified.
+ *
+ * We delay the sync / isync operation till the end, since
+ * we won't be executing the modified instructions until
+ * we return from here.
+ */
+ dcbst r4,r7
+ sync /* Ensure the data is flushed before icbi */
+ icbi r4,r7
+unknown_type:
+ cmpwi r8, 0 /* relasz = 0 ? */
+ ble done
+ add r9, r9, r6 /* move to next entry in the .rela table */
+ subf r8, r6, r8 /* relasz -= relaent */
+ b applyrela
+
+done:
+ sync /* Wait for the flush to finish */
+ isync /* Discard prefetched instructions */
+ blr
+
+p_dyn: .long __dynamic_start - 0b
+p_rela: .long __rela_dyn_start - 0b
+p_sym: .long __dynamic_symtab - 0b
+p_st: .long _stext - 0b
diff --git a/arch/powerpc/kernel/reloc_64.S b/arch/powerpc/kernel/reloc_64.S
index b47a0e1ab00..d88736fbece 100644
--- a/arch/powerpc/kernel/reloc_64.S
+++ b/arch/powerpc/kernel/reloc_64.S
@@ -69,8 +69,8 @@ _GLOBAL(relocate)
* R_PPC64_RELATIVE ones.
*/
mtctr r8
-5: lwz r0,12(9) /* ELF64_R_TYPE(reloc->r_info) */
- cmpwi r0,R_PPC64_RELATIVE
+5: ld r0,8(9) /* ELF64_R_TYPE(reloc->r_info) */
+ cmpdi r0,R_PPC64_RELATIVE
bne 6f
ld r6,0(r9) /* reloc->r_offset */
ld r0,16(r9) /* reloc->r_addend */
@@ -81,6 +81,7 @@ _GLOBAL(relocate)
6: blr
+.balign 8
p_dyn: .llong __dynamic_start - 0b
p_rela: .llong __rela_dyn_start - 0b
p_st: .llong _stext - 0b
diff --git a/arch/powerpc/kernel/rtas-rtc.c b/arch/powerpc/kernel/rtas-rtc.c
index 77578c093dd..c57c19358a2 100644
--- a/arch/powerpc/kernel/rtas-rtc.c
+++ b/arch/powerpc/kernel/rtas-rtc.c
@@ -4,6 +4,7 @@
#include <linux/init.h>
#include <linux/rtc.h>
#include <linux/delay.h>
+#include <linux/ratelimit.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/time.h>
@@ -29,9 +30,10 @@ unsigned long __init rtas_get_boot_time(void)
}
} while (wait_time && (get_tb() < max_wait_tb));
- if (error != 0 && printk_ratelimit()) {
- printk(KERN_WARNING "error: reading the clock failed (%d)\n",
- error);
+ if (error != 0) {
+ printk_ratelimited(KERN_WARNING
+ "error: reading the clock failed (%d)\n",
+ error);
return 0;
}
@@ -55,19 +57,21 @@ void rtas_get_rtc_time(struct rtc_time *rtc_tm)
wait_time = rtas_busy_delay_time(error);
if (wait_time) {
- if (in_interrupt() && printk_ratelimit()) {
+ if (in_interrupt()) {
memset(rtc_tm, 0, sizeof(struct rtc_time));
- printk(KERN_WARNING "error: reading clock"
- " would delay interrupt\n");
+ printk_ratelimited(KERN_WARNING
+ "error: reading clock "
+ "would delay interrupt\n");
return; /* delay not allowed */
}
msleep(wait_time);
}
} while (wait_time && (get_tb() < max_wait_tb));
- if (error != 0 && printk_ratelimit()) {
- printk(KERN_WARNING "error: reading the clock failed (%d)\n",
- error);
+ if (error != 0) {
+ printk_ratelimited(KERN_WARNING
+ "error: reading the clock failed (%d)\n",
+ error);
return;
}
@@ -99,9 +103,10 @@ int rtas_set_rtc_time(struct rtc_time *tm)
}
} while (wait_time && (get_tb() < max_wait_tb));
- if (error != 0 && printk_ratelimit())
- printk(KERN_WARNING "error: setting the clock failed (%d)\n",
- error);
+ if (error != 0)
+ printk_ratelimited(KERN_WARNING
+ "error: setting the clock failed (%d)\n",
+ error);
return 0;
}
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 8fe8bc61c10..8b4c857c142 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -15,15 +15,17 @@
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/spinlock.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/capability.h>
#include <linux/delay.h>
+#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/slab.h>
+#include <linux/reboot.h>
#include <asm/prom.h>
#include <asm/rtas.h>
@@ -32,15 +34,15 @@
#include <asm/firmware.h>
#include <asm/page.h>
#include <asm/param.h>
-#include <asm/system.h>
#include <asm/delay.h>
#include <asm/uaccess.h>
#include <asm/udbg.h>
#include <asm/syscalls.h>
#include <asm/smp.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/time.h>
#include <asm/mmu.h>
+#include <asm/topology.h>
struct rtas_t rtas = {
.lock = __ARCH_SPIN_LOCK_UNLOCKED
@@ -89,7 +91,7 @@ static void unlock_rtas(unsigned long flags)
* are designed only for very early low-level debugging, which
* is why the token is hard-coded to 10.
*/
-static void call_rtas_display_status(char c)
+static void call_rtas_display_status(unsigned char c)
{
struct rtas_args *args = &rtas.args;
unsigned long s;
@@ -98,11 +100,11 @@ static void call_rtas_display_status(char c)
return;
s = lock_rtas();
- args->token = 10;
- args->nargs = 1;
- args->nret = 1;
- args->rets = (rtas_arg_t *)&(args->args[1]);
- args->args[0] = (unsigned char)c;
+ args->token = cpu_to_be32(10);
+ args->nargs = cpu_to_be32(1);
+ args->nret = cpu_to_be32(1);
+ args->rets = &(args->args[1]);
+ args->args[0] = cpu_to_be32(c);
enter_rtas(__pa(args));
@@ -202,7 +204,7 @@ void rtas_progress(char *s, unsigned short hex)
{
struct device_node *root;
int width;
- const int *p;
+ const __be32 *p;
char *os;
static int display_character, set_indicator;
static int display_width, display_lines, form_feed;
@@ -219,13 +221,13 @@ void rtas_progress(char *s, unsigned short hex)
if ((root = of_find_node_by_path("/rtas"))) {
if ((p = of_get_property(root,
"ibm,display-line-length", NULL)))
- display_width = *p;
+ display_width = be32_to_cpu(*p);
if ((p = of_get_property(root,
"ibm,form-feed", NULL)))
- form_feed = *p;
+ form_feed = be32_to_cpu(*p);
if ((p = of_get_property(root,
"ibm,display-number-of-lines", NULL)))
- display_lines = *p;
+ display_lines = be32_to_cpu(*p);
row_width = of_get_property(root,
"ibm,display-truncation-length", NULL);
of_node_put(root);
@@ -320,11 +322,11 @@ EXPORT_SYMBOL(rtas_progress); /* needed by rtas_flash module */
int rtas_token(const char *service)
{
- const int *tokp;
+ const __be32 *tokp;
if (rtas.dev == NULL)
return RTAS_UNKNOWN_SERVICE;
tokp = of_get_property(rtas.dev, service, NULL);
- return tokp ? *tokp : RTAS_UNKNOWN_SERVICE;
+ return tokp ? be32_to_cpu(*tokp) : RTAS_UNKNOWN_SERVICE;
}
EXPORT_SYMBOL(rtas_token);
@@ -378,11 +380,11 @@ static char *__fetch_rtas_last_error(char *altbuf)
bufsz = rtas_get_error_log_max();
- err_args.token = rtas_last_error_token;
- err_args.nargs = 2;
- err_args.nret = 1;
- err_args.args[0] = (rtas_arg_t)__pa(rtas_err_buf);
- err_args.args[1] = bufsz;
+ err_args.token = cpu_to_be32(rtas_last_error_token);
+ err_args.nargs = cpu_to_be32(2);
+ err_args.nret = cpu_to_be32(1);
+ err_args.args[0] = cpu_to_be32(__pa(rtas_err_buf));
+ err_args.args[1] = cpu_to_be32(bufsz);
err_args.args[2] = 0;
save_args = rtas.args;
@@ -431,13 +433,13 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...)
s = lock_rtas();
rtas_args = &rtas.args;
- rtas_args->token = token;
- rtas_args->nargs = nargs;
- rtas_args->nret = nret;
- rtas_args->rets = (rtas_arg_t *)&(rtas_args->args[nargs]);
+ rtas_args->token = cpu_to_be32(token);
+ rtas_args->nargs = cpu_to_be32(nargs);
+ rtas_args->nret = cpu_to_be32(nret);
+ rtas_args->rets = &(rtas_args->args[nargs]);
va_start(list, outputs);
for (i = 0; i < nargs; ++i)
- rtas_args->args[i] = va_arg(list, rtas_arg_t);
+ rtas_args->args[i] = cpu_to_be32(va_arg(list, __u32));
va_end(list);
for (i = 0; i < nret; ++i)
@@ -447,13 +449,13 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...)
/* A -1 return code indicates that the last command couldn't
be completed due to a hardware error. */
- if (rtas_args->rets[0] == -1)
+ if (be32_to_cpu(rtas_args->rets[0]) == -1)
buff_copy = __fetch_rtas_last_error(NULL);
if (nret > 1 && outputs != NULL)
for (i = 0; i < nret-1; ++i)
- outputs[i] = rtas_args->rets[i+1];
- ret = (nret > 0)? rtas_args->rets[0]: 0;
+ outputs[i] = be32_to_cpu(rtas_args->rets[i+1]);
+ ret = (nret > 0)? be32_to_cpu(rtas_args->rets[0]): 0;
unlock_rtas(s);
@@ -493,7 +495,7 @@ unsigned int rtas_busy_delay(int status)
might_sleep();
ms = rtas_busy_delay_time(status);
- if (ms)
+ if (ms && need_resched())
msleep(ms);
return ms;
@@ -586,8 +588,8 @@ bool rtas_indicator_present(int token, int *maxindex)
{
int proplen, count, i;
const struct indicator_elem {
- u32 token;
- u32 maxindex;
+ __be32 token;
+ __be32 maxindex;
} *indicators;
indicators = of_get_property(rtas.dev, "rtas-indicators", &proplen);
@@ -597,10 +599,10 @@ bool rtas_indicator_present(int token, int *maxindex)
count = proplen / sizeof(struct indicator_elem);
for (i = 0; i < count; i++) {
- if (indicators[i].token != token)
+ if (__be32_to_cpu(indicators[i].token) != token)
continue;
if (maxindex)
- *maxindex = indicators[i].maxindex;
+ *maxindex = __be32_to_cpu(indicators[i].maxindex);
return true;
}
@@ -728,6 +730,7 @@ static int __rtas_suspend_last_cpu(struct rtas_suspend_me_data *data, int wake_w
rc = atomic_read(&data->error);
atomic_set(&data->error, rc);
+ pSeries_coalesce_init();
if (wake_when_done) {
atomic_set(&data->done, 1);
@@ -805,6 +808,95 @@ static void rtas_percpu_suspend_me(void *info)
__rtas_suspend_cpu((struct rtas_suspend_me_data *)info, 1);
}
+enum rtas_cpu_state {
+ DOWN,
+ UP,
+};
+
+#ifndef CONFIG_SMP
+static int rtas_cpu_state_change_mask(enum rtas_cpu_state state,
+ cpumask_var_t cpus)
+{
+ if (!cpumask_empty(cpus)) {
+ cpumask_clear(cpus);
+ return -EINVAL;
+ } else
+ return 0;
+}
+#else
+/* On return cpumask will be altered to indicate CPUs changed.
+ * CPUs with states changed will be set in the mask,
+ * CPUs with status unchanged will be unset in the mask. */
+static int rtas_cpu_state_change_mask(enum rtas_cpu_state state,
+ cpumask_var_t cpus)
+{
+ int cpu;
+ int cpuret = 0;
+ int ret = 0;
+
+ if (cpumask_empty(cpus))
+ return 0;
+
+ for_each_cpu(cpu, cpus) {
+ switch (state) {
+ case DOWN:
+ cpuret = cpu_down(cpu);
+ break;
+ case UP:
+ cpuret = cpu_up(cpu);
+ break;
+ }
+ if (cpuret) {
+ pr_debug("%s: cpu_%s for cpu#%d returned %d.\n",
+ __func__,
+ ((state == UP) ? "up" : "down"),
+ cpu, cpuret);
+ if (!ret)
+ ret = cpuret;
+ if (state == UP) {
+ /* clear bits for unchanged cpus, return */
+ cpumask_shift_right(cpus, cpus, cpu);
+ cpumask_shift_left(cpus, cpus, cpu);
+ break;
+ } else {
+ /* clear bit for unchanged cpu, continue */
+ cpumask_clear_cpu(cpu, cpus);
+ }
+ }
+ }
+
+ return ret;
+}
+#endif
+
+int rtas_online_cpus_mask(cpumask_var_t cpus)
+{
+ int ret;
+
+ ret = rtas_cpu_state_change_mask(UP, cpus);
+
+ if (ret) {
+ cpumask_var_t tmp_mask;
+
+ if (!alloc_cpumask_var(&tmp_mask, GFP_TEMPORARY))
+ return ret;
+
+ /* Use tmp_mask to preserve cpus mask from first failure */
+ cpumask_copy(tmp_mask, cpus);
+ rtas_offline_cpus_mask(tmp_mask);
+ free_cpumask_var(tmp_mask);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(rtas_online_cpus_mask);
+
+int rtas_offline_cpus_mask(cpumask_var_t cpus)
+{
+ return rtas_cpu_state_change_mask(DOWN, cpus);
+}
+EXPORT_SYMBOL(rtas_offline_cpus_mask);
+
int rtas_ibm_suspend_me(struct rtas_args *args)
{
long state;
@@ -812,6 +904,8 @@ int rtas_ibm_suspend_me(struct rtas_args *args)
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
struct rtas_suspend_me_data data;
DECLARE_COMPLETION_ONSTACK(done);
+ cpumask_var_t offline_mask;
+ int cpuret;
if (!rtas_service_present("ibm,suspend-me"))
return -ENOSYS;
@@ -835,12 +929,26 @@ int rtas_ibm_suspend_me(struct rtas_args *args)
return 0;
}
+ if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY))
+ return -ENOMEM;
+
atomic_set(&data.working, 0);
atomic_set(&data.done, 0);
atomic_set(&data.error, 0);
data.token = rtas_token("ibm,suspend-me");
data.complete = &done;
+ /* All present CPUs must be online */
+ cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask);
+ cpuret = rtas_online_cpus_mask(offline_mask);
+ if (cpuret) {
+ pr_err("%s: Could not bring present CPUs online.\n", __func__);
+ atomic_set(&data.error, cpuret);
+ goto out;
+ }
+
+ stop_topology_update();
+
/* Call function on all CPUs. One of us will make the
* rtas call
*/
@@ -852,6 +960,16 @@ int rtas_ibm_suspend_me(struct rtas_args *args)
if (atomic_read(&data.error) != 0)
printk(KERN_ERR "Error doing global join\n");
+ start_topology_update();
+
+ /* Take down CPUs not online prior to suspend */
+ cpuret = rtas_offline_cpus_mask(offline_mask);
+ if (cpuret)
+ pr_warn("%s: Could not restore CPUs to offline state.\n",
+ __func__);
+
+out:
+ free_cpumask_var(offline_mask);
return atomic_read(&data.error);
}
#else /* CONFIG_PPC_PSERIES */
@@ -861,12 +979,50 @@ int rtas_ibm_suspend_me(struct rtas_args *args)
}
#endif
+/**
+ * Find a specific pseries error log in an RTAS extended event log.
+ * @log: RTAS error/event log
+ * @section_id: two character section identifier
+ *
+ * Returns a pointer to the specified errorlog or NULL if not found.
+ */
+struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log,
+ uint16_t section_id)
+{
+ struct rtas_ext_event_log_v6 *ext_log =
+ (struct rtas_ext_event_log_v6 *)log->buffer;
+ struct pseries_errorlog *sect;
+ unsigned char *p, *log_end;
+ uint32_t ext_log_length = rtas_error_extended_log_length(log);
+ uint8_t log_format = rtas_ext_event_log_format(ext_log);
+ uint32_t company_id = rtas_ext_event_company_id(ext_log);
+
+ /* Check that we understand the format */
+ if (ext_log_length < sizeof(struct rtas_ext_event_log_v6) ||
+ log_format != RTAS_V6EXT_LOG_FORMAT_EVENT_LOG ||
+ company_id != RTAS_V6EXT_COMPANY_ID_IBM)
+ return NULL;
+
+ log_end = log->buffer + ext_log_length;
+ p = ext_log->vendor_log;
+
+ while (p < log_end) {
+ sect = (struct pseries_errorlog *)p;
+ if (pseries_errorlog_id(sect) == section_id)
+ return sect;
+ p += pseries_errorlog_length(sect);
+ }
+
+ return NULL;
+}
+
+/* We assume to be passed big endian arguments */
asmlinkage int ppc_rtas(struct rtas_args __user *uargs)
{
struct rtas_args args;
unsigned long flags;
char *buff_copy, *errbuf = NULL;
- int nargs;
+ int nargs, nret, token;
int rc;
if (!capable(CAP_SYS_ADMIN))
@@ -875,10 +1031,13 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs)
if (copy_from_user(&args, uargs, 3 * sizeof(u32)) != 0)
return -EFAULT;
- nargs = args.nargs;
+ nargs = be32_to_cpu(args.nargs);
+ nret = be32_to_cpu(args.nret);
+ token = be32_to_cpu(args.token);
+
if (nargs > ARRAY_SIZE(args.args)
- || args.nret > ARRAY_SIZE(args.args)
- || nargs + args.nret > ARRAY_SIZE(args.args))
+ || nret > ARRAY_SIZE(args.args)
+ || nargs + nret > ARRAY_SIZE(args.args))
return -EINVAL;
/* Copy in args. */
@@ -886,14 +1045,14 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs)
nargs * sizeof(rtas_arg_t)) != 0)
return -EFAULT;
- if (args.token == RTAS_UNKNOWN_SERVICE)
+ if (token == RTAS_UNKNOWN_SERVICE)
return -EINVAL;
args.rets = &args.args[nargs];
- memset(args.rets, 0, args.nret * sizeof(rtas_arg_t));
+ memset(args.rets, 0, nret * sizeof(rtas_arg_t));
/* Need to handle ibm,suspend_me call specially */
- if (args.token == ibm_suspend_me_token) {
+ if (token == ibm_suspend_me_token) {
rc = rtas_ibm_suspend_me(&args);
if (rc)
return rc;
@@ -910,7 +1069,7 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs)
/* A -1 return code indicates that the last command couldn't
be completed due to a hardware error. */
- if (args.rets[0] == -1)
+ if (be32_to_cpu(args.rets[0]) == -1)
errbuf = __fetch_rtas_last_error(buff_copy);
unlock_rtas(flags);
@@ -925,7 +1084,7 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs)
/* Copy out args. */
if (copy_to_user(uargs->args + nargs,
args.args + nargs,
- args.nret * sizeof(rtas_arg_t)) != 0)
+ nret * sizeof(rtas_arg_t)) != 0)
return -EFAULT;
return 0;
@@ -945,19 +1104,19 @@ void __init rtas_initialize(void)
*/
rtas.dev = of_find_node_by_name(NULL, "rtas");
if (rtas.dev) {
- const u32 *basep, *entryp, *sizep;
+ const __be32 *basep, *entryp, *sizep;
basep = of_get_property(rtas.dev, "linux,rtas-base", NULL);
sizep = of_get_property(rtas.dev, "rtas-size", NULL);
if (basep != NULL && sizep != NULL) {
- rtas.base = *basep;
- rtas.size = *sizep;
+ rtas.base = __be32_to_cpu(*basep);
+ rtas.size = __be32_to_cpu(*sizep);
entryp = of_get_property(rtas.dev,
"linux,rtas-entry", NULL);
if (entryp == NULL) /* Ugh */
rtas.entry = rtas.base;
else
- rtas.entry = *entryp;
+ rtas.entry = __be32_to_cpu(*entryp);
} else
rtas.dev = NULL;
}
@@ -983,7 +1142,7 @@ void __init rtas_initialize(void)
int __init early_init_dt_scan_rtas(unsigned long node,
const char *uname, int depth, void *data)
{
- u32 *basep, *entryp, *sizep;
+ const u32 *basep, *entryp, *sizep;
if (depth != 1 || strcmp(uname, "rtas") != 0)
return 0;
@@ -1020,7 +1179,7 @@ int __init early_init_dt_scan_rtas(unsigned long node,
static arch_spinlock_t timebase_lock;
static u64 timebase = 0;
-void __cpuinit rtas_give_timebase(void)
+void rtas_give_timebase(void)
{
unsigned long flags;
@@ -1037,7 +1196,7 @@ void __cpuinit rtas_give_timebase(void)
local_irq_restore(flags);
}
-void __cpuinit rtas_take_timebase(void)
+void rtas_take_timebase(void)
{
while (!timebase)
barrier();
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
index 2b442e6c21e..db2b482af65 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -17,10 +17,10 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
+#include <linux/reboot.h>
#include <asm/delay.h>
#include <asm/uaccess.h>
#include <asm/rtas.h>
-#include <asm/abs_addr.h>
#define MODULE_VERS "1.0"
#define MODULE_NAME "rtas_flash"
@@ -57,13 +57,31 @@
#define VALIDATE_READY -1001 /* Firmware image ready for validation */
#define VALIDATE_PARAM_ERR -3 /* RTAS Parameter Error */
#define VALIDATE_HW_ERR -1 /* RTAS Hardware Error */
-#define VALIDATE_TMP_UPDATE 0 /* Validate Return Status */
-#define VALIDATE_FLASH_AUTH 1 /* Validate Return Status */
-#define VALIDATE_INVALID_IMG 2 /* Validate Return Status */
-#define VALIDATE_CUR_UNKNOWN 3 /* Validate Return Status */
-#define VALIDATE_TMP_COMMIT_DL 4 /* Validate Return Status */
-#define VALIDATE_TMP_COMMIT 5 /* Validate Return Status */
-#define VALIDATE_TMP_UPDATE_DL 6 /* Validate Return Status */
+
+/* ibm,validate-flash-image update result tokens */
+#define VALIDATE_TMP_UPDATE 0 /* T side will be updated */
+#define VALIDATE_FLASH_AUTH 1 /* Partition does not have authority */
+#define VALIDATE_INVALID_IMG 2 /* Candidate image is not valid */
+#define VALIDATE_CUR_UNKNOWN 3 /* Current fixpack level is unknown */
+/*
+ * Current T side will be committed to P side before being replace with new
+ * image, and the new image is downlevel from current image
+ */
+#define VALIDATE_TMP_COMMIT_DL 4
+/*
+ * Current T side will be committed to P side before being replaced with new
+ * image
+ */
+#define VALIDATE_TMP_COMMIT 5
+/*
+ * T side will be updated with a downlevel image
+ */
+#define VALIDATE_TMP_UPDATE_DL 6
+/*
+ * The candidate image's release date is later than the system's firmware
+ * service entitlement date - service warranty period has expired
+ */
+#define VALIDATE_OUT_OF_WRNTY 7
/* ibm,manage-flash-image operation tokens */
#define RTAS_REJECT_TMP_IMG 0
@@ -71,6 +89,7 @@
/* Array sizes */
#define VALIDATE_BUF_SIZE 4096
+#define VALIDATE_MSG_LEN 256
#define RTAS_MSG_MAXLEN 64
/* Quirk - RTAS requires 4k list length and block size */
@@ -102,9 +121,10 @@ static struct kmem_cache *flash_block_cache = NULL;
#define FLASH_BLOCK_LIST_VERSION (1UL)
-/* Local copy of the flash block list.
- * We only allow one open of the flash proc file and create this
- * list as we go. The rtas_firmware_flash_list varable will be
+/*
+ * Local copy of the flash block list.
+ *
+ * The rtas_firmware_flash_list varable will be
* set once the data is fully read.
*
* For convenience as we build the list we use virtual addrs,
@@ -125,23 +145,23 @@ struct rtas_update_flash_t
struct rtas_manage_flash_t
{
int status; /* Returned status */
- unsigned int op; /* Reject or commit image */
};
/* Status int must be first member of struct */
struct rtas_validate_flash_t
{
int status; /* Returned status */
- char buf[VALIDATE_BUF_SIZE]; /* Candidate image buffer */
+ char *buf; /* Candidate image buffer */
unsigned int buf_size; /* Size of image buf */
unsigned int update_results; /* Update results token */
};
-static DEFINE_SPINLOCK(flash_file_open_lock);
-static struct proc_dir_entry *firmware_flash_pde;
-static struct proc_dir_entry *firmware_update_pde;
-static struct proc_dir_entry *validate_pde;
-static struct proc_dir_entry *manage_pde;
+static struct rtas_update_flash_t rtas_update_flash_data;
+static struct rtas_manage_flash_t rtas_manage_flash_data;
+static struct rtas_validate_flash_t rtas_validate_flash_data;
+static DEFINE_MUTEX(rtas_update_flash_mutex);
+static DEFINE_MUTEX(rtas_manage_flash_mutex);
+static DEFINE_MUTEX(rtas_validate_flash_mutex);
/* Do simple sanity checks on the flash image. */
static int flash_list_valid(struct flash_block_list *flist)
@@ -191,10 +211,10 @@ static void free_flash_list(struct flash_block_list *f)
static int rtas_flash_release(struct inode *inode, struct file *file)
{
- struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
- struct rtas_update_flash_t *uf;
-
- uf = (struct rtas_update_flash_t *) dp->data;
+ struct rtas_update_flash_t *const uf = &rtas_update_flash_data;
+
+ mutex_lock(&rtas_update_flash_mutex);
+
if (uf->flist) {
/* File was opened in write mode for a new flash attempt */
/* Clear saved list */
@@ -214,13 +234,14 @@ static int rtas_flash_release(struct inode *inode, struct file *file)
uf->flist = NULL;
}
- atomic_dec(&dp->count);
+ mutex_unlock(&rtas_update_flash_mutex);
return 0;
}
-static void get_flash_status_msg(int status, char *buf)
+static size_t get_flash_status_msg(int status, char *buf)
{
- char *msg;
+ const char *msg;
+ size_t len;
switch (status) {
case FLASH_AUTH:
@@ -242,51 +263,47 @@ static void get_flash_status_msg(int status, char *buf)
msg = "ready: firmware image ready for flash on reboot\n";
break;
default:
- sprintf(buf, "error: unexpected status value %d\n", status);
- return;
+ return sprintf(buf, "error: unexpected status value %d\n",
+ status);
}
- strcpy(buf, msg);
+ len = strlen(msg);
+ memcpy(buf, msg, len + 1);
+ return len;
}
/* Reading the proc file will show status (not the firmware contents) */
-static ssize_t rtas_flash_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t rtas_flash_read_msg(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
{
- struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
- struct rtas_update_flash_t *uf;
+ struct rtas_update_flash_t *const uf = &rtas_update_flash_data;
char msg[RTAS_MSG_MAXLEN];
- int msglen;
-
- uf = (struct rtas_update_flash_t *) dp->data;
-
- if (!strcmp(dp->name, FIRMWARE_FLASH_NAME)) {
- get_flash_status_msg(uf->status, msg);
- } else { /* FIRMWARE_UPDATE_NAME */
- sprintf(msg, "%d\n", uf->status);
- }
- msglen = strlen(msg);
- if (msglen > count)
- msglen = count;
+ size_t len;
+ int status;
- if (ppos && *ppos != 0)
- return 0; /* be cheap */
+ mutex_lock(&rtas_update_flash_mutex);
+ status = uf->status;
+ mutex_unlock(&rtas_update_flash_mutex);
- if (!access_ok(VERIFY_WRITE, buf, msglen))
- return -EINVAL;
-
- if (copy_to_user(buf, msg, msglen))
- return -EFAULT;
-
- if (ppos)
- *ppos = msglen;
- return msglen;
+ /* Read as text message */
+ len = get_flash_status_msg(status, msg);
+ return simple_read_from_buffer(buf, count, ppos, msg, len);
}
-/* constructor for flash_block_cache */
-void rtas_block_ctor(void *ptr)
+static ssize_t rtas_flash_read_num(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
{
- memset(ptr, 0, RTAS_BLK_SIZE);
+ struct rtas_update_flash_t *const uf = &rtas_update_flash_data;
+ char msg[RTAS_MSG_MAXLEN];
+ int status;
+
+ mutex_lock(&rtas_update_flash_mutex);
+ status = uf->status;
+ mutex_unlock(&rtas_update_flash_mutex);
+
+ /* Read as number */
+ sprintf(msg, "%d\n", status);
+ return simple_read_from_buffer(buf, count, ppos, msg, strlen(msg));
}
/* We could be much more efficient here. But to keep this function
@@ -297,25 +314,24 @@ void rtas_block_ctor(void *ptr)
static ssize_t rtas_flash_write(struct file *file, const char __user *buffer,
size_t count, loff_t *off)
{
- struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
- struct rtas_update_flash_t *uf;
+ struct rtas_update_flash_t *const uf = &rtas_update_flash_data;
char *p;
- int next_free;
+ int next_free, rc;
struct flash_block_list *fl;
- uf = (struct rtas_update_flash_t *) dp->data;
+ mutex_lock(&rtas_update_flash_mutex);
if (uf->status == FLASH_AUTH || count == 0)
- return count; /* discard data */
+ goto out; /* discard data */
/* In the case that the image is not ready for flashing, the memory
* allocated for the block list will be freed upon the release of the
* proc file
*/
if (uf->flist == NULL) {
- uf->flist = kmem_cache_alloc(flash_block_cache, GFP_KERNEL);
+ uf->flist = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL);
if (!uf->flist)
- return -ENOMEM;
+ goto nomem;
}
fl = uf->flist;
@@ -324,63 +340,48 @@ static ssize_t rtas_flash_write(struct file *file, const char __user *buffer,
next_free = fl->num_blocks;
if (next_free == FLASH_BLOCKS_PER_NODE) {
/* Need to allocate another block_list */
- fl->next = kmem_cache_alloc(flash_block_cache, GFP_KERNEL);
+ fl->next = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL);
if (!fl->next)
- return -ENOMEM;
+ goto nomem;
fl = fl->next;
next_free = 0;
}
if (count > RTAS_BLK_SIZE)
count = RTAS_BLK_SIZE;
- p = kmem_cache_alloc(flash_block_cache, GFP_KERNEL);
+ p = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL);
if (!p)
- return -ENOMEM;
+ goto nomem;
if(copy_from_user(p, buffer, count)) {
kmem_cache_free(flash_block_cache, p);
- return -EFAULT;
+ rc = -EFAULT;
+ goto error;
}
fl->blocks[next_free].data = p;
fl->blocks[next_free].length = count;
fl->num_blocks++;
-
+out:
+ mutex_unlock(&rtas_update_flash_mutex);
return count;
-}
-
-static int rtas_excl_open(struct inode *inode, struct file *file)
-{
- struct proc_dir_entry *dp = PDE(inode);
- /* Enforce exclusive open with use count of PDE */
- spin_lock(&flash_file_open_lock);
- if (atomic_read(&dp->count) > 2) {
- spin_unlock(&flash_file_open_lock);
- return -EBUSY;
- }
-
- atomic_inc(&dp->count);
- spin_unlock(&flash_file_open_lock);
-
- return 0;
-}
-
-static int rtas_excl_release(struct inode *inode, struct file *file)
-{
- struct proc_dir_entry *dp = PDE(inode);
-
- atomic_dec(&dp->count);
-
- return 0;
+nomem:
+ rc = -ENOMEM;
+error:
+ mutex_unlock(&rtas_update_flash_mutex);
+ return rc;
}
-static void manage_flash(struct rtas_manage_flash_t *args_buf)
+/*
+ * Flash management routines.
+ */
+static void manage_flash(struct rtas_manage_flash_t *args_buf, unsigned int op)
{
s32 rc;
do {
- rc = rtas_call(rtas_token("ibm,manage-flash-image"), 1,
- 1, NULL, args_buf->op);
+ rc = rtas_call(rtas_token("ibm,manage-flash-image"), 1, 1,
+ NULL, op);
} while (rtas_busy_delay(rc));
args_buf->status = rc;
@@ -389,68 +390,62 @@ static void manage_flash(struct rtas_manage_flash_t *args_buf)
static ssize_t manage_flash_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
- struct rtas_manage_flash_t *args_buf;
+ struct rtas_manage_flash_t *const args_buf = &rtas_manage_flash_data;
char msg[RTAS_MSG_MAXLEN];
- int msglen;
-
- args_buf = (struct rtas_manage_flash_t *) dp->data;
- if (args_buf == NULL)
- return 0;
-
- msglen = sprintf(msg, "%d\n", args_buf->status);
- if (msglen > count)
- msglen = count;
+ int msglen, status;
- if (ppos && *ppos != 0)
- return 0; /* be cheap */
+ mutex_lock(&rtas_manage_flash_mutex);
+ status = args_buf->status;
+ mutex_unlock(&rtas_manage_flash_mutex);
- if (!access_ok(VERIFY_WRITE, buf, msglen))
- return -EINVAL;
-
- if (copy_to_user(buf, msg, msglen))
- return -EFAULT;
-
- if (ppos)
- *ppos = msglen;
- return msglen;
+ msglen = sprintf(msg, "%d\n", status);
+ return simple_read_from_buffer(buf, count, ppos, msg, msglen);
}
static ssize_t manage_flash_write(struct file *file, const char __user *buf,
size_t count, loff_t *off)
{
- struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
- struct rtas_manage_flash_t *args_buf;
- const char reject_str[] = "0";
- const char commit_str[] = "1";
+ struct rtas_manage_flash_t *const args_buf = &rtas_manage_flash_data;
+ static const char reject_str[] = "0";
+ static const char commit_str[] = "1";
char stkbuf[10];
- int op;
+ int op, rc;
+
+ mutex_lock(&rtas_manage_flash_mutex);
- args_buf = (struct rtas_manage_flash_t *) dp->data;
if ((args_buf->status == MANAGE_AUTH) || (count == 0))
- return count;
+ goto out;
op = -1;
if (buf) {
if (count > 9) count = 9;
- if (copy_from_user (stkbuf, buf, count)) {
- return -EFAULT;
- }
+ rc = -EFAULT;
+ if (copy_from_user (stkbuf, buf, count))
+ goto error;
if (strncmp(stkbuf, reject_str, strlen(reject_str)) == 0)
op = RTAS_REJECT_TMP_IMG;
else if (strncmp(stkbuf, commit_str, strlen(commit_str)) == 0)
op = RTAS_COMMIT_TMP_IMG;
}
- if (op == -1) /* buf is empty, or contains invalid string */
- return -EINVAL;
-
- args_buf->op = op;
- manage_flash(args_buf);
+ if (op == -1) { /* buf is empty, or contains invalid string */
+ rc = -EINVAL;
+ goto error;
+ }
+ manage_flash(args_buf, op);
+out:
+ mutex_unlock(&rtas_manage_flash_mutex);
return count;
+
+error:
+ mutex_unlock(&rtas_manage_flash_mutex);
+ return rc;
}
+/*
+ * Validation routines.
+ */
static void validate_flash(struct rtas_validate_flash_t *args_buf)
{
int token = rtas_token("ibm,validate-flash-image");
@@ -472,7 +467,7 @@ static void validate_flash(struct rtas_validate_flash_t *args_buf)
}
static int get_validate_flash_msg(struct rtas_validate_flash_t *args_buf,
- char *msg)
+ char *msg, int msglen)
{
int n;
@@ -480,7 +475,8 @@ static int get_validate_flash_msg(struct rtas_validate_flash_t *args_buf,
n = sprintf(msg, "%d\n", args_buf->update_results);
if ((args_buf->update_results >= VALIDATE_CUR_UNKNOWN) ||
(args_buf->update_results == VALIDATE_TMP_UPDATE))
- n += sprintf(msg + n, "%s\n", args_buf->buf);
+ n += snprintf(msg + n, msglen - n, "%s\n",
+ args_buf->buf);
} else {
n = sprintf(msg, "%d\n", args_buf->status);
}
@@ -490,52 +486,33 @@ static int get_validate_flash_msg(struct rtas_validate_flash_t *args_buf,
static ssize_t validate_flash_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
- struct rtas_validate_flash_t *args_buf;
- char msg[RTAS_MSG_MAXLEN];
+ struct rtas_validate_flash_t *const args_buf =
+ &rtas_validate_flash_data;
+ char msg[VALIDATE_MSG_LEN];
int msglen;
- args_buf = (struct rtas_validate_flash_t *) dp->data;
+ mutex_lock(&rtas_validate_flash_mutex);
+ msglen = get_validate_flash_msg(args_buf, msg, VALIDATE_MSG_LEN);
+ mutex_unlock(&rtas_validate_flash_mutex);
- if (ppos && *ppos != 0)
- return 0; /* be cheap */
-
- msglen = get_validate_flash_msg(args_buf, msg);
- if (msglen > count)
- msglen = count;
-
- if (!access_ok(VERIFY_WRITE, buf, msglen))
- return -EINVAL;
-
- if (copy_to_user(buf, msg, msglen))
- return -EFAULT;
-
- if (ppos)
- *ppos = msglen;
- return msglen;
+ return simple_read_from_buffer(buf, count, ppos, msg, msglen);
}
static ssize_t validate_flash_write(struct file *file, const char __user *buf,
size_t count, loff_t *off)
{
- struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
- struct rtas_validate_flash_t *args_buf;
+ struct rtas_validate_flash_t *const args_buf =
+ &rtas_validate_flash_data;
int rc;
- args_buf = (struct rtas_validate_flash_t *) dp->data;
-
- if (dp->data == NULL) {
- dp->data = kmalloc(sizeof(struct rtas_validate_flash_t),
- GFP_KERNEL);
- if (dp->data == NULL)
- return -ENOMEM;
- }
+ mutex_lock(&rtas_validate_flash_mutex);
/* We are only interested in the first 4K of the
* candidate image */
if ((*off >= VALIDATE_BUF_SIZE) ||
(args_buf->status == VALIDATE_AUTH)) {
*off += count;
+ mutex_unlock(&rtas_validate_flash_mutex);
return count;
}
@@ -558,31 +535,29 @@ static ssize_t validate_flash_write(struct file *file, const char __user *buf,
*off += count;
rc = count;
done:
- if (rc < 0) {
- kfree(dp->data);
- dp->data = NULL;
- }
+ mutex_unlock(&rtas_validate_flash_mutex);
return rc;
}
static int validate_flash_release(struct inode *inode, struct file *file)
{
- struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
- struct rtas_validate_flash_t *args_buf;
+ struct rtas_validate_flash_t *const args_buf =
+ &rtas_validate_flash_data;
- args_buf = (struct rtas_validate_flash_t *) dp->data;
+ mutex_lock(&rtas_validate_flash_mutex);
if (args_buf->status == VALIDATE_READY) {
args_buf->buf_size = VALIDATE_BUF_SIZE;
validate_flash(args_buf);
}
- /* The matching atomic_inc was in rtas_excl_open() */
- atomic_dec(&dp->count);
-
+ mutex_unlock(&rtas_validate_flash_mutex);
return 0;
}
+/*
+ * On-reboot flash update applicator.
+ */
static void rtas_flash_firmware(int reboot_type)
{
unsigned long image_size;
@@ -608,6 +583,12 @@ static void rtas_flash_firmware(int reboot_type)
}
/*
+ * Just before starting the firmware flash, cancel the event scan work
+ * to avoid any soft lockup issues.
+ */
+ rtas_cancel_event_scan();
+
+ /*
* NOTE: the "first" block must be under 4GB, so we create
* an entry with no data blocks in the reserved buffer in
* the kernel data segment.
@@ -616,7 +597,7 @@ static void rtas_flash_firmware(int reboot_type)
flist = (struct flash_block_list *)&rtas_data_buf[0];
flist->num_blocks = 0;
flist->next = rtas_firmware_flash_list;
- rtas_block_list = virt_to_abs(flist);
+ rtas_block_list = __pa(flist);
if (rtas_block_list >= 4UL*1024*1024*1024) {
printk(KERN_ALERT "FLASH: kernel bug...flash list header addr above 4GB\n");
spin_unlock(&rtas_data_buf_lock);
@@ -630,17 +611,19 @@ static void rtas_flash_firmware(int reboot_type)
for (f = flist; f; f = next) {
/* Translate data addrs to absolute */
for (i = 0; i < f->num_blocks; i++) {
- f->blocks[i].data = (char *)virt_to_abs(f->blocks[i].data);
+ f->blocks[i].data = (char *)cpu_to_be64(__pa(f->blocks[i].data));
image_size += f->blocks[i].length;
+ f->blocks[i].length = cpu_to_be64(f->blocks[i].length);
}
next = f->next;
/* Don't translate NULL pointer for last entry */
if (f->next)
- f->next = (struct flash_block_list *)virt_to_abs(f->next);
+ f->next = (struct flash_block_list *)cpu_to_be64(__pa(f->next));
else
f->next = NULL;
/* make num_blocks into the version/length field */
f->num_blocks = (FLASH_BLOCK_LIST_VERSION << 56) | ((f->num_blocks+1)*16);
+ f->num_blocks = cpu_to_be64(f->num_blocks);
}
printk(KERN_ALERT "FLASH: flash image is %ld bytes\n", image_size);
@@ -669,171 +652,128 @@ static void rtas_flash_firmware(int reboot_type)
spin_unlock(&rtas_data_buf_lock);
}
-static void remove_flash_pde(struct proc_dir_entry *dp)
-{
- if (dp) {
- kfree(dp->data);
- remove_proc_entry(dp->name, dp->parent);
- }
-}
-
-static int initialize_flash_pde_data(const char *rtas_call_name,
- size_t buf_size,
- struct proc_dir_entry *dp)
-{
+/*
+ * Manifest of proc files to create
+ */
+struct rtas_flash_file {
+ const char *filename;
+ const char *rtas_call_name;
int *status;
- int token;
-
- dp->data = kzalloc(buf_size, GFP_KERNEL);
- if (dp->data == NULL) {
- remove_flash_pde(dp);
- return -ENOMEM;
- }
-
- /*
- * This code assumes that the status int is the first member of the
- * struct
- */
- status = (int *) dp->data;
- token = rtas_token(rtas_call_name);
- if (token == RTAS_UNKNOWN_SERVICE)
- *status = FLASH_AUTH;
- else
- *status = FLASH_NO_OP;
-
- return 0;
-}
-
-static struct proc_dir_entry *create_flash_pde(const char *filename,
- const struct file_operations *fops)
-{
- return proc_create(filename, S_IRUSR | S_IWUSR, NULL, fops);
-}
-
-static const struct file_operations rtas_flash_operations = {
- .owner = THIS_MODULE,
- .read = rtas_flash_read,
- .write = rtas_flash_write,
- .open = rtas_excl_open,
- .release = rtas_flash_release,
- .llseek = default_llseek,
-};
-
-static const struct file_operations manage_flash_operations = {
- .owner = THIS_MODULE,
- .read = manage_flash_read,
- .write = manage_flash_write,
- .open = rtas_excl_open,
- .release = rtas_excl_release,
- .llseek = default_llseek,
+ const struct file_operations fops;
};
-static const struct file_operations validate_flash_operations = {
- .owner = THIS_MODULE,
- .read = validate_flash_read,
- .write = validate_flash_write,
- .open = rtas_excl_open,
- .release = validate_flash_release,
- .llseek = default_llseek,
+static const struct rtas_flash_file rtas_flash_files[] = {
+ {
+ .filename = "powerpc/rtas/" FIRMWARE_FLASH_NAME,
+ .rtas_call_name = "ibm,update-flash-64-and-reboot",
+ .status = &rtas_update_flash_data.status,
+ .fops.read = rtas_flash_read_msg,
+ .fops.write = rtas_flash_write,
+ .fops.release = rtas_flash_release,
+ .fops.llseek = default_llseek,
+ },
+ {
+ .filename = "powerpc/rtas/" FIRMWARE_UPDATE_NAME,
+ .rtas_call_name = "ibm,update-flash-64-and-reboot",
+ .status = &rtas_update_flash_data.status,
+ .fops.read = rtas_flash_read_num,
+ .fops.write = rtas_flash_write,
+ .fops.release = rtas_flash_release,
+ .fops.llseek = default_llseek,
+ },
+ {
+ .filename = "powerpc/rtas/" VALIDATE_FLASH_NAME,
+ .rtas_call_name = "ibm,validate-flash-image",
+ .status = &rtas_validate_flash_data.status,
+ .fops.read = validate_flash_read,
+ .fops.write = validate_flash_write,
+ .fops.release = validate_flash_release,
+ .fops.llseek = default_llseek,
+ },
+ {
+ .filename = "powerpc/rtas/" MANAGE_FLASH_NAME,
+ .rtas_call_name = "ibm,manage-flash-image",
+ .status = &rtas_manage_flash_data.status,
+ .fops.read = manage_flash_read,
+ .fops.write = manage_flash_write,
+ .fops.llseek = default_llseek,
+ }
};
static int __init rtas_flash_init(void)
{
- int rc;
+ int i;
if (rtas_token("ibm,update-flash-64-and-reboot") ==
RTAS_UNKNOWN_SERVICE) {
- printk(KERN_ERR "rtas_flash: no firmware flash support\n");
- return 1;
+ pr_info("rtas_flash: no firmware flash support\n");
+ return -EINVAL;
}
- firmware_flash_pde = create_flash_pde("powerpc/rtas/"
- FIRMWARE_FLASH_NAME,
- &rtas_flash_operations);
- if (firmware_flash_pde == NULL) {
- rc = -ENOMEM;
- goto cleanup;
- }
+ rtas_validate_flash_data.buf = kzalloc(VALIDATE_BUF_SIZE, GFP_KERNEL);
+ if (!rtas_validate_flash_data.buf)
+ return -ENOMEM;
- rc = initialize_flash_pde_data("ibm,update-flash-64-and-reboot",
- sizeof(struct rtas_update_flash_t),
- firmware_flash_pde);
- if (rc != 0)
- goto cleanup;
-
- firmware_update_pde = create_flash_pde("powerpc/rtas/"
- FIRMWARE_UPDATE_NAME,
- &rtas_flash_operations);
- if (firmware_update_pde == NULL) {
- rc = -ENOMEM;
- goto cleanup;
+ flash_block_cache = kmem_cache_create("rtas_flash_cache",
+ RTAS_BLK_SIZE, RTAS_BLK_SIZE, 0,
+ NULL);
+ if (!flash_block_cache) {
+ printk(KERN_ERR "%s: failed to create block cache\n",
+ __func__);
+ goto enomem_buf;
}
- rc = initialize_flash_pde_data("ibm,update-flash-64-and-reboot",
- sizeof(struct rtas_update_flash_t),
- firmware_update_pde);
- if (rc != 0)
- goto cleanup;
-
- validate_pde = create_flash_pde("powerpc/rtas/" VALIDATE_FLASH_NAME,
- &validate_flash_operations);
- if (validate_pde == NULL) {
- rc = -ENOMEM;
- goto cleanup;
- }
+ for (i = 0; i < ARRAY_SIZE(rtas_flash_files); i++) {
+ const struct rtas_flash_file *f = &rtas_flash_files[i];
+ int token;
- rc = initialize_flash_pde_data("ibm,validate-flash-image",
- sizeof(struct rtas_validate_flash_t),
- validate_pde);
- if (rc != 0)
- goto cleanup;
-
- manage_pde = create_flash_pde("powerpc/rtas/" MANAGE_FLASH_NAME,
- &manage_flash_operations);
- if (manage_pde == NULL) {
- rc = -ENOMEM;
- goto cleanup;
- }
+ if (!proc_create(f->filename, S_IRUSR | S_IWUSR, NULL, &f->fops))
+ goto enomem;
- rc = initialize_flash_pde_data("ibm,manage-flash-image",
- sizeof(struct rtas_manage_flash_t),
- manage_pde);
- if (rc != 0)
- goto cleanup;
+ /*
+ * This code assumes that the status int is the first member of the
+ * struct
+ */
+ token = rtas_token(f->rtas_call_name);
+ if (token == RTAS_UNKNOWN_SERVICE)
+ *f->status = FLASH_AUTH;
+ else
+ *f->status = FLASH_NO_OP;
+ }
rtas_flash_term_hook = rtas_flash_firmware;
-
- flash_block_cache = kmem_cache_create("rtas_flash_cache",
- RTAS_BLK_SIZE, RTAS_BLK_SIZE, 0,
- rtas_block_ctor);
- if (!flash_block_cache) {
- printk(KERN_ERR "%s: failed to create block cache\n",
- __func__);
- rc = -ENOMEM;
- goto cleanup;
- }
return 0;
-cleanup:
- remove_flash_pde(firmware_flash_pde);
- remove_flash_pde(firmware_update_pde);
- remove_flash_pde(validate_pde);
- remove_flash_pde(manage_pde);
+enomem:
+ while (--i >= 0) {
+ const struct rtas_flash_file *f = &rtas_flash_files[i];
+ remove_proc_entry(f->filename, NULL);
+ }
- return rc;
+ kmem_cache_destroy(flash_block_cache);
+enomem_buf:
+ kfree(rtas_validate_flash_data.buf);
+ return -ENOMEM;
}
static void __exit rtas_flash_cleanup(void)
{
+ int i;
+
rtas_flash_term_hook = NULL;
- if (flash_block_cache)
- kmem_cache_destroy(flash_block_cache);
+ if (rtas_firmware_flash_list) {
+ free_flash_list(rtas_firmware_flash_list);
+ rtas_firmware_flash_list = NULL;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(rtas_flash_files); i++) {
+ const struct rtas_flash_file *f = &rtas_flash_files[i];
+ remove_proc_entry(f->filename, NULL);
+ }
- remove_flash_pde(firmware_flash_pde);
- remove_flash_pde(firmware_update_pde);
- remove_flash_pde(validate_pde);
- remove_flash_pde(manage_pde);
+ kmem_cache_destroy(flash_block_cache);
+ kfree(rtas_validate_flash_data.buf);
}
module_init(rtas_flash_init);
diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c
index 54e66da8f74..c168337aef9 100644
--- a/arch/powerpc/kernel/rtas_pci.c
+++ b/arch/powerpc/kernel/rtas_pci.c
@@ -80,10 +80,6 @@ int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val)
if (ret)
return PCIBIOS_DEVICE_NOT_FOUND;
- if (returnval == EEH_IO_ERROR_VALUE(size) &&
- eeh_dn_check_failure (pdn->node, NULL))
- return PCIBIOS_DEVICE_NOT_FOUND;
-
return PCIBIOS_SUCCESSFUL;
}
@@ -92,18 +88,39 @@ static int rtas_pci_read_config(struct pci_bus *bus,
int where, int size, u32 *val)
{
struct device_node *busdn, *dn;
-
- busdn = pci_bus_to_OF_node(bus);
+ struct pci_dn *pdn;
+ bool found = false;
+#ifdef CONFIG_EEH
+ struct eeh_dev *edev;
+#endif
+ int ret;
/* Search only direct children of the bus */
+ *val = 0xFFFFFFFF;
+ busdn = pci_bus_to_OF_node(bus);
for (dn = busdn->child; dn; dn = dn->sibling) {
- struct pci_dn *pdn = PCI_DN(dn);
+ pdn = PCI_DN(dn);
if (pdn && pdn->devfn == devfn
- && of_device_is_available(dn))
- return rtas_read_config(pdn, where, size, val);
+ && of_device_is_available(dn)) {
+ found = true;
+ break;
+ }
}
- return PCIBIOS_DEVICE_NOT_FOUND;
+ if (!found)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+#ifdef CONFIG_EEH
+ edev = of_node_to_eeh_dev(dn);
+ if (edev && edev->pe && edev->pe->state & EEH_PE_RESET)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+#endif
+
+ ret = rtas_read_config(pdn, where, size, val);
+ if (*val == EEH_IO_ERROR_VALUE(size) &&
+ eeh_dev_check_failure(of_node_to_eeh_dev(dn)))
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ return ret;
}
int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val)
@@ -136,17 +153,34 @@ static int rtas_pci_write_config(struct pci_bus *bus,
int where, int size, u32 val)
{
struct device_node *busdn, *dn;
-
- busdn = pci_bus_to_OF_node(bus);
+ struct pci_dn *pdn;
+ bool found = false;
+#ifdef CONFIG_EEH
+ struct eeh_dev *edev;
+#endif
+ int ret;
/* Search only direct children of the bus */
+ busdn = pci_bus_to_OF_node(bus);
for (dn = busdn->child; dn; dn = dn->sibling) {
- struct pci_dn *pdn = PCI_DN(dn);
+ pdn = PCI_DN(dn);
if (pdn && pdn->devfn == devfn
- && of_device_is_available(dn))
- return rtas_write_config(pdn, where, size, val);
+ && of_device_is_available(dn)) {
+ found = true;
+ break;
+ }
}
- return PCIBIOS_DEVICE_NOT_FOUND;
+
+ if (!found)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+#ifdef CONFIG_EEH
+ edev = of_node_to_eeh_dev(dn);
+ if (edev && edev->pe && (edev->pe->state & EEH_PE_RESET))
+ return PCIBIOS_DEVICE_NOT_FOUND;
+#endif
+ ret = rtas_write_config(pdn, where, size, val);
+
+ return ret;
}
static struct pci_ops rtas_pci_ops = {
@@ -201,7 +235,7 @@ static void python_countermeasures(struct device_node *dev)
iounmap(chip_regs);
}
-void __init init_pci_config_tokens (void)
+void __init init_pci_config_tokens(void)
{
read_pci_config = rtas_token("read-pci-config");
write_pci_config = rtas_token("write-pci-config");
@@ -209,7 +243,7 @@ void __init init_pci_config_tokens (void)
ibm_write_pci_config = rtas_token("ibm,write-pci-config");
}
-unsigned long __devinit get_phb_buid (struct device_node *phb)
+unsigned long get_phb_buid(struct device_node *phb)
{
struct resource r;
@@ -223,7 +257,7 @@ unsigned long __devinit get_phb_buid (struct device_node *phb)
static int phb_set_bus_ranges(struct device_node *dev,
struct pci_controller *phb)
{
- const int *bus_range;
+ const __be32 *bus_range;
unsigned int len;
bus_range = of_get_property(dev, "bus-range", &len);
@@ -231,13 +265,13 @@ static int phb_set_bus_ranges(struct device_node *dev,
return 1;
}
- phb->first_busno = bus_range[0];
- phb->last_busno = bus_range[1];
+ phb->first_busno = be32_to_cpu(bus_range[0]);
+ phb->last_busno = be32_to_cpu(bus_range[1]);
return 0;
}
-int __devinit rtas_setup_phb(struct pci_controller *phb)
+int rtas_setup_phb(struct pci_controller *phb)
{
struct device_node *dev = phb->dn;
@@ -276,7 +310,7 @@ void __init find_and_init_phbs(void)
pci_devs_phb_init();
/*
- * pci_probe_only and pci_assign_all_buses can be set via properties
+ * PCI_PROBE_ONLY and PCI_REASSIGN_ALL_BUS can be set via properties
* in chosen.
*/
if (of_chosen) {
@@ -284,14 +318,18 @@ void __init find_and_init_phbs(void)
prop = of_get_property(of_chosen,
"linux,pci-probe-only", NULL);
- if (prop)
- pci_probe_only = *prop;
+ if (prop) {
+ if (*prop)
+ pci_add_flags(PCI_PROBE_ONLY);
+ else
+ pci_clear_flags(PCI_PROBE_ONLY);
+ }
#ifdef CONFIG_PPC32 /* Will be made generic soon */
prop = of_get_property(of_chosen,
"linux,pci-assign-all-buses", NULL);
if (prop && *prop)
- ppc_pci_flags |= PPC_PCI_REASSIGN_ALL_BUS;
+ pci_add_flags(PCI_REASSIGN_ALL_BUS);
#endif /* CONFIG_PPC32 */
}
}
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 0438f819fe6..e736387fee6 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -27,8 +27,9 @@
#include <asm/rtas.h>
#include <asm/prom.h>
#include <asm/nvram.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/machdep.h>
+#include <asm/topology.h>
static DEFINE_SPINLOCK(rtasd_log_lock);
@@ -87,6 +88,8 @@ static char *rtas_event_type(int type)
return "Resource Deallocation Event";
case RTAS_TYPE_DUMP:
return "Dump Notification Event";
+ case RTAS_TYPE_PRRN:
+ return "Platform Resource Reassignment Event";
}
return rtas_type[0];
@@ -147,8 +150,8 @@ static void printk_log_rtas(char *buf, int len)
struct rtas_error_log *errlog = (struct rtas_error_log *)buf;
printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n",
- error_log_cnt, rtas_event_type(errlog->type),
- errlog->severity);
+ error_log_cnt, rtas_event_type(rtas_error_type(errlog)),
+ rtas_error_severity(errlog));
}
}
@@ -156,14 +159,16 @@ static int log_rtas_len(char * buf)
{
int len;
struct rtas_error_log *err;
+ uint32_t extended_log_length;
/* rtas fixed header */
len = 8;
err = (struct rtas_error_log *)buf;
- if (err->extended_log_length) {
+ extended_log_length = rtas_error_extended_log_length(err);
+ if (rtas_error_extended(err) && extended_log_length) {
/* extended header */
- len += err->extended_log_length;
+ len += extended_log_length;
}
if (rtas_error_log_max == 0)
@@ -265,9 +270,49 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
spin_unlock_irqrestore(&rtasd_log_lock, s);
return;
}
+}
+
+#ifdef CONFIG_PPC_PSERIES
+static s32 prrn_update_scope;
+
+static void prrn_work_fn(struct work_struct *work)
+{
+ /*
+ * For PRRN, we must pass the negative of the scope value in
+ * the RTAS event.
+ */
+ pseries_devicetree_update(-prrn_update_scope);
+}
+
+static DECLARE_WORK(prrn_work, prrn_work_fn);
+
+void prrn_schedule_update(u32 scope)
+{
+ flush_work(&prrn_work);
+ prrn_update_scope = scope;
+ schedule_work(&prrn_work);
+}
+
+static void handle_rtas_event(const struct rtas_error_log *log)
+{
+ if (rtas_error_type(log) != RTAS_TYPE_PRRN || !prrn_is_enabled())
+ return;
+
+ /* For PRRN Events the extended log length is used to denote
+ * the scope for calling rtas update-nodes.
+ */
+ prrn_schedule_update(rtas_error_extended_log_length(log));
+}
+
+#else
+static void handle_rtas_event(const struct rtas_error_log *log)
+{
+ return;
}
+#endif
+
static int rtas_log_open(struct inode * inode, struct file * file)
{
return 0;
@@ -388,8 +433,10 @@ static void do_event_scan(void)
break;
}
- if (error == 0)
+ if (error == 0) {
pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0);
+ handle_rtas_event((struct rtas_error_log *)logdata);
+ }
} while(error == 0);
}
@@ -412,7 +459,8 @@ static void rtas_event_scan(struct work_struct *w)
get_online_cpus();
- cpu = cpumask_next(smp_processor_id(), cpu_online_mask);
+ /* raw_ OK because just using CPU as starting point. */
+ cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
if (cpu >= nr_cpu_ids) {
cpu = cpumask_first(cpu_online_mask);
@@ -464,13 +512,20 @@ static void start_event_scan(void)
pr_debug("rtasd: will sleep for %d milliseconds\n",
(30000 / rtas_event_scan_rate));
- /* Retreive errors from nvram if any */
+ /* Retrieve errors from nvram if any */
retreive_nvram_error_log();
schedule_delayed_work_on(cpumask_first(cpu_online_mask),
&event_scan_work, event_scan_delay);
}
+/* Cancel the rtas event scan work */
+void rtas_cancel_event_scan(void)
+{
+ cancel_delayed_work_sync(&event_scan_work);
+}
+EXPORT_SYMBOL_GPL(rtas_cancel_event_scan);
+
static int __init rtas_init(void)
{
struct proc_dir_entry *entry;
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 9d4882a4664..e5b022c55cc 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -12,7 +12,7 @@
#undef DEBUG
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/init.h>
@@ -51,7 +51,6 @@
#include <asm/btext.h>
#include <asm/nvram.h>
#include <asm/setup.h>
-#include <asm/system.h>
#include <asm/rtas.h>
#include <asm/iommu.h>
#include <asm/serial.h>
@@ -61,8 +60,7 @@
#include <asm/xmon.h>
#include <asm/cputhreads.h>
#include <mm/mmu_decl.h>
-
-#include "setup.h"
+#include <asm/fadump.h>
#ifdef DEBUG
#include <asm/udbg.h>
@@ -78,6 +76,9 @@ EXPORT_SYMBOL(ppc_md);
struct machdep_calls *machine_id;
EXPORT_SYMBOL(machine_id);
+int boot_cpuid = -1;
+EXPORT_SYMBOL_GPL(boot_cpuid);
+
unsigned long klimit = (unsigned long) _end;
char cmd_line[COMMAND_LINE_SIZE];
@@ -109,6 +110,14 @@ EXPORT_SYMBOL(ppc_do_canonicalize_irqs);
/* also used by kexec */
void machine_shutdown(void)
{
+#ifdef CONFIG_FA_DUMP
+ /*
+ * if fadump is active, cleanup the fadump registration before we
+ * shutdown.
+ */
+ fadump_cleanup();
+#endif
+
if (ppc_md.machine_shutdown)
ppc_md.machine_shutdown();
}
@@ -203,6 +212,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
{
unsigned long cpu_id = (unsigned long)v - 1;
unsigned int pvr;
+ unsigned long proc_freq;
unsigned short maj;
unsigned short min;
@@ -254,12 +264,19 @@ static int show_cpuinfo(struct seq_file *m, void *v)
#endif /* CONFIG_TAU */
/*
- * Assume here that all clock rates are the same in a
- * smp system. -- Cort
+ * Platforms that have variable clock rates, should implement
+ * the method ppc_md.get_proc_freq() that reports the clock
+ * rate of a given cpu. The rest can use ppc_proc_freq to
+ * report the clock rate that is same across all cpus.
*/
- if (ppc_proc_freq)
+ if (ppc_md.get_proc_freq)
+ proc_freq = ppc_md.get_proc_freq(cpu_id);
+ else
+ proc_freq = ppc_proc_freq;
+
+ if (proc_freq)
seq_printf(m, "clock\t\t: %lu.%06luMHz\n",
- ppc_proc_freq / 1000000, ppc_proc_freq % 1000000);
+ proc_freq / 1000000, proc_freq % 1000000);
if (ppc_md.show_percpuinfo != NULL)
ppc_md.show_percpuinfo(m, cpu_id);
@@ -373,15 +390,20 @@ void __init check_for_initrd(void)
#ifdef CONFIG_SMP
-int threads_per_core, threads_shift;
+int threads_per_core, threads_per_subcore, threads_shift;
cpumask_t threads_core_mask;
+EXPORT_SYMBOL_GPL(threads_per_core);
+EXPORT_SYMBOL_GPL(threads_per_subcore);
+EXPORT_SYMBOL_GPL(threads_shift);
+EXPORT_SYMBOL_GPL(threads_core_mask);
static void __init cpu_init_thread_core_maps(int tpc)
{
int i;
threads_per_core = tpc;
- threads_core_mask = CPU_MASK_NONE;
+ threads_per_subcore = tpc;
+ cpumask_clear(&threads_core_mask);
/* This implementation only supports power of 2 number of threads
* for simplicity and performance
@@ -390,7 +412,7 @@ static void __init cpu_init_thread_core_maps(int tpc)
BUG_ON(tpc != (1 << threads_shift));
for (i = 0; i < tpc; i++)
- cpu_set(i, threads_core_mask);
+ cpumask_set_cpu(i, &threads_core_mask);
printk(KERN_INFO "CPU maps initialized for %d thread%s per core\n",
tpc, tpc > 1 ? "s" : "");
@@ -404,7 +426,7 @@ static void __init cpu_init_thread_core_maps(int tpc)
* cpu_present_mask
*
* Having the possible map set up early allows us to restrict allocations
- * of things like irqstacks to num_possible_cpus() rather than NR_CPUS.
+ * of things like irqstacks to nr_cpu_ids rather than NR_CPUS.
*
* We do not initialize the online map here; cpus set their own bits in
* cpu_online_mask as they come up.
@@ -424,8 +446,9 @@ void __init smp_setup_cpu_maps(void)
DBG("smp_setup_cpu_maps()\n");
- while ((dn = of_find_node_by_type(dn, "cpu")) && cpu < NR_CPUS) {
- const int *intserv;
+ while ((dn = of_find_node_by_type(dn, "cpu")) && cpu < nr_cpu_ids) {
+ const __be32 *intserv;
+ __be32 cpu_be;
int j, len;
DBG(" * %s...\n", dn->full_name);
@@ -439,15 +462,25 @@ void __init smp_setup_cpu_maps(void)
} else {
DBG(" no ibm,ppc-interrupt-server#s -> 1 thread\n");
intserv = of_get_property(dn, "reg", NULL);
- if (!intserv)
- intserv = &cpu; /* assume logical == phys */
+ if (!intserv) {
+ cpu_be = cpu_to_be32(cpu);
+ intserv = &cpu_be; /* assume logical == phys */
+ }
}
- for (j = 0; j < nthreads && cpu < NR_CPUS; j++) {
+ for (j = 0; j < nthreads && cpu < nr_cpu_ids; j++) {
+ bool avail;
+
DBG(" thread %d -> cpu %d (hard id %d)\n",
- j, cpu, intserv[j]);
- set_cpu_present(cpu, true);
- set_hard_smp_processor_id(cpu, intserv[j]);
+ j, cpu, be32_to_cpu(intserv[j]));
+
+ avail = of_device_is_available(dn);
+ if (!avail)
+ avail = !of_property_match_string(dn,
+ "enable-method", "spin-table");
+
+ set_cpu_present(cpu, avail);
+ set_hard_smp_processor_id(cpu, be32_to_cpu(intserv[j]));
set_cpu_possible(cpu, true);
cpu++;
}
@@ -467,7 +500,7 @@ void __init smp_setup_cpu_maps(void)
if (machine_is(pseries) && firmware_has_feature(FW_FEATURE_LPAR) &&
(dn = of_find_node_by_path("/rtas"))) {
int num_addr_cell, num_size_cell, maxcpus;
- const unsigned int *ireg;
+ const __be32 *ireg;
num_addr_cell = of_n_addr_cells(dn);
num_size_cell = of_n_size_cells(dn);
@@ -477,18 +510,18 @@ void __init smp_setup_cpu_maps(void)
if (!ireg)
goto out;
- maxcpus = ireg[num_addr_cell + num_size_cell];
+ maxcpus = be32_to_cpup(ireg + num_addr_cell + num_size_cell);
/* Double maxcpus for processors which have SMT capability */
if (cpu_has_feature(CPU_FTR_SMT))
maxcpus *= nthreads;
- if (maxcpus > NR_CPUS) {
+ if (maxcpus > nr_cpu_ids) {
printk(KERN_WARNING
"Partition configured for %d cpus, "
"operating system maximum is %d.\n",
- maxcpus, NR_CPUS);
- maxcpus = NR_CPUS;
+ maxcpus, nr_cpu_ids);
+ maxcpus = nr_cpu_ids;
} else
printk(KERN_INFO "Partition configured for %d cpus.\n",
maxcpus);
@@ -509,6 +542,9 @@ void __init smp_setup_cpu_maps(void)
*/
cpu_init_thread_core_maps(nthreads);
+ /* Now that possible cpus are set, set nr_cpu_ids for later use */
+ setup_nr_cpu_ids();
+
free_unused_pacas();
}
#endif /* CONFIG_SMP */
@@ -599,16 +635,14 @@ int check_legacy_ioport(unsigned long base_port)
* name instead */
if (!np)
np = of_find_node_by_name(NULL, "8042");
+ if (np) {
+ of_i8042_kbd_irq = 1;
+ of_i8042_aux_irq = 12;
+ }
break;
case FDC_BASE: /* FDC1 */
np = of_find_node_by_type(NULL, "fdc");
break;
-#ifdef CONFIG_PPC_PREP
- case _PIDXR:
- case _PNPWRP:
- case PNPBIOS_BASE:
- /* implement me */
-#endif
default:
/* ipmi is supposed to fail here */
break;
@@ -629,6 +663,11 @@ EXPORT_SYMBOL(check_legacy_ioport);
static int ppc_panic_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
+ /*
+ * If firmware-assisted dump has been registered then trigger
+ * firmware-assisted dump and let firmware handle everything else.
+ */
+ crash_fadump(NULL, ptr);
ppc_md.panic(ptr); /* May not return */
return NOTIFY_DONE;
}
@@ -697,29 +736,14 @@ static int powerpc_debugfs_init(void)
arch_initcall(powerpc_debugfs_init);
#endif
-static int ppc_dflt_bus_notify(struct notifier_block *nb,
- unsigned long action, void *data)
+void ppc_printk_progress(char *s, unsigned short hex)
{
- struct device *dev = data;
-
- /* We are only intereted in device addition */
- if (action != BUS_NOTIFY_ADD_DEVICE)
- return 0;
-
- set_dma_ops(dev, &dma_direct_ops);
-
- return NOTIFY_DONE;
+ pr_info("%s\n", s);
}
-static struct notifier_block ppc_dflt_plat_bus_notifier = {
- .notifier_call = ppc_dflt_bus_notify,
- .priority = INT_MAX,
-};
-
-static int __init setup_bus_notifier(void)
+void arch_setup_pdev_archdata(struct platform_device *pdev)
{
- bus_register_notifier(&platform_bus_type, &ppc_dflt_plat_bus_notifier);
- return 0;
+ pdev->archdata.dma_mask = DMA_BIT_MASK(32);
+ pdev->dev.dma_mask = &pdev->archdata.dma_mask;
+ set_dma_ops(&pdev->dev, &dma_direct_ops);
}
-
-arch_initcall(setup_bus_notifier);
diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h
deleted file mode 100644
index 4c67ad7fae0..00000000000
--- a/arch/powerpc/kernel/setup.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _POWERPC_KERNEL_SETUP_H
-#define _POWERPC_KERNEL_SETUP_H
-
-void check_for_initrd(void);
-void do_init_bootmem(void);
-void setup_panic(void);
-extern int do_early_xmon;
-
-#endif /* _POWERPC_KERNEL_SETUP_H */
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 1d2fbc90530..ea4fda60e57 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -30,7 +30,6 @@
#include <asm/btext.h>
#include <asm/machdep.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <asm/pmac_feature.h>
#include <asm/sections.h>
#include <asm/nvram.h>
@@ -39,16 +38,14 @@
#include <asm/serial.h>
#include <asm/udbg.h>
#include <asm/mmu_context.h>
-
-#include "setup.h"
+#include <asm/epapr_hcalls.h>
#define DBG(fmt...)
extern void bootx_init(unsigned long r4, unsigned long phys);
-int boot_cpuid = -1;
-EXPORT_SYMBOL_GPL(boot_cpuid);
int boot_cpuid_phys;
+EXPORT_SYMBOL_GPL(boot_cpuid_phys);
int smp_hw_index[NR_CPUS];
@@ -106,6 +103,8 @@ notrace unsigned long __init early_init(unsigned long dt_ptr)
PTRRELOC(&__start___lwsync_fixup),
PTRRELOC(&__stop___lwsync_fixup));
+ do_final_fixups();
+
return KERNELBASE + offset;
}
@@ -116,7 +115,7 @@ notrace unsigned long __init early_init(unsigned long dt_ptr)
* This is called very early on the boot process, after a minimal
* MMU environment has been set up but before MMU_init is called.
*/
-notrace void __init machine_init(unsigned long dt_ptr)
+notrace void __init machine_init(u64 dt_ptr)
{
lockdep_init();
@@ -126,6 +125,10 @@ notrace void __init machine_init(unsigned long dt_ptr)
/* Do some early initialization based on the flat device tree */
early_init_devtree(__va(dt_ptr));
+ epapr_paravirt_early_init();
+
+ early_init_mmu();
+
probe_machine();
setup_kdump_trampoline();
@@ -145,27 +148,6 @@ notrace void __init machine_init(unsigned long dt_ptr)
ppc_md.progress("id mach(): done", 0x200);
}
-#ifdef CONFIG_BOOKE_WDT
-/* Checks wdt=x and wdt_period=xx command-line option */
-notrace int __init early_parse_wdt(char *p)
-{
- if (p && strncmp(p, "0", 1) != 0)
- booke_wdt_enabled = 1;
-
- return 0;
-}
-early_param("wdt", early_parse_wdt);
-
-int __init early_parse_wdt_period (char *p)
-{
- if (p)
- booke_wdt_period = simple_strtoul(p, NULL, 0);
-
- return 0;
-}
-early_param("wdt_period", early_parse_wdt_period);
-#endif /* CONFIG_BOOKE_WDT */
-
/* Checks "l2cr=xxxx" command-line option */
int __init ppc_setup_l2cr(char *str)
{
@@ -263,7 +245,12 @@ static void __init exc_lvl_early_init(void)
/* interrupt stacks must be in lowmem, we get that for free on ppc32
* as the memblock is limited to lowmem by MEMBLOCK_REAL_LIMIT */
for_each_possible_cpu(i) {
+#ifdef CONFIG_SMP
hw_cpu = get_hard_smp_processor_id(i);
+#else
+ hw_cpu = 0;
+#endif
+
critirq_ctx[hw_cpu] = (struct thread_info *)
__va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
#ifdef CONFIG_BOOKE
@@ -312,9 +299,6 @@ void __init setup_arch(char **cmdline_p)
if (cpu_has_feature(CPU_FTR_UNIFIED_ID_CACHE))
ucache_bsize = icache_bsize = dcache_bsize;
- /* reboot on panic */
- panic_timeout = 180;
-
if (ppc_md.panic)
setup_panic();
@@ -343,5 +327,4 @@ void __init setup_arch(char **cmdline_p)
/* Initialize the MMU context management stuff */
mmu_context_init();
-
}
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 2a178b0ebcd..ee082d77117 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -10,9 +10,9 @@
* 2 of the License, or (at your option) any later version.
*/
-#undef DEBUG
+#define DEBUG
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/init.h>
@@ -35,6 +35,9 @@
#include <linux/pci.h>
#include <linux/lockdep.h>
#include <linux/memblock.h>
+#include <linux/hugetlb.h>
+#include <linux/memory.h>
+
#include <asm/io.h>
#include <asm/kdump.h>
#include <asm/prom.h>
@@ -50,7 +53,6 @@
#include <asm/btext.h>
#include <asm/nvram.h>
#include <asm/setup.h>
-#include <asm/system.h>
#include <asm/rtas.h>
#include <asm/iommu.h>
#include <asm/serial.h>
@@ -62,8 +64,10 @@
#include <asm/udbg.h>
#include <asm/kexec.h>
#include <asm/mmu_context.h>
-
-#include "setup.h"
+#include <asm/code-patching.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hugetlb.h>
+#include <asm/epapr_hcalls.h>
#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
@@ -71,7 +75,7 @@
#define DBG(fmt...)
#endif
-int boot_cpuid = 0;
+int spinning_secondaries;
u64 ppc64_pft_size;
/* Pick defaults since we might want to patch instructions
@@ -93,6 +97,38 @@ int dcache_bsize;
int icache_bsize;
int ucache_bsize;
+#if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP)
+static void setup_tlb_core_data(void)
+{
+ int cpu;
+
+ BUILD_BUG_ON(offsetof(struct tlb_core_data, lock) != 0);
+
+ for_each_possible_cpu(cpu) {
+ int first = cpu_first_thread_sibling(cpu);
+
+ paca[cpu].tcd_ptr = &paca[first].tcd;
+
+ /*
+ * If we have threads, we need either tlbsrx.
+ * or e6500 tablewalk mode, or else TLB handlers
+ * will be racy and could produce duplicate entries.
+ */
+ if (smt_enabled_at_boot >= 2 &&
+ !mmu_has_feature(MMU_FTR_USE_TLBRSRV) &&
+ book3e_htw_mode != PPC_HTW_E6500) {
+ /* Should we panic instead? */
+ WARN_ONCE("%s: unsupported MMU configuration -- expect problems\n",
+ __func__);
+ }
+ }
+}
+#else
+static void setup_tlb_core_data(void)
+{
+}
+#endif
+
#ifdef CONFIG_SMP
static char *smt_enabled_cmdline;
@@ -151,6 +187,28 @@ early_param("smt-enabled", early_smt_enabled);
#define check_smt_enabled()
#endif /* CONFIG_SMP */
+/** Fix up paca fields required for the boot cpu */
+static void fixup_boot_paca(void)
+{
+ /* The boot cpu is started */
+ get_paca()->cpu_start = 1;
+ /* Allow percpu accesses to work until we setup percpu data */
+ get_paca()->data_offset = 0;
+}
+
+static void cpu_ready_for_interrupts(void)
+{
+ /* Set IR and DR in PACA MSR */
+ get_paca()->kernel_msr = MSR_KERNEL;
+
+ /* Enable AIL if supported */
+ if (cpu_has_feature(CPU_FTR_HVMODE) &&
+ cpu_has_feature(CPU_FTR_ARCH_207S)) {
+ unsigned long lpcr = mfspr(SPRN_LPCR);
+ mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
+ }
+}
+
/*
* Early initialization entry point. This is called by head.S
* with MMU translation disabled. We rely on the "feature" of
@@ -172,6 +230,8 @@ early_param("smt-enabled", early_smt_enabled);
void __init early_setup(unsigned long dt_ptr)
{
+ static __initdata struct paca_struct boot_paca;
+
/* -------- printk is _NOT_ safe to use here ! ------- */
/* Identify CPU type */
@@ -180,6 +240,7 @@ void __init early_setup(unsigned long dt_ptr)
/* Assume we're on cpu 0 for now. Don't write to the paca yet! */
initialise_paca(&boot_paca, 0);
setup_paca(&boot_paca);
+ fixup_boot_paca();
/* Initialize lockdep early or else spinlocks will blow */
lockdep_init();
@@ -198,11 +259,11 @@ void __init early_setup(unsigned long dt_ptr)
*/
early_init_devtree(__va(dt_ptr));
+ epapr_paravirt_early_init();
+
/* Now we know the logical id of our boot cpu, setup the paca. */
setup_paca(&paca[boot_cpuid]);
-
- /* Fix up paca fields required for the boot cpu */
- get_paca()->cpu_start = 1;
+ fixup_boot_paca();
/* Probe the machine type */
probe_machine();
@@ -214,7 +275,36 @@ void __init early_setup(unsigned long dt_ptr)
/* Initialize the hash table or TLB handling */
early_init_mmu();
+ /*
+ * At this point, we can let interrupts switch to virtual mode
+ * (the MMU has been setup), so adjust the MSR in the PACA to
+ * have IR and DR set and enable AIL if it exists
+ */
+ cpu_ready_for_interrupts();
+
+ /* Reserve large chunks of memory for use by CMA for KVM */
+ kvm_cma_reserve();
+
+ /*
+ * Reserve any gigantic pages requested on the command line.
+ * memblock needs to have been initialized by the time this is
+ * called since this will reserve memory.
+ */
+ reserve_hugetlb_gpages();
+
DBG(" <- early_setup()\n");
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX
+ /*
+ * This needs to be done *last* (after the above DBG() even)
+ *
+ * Right after we return from this function, we turn on the MMU
+ * which means the real-mode access trick that btext does will
+ * no longer work, it needs to switch to using a real MMU
+ * mapping. This call will ensure that it does
+ */
+ btext_map();
+#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */
}
#ifdef CONFIG_SMP
@@ -225,6 +315,13 @@ void early_setup_secondary(void)
/* Initialize the hash table or TLB handling */
early_init_mmu_secondary();
+
+ /*
+ * At this point, we can let interrupts switch to virtual mode
+ * (the MMU has been setup), so adjust the MSR in the PACA to
+ * have IR and DR set.
+ */
+ cpu_ready_for_interrupts();
}
#endif /* CONFIG_SMP */
@@ -233,6 +330,7 @@ void early_setup_secondary(void)
void smp_release_cpus(void)
{
unsigned long *ptr;
+ int i;
DBG(" -> smp_release_cpus()\n");
@@ -244,8 +342,17 @@ void smp_release_cpus(void)
ptr = (unsigned long *)((unsigned long)&__secondary_hold_spinloop
- PHYSICAL_START);
- *ptr = __pa(generic_secondary_smp_init);
- mb();
+ *ptr = ppc_function_entry(generic_secondary_smp_init);
+
+ /* And wait a bit for them to catch up */
+ for (i = 0; i < 100000; i++) {
+ mb();
+ HMT_low();
+ if (spinning_secondaries == 0)
+ break;
+ udelay(1);
+ }
+ DBG("spinning_secondaries = %d\n", spinning_secondaries);
DBG(" <- smp_release_cpus()\n");
}
@@ -265,29 +372,32 @@ static void __init initialize_cache_info(void)
DBG(" -> initialize_cache_info()\n");
- for (np = NULL; (np = of_find_node_by_type(np, "cpu"));) {
+ for_each_node_by_type(np, "cpu") {
num_cpus += 1;
- /* We're assuming *all* of the CPUs have the same
+ /*
+ * We're assuming *all* of the CPUs have the same
* d-cache and i-cache sizes... -Peter
*/
-
- if ( num_cpus == 1 ) {
- const u32 *sizep, *lsizep;
+ if (num_cpus == 1) {
+ const __be32 *sizep, *lsizep;
u32 size, lsize;
size = 0;
lsize = cur_cpu_spec->dcache_bsize;
sizep = of_get_property(np, "d-cache-size", NULL);
if (sizep != NULL)
- size = *sizep;
- lsizep = of_get_property(np, "d-cache-block-size", NULL);
+ size = be32_to_cpu(*sizep);
+ lsizep = of_get_property(np, "d-cache-block-size",
+ NULL);
/* fallback if block size missing */
if (lsizep == NULL)
- lsizep = of_get_property(np, "d-cache-line-size", NULL);
+ lsizep = of_get_property(np,
+ "d-cache-line-size",
+ NULL);
if (lsizep != NULL)
- lsize = *lsizep;
- if (sizep == 0 || lsizep == 0)
+ lsize = be32_to_cpu(*lsizep);
+ if (sizep == NULL || lsizep == NULL)
DBG("Argh, can't find dcache properties ! "
"sizep: %p, lsizep: %p\n", sizep, lsizep);
@@ -300,13 +410,16 @@ static void __init initialize_cache_info(void)
lsize = cur_cpu_spec->icache_bsize;
sizep = of_get_property(np, "i-cache-size", NULL);
if (sizep != NULL)
- size = *sizep;
- lsizep = of_get_property(np, "i-cache-block-size", NULL);
+ size = be32_to_cpu(*sizep);
+ lsizep = of_get_property(np, "i-cache-block-size",
+ NULL);
if (lsizep == NULL)
- lsizep = of_get_property(np, "i-cache-line-size", NULL);
+ lsizep = of_get_property(np,
+ "i-cache-line-size",
+ NULL);
if (lsizep != NULL)
- lsize = *lsizep;
- if (sizep == 0 || lsizep == 0)
+ lsize = be32_to_cpu(*lsizep);
+ if (sizep == NULL || lsizep == NULL)
DBG("Argh, can't find icache properties ! "
"sizep: %p, lsizep: %p\n", sizep, lsizep);
@@ -340,6 +453,7 @@ void __init setup_system(void)
&__start___fw_ftr_fixup, &__stop___fw_ftr_fixup);
do_lwsync_fixups(cur_cpu_spec->cpu_features,
&__start___lwsync_fixup, &__stop___lwsync_fixup);
+ do_final_fixups();
/*
* Unflatten the device-tree passed by prom_init or kexec
@@ -391,6 +505,7 @@ void __init setup_system(void)
smp_setup_cpu_maps();
check_smt_enabled();
+ setup_tlb_core_data();
#ifdef CONFIG_SMP
/* Release secondary cpus out of their spinloops at 0x60 now that
@@ -423,22 +538,35 @@ void __init setup_system(void)
DBG(" <- setup_system()\n");
}
-static u64 slb0_limit(void)
+/* This returns the limit below which memory accesses to the linear
+ * mapping are guarnateed not to cause a TLB or SLB miss. This is
+ * used to allocate interrupt or emergency stacks for which our
+ * exception entry path doesn't deal with being interrupted.
+ */
+static u64 safe_stack_limit(void)
{
- if (cpu_has_feature(CPU_FTR_1T_SEGMENT)) {
+#ifdef CONFIG_PPC_BOOK3E
+ /* Freescale BookE bolts the entire linear mapping */
+ if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
+ return linear_map_top;
+ /* Other BookE, we assume the first GB is bolted */
+ return 1ul << 30;
+#else
+ /* BookS, the first segment is bolted */
+ if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
return 1UL << SID_SHIFT_1T;
- }
return 1UL << SID_SHIFT;
+#endif
}
static void __init irqstack_early_init(void)
{
- u64 limit = slb0_limit();
+ u64 limit = safe_stack_limit();
unsigned int i;
/*
- * interrupt stacks must be under 256MB, we cannot afford to take
- * SLB misses on them.
+ * Interrupt stacks must be in the first segment since we
+ * cannot afford to take SLB misses on them.
*/
for_each_possible_cpu(i) {
softirq_ctx[i] = (struct thread_info *)
@@ -454,15 +582,24 @@ static void __init irqstack_early_init(void)
static void __init exc_lvl_early_init(void)
{
unsigned int i;
+ unsigned long sp;
for_each_possible_cpu(i) {
- critirq_ctx[i] = (struct thread_info *)
- __va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
- dbgirq_ctx[i] = (struct thread_info *)
- __va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
- mcheckirq_ctx[i] = (struct thread_info *)
- __va(memblock_alloc(THREAD_SIZE, THREAD_SIZE));
+ sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+ critirq_ctx[i] = (struct thread_info *)__va(sp);
+ paca[i].crit_kstack = __va(sp + THREAD_SIZE);
+
+ sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+ dbgirq_ctx[i] = (struct thread_info *)__va(sp);
+ paca[i].dbg_kstack = __va(sp + THREAD_SIZE);
+
+ sp = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+ mcheckirq_ctx[i] = (struct thread_info *)__va(sp);
+ paca[i].mc_kstack = __va(sp + THREAD_SIZE);
}
+
+ if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC))
+ patch_exception(0x040, exc_debug_debug_book3e);
}
#else
#define exc_lvl_early_init()
@@ -470,7 +607,8 @@ static void __init exc_lvl_early_init(void)
/*
* Stack space used when we detect a bad kernel stack pointer, and
- * early in SMP boots before relocation is enabled.
+ * early in SMP boots before relocation is enabled. Exclusive emergency
+ * stack for machine checks.
*/
static void __init emergency_stack_init(void)
{
@@ -486,20 +624,26 @@ static void __init emergency_stack_init(void)
* bringup, we need to get at them in real mode. This means they
* must also be within the RMO region.
*/
- limit = min(slb0_limit(), ppc64_rma_size);
+ limit = min(safe_stack_limit(), ppc64_rma_size);
for_each_possible_cpu(i) {
unsigned long sp;
sp = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit);
sp += THREAD_SIZE;
paca[i].emergency_sp = __va(sp);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ /* emergency stack for machine check exception handling. */
+ sp = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit);
+ sp += THREAD_SIZE;
+ paca[i].mc_emergency_sp = __va(sp);
+#endif
}
}
/*
- * Called into from start_kernel, after lock_kernel has been called.
- * Initializes bootmem, which is unsed to manage page allocation until
- * mem_init is called.
+ * Called into from start_kernel this initializes bootmem, which is used
+ * to manage page allocation until mem_init is called.
*/
void __init setup_arch(char **cmdline_p)
{
@@ -515,9 +659,6 @@ void __init setup_arch(char **cmdline_p)
dcache_bsize = ppc64_caches.dline_size;
icache_bsize = ppc64_caches.iline_size;
- /* reboot on panic */
- panic_timeout = 180;
-
if (ppc_md.panic)
setup_panic();
@@ -525,7 +666,9 @@ void __init setup_arch(char **cmdline_p)
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = klimit;
-
+#ifdef CONFIG_PPC_64K_PAGES
+ init_mm.context.pte_frag = NULL;
+#endif
irqstack_early_init();
exc_lvl_early_init();
emergency_stack_init();
@@ -549,6 +692,11 @@ void __init setup_arch(char **cmdline_p)
/* Initialize the MMU context management stuff */
mmu_context_init();
+ /* Interrupt code needs to be 64K-aligned */
+ if ((unsigned long)_stext & 0xffff)
+ panic("Kernelbase not 64K-aligned (0x%lx)!\n",
+ (unsigned long)_stext);
+
ppc64_boot_msg(0x15, "Setup Done");
}
@@ -633,9 +781,17 @@ void __init setup_per_cpu_areas(void)
}
#endif
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+unsigned long memory_block_size_bytes(void)
+{
+ if (ppc_md.memory_block_size)
+ return ppc_md.memory_block_size();
+
+ return MIN_MEMORY_BLOCK_SIZE;
+}
+#endif
-#ifdef CONFIG_PPC_INDIRECT_IO
+#if defined(CONFIG_PPC_INDIRECT_PIO) || defined(CONFIG_PPC_INDIRECT_MMIO)
struct ppc_pci_io ppc_pci_io;
EXPORT_SYMBOL(ppc_pci_io);
-#endif /* CONFIG_PPC_INDIRECT_IO */
-
+#endif
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 2300426e531..1c794cef288 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -11,9 +11,14 @@
#include <linux/tracehook.h>
#include <linux/signal.h>
+#include <linux/uprobes.h>
+#include <linux/key.h>
+#include <linux/context_tracking.h>
#include <asm/hw_breakpoint.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
+#include <asm/debug.h>
+#include <asm/tm.h>
#include "signal.h"
@@ -21,18 +26,18 @@
* through debug.exception-trace sysctl.
*/
-int show_unhandled_signals = 0;
+int show_unhandled_signals = 1;
/*
* Allocate space for the signal frame
*/
-void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
+void __user * get_sigframe(struct k_sigaction *ka, unsigned long sp,
size_t frame_size, int is_32)
{
unsigned long oldsp, newsp;
/* Default to using normal stack */
- oldsp = get_clean_sp(regs, is_32);
+ oldsp = get_clean_sp(sp, is_32);
/* Check for alt stack */
if ((ka->sa.sa_flags & SA_ONSTACK) &&
@@ -49,19 +54,6 @@ void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
return (void __user *)newsp;
}
-
-/*
- * Restore the user process's signal mask
- */
-void restore_sigmask(sigset_t *set)
-{
- sigdelsetmask(set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = *set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-}
-
static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka,
int has_handler)
{
@@ -113,31 +105,23 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka,
}
}
-static int do_signal_pending(sigset_t *oldset, struct pt_regs *regs)
+static int do_signal(struct pt_regs *regs)
{
+ sigset_t *oldset = sigmask_to_save();
siginfo_t info;
int signr;
struct k_sigaction ka;
int ret;
int is32 = is_32bit_task();
- if (current_thread_info()->local_flags & _TLF_RESTORE_SIGMASK)
- oldset = &current->saved_sigmask;
- else if (!oldset)
- oldset = &current->blocked;
-
signr = get_signal_to_deliver(&info, &ka, regs, NULL);
/* Is there any syscall restart business here ? */
check_syscall_restart(regs, &ka, signr > 0);
if (signr <= 0) {
- struct thread_info *ti = current_thread_info();
/* No signal to deliver -- put the saved sigmask back */
- if (ti->local_flags & _TLF_RESTORE_SIGMASK) {
- ti->local_flags &= ~_TLF_RESTORE_SIGMASK;
- sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
- }
+ restore_saved_sigmask();
regs->trap = 0;
return 0; /* no signals delivered */
}
@@ -148,8 +132,9 @@ static int do_signal_pending(sigset_t *oldset, struct pt_regs *regs)
* user space. The DABR will have been cleared if it
* triggered inside the kernel.
*/
- if (current->thread.dabr)
- set_dabr(current->thread.dabr);
+ if (current->thread.hw_brk.address &&
+ current->thread.hw_brk.type)
+ __set_breakpoint(&current->thread.hw_brk);
#endif
/* Re-enable the breakpoints for the signal stack */
thread_change_pc(current, regs);
@@ -167,44 +152,61 @@ static int do_signal_pending(sigset_t *oldset, struct pt_regs *regs)
regs->trap = 0;
if (ret) {
- spin_lock_irq(&current->sighand->siglock);
- sigorsets(&current->blocked, &current->blocked,
- &ka.sa.sa_mask);
- if (!(ka.sa.sa_flags & SA_NODEFER))
- sigaddset(&current->blocked, signr);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- /*
- * A signal was successfully delivered; the saved sigmask is in
- * its frame, and we can clear the TLF_RESTORE_SIGMASK flag.
- */
- current_thread_info()->local_flags &= ~_TLF_RESTORE_SIGMASK;
-
- /*
- * Let tracing know that we've done the handler setup.
- */
- tracehook_signal_handler(signr, &info, &ka, regs,
+ signal_delivered(signr, &info, &ka, regs,
test_thread_flag(TIF_SINGLESTEP));
}
return ret;
}
-void do_signal(struct pt_regs *regs, unsigned long thread_info_flags)
+void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
{
+ user_exit();
+
+ if (thread_info_flags & _TIF_UPROBE)
+ uprobe_notify_resume(regs);
+
if (thread_info_flags & _TIF_SIGPENDING)
- do_signal_pending(NULL, regs);
+ do_signal(regs);
if (thread_info_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
}
+
+ user_enter();
}
-long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
- unsigned long r5, unsigned long r6, unsigned long r7,
- unsigned long r8, struct pt_regs *regs)
+unsigned long get_tm_stackpointer(struct pt_regs *regs)
{
- return do_sigaltstack(uss, uoss, regs->gpr[1]);
+ /* When in an active transaction that takes a signal, we need to be
+ * careful with the stack. It's possible that the stack has moved back
+ * up after the tbegin. The obvious case here is when the tbegin is
+ * called inside a function that returns before a tend. In this case,
+ * the stack is part of the checkpointed transactional memory state.
+ * If we write over this non transactionally or in suspend, we are in
+ * trouble because if we get a tm abort, the program counter and stack
+ * pointer will be back at the tbegin but our in memory stack won't be
+ * valid anymore.
+ *
+ * To avoid this, when taking a signal in an active transaction, we
+ * need to use the stack pointer from the checkpointed state, rather
+ * than the speculated state. This ensures that the signal context
+ * (written tm suspended) will be written below the stack required for
+ * the rollback. The transaction is aborted becuase of the treclaim,
+ * so any memory written between the tbegin and the signal will be
+ * rolled back anyway.
+ *
+ * For signals taken in non-TM or suspended mode, we use the
+ * normal/non-checkpointed stack pointer.
+ */
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ if (MSR_TM_ACTIVE(regs->msr)) {
+ tm_reclaim_current(TM_CAUSE_SIGNAL);
+ if (MSR_TM_TRANSACTIONAL(regs->msr))
+ return current->thread.ckpt_regs.gpr[1];
+ }
+#endif
+ return regs->gpr[1];
}
diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
index 6c0ddfc0603..c69b9aeb9f2 100644
--- a/arch/powerpc/kernel/signal.h
+++ b/arch/powerpc/kernel/signal.h
@@ -10,13 +10,10 @@
#ifndef _POWERPC_ARCH_SIGNAL_H
#define _POWERPC_ARCH_SIGNAL_H
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+extern void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags);
-extern void do_signal(struct pt_regs *regs, unsigned long thread_info_flags);
-
-extern void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
+extern void __user * get_sigframe(struct k_sigaction *ka, unsigned long sp,
size_t frame_size, int is_32);
-extern void restore_sigmask(sigset_t *set);
extern int handle_signal32(unsigned long sig, struct k_sigaction *ka,
siginfo_t *info, sigset_t *oldset,
@@ -28,13 +25,21 @@ extern int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
extern unsigned long copy_fpr_to_user(void __user *to,
struct task_struct *task);
+extern unsigned long copy_transact_fpr_to_user(void __user *to,
+ struct task_struct *task);
extern unsigned long copy_fpr_from_user(struct task_struct *task,
void __user *from);
+extern unsigned long copy_transact_fpr_from_user(struct task_struct *task,
+ void __user *from);
#ifdef CONFIG_VSX
extern unsigned long copy_vsx_to_user(void __user *to,
struct task_struct *task);
+extern unsigned long copy_transact_vsx_to_user(void __user *to,
+ struct task_struct *task);
extern unsigned long copy_vsx_from_user(struct task_struct *task,
void __user *from);
+extern unsigned long copy_transact_vsx_from_user(struct task_struct *task,
+ void __user *from);
#endif
#ifdef CONFIG_PPC64
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index b96a3a010c2..1bc5a1755ed 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -25,6 +25,7 @@
#include <linux/errno.h>
#include <linux/elf.h>
#include <linux/ptrace.h>
+#include <linux/ratelimit.h>
#ifdef CONFIG_PPC64
#include <linux/syscalls.h>
#include <linux/compat.h>
@@ -34,7 +35,6 @@
#include <linux/stddef.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
-#include <linux/freezer.h>
#endif
#include <asm/uaccess.h>
@@ -42,6 +42,8 @@
#include <asm/syscalls.h>
#include <asm/sigcontext.h>
#include <asm/vdso.h>
+#include <asm/switch_to.h>
+#include <asm/tm.h>
#ifdef CONFIG_PPC64
#include "ppc32.h"
#include <asm/unistd.h>
@@ -52,13 +54,9 @@
#include "signal.h"
-#undef DEBUG_SIG
#ifdef CONFIG_PPC64
-#define sys_sigsuspend compat_sys_sigsuspend
-#define sys_rt_sigsuspend compat_sys_rt_sigsuspend
#define sys_rt_sigreturn compat_sys_rt_sigreturn
-#define sys_sigaction compat_sys_sigaction
#define sys_swapcontext compat_sys_swapcontext
#define sys_sigreturn compat_sys_sigreturn
@@ -67,6 +65,8 @@
#define mcontext mcontext32
#define ucontext ucontext32
+#define __save_altstack __compat_save_altstack
+
/*
* Userspace code may pass a ucontext which doesn't include VSX added
* at the end. We need to check for this case.
@@ -96,7 +96,7 @@ static inline int put_sigset_t(compat_sigset_t __user *uset, sigset_t *set)
compat_sigset_t cset;
switch (_NSIG_WORDS) {
- case 4: cset.sig[5] = set->sig[3] & 0xffffffffull;
+ case 4: cset.sig[6] = set->sig[3] & 0xffffffffull;
cset.sig[7] = set->sig[3] >> 32;
case 3: cset.sig[4] = set->sig[2] & 0xffffffffull;
cset.sig[5] = set->sig[2] >> 32;
@@ -129,23 +129,6 @@ static inline int get_sigset_t(sigset_t *set,
return 0;
}
-static inline int get_old_sigaction(struct k_sigaction *new_ka,
- struct old_sigaction __user *act)
-{
- compat_old_sigset_t mask;
- compat_uptr_t handler, restorer;
-
- if (get_user(handler, &act->sa_handler) ||
- __get_user(restorer, &act->sa_restorer) ||
- __get_user(new_ka->sa.sa_flags, &act->sa_flags) ||
- __get_user(mask, &act->sa_mask))
- return -EFAULT;
- new_ka->sa.sa_handler = compat_ptr(handler);
- new_ka->sa.sa_restorer = compat_ptr(restorer);
- siginitset(&new_ka->sa.sa_mask, mask);
- return 0;
-}
-
#define to_user_ptr(p) ptr_to_compat(p)
#define from_user_ptr(p) compat_ptr(p)
@@ -195,21 +178,6 @@ static inline int get_sigset_t(sigset_t *set, const sigset_t __user *uset)
return copy_from_user(set, uset, sizeof(*uset));
}
-static inline int get_old_sigaction(struct k_sigaction *new_ka,
- struct old_sigaction __user *act)
-{
- old_sigset_t mask;
-
- if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
- __get_user(new_ka->sa.sa_handler, &act->sa_handler) ||
- __get_user(new_ka->sa.sa_restorer, &act->sa_restorer))
- return -EFAULT;
- __get_user(new_ka->sa.sa_flags, &act->sa_flags);
- __get_user(mask, &act->sa_mask);
- siginitset(&new_ka->sa.sa_mask, mask);
- return 0;
-}
-
#define to_user_ptr(p) ((unsigned long)(p))
#define from_user_ptr(p) ((void __user *)(p))
@@ -233,58 +201,8 @@ static inline int restore_general_regs(struct pt_regs *regs,
return -EFAULT;
return 0;
}
-
-#endif /* CONFIG_PPC64 */
-
-/*
- * Atomically swap in the new signal mask, and wait for a signal.
- */
-long sys_sigsuspend(old_sigset_t mask)
-{
- mask &= _BLOCKABLE;
- spin_lock_irq(&current->sighand->siglock);
- current->saved_sigmask = current->blocked;
- siginitset(&current->blocked, mask);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- set_restore_sigmask();
- return -ERESTARTNOHAND;
-}
-
-long sys_sigaction(int sig, struct old_sigaction __user *act,
- struct old_sigaction __user *oact)
-{
- struct k_sigaction new_ka, old_ka;
- int ret;
-
-#ifdef CONFIG_PPC64
- if (sig < 0)
- sig = -sig;
#endif
- if (act) {
- if (get_old_sigaction(&new_ka, act))
- return -EFAULT;
- }
-
- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
- if (!ret && oact) {
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
- __put_user(to_user_ptr(old_ka.sa.sa_handler),
- &oact->sa_handler) ||
- __put_user(to_user_ptr(old_ka.sa.sa_restorer),
- &oact->sa_restorer) ||
- __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
- __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
- return -EFAULT;
- }
-
- return ret;
-}
-
/*
* When we have signals to deliver, we set up on the
* user stack, going down from the original stack pointer:
@@ -300,6 +218,10 @@ long sys_sigaction(int sig, struct old_sigaction __user *act,
struct sigframe {
struct sigcontext sctx; /* the sigcontext */
struct mcontext mctx; /* all the register values */
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ struct sigcontext sctx_transact;
+ struct mcontext mctx_transact;
+#endif
/*
* Programs using the rs6000/xcoff abi can save up to 19 gp
* regs and 18 fp regs below sp before decrementing it.
@@ -328,6 +250,9 @@ struct rt_sigframe {
struct siginfo info;
#endif
struct ucontext uc;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ struct ucontext uc_transact;
+#endif
/*
* Programs using the rs6000/xcoff abi can save up to 19 gp
* regs and 18 fp regs below sp before decrementing it.
@@ -339,27 +264,27 @@ struct rt_sigframe {
unsigned long copy_fpr_to_user(void __user *to,
struct task_struct *task)
{
- double buf[ELF_NFPREG];
+ u64 buf[ELF_NFPREG];
int i;
/* save FPR copy to local buffer then write to the thread_struct */
for (i = 0; i < (ELF_NFPREG - 1) ; i++)
buf[i] = task->thread.TS_FPR(i);
- memcpy(&buf[i], &task->thread.fpscr, sizeof(double));
+ buf[i] = task->thread.fp_state.fpscr;
return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double));
}
unsigned long copy_fpr_from_user(struct task_struct *task,
void __user *from)
{
- double buf[ELF_NFPREG];
+ u64 buf[ELF_NFPREG];
int i;
if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
return 1;
for (i = 0; i < (ELF_NFPREG - 1) ; i++)
task->thread.TS_FPR(i) = buf[i];
- memcpy(&task->thread.fpscr, &buf[i], sizeof(double));
+ task->thread.fp_state.fpscr = buf[i];
return 0;
}
@@ -367,41 +292,112 @@ unsigned long copy_fpr_from_user(struct task_struct *task,
unsigned long copy_vsx_to_user(void __user *to,
struct task_struct *task)
{
- double buf[ELF_NVSRHALFREG];
+ u64 buf[ELF_NVSRHALFREG];
int i;
/* save FPR copy to local buffer then write to the thread_struct */
for (i = 0; i < ELF_NVSRHALFREG; i++)
- buf[i] = task->thread.fpr[i][TS_VSRLOWOFFSET];
+ buf[i] = task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET];
return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double));
}
unsigned long copy_vsx_from_user(struct task_struct *task,
void __user *from)
{
- double buf[ELF_NVSRHALFREG];
+ u64 buf[ELF_NVSRHALFREG];
+ int i;
+
+ if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
+ return 1;
+ for (i = 0; i < ELF_NVSRHALFREG ; i++)
+ task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
+ return 0;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+unsigned long copy_transact_fpr_to_user(void __user *to,
+ struct task_struct *task)
+{
+ u64 buf[ELF_NFPREG];
+ int i;
+
+ /* save FPR copy to local buffer then write to the thread_struct */
+ for (i = 0; i < (ELF_NFPREG - 1) ; i++)
+ buf[i] = task->thread.TS_TRANS_FPR(i);
+ buf[i] = task->thread.transact_fp.fpscr;
+ return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double));
+}
+
+unsigned long copy_transact_fpr_from_user(struct task_struct *task,
+ void __user *from)
+{
+ u64 buf[ELF_NFPREG];
+ int i;
+
+ if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
+ return 1;
+ for (i = 0; i < (ELF_NFPREG - 1) ; i++)
+ task->thread.TS_TRANS_FPR(i) = buf[i];
+ task->thread.transact_fp.fpscr = buf[i];
+
+ return 0;
+}
+
+unsigned long copy_transact_vsx_to_user(void __user *to,
+ struct task_struct *task)
+{
+ u64 buf[ELF_NVSRHALFREG];
+ int i;
+
+ /* save FPR copy to local buffer then write to the thread_struct */
+ for (i = 0; i < ELF_NVSRHALFREG; i++)
+ buf[i] = task->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET];
+ return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double));
+}
+
+unsigned long copy_transact_vsx_from_user(struct task_struct *task,
+ void __user *from)
+{
+ u64 buf[ELF_NVSRHALFREG];
int i;
if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
return 1;
for (i = 0; i < ELF_NVSRHALFREG ; i++)
- task->thread.fpr[i][TS_VSRLOWOFFSET] = buf[i];
+ task->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET] = buf[i];
return 0;
}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
#else
inline unsigned long copy_fpr_to_user(void __user *to,
struct task_struct *task)
{
- return __copy_to_user(to, task->thread.fpr,
+ return __copy_to_user(to, task->thread.fp_state.fpr,
ELF_NFPREG * sizeof(double));
}
inline unsigned long copy_fpr_from_user(struct task_struct *task,
void __user *from)
{
- return __copy_from_user(task->thread.fpr, from,
+ return __copy_from_user(task->thread.fp_state.fpr, from,
ELF_NFPREG * sizeof(double));
}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+inline unsigned long copy_transact_fpr_to_user(void __user *to,
+ struct task_struct *task)
+{
+ return __copy_to_user(to, task->thread.transact_fp.fpr,
+ ELF_NFPREG * sizeof(double));
+}
+
+inline unsigned long copy_transact_fpr_from_user(struct task_struct *task,
+ void __user *from)
+{
+ return __copy_from_user(task->thread.transact_fp.fpr, from,
+ ELF_NFPREG * sizeof(double));
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
#endif
/*
@@ -410,7 +406,8 @@ inline unsigned long copy_fpr_from_user(struct task_struct *task,
* altivec/spe instructions at some point.
*/
static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
- int sigret, int ctx_has_vsx_region)
+ struct mcontext __user *tm_frame, int sigret,
+ int ctx_has_vsx_region)
{
unsigned long msr = regs->msr;
@@ -425,7 +422,7 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
/* save altivec registers */
if (current->thread.used_vr) {
flush_altivec_to_thread(current);
- if (__copy_to_user(&frame->mc_vregs, current->thread.vr,
+ if (__copy_to_user(&frame->mc_vregs, &current->thread.vr_state,
ELF_NVRREG * sizeof(vector128)))
return 1;
/* set MSR_VEC in the saved MSR value to indicate that
@@ -438,12 +435,21 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
* use altivec. Since VSCR only contains 32 bits saved in the least
* significant bits of a vector, we "cheat" and stuff VRSAVE in the
* most significant bits of that same vector. --BenH
+ * Note that the current VRSAVE value is in the SPR at this point.
*/
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ current->thread.vrsave = mfspr(SPRN_VRSAVE);
if (__put_user(current->thread.vrsave, (u32 __user *)&frame->mc_vregs[32]))
return 1;
#endif /* CONFIG_ALTIVEC */
if (copy_fpr_to_user(&frame->mc_fregs, current))
return 1;
+
+ /*
+ * Clear the MSR VSX bit to indicate there is no valid state attached
+ * to this context, except in the specific case below where we set it.
+ */
+ msr &= ~MSR_VSX;
#ifdef CONFIG_VSX
/*
* Copy VSR 0-31 upper half from thread_struct to local
@@ -478,6 +484,164 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
if (__put_user(msr, &frame->mc_gregs[PT_MSR]))
return 1;
+ /* We need to write 0 the MSR top 32 bits in the tm frame so that we
+ * can check it on the restore to see if TM is active
+ */
+ if (tm_frame && __put_user(0, &tm_frame->mc_gregs[PT_MSR]))
+ return 1;
+
+ if (sigret) {
+ /* Set up the sigreturn trampoline: li r0,sigret; sc */
+ if (__put_user(0x38000000UL + sigret, &frame->tramp[0])
+ || __put_user(0x44000002UL, &frame->tramp[1]))
+ return 1;
+ flush_icache_range((unsigned long) &frame->tramp[0],
+ (unsigned long) &frame->tramp[2]);
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Save the current user registers on the user stack.
+ * We only save the altivec/spe registers if the process has used
+ * altivec/spe instructions at some point.
+ * We also save the transactional registers to a second ucontext in the
+ * frame.
+ *
+ * See save_user_regs() and signal_64.c:setup_tm_sigcontexts().
+ */
+static int save_tm_user_regs(struct pt_regs *regs,
+ struct mcontext __user *frame,
+ struct mcontext __user *tm_frame, int sigret)
+{
+ unsigned long msr = regs->msr;
+
+ /* Remove TM bits from thread's MSR. The MSR in the sigcontext
+ * just indicates to userland that we were doing a transaction, but we
+ * don't want to return in transactional state. This also ensures
+ * that flush_fp_to_thread won't set TIF_RESTORE_TM again.
+ */
+ regs->msr &= ~MSR_TS_MASK;
+
+ /* Make sure floating point registers are stored in regs */
+ flush_fp_to_thread(current);
+
+ /* Save both sets of general registers */
+ if (save_general_regs(&current->thread.ckpt_regs, frame)
+ || save_general_regs(regs, tm_frame))
+ return 1;
+
+ /* Stash the top half of the 64bit MSR into the 32bit MSR word
+ * of the transactional mcontext. This way we have a backward-compatible
+ * MSR in the 'normal' (checkpointed) mcontext and additionally one can
+ * also look at what type of transaction (T or S) was active at the
+ * time of the signal.
+ */
+ if (__put_user((msr >> 32), &tm_frame->mc_gregs[PT_MSR]))
+ return 1;
+
+#ifdef CONFIG_ALTIVEC
+ /* save altivec registers */
+ if (current->thread.used_vr) {
+ flush_altivec_to_thread(current);
+ if (__copy_to_user(&frame->mc_vregs, &current->thread.vr_state,
+ ELF_NVRREG * sizeof(vector128)))
+ return 1;
+ if (msr & MSR_VEC) {
+ if (__copy_to_user(&tm_frame->mc_vregs,
+ &current->thread.transact_vr,
+ ELF_NVRREG * sizeof(vector128)))
+ return 1;
+ } else {
+ if (__copy_to_user(&tm_frame->mc_vregs,
+ &current->thread.vr_state,
+ ELF_NVRREG * sizeof(vector128)))
+ return 1;
+ }
+
+ /* set MSR_VEC in the saved MSR value to indicate that
+ * frame->mc_vregs contains valid data
+ */
+ msr |= MSR_VEC;
+ }
+
+ /* We always copy to/from vrsave, it's 0 if we don't have or don't
+ * use altivec. Since VSCR only contains 32 bits saved in the least
+ * significant bits of a vector, we "cheat" and stuff VRSAVE in the
+ * most significant bits of that same vector. --BenH
+ */
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ current->thread.vrsave = mfspr(SPRN_VRSAVE);
+ if (__put_user(current->thread.vrsave,
+ (u32 __user *)&frame->mc_vregs[32]))
+ return 1;
+ if (msr & MSR_VEC) {
+ if (__put_user(current->thread.transact_vrsave,
+ (u32 __user *)&tm_frame->mc_vregs[32]))
+ return 1;
+ } else {
+ if (__put_user(current->thread.vrsave,
+ (u32 __user *)&tm_frame->mc_vregs[32]))
+ return 1;
+ }
+#endif /* CONFIG_ALTIVEC */
+
+ if (copy_fpr_to_user(&frame->mc_fregs, current))
+ return 1;
+ if (msr & MSR_FP) {
+ if (copy_transact_fpr_to_user(&tm_frame->mc_fregs, current))
+ return 1;
+ } else {
+ if (copy_fpr_to_user(&tm_frame->mc_fregs, current))
+ return 1;
+ }
+
+#ifdef CONFIG_VSX
+ /*
+ * Copy VSR 0-31 upper half from thread_struct to local
+ * buffer, then write that to userspace. Also set MSR_VSX in
+ * the saved MSR value to indicate that frame->mc_vregs
+ * contains valid data
+ */
+ if (current->thread.used_vsr) {
+ __giveup_vsx(current);
+ if (copy_vsx_to_user(&frame->mc_vsregs, current))
+ return 1;
+ if (msr & MSR_VSX) {
+ if (copy_transact_vsx_to_user(&tm_frame->mc_vsregs,
+ current))
+ return 1;
+ } else {
+ if (copy_vsx_to_user(&tm_frame->mc_vsregs, current))
+ return 1;
+ }
+
+ msr |= MSR_VSX;
+ }
+#endif /* CONFIG_VSX */
+#ifdef CONFIG_SPE
+ /* SPE regs are not checkpointed with TM, so this section is
+ * simply the same as in save_user_regs().
+ */
+ if (current->thread.used_spe) {
+ flush_spe_to_thread(current);
+ if (__copy_to_user(&frame->mc_vregs, current->thread.evr,
+ ELF_NEVRREG * sizeof(u32)))
+ return 1;
+ /* set MSR_SPE in the saved MSR value to indicate that
+ * frame->mc_vregs contains valid data */
+ msr |= MSR_SPE;
+ }
+
+ /* We always copy to/from spefscr */
+ if (__put_user(current->thread.spefscr, (u32 __user *)&frame->mc_vregs + ELF_NEVRREG))
+ return 1;
+#endif /* CONFIG_SPE */
+
+ if (__put_user(msr, &frame->mc_gregs[PT_MSR]))
+ return 1;
if (sigret) {
/* Set up the sigreturn trampoline: li r0,sigret; sc */
if (__put_user(0x38000000UL + sigret, &frame->tramp[0])
@@ -489,6 +653,7 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
return 0;
}
+#endif
/*
* Restore the current user register values from the user stack,
@@ -539,15 +704,18 @@ static long restore_user_regs(struct pt_regs *regs,
regs->msr &= ~MSR_VEC;
if (msr & MSR_VEC) {
/* restore altivec registers from the stack */
- if (__copy_from_user(current->thread.vr, &sr->mc_vregs,
+ if (__copy_from_user(&current->thread.vr_state, &sr->mc_vregs,
sizeof(sr->mc_vregs)))
return 1;
} else if (current->thread.used_vr)
- memset(current->thread.vr, 0, ELF_NVRREG * sizeof(vector128));
+ memset(&current->thread.vr_state, 0,
+ ELF_NVRREG * sizeof(vector128));
/* Always get VRSAVE back */
if (__get_user(current->thread.vrsave, (u32 __user *)&sr->mc_vregs[32]))
return 1;
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ mtspr(SPRN_VRSAVE, current->thread.vrsave);
#endif /* CONFIG_ALTIVEC */
if (copy_fpr_from_user(current, &sr->mc_fregs))
return 1;
@@ -567,7 +735,7 @@ static long restore_user_regs(struct pt_regs *regs,
return 1;
} else if (current->thread.used_vsr)
for (i = 0; i < 32 ; i++)
- current->thread.fpr[i][TS_VSRLOWOFFSET] = 0;
+ current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
#endif /* CONFIG_VSX */
/*
* force the process to reload the FP registers from
@@ -595,91 +763,151 @@ static long restore_user_regs(struct pt_regs *regs,
return 0;
}
-#ifdef CONFIG_PPC64
-long compat_sys_rt_sigaction(int sig, const struct sigaction32 __user *act,
- struct sigaction32 __user *oact, size_t sigsetsize)
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Restore the current user register values from the user stack, except for
+ * MSR, and recheckpoint the original checkpointed register state for processes
+ * in transactions.
+ */
+static long restore_tm_user_regs(struct pt_regs *regs,
+ struct mcontext __user *sr,
+ struct mcontext __user *tm_sr)
{
- struct k_sigaction new_ka, old_ka;
- int ret;
+ long err;
+ unsigned long msr, msr_hi;
+#ifdef CONFIG_VSX
+ int i;
+#endif
- /* XXX: Don't preclude handling different sized sigset_t's. */
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
+ /*
+ * restore general registers but not including MSR or SOFTE. Also
+ * take care of keeping r2 (TLS) intact if not a signal.
+ * See comment in signal_64.c:restore_tm_sigcontexts();
+ * TFHAR is restored from the checkpointed NIP; TEXASR and TFIAR
+ * were set by the signal delivery.
+ */
+ err = restore_general_regs(regs, tm_sr);
+ err |= restore_general_regs(&current->thread.ckpt_regs, sr);
- if (act) {
- compat_uptr_t handler;
+ err |= __get_user(current->thread.tm_tfhar, &sr->mc_gregs[PT_NIP]);
- ret = get_user(handler, &act->sa_handler);
- new_ka.sa.sa_handler = compat_ptr(handler);
- ret |= get_sigset_t(&new_ka.sa.sa_mask, &act->sa_mask);
- ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags);
- if (ret)
- return -EFAULT;
- }
+ err |= __get_user(msr, &sr->mc_gregs[PT_MSR]);
+ if (err)
+ return 1;
- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
- if (!ret && oact) {
- ret = put_user(to_user_ptr(old_ka.sa.sa_handler), &oact->sa_handler);
- ret |= put_sigset_t(&oact->sa_mask, &old_ka.sa.sa_mask);
- ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
- }
- return ret;
-}
+ /* Restore the previous little-endian mode */
+ regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
-/*
- * Note: it is necessary to treat how as an unsigned int, with the
- * corresponding cast to a signed int to insure that the proper
- * conversion (sign extension) between the register representation
- * of a signed int (msr in 32-bit mode) and the register representation
- * of a signed int (msr in 64-bit mode) is performed.
- */
-long compat_sys_rt_sigprocmask(u32 how, compat_sigset_t __user *set,
- compat_sigset_t __user *oset, size_t sigsetsize)
-{
- sigset_t s;
- sigset_t __user *up;
- int ret;
- mm_segment_t old_fs = get_fs();
+ /*
+ * Do this before updating the thread state in
+ * current->thread.fpr/vr/evr. That way, if we get preempted
+ * and another task grabs the FPU/Altivec/SPE, it won't be
+ * tempted to save the current CPU state into the thread_struct
+ * and corrupt what we are writing there.
+ */
+ discard_lazy_cpu_state();
- if (set) {
- if (get_sigset_t(&s, set))
- return -EFAULT;
+#ifdef CONFIG_ALTIVEC
+ regs->msr &= ~MSR_VEC;
+ if (msr & MSR_VEC) {
+ /* restore altivec registers from the stack */
+ if (__copy_from_user(&current->thread.vr_state, &sr->mc_vregs,
+ sizeof(sr->mc_vregs)) ||
+ __copy_from_user(&current->thread.transact_vr,
+ &tm_sr->mc_vregs,
+ sizeof(sr->mc_vregs)))
+ return 1;
+ } else if (current->thread.used_vr) {
+ memset(&current->thread.vr_state, 0,
+ ELF_NVRREG * sizeof(vector128));
+ memset(&current->thread.transact_vr, 0,
+ ELF_NVRREG * sizeof(vector128));
}
- set_fs(KERNEL_DS);
- /* This is valid because of the set_fs() */
- up = (sigset_t __user *) &s;
- ret = sys_rt_sigprocmask((int)how, set ? up : NULL, oset ? up : NULL,
- sigsetsize);
- set_fs(old_fs);
- if (ret)
- return ret;
- if (oset) {
- if (put_sigset_t(oset, &s))
- return -EFAULT;
- }
- return 0;
-}
+ /* Always get VRSAVE back */
+ if (__get_user(current->thread.vrsave,
+ (u32 __user *)&sr->mc_vregs[32]) ||
+ __get_user(current->thread.transact_vrsave,
+ (u32 __user *)&tm_sr->mc_vregs[32]))
+ return 1;
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ mtspr(SPRN_VRSAVE, current->thread.vrsave);
+#endif /* CONFIG_ALTIVEC */
-long compat_sys_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize)
-{
- sigset_t s;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- /* The __user pointer cast is valid because of the set_fs() */
- ret = sys_rt_sigpending((sigset_t __user *) &s, sigsetsize);
- set_fs(old_fs);
- if (!ret) {
- if (put_sigset_t(set, &s))
- return -EFAULT;
+ regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1);
+
+ if (copy_fpr_from_user(current, &sr->mc_fregs) ||
+ copy_transact_fpr_from_user(current, &tm_sr->mc_fregs))
+ return 1;
+
+#ifdef CONFIG_VSX
+ regs->msr &= ~MSR_VSX;
+ if (msr & MSR_VSX) {
+ /*
+ * Restore altivec registers from the stack to a local
+ * buffer, then write this out to the thread_struct
+ */
+ if (copy_vsx_from_user(current, &sr->mc_vsregs) ||
+ copy_transact_vsx_from_user(current, &tm_sr->mc_vsregs))
+ return 1;
+ } else if (current->thread.used_vsr)
+ for (i = 0; i < 32 ; i++) {
+ current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
+ current->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET] = 0;
+ }
+#endif /* CONFIG_VSX */
+
+#ifdef CONFIG_SPE
+ /* SPE regs are not checkpointed with TM, so this section is
+ * simply the same as in restore_user_regs().
+ */
+ regs->msr &= ~MSR_SPE;
+ if (msr & MSR_SPE) {
+ if (__copy_from_user(current->thread.evr, &sr->mc_vregs,
+ ELF_NEVRREG * sizeof(u32)))
+ return 1;
+ } else if (current->thread.used_spe)
+ memset(current->thread.evr, 0, ELF_NEVRREG * sizeof(u32));
+
+ /* Always get SPEFSCR back */
+ if (__get_user(current->thread.spefscr, (u32 __user *)&sr->mc_vregs
+ + ELF_NEVRREG))
+ return 1;
+#endif /* CONFIG_SPE */
+
+ /* Now, recheckpoint. This loads up all of the checkpointed (older)
+ * registers, including FP and V[S]Rs. After recheckpointing, the
+ * transactional versions should be loaded.
+ */
+ tm_enable();
+ /* Make sure the transaction is marked as failed */
+ current->thread.tm_texasr |= TEXASR_FS;
+ /* This loads the checkpointed FP/VEC state, if used */
+ tm_recheckpoint(&current->thread, msr);
+ /* Get the top half of the MSR */
+ if (__get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR]))
+ return 1;
+ /* Pull in MSR TM from user context */
+ regs->msr = (regs->msr & ~MSR_TS_MASK) | ((msr_hi<<32) & MSR_TS_MASK);
+
+ /* This loads the speculative FP/VEC state, if used */
+ if (msr & MSR_FP) {
+ do_load_up_transact_fpu(&current->thread);
+ regs->msr |= (MSR_FP | current->thread.fpexc_mode);
}
- return ret;
-}
+#ifdef CONFIG_ALTIVEC
+ if (msr & MSR_VEC) {
+ do_load_up_transact_altivec(&current->thread);
+ regs->msr |= MSR_VEC;
+ }
+#endif
+ return 0;
+}
+#endif
-int copy_siginfo_to_user32(struct compat_siginfo __user *d, siginfo_t *s)
+#ifdef CONFIG_PPC64
+int copy_siginfo_to_user32(struct compat_siginfo __user *d, const siginfo_t *s)
{
int err;
@@ -747,79 +975,6 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from)
return 0;
}
-
-/*
- * Note: it is necessary to treat pid and sig as unsigned ints, with the
- * corresponding cast to a signed int to insure that the proper conversion
- * (sign extension) between the register representation of a signed int
- * (msr in 32-bit mode) and the register representation of a signed int
- * (msr in 64-bit mode) is performed.
- */
-long compat_sys_rt_sigqueueinfo(u32 pid, u32 sig, compat_siginfo_t __user *uinfo)
-{
- siginfo_t info;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- ret = copy_siginfo_from_user32(&info, uinfo);
- if (unlikely(ret))
- return ret;
-
- set_fs (KERNEL_DS);
- /* The __user pointer cast is valid becasuse of the set_fs() */
- ret = sys_rt_sigqueueinfo((int)pid, (int)sig, (siginfo_t __user *) &info);
- set_fs (old_fs);
- return ret;
-}
-/*
- * Start Alternate signal stack support
- *
- * System Calls
- * sigaltatck compat_sys_sigaltstack
- */
-
-int compat_sys_sigaltstack(u32 __new, u32 __old, int r5,
- int r6, int r7, int r8, struct pt_regs *regs)
-{
- stack_32_t __user * newstack = compat_ptr(__new);
- stack_32_t __user * oldstack = compat_ptr(__old);
- stack_t uss, uoss;
- int ret;
- mm_segment_t old_fs;
- unsigned long sp;
- compat_uptr_t ss_sp;
-
- /*
- * set sp to the user stack on entry to the system call
- * the system call router sets R9 to the saved registers
- */
- sp = regs->gpr[1];
-
- /* Put new stack info in local 64 bit stack struct */
- if (newstack) {
- if (get_user(ss_sp, &newstack->ss_sp) ||
- __get_user(uss.ss_flags, &newstack->ss_flags) ||
- __get_user(uss.ss_size, &newstack->ss_size))
- return -EFAULT;
- uss.ss_sp = compat_ptr(ss_sp);
- }
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- /* The __user pointer casts are valid because of the set_fs() */
- ret = do_sigaltstack(
- newstack ? (stack_t __user *) &uss : NULL,
- oldstack ? (stack_t __user *) &uoss : NULL,
- sp);
- set_fs(old_fs);
- /* Copy the stack information to the user output buffer */
- if (!ret && oldstack &&
- (put_user(ptr_to_compat(uoss.ss_sp), &oldstack->ss_sp) ||
- __put_user(uoss.ss_flags, &oldstack->ss_flags) ||
- __put_user(uoss.ss_size, &oldstack->ss_size)))
- return -EFAULT;
- return ret;
-}
#endif /* CONFIG_PPC64 */
/*
@@ -832,12 +987,15 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
{
struct rt_sigframe __user *rt_sf;
struct mcontext __user *frame;
+ struct mcontext __user *tm_frame = NULL;
void __user *addr;
unsigned long newsp = 0;
+ int sigret;
+ unsigned long tramp;
/* Set up Signal Frame */
/* Put a Real Time Context onto stack */
- rt_sf = get_sigframe(ka, regs, sizeof(*rt_sf), 1);
+ rt_sf = get_sigframe(ka, get_tm_stackpointer(regs), sizeof(*rt_sf), 1);
addr = rt_sf;
if (unlikely(rt_sf == NULL))
goto badframe;
@@ -845,11 +1003,7 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
/* Put the siginfo & fill in most of the ucontext */
if (copy_siginfo_to_user(&rt_sf->info, info)
|| __put_user(0, &rt_sf->uc.uc_flags)
- || __put_user(0, &rt_sf->uc.uc_link)
- || __put_user(current->sas_ss_sp, &rt_sf->uc.uc_stack.ss_sp)
- || __put_user(sas_ss_flags(regs->gpr[1]),
- &rt_sf->uc.uc_stack.ss_flags)
- || __put_user(current->sas_ss_size, &rt_sf->uc.uc_stack.ss_size)
+ || __save_altstack(&rt_sf->uc.uc_stack, regs->gpr[1])
|| __put_user(to_user_ptr(&rt_sf->uc.uc_mcontext),
&rt_sf->uc.uc_regs)
|| put_sigset_t(&rt_sf->uc.uc_sigmask, oldset))
@@ -859,16 +1013,35 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
frame = &rt_sf->uc.uc_mcontext;
addr = frame;
if (vdso32_rt_sigtramp && current->mm->context.vdso_base) {
- if (save_user_regs(regs, frame, 0, 1))
- goto badframe;
- regs->link = current->mm->context.vdso_base + vdso32_rt_sigtramp;
+ sigret = 0;
+ tramp = current->mm->context.vdso_base + vdso32_rt_sigtramp;
} else {
- if (save_user_regs(regs, frame, __NR_rt_sigreturn, 1))
+ sigret = __NR_rt_sigreturn;
+ tramp = (unsigned long) frame->tramp;
+ }
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ tm_frame = &rt_sf->uc_transact.uc_mcontext;
+ if (MSR_TM_ACTIVE(regs->msr)) {
+ if (__put_user((unsigned long)&rt_sf->uc_transact,
+ &rt_sf->uc.uc_link) ||
+ __put_user((unsigned long)tm_frame,
+ &rt_sf->uc_transact.uc_regs))
+ goto badframe;
+ if (save_tm_user_regs(regs, frame, tm_frame, sigret))
goto badframe;
- regs->link = (unsigned long) frame->tramp;
}
+ else
+#endif
+ {
+ if (__put_user(0, &rt_sf->uc.uc_link))
+ goto badframe;
+ if (save_user_regs(regs, frame, tm_frame, sigret, 1))
+ goto badframe;
+ }
+ regs->link = tramp;
- current->thread.fpscr.val = 0; /* turn off all fp exceptions */
+ current->thread.fp_state.fpscr = 0; /* turn off all fp exceptions */
/* create a stack frame for the caller of the handler */
newsp = ((unsigned long)rt_sf) - (__SIGNAL_FRAMESIZE + 16);
@@ -883,20 +1056,18 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
regs->gpr[5] = (unsigned long) &rt_sf->uc;
regs->gpr[6] = (unsigned long) rt_sf;
regs->nip = (unsigned long) ka->sa.sa_handler;
- /* enter the signal handler in big-endian mode */
+ /* enter the signal handler in native-endian mode */
regs->msr &= ~MSR_LE;
+ regs->msr |= (MSR_KERNEL & MSR_LE);
return 1;
badframe:
-#ifdef DEBUG_SIG
- printk("badframe in handle_rt_signal, regs=%p frame=%p newsp=%lx\n",
- regs, frame, newsp);
-#endif
- if (show_unhandled_signals && printk_ratelimit())
- printk(KERN_INFO "%s[%d]: bad frame in handle_rt_signal32: "
- "%p nip %08lx lr %08lx\n",
- current->comm, current->pid,
- addr, regs->nip, regs->link);
+ if (show_unhandled_signals)
+ printk_ratelimited(KERN_INFO
+ "%s[%d]: bad frame in handle_rt_signal32: "
+ "%p nip %08lx lr %08lx\n",
+ current->comm, current->pid,
+ addr, regs->nip, regs->link);
force_sigsegv(sig, current);
return 0;
@@ -924,13 +1095,42 @@ static int do_setcontext(struct ucontext __user *ucp, struct pt_regs *regs, int
if (!access_ok(VERIFY_READ, mcp, sizeof(*mcp)))
return -EFAULT;
#endif
- restore_sigmask(&set);
+ set_current_blocked(&set);
if (restore_user_regs(regs, mcp, sig))
return -EFAULT;
return 0;
}
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static int do_setcontext_tm(struct ucontext __user *ucp,
+ struct ucontext __user *tm_ucp,
+ struct pt_regs *regs)
+{
+ sigset_t set;
+ struct mcontext __user *mcp;
+ struct mcontext __user *tm_mcp;
+ u32 cmcp;
+ u32 tm_cmcp;
+
+ if (get_sigset_t(&set, &ucp->uc_sigmask))
+ return -EFAULT;
+
+ if (__get_user(cmcp, &ucp->uc_regs) ||
+ __get_user(tm_cmcp, &tm_ucp->uc_regs))
+ return -EFAULT;
+ mcp = (struct mcontext __user *)(u64)cmcp;
+ tm_mcp = (struct mcontext __user *)(u64)tm_cmcp;
+ /* no need to check access_ok(mcp), since mcp < 4GB */
+
+ set_current_blocked(&set);
+ if (restore_tm_user_regs(regs, mcp, tm_mcp))
+ return -EFAULT;
+
+ return 0;
+}
+#endif
+
long sys_swapcontext(struct ucontext __user *old_ctx,
struct ucontext __user *new_ctx,
int ctx_size, int r6, int r7, int r8, struct pt_regs *regs)
@@ -992,7 +1192,7 @@ long sys_swapcontext(struct ucontext __user *old_ctx,
mctx = (struct mcontext __user *)
((unsigned long) &old_ctx->uc_mcontext & ~0xfUL);
if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size)
- || save_user_regs(regs, mctx, 0, ctx_has_vsx_region)
+ || save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region)
|| put_sigset_t(&old_ctx->uc_sigmask, &current->blocked)
|| __put_user(to_user_ptr(mctx), &old_ctx->uc_regs))
return -EFAULT;
@@ -1026,7 +1226,12 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
struct pt_regs *regs)
{
struct rt_sigframe __user *rt_sf;
-
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ struct ucontext __user *uc_transact;
+ unsigned long msr_hi;
+ unsigned long tmp;
+ int tm_restore = 0;
+#endif
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
@@ -1034,6 +1239,34 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
(regs->gpr[1] + __SIGNAL_FRAMESIZE + 16);
if (!access_ok(VERIFY_READ, rt_sf, sizeof(*rt_sf)))
goto bad;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ if (__get_user(tmp, &rt_sf->uc.uc_link))
+ goto bad;
+ uc_transact = (struct ucontext __user *)(uintptr_t)tmp;
+ if (uc_transact) {
+ u32 cmcp;
+ struct mcontext __user *mcp;
+
+ if (__get_user(cmcp, &uc_transact->uc_regs))
+ return -EFAULT;
+ mcp = (struct mcontext __user *)(u64)cmcp;
+ /* The top 32 bits of the MSR are stashed in the transactional
+ * ucontext. */
+ if (__get_user(msr_hi, &mcp->mc_gregs[PT_MSR]))
+ goto bad;
+
+ if (MSR_TM_ACTIVE(msr_hi<<32)) {
+ /* We only recheckpoint on return if we're
+ * transaction.
+ */
+ tm_restore = 1;
+ if (do_setcontext_tm(&rt_sf->uc, uc_transact, regs))
+ goto bad;
+ }
+ }
+ if (!tm_restore)
+ /* Fall through, for non-TM restore */
+#endif
if (do_setcontext(&rt_sf->uc, regs, 1))
goto bad;
@@ -1045,24 +1278,22 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
* change it. -- paulus
*/
#ifdef CONFIG_PPC64
- /*
- * We use the compat_sys_ version that does the 32/64 bits conversion
- * and takes userland pointer directly. What about error checking ?
- * nobody does any...
- */
- compat_sys_sigaltstack((u32)(u64)&rt_sf->uc.uc_stack, 0, 0, 0, 0, 0, regs);
+ if (compat_restore_altstack(&rt_sf->uc.uc_stack))
+ goto bad;
#else
- do_sigaltstack(&rt_sf->uc.uc_stack, NULL, regs->gpr[1]);
+ if (restore_altstack(&rt_sf->uc.uc_stack))
+ goto bad;
#endif
set_thread_flag(TIF_RESTOREALL);
return 0;
bad:
- if (show_unhandled_signals && printk_ratelimit())
- printk(KERN_INFO "%s[%d]: bad frame in sys_rt_sigreturn: "
- "%p nip %08lx lr %08lx\n",
- current->comm, current->pid,
- rt_sf, regs->nip, regs->link);
+ if (show_unhandled_signals)
+ printk_ratelimited(KERN_INFO
+ "%s[%d]: bad frame in sys_rt_sigreturn: "
+ "%p nip %08lx lr %08lx\n",
+ current->comm, current->pid,
+ rt_sf, regs->nip, regs->link);
force_sig(SIGSEGV, current);
return 0;
@@ -1079,7 +1310,7 @@ int sys_debug_setcontext(struct ucontext __user *ctx,
unsigned char tmp;
unsigned long new_msr = regs->msr;
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
- unsigned long new_dbcr0 = current->thread.dbcr0;
+ unsigned long new_dbcr0 = current->thread.debug.dbcr0;
#endif
for (i=0; i<ndbg; i++) {
@@ -1094,7 +1325,7 @@ int sys_debug_setcontext(struct ucontext __user *ctx,
} else {
new_dbcr0 &= ~DBCR0_IC;
if (!DBCR_ACTIVE_EVENTS(new_dbcr0,
- current->thread.dbcr1)) {
+ current->thread.debug.dbcr1)) {
new_msr &= ~MSR_DE;
new_dbcr0 &= ~DBCR0_IDM;
}
@@ -1129,7 +1360,7 @@ int sys_debug_setcontext(struct ucontext __user *ctx,
the user is really doing something wrong. */
regs->msr = new_msr;
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
- current->thread.dbcr0 = new_dbcr0;
+ current->thread.debug.dbcr0 = new_dbcr0;
#endif
if (!access_ok(VERIFY_READ, ctx, sizeof(*ctx))
@@ -1149,12 +1380,12 @@ int sys_debug_setcontext(struct ucontext __user *ctx,
* We kill the task with a SIGSEGV in this situation.
*/
if (do_setcontext(ctx, regs, 1)) {
- if (show_unhandled_signals && printk_ratelimit())
- printk(KERN_INFO "%s[%d]: bad frame in "
- "sys_debug_setcontext: %p nip %08lx "
- "lr %08lx\n",
- current->comm, current->pid,
- ctx, regs->nip, regs->link);
+ if (show_unhandled_signals)
+ printk_ratelimited(KERN_INFO "%s[%d]: bad frame in "
+ "sys_debug_setcontext: %p nip %08lx "
+ "lr %08lx\n",
+ current->comm, current->pid,
+ ctx, regs->nip, regs->link);
force_sig(SIGSEGV, current);
goto out;
@@ -1167,7 +1398,7 @@ int sys_debug_setcontext(struct ucontext __user *ctx,
* always done it up until now so it is probably better not to
* change it. -- paulus
*/
- do_sigaltstack(&ctx->uc_stack, NULL, regs->gpr[1]);
+ restore_altstack(&ctx->uc_stack);
set_thread_flag(TIF_RESTOREALL);
out:
@@ -1183,10 +1414,13 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
{
struct sigcontext __user *sc;
struct sigframe __user *frame;
+ struct mcontext __user *tm_mctx = NULL;
unsigned long newsp = 0;
+ int sigret;
+ unsigned long tramp;
/* Set up Signal Frame */
- frame = get_sigframe(ka, regs, sizeof(*frame), 1);
+ frame = get_sigframe(ka, get_tm_stackpointer(regs), sizeof(*frame), 1);
if (unlikely(frame == NULL))
goto badframe;
sc = (struct sigcontext __user *) &frame->sctx;
@@ -1206,16 +1440,30 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
goto badframe;
if (vdso32_sigtramp && current->mm->context.vdso_base) {
- if (save_user_regs(regs, &frame->mctx, 0, 1))
- goto badframe;
- regs->link = current->mm->context.vdso_base + vdso32_sigtramp;
+ sigret = 0;
+ tramp = current->mm->context.vdso_base + vdso32_sigtramp;
} else {
- if (save_user_regs(regs, &frame->mctx, __NR_sigreturn, 1))
+ sigret = __NR_sigreturn;
+ tramp = (unsigned long) frame->mctx.tramp;
+ }
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ tm_mctx = &frame->mctx_transact;
+ if (MSR_TM_ACTIVE(regs->msr)) {
+ if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact,
+ sigret))
+ goto badframe;
+ }
+ else
+#endif
+ {
+ if (save_user_regs(regs, &frame->mctx, tm_mctx, sigret, 1))
goto badframe;
- regs->link = (unsigned long) frame->mctx.tramp;
}
- current->thread.fpscr.val = 0; /* turn off all fp exceptions */
+ regs->link = tramp;
+
+ current->thread.fp_state.fpscr = 0; /* turn off all fp exceptions */
/* create a stack frame for the caller of the handler */
newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
@@ -1228,19 +1476,15 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
regs->nip = (unsigned long) ka->sa.sa_handler;
/* enter the signal handler in big-endian mode */
regs->msr &= ~MSR_LE;
-
return 1;
badframe:
-#ifdef DEBUG_SIG
- printk("badframe in handle_signal, regs=%p frame=%p newsp=%lx\n",
- regs, frame, newsp);
-#endif
- if (show_unhandled_signals && printk_ratelimit())
- printk(KERN_INFO "%s[%d]: bad frame in handle_signal32: "
- "%p nip %08lx lr %08lx\n",
- current->comm, current->pid,
- frame, regs->nip, regs->link);
+ if (show_unhandled_signals)
+ printk_ratelimited(KERN_INFO
+ "%s[%d]: bad frame in handle_signal32: "
+ "%p nip %08lx lr %08lx\n",
+ current->comm, current->pid,
+ frame, regs->nip, regs->link);
force_sigsegv(sig, current);
return 0;
@@ -1252,16 +1496,22 @@ badframe:
long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
struct pt_regs *regs)
{
+ struct sigframe __user *sf;
struct sigcontext __user *sc;
struct sigcontext sigctx;
struct mcontext __user *sr;
void __user *addr;
sigset_t set;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ struct mcontext __user *mcp, *tm_mcp;
+ unsigned long msr_hi;
+#endif
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
- sc = (struct sigcontext __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE);
+ sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE);
+ sc = &sf->sctx;
addr = sc;
if (copy_from_user(&sigctx, sc, sizeof(sigctx)))
goto badframe;
@@ -1276,23 +1526,38 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
set.sig[0] = sigctx.oldmask;
set.sig[1] = sigctx._unused[3];
#endif
- restore_sigmask(&set);
+ set_current_blocked(&set);
- sr = (struct mcontext __user *)from_user_ptr(sigctx.regs);
- addr = sr;
- if (!access_ok(VERIFY_READ, sr, sizeof(*sr))
- || restore_user_regs(regs, sr, 1))
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ mcp = (struct mcontext __user *)&sf->mctx;
+ tm_mcp = (struct mcontext __user *)&sf->mctx_transact;
+ if (__get_user(msr_hi, &tm_mcp->mc_gregs[PT_MSR]))
goto badframe;
+ if (MSR_TM_ACTIVE(msr_hi<<32)) {
+ if (!cpu_has_feature(CPU_FTR_TM))
+ goto badframe;
+ if (restore_tm_user_regs(regs, mcp, tm_mcp))
+ goto badframe;
+ } else
+#endif
+ {
+ sr = (struct mcontext __user *)from_user_ptr(sigctx.regs);
+ addr = sr;
+ if (!access_ok(VERIFY_READ, sr, sizeof(*sr))
+ || restore_user_regs(regs, sr, 1))
+ goto badframe;
+ }
set_thread_flag(TIF_RESTOREALL);
return 0;
badframe:
- if (show_unhandled_signals && printk_ratelimit())
- printk(KERN_INFO "%s[%d]: bad frame in sys_sigreturn: "
- "%p nip %08lx lr %08lx\n",
- current->comm, current->pid,
- addr, regs->nip, regs->link);
+ if (show_unhandled_signals)
+ printk_ratelimited(KERN_INFO
+ "%s[%d]: bad frame in sys_sigreturn: "
+ "%p nip %08lx lr %08lx\n",
+ current->comm, current->pid,
+ addr, regs->nip, regs->link);
force_sig(SIGSEGV, current);
return 0;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 27c4a4584f8..97c1e4b683f 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -23,7 +23,7 @@
#include <linux/stddef.h>
#include <linux/elf.h>
#include <linux/ptrace.h>
-#include <linux/module.h>
+#include <linux/ratelimit.h>
#include <asm/sigcontext.h>
#include <asm/ucontext.h>
@@ -33,10 +33,11 @@
#include <asm/cacheflush.h>
#include <asm/syscalls.h>
#include <asm/vdso.h>
+#include <asm/switch_to.h>
+#include <asm/tm.h>
#include "signal.h"
-#define DEBUG_SIG 0
#define GP_REGS_SIZE min(sizeof(elf_gregset_t), sizeof(struct pt_regs))
#define FP_REGS_SIZE sizeof(elf_fpregset_t)
@@ -55,13 +56,16 @@
struct rt_sigframe {
/* sys_rt_sigreturn requires the ucontext be the first field */
struct ucontext uc;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ struct ucontext uc_transact;
+#endif
unsigned long _unused[2];
unsigned int tramp[TRAMP_SIZE];
struct siginfo __user *pinfo;
void __user *puc;
struct siginfo info;
- /* 64 bit ABI allows for 288 bytes below sp before decrementing it. */
- char abigap[288];
+ /* New 64 bit little-endian ABI allows redzone of 512 bytes below sp */
+ char abigap[USER_REDZONE_SIZE];
} __attribute__ ((aligned (16)));
static const char fmt32[] = KERN_INFO \
@@ -82,7 +86,7 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
* the context). This is very important because we must ensure we
* don't lose the VRSAVE content that may have been set prior to
* the process doing its first vector operation
- * Userland shall check AT_HWCAP to know wether it can rely on the
+ * Userland shall check AT_HWCAP to know whether it can rely on the
* v_regs pointer or not
*/
#ifdef CONFIG_ALTIVEC
@@ -91,8 +95,6 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
unsigned long msr = regs->msr;
long err = 0;
- flush_fp_to_thread(current);
-
#ifdef CONFIG_ALTIVEC
err |= __put_user(v_regs, &sc->v_regs);
@@ -100,7 +102,8 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
if (current->thread.used_vr) {
flush_altivec_to_thread(current);
/* Copy 33 vec registers (vr0..31 and vscr) to the stack */
- err |= __copy_to_user(v_regs, current->thread.vr, 33 * sizeof(vector128));
+ err |= __copy_to_user(v_regs, &current->thread.vr_state,
+ 33 * sizeof(vector128));
/* set MSR_VEC in the MSR value in the frame to indicate that sc->v_reg)
* contains valid data.
*/
@@ -109,6 +112,8 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
/* We always copy to/from vrsave, it's 0 if we don't have or don't
* use altivec.
*/
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ current->thread.vrsave = mfspr(SPRN_VRSAVE);
err |= __put_user(current->thread.vrsave, (u32 __user *)&v_regs[33]);
#else /* CONFIG_ALTIVEC */
err |= __put_user(0, &sc->v_regs);
@@ -116,6 +121,12 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
flush_fp_to_thread(current);
/* copy fpr regs and fpscr */
err |= copy_fpr_to_user(&sc->fp_regs, current);
+
+ /*
+ * Clear the MSR VSX bit to indicate there is no valid state attached
+ * to this context, except in the specific case below where we set it.
+ */
+ msr &= ~MSR_VSX;
#ifdef CONFIG_VSX
/*
* Copy VSX low doubleword to local buffer for formatting,
@@ -144,6 +155,145 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
return err;
}
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * As above, but Transactional Memory is in use, so deliver sigcontexts
+ * containing checkpointed and transactional register states.
+ *
+ * To do this, we treclaim (done before entering here) to gather both sets of
+ * registers and set up the 'normal' sigcontext registers with rolled-back
+ * register values such that a simple signal handler sees a correct
+ * checkpointed register state. If interested, a TM-aware sighandler can
+ * examine the transactional registers in the 2nd sigcontext to determine the
+ * real origin of the signal.
+ */
+static long setup_tm_sigcontexts(struct sigcontext __user *sc,
+ struct sigcontext __user *tm_sc,
+ struct pt_regs *regs,
+ int signr, sigset_t *set, unsigned long handler)
+{
+ /* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
+ * process never used altivec yet (MSR_VEC is zero in pt_regs of
+ * the context). This is very important because we must ensure we
+ * don't lose the VRSAVE content that may have been set prior to
+ * the process doing its first vector operation
+ * Userland shall check AT_HWCAP to know wether it can rely on the
+ * v_regs pointer or not.
+ */
+#ifdef CONFIG_ALTIVEC
+ elf_vrreg_t __user *v_regs = (elf_vrreg_t __user *)
+ (((unsigned long)sc->vmx_reserve + 15) & ~0xful);
+ elf_vrreg_t __user *tm_v_regs = (elf_vrreg_t __user *)
+ (((unsigned long)tm_sc->vmx_reserve + 15) & ~0xful);
+#endif
+ unsigned long msr = regs->msr;
+ long err = 0;
+
+ BUG_ON(!MSR_TM_ACTIVE(regs->msr));
+
+ /* Remove TM bits from thread's MSR. The MSR in the sigcontext
+ * just indicates to userland that we were doing a transaction, but we
+ * don't want to return in transactional state. This also ensures
+ * that flush_fp_to_thread won't set TIF_RESTORE_TM again.
+ */
+ regs->msr &= ~MSR_TS_MASK;
+
+ flush_fp_to_thread(current);
+
+#ifdef CONFIG_ALTIVEC
+ err |= __put_user(v_regs, &sc->v_regs);
+ err |= __put_user(tm_v_regs, &tm_sc->v_regs);
+
+ /* save altivec registers */
+ if (current->thread.used_vr) {
+ flush_altivec_to_thread(current);
+ /* Copy 33 vec registers (vr0..31 and vscr) to the stack */
+ err |= __copy_to_user(v_regs, &current->thread.vr_state,
+ 33 * sizeof(vector128));
+ /* If VEC was enabled there are transactional VRs valid too,
+ * else they're a copy of the checkpointed VRs.
+ */
+ if (msr & MSR_VEC)
+ err |= __copy_to_user(tm_v_regs,
+ &current->thread.transact_vr,
+ 33 * sizeof(vector128));
+ else
+ err |= __copy_to_user(tm_v_regs,
+ &current->thread.vr_state,
+ 33 * sizeof(vector128));
+
+ /* set MSR_VEC in the MSR value in the frame to indicate
+ * that sc->v_reg contains valid data.
+ */
+ msr |= MSR_VEC;
+ }
+ /* We always copy to/from vrsave, it's 0 if we don't have or don't
+ * use altivec.
+ */
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ current->thread.vrsave = mfspr(SPRN_VRSAVE);
+ err |= __put_user(current->thread.vrsave, (u32 __user *)&v_regs[33]);
+ if (msr & MSR_VEC)
+ err |= __put_user(current->thread.transact_vrsave,
+ (u32 __user *)&tm_v_regs[33]);
+ else
+ err |= __put_user(current->thread.vrsave,
+ (u32 __user *)&tm_v_regs[33]);
+
+#else /* CONFIG_ALTIVEC */
+ err |= __put_user(0, &sc->v_regs);
+ err |= __put_user(0, &tm_sc->v_regs);
+#endif /* CONFIG_ALTIVEC */
+
+ /* copy fpr regs and fpscr */
+ err |= copy_fpr_to_user(&sc->fp_regs, current);
+ if (msr & MSR_FP)
+ err |= copy_transact_fpr_to_user(&tm_sc->fp_regs, current);
+ else
+ err |= copy_fpr_to_user(&tm_sc->fp_regs, current);
+
+#ifdef CONFIG_VSX
+ /*
+ * Copy VSX low doubleword to local buffer for formatting,
+ * then out to userspace. Update v_regs to point after the
+ * VMX data.
+ */
+ if (current->thread.used_vsr) {
+ __giveup_vsx(current);
+ v_regs += ELF_NVRREG;
+ tm_v_regs += ELF_NVRREG;
+
+ err |= copy_vsx_to_user(v_regs, current);
+
+ if (msr & MSR_VSX)
+ err |= copy_transact_vsx_to_user(tm_v_regs, current);
+ else
+ err |= copy_vsx_to_user(tm_v_regs, current);
+
+ /* set MSR_VSX in the MSR value in the frame to
+ * indicate that sc->vs_reg) contains valid data.
+ */
+ msr |= MSR_VSX;
+ }
+#endif /* CONFIG_VSX */
+
+ err |= __put_user(&sc->gp_regs, &sc->regs);
+ err |= __put_user(&tm_sc->gp_regs, &tm_sc->regs);
+ WARN_ON(!FULL_REGS(regs));
+ err |= __copy_to_user(&tm_sc->gp_regs, regs, GP_REGS_SIZE);
+ err |= __copy_to_user(&sc->gp_regs,
+ &current->thread.ckpt_regs, GP_REGS_SIZE);
+ err |= __put_user(msr, &tm_sc->gp_regs[PT_MSR]);
+ err |= __put_user(msr, &sc->gp_regs[PT_MSR]);
+ err |= __put_user(signr, &sc->signal);
+ err |= __put_user(handler, &sc->handler);
+ if (set != NULL)
+ err |= __put_user(set->sig[0], &sc->oldmask);
+
+ return err;
+}
+#endif
+
/*
* Restore the sigcontext from the signal frame.
*/
@@ -211,16 +361,18 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128)))
return -EFAULT;
/* Copy 33 vec registers (vr0..31 and vscr) from the stack */
- if (v_regs != 0 && (msr & MSR_VEC) != 0)
- err |= __copy_from_user(current->thread.vr, v_regs,
+ if (v_regs != NULL && (msr & MSR_VEC) != 0)
+ err |= __copy_from_user(&current->thread.vr_state, v_regs,
33 * sizeof(vector128));
else if (current->thread.used_vr)
- memset(current->thread.vr, 0, 33 * sizeof(vector128));
+ memset(&current->thread.vr_state, 0, 33 * sizeof(vector128));
/* Always get VRSAVE back */
- if (v_regs != 0)
+ if (v_regs != NULL)
err |= __get_user(current->thread.vrsave, (u32 __user *)&v_regs[33]);
else
current->thread.vrsave = 0;
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ mtspr(SPRN_VRSAVE, current->thread.vrsave);
#endif /* CONFIG_ALTIVEC */
/* restore floating point */
err |= copy_fpr_from_user(current, &sc->fp_regs);
@@ -235,10 +387,165 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
err |= copy_vsx_from_user(current, v_regs);
else
for (i = 0; i < 32 ; i++)
- current->thread.fpr[i][TS_VSRLOWOFFSET] = 0;
+ current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
+#endif
+ return err;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Restore the two sigcontexts from the frame of a transactional processes.
+ */
+
+static long restore_tm_sigcontexts(struct pt_regs *regs,
+ struct sigcontext __user *sc,
+ struct sigcontext __user *tm_sc)
+{
+#ifdef CONFIG_ALTIVEC
+ elf_vrreg_t __user *v_regs, *tm_v_regs;
+#endif
+ unsigned long err = 0;
+ unsigned long msr;
+#ifdef CONFIG_VSX
+ int i;
#endif
+ /* copy the GPRs */
+ err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr));
+ err |= __copy_from_user(&current->thread.ckpt_regs, sc->gp_regs,
+ sizeof(regs->gpr));
+
+ /*
+ * TFHAR is restored from the checkpointed 'wound-back' ucontext's NIP.
+ * TEXASR was set by the signal delivery reclaim, as was TFIAR.
+ * Users doing anything abhorrent like thread-switching w/ signals for
+ * TM-Suspended code will have to back TEXASR/TFIAR up themselves.
+ * For the case of getting a signal and simply returning from it,
+ * we don't need to re-copy them here.
+ */
+ err |= __get_user(regs->nip, &tm_sc->gp_regs[PT_NIP]);
+ err |= __get_user(current->thread.tm_tfhar, &sc->gp_regs[PT_NIP]);
+
+ /* get MSR separately, transfer the LE bit if doing signal return */
+ err |= __get_user(msr, &sc->gp_regs[PT_MSR]);
+ /* pull in MSR TM from user context */
+ regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK);
+
+ /* pull in MSR LE from user context */
+ regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
+
+ /* The following non-GPR non-FPR non-VR state is also checkpointed: */
+ err |= __get_user(regs->ctr, &tm_sc->gp_regs[PT_CTR]);
+ err |= __get_user(regs->link, &tm_sc->gp_regs[PT_LNK]);
+ err |= __get_user(regs->xer, &tm_sc->gp_regs[PT_XER]);
+ err |= __get_user(regs->ccr, &tm_sc->gp_regs[PT_CCR]);
+ err |= __get_user(current->thread.ckpt_regs.ctr,
+ &sc->gp_regs[PT_CTR]);
+ err |= __get_user(current->thread.ckpt_regs.link,
+ &sc->gp_regs[PT_LNK]);
+ err |= __get_user(current->thread.ckpt_regs.xer,
+ &sc->gp_regs[PT_XER]);
+ err |= __get_user(current->thread.ckpt_regs.ccr,
+ &sc->gp_regs[PT_CCR]);
+
+ /* These regs are not checkpointed; they can go in 'regs'. */
+ err |= __get_user(regs->trap, &sc->gp_regs[PT_TRAP]);
+ err |= __get_user(regs->dar, &sc->gp_regs[PT_DAR]);
+ err |= __get_user(regs->dsisr, &sc->gp_regs[PT_DSISR]);
+ err |= __get_user(regs->result, &sc->gp_regs[PT_RESULT]);
+
+ /*
+ * Do this before updating the thread state in
+ * current->thread.fpr/vr. That way, if we get preempted
+ * and another task grabs the FPU/Altivec, it won't be
+ * tempted to save the current CPU state into the thread_struct
+ * and corrupt what we are writing there.
+ */
+ discard_lazy_cpu_state();
+
+ /*
+ * Force reload of FP/VEC.
+ * This has to be done before copying stuff into current->thread.fpr/vr
+ * for the reasons explained in the previous comment.
+ */
+ regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX);
+
+#ifdef CONFIG_ALTIVEC
+ err |= __get_user(v_regs, &sc->v_regs);
+ err |= __get_user(tm_v_regs, &tm_sc->v_regs);
+ if (err)
+ return err;
+ if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128)))
+ return -EFAULT;
+ if (tm_v_regs && !access_ok(VERIFY_READ,
+ tm_v_regs, 34 * sizeof(vector128)))
+ return -EFAULT;
+ /* Copy 33 vec registers (vr0..31 and vscr) from the stack */
+ if (v_regs != NULL && tm_v_regs != NULL && (msr & MSR_VEC) != 0) {
+ err |= __copy_from_user(&current->thread.vr_state, v_regs,
+ 33 * sizeof(vector128));
+ err |= __copy_from_user(&current->thread.transact_vr, tm_v_regs,
+ 33 * sizeof(vector128));
+ }
+ else if (current->thread.used_vr) {
+ memset(&current->thread.vr_state, 0, 33 * sizeof(vector128));
+ memset(&current->thread.transact_vr, 0, 33 * sizeof(vector128));
+ }
+ /* Always get VRSAVE back */
+ if (v_regs != NULL && tm_v_regs != NULL) {
+ err |= __get_user(current->thread.vrsave,
+ (u32 __user *)&v_regs[33]);
+ err |= __get_user(current->thread.transact_vrsave,
+ (u32 __user *)&tm_v_regs[33]);
+ }
+ else {
+ current->thread.vrsave = 0;
+ current->thread.transact_vrsave = 0;
+ }
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ mtspr(SPRN_VRSAVE, current->thread.vrsave);
+#endif /* CONFIG_ALTIVEC */
+ /* restore floating point */
+ err |= copy_fpr_from_user(current, &sc->fp_regs);
+ err |= copy_transact_fpr_from_user(current, &tm_sc->fp_regs);
+#ifdef CONFIG_VSX
+ /*
+ * Get additional VSX data. Update v_regs to point after the
+ * VMX data. Copy VSX low doubleword from userspace to local
+ * buffer for formatting, then into the taskstruct.
+ */
+ if (v_regs && ((msr & MSR_VSX) != 0)) {
+ v_regs += ELF_NVRREG;
+ tm_v_regs += ELF_NVRREG;
+ err |= copy_vsx_from_user(current, v_regs);
+ err |= copy_transact_vsx_from_user(current, tm_v_regs);
+ } else {
+ for (i = 0; i < 32 ; i++) {
+ current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
+ current->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET] = 0;
+ }
+ }
+#endif
+ tm_enable();
+ /* Make sure the transaction is marked as failed */
+ current->thread.tm_texasr |= TEXASR_FS;
+ /* This loads the checkpointed FP/VEC state, if used */
+ tm_recheckpoint(&current->thread, msr);
+
+ /* This loads the speculative FP/VEC state, if used */
+ if (msr & MSR_FP) {
+ do_load_up_transact_fpu(&current->thread);
+ regs->msr |= (MSR_FP | current->thread.fpexc_mode);
+ }
+#ifdef CONFIG_ALTIVEC
+ if (msr & MSR_VEC) {
+ do_load_up_transact_altivec(&current->thread);
+ regs->msr |= MSR_VEC;
+ }
+#endif
+
return err;
}
+#endif
/*
* Setup the trampoline code on the stack
@@ -334,7 +641,7 @@ int sys_swapcontext(struct ucontext __user *old_ctx,
if (__copy_from_user(&set, &new_ctx->uc_sigmask, sizeof(set)))
do_exit(SIGSEGV);
- restore_sigmask(&set);
+ set_current_blocked(&set);
if (restore_sigcontext(regs, NULL, 0, &new_ctx->uc_mcontext))
do_exit(SIGSEGV);
@@ -354,6 +661,9 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5,
{
struct ucontext __user *uc = (struct ucontext __user *)regs->gpr[1];
sigset_t set;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ unsigned long msr;
+#endif
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
@@ -363,27 +673,36 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5,
if (__copy_from_user(&set, &uc->uc_sigmask, sizeof(set)))
goto badframe;
- restore_sigmask(&set);
+ set_current_blocked(&set);
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR]))
+ goto badframe;
+ if (MSR_TM_ACTIVE(msr)) {
+ /* We recheckpoint on return. */
+ struct ucontext __user *uc_transact;
+ if (__get_user(uc_transact, &uc->uc_link))
+ goto badframe;
+ if (restore_tm_sigcontexts(regs, &uc->uc_mcontext,
+ &uc_transact->uc_mcontext))
+ goto badframe;
+ }
+ else
+ /* Fall through, for non-TM restore */
+#endif
if (restore_sigcontext(regs, NULL, 1, &uc->uc_mcontext))
goto badframe;
- /* do_sigaltstack expects a __user pointer and won't modify
- * what's in there anyway
- */
- do_sigaltstack(&uc->uc_stack, NULL, regs->gpr[1]);
+ if (restore_altstack(&uc->uc_stack))
+ goto badframe;
set_thread_flag(TIF_RESTOREALL);
return 0;
badframe:
-#if DEBUG_SIG
- printk("badframe in sys_rt_sigreturn, regs=%p uc=%p &uc->uc_mcontext=%p\n",
- regs, uc, &uc->uc_mcontext);
-#endif
- if (show_unhandled_signals && printk_ratelimit())
- printk(regs->msr & MSR_SF ? fmt64 : fmt32,
- current->comm, current->pid, "rt_sigreturn",
- (long)uc, regs->nip, regs->link);
+ if (show_unhandled_signals)
+ printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32,
+ current->comm, current->pid, "rt_sigreturn",
+ (long)uc, regs->nip, regs->link);
force_sig(SIGSEGV, current);
return 0;
@@ -392,17 +711,11 @@ badframe:
int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs *regs)
{
- /* Handler is *really* a pointer to the function descriptor for
- * the signal routine. The first entry in the function
- * descriptor is the entry address of signal and the second
- * entry is the TOC value we need to use.
- */
- func_descr_t __user *funct_desc_ptr;
struct rt_sigframe __user *frame;
unsigned long newsp = 0;
long err = 0;
- frame = get_sigframe(ka, regs, sizeof(*frame), 0);
+ frame = get_sigframe(ka, get_tm_stackpointer(regs), sizeof(*frame), 0);
if (unlikely(frame == NULL))
goto badframe;
@@ -414,19 +727,32 @@ int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info,
/* Create the ucontext. */
err |= __put_user(0, &frame->uc.uc_flags);
- err |= __put_user(0, &frame->uc.uc_link);
- err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- err |= __put_user(sas_ss_flags(regs->gpr[1]),
- &frame->uc.uc_stack.ss_flags);
- err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, signr, NULL,
- (unsigned long)ka->sa.sa_handler, 1);
+ err |= __save_altstack(&frame->uc.uc_stack, regs->gpr[1]);
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ if (MSR_TM_ACTIVE(regs->msr)) {
+ /* The ucontext_t passed to userland points to the second
+ * ucontext_t (for transactional state) with its uc_link ptr.
+ */
+ err |= __put_user(&frame->uc_transact, &frame->uc.uc_link);
+ err |= setup_tm_sigcontexts(&frame->uc.uc_mcontext,
+ &frame->uc_transact.uc_mcontext,
+ regs, signr,
+ NULL,
+ (unsigned long)ka->sa.sa_handler);
+ } else
+#endif
+ {
+ err |= __put_user(0, &frame->uc.uc_link);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, signr,
+ NULL, (unsigned long)ka->sa.sa_handler,
+ 1);
+ }
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
if (err)
goto badframe;
/* Make sure signal handler doesn't get spurious FP exceptions */
- current->thread.fpscr.val = 0;
+ current->thread.fp_state.fpscr = 0;
/* Set up to return from userspace. */
if (vdso64_rt_sigtramp && current->mm->context.vdso_base) {
@@ -437,18 +763,32 @@ int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info,
goto badframe;
regs->link = (unsigned long) &frame->tramp[0];
}
- funct_desc_ptr = (func_descr_t __user *) ka->sa.sa_handler;
/* Allocate a dummy caller frame for the signal handler. */
newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
err |= put_user(regs->gpr[1], (unsigned long __user *)newsp);
/* Set up "regs" so we "return" to the signal handler. */
- err |= get_user(regs->nip, &funct_desc_ptr->entry);
- /* enter the signal handler in big-endian mode */
+ if (is_elf2_task()) {
+ regs->nip = (unsigned long) ka->sa.sa_handler;
+ regs->gpr[12] = regs->nip;
+ } else {
+ /* Handler is *really* a pointer to the function descriptor for
+ * the signal routine. The first entry in the function
+ * descriptor is the entry address of signal and the second
+ * entry is the TOC value we need to use.
+ */
+ func_descr_t __user *funct_desc_ptr =
+ (func_descr_t __user *) ka->sa.sa_handler;
+
+ err |= get_user(regs->nip, &funct_desc_ptr->entry);
+ err |= get_user(regs->gpr[2], &funct_desc_ptr->toc);
+ }
+
+ /* enter the signal handler in native-endian mode */
regs->msr &= ~MSR_LE;
+ regs->msr |= (MSR_KERNEL & MSR_LE);
regs->gpr[1] = newsp;
- err |= get_user(regs->gpr[2], &funct_desc_ptr->toc);
regs->gpr[3] = signr;
regs->result = 0;
if (ka->sa.sa_flags & SA_SIGINFO) {
@@ -464,14 +804,10 @@ int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info,
return 1;
badframe:
-#if DEBUG_SIG
- printk("badframe in setup_rt_frame, regs=%p frame=%p newsp=%lx\n",
- regs, frame, newsp);
-#endif
- if (show_unhandled_signals && printk_ratelimit())
- printk(regs->msr & MSR_SF ? fmt64 : fmt32,
- current->comm, current->pid, "setup_rt_frame",
- (long)frame, regs->nip, regs->link);
+ if (show_unhandled_signals)
+ printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32,
+ current->comm, current->pid, "setup_rt_frame",
+ (long)frame, regs->nip, regs->link);
force_sigsegv(signr, current);
return 0;
diff --git a/arch/powerpc/kernel/smp-tbsync.c b/arch/powerpc/kernel/smp-tbsync.c
index 03e45c4a9ef..7a37ecd3afa 100644
--- a/arch/powerpc/kernel/smp-tbsync.c
+++ b/arch/powerpc/kernel/smp-tbsync.c
@@ -9,9 +9,8 @@
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/unistd.h>
-#include <linux/init.h>
#include <linux/slab.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/smp.h>
#include <asm/time.h>
@@ -36,13 +35,13 @@ static struct {
static volatile int running;
-static void __devinit enter_contest(u64 mark, long add)
+static void enter_contest(u64 mark, long add)
{
while (get_tb() < mark)
tbsync->race_result = add;
}
-void __devinit smp_generic_take_timebase(void)
+void smp_generic_take_timebase(void)
{
int cmd;
u64 tb;
@@ -75,7 +74,7 @@ void __devinit smp_generic_take_timebase(void)
local_irq_restore(flags);
}
-static int __devinit start_contest(int cmd, long offset, int num)
+static int start_contest(int cmd, long offset, int num)
{
int i, score=0;
u64 tb;
@@ -110,7 +109,7 @@ static int __devinit start_contest(int cmd, long offset, int num)
return score;
}
-void __devinit smp_generic_give_timebase(void)
+void smp_generic_give_timebase(void)
{
int i, score, score2, old, min=0, max=5000, offset=1000;
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 68034bbf2e4..1007fb802e6 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -18,7 +18,7 @@
#undef DEBUG
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
@@ -27,14 +27,16 @@
#include <linux/spinlock.h>
#include <linux/cache.h>
#include <linux/err.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <asm/ptrace.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/irq.h>
+#include <asm/hw_irq.h>
+#include <asm/kvm_ppc.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/prom.h>
@@ -43,12 +45,13 @@
#include <asm/machdep.h>
#include <asm/cputhreads.h>
#include <asm/cputable.h>
-#include <asm/system.h>
#include <asm/mpic.h>
#include <asm/vdso_datapage.h>
#ifdef CONFIG_PPC64
#include <asm/paca.h>
#endif
+#include <asm/vdso.h>
+#include <asm/debug.h>
#ifdef DEBUG
#include <asm/udbg.h>
@@ -57,6 +60,11 @@
#define DBG(fmt...)
#endif
+#ifdef CONFIG_HOTPLUG_CPU
+/* State of each CPU during hotplug phases */
+static DEFINE_PER_CPU(int, cpu_state) = { 0 };
+#endif
+
struct thread_info *secondary_ti;
DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
@@ -75,8 +83,30 @@ int smt_enabled_at_boot = 1;
static void (*crash_ipi_function_ptr)(struct pt_regs *) = NULL;
+/*
+ * Returns 1 if the specified cpu should be brought up during boot.
+ * Used to inhibit booting threads if they've been disabled or
+ * limited on the command line
+ */
+int smp_generic_cpu_bootable(unsigned int nr)
+{
+ /* Special case - we inhibit secondary thread startup
+ * during boot if the user requests it.
+ */
+ if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) {
+ if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
+ return 0;
+ if (smt_enabled_at_boot
+ && cpu_thread_in_core(nr) >= smt_enabled_at_boot)
+ return 0;
+ }
+
+ return 1;
+}
+
+
#ifdef CONFIG_PPC64
-void __devinit smp_generic_kick_cpu(int nr)
+int smp_generic_kick_cpu(int nr)
{
BUG_ON(nr < 0 || nr >= NR_CPUS);
@@ -85,39 +115,25 @@ void __devinit smp_generic_kick_cpu(int nr)
* cpu_start field to become non-zero After we set cpu_start,
* the processor will continue on to secondary_start
*/
- paca[nr].cpu_start = 1;
- smp_mb();
-}
-#endif
-
-void smp_message_recv(int msg)
-{
- switch(msg) {
- case PPC_MSG_CALL_FUNCTION:
- generic_smp_call_function_interrupt();
- break;
- case PPC_MSG_RESCHEDULE:
- /* we notice need_resched on exit */
- break;
- case PPC_MSG_CALL_FUNC_SINGLE:
- generic_smp_call_function_single_interrupt();
- break;
- case PPC_MSG_DEBUGGER_BREAK:
- if (crash_ipi_function_ptr) {
- crash_ipi_function_ptr(get_irq_regs());
- break;
- }
-#ifdef CONFIG_DEBUGGER
- debugger_ipi(get_irq_regs());
- break;
-#endif /* CONFIG_DEBUGGER */
- /* FALLTHROUGH */
- default:
- printk("SMP %d: smp_message_recv(): unknown msg %d\n",
- smp_processor_id(), msg);
- break;
+ if (!paca[nr].cpu_start) {
+ paca[nr].cpu_start = 1;
+ smp_mb();
+ return 0;
}
+
+#ifdef CONFIG_HOTPLUG_CPU
+ /*
+ * Ok it's not there, so it might be soft-unplugged, let's
+ * try to bring it back
+ */
+ generic_set_cpu_up(nr);
+ smp_wmb();
+ smp_send_reschedule(nr);
+#endif /* CONFIG_HOTPLUG_CPU */
+
+ return 0;
}
+#endif /* CONFIG_PPC64 */
static irqreturn_t call_function_action(int irq, void *data)
{
@@ -127,33 +143,41 @@ static irqreturn_t call_function_action(int irq, void *data)
static irqreturn_t reschedule_action(int irq, void *data)
{
- /* we just need the return path side effect of checking need_resched */
+ scheduler_ipi();
return IRQ_HANDLED;
}
-static irqreturn_t call_function_single_action(int irq, void *data)
+static irqreturn_t tick_broadcast_ipi_action(int irq, void *data)
{
- generic_smp_call_function_single_interrupt();
+ tick_broadcast_ipi_handler();
return IRQ_HANDLED;
}
static irqreturn_t debug_ipi_action(int irq, void *data)
{
- smp_message_recv(PPC_MSG_DEBUGGER_BREAK);
+ if (crash_ipi_function_ptr) {
+ crash_ipi_function_ptr(get_irq_regs());
+ return IRQ_HANDLED;
+ }
+
+#ifdef CONFIG_DEBUGGER
+ debugger_ipi(get_irq_regs());
+#endif /* CONFIG_DEBUGGER */
+
return IRQ_HANDLED;
}
static irq_handler_t smp_ipi_action[] = {
[PPC_MSG_CALL_FUNCTION] = call_function_action,
[PPC_MSG_RESCHEDULE] = reschedule_action,
- [PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
+ [PPC_MSG_TICK_BROADCAST] = tick_broadcast_ipi_action,
[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
};
const char *smp_ipi_name[] = {
[PPC_MSG_CALL_FUNCTION] = "ipi call function",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
- [PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
+ [PPC_MSG_TICK_BROADCAST] = "ipi tick-broadcast",
[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
};
@@ -170,23 +194,95 @@ int smp_request_message_ipi(int virq, int msg)
return 1;
}
#endif
- err = request_irq(virq, smp_ipi_action[msg], IRQF_DISABLED|IRQF_PERCPU,
- smp_ipi_name[msg], 0);
+ err = request_irq(virq, smp_ipi_action[msg],
+ IRQF_PERCPU | IRQF_NO_THREAD | IRQF_NO_SUSPEND,
+ smp_ipi_name[msg], NULL);
WARN(err < 0, "unable to request_irq %d for %s (rc %d)\n",
virq, smp_ipi_name[msg], err);
return err;
}
+#ifdef CONFIG_PPC_SMP_MUXED_IPI
+struct cpu_messages {
+ int messages; /* current messages */
+ unsigned long data; /* data for cause ipi */
+};
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
+
+void smp_muxed_ipi_set_data(int cpu, unsigned long data)
+{
+ struct cpu_messages *info = &per_cpu(ipi_message, cpu);
+
+ info->data = data;
+}
+
+void smp_muxed_ipi_message_pass(int cpu, int msg)
+{
+ struct cpu_messages *info = &per_cpu(ipi_message, cpu);
+ char *message = (char *)&info->messages;
+
+ /*
+ * Order previous accesses before accesses in the IPI handler.
+ */
+ smp_mb();
+ message[msg] = 1;
+ /*
+ * cause_ipi functions are required to include a full barrier
+ * before doing whatever causes the IPI.
+ */
+ smp_ops->cause_ipi(cpu, info->data);
+}
+
+#ifdef __BIG_ENDIAN__
+#define IPI_MESSAGE(A) (1 << (24 - 8 * (A)))
+#else
+#define IPI_MESSAGE(A) (1 << (8 * (A)))
+#endif
+
+irqreturn_t smp_ipi_demux(void)
+{
+ struct cpu_messages *info = &__get_cpu_var(ipi_message);
+ unsigned int all;
+
+ mb(); /* order any irq clear */
+
+ do {
+ all = xchg(&info->messages, 0);
+ if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
+ generic_smp_call_function_interrupt();
+ if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
+ scheduler_ipi();
+ if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST))
+ tick_broadcast_ipi_handler();
+ if (all & IPI_MESSAGE(PPC_MSG_DEBUGGER_BREAK))
+ debug_ipi_action(0, NULL);
+ } while (info->messages);
+
+ return IRQ_HANDLED;
+}
+#endif /* CONFIG_PPC_SMP_MUXED_IPI */
+
+static inline void do_message_pass(int cpu, int msg)
+{
+ if (smp_ops->message_pass)
+ smp_ops->message_pass(cpu, msg);
+#ifdef CONFIG_PPC_SMP_MUXED_IPI
+ else
+ smp_muxed_ipi_message_pass(cpu, msg);
+#endif
+}
+
void smp_send_reschedule(int cpu)
{
if (likely(smp_ops))
- smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE);
+ do_message_pass(cpu, PPC_MSG_RESCHEDULE);
}
+EXPORT_SYMBOL_GPL(smp_send_reschedule);
void arch_send_call_function_single_ipi(int cpu)
{
- smp_ops->message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE);
+ do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
}
void arch_send_call_function_ipi_mask(const struct cpumask *mask)
@@ -194,14 +290,31 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
unsigned int cpu;
for_each_cpu(cpu, mask)
- smp_ops->message_pass(cpu, PPC_MSG_CALL_FUNCTION);
+ do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
}
-#ifdef CONFIG_DEBUGGER
-void smp_send_debugger_break(int cpu)
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+void tick_broadcast(const struct cpumask *mask)
{
- if (likely(smp_ops))
- smp_ops->message_pass(cpu, PPC_MSG_DEBUGGER_BREAK);
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask)
+ do_message_pass(cpu, PPC_MSG_TICK_BROADCAST);
+}
+#endif
+
+#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
+void smp_send_debugger_break(void)
+{
+ int cpu;
+ int me = raw_smp_processor_id();
+
+ if (unlikely(!smp_ops))
+ return;
+
+ for_each_online_cpu(cpu)
+ if (cpu != me)
+ do_message_pass(cpu, PPC_MSG_DEBUGGER_BREAK);
}
#endif
@@ -209,9 +322,9 @@ void smp_send_debugger_break(int cpu)
void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
{
crash_ipi_function_ptr = crash_ipi_callback;
- if (crash_ipi_callback && smp_ops) {
+ if (crash_ipi_callback) {
mb();
- smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_DEBUGGER_BREAK);
+ smp_send_debugger_break();
}
}
#endif
@@ -233,26 +346,13 @@ void smp_send_stop(void)
struct thread_info *current_set[NR_CPUS];
-static void __devinit smp_store_cpu_info(int id)
+static void smp_store_cpu_info(int id)
{
per_cpu(cpu_pvr, id) = mfspr(SPRN_PVR);
-}
-
-static void __init smp_create_idle(unsigned int cpu)
-{
- struct task_struct *p;
-
- /* create a process for the processor */
- p = fork_idle(cpu);
- if (IS_ERR(p))
- panic("failed fork for CPU %u: %li", cpu, PTR_ERR(p));
-#ifdef CONFIG_PPC64
- paca[cpu].__current = p;
- paca[cpu].kstack = (unsigned long) task_thread_info(p)
- + THREAD_SIZE - STACK_FRAME_OVERHEAD;
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ per_cpu(next_tlbcam_idx, id)
+ = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1;
#endif
- current_set[cpu] = task_thread_info(p);
- task_thread_info(p)->cpu = cpu;
}
void __init smp_prepare_cpus(unsigned int max_cpus)
@@ -281,31 +381,21 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid));
cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
- if (smp_ops)
- if (smp_ops->probe)
- max_cpus = smp_ops->probe();
- else
- max_cpus = NR_CPUS;
- else
- max_cpus = 1;
-
- for_each_possible_cpu(cpu)
- if (cpu != boot_cpuid)
- smp_create_idle(cpu);
+ if (smp_ops && smp_ops->probe)
+ smp_ops->probe();
}
-void __devinit smp_prepare_boot_cpu(void)
+void smp_prepare_boot_cpu(void)
{
BUG_ON(smp_processor_id() != boot_cpuid);
#ifdef CONFIG_PPC64
paca[boot_cpuid].__current = current;
#endif
+ set_numa_node(numa_cpu_lookup_table[boot_cpuid]);
current_set[boot_cpuid] = task_thread_info(current);
}
#ifdef CONFIG_HOTPLUG_CPU
-/* State of each CPU during hotplug phases */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
int generic_cpu_disable(void)
{
@@ -317,30 +407,8 @@ int generic_cpu_disable(void)
set_cpu_online(cpu, false);
#ifdef CONFIG_PPC64
vdso_data->processorCount--;
- fixup_irqs(cpu_online_mask);
-#endif
- return 0;
-}
-
-int generic_cpu_enable(unsigned int cpu)
-{
- /* Do the normal bootup if we haven't
- * already bootstrapped. */
- if (system_state != SYSTEM_RUNNING)
- return -ENOSYS;
-
- /* get the target out of it's holding state */
- per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
- smp_wmb();
-
- while (!cpu_online(cpu))
- cpu_relax();
-
-#ifdef CONFIG_PPC64
- fixup_irqs(cpu_online_mask);
- /* counter the irq disable in fixup_irqs */
- local_irq_enable();
#endif
+ migrate_irqs();
return 0;
}
@@ -362,37 +430,75 @@ void generic_mach_cpu_die(void)
unsigned int cpu;
local_irq_disable();
+ idle_task_exit();
cpu = smp_processor_id();
printk(KERN_DEBUG "CPU%d offline\n", cpu);
__get_cpu_var(cpu_state) = CPU_DEAD;
smp_wmb();
while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
cpu_relax();
- set_cpu_online(cpu, true);
- local_irq_enable();
}
+
+void generic_set_cpu_dead(unsigned int cpu)
+{
+ per_cpu(cpu_state, cpu) = CPU_DEAD;
+}
+
+/*
+ * The cpu_state should be set to CPU_UP_PREPARE in kick_cpu(), otherwise
+ * the cpu_state is always CPU_DEAD after calling generic_set_cpu_dead(),
+ * which makes the delay in generic_cpu_die() not happen.
+ */
+void generic_set_cpu_up(unsigned int cpu)
+{
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+}
+
+int generic_check_cpu_restart(unsigned int cpu)
+{
+ return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE;
+}
+
+static bool secondaries_inhibited(void)
+{
+ return kvm_hv_mode_active();
+}
+
+#else /* HOTPLUG_CPU */
+
+#define secondaries_inhibited() 0
+
#endif
-static int __devinit cpu_enable(unsigned int cpu)
+static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle)
{
- if (smp_ops && smp_ops->cpu_enable)
- return smp_ops->cpu_enable(cpu);
+ struct thread_info *ti = task_thread_info(idle);
- return -ENOSYS;
+#ifdef CONFIG_PPC64
+ paca[cpu].__current = idle;
+ paca[cpu].kstack = (unsigned long)ti + THREAD_SIZE - STACK_FRAME_OVERHEAD;
+#endif
+ ti->cpu = cpu;
+ secondary_ti = current_set[cpu] = ti;
}
-int __cpuinit __cpu_up(unsigned int cpu)
+int __cpu_up(unsigned int cpu, struct task_struct *tidle)
{
- int c;
+ int rc, c;
- secondary_ti = current_set[cpu];
- if (!cpu_enable(cpu))
- return 0;
+ /*
+ * Don't allow secondary threads to come online if inhibited
+ */
+ if (threads_per_core > 1 && secondaries_inhibited() &&
+ cpu_thread_in_subcore(cpu))
+ return -EBUSY;
if (smp_ops == NULL ||
(smp_ops->cpu_bootable && !smp_ops->cpu_bootable(cpu)))
return -EINVAL;
+ cpu_idle_thread_init(cpu, tidle);
+
/* Make sure callin-map entry is 0 (can be leftover a CPU
* hotplug
*/
@@ -406,7 +512,11 @@ int __cpuinit __cpu_up(unsigned int cpu)
/* wake up cpus */
DBG("smp: kicking cpu %d\n", cpu);
- smp_ops->kick_cpu(cpu);
+ rc = smp_ops->kick_cpu(cpu);
+ if (rc) {
+ pr_err("smp: failed starting cpu %d (rc %d)\n", cpu, rc);
+ return rc;
+ }
/*
* wait to see if the cpu made a callin (is actually up).
@@ -449,7 +559,7 @@ int __cpuinit __cpu_up(unsigned int cpu)
int cpu_to_core_id(int cpu)
{
struct device_node *np;
- const int *reg;
+ const __be32 *reg;
int id = -1;
np = of_get_cpu_node(cpu, NULL);
@@ -460,12 +570,52 @@ int cpu_to_core_id(int cpu)
if (!reg)
goto out;
- id = *reg;
+ id = be32_to_cpup(reg);
out:
of_node_put(np);
return id;
}
+/* Helper routines for cpu to core mapping */
+int cpu_core_index_of_thread(int cpu)
+{
+ return cpu >> threads_shift;
+}
+EXPORT_SYMBOL_GPL(cpu_core_index_of_thread);
+
+int cpu_first_thread_of_core(int core)
+{
+ return core << threads_shift;
+}
+EXPORT_SYMBOL_GPL(cpu_first_thread_of_core);
+
+static void traverse_siblings_chip_id(int cpu, bool add, int chipid)
+{
+ const struct cpumask *mask;
+ struct device_node *np;
+ int i, plen;
+ const __be32 *prop;
+
+ mask = add ? cpu_online_mask : cpu_present_mask;
+ for_each_cpu(i, mask) {
+ np = of_get_cpu_node(i, NULL);
+ if (!np)
+ continue;
+ prop = of_get_property(np, "ibm,chip-id", &plen);
+ if (prop && plen == sizeof(int) &&
+ of_read_number(prop, 1) == chipid) {
+ if (add) {
+ cpumask_set_cpu(cpu, cpu_core_mask(i));
+ cpumask_set_cpu(i, cpu_core_mask(cpu));
+ } else {
+ cpumask_clear_cpu(cpu, cpu_core_mask(i));
+ cpumask_clear_cpu(i, cpu_core_mask(cpu));
+ }
+ }
+ of_node_put(np);
+ }
+}
+
/* Must be called when no change can occur to cpu_present_mask,
* i.e. during cpu online or offline.
*/
@@ -488,11 +638,51 @@ static struct device_node *cpu_to_l2cache(int cpu)
return cache;
}
+static void traverse_core_siblings(int cpu, bool add)
+{
+ struct device_node *l2_cache, *np;
+ const struct cpumask *mask;
+ int i, chip, plen;
+ const __be32 *prop;
+
+ /* First see if we have ibm,chip-id properties in cpu nodes */
+ np = of_get_cpu_node(cpu, NULL);
+ if (np) {
+ chip = -1;
+ prop = of_get_property(np, "ibm,chip-id", &plen);
+ if (prop && plen == sizeof(int))
+ chip = of_read_number(prop, 1);
+ of_node_put(np);
+ if (chip >= 0) {
+ traverse_siblings_chip_id(cpu, add, chip);
+ return;
+ }
+ }
+
+ l2_cache = cpu_to_l2cache(cpu);
+ mask = add ? cpu_online_mask : cpu_present_mask;
+ for_each_cpu(i, mask) {
+ np = cpu_to_l2cache(i);
+ if (!np)
+ continue;
+ if (np == l2_cache) {
+ if (add) {
+ cpumask_set_cpu(cpu, cpu_core_mask(i));
+ cpumask_set_cpu(i, cpu_core_mask(cpu));
+ } else {
+ cpumask_clear_cpu(cpu, cpu_core_mask(i));
+ cpumask_clear_cpu(i, cpu_core_mask(cpu));
+ }
+ }
+ of_node_put(np);
+ }
+ of_node_put(l2_cache);
+}
+
/* Activate a secondary processor. */
-int __devinit start_secondary(void *unused)
+void start_secondary(void *unused)
{
unsigned int cpu = smp_processor_id();
- struct device_node *l2_cache;
int i, base;
atomic_inc(&init_mm.mm_count);
@@ -510,13 +700,16 @@ int __devinit start_secondary(void *unused)
secondary_cpu_time_init();
- ipi_call_lock();
- notify_cpu_starting(cpu);
- set_cpu_online(cpu, true);
+#ifdef CONFIG_PPC64
+ if (system_state == SYSTEM_RUNNING)
+ vdso_data->processorCount++;
+
+ vdso_getcpu_init();
+#endif
/* Update sibling maps */
- base = cpu_first_thread_in_core(cpu);
+ base = cpu_first_thread_sibling(cpu);
for (i = 0; i < threads_per_core; i++) {
- if (cpu_is_offline(base + i))
+ if (cpu_is_offline(base + i) && (cpu != base + i))
continue;
cpumask_set_cpu(cpu, cpu_sibling_mask(base + i));
cpumask_set_cpu(base + i, cpu_sibling_mask(cpu));
@@ -528,24 +721,23 @@ int __devinit start_secondary(void *unused)
cpumask_set_cpu(cpu, cpu_core_mask(base + i));
cpumask_set_cpu(base + i, cpu_core_mask(cpu));
}
- l2_cache = cpu_to_l2cache(cpu);
- for_each_online_cpu(i) {
- struct device_node *np = cpu_to_l2cache(i);
- if (!np)
- continue;
- if (np == l2_cache) {
- cpumask_set_cpu(cpu, cpu_core_mask(i));
- cpumask_set_cpu(i, cpu_core_mask(cpu));
- }
- of_node_put(np);
- }
- of_node_put(l2_cache);
- ipi_call_unlock();
+ traverse_core_siblings(cpu, true);
+
+ /*
+ * numa_node_id() works after this.
+ */
+ set_numa_node(numa_cpu_lookup_table[cpu]);
+ set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
+
+ smp_wmb();
+ notify_cpu_starting(cpu);
+ set_cpu_online(cpu, true);
local_irq_enable();
- cpu_idle();
- return 0;
+ cpu_startup_entry(CPUHP_ONLINE);
+
+ BUG();
}
int setup_profiling_timer(unsigned int multiplier)
@@ -553,6 +745,28 @@ int setup_profiling_timer(unsigned int multiplier)
return 0;
}
+#ifdef CONFIG_SCHED_SMT
+/* cpumask of CPUs with asymetric SMT dependancy */
+static int powerpc_smt_flags(void)
+{
+ int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+
+ if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+ printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+ flags |= SD_ASYM_PACKING;
+ }
+ return flags;
+}
+#endif
+
+static struct sched_domain_topology_level powerpc_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
void __init smp_cpus_done(unsigned int max_cpus)
{
cpumask_var_t old_mask;
@@ -562,7 +776,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
* se we pin us down to CPU 0 for a short while
*/
alloc_cpumask_var(&old_mask, GFP_NOWAIT);
- cpumask_copy(old_mask, &current->cpus_allowed);
+ cpumask_copy(old_mask, tsk_cpus_allowed(current));
set_cpus_allowed_ptr(current, cpumask_of(boot_cpuid));
if (smp_ops && smp_ops->setup_cpu)
@@ -572,22 +786,18 @@ void __init smp_cpus_done(unsigned int max_cpus)
free_cpumask_var(old_mask);
+ if (smp_ops && smp_ops->bringup_done)
+ smp_ops->bringup_done();
+
dump_numa_cpu_topology();
-}
-int arch_sd_sibling_asym_packing(void)
-{
- if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
- printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
- return SD_ASYM_PACKING;
- }
- return 0;
+ set_sched_topology(powerpc_topology);
+
}
#ifdef CONFIG_HOTPLUG_CPU
int __cpu_disable(void)
{
- struct device_node *l2_cache;
int cpu = smp_processor_id();
int base, i;
int err;
@@ -600,27 +810,14 @@ int __cpu_disable(void)
return err;
/* Update sibling maps */
- base = cpu_first_thread_in_core(cpu);
+ base = cpu_first_thread_sibling(cpu);
for (i = 0; i < threads_per_core; i++) {
cpumask_clear_cpu(cpu, cpu_sibling_mask(base + i));
cpumask_clear_cpu(base + i, cpu_sibling_mask(cpu));
cpumask_clear_cpu(cpu, cpu_core_mask(base + i));
cpumask_clear_cpu(base + i, cpu_core_mask(cpu));
}
-
- l2_cache = cpu_to_l2cache(cpu);
- for_each_present_cpu(i) {
- struct device_node *np = cpu_to_l2cache(i);
- if (!np)
- continue;
- if (np == l2_cache) {
- cpumask_clear_cpu(cpu, cpu_core_mask(i));
- cpumask_clear_cpu(i, cpu_core_mask(cpu));
- }
- of_node_put(np);
- }
- of_node_put(l2_cache);
-
+ traverse_core_siblings(cpu, false);
return 0;
}
@@ -631,21 +828,13 @@ void __cpu_die(unsigned int cpu)
smp_ops->cpu_die(cpu);
}
-static DEFINE_MUTEX(powerpc_cpu_hotplug_driver_mutex);
-
-void cpu_hotplug_driver_lock()
-{
- mutex_lock(&powerpc_cpu_hotplug_driver_mutex);
-}
-
-void cpu_hotplug_driver_unlock()
-{
- mutex_unlock(&powerpc_cpu_hotplug_driver_mutex);
-}
-
void cpu_die(void)
{
if (ppc_md.cpu_die)
ppc_md.cpu_die();
+
+ /* If we return, we re-enter start_secondary */
+ start_secondary_resume();
}
+
#endif
diff --git a/arch/powerpc/kernel/softemu8xx.c b/arch/powerpc/kernel/softemu8xx.c
deleted file mode 100644
index af0e8290b4f..00000000000
--- a/arch/powerpc/kernel/softemu8xx.c
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Software emulation of some PPC instructions for the 8xx core.
- *
- * Copyright (C) 1998 Dan Malek (dmalek@jlc.net)
- *
- * Software floating emuation for the MPC8xx processor. I did this mostly
- * because it was easier than trying to get the libraries compiled for
- * software floating point. The goal is still to get the libraries done,
- * but I lost patience and needed some hacks to at least get init and
- * shells running. The first problem is the setjmp/longjmp that save
- * and restore the floating point registers.
- *
- * For this emulation, our working registers are found on the register
- * save area.
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/interrupt.h>
-
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/io.h>
-
-/* Eventually we may need a look-up table, but this works for now.
-*/
-#define LFS 48
-#define LFD 50
-#define LFDU 51
-#define STFD 54
-#define STFDU 55
-#define FMR 63
-
-void print_8xx_pte(struct mm_struct *mm, unsigned long addr)
-{
- pgd_t *pgd;
- pmd_t *pmd;
- pte_t *pte;
-
- printk(" pte @ 0x%8lx: ", addr);
- pgd = pgd_offset(mm, addr & PAGE_MASK);
- if (pgd) {
- pmd = pmd_offset(pud_offset(pgd, addr & PAGE_MASK),
- addr & PAGE_MASK);
- if (pmd && pmd_present(*pmd)) {
- pte = pte_offset_kernel(pmd, addr & PAGE_MASK);
- if (pte) {
- printk(" (0x%08lx)->(0x%08lx)->0x%08lx\n",
- (long)pgd, (long)pte, (long)pte_val(*pte));
-#define pp ((long)pte_val(*pte))
- printk(" RPN: %05lx PP: %lx SPS: %lx SH: %lx "
- "CI: %lx v: %lx\n",
- pp>>12, /* rpn */
- (pp>>10)&3, /* pp */
- (pp>>3)&1, /* small */
- (pp>>2)&1, /* shared */
- (pp>>1)&1, /* cache inhibit */
- pp&1 /* valid */
- );
-#undef pp
- }
- else {
- printk("no pte\n");
- }
- }
- else {
- printk("no pmd\n");
- }
- }
- else {
- printk("no pgd\n");
- }
-}
-
-int get_8xx_pte(struct mm_struct *mm, unsigned long addr)
-{
- pgd_t *pgd;
- pmd_t *pmd;
- pte_t *pte;
- int retval = 0;
-
- pgd = pgd_offset(mm, addr & PAGE_MASK);
- if (pgd) {
- pmd = pmd_offset(pud_offset(pgd, addr & PAGE_MASK),
- addr & PAGE_MASK);
- if (pmd && pmd_present(*pmd)) {
- pte = pte_offset_kernel(pmd, addr & PAGE_MASK);
- if (pte) {
- retval = (int)pte_val(*pte);
- }
- }
- }
- return retval;
-}
-
-/*
- * We return 0 on success, 1 on unimplemented instruction, and EFAULT
- * if a load/store faulted.
- */
-int Soft_emulate_8xx(struct pt_regs *regs)
-{
- u32 inst, instword;
- u32 flreg, idxreg, disp;
- int retval;
- s16 sdisp;
- u32 *ea, *ip;
-
- retval = 0;
-
- instword = *((u32 *)regs->nip);
- inst = instword >> 26;
-
- flreg = (instword >> 21) & 0x1f;
- idxreg = (instword >> 16) & 0x1f;
- disp = instword & 0xffff;
-
- ea = (u32 *)(regs->gpr[idxreg] + disp);
- ip = (u32 *)&current->thread.TS_FPR(flreg);
-
- switch ( inst )
- {
- case LFD:
- /* this is a 16 bit quantity that is sign extended
- * so use a signed short here -- Cort
- */
- sdisp = (instword & 0xffff);
- ea = (u32 *)(regs->gpr[idxreg] + sdisp);
- if (copy_from_user(ip, ea, sizeof(double)))
- retval = -EFAULT;
- break;
-
- case LFDU:
- if (copy_from_user(ip, ea, sizeof(double)))
- retval = -EFAULT;
- else
- regs->gpr[idxreg] = (u32)ea;
- break;
- case LFS:
- sdisp = (instword & 0xffff);
- ea = (u32 *)(regs->gpr[idxreg] + sdisp);
- if (copy_from_user(ip, ea, sizeof(float)))
- retval = -EFAULT;
- break;
- case STFD:
- /* this is a 16 bit quantity that is sign extended
- * so use a signed short here -- Cort
- */
- sdisp = (instword & 0xffff);
- ea = (u32 *)(regs->gpr[idxreg] + sdisp);
- if (copy_to_user(ea, ip, sizeof(double)))
- retval = -EFAULT;
- break;
-
- case STFDU:
- if (copy_to_user(ea, ip, sizeof(double)))
- retval = -EFAULT;
- else
- regs->gpr[idxreg] = (u32)ea;
- break;
- case FMR:
- /* assume this is a fp move -- Cort */
- memcpy(ip, &current->thread.TS_FPR((instword>>11)&0x1f),
- sizeof(double));
- break;
- default:
- retval = 1;
- printk("Bad emulation %s/%d\n"
- " NIP: %08lx instruction: %08x opcode: %x "
- "A: %x B: %x C: %x code: %x rc: %x\n",
- current->comm,current->pid,
- regs->nip,
- instword,inst,
- (instword>>16)&0x1f,
- (instword>>11)&0x1f,
- (instword>>6)&0x1f,
- (instword>>1)&0x3ff,
- instword&1);
- {
- int pa;
- print_8xx_pte(current->mm,regs->nip);
- pa = get_8xx_pte(current->mm,regs->nip) & PAGE_MASK;
- pa |= (regs->nip & ~PAGE_MASK);
- pa = (unsigned long)__va(pa);
- printk("Kernel VA for NIP %x ", pa);
- print_8xx_pte(current->mm,pa);
- }
- }
-
- if (retval == 0)
- regs->nip += 4;
-
- return retval;
-}
diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index b0dbb1daa4d..3d30ef1038e 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -10,7 +10,7 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/sched.h>
#include <linux/stacktrace.h>
#include <asm/ptrace.h>
diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c
index 560c9611950..eae33e10b65 100644
--- a/arch/powerpc/kernel/swsusp.c
+++ b/arch/powerpc/kernel/swsusp.c
@@ -10,10 +10,9 @@
*/
#include <linux/sched.h>
-#include <asm/suspend.h>
-#include <asm/system.h>
#include <asm/current.h>
#include <asm/mmu_context.h>
+#include <asm/switch_to.h>
void save_processor_state(void)
{
@@ -34,6 +33,6 @@ void save_processor_state(void)
void restore_processor_state(void)
{
#ifdef CONFIG_PPC32
- switch_mmu_context(NULL, current->active_mm);
+ switch_mmu_context(current->active_mm, current->active_mm);
#endif
}
diff --git a/arch/powerpc/kernel/swsusp_32.S b/arch/powerpc/kernel/swsusp_32.S
index b0754e23743..ba4dee3d233 100644
--- a/arch/powerpc/kernel/swsusp_32.S
+++ b/arch/powerpc/kernel/swsusp_32.S
@@ -143,7 +143,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
/* Disable MSR:DR to make sure we don't take a TLB or
* hash miss during the copy, as our hash table will
- * for a while be unuseable. For .text, we assume we are
+ * for a while be unusable. For .text, we assume we are
* covered by a BAT. This works only for non-G5 at this
* point. G5 will need a better approach, possibly using
* a small temporary hash table filled with large mappings,
diff --git a/arch/powerpc/kernel/swsusp_64.c b/arch/powerpc/kernel/swsusp_64.c
index 6f3f0697274..0e899e47c32 100644
--- a/arch/powerpc/kernel/swsusp_64.c
+++ b/arch/powerpc/kernel/swsusp_64.c
@@ -6,9 +6,9 @@
* GPLv2
*/
-#include <asm/system.h>
#include <asm/iommu.h>
#include <linux/irq.h>
+#include <linux/sched.h>
#include <linux/interrupt.h>
void do_after_copyback(void)
diff --git a/arch/powerpc/kernel/swsusp_asm64.S b/arch/powerpc/kernel/swsusp_asm64.S
index 86ac1d90d02..988f38dced0 100644
--- a/arch/powerpc/kernel/swsusp_asm64.S
+++ b/arch/powerpc/kernel/swsusp_asm64.S
@@ -46,10 +46,19 @@
#define SL_r29 0xe8
#define SL_r30 0xf0
#define SL_r31 0xf8
-#define SL_SIZE SL_r31+8
+#define SL_SPRG1 0x100
+#define SL_TCR 0x108
+#define SL_SIZE SL_TCR+8
/* these macros rely on the save area being
* pointed to by r11 */
+
+#define SAVE_SPR(register) \
+ mfspr r0, SPRN_##register ;\
+ std r0, SL_##register(r11)
+#define RESTORE_SPR(register) \
+ ld r0, SL_##register(r11) ;\
+ mtspr SPRN_##register, r0
#define SAVE_SPECIAL(special) \
mf##special r0 ;\
std r0, SL_##special(r11)
@@ -103,8 +112,17 @@ _GLOBAL(swsusp_arch_suspend)
SAVE_REGISTER(r30)
SAVE_REGISTER(r31)
SAVE_SPECIAL(MSR)
- SAVE_SPECIAL(SDR1)
SAVE_SPECIAL(XER)
+#ifdef CONFIG_PPC_BOOK3S_64
+BEGIN_FW_FTR_SECTION
+ SAVE_SPECIAL(SDR1)
+END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR)
+#else
+ SAVE_SPR(TCR)
+
+ /* Save SPRG1, SPRG1 be used save paca */
+ SAVE_SPR(SPRG1)
+#endif
/* we push the stack up 128 bytes but don't store the
* stack pointer on the stack like a real stackframe */
@@ -151,6 +169,7 @@ copy_page_loop:
bne+ copyloop
nothing_to_copy:
+#ifdef CONFIG_PPC_BOOK3S_64
/* flush caches */
lis r3, 0x10
mtctr r3
@@ -167,6 +186,7 @@ nothing_to_copy:
sync
tlbia
+#endif
ld r11,swsusp_save_area_ptr@toc(r2)
@@ -208,16 +228,41 @@ nothing_to_copy:
RESTORE_REGISTER(r29)
RESTORE_REGISTER(r30)
RESTORE_REGISTER(r31)
+
+#ifdef CONFIG_PPC_BOOK3S_64
/* can't use RESTORE_SPECIAL(MSR) */
ld r0, SL_MSR(r11)
mtmsrd r0, 0
+BEGIN_FW_FTR_SECTION
RESTORE_SPECIAL(SDR1)
+END_FW_FTR_SECTION_IFCLR(FW_FEATURE_LPAR)
+#else
+ /* Restore SPRG1, be used to save paca */
+ ld r0, SL_SPRG1(r11)
+ mtsprg 1, r0
+
+ RESTORE_SPECIAL(MSR)
+
+ /* Restore TCR and clear any pending bits in TSR. */
+ RESTORE_SPR(TCR)
+ lis r0, (TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS)@h
+ mtspr SPRN_TSR, r0
+
+ /* Kick decrementer */
+ li r0, 1
+ mtdec r0
+
+ /* Invalidate all tlbs */
+ bl _tlbil_all
+#endif
RESTORE_SPECIAL(XER)
sync
addi r1,r1,-128
+#ifdef CONFIG_PPC_BOOK3S_64
bl slb_flush_and_rebolt
+#endif
bl do_after_copyback
addi r1,r1,128
diff --git a/arch/powerpc/kernel/swsusp_booke.S b/arch/powerpc/kernel/swsusp_booke.S
index 11a39307dd7..553c1405ee0 100644
--- a/arch/powerpc/kernel/swsusp_booke.S
+++ b/arch/powerpc/kernel/swsusp_booke.S
@@ -74,21 +74,21 @@ _GLOBAL(swsusp_arch_suspend)
bne 1b
/* Save SPRGs */
- mfsprg r4,0
+ mfspr r4,SPRN_SPRG0
stw r4,SL_SPRG0(r11)
- mfsprg r4,1
+ mfspr r4,SPRN_SPRG1
stw r4,SL_SPRG1(r11)
- mfsprg r4,2
+ mfspr r4,SPRN_SPRG2
stw r4,SL_SPRG2(r11)
- mfsprg r4,3
+ mfspr r4,SPRN_SPRG3
stw r4,SL_SPRG3(r11)
- mfsprg r4,4
+ mfspr r4,SPRN_SPRG4
stw r4,SL_SPRG4(r11)
- mfsprg r4,5
+ mfspr r4,SPRN_SPRG5
stw r4,SL_SPRG5(r11)
- mfsprg r4,6
+ mfspr r4,SPRN_SPRG6
stw r4,SL_SPRG6(r11)
- mfsprg r4,7
+ mfspr r4,SPRN_SPRG7
stw r4,SL_SPRG7(r11)
/* Call the low level suspend stuff (we should probably have made
@@ -141,22 +141,30 @@ _GLOBAL(swsusp_arch_resume)
lis r11,swsusp_save_area@h
ori r11,r11,swsusp_save_area@l
+ /*
+ * Mappings from virtual addresses to physical addresses may be
+ * different than they were prior to restoring hibernation state.
+ * Invalidate the TLB so that the boot CPU is using the new
+ * mappings.
+ */
+ bl _tlbil_all
+
lwz r4,SL_SPRG0(r11)
- mtsprg 0,r4
+ mtspr SPRN_SPRG0,r4
lwz r4,SL_SPRG1(r11)
- mtsprg 1,r4
+ mtspr SPRN_SPRG1,r4
lwz r4,SL_SPRG2(r11)
- mtsprg 2,r4
+ mtspr SPRN_SPRG2,r4
lwz r4,SL_SPRG3(r11)
- mtsprg 3,r4
+ mtspr SPRN_SPRG3,r4
lwz r4,SL_SPRG4(r11)
- mtsprg 4,r4
+ mtspr SPRN_SPRG4,r4
lwz r4,SL_SPRG5(r11)
- mtsprg 5,r4
+ mtspr SPRN_SPRG5,r4
lwz r4,SL_SPRG6(r11)
- mtsprg 6,r4
+ mtspr SPRN_SPRG6,r4
lwz r4,SL_SPRG7(r11)
- mtsprg 7,r4
+ mtspr SPRN_SPRG7,r4
/* restore the MSR */
lwz r3,SL_MSR(r11)
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index b1b6043a56c..8a285876aef 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -23,7 +23,6 @@
#include <linux/resource.h>
#include <linux/times.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/sem.h>
#include <linux/msg.h>
#include <linux/shm.h>
@@ -51,6 +50,7 @@
#include <asm/mmu_context.h>
#include <asm/ppc-pci.h>
#include <asm/syscalls.h>
+#include <asm/switch_to.h>
asmlinkage long ppc32_select(u32 n, compat_ulong_t __user *inp,
@@ -61,466 +61,6 @@ asmlinkage long ppc32_select(u32 n, compat_ulong_t __user *inp,
return compat_sys_select((int)n, inp, outp, exp, compat_ptr(tvp_x));
}
-/* Note: it is necessary to treat option as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_sysfs(u32 option, u32 arg1, u32 arg2)
-{
- return sys_sysfs((int)option, arg1, arg2);
-}
-
-#ifdef CONFIG_SYSVIPC
-long compat_sys_ipc(u32 call, u32 first, u32 second, u32 third, compat_uptr_t ptr,
- u32 fifth)
-{
- int version;
-
- version = call >> 16; /* hack for backward compatibility */
- call &= 0xffff;
-
- switch (call) {
-
- case SEMTIMEDOP:
- if (fifth)
- /* sign extend semid */
- return compat_sys_semtimedop((int)first,
- compat_ptr(ptr), second,
- compat_ptr(fifth));
- /* else fall through for normal semop() */
- case SEMOP:
- /* struct sembuf is the same on 32 and 64bit :)) */
- /* sign extend semid */
- return sys_semtimedop((int)first, compat_ptr(ptr), second,
- NULL);
- case SEMGET:
- /* sign extend key, nsems */
- return sys_semget((int)first, (int)second, third);
- case SEMCTL:
- /* sign extend semid, semnum */
- return compat_sys_semctl((int)first, (int)second, third,
- compat_ptr(ptr));
-
- case MSGSND:
- /* sign extend msqid */
- return compat_sys_msgsnd((int)first, (int)second, third,
- compat_ptr(ptr));
- case MSGRCV:
- /* sign extend msqid, msgtyp */
- return compat_sys_msgrcv((int)first, second, (int)fifth,
- third, version, compat_ptr(ptr));
- case MSGGET:
- /* sign extend key */
- return sys_msgget((int)first, second);
- case MSGCTL:
- /* sign extend msqid */
- return compat_sys_msgctl((int)first, second, compat_ptr(ptr));
-
- case SHMAT:
- /* sign extend shmid */
- return compat_sys_shmat((int)first, second, third, version,
- compat_ptr(ptr));
- case SHMDT:
- return sys_shmdt(compat_ptr(ptr));
- case SHMGET:
- /* sign extend key_t */
- return sys_shmget((int)first, second, third);
- case SHMCTL:
- /* sign extend shmid */
- return compat_sys_shmctl((int)first, second, compat_ptr(ptr));
-
- default:
- return -ENOSYS;
- }
-
- return -ENOSYS;
-}
-#endif
-
-/* Note: it is necessary to treat out_fd and in_fd as unsigned ints,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_sendfile(u32 out_fd, u32 in_fd, compat_off_t __user * offset, u32 count)
-{
- mm_segment_t old_fs = get_fs();
- int ret;
- off_t of;
- off_t __user *up;
-
- if (offset && get_user(of, offset))
- return -EFAULT;
-
- /* The __user pointer cast is valid because of the set_fs() */
- set_fs(KERNEL_DS);
- up = offset ? (off_t __user *) &of : NULL;
- ret = sys_sendfile((int)out_fd, (int)in_fd, up, count);
- set_fs(old_fs);
-
- if (offset && put_user(of, offset))
- return -EFAULT;
-
- return ret;
-}
-
-asmlinkage int compat_sys_sendfile64(int out_fd, int in_fd, compat_loff_t __user *offset, s32 count)
-{
- mm_segment_t old_fs = get_fs();
- int ret;
- loff_t lof;
- loff_t __user *up;
-
- if (offset && get_user(lof, offset))
- return -EFAULT;
-
- /* The __user pointer cast is valid because of the set_fs() */
- set_fs(KERNEL_DS);
- up = offset ? (loff_t __user *) &lof : NULL;
- ret = sys_sendfile64(out_fd, in_fd, up, count);
- set_fs(old_fs);
-
- if (offset && put_user(lof, offset))
- return -EFAULT;
-
- return ret;
-}
-
-long compat_sys_execve(unsigned long a0, unsigned long a1, unsigned long a2,
- unsigned long a3, unsigned long a4, unsigned long a5,
- struct pt_regs *regs)
-{
- int error;
- char * filename;
-
- filename = getname((char __user *) a0);
- error = PTR_ERR(filename);
- if (IS_ERR(filename))
- goto out;
- flush_fp_to_thread(current);
- flush_altivec_to_thread(current);
-
- error = compat_do_execve(filename, compat_ptr(a1), compat_ptr(a2), regs);
-
- putname(filename);
-
-out:
- return error;
-}
-
-/* Note: it is necessary to treat option as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_prctl(u32 option, u32 arg2, u32 arg3, u32 arg4, u32 arg5)
-{
- return sys_prctl((int)option,
- (unsigned long) arg2,
- (unsigned long) arg3,
- (unsigned long) arg4,
- (unsigned long) arg5);
-}
-
-/* Note: it is necessary to treat pid as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_sched_rr_get_interval(u32 pid, struct compat_timespec __user *interval)
-{
- struct timespec t;
- int ret;
- mm_segment_t old_fs = get_fs ();
-
- /* The __user pointer cast is valid because of the set_fs() */
- set_fs (KERNEL_DS);
- ret = sys_sched_rr_get_interval((int)pid, (struct timespec __user *) &t);
- set_fs (old_fs);
- if (put_compat_timespec(&t, interval))
- return -EFAULT;
- return ret;
-}
-
-/* Note: it is necessary to treat mode as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_access(const char __user * filename, u32 mode)
-{
- return sys_access(filename, (int)mode);
-}
-
-
-/* Note: it is necessary to treat mode as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_creat(const char __user * pathname, u32 mode)
-{
- return sys_creat(pathname, (int)mode);
-}
-
-
-/* Note: it is necessary to treat pid and options as unsigned ints,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_waitpid(u32 pid, unsigned int __user * stat_addr, u32 options)
-{
- return sys_waitpid((int)pid, stat_addr, (int)options);
-}
-
-
-/* Note: it is necessary to treat gidsetsize as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_getgroups(u32 gidsetsize, gid_t __user *grouplist)
-{
- return sys_getgroups((int)gidsetsize, grouplist);
-}
-
-
-/* Note: it is necessary to treat pid as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_getpgid(u32 pid)
-{
- return sys_getpgid((int)pid);
-}
-
-
-
-/* Note: it is necessary to treat pid as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_getsid(u32 pid)
-{
- return sys_getsid((int)pid);
-}
-
-
-/* Note: it is necessary to treat pid and sig as unsigned ints,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_kill(u32 pid, u32 sig)
-{
- return sys_kill((int)pid, (int)sig);
-}
-
-
-/* Note: it is necessary to treat mode as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_mkdir(const char __user * pathname, u32 mode)
-{
- return sys_mkdir(pathname, (int)mode);
-}
-
-long compat_sys_nice(u32 increment)
-{
- /* sign extend increment */
- return sys_nice((int)increment);
-}
-
-off_t ppc32_lseek(unsigned int fd, u32 offset, unsigned int origin)
-{
- /* sign extend n */
- return sys_lseek(fd, (int)offset, origin);
-}
-
-long compat_sys_truncate(const char __user * path, u32 length)
-{
- /* sign extend length */
- return sys_truncate(path, (int)length);
-}
-
-long compat_sys_ftruncate(int fd, u32 length)
-{
- /* sign extend length */
- return sys_ftruncate(fd, (int)length);
-}
-
-/* Note: it is necessary to treat bufsiz as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_readlink(const char __user * path, char __user * buf, u32 bufsiz)
-{
- return sys_readlink(path, buf, (int)bufsiz);
-}
-
-/* Note: it is necessary to treat option as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_sched_get_priority_max(u32 policy)
-{
- return sys_sched_get_priority_max((int)policy);
-}
-
-
-/* Note: it is necessary to treat policy as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_sched_get_priority_min(u32 policy)
-{
- return sys_sched_get_priority_min((int)policy);
-}
-
-
-/* Note: it is necessary to treat pid as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_sched_getparam(u32 pid, struct sched_param __user *param)
-{
- return sys_sched_getparam((int)pid, param);
-}
-
-
-/* Note: it is necessary to treat pid as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_sched_getscheduler(u32 pid)
-{
- return sys_sched_getscheduler((int)pid);
-}
-
-
-/* Note: it is necessary to treat pid as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_sched_setparam(u32 pid, struct sched_param __user *param)
-{
- return sys_sched_setparam((int)pid, param);
-}
-
-
-/* Note: it is necessary to treat pid and policy as unsigned ints,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_sched_setscheduler(u32 pid, u32 policy, struct sched_param __user *param)
-{
- return sys_sched_setscheduler((int)pid, (int)policy, param);
-}
-
-
-/* Note: it is necessary to treat len as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_setdomainname(char __user *name, u32 len)
-{
- return sys_setdomainname(name, (int)len);
-}
-
-
-/* Note: it is necessary to treat gidsetsize as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_setgroups(u32 gidsetsize, gid_t __user *grouplist)
-{
- return sys_setgroups((int)gidsetsize, grouplist);
-}
-
-
-asmlinkage long compat_sys_sethostname(char __user *name, u32 len)
-{
- /* sign extend len */
- return sys_sethostname(name, (int)len);
-}
-
-
-/* Note: it is necessary to treat pid and pgid as unsigned ints,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_setpgid(u32 pid, u32 pgid)
-{
- return sys_setpgid((int)pid, (int)pgid);
-}
-
-long compat_sys_getpriority(u32 which, u32 who)
-{
- /* sign extend which and who */
- return sys_getpriority((int)which, (int)who);
-}
-
-long compat_sys_setpriority(u32 which, u32 who, u32 niceval)
-{
- /* sign extend which, who and niceval */
- return sys_setpriority((int)which, (int)who, (int)niceval);
-}
-
-long compat_sys_ioprio_get(u32 which, u32 who)
-{
- /* sign extend which and who */
- return sys_ioprio_get((int)which, (int)who);
-}
-
-long compat_sys_ioprio_set(u32 which, u32 who, u32 ioprio)
-{
- /* sign extend which, who and ioprio */
- return sys_ioprio_set((int)which, (int)who, (int)ioprio);
-}
-
-/* Note: it is necessary to treat newmask as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_ssetmask(u32 newmask)
-{
- return sys_ssetmask((int) newmask);
-}
-
-asmlinkage long compat_sys_syslog(u32 type, char __user * buf, u32 len)
-{
- /* sign extend len */
- return sys_syslog(type, buf, (int)len);
-}
-
-
-/* Note: it is necessary to treat mask as an unsigned int,
- * with the corresponding cast to a signed int to insure that the
- * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
- * and the register representation of a signed int (msr in 64-bit mode) is performed.
- */
-asmlinkage long compat_sys_umask(u32 mask)
-{
- return sys_umask((int)mask);
-}
-
unsigned long compat_sys_mmap2(unsigned long addr, size_t len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
@@ -529,12 +69,6 @@ unsigned long compat_sys_mmap2(unsigned long addr, size_t len,
return sys_mmap(addr, len, prot, flags, fd, pgoff << 12);
}
-long compat_sys_tgkill(u32 tgid, u32 pid, int sig)
-{
- /* sign extend tgid, pid */
- return sys_tgkill((int)tgid, (int)pid, sig);
-}
-
/*
* long long munging:
* The 32 bit ABI passes long longs in an odd even register pair.
@@ -576,13 +110,6 @@ asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long h
return sys_ftruncate(fd, (high << 32) | low);
}
-long ppc32_lookup_dcookie(u32 cookie_high, u32 cookie_low, char __user *buf,
- size_t len)
-{
- return sys_lookup_dcookie((u64)cookie_high << 32 | cookie_low,
- buf, len);
-}
-
long ppc32_fadvise64(int fd, u32 unused, u32 offset_high, u32 offset_low,
size_t len, int advice)
{
@@ -590,23 +117,6 @@ long ppc32_fadvise64(int fd, u32 unused, u32 offset_high, u32 offset_low,
advice);
}
-asmlinkage long compat_sys_add_key(const char __user *_type,
- const char __user *_description,
- const void __user *_payload,
- u32 plen,
- u32 ringid)
-{
- return sys_add_key(_type, _description, _payload, plen, ringid);
-}
-
-asmlinkage long compat_sys_request_key(const char __user *_type,
- const char __user *_description,
- const char __user *_callout_info,
- u32 destringid)
-{
- return sys_request_key(_type, _description, _callout_info, destringid);
-}
-
asmlinkage long compat_sys_sync_file_range2(int fd, unsigned int flags,
unsigned offset_hi, unsigned offset_lo,
unsigned nbytes_hi, unsigned nbytes_lo)
@@ -616,11 +126,3 @@ asmlinkage long compat_sys_sync_file_range2(int fd, unsigned int flags,
return sys_sync_file_range(fd, offset, nbytes, flags);
}
-
-asmlinkage long compat_sys_fanotify_mark(int fanotify_fd, unsigned int flags,
- unsigned mask_hi, unsigned mask_lo,
- int dfd, const char __user *pathname)
-{
- u64 mask = ((u64)mask_hi << 32) | mask_lo;
- return sys_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
-}
diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
index f2496f2faec..cd9be9aa016 100644
--- a/arch/powerpc/kernel/syscalls.c
+++ b/arch/powerpc/kernel/syscalls.c
@@ -34,7 +34,6 @@
#include <linux/ipc.h>
#include <linux/utsname.h>
#include <linux/file.h>
-#include <linux/init.h>
#include <linux/personality.h>
#include <asm/uaccess.h>
@@ -107,11 +106,11 @@ long ppc64_personality(unsigned long personality)
long ret;
if (personality(current->personality) == PER_LINUX32
- && personality == PER_LINUX)
- personality = PER_LINUX32;
+ && personality(personality) == PER_LINUX)
+ personality = (personality & ~PER_MASK) | PER_LINUX32;
ret = sys_personality(personality);
- if (ret == PER_LINUX32)
- ret = PER_LINUX;
+ if (personality(ret) == PER_LINUX32)
+ ret = (ret & ~PER_MASK) | PER_LINUX;
return ret;
}
#endif
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index c0d8c2006bf..67fd2fd2620 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -1,10 +1,10 @@
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/sched.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/nodemask.h>
#include <linux/cpumask.h>
#include <linux/notifier.h>
@@ -12,12 +12,12 @@
#include <asm/current.h>
#include <asm/processor.h>
#include <asm/cputable.h>
-#include <asm/firmware.h>
#include <asm/hvcall.h>
#include <asm/prom.h>
#include <asm/machdep.h>
#include <asm/smp.h>
#include <asm/pmc.h>
+#include <asm/firmware.h>
#include "cacheinfo.h"
@@ -37,12 +37,12 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices);
/* Time in microseconds we delay before sleeping in the idle loop */
DEFINE_PER_CPU(long, smt_snooze_delay) = { 100 };
-static ssize_t store_smt_snooze_delay(struct sys_device *dev,
- struct sysdev_attribute *attr,
+static ssize_t store_smt_snooze_delay(struct device *dev,
+ struct device_attribute *attr,
const char *buf,
size_t count)
{
- struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+ struct cpu *cpu = container_of(dev, struct cpu, dev);
ssize_t ret;
long snooze;
@@ -50,21 +50,20 @@ static ssize_t store_smt_snooze_delay(struct sys_device *dev,
if (ret != 1)
return -EINVAL;
- per_cpu(smt_snooze_delay, cpu->sysdev.id) = snooze;
-
+ per_cpu(smt_snooze_delay, cpu->dev.id) = snooze;
return count;
}
-static ssize_t show_smt_snooze_delay(struct sys_device *dev,
- struct sysdev_attribute *attr,
+static ssize_t show_smt_snooze_delay(struct device *dev,
+ struct device_attribute *attr,
char *buf)
{
- struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+ struct cpu *cpu = container_of(dev, struct cpu, dev);
- return sprintf(buf, "%ld\n", per_cpu(smt_snooze_delay, cpu->sysdev.id));
+ return sprintf(buf, "%ld\n", per_cpu(smt_snooze_delay, cpu->dev.id));
}
-static SYSDEV_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
+static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
store_smt_snooze_delay);
static int __init setup_smt_snooze_delay(char *str)
@@ -85,6 +84,304 @@ __setup("smt-snooze-delay=", setup_smt_snooze_delay);
#endif /* CONFIG_PPC64 */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+#define MAX_BIT 63
+
+static u64 pw20_wt;
+static u64 altivec_idle_wt;
+
+static unsigned int get_idle_ticks_bit(u64 ns)
+{
+ u64 cycle;
+
+ if (ns >= 10000)
+ cycle = div_u64(ns + 500, 1000) * tb_ticks_per_usec;
+ else
+ cycle = div_u64(ns * tb_ticks_per_usec, 1000);
+
+ if (!cycle)
+ return 0;
+
+ return ilog2(cycle);
+}
+
+static void do_show_pwrmgtcr0(void *val)
+{
+ u32 *value = val;
+
+ *value = mfspr(SPRN_PWRMGTCR0);
+}
+
+static ssize_t show_pw20_state(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ u32 value;
+ unsigned int cpu = dev->id;
+
+ smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1);
+
+ value &= PWRMGTCR0_PW20_WAIT;
+
+ return sprintf(buf, "%u\n", value ? 1 : 0);
+}
+
+static void do_store_pw20_state(void *val)
+{
+ u32 *value = val;
+ u32 pw20_state;
+
+ pw20_state = mfspr(SPRN_PWRMGTCR0);
+
+ if (*value)
+ pw20_state |= PWRMGTCR0_PW20_WAIT;
+ else
+ pw20_state &= ~PWRMGTCR0_PW20_WAIT;
+
+ mtspr(SPRN_PWRMGTCR0, pw20_state);
+}
+
+static ssize_t store_pw20_state(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ u32 value;
+ unsigned int cpu = dev->id;
+
+ if (kstrtou32(buf, 0, &value))
+ return -EINVAL;
+
+ if (value > 1)
+ return -EINVAL;
+
+ smp_call_function_single(cpu, do_store_pw20_state, &value, 1);
+
+ return count;
+}
+
+static ssize_t show_pw20_wait_time(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ u32 value;
+ u64 tb_cycle = 1;
+ u64 time;
+
+ unsigned int cpu = dev->id;
+
+ if (!pw20_wt) {
+ smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1);
+ value = (value & PWRMGTCR0_PW20_ENT) >>
+ PWRMGTCR0_PW20_ENT_SHIFT;
+
+ tb_cycle = (tb_cycle << (MAX_BIT - value + 1));
+ /* convert ms to ns */
+ if (tb_ticks_per_usec > 1000) {
+ time = div_u64(tb_cycle, tb_ticks_per_usec / 1000);
+ } else {
+ u32 rem_us;
+
+ time = div_u64_rem(tb_cycle, tb_ticks_per_usec,
+ &rem_us);
+ time = time * 1000 + rem_us * 1000 / tb_ticks_per_usec;
+ }
+ } else {
+ time = pw20_wt;
+ }
+
+ return sprintf(buf, "%llu\n", time > 0 ? time : 0);
+}
+
+static void set_pw20_wait_entry_bit(void *val)
+{
+ u32 *value = val;
+ u32 pw20_idle;
+
+ pw20_idle = mfspr(SPRN_PWRMGTCR0);
+
+ /* Set Automatic PW20 Core Idle Count */
+ /* clear count */
+ pw20_idle &= ~PWRMGTCR0_PW20_ENT;
+
+ /* set count */
+ pw20_idle |= ((MAX_BIT - *value) << PWRMGTCR0_PW20_ENT_SHIFT);
+
+ mtspr(SPRN_PWRMGTCR0, pw20_idle);
+}
+
+static ssize_t store_pw20_wait_time(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ u32 entry_bit;
+ u64 value;
+
+ unsigned int cpu = dev->id;
+
+ if (kstrtou64(buf, 0, &value))
+ return -EINVAL;
+
+ if (!value)
+ return -EINVAL;
+
+ entry_bit = get_idle_ticks_bit(value);
+ if (entry_bit > MAX_BIT)
+ return -EINVAL;
+
+ pw20_wt = value;
+
+ smp_call_function_single(cpu, set_pw20_wait_entry_bit,
+ &entry_bit, 1);
+
+ return count;
+}
+
+static ssize_t show_altivec_idle(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ u32 value;
+ unsigned int cpu = dev->id;
+
+ smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1);
+
+ value &= PWRMGTCR0_AV_IDLE_PD_EN;
+
+ return sprintf(buf, "%u\n", value ? 1 : 0);
+}
+
+static void do_store_altivec_idle(void *val)
+{
+ u32 *value = val;
+ u32 altivec_idle;
+
+ altivec_idle = mfspr(SPRN_PWRMGTCR0);
+
+ if (*value)
+ altivec_idle |= PWRMGTCR0_AV_IDLE_PD_EN;
+ else
+ altivec_idle &= ~PWRMGTCR0_AV_IDLE_PD_EN;
+
+ mtspr(SPRN_PWRMGTCR0, altivec_idle);
+}
+
+static ssize_t store_altivec_idle(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ u32 value;
+ unsigned int cpu = dev->id;
+
+ if (kstrtou32(buf, 0, &value))
+ return -EINVAL;
+
+ if (value > 1)
+ return -EINVAL;
+
+ smp_call_function_single(cpu, do_store_altivec_idle, &value, 1);
+
+ return count;
+}
+
+static ssize_t show_altivec_idle_wait_time(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ u32 value;
+ u64 tb_cycle = 1;
+ u64 time;
+
+ unsigned int cpu = dev->id;
+
+ if (!altivec_idle_wt) {
+ smp_call_function_single(cpu, do_show_pwrmgtcr0, &value, 1);
+ value = (value & PWRMGTCR0_AV_IDLE_CNT) >>
+ PWRMGTCR0_AV_IDLE_CNT_SHIFT;
+
+ tb_cycle = (tb_cycle << (MAX_BIT - value + 1));
+ /* convert ms to ns */
+ if (tb_ticks_per_usec > 1000) {
+ time = div_u64(tb_cycle, tb_ticks_per_usec / 1000);
+ } else {
+ u32 rem_us;
+
+ time = div_u64_rem(tb_cycle, tb_ticks_per_usec,
+ &rem_us);
+ time = time * 1000 + rem_us * 1000 / tb_ticks_per_usec;
+ }
+ } else {
+ time = altivec_idle_wt;
+ }
+
+ return sprintf(buf, "%llu\n", time > 0 ? time : 0);
+}
+
+static void set_altivec_idle_wait_entry_bit(void *val)
+{
+ u32 *value = val;
+ u32 altivec_idle;
+
+ altivec_idle = mfspr(SPRN_PWRMGTCR0);
+
+ /* Set Automatic AltiVec Idle Count */
+ /* clear count */
+ altivec_idle &= ~PWRMGTCR0_AV_IDLE_CNT;
+
+ /* set count */
+ altivec_idle |= ((MAX_BIT - *value) << PWRMGTCR0_AV_IDLE_CNT_SHIFT);
+
+ mtspr(SPRN_PWRMGTCR0, altivec_idle);
+}
+
+static ssize_t store_altivec_idle_wait_time(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ u32 entry_bit;
+ u64 value;
+
+ unsigned int cpu = dev->id;
+
+ if (kstrtou64(buf, 0, &value))
+ return -EINVAL;
+
+ if (!value)
+ return -EINVAL;
+
+ entry_bit = get_idle_ticks_bit(value);
+ if (entry_bit > MAX_BIT)
+ return -EINVAL;
+
+ altivec_idle_wt = value;
+
+ smp_call_function_single(cpu, set_altivec_idle_wait_entry_bit,
+ &entry_bit, 1);
+
+ return count;
+}
+
+/*
+ * Enable/Disable interface:
+ * 0, disable. 1, enable.
+ */
+static DEVICE_ATTR(pw20_state, 0600, show_pw20_state, store_pw20_state);
+static DEVICE_ATTR(altivec_idle, 0600, show_altivec_idle, store_altivec_idle);
+
+/*
+ * Set wait time interface:(Nanosecond)
+ * Example: Base on TBfreq is 41MHZ.
+ * 1~48(ns): TB[63]
+ * 49~97(ns): TB[62]
+ * 98~195(ns): TB[61]
+ * 196~390(ns): TB[60]
+ * 391~780(ns): TB[59]
+ * 781~1560(ns): TB[58]
+ * ...
+ */
+static DEVICE_ATTR(pw20_wait_time, 0600,
+ show_pw20_wait_time,
+ store_pw20_wait_time);
+static DEVICE_ATTR(altivec_idle_wait_time, 0600,
+ show_altivec_idle_wait_time,
+ store_altivec_idle_wait_time);
+#endif
+
/*
* Enabling PMCs will slow partition context switch times so we only do
* it the first time we write to the PMCs.
@@ -107,38 +404,49 @@ void ppc_enable_pmcs(void)
}
EXPORT_SYMBOL(ppc_enable_pmcs);
-#define SYSFS_PMCSETUP(NAME, ADDRESS) \
+#define __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, EXTRA) \
static void read_##NAME(void *val) \
{ \
*(unsigned long *)val = mfspr(ADDRESS); \
} \
static void write_##NAME(void *val) \
{ \
- ppc_enable_pmcs(); \
+ EXTRA; \
mtspr(ADDRESS, *(unsigned long *)val); \
-} \
-static ssize_t show_##NAME(struct sys_device *dev, \
- struct sysdev_attribute *attr, \
+}
+
+#define __SYSFS_SPRSETUP_SHOW_STORE(NAME) \
+static ssize_t show_##NAME(struct device *dev, \
+ struct device_attribute *attr, \
char *buf) \
{ \
- struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
+ struct cpu *cpu = container_of(dev, struct cpu, dev); \
unsigned long val; \
- smp_call_function_single(cpu->sysdev.id, read_##NAME, &val, 1); \
+ smp_call_function_single(cpu->dev.id, read_##NAME, &val, 1); \
return sprintf(buf, "%lx\n", val); \
} \
static ssize_t __used \
- store_##NAME(struct sys_device *dev, struct sysdev_attribute *attr, \
+ store_##NAME(struct device *dev, struct device_attribute *attr, \
const char *buf, size_t count) \
{ \
- struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
+ struct cpu *cpu = container_of(dev, struct cpu, dev); \
unsigned long val; \
int ret = sscanf(buf, "%lx", &val); \
if (ret != 1) \
return -EINVAL; \
- smp_call_function_single(cpu->sysdev.id, write_##NAME, &val, 1); \
+ smp_call_function_single(cpu->dev.id, write_##NAME, &val, 1); \
return count; \
}
+#define SYSFS_PMCSETUP(NAME, ADDRESS) \
+ __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, ppc_enable_pmcs()) \
+ __SYSFS_SPRSETUP_SHOW_STORE(NAME)
+#define SYSFS_SPRSETUP(NAME, ADDRESS) \
+ __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, ) \
+ __SYSFS_SPRSETUP_SHOW_STORE(NAME)
+
+#define SYSFS_SPRSETUP_SHOW_STORE(NAME) \
+ __SYSFS_SPRSETUP_SHOW_STORE(NAME)
/* Let's define all possible registers, we'll only hook up the ones
* that are implemented on the current processor
@@ -174,14 +482,76 @@ SYSFS_PMCSETUP(pmc7, SPRN_PMC7);
SYSFS_PMCSETUP(pmc8, SPRN_PMC8);
SYSFS_PMCSETUP(mmcra, SPRN_MMCRA);
-SYSFS_PMCSETUP(purr, SPRN_PURR);
-SYSFS_PMCSETUP(spurr, SPRN_SPURR);
-SYSFS_PMCSETUP(dscr, SPRN_DSCR);
-
-static SYSDEV_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
-static SYSDEV_ATTR(spurr, 0600, show_spurr, NULL);
-static SYSDEV_ATTR(dscr, 0600, show_dscr, store_dscr);
-static SYSDEV_ATTR(purr, 0600, show_purr, store_purr);
+SYSFS_SPRSETUP(purr, SPRN_PURR);
+SYSFS_SPRSETUP(spurr, SPRN_SPURR);
+SYSFS_SPRSETUP(pir, SPRN_PIR);
+
+/*
+ Lets only enable read for phyp resources and
+ enable write when needed with a separate function.
+ Lets be conservative and default to pseries.
+*/
+static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
+static DEVICE_ATTR(spurr, 0400, show_spurr, NULL);
+static DEVICE_ATTR(purr, 0400, show_purr, store_purr);
+static DEVICE_ATTR(pir, 0400, show_pir, NULL);
+
+static unsigned long dscr_default;
+
+static void read_dscr(void *val)
+{
+ *(unsigned long *)val = get_paca()->dscr_default;
+}
+
+static void write_dscr(void *val)
+{
+ get_paca()->dscr_default = *(unsigned long *)val;
+ if (!current->thread.dscr_inherit) {
+ current->thread.dscr = *(unsigned long *)val;
+ mtspr(SPRN_DSCR, *(unsigned long *)val);
+ }
+}
+
+SYSFS_SPRSETUP_SHOW_STORE(dscr);
+static DEVICE_ATTR(dscr, 0600, show_dscr, store_dscr);
+
+static void add_write_permission_dev_attr(struct device_attribute *attr)
+{
+ attr->attr.mode |= 0200;
+}
+
+static ssize_t show_dscr_default(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lx\n", dscr_default);
+}
+
+static ssize_t __used store_dscr_default(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ unsigned long val;
+ int ret = 0;
+
+ ret = sscanf(buf, "%lx", &val);
+ if (ret != 1)
+ return -EINVAL;
+ dscr_default = val;
+
+ on_each_cpu(write_dscr, &val, 1);
+
+ return count;
+}
+
+static DEVICE_ATTR(dscr_default, 0600,
+ show_dscr_default, store_dscr_default);
+
+static void sysfs_create_dscr_default(void)
+{
+ int err = 0;
+ if (cpu_has_feature(CPU_FTR_DSCR))
+ err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default);
+}
#endif /* CONFIG_PPC64 */
#ifdef HAS_PPC_PMC_PA6T
@@ -192,120 +562,119 @@ SYSFS_PMCSETUP(pa6t_pmc3, SPRN_PA6T_PMC3);
SYSFS_PMCSETUP(pa6t_pmc4, SPRN_PA6T_PMC4);
SYSFS_PMCSETUP(pa6t_pmc5, SPRN_PA6T_PMC5);
#ifdef CONFIG_DEBUG_KERNEL
-SYSFS_PMCSETUP(hid0, SPRN_HID0);
-SYSFS_PMCSETUP(hid1, SPRN_HID1);
-SYSFS_PMCSETUP(hid4, SPRN_HID4);
-SYSFS_PMCSETUP(hid5, SPRN_HID5);
-SYSFS_PMCSETUP(ima0, SPRN_PA6T_IMA0);
-SYSFS_PMCSETUP(ima1, SPRN_PA6T_IMA1);
-SYSFS_PMCSETUP(ima2, SPRN_PA6T_IMA2);
-SYSFS_PMCSETUP(ima3, SPRN_PA6T_IMA3);
-SYSFS_PMCSETUP(ima4, SPRN_PA6T_IMA4);
-SYSFS_PMCSETUP(ima5, SPRN_PA6T_IMA5);
-SYSFS_PMCSETUP(ima6, SPRN_PA6T_IMA6);
-SYSFS_PMCSETUP(ima7, SPRN_PA6T_IMA7);
-SYSFS_PMCSETUP(ima8, SPRN_PA6T_IMA8);
-SYSFS_PMCSETUP(ima9, SPRN_PA6T_IMA9);
-SYSFS_PMCSETUP(imaat, SPRN_PA6T_IMAAT);
-SYSFS_PMCSETUP(btcr, SPRN_PA6T_BTCR);
-SYSFS_PMCSETUP(pccr, SPRN_PA6T_PCCR);
-SYSFS_PMCSETUP(rpccr, SPRN_PA6T_RPCCR);
-SYSFS_PMCSETUP(der, SPRN_PA6T_DER);
-SYSFS_PMCSETUP(mer, SPRN_PA6T_MER);
-SYSFS_PMCSETUP(ber, SPRN_PA6T_BER);
-SYSFS_PMCSETUP(ier, SPRN_PA6T_IER);
-SYSFS_PMCSETUP(sier, SPRN_PA6T_SIER);
-SYSFS_PMCSETUP(siar, SPRN_PA6T_SIAR);
-SYSFS_PMCSETUP(tsr0, SPRN_PA6T_TSR0);
-SYSFS_PMCSETUP(tsr1, SPRN_PA6T_TSR1);
-SYSFS_PMCSETUP(tsr2, SPRN_PA6T_TSR2);
-SYSFS_PMCSETUP(tsr3, SPRN_PA6T_TSR3);
+SYSFS_SPRSETUP(hid0, SPRN_HID0);
+SYSFS_SPRSETUP(hid1, SPRN_HID1);
+SYSFS_SPRSETUP(hid4, SPRN_HID4);
+SYSFS_SPRSETUP(hid5, SPRN_HID5);
+SYSFS_SPRSETUP(ima0, SPRN_PA6T_IMA0);
+SYSFS_SPRSETUP(ima1, SPRN_PA6T_IMA1);
+SYSFS_SPRSETUP(ima2, SPRN_PA6T_IMA2);
+SYSFS_SPRSETUP(ima3, SPRN_PA6T_IMA3);
+SYSFS_SPRSETUP(ima4, SPRN_PA6T_IMA4);
+SYSFS_SPRSETUP(ima5, SPRN_PA6T_IMA5);
+SYSFS_SPRSETUP(ima6, SPRN_PA6T_IMA6);
+SYSFS_SPRSETUP(ima7, SPRN_PA6T_IMA7);
+SYSFS_SPRSETUP(ima8, SPRN_PA6T_IMA8);
+SYSFS_SPRSETUP(ima9, SPRN_PA6T_IMA9);
+SYSFS_SPRSETUP(imaat, SPRN_PA6T_IMAAT);
+SYSFS_SPRSETUP(btcr, SPRN_PA6T_BTCR);
+SYSFS_SPRSETUP(pccr, SPRN_PA6T_PCCR);
+SYSFS_SPRSETUP(rpccr, SPRN_PA6T_RPCCR);
+SYSFS_SPRSETUP(der, SPRN_PA6T_DER);
+SYSFS_SPRSETUP(mer, SPRN_PA6T_MER);
+SYSFS_SPRSETUP(ber, SPRN_PA6T_BER);
+SYSFS_SPRSETUP(ier, SPRN_PA6T_IER);
+SYSFS_SPRSETUP(sier, SPRN_PA6T_SIER);
+SYSFS_SPRSETUP(siar, SPRN_PA6T_SIAR);
+SYSFS_SPRSETUP(tsr0, SPRN_PA6T_TSR0);
+SYSFS_SPRSETUP(tsr1, SPRN_PA6T_TSR1);
+SYSFS_SPRSETUP(tsr2, SPRN_PA6T_TSR2);
+SYSFS_SPRSETUP(tsr3, SPRN_PA6T_TSR3);
#endif /* CONFIG_DEBUG_KERNEL */
#endif /* HAS_PPC_PMC_PA6T */
#ifdef HAS_PPC_PMC_IBM
-static struct sysdev_attribute ibm_common_attrs[] = {
- _SYSDEV_ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
- _SYSDEV_ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
+static struct device_attribute ibm_common_attrs[] = {
+ __ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
+ __ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
};
#endif /* HAS_PPC_PMC_G4 */
#ifdef HAS_PPC_PMC_G4
-static struct sysdev_attribute g4_common_attrs[] = {
- _SYSDEV_ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
- _SYSDEV_ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
- _SYSDEV_ATTR(mmcr2, 0600, show_mmcr2, store_mmcr2),
+static struct device_attribute g4_common_attrs[] = {
+ __ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
+ __ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
+ __ATTR(mmcr2, 0600, show_mmcr2, store_mmcr2),
};
#endif /* HAS_PPC_PMC_G4 */
-static struct sysdev_attribute classic_pmc_attrs[] = {
- _SYSDEV_ATTR(pmc1, 0600, show_pmc1, store_pmc1),
- _SYSDEV_ATTR(pmc2, 0600, show_pmc2, store_pmc2),
- _SYSDEV_ATTR(pmc3, 0600, show_pmc3, store_pmc3),
- _SYSDEV_ATTR(pmc4, 0600, show_pmc4, store_pmc4),
- _SYSDEV_ATTR(pmc5, 0600, show_pmc5, store_pmc5),
- _SYSDEV_ATTR(pmc6, 0600, show_pmc6, store_pmc6),
+static struct device_attribute classic_pmc_attrs[] = {
+ __ATTR(pmc1, 0600, show_pmc1, store_pmc1),
+ __ATTR(pmc2, 0600, show_pmc2, store_pmc2),
+ __ATTR(pmc3, 0600, show_pmc3, store_pmc3),
+ __ATTR(pmc4, 0600, show_pmc4, store_pmc4),
+ __ATTR(pmc5, 0600, show_pmc5, store_pmc5),
+ __ATTR(pmc6, 0600, show_pmc6, store_pmc6),
#ifdef CONFIG_PPC64
- _SYSDEV_ATTR(pmc7, 0600, show_pmc7, store_pmc7),
- _SYSDEV_ATTR(pmc8, 0600, show_pmc8, store_pmc8),
+ __ATTR(pmc7, 0600, show_pmc7, store_pmc7),
+ __ATTR(pmc8, 0600, show_pmc8, store_pmc8),
#endif
};
#ifdef HAS_PPC_PMC_PA6T
-static struct sysdev_attribute pa6t_attrs[] = {
- _SYSDEV_ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
- _SYSDEV_ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
- _SYSDEV_ATTR(pmc0, 0600, show_pa6t_pmc0, store_pa6t_pmc0),
- _SYSDEV_ATTR(pmc1, 0600, show_pa6t_pmc1, store_pa6t_pmc1),
- _SYSDEV_ATTR(pmc2, 0600, show_pa6t_pmc2, store_pa6t_pmc2),
- _SYSDEV_ATTR(pmc3, 0600, show_pa6t_pmc3, store_pa6t_pmc3),
- _SYSDEV_ATTR(pmc4, 0600, show_pa6t_pmc4, store_pa6t_pmc4),
- _SYSDEV_ATTR(pmc5, 0600, show_pa6t_pmc5, store_pa6t_pmc5),
+static struct device_attribute pa6t_attrs[] = {
+ __ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0),
+ __ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1),
+ __ATTR(pmc0, 0600, show_pa6t_pmc0, store_pa6t_pmc0),
+ __ATTR(pmc1, 0600, show_pa6t_pmc1, store_pa6t_pmc1),
+ __ATTR(pmc2, 0600, show_pa6t_pmc2, store_pa6t_pmc2),
+ __ATTR(pmc3, 0600, show_pa6t_pmc3, store_pa6t_pmc3),
+ __ATTR(pmc4, 0600, show_pa6t_pmc4, store_pa6t_pmc4),
+ __ATTR(pmc5, 0600, show_pa6t_pmc5, store_pa6t_pmc5),
#ifdef CONFIG_DEBUG_KERNEL
- _SYSDEV_ATTR(hid0, 0600, show_hid0, store_hid0),
- _SYSDEV_ATTR(hid1, 0600, show_hid1, store_hid1),
- _SYSDEV_ATTR(hid4, 0600, show_hid4, store_hid4),
- _SYSDEV_ATTR(hid5, 0600, show_hid5, store_hid5),
- _SYSDEV_ATTR(ima0, 0600, show_ima0, store_ima0),
- _SYSDEV_ATTR(ima1, 0600, show_ima1, store_ima1),
- _SYSDEV_ATTR(ima2, 0600, show_ima2, store_ima2),
- _SYSDEV_ATTR(ima3, 0600, show_ima3, store_ima3),
- _SYSDEV_ATTR(ima4, 0600, show_ima4, store_ima4),
- _SYSDEV_ATTR(ima5, 0600, show_ima5, store_ima5),
- _SYSDEV_ATTR(ima6, 0600, show_ima6, store_ima6),
- _SYSDEV_ATTR(ima7, 0600, show_ima7, store_ima7),
- _SYSDEV_ATTR(ima8, 0600, show_ima8, store_ima8),
- _SYSDEV_ATTR(ima9, 0600, show_ima9, store_ima9),
- _SYSDEV_ATTR(imaat, 0600, show_imaat, store_imaat),
- _SYSDEV_ATTR(btcr, 0600, show_btcr, store_btcr),
- _SYSDEV_ATTR(pccr, 0600, show_pccr, store_pccr),
- _SYSDEV_ATTR(rpccr, 0600, show_rpccr, store_rpccr),
- _SYSDEV_ATTR(der, 0600, show_der, store_der),
- _SYSDEV_ATTR(mer, 0600, show_mer, store_mer),
- _SYSDEV_ATTR(ber, 0600, show_ber, store_ber),
- _SYSDEV_ATTR(ier, 0600, show_ier, store_ier),
- _SYSDEV_ATTR(sier, 0600, show_sier, store_sier),
- _SYSDEV_ATTR(siar, 0600, show_siar, store_siar),
- _SYSDEV_ATTR(tsr0, 0600, show_tsr0, store_tsr0),
- _SYSDEV_ATTR(tsr1, 0600, show_tsr1, store_tsr1),
- _SYSDEV_ATTR(tsr2, 0600, show_tsr2, store_tsr2),
- _SYSDEV_ATTR(tsr3, 0600, show_tsr3, store_tsr3),
+ __ATTR(hid0, 0600, show_hid0, store_hid0),
+ __ATTR(hid1, 0600, show_hid1, store_hid1),
+ __ATTR(hid4, 0600, show_hid4, store_hid4),
+ __ATTR(hid5, 0600, show_hid5, store_hid5),
+ __ATTR(ima0, 0600, show_ima0, store_ima0),
+ __ATTR(ima1, 0600, show_ima1, store_ima1),
+ __ATTR(ima2, 0600, show_ima2, store_ima2),
+ __ATTR(ima3, 0600, show_ima3, store_ima3),
+ __ATTR(ima4, 0600, show_ima4, store_ima4),
+ __ATTR(ima5, 0600, show_ima5, store_ima5),
+ __ATTR(ima6, 0600, show_ima6, store_ima6),
+ __ATTR(ima7, 0600, show_ima7, store_ima7),
+ __ATTR(ima8, 0600, show_ima8, store_ima8),
+ __ATTR(ima9, 0600, show_ima9, store_ima9),
+ __ATTR(imaat, 0600, show_imaat, store_imaat),
+ __ATTR(btcr, 0600, show_btcr, store_btcr),
+ __ATTR(pccr, 0600, show_pccr, store_pccr),
+ __ATTR(rpccr, 0600, show_rpccr, store_rpccr),
+ __ATTR(der, 0600, show_der, store_der),
+ __ATTR(mer, 0600, show_mer, store_mer),
+ __ATTR(ber, 0600, show_ber, store_ber),
+ __ATTR(ier, 0600, show_ier, store_ier),
+ __ATTR(sier, 0600, show_sier, store_sier),
+ __ATTR(siar, 0600, show_siar, store_siar),
+ __ATTR(tsr0, 0600, show_tsr0, store_tsr0),
+ __ATTR(tsr1, 0600, show_tsr1, store_tsr1),
+ __ATTR(tsr2, 0600, show_tsr2, store_tsr2),
+ __ATTR(tsr3, 0600, show_tsr3, store_tsr3),
#endif /* CONFIG_DEBUG_KERNEL */
};
#endif /* HAS_PPC_PMC_PA6T */
#endif /* HAS_PPC_PMC_CLASSIC */
-static void __cpuinit register_cpu_online(unsigned int cpu)
+static void register_cpu_online(unsigned int cpu)
{
struct cpu *c = &per_cpu(cpu_devices, cpu);
- struct sys_device *s = &c->sysdev;
- struct sysdev_attribute *attrs, *pmc_attrs;
+ struct device *s = &c->dev;
+ struct device_attribute *attrs, *pmc_attrs;
int i, nattrs;
#ifdef CONFIG_PPC64
- if (!firmware_has_feature(FW_FEATURE_ISERIES) &&
- cpu_has_feature(CPU_FTR_SMT))
- sysdev_create_file(s, &attr_smt_snooze_delay);
+ if (cpu_has_feature(CPU_FTR_SMT))
+ device_create_file(s, &dev_attr_smt_snooze_delay);
#endif
/* PMC stuff */
@@ -313,14 +682,14 @@ static void __cpuinit register_cpu_online(unsigned int cpu)
#ifdef HAS_PPC_PMC_IBM
case PPC_PMC_IBM:
attrs = ibm_common_attrs;
- nattrs = sizeof(ibm_common_attrs) / sizeof(struct sysdev_attribute);
+ nattrs = sizeof(ibm_common_attrs) / sizeof(struct device_attribute);
pmc_attrs = classic_pmc_attrs;
break;
#endif /* HAS_PPC_PMC_IBM */
#ifdef HAS_PPC_PMC_G4
case PPC_PMC_G4:
attrs = g4_common_attrs;
- nattrs = sizeof(g4_common_attrs) / sizeof(struct sysdev_attribute);
+ nattrs = sizeof(g4_common_attrs) / sizeof(struct device_attribute);
pmc_attrs = classic_pmc_attrs;
break;
#endif /* HAS_PPC_PMC_G4 */
@@ -328,7 +697,7 @@ static void __cpuinit register_cpu_online(unsigned int cpu)
case PPC_PMC_PA6T:
/* PA Semi starts counting at PMC0 */
attrs = pa6t_attrs;
- nattrs = sizeof(pa6t_attrs) / sizeof(struct sysdev_attribute);
+ nattrs = sizeof(pa6t_attrs) / sizeof(struct device_attribute);
pmc_attrs = NULL;
break;
#endif /* HAS_PPC_PMC_PA6T */
@@ -339,26 +708,41 @@ static void __cpuinit register_cpu_online(unsigned int cpu)
}
for (i = 0; i < nattrs; i++)
- sysdev_create_file(s, &attrs[i]);
+ device_create_file(s, &attrs[i]);
if (pmc_attrs)
for (i = 0; i < cur_cpu_spec->num_pmcs; i++)
- sysdev_create_file(s, &pmc_attrs[i]);
+ device_create_file(s, &pmc_attrs[i]);
#ifdef CONFIG_PPC64
if (cpu_has_feature(CPU_FTR_MMCRA))
- sysdev_create_file(s, &attr_mmcra);
+ device_create_file(s, &dev_attr_mmcra);
- if (cpu_has_feature(CPU_FTR_PURR))
- sysdev_create_file(s, &attr_purr);
+ if (cpu_has_feature(CPU_FTR_PURR)) {
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ add_write_permission_dev_attr(&dev_attr_purr);
+ device_create_file(s, &dev_attr_purr);
+ }
if (cpu_has_feature(CPU_FTR_SPURR))
- sysdev_create_file(s, &attr_spurr);
+ device_create_file(s, &dev_attr_spurr);
if (cpu_has_feature(CPU_FTR_DSCR))
- sysdev_create_file(s, &attr_dscr);
+ device_create_file(s, &dev_attr_dscr);
+
+ if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2))
+ device_create_file(s, &dev_attr_pir);
#endif /* CONFIG_PPC64 */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ if (PVR_VER(cur_cpu_spec->pvr_value) == PVR_VER_E6500) {
+ device_create_file(s, &dev_attr_pw20_state);
+ device_create_file(s, &dev_attr_pw20_wait_time);
+
+ device_create_file(s, &dev_attr_altivec_idle);
+ device_create_file(s, &dev_attr_altivec_idle_wait_time);
+ }
+#endif
cacheinfo_cpu_online(cpu);
}
@@ -366,16 +750,15 @@ static void __cpuinit register_cpu_online(unsigned int cpu)
static void unregister_cpu_online(unsigned int cpu)
{
struct cpu *c = &per_cpu(cpu_devices, cpu);
- struct sys_device *s = &c->sysdev;
- struct sysdev_attribute *attrs, *pmc_attrs;
+ struct device *s = &c->dev;
+ struct device_attribute *attrs, *pmc_attrs;
int i, nattrs;
BUG_ON(!c->hotpluggable);
#ifdef CONFIG_PPC64
- if (!firmware_has_feature(FW_FEATURE_ISERIES) &&
- cpu_has_feature(CPU_FTR_SMT))
- sysdev_remove_file(s, &attr_smt_snooze_delay);
+ if (cpu_has_feature(CPU_FTR_SMT))
+ device_remove_file(s, &dev_attr_smt_snooze_delay);
#endif
/* PMC stuff */
@@ -383,14 +766,14 @@ static void unregister_cpu_online(unsigned int cpu)
#ifdef HAS_PPC_PMC_IBM
case PPC_PMC_IBM:
attrs = ibm_common_attrs;
- nattrs = sizeof(ibm_common_attrs) / sizeof(struct sysdev_attribute);
+ nattrs = sizeof(ibm_common_attrs) / sizeof(struct device_attribute);
pmc_attrs = classic_pmc_attrs;
break;
#endif /* HAS_PPC_PMC_IBM */
#ifdef HAS_PPC_PMC_G4
case PPC_PMC_G4:
attrs = g4_common_attrs;
- nattrs = sizeof(g4_common_attrs) / sizeof(struct sysdev_attribute);
+ nattrs = sizeof(g4_common_attrs) / sizeof(struct device_attribute);
pmc_attrs = classic_pmc_attrs;
break;
#endif /* HAS_PPC_PMC_G4 */
@@ -398,7 +781,7 @@ static void unregister_cpu_online(unsigned int cpu)
case PPC_PMC_PA6T:
/* PA Semi starts counting at PMC0 */
attrs = pa6t_attrs;
- nattrs = sizeof(pa6t_attrs) / sizeof(struct sysdev_attribute);
+ nattrs = sizeof(pa6t_attrs) / sizeof(struct device_attribute);
pmc_attrs = NULL;
break;
#endif /* HAS_PPC_PMC_PA6T */
@@ -409,26 +792,38 @@ static void unregister_cpu_online(unsigned int cpu)
}
for (i = 0; i < nattrs; i++)
- sysdev_remove_file(s, &attrs[i]);
+ device_remove_file(s, &attrs[i]);
if (pmc_attrs)
for (i = 0; i < cur_cpu_spec->num_pmcs; i++)
- sysdev_remove_file(s, &pmc_attrs[i]);
+ device_remove_file(s, &pmc_attrs[i]);
#ifdef CONFIG_PPC64
if (cpu_has_feature(CPU_FTR_MMCRA))
- sysdev_remove_file(s, &attr_mmcra);
+ device_remove_file(s, &dev_attr_mmcra);
if (cpu_has_feature(CPU_FTR_PURR))
- sysdev_remove_file(s, &attr_purr);
+ device_remove_file(s, &dev_attr_purr);
if (cpu_has_feature(CPU_FTR_SPURR))
- sysdev_remove_file(s, &attr_spurr);
+ device_remove_file(s, &dev_attr_spurr);
if (cpu_has_feature(CPU_FTR_DSCR))
- sysdev_remove_file(s, &attr_dscr);
+ device_remove_file(s, &dev_attr_dscr);
+
+ if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2))
+ device_remove_file(s, &dev_attr_pir);
#endif /* CONFIG_PPC64 */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+ if (PVR_VER(cur_cpu_spec->pvr_value) == PVR_VER_E6500) {
+ device_remove_file(s, &dev_attr_pw20_state);
+ device_remove_file(s, &dev_attr_pw20_wait_time);
+
+ device_remove_file(s, &dev_attr_altivec_idle);
+ device_remove_file(s, &dev_attr_altivec_idle_wait_time);
+ }
+#endif
cacheinfo_cpu_offline(cpu);
}
@@ -452,7 +847,7 @@ ssize_t arch_cpu_release(const char *buf, size_t count)
#endif /* CONFIG_HOTPLUG_CPU */
-static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
+static int sysfs_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned int)(long)hcpu;
@@ -472,76 +867,76 @@ static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata sysfs_cpu_nb = {
+static struct notifier_block sysfs_cpu_nb = {
.notifier_call = sysfs_cpu_notify,
};
static DEFINE_MUTEX(cpu_mutex);
-int cpu_add_sysdev_attr(struct sysdev_attribute *attr)
+int cpu_add_dev_attr(struct device_attribute *attr)
{
int cpu;
mutex_lock(&cpu_mutex);
for_each_possible_cpu(cpu) {
- sysdev_create_file(get_cpu_sysdev(cpu), attr);
+ device_create_file(get_cpu_device(cpu), attr);
}
mutex_unlock(&cpu_mutex);
return 0;
}
-EXPORT_SYMBOL_GPL(cpu_add_sysdev_attr);
+EXPORT_SYMBOL_GPL(cpu_add_dev_attr);
-int cpu_add_sysdev_attr_group(struct attribute_group *attrs)
+int cpu_add_dev_attr_group(struct attribute_group *attrs)
{
int cpu;
- struct sys_device *sysdev;
+ struct device *dev;
int ret;
mutex_lock(&cpu_mutex);
for_each_possible_cpu(cpu) {
- sysdev = get_cpu_sysdev(cpu);
- ret = sysfs_create_group(&sysdev->kobj, attrs);
+ dev = get_cpu_device(cpu);
+ ret = sysfs_create_group(&dev->kobj, attrs);
WARN_ON(ret != 0);
}
mutex_unlock(&cpu_mutex);
return 0;
}
-EXPORT_SYMBOL_GPL(cpu_add_sysdev_attr_group);
+EXPORT_SYMBOL_GPL(cpu_add_dev_attr_group);
-void cpu_remove_sysdev_attr(struct sysdev_attribute *attr)
+void cpu_remove_dev_attr(struct device_attribute *attr)
{
int cpu;
mutex_lock(&cpu_mutex);
for_each_possible_cpu(cpu) {
- sysdev_remove_file(get_cpu_sysdev(cpu), attr);
+ device_remove_file(get_cpu_device(cpu), attr);
}
mutex_unlock(&cpu_mutex);
}
-EXPORT_SYMBOL_GPL(cpu_remove_sysdev_attr);
+EXPORT_SYMBOL_GPL(cpu_remove_dev_attr);
-void cpu_remove_sysdev_attr_group(struct attribute_group *attrs)
+void cpu_remove_dev_attr_group(struct attribute_group *attrs)
{
int cpu;
- struct sys_device *sysdev;
+ struct device *dev;
mutex_lock(&cpu_mutex);
for_each_possible_cpu(cpu) {
- sysdev = get_cpu_sysdev(cpu);
- sysfs_remove_group(&sysdev->kobj, attrs);
+ dev = get_cpu_device(cpu);
+ sysfs_remove_group(&dev->kobj, attrs);
}
mutex_unlock(&cpu_mutex);
}
-EXPORT_SYMBOL_GPL(cpu_remove_sysdev_attr_group);
+EXPORT_SYMBOL_GPL(cpu_remove_dev_attr_group);
/* NUMA stuff */
@@ -555,18 +950,18 @@ static void register_nodes(void)
register_one_node(i);
}
-int sysfs_add_device_to_node(struct sys_device *dev, int nid)
+int sysfs_add_device_to_node(struct device *dev, int nid)
{
- struct node *node = &node_devices[nid];
- return sysfs_create_link(&node->sysdev.kobj, &dev->kobj,
+ struct node *node = node_devices[nid];
+ return sysfs_create_link(&node->dev.kobj, &dev->kobj,
kobject_name(&dev->kobj));
}
EXPORT_SYMBOL_GPL(sysfs_add_device_to_node);
-void sysfs_remove_device_from_node(struct sys_device *dev, int nid)
+void sysfs_remove_device_from_node(struct device *dev, int nid)
{
- struct node *node = &node_devices[nid];
- sysfs_remove_link(&node->sysdev.kobj, kobject_name(&dev->kobj));
+ struct node *node = node_devices[nid];
+ sysfs_remove_link(&node->dev.kobj, kobject_name(&dev->kobj));
}
EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node);
@@ -579,21 +974,22 @@ static void register_nodes(void)
#endif
/* Only valid if CPU is present. */
-static ssize_t show_physical_id(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+static ssize_t show_physical_id(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+ struct cpu *cpu = container_of(dev, struct cpu, dev);
- return sprintf(buf, "%d\n", get_hard_smp_processor_id(cpu->sysdev.id));
+ return sprintf(buf, "%d\n", get_hard_smp_processor_id(cpu->dev.id));
}
-static SYSDEV_ATTR(physical_id, 0444, show_physical_id, NULL);
+static DEVICE_ATTR(physical_id, 0444, show_physical_id, NULL);
static int __init topology_init(void)
{
int cpu;
register_nodes();
- register_cpu_notifier(&sysfs_cpu_nb);
+
+ cpu_notifier_register_begin();
for_each_possible_cpu(cpu) {
struct cpu *c = &per_cpu(cpu_devices, cpu);
@@ -611,13 +1007,21 @@ static int __init topology_init(void)
if (cpu_online(cpu) || c->hotpluggable) {
register_cpu(c, cpu);
- sysdev_create_file(&c->sysdev, &attr_physical_id);
+ device_create_file(&c->dev, &dev_attr_physical_id);
}
if (cpu_online(cpu))
register_cpu_online(cpu);
}
+ __register_cpu_notifier(&sysfs_cpu_nb);
+
+ cpu_notifier_register_done();
+
+#ifdef CONFIG_PPC64
+ sysfs_create_dscr_default();
+#endif /* CONFIG_PPC64 */
+
return 0;
}
subsys_initcall(topology_init);
diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
index 93219c34af3..895c50ca943 100644
--- a/arch/powerpc/kernel/systbl.S
+++ b/arch/powerpc/kernel/systbl.S
@@ -17,12 +17,12 @@
#include <asm/ppc_asm.h>
#ifdef CONFIG_PPC64
-#define SYSCALL(func) .llong .sys_##func,.sys_##func
-#define COMPAT_SYS(func) .llong .sys_##func,.compat_sys_##func
-#define PPC_SYS(func) .llong .ppc_##func,.ppc_##func
-#define OLDSYS(func) .llong .sys_ni_syscall,.sys_ni_syscall
-#define SYS32ONLY(func) .llong .sys_ni_syscall,.compat_sys_##func
-#define SYSX(f, f3264, f32) .llong .f,.f3264
+#define SYSCALL(func) .llong DOTSYM(sys_##func),DOTSYM(sys_##func)
+#define COMPAT_SYS(func) .llong DOTSYM(sys_##func),DOTSYM(compat_sys_##func)
+#define PPC_SYS(func) .llong DOTSYM(ppc_##func),DOTSYM(ppc_##func)
+#define OLDSYS(func) .llong DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall)
+#define SYS32ONLY(func) .llong DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func)
+#define SYSX(f, f3264, f32) .llong DOTSYM(f),DOTSYM(f3264)
#else
#define SYSCALL(func) .long sys_##func
#define COMPAT_SYS(func) .long sys_##func
@@ -36,6 +36,8 @@
#define PPC_SYS_SPU(func) PPC_SYS(func)
#define SYSX_SPU(f, f3264, f32) SYSX(f, f3264, f32)
+.section .rodata,"a"
+
#ifdef CONFIG_PPC64
#define sys_sigpending sys_ni_syscall
#define sys_old_getrlimit sys_ni_syscall
@@ -43,5 +45,7 @@
.p2align 3
#endif
-_GLOBAL(sys_call_table)
+.globl sys_call_table
+sys_call_table:
+
#include <asm/systbl.h>
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 010406958d9..9fff9cdcc51 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -17,8 +17,7 @@
*
* TODO (not necessarily in this file):
* - improve precision and reproducibility of timebase frequency
- * measurement at boot time. (for iSeries, we calibrate the timebase
- * against the Titan chip's clock.)
+ * measurement at boot time.
* - for astronomical applications: add a new function to get
* non ambiguous timestamps even around leap seconds. This needs
* a new timestamp format and a good name.
@@ -33,7 +32,7 @@
*/
#include <linux/errno.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/param.h>
@@ -43,6 +42,7 @@
#include <linux/timex.h>
#include <linux/kernel_stat.h>
#include <linux/time.h>
+#include <linux/clockchips.h>
#include <linux/init.h>
#include <linux/profile.h>
#include <linux/cpu.h>
@@ -70,15 +70,11 @@
#include <asm/vdso_datapage.h>
#include <asm/firmware.h>
#include <asm/cputime.h>
-#ifdef CONFIG_PPC_ISERIES
-#include <asm/iseries/it_lp_queue.h>
-#include <asm/iseries/hv_call_xm.h>
-#endif
/* powerpc clocksource/clockevent code */
#include <linux/clockchips.h>
-#include <linux/clocksource.h>
+#include <linux/timekeeper_internal.h>
static cycle_t rtc_read(struct clocksource *);
static struct clocksource clocksource_rtc = {
@@ -86,8 +82,6 @@ static struct clocksource clocksource_rtc = {
.rating = 400,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.mask = CLOCKSOURCE_MASK(64),
- .shift = 22,
- .mult = 0, /* To be filled in */
.read = rtc_read,
};
@@ -97,8 +91,6 @@ static struct clocksource clocksource_timebase = {
.rating = 400,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.mask = CLOCKSOURCE_MASK(64),
- .shift = 22,
- .mult = 0, /* To be filled in */
.read = timebase_read,
};
@@ -109,31 +101,18 @@ static int decrementer_set_next_event(unsigned long evt,
static void decrementer_set_mode(enum clock_event_mode mode,
struct clock_event_device *dev);
-static struct clock_event_device decrementer_clockevent = {
- .name = "decrementer",
- .rating = 200,
- .shift = 0, /* To be filled in */
- .mult = 0, /* To be filled in */
- .irq = 0,
- .set_next_event = decrementer_set_next_event,
- .set_mode = decrementer_set_mode,
- .features = CLOCK_EVT_FEAT_ONESHOT,
-};
-
-struct decrementer_clock {
- struct clock_event_device event;
- u64 next_tb;
+struct clock_event_device decrementer_clockevent = {
+ .name = "decrementer",
+ .rating = 200,
+ .irq = 0,
+ .set_next_event = decrementer_set_next_event,
+ .set_mode = decrementer_set_mode,
+ .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP,
};
+EXPORT_SYMBOL(decrementer_clockevent);
-static DEFINE_PER_CPU(struct decrementer_clock, decrementers);
-
-#ifdef CONFIG_PPC_ISERIES
-static unsigned long __initdata iSeries_recal_titan;
-static signed long __initdata iSeries_recal_tb;
-
-/* Forward declaration is only needed for iSereis compiles */
-static void __init clocksource_init(void);
-#endif
+DEFINE_PER_CPU(u64, decrementers_next_tb);
+static DEFINE_PER_CPU(struct clock_event_device, decrementers);
#define XSEC_PER_SEC (1024*1024)
@@ -155,7 +134,7 @@ EXPORT_SYMBOL_GPL(rtc_lock);
static u64 tb_to_ns_scale __read_mostly;
static unsigned tb_to_ns_shift __read_mostly;
-static unsigned long boot_tb __read_mostly;
+static u64 boot_tb __read_mostly;
extern struct timezone sys_tz;
static long timezone_offset;
@@ -165,16 +144,16 @@ EXPORT_SYMBOL_GPL(ppc_proc_freq);
unsigned long ppc_tb_freq;
EXPORT_SYMBOL_GPL(ppc_tb_freq);
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Factors for converting from cputime_t (timebase ticks) to
- * jiffies, milliseconds, seconds, and clock_t (1/USER_HZ seconds).
+ * jiffies, microseconds, seconds, and clock_t (1/USER_HZ seconds).
* These are all stored as 0.64 fixed-point binary fractions.
*/
u64 __cputime_jiffies_factor;
EXPORT_SYMBOL(__cputime_jiffies_factor);
-u64 __cputime_msec_factor;
-EXPORT_SYMBOL(__cputime_msec_factor);
+u64 __cputime_usec_factor;
+EXPORT_SYMBOL(__cputime_usec_factor);
u64 __cputime_sec_factor;
EXPORT_SYMBOL(__cputime_sec_factor);
u64 __cputime_clockt_factor;
@@ -192,8 +171,8 @@ static void calc_cputime_factors(void)
div128_by_32(HZ, 0, tb_ticks_per_sec, &res);
__cputime_jiffies_factor = res.result_low;
- div128_by_32(1000, 0, tb_ticks_per_sec, &res);
- __cputime_msec_factor = res.result_low;
+ div128_by_32(1000000, 0, tb_ticks_per_sec, &res);
+ __cputime_usec_factor = res.result_low;
div128_by_32(1, 0, tb_ticks_per_sec, &res);
__cputime_sec_factor = res.result_low;
div128_by_32(USER_HZ, 0, tb_ticks_per_sec, &res);
@@ -229,23 +208,26 @@ static u64 scan_dispatch_log(u64 stop_tb)
u64 stolen = 0;
u64 dtb;
- if (i == vpa->dtl_idx)
+ if (!dtl)
return 0;
- while (i < vpa->dtl_idx) {
- if (dtl_consumer)
- dtl_consumer(dtl, i);
- dtb = dtl->timebase;
- tb_delta = dtl->enqueue_to_dispatch_time +
- dtl->ready_to_enqueue_time;
+
+ if (i == be64_to_cpu(vpa->dtl_idx))
+ return 0;
+ while (i < be64_to_cpu(vpa->dtl_idx)) {
+ dtb = be64_to_cpu(dtl->timebase);
+ tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) +
+ be32_to_cpu(dtl->ready_to_enqueue_time);
barrier();
- if (i + N_DISPATCH_LOG < vpa->dtl_idx) {
+ if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
/* buffer has overflowed */
- i = vpa->dtl_idx - N_DISPATCH_LOG;
+ i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
continue;
}
if (dtb > stop_tb)
break;
+ if (dtl_consumer)
+ dtl_consumer(dtl, i);
stolen += tb_delta;
++i;
++dtl;
@@ -265,18 +247,30 @@ void accumulate_stolen_time(void)
{
u64 sst, ust;
- sst = scan_dispatch_log(get_paca()->starttime_user);
- ust = scan_dispatch_log(get_paca()->starttime);
- get_paca()->system_time -= sst;
- get_paca()->user_time -= ust;
- get_paca()->stolen_time += ust + sst;
+ u8 save_soft_enabled = local_paca->soft_enabled;
+
+ /* We are called early in the exception entry, before
+ * soft/hard_enabled are sync'ed to the expected state
+ * for the exception. We are hard disabled but the PACA
+ * needs to reflect that so various debug stuff doesn't
+ * complain
+ */
+ local_paca->soft_enabled = 0;
+
+ sst = scan_dispatch_log(local_paca->starttime_user);
+ ust = scan_dispatch_log(local_paca->starttime);
+ local_paca->system_time -= sst;
+ local_paca->user_time -= ust;
+ local_paca->stolen_time += ust + sst;
+
+ local_paca->soft_enabled = save_soft_enabled;
}
static inline u64 calculate_stolen_time(u64 stop_tb)
{
u64 stolen = 0;
- if (get_paca()->dtl_ridx != get_paca()->lppaca_ptr->dtl_idx) {
+ if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) {
stolen = scan_dispatch_log(stop_tb);
get_paca()->system_time -= stolen;
}
@@ -298,13 +292,14 @@ static inline u64 calculate_stolen_time(u64 stop_tb)
* Account time for a transition between system, hard irq
* or soft irq state.
*/
-void account_system_vtime(struct task_struct *tsk)
+static u64 vtime_delta(struct task_struct *tsk,
+ u64 *sys_scaled, u64 *stolen)
{
- u64 now, nowscaled, delta, deltascaled;
- unsigned long flags;
- u64 stolen, udelta, sys_scaled, user_scaled;
+ u64 now, nowscaled, deltascaled;
+ u64 udelta, delta, user_scaled;
+
+ WARN_ON_ONCE(!irqs_disabled());
- local_irq_save(flags);
now = mftb();
nowscaled = read_spurr(now);
get_paca()->system_time += now - get_paca()->starttime;
@@ -312,7 +307,7 @@ void account_system_vtime(struct task_struct *tsk)
deltascaled = nowscaled - get_paca()->startspurr;
get_paca()->startspurr = nowscaled;
- stolen = calculate_stolen_time(now);
+ *stolen = calculate_stolen_time(now);
delta = get_paca()->system_time;
get_paca()->system_time = 0;
@@ -329,39 +324,50 @@ void account_system_vtime(struct task_struct *tsk)
* the user ticks get saved up in paca->user_time_scaled to be
* used by account_process_tick.
*/
- sys_scaled = delta;
+ *sys_scaled = delta;
user_scaled = udelta;
if (deltascaled != delta + udelta) {
if (udelta) {
- sys_scaled = deltascaled * delta / (delta + udelta);
- user_scaled = deltascaled - sys_scaled;
+ *sys_scaled = deltascaled * delta / (delta + udelta);
+ user_scaled = deltascaled - *sys_scaled;
} else {
- sys_scaled = deltascaled;
+ *sys_scaled = deltascaled;
}
}
get_paca()->user_time_scaled += user_scaled;
- if (in_irq() || idle_task(smp_processor_id()) != tsk) {
- account_system_time(tsk, 0, delta, sys_scaled);
- if (stolen)
- account_steal_time(stolen);
- } else {
- account_idle_time(delta + stolen);
- }
- local_irq_restore(flags);
+ return delta;
+}
+
+void vtime_account_system(struct task_struct *tsk)
+{
+ u64 delta, sys_scaled, stolen;
+
+ delta = vtime_delta(tsk, &sys_scaled, &stolen);
+ account_system_time(tsk, 0, delta, sys_scaled);
+ if (stolen)
+ account_steal_time(stolen);
+}
+EXPORT_SYMBOL_GPL(vtime_account_system);
+
+void vtime_account_idle(struct task_struct *tsk)
+{
+ u64 delta, sys_scaled, stolen;
+
+ delta = vtime_delta(tsk, &sys_scaled, &stolen);
+ account_idle_time(delta + stolen);
}
-EXPORT_SYMBOL_GPL(account_system_vtime);
/*
- * Transfer the user and system times accumulated in the paca
- * by the exception entry and exit code to the generic process
- * user and system time records.
+ * Transfer the user time accumulated in the paca
+ * by the exception entry and exit code to the generic
+ * process user time records.
* Must be called with interrupts disabled.
- * Assumes that account_system_vtime() has been called recently
- * (i.e. since the last entry from usermode) so that
+ * Assumes that vtime_account_system/idle() has been called
+ * recently (i.e. since the last entry from usermode) so that
* get_paca()->user_time_scaled is up to date.
*/
-void account_process_tick(struct task_struct *tsk, int user_tick)
+void vtime_account_user(struct task_struct *tsk)
{
cputime_t utime, utimescaled;
@@ -373,7 +379,7 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
account_user_time(tsk, utime, utimescaled);
}
-#else /* ! CONFIG_VIRT_CPU_ACCOUNTING */
+#else /* ! CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#define calc_cputime_factors()
#endif
@@ -418,74 +424,6 @@ unsigned long profile_pc(struct pt_regs *regs)
EXPORT_SYMBOL(profile_pc);
#endif
-#ifdef CONFIG_PPC_ISERIES
-
-/*
- * This function recalibrates the timebase based on the 49-bit time-of-day
- * value in the Titan chip. The Titan is much more accurate than the value
- * returned by the service processor for the timebase frequency.
- */
-
-static int __init iSeries_tb_recal(void)
-{
- unsigned long titan, tb;
-
- /* Make sure we only run on iSeries */
- if (!firmware_has_feature(FW_FEATURE_ISERIES))
- return -ENODEV;
-
- tb = get_tb();
- titan = HvCallXm_loadTod();
- if ( iSeries_recal_titan ) {
- unsigned long tb_ticks = tb - iSeries_recal_tb;
- unsigned long titan_usec = (titan - iSeries_recal_titan) >> 12;
- unsigned long new_tb_ticks_per_sec = (tb_ticks * USEC_PER_SEC)/titan_usec;
- unsigned long new_tb_ticks_per_jiffy =
- DIV_ROUND_CLOSEST(new_tb_ticks_per_sec, HZ);
- long tick_diff = new_tb_ticks_per_jiffy - tb_ticks_per_jiffy;
- char sign = '+';
- /* make sure tb_ticks_per_sec and tb_ticks_per_jiffy are consistent */
- new_tb_ticks_per_sec = new_tb_ticks_per_jiffy * HZ;
-
- if ( tick_diff < 0 ) {
- tick_diff = -tick_diff;
- sign = '-';
- }
- if ( tick_diff ) {
- if ( tick_diff < tb_ticks_per_jiffy/25 ) {
- printk( "Titan recalibrate: new tb_ticks_per_jiffy = %lu (%c%ld)\n",
- new_tb_ticks_per_jiffy, sign, tick_diff );
- tb_ticks_per_jiffy = new_tb_ticks_per_jiffy;
- tb_ticks_per_sec = new_tb_ticks_per_sec;
- calc_cputime_factors();
- vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
- setup_cputime_one_jiffy();
- }
- else {
- printk( "Titan recalibrate: FAILED (difference > 4 percent)\n"
- " new tb_ticks_per_jiffy = %lu\n"
- " old tb_ticks_per_jiffy = %lu\n",
- new_tb_ticks_per_jiffy, tb_ticks_per_jiffy );
- }
- }
- }
- iSeries_recal_titan = titan;
- iSeries_recal_tb = tb;
-
- /* Called here as now we know accurate values for the timebase */
- clocksource_init();
- return 0;
-}
-late_initcall(iSeries_tb_recal);
-
-/* Called from platform early init */
-void __init iSeries_time_init_early(void)
-{
- iSeries_recal_tb = get_tb();
- iSeries_recal_titan = HvCallXm_loadTod();
-}
-#endif /* CONFIG_PPC_ISERIES */
-
#ifdef CONFIG_IRQ_WORK
/*
@@ -526,7 +464,7 @@ DEFINE_PER_CPU(u8, irq_work_pending);
#endif /* 32 vs 64 bit */
-void set_irq_work_pending(void)
+void arch_irq_work_raise(void)
{
preempt_disable();
set_irq_work_pending_flag();
@@ -541,69 +479,36 @@ void set_irq_work_pending(void)
#endif /* CONFIG_IRQ_WORK */
-/*
- * For iSeries shared processors, we have to let the hypervisor
- * set the hardware decrementer. We set a virtual decrementer
- * in the lppaca and call the hypervisor if the virtual
- * decrementer is less than the current value in the hardware
- * decrementer. (almost always the new decrementer value will
- * be greater than the current hardware decementer so the hypervisor
- * call will not be needed)
- */
-
-/*
- * timer_interrupt - gets called when the decrementer overflows,
- * with interrupts disabled.
- */
-void timer_interrupt(struct pt_regs * regs)
+void __timer_interrupt(void)
{
- struct pt_regs *old_regs;
- struct decrementer_clock *decrementer = &__get_cpu_var(decrementers);
- struct clock_event_device *evt = &decrementer->event;
+ struct pt_regs *regs = get_irq_regs();
+ u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
+ struct clock_event_device *evt = &__get_cpu_var(decrementers);
u64 now;
trace_timer_interrupt_entry(regs);
- __get_cpu_var(irq_stat).timer_irqs++;
-
- /* Ensure a positive value is written to the decrementer, or else
- * some CPUs will continuue to take decrementer exceptions */
- set_dec(DECREMENTER_MAX);
-
-#if defined(CONFIG_PPC32) && defined(CONFIG_PMAC)
- if (atomic_read(&ppc_n_lost_interrupts) != 0)
- do_IRQ(regs);
-#endif
-
- old_regs = set_irq_regs(regs);
- irq_enter();
-
if (test_irq_work_pending()) {
clear_irq_work_pending();
irq_work_run();
}
-#ifdef CONFIG_PPC_ISERIES
- if (firmware_has_feature(FW_FEATURE_ISERIES))
- get_lppaca()->int_dword.fields.decr_int = 0;
-#endif
-
now = get_tb_or_rtc();
- if (now >= decrementer->next_tb) {
- decrementer->next_tb = ~(u64)0;
+ if (now >= *next_tb) {
+ *next_tb = ~(u64)0;
if (evt->event_handler)
evt->event_handler(evt);
+ __get_cpu_var(irq_stat).timer_irqs_event++;
} else {
- now = decrementer->next_tb - now;
+ now = *next_tb - now;
if (now <= DECREMENTER_MAX)
set_dec((int)now);
+ /* We may have raced with new irq work */
+ if (test_irq_work_pending())
+ set_dec(1);
+ __get_cpu_var(irq_stat).timer_irqs_others++;
}
-#ifdef CONFIG_PPC_ISERIES
- if (firmware_has_feature(FW_FEATURE_ISERIES) && hvlpevent_is_pending())
- process_hvlpevents();
-#endif
-
#ifdef CONFIG_PPC64
/* collect purr register values often, for accurate calculations */
if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
@@ -612,10 +517,60 @@ void timer_interrupt(struct pt_regs * regs)
}
#endif
+ trace_timer_interrupt_exit(regs);
+}
+
+/*
+ * timer_interrupt - gets called when the decrementer overflows,
+ * with interrupts disabled.
+ */
+void timer_interrupt(struct pt_regs * regs)
+{
+ struct pt_regs *old_regs;
+ u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
+
+ /* Ensure a positive value is written to the decrementer, or else
+ * some CPUs will continue to take decrementer exceptions.
+ */
+ set_dec(DECREMENTER_MAX);
+
+ /* Some implementations of hotplug will get timer interrupts while
+ * offline, just ignore these and we also need to set
+ * decrementers_next_tb as MAX to make sure __check_irq_replay
+ * don't replay timer interrupt when return, otherwise we'll trap
+ * here infinitely :(
+ */
+ if (!cpu_online(smp_processor_id())) {
+ *next_tb = ~(u64)0;
+ return;
+ }
+
+ /* Conditionally hard-enable interrupts now that the DEC has been
+ * bumped to its maximum value
+ */
+ may_hard_irq_enable();
+
+
+#if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC)
+ if (atomic_read(&ppc_n_lost_interrupts) != 0)
+ do_IRQ(regs);
+#endif
+
+ old_regs = set_irq_regs(regs);
+ irq_enter();
+
+ __timer_interrupt();
irq_exit();
set_irq_regs(old_regs);
+}
- trace_timer_interrupt_exit(regs);
+/*
+ * Hypervisor decrementer interrupts shouldn't occur but are sometimes
+ * left pending on exit from a KVM guest. We don't need to do anything
+ * to clear them, as they are edge-triggered.
+ */
+void hdec_interrupt(struct pt_regs *regs)
+{
}
#ifdef CONFIG_SUSPEND
@@ -625,9 +580,9 @@ static void generic_suspend_disable_irqs(void)
* with suspending.
*/
- set_dec(0x7fffffff);
+ set_dec(DECREMENTER_MAX);
local_irq_disable();
- set_dec(0x7fffffff);
+ set_dec(DECREMENTER_MAX);
}
static void generic_suspend_enable_irqs(void)
@@ -669,7 +624,7 @@ unsigned long long sched_clock(void)
static int __init get_freq(char *name, int cells, unsigned long *val)
{
struct device_node *cpu;
- const unsigned int *fp;
+ const __be32 *fp;
int found = 0;
/* The cpu node should have timebase and clock frequency properties */
@@ -688,7 +643,6 @@ static int __init get_freq(char *name, int cells, unsigned long *val)
return found;
}
-/* should become __cpuinit when secondary_cpu_time_init also is */
void start_cpu_decrementer(void)
{
#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
@@ -726,7 +680,7 @@ int update_persistent_clock(struct timespec now)
struct rtc_time tm;
if (!ppc_md.set_rtc_time)
- return 0;
+ return -ENODEV;
to_tm(now.tv_sec + 1 + timezone_offset, &tm);
tm.tm_year -= 1900;
@@ -786,7 +740,7 @@ static cycle_t timebase_read(struct clocksource *cs)
return (cycle_t)get_tb();
}
-void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
+void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm,
struct clocksource *clock, u32 mult)
{
u64 new_tb_to_xs, new_stamp_xsec;
@@ -799,9 +753,8 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
++vdso_data->tb_update_count;
smp_mb();
- /* XXX this assumes clock->shift == 22 */
- /* 4611686018 ~= 2^(20+64-22) / 1e9 */
- new_tb_to_xs = (u64) mult * 4611686018ULL;
+ /* 19342813113834067 ~= 2^(20+64) / 1e9 */
+ new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift);
new_stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC;
do_div(new_stamp_xsec, 1000000000);
new_stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC;
@@ -834,13 +787,8 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
void update_vsyscall_tz(void)
{
- /* Make userspace gettimeofday spin until we're done. */
- ++vdso_data->tb_update_count;
- smp_mb();
vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
vdso_data->tz_dsttime = sys_tz.tz_dsttime;
- smp_mb();
- ++vdso_data->tb_update_count;
}
static void __init clocksource_init(void)
@@ -852,9 +800,7 @@ static void __init clocksource_init(void)
else
clock = &clocksource_timebase;
- clock->mult = clocksource_hz2mult(tb_ticks_per_sec, clock->shift);
-
- if (clocksource_register(clock)) {
+ if (clocksource_register_hz(clock, tb_ticks_per_sec)) {
printk(KERN_ERR "clocksource: %s is already registered\n",
clock->name);
return;
@@ -867,8 +813,13 @@ static void __init clocksource_init(void)
static int decrementer_set_next_event(unsigned long evt,
struct clock_event_device *dev)
{
- __get_cpu_var(decrementers).next_tb = get_tb_or_rtc() + evt;
+ __get_cpu_var(decrementers_next_tb) = get_tb_or_rtc() + evt;
set_dec(evt);
+
+ /* We may have raced with new irq work */
+ if (test_irq_work_pending())
+ set_dec(1);
+
return 0;
}
@@ -879,34 +830,18 @@ static void decrementer_set_mode(enum clock_event_mode mode,
decrementer_set_next_event(DECREMENTER_MAX, dev);
}
-static inline uint64_t div_sc64(unsigned long ticks, unsigned long nsec,
- int shift)
-{
- uint64_t tmp = ((uint64_t)ticks) << shift;
-
- do_div(tmp, nsec);
- return tmp;
-}
-
-static void __init setup_clockevent_multiplier(unsigned long hz)
+/* Interrupt handler for the timer broadcast IPI */
+void tick_broadcast_ipi_handler(void)
{
- u64 mult, shift = 32;
+ u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
- while (1) {
- mult = div_sc64(hz, NSEC_PER_SEC, shift);
- if (mult && (mult >> 32UL) == 0UL)
- break;
-
- shift--;
- }
-
- decrementer_clockevent.shift = shift;
- decrementer_clockevent.mult = mult;
+ *next_tb = get_tb_or_rtc();
+ __timer_interrupt();
}
static void register_decrementer_clockevent(int cpu)
{
- struct clock_event_device *dec = &per_cpu(decrementers, cpu).event;
+ struct clock_event_device *dec = &per_cpu(decrementers, cpu);
*dec = decrementer_clockevent;
dec->cpumask = cpumask_of(cpu);
@@ -921,7 +856,8 @@ static void __init init_decrementer_clockevent(void)
{
int cpu = smp_processor_id();
- setup_clockevent_multiplier(ppc_tb_freq);
+ clockevents_calc_mult_shift(&decrementer_clockevent, ppc_tb_freq, 4);
+
decrementer_clockevent.max_delta_ns =
clockevent_delta2ns(DECREMENTER_MAX, &decrementer_clockevent);
decrementer_clockevent.min_delta_ns =
@@ -989,10 +925,10 @@ void __init time_init(void)
boot_tb = get_tb_or_rtc();
/* If platform provided a timezone (pmac), we correct the time */
- if (timezone_offset) {
+ if (timezone_offset) {
sys_tz.tz_minuteswest = -timezone_offset / 60;
sys_tz.tz_dsttime = 0;
- }
+ }
vdso_data->tb_update_count = 0;
vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
@@ -1002,11 +938,11 @@ void __init time_init(void)
*/
start_cpu_decrementer();
- /* Register the clocksource, if we're not running on iSeries */
- if (!firmware_has_feature(FW_FEATURE_ISERIES))
- clocksource_init();
+ /* Register the clocksource */
+ clocksource_init();
init_decrementer_clockevent();
+ tick_setup_hrtimer_broadcast();
}
@@ -1139,10 +1075,8 @@ static int __init rtc_init(void)
return -ENODEV;
pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0);
- if (IS_ERR(pdev))
- return PTR_ERR(pdev);
- return 0;
+ return PTR_ERR_OR_ZERO(pdev);
}
module_init(rtc_init);
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
new file mode 100644
index 00000000000..2a324f4cb1b
--- /dev/null
+++ b/arch/powerpc/kernel/tm.S
@@ -0,0 +1,481 @@
+/*
+ * Transactional memory support routines to reclaim and recheckpoint
+ * transactional process state.
+ *
+ * Copyright 2012 Matt Evans & Michael Neuling, IBM Corporation.
+ */
+
+#include <asm/asm-offsets.h>
+#include <asm/ppc_asm.h>
+#include <asm/ppc-opcode.h>
+#include <asm/ptrace.h>
+#include <asm/reg.h>
+#include <asm/bug.h>
+
+#ifdef CONFIG_VSX
+/* See fpu.S, this is borrowed from there */
+#define __SAVE_32FPRS_VSRS(n,c,base) \
+BEGIN_FTR_SECTION \
+ b 2f; \
+END_FTR_SECTION_IFSET(CPU_FTR_VSX); \
+ SAVE_32FPRS(n,base); \
+ b 3f; \
+2: SAVE_32VSRS(n,c,base); \
+3:
+#define __REST_32FPRS_VSRS(n,c,base) \
+BEGIN_FTR_SECTION \
+ b 2f; \
+END_FTR_SECTION_IFSET(CPU_FTR_VSX); \
+ REST_32FPRS(n,base); \
+ b 3f; \
+2: REST_32VSRS(n,c,base); \
+3:
+#else
+#define __SAVE_32FPRS_VSRS(n,c,base) SAVE_32FPRS(n, base)
+#define __REST_32FPRS_VSRS(n,c,base) REST_32FPRS(n, base)
+#endif
+#define SAVE_32FPRS_VSRS(n,c,base) \
+ __SAVE_32FPRS_VSRS(n,__REG_##c,__REG_##base)
+#define REST_32FPRS_VSRS(n,c,base) \
+ __REST_32FPRS_VSRS(n,__REG_##c,__REG_##base)
+
+/* Stack frame offsets for local variables. */
+#define TM_FRAME_L0 TM_FRAME_SIZE-16
+#define TM_FRAME_L1 TM_FRAME_SIZE-8
+
+
+/* In order to access the TM SPRs, TM must be enabled. So, do so: */
+_GLOBAL(tm_enable)
+ mfmsr r4
+ li r3, MSR_TM >> 32
+ sldi r3, r3, 32
+ and. r0, r4, r3
+ bne 1f
+ or r4, r4, r3
+ mtmsrd r4
+1: blr
+
+_GLOBAL(tm_save_sprs)
+ mfspr r0, SPRN_TFHAR
+ std r0, THREAD_TM_TFHAR(r3)
+ mfspr r0, SPRN_TEXASR
+ std r0, THREAD_TM_TEXASR(r3)
+ mfspr r0, SPRN_TFIAR
+ std r0, THREAD_TM_TFIAR(r3)
+ blr
+
+_GLOBAL(tm_restore_sprs)
+ ld r0, THREAD_TM_TFHAR(r3)
+ mtspr SPRN_TFHAR, r0
+ ld r0, THREAD_TM_TEXASR(r3)
+ mtspr SPRN_TEXASR, r0
+ ld r0, THREAD_TM_TFIAR(r3)
+ mtspr SPRN_TFIAR, r0
+ blr
+
+ /* Passed an 8-bit failure cause as first argument. */
+_GLOBAL(tm_abort)
+ TABORT(R3)
+ blr
+
+/* void tm_reclaim(struct thread_struct *thread,
+ * unsigned long orig_msr,
+ * uint8_t cause)
+ *
+ * - Performs a full reclaim. This destroys outstanding
+ * transactions and updates thread->regs.tm_ckpt_* with the
+ * original checkpointed state. Note that thread->regs is
+ * unchanged.
+ * - FP regs are written back to thread->transact_fpr before
+ * reclaiming. These are the transactional (current) versions.
+ *
+ * Purpose is to both abort transactions of, and preserve the state of,
+ * a transactions at a context switch. We preserve/restore both sets of process
+ * state to restore them when the thread's scheduled again. We continue in
+ * userland as though nothing happened, but when the transaction is resumed
+ * they will abort back to the checkpointed state we save out here.
+ *
+ * Call with IRQs off, stacks get all out of sync for some periods in here!
+ */
+_GLOBAL(tm_reclaim)
+ mfcr r6
+ mflr r0
+ stw r6, 8(r1)
+ std r0, 16(r1)
+ std r2, STK_GOT(r1)
+ stdu r1, -TM_FRAME_SIZE(r1)
+
+ /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. */
+
+ std r3, STK_PARAM(R3)(r1)
+ SAVE_NVGPRS(r1)
+
+ /* We need to setup MSR for VSX register save instructions. Here we
+ * also clear the MSR RI since when we do the treclaim, we won't have a
+ * valid kernel pointer for a while. We clear RI here as it avoids
+ * adding another mtmsr closer to the treclaim. This makes the region
+ * maked as non-recoverable wider than it needs to be but it saves on
+ * inserting another mtmsrd later.
+ */
+ mfmsr r14
+ mr r15, r14
+ ori r15, r15, MSR_FP
+ li r16, MSR_RI
+ ori r16, r16, MSR_EE /* IRQs hard off */
+ andc r15, r15, r16
+ oris r15, r15, MSR_VEC@h
+#ifdef CONFIG_VSX
+ BEGIN_FTR_SECTION
+ oris r15,r15, MSR_VSX@h
+ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+ mtmsrd r15
+ std r14, TM_FRAME_L0(r1)
+
+ /* Stash the stack pointer away for use after reclaim */
+ std r1, PACAR1(r13)
+
+ /* ******************** FPR/VR/VSRs ************
+ * Before reclaiming, capture the current/transactional FPR/VR
+ * versions /if used/.
+ *
+ * (If VSX used, FP and VMX are implied. Or, we don't need to look
+ * at MSR.VSX as copying FP regs if .FP, vector regs if .VMX covers it.)
+ *
+ * We're passed the thread's MSR as parameter 2.
+ *
+ * We enabled VEC/FP/VSX in the msr above, so we can execute these
+ * instructions!
+ */
+ andis. r0, r4, MSR_VEC@h
+ beq dont_backup_vec
+
+ addi r7, r3, THREAD_TRANSACT_VRSTATE
+ SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 transact vr state */
+ mfvscr vr0
+ li r6, VRSTATE_VSCR
+ stvx vr0, r7, r6
+dont_backup_vec:
+ mfspr r0, SPRN_VRSAVE
+ std r0, THREAD_TRANSACT_VRSAVE(r3)
+
+ andi. r0, r4, MSR_FP
+ beq dont_backup_fp
+
+ addi r7, r3, THREAD_TRANSACT_FPSTATE
+ SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 transact fp state */
+
+ mffs fr0
+ stfd fr0,FPSTATE_FPSCR(r7)
+
+dont_backup_fp:
+ /* Do sanity check on MSR to make sure we are suspended */
+ li r7, (MSR_TS_S)@higher
+ srdi r6, r14, 32
+ and r6, r6, r7
+1: tdeqi r6, 0
+ EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0
+
+ /* The moment we treclaim, ALL of our GPRs will switch
+ * to user register state. (FPRs, CCR etc. also!)
+ * Use an sprg and a tm_scratch in the PACA to shuffle.
+ */
+ TRECLAIM(R5) /* Cause in r5 */
+
+ /* ******************** GPRs ******************** */
+ /* Stash the checkpointed r13 away in the scratch SPR and get the real
+ * paca
+ */
+ SET_SCRATCH0(r13)
+ GET_PACA(r13)
+
+ /* Stash the checkpointed r1 away in paca tm_scratch and get the real
+ * stack pointer back
+ */
+ std r1, PACATMSCRATCH(r13)
+ ld r1, PACAR1(r13)
+
+ /* Store the PPR in r11 and reset to decent value */
+ std r11, GPR11(r1) /* Temporary stash */
+ mfspr r11, SPRN_PPR
+ HMT_MEDIUM
+
+ /* Now get some more GPRS free */
+ std r7, GPR7(r1) /* Temporary stash */
+ std r12, GPR12(r1) /* '' '' '' */
+ ld r12, STK_PARAM(R3)(r1) /* Param 0, thread_struct * */
+
+ std r11, THREAD_TM_PPR(r12) /* Store PPR and free r11 */
+
+ addi r7, r12, PT_CKPT_REGS /* Thread's ckpt_regs */
+
+ /* Make r7 look like an exception frame so that we
+ * can use the neat GPRx(n) macros. r7 is NOT a pt_regs ptr!
+ */
+ subi r7, r7, STACK_FRAME_OVERHEAD
+
+ /* Sync the userland GPRs 2-12, 14-31 to thread->regs: */
+ SAVE_GPR(0, r7) /* user r0 */
+ SAVE_GPR(2, r7) /* user r2 */
+ SAVE_4GPRS(3, r7) /* user r3-r6 */
+ SAVE_GPR(8, r7) /* user r8 */
+ SAVE_GPR(9, r7) /* user r9 */
+ SAVE_GPR(10, r7) /* user r10 */
+ ld r3, PACATMSCRATCH(r13) /* user r1 */
+ ld r4, GPR7(r1) /* user r7 */
+ ld r5, GPR11(r1) /* user r11 */
+ ld r6, GPR12(r1) /* user r12 */
+ GET_SCRATCH0(8) /* user r13 */
+ std r3, GPR1(r7)
+ std r4, GPR7(r7)
+ std r5, GPR11(r7)
+ std r6, GPR12(r7)
+ std r8, GPR13(r7)
+
+ SAVE_NVGPRS(r7) /* user r14-r31 */
+
+ /* ******************** NIP ******************** */
+ mfspr r3, SPRN_TFHAR
+ std r3, _NIP(r7) /* Returns to failhandler */
+ /* The checkpointed NIP is ignored when rescheduling/rechkpting,
+ * but is used in signal return to 'wind back' to the abort handler.
+ */
+
+ /* ******************** CR,LR,CCR,MSR ********** */
+ mfctr r3
+ mflr r4
+ mfcr r5
+ mfxer r6
+
+ std r3, _CTR(r7)
+ std r4, _LINK(r7)
+ std r5, _CCR(r7)
+ std r6, _XER(r7)
+
+
+ /* ******************** TAR, DSCR ********** */
+ mfspr r3, SPRN_TAR
+ mfspr r4, SPRN_DSCR
+
+ std r3, THREAD_TM_TAR(r12)
+ std r4, THREAD_TM_DSCR(r12)
+
+ /* MSR and flags: We don't change CRs, and we don't need to alter
+ * MSR.
+ */
+
+ /* TM regs, incl TEXASR -- these live in thread_struct. Note they've
+ * been updated by the treclaim, to explain to userland the failure
+ * cause (aborted).
+ */
+ mfspr r0, SPRN_TEXASR
+ mfspr r3, SPRN_TFHAR
+ mfspr r4, SPRN_TFIAR
+ std r0, THREAD_TM_TEXASR(r12)
+ std r3, THREAD_TM_TFHAR(r12)
+ std r4, THREAD_TM_TFIAR(r12)
+
+ /* AMR is checkpointed too, but is unsupported by Linux. */
+
+ /* Restore original MSR/IRQ state & clear TM mode */
+ ld r14, TM_FRAME_L0(r1) /* Orig MSR */
+ li r15, 0
+ rldimi r14, r15, MSR_TS_LG, (63-MSR_TS_LG)-1
+ mtmsrd r14
+
+ REST_NVGPRS(r1)
+
+ addi r1, r1, TM_FRAME_SIZE
+ lwz r4, 8(r1)
+ ld r0, 16(r1)
+ mtcr r4
+ mtlr r0
+ ld r2, STK_GOT(r1)
+
+ /* Load CPU's default DSCR */
+ ld r0, PACA_DSCR(r13)
+ mtspr SPRN_DSCR, r0
+
+ blr
+
+
+ /* void tm_recheckpoint(struct thread_struct *thread,
+ * unsigned long orig_msr)
+ * - Restore the checkpointed register state saved by tm_reclaim
+ * when we switch_to a process.
+ *
+ * Call with IRQs off, stacks get all out of sync for
+ * some periods in here!
+ */
+_GLOBAL(__tm_recheckpoint)
+ mfcr r5
+ mflr r0
+ stw r5, 8(r1)
+ std r0, 16(r1)
+ std r2, STK_GOT(r1)
+ stdu r1, -TM_FRAME_SIZE(r1)
+
+ /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD].
+ * This is used for backing up the NVGPRs:
+ */
+ SAVE_NVGPRS(r1)
+
+ /* Load complete register state from ts_ckpt* registers */
+
+ addi r7, r3, PT_CKPT_REGS /* Thread's ckpt_regs */
+
+ /* Make r7 look like an exception frame so that we
+ * can use the neat GPRx(n) macros. r7 is now NOT a pt_regs ptr!
+ */
+ subi r7, r7, STACK_FRAME_OVERHEAD
+
+ SET_SCRATCH0(r1)
+
+ mfmsr r6
+ /* R4 = original MSR to indicate whether thread used FP/Vector etc. */
+
+ /* Enable FP/vec in MSR if necessary! */
+ lis r5, MSR_VEC@h
+ ori r5, r5, MSR_FP
+ and. r5, r4, r5
+ beq restore_gprs /* if neither, skip both */
+
+#ifdef CONFIG_VSX
+ BEGIN_FTR_SECTION
+ oris r5, r5, MSR_VSX@h
+ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+ or r5, r6, r5 /* Set MSR.FP+.VSX/.VEC */
+ mtmsr r5
+
+#ifdef CONFIG_ALTIVEC
+ /* FP and VEC registers: These are recheckpointed from thread.fpr[]
+ * and thread.vr[] respectively. The thread.transact_fpr[] version
+ * is more modern, and will be loaded subsequently by any FPUnavailable
+ * trap.
+ */
+ andis. r0, r4, MSR_VEC@h
+ beq dont_restore_vec
+
+ addi r8, r3, THREAD_VRSTATE
+ li r5, VRSTATE_VSCR
+ lvx vr0, r8, r5
+ mtvscr vr0
+ REST_32VRS(0, r5, r8) /* r5 scratch, r8 ptr */
+dont_restore_vec:
+ ld r5, THREAD_VRSAVE(r3)
+ mtspr SPRN_VRSAVE, r5
+#endif
+
+ andi. r0, r4, MSR_FP
+ beq dont_restore_fp
+
+ addi r8, r3, THREAD_FPSTATE
+ lfd fr0, FPSTATE_FPSCR(r8)
+ MTFSF_L(fr0)
+ REST_32FPRS_VSRS(0, R4, R8)
+
+dont_restore_fp:
+ mtmsr r6 /* FP/Vec off again! */
+
+restore_gprs:
+
+ /* ******************** CR,LR,CCR,MSR ********** */
+ ld r4, _CTR(r7)
+ ld r5, _LINK(r7)
+ ld r8, _XER(r7)
+
+ mtctr r4
+ mtlr r5
+ mtxer r8
+
+ /* ******************** TAR ******************** */
+ ld r4, THREAD_TM_TAR(r3)
+ mtspr SPRN_TAR, r4
+
+ /* Load up the PPR and DSCR in GPRs only at this stage */
+ ld r5, THREAD_TM_DSCR(r3)
+ ld r6, THREAD_TM_PPR(r3)
+
+ /* Clear the MSR RI since we are about to change R1. EE is already off
+ */
+ li r4, 0
+ mtmsrd r4, 1
+
+ REST_GPR(0, r7) /* GPR0 */
+ REST_2GPRS(2, r7) /* GPR2-3 */
+ REST_GPR(4, r7) /* GPR4 */
+ REST_4GPRS(8, r7) /* GPR8-11 */
+ REST_2GPRS(12, r7) /* GPR12-13 */
+
+ REST_NVGPRS(r7) /* GPR14-31 */
+
+ /* Load up PPR and DSCR here so we don't run with user values for long
+ */
+ mtspr SPRN_DSCR, r5
+ mtspr SPRN_PPR, r6
+
+ /* Do final sanity check on TEXASR to make sure FS is set. Do this
+ * here before we load up the userspace r1 so any bugs we hit will get
+ * a call chain */
+ mfspr r5, SPRN_TEXASR
+ srdi r5, r5, 16
+ li r6, (TEXASR_FS)@h
+ and r6, r6, r5
+1: tdeqi r6, 0
+ EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0
+
+ /* Do final sanity check on MSR to make sure we are not transactional
+ * or suspended
+ */
+ mfmsr r6
+ li r5, (MSR_TS_MASK)@higher
+ srdi r6, r6, 32
+ and r6, r6, r5
+1: tdnei r6, 0
+ EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0
+
+ /* Restore CR */
+ ld r6, _CCR(r7)
+ mtcr r6
+
+ REST_GPR(1, r7) /* GPR1 */
+ REST_GPR(5, r7) /* GPR5-7 */
+ REST_GPR(6, r7)
+ ld r7, GPR7(r7)
+
+ /* Commit register state as checkpointed state: */
+ TRECHKPT
+
+ HMT_MEDIUM
+
+ /* Our transactional state has now changed.
+ *
+ * Now just get out of here. Transactional (current) state will be
+ * updated once restore is called on the return path in the _switch-ed
+ * -to process.
+ */
+
+ GET_PACA(r13)
+ GET_SCRATCH0(r1)
+
+ /* R1 is restored, so we are recoverable again. EE is still off */
+ li r4, MSR_RI
+ mtmsrd r4, 1
+
+ REST_NVGPRS(r1)
+
+ addi r1, r1, TM_FRAME_SIZE
+ lwz r4, 8(r1)
+ ld r0, 16(r1)
+ mtcr r4
+ mtlr r0
+ ld r2, STK_GOT(r1)
+
+ /* Load CPU's default DSCR */
+ ld r0, PACA_DSCR(r13)
+ mtspr SPRN_DSCR, r0
+
+ blr
+
+ /* ****************************************************************** */
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 1b2cdc8eec9..239f1cde3ff 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -34,27 +34,33 @@
#include <linux/bug.h>
#include <linux/kdebug.h>
#include <linux/debugfs.h>
+#include <linux/ratelimit.h>
+#include <linux/context_tracking.h>
#include <asm/emulated_ops.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <asm/io.h>
#include <asm/machdep.h>
#include <asm/rtas.h>
#include <asm/pmc.h>
-#ifdef CONFIG_PPC32
#include <asm/reg.h>
-#endif
#ifdef CONFIG_PMAC_BACKLIGHT
#include <asm/backlight.h>
#endif
#ifdef CONFIG_PPC64
#include <asm/firmware.h>
#include <asm/processor.h>
+#include <asm/tm.h>
#endif
#include <asm/kexec.h>
#include <asm/ppc-opcode.h>
+#include <asm/rio.h>
+#include <asm/fadump.h>
+#include <asm/switch_to.h>
+#include <asm/tm.h>
+#include <asm/debug.h>
+#include <sysdev/fsl_pci.h>
#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
int (*__debugger)(struct pt_regs *regs) __read_mostly;
@@ -62,7 +68,7 @@ int (*__debugger_ipi)(struct pt_regs *regs) __read_mostly;
int (*__debugger_bpt)(struct pt_regs *regs) __read_mostly;
int (*__debugger_sstep)(struct pt_regs *regs) __read_mostly;
int (*__debugger_iabr_match)(struct pt_regs *regs) __read_mostly;
-int (*__debugger_dabr_match)(struct pt_regs *regs) __read_mostly;
+int (*__debugger_break_match)(struct pt_regs *regs) __read_mostly;
int (*__debugger_fault_handler)(struct pt_regs *regs) __read_mostly;
EXPORT_SYMBOL(__debugger);
@@ -70,10 +76,17 @@ EXPORT_SYMBOL(__debugger_ipi);
EXPORT_SYMBOL(__debugger_bpt);
EXPORT_SYMBOL(__debugger_sstep);
EXPORT_SYMBOL(__debugger_iabr_match);
-EXPORT_SYMBOL(__debugger_dabr_match);
+EXPORT_SYMBOL(__debugger_break_match);
EXPORT_SYMBOL(__debugger_fault_handler);
#endif
+/* Transactional Memory trap debug */
+#ifdef TM_DEBUG_SW
+#define TM_DEBUG(x...) printk(KERN_INFO x)
+#else
+#define TM_DEBUG(x...) do { } while(0)
+#endif
+
/*
* Trap & Exception support
*/
@@ -96,18 +109,14 @@ static void pmac_backlight_unblank(void)
static inline void pmac_backlight_unblank(void) { }
#endif
-int die(const char *str, struct pt_regs *regs, long err)
+static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+static int die_counter;
+
+static unsigned __kprobes long oops_begin(struct pt_regs *regs)
{
- static struct {
- raw_spinlock_t lock;
- u32 lock_owner;
- int lock_owner_depth;
- } die = {
- .lock = __RAW_SPIN_LOCK_UNLOCKED(die.lock),
- .lock_owner = -1,
- .lock_owner_depth = 0
- };
- static int die_counter;
+ int cpu;
unsigned long flags;
if (debugger(regs))
@@ -115,67 +124,111 @@ int die(const char *str, struct pt_regs *regs, long err)
oops_enter();
- if (die.lock_owner != raw_smp_processor_id()) {
- console_verbose();
- raw_spin_lock_irqsave(&die.lock, flags);
- die.lock_owner = smp_processor_id();
- die.lock_owner_depth = 0;
- bust_spinlocks(1);
- if (machine_is(powermac))
- pmac_backlight_unblank();
- } else {
- local_save_flags(flags);
+ /* racy, but better than risking deadlock. */
+ raw_local_irq_save(flags);
+ cpu = smp_processor_id();
+ if (!arch_spin_trylock(&die_lock)) {
+ if (cpu == die_owner)
+ /* nested oops. should stop eventually */;
+ else
+ arch_spin_lock(&die_lock);
}
+ die_nest_count++;
+ die_owner = cpu;
+ console_verbose();
+ bust_spinlocks(1);
+ if (machine_is(powermac))
+ pmac_backlight_unblank();
+ return flags;
+}
- if (++die.lock_owner_depth < 3) {
- printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
-#ifdef CONFIG_PREEMPT
- printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
- printk("SMP NR_CPUS=%d ", NR_CPUS);
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC ");
-#endif
-#ifdef CONFIG_NUMA
- printk("NUMA ");
-#endif
- printk("%s\n", ppc_md.name ? ppc_md.name : "");
+static void __kprobes oops_end(unsigned long flags, struct pt_regs *regs,
+ int signr)
+{
+ bust_spinlocks(0);
+ die_owner = -1;
+ add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
+ die_nest_count--;
+ oops_exit();
+ printk("\n");
+ if (!die_nest_count)
+ /* Nest count reaches zero, release the lock. */
+ arch_spin_unlock(&die_lock);
+ raw_local_irq_restore(flags);
- sysfs_printk_last_file();
- if (notify_die(DIE_OOPS, str, regs, err, 255,
- SIGSEGV) == NOTIFY_STOP)
- return 1;
+ crash_fadump(regs, "die oops");
- print_modules();
- show_regs(regs);
- } else {
- printk("Recursive die() failure, output suppressed\n");
+ /*
+ * A system reset (0x100) is a request to dump, so we always send
+ * it through the crashdump code.
+ */
+ if (kexec_should_crash(current) || (TRAP(regs) == 0x100)) {
+ crash_kexec(regs);
+
+ /*
+ * We aren't the primary crash CPU. We need to send it
+ * to a holding pattern to avoid it ending up in the panic
+ * code.
+ */
+ crash_kexec_secondary(regs);
}
- bust_spinlocks(0);
- die.lock_owner = -1;
- add_taint(TAINT_DIE);
- raw_spin_unlock_irqrestore(&die.lock, flags);
+ if (!signr)
+ return;
- if (kexec_should_crash(current) ||
- kexec_sr_activated(smp_processor_id()))
- crash_kexec(regs);
- crash_kexec_secondary(regs);
+ /*
+ * While our oops output is serialised by a spinlock, output
+ * from panic() called below can race and corrupt it. If we
+ * know we are going to panic, delay for 1 second so we have a
+ * chance to get clean backtraces from all CPUs that are oopsing.
+ */
+ if (in_interrupt() || panic_on_oops || !current->pid ||
+ is_global_init(current)) {
+ mdelay(MSEC_PER_SEC);
+ }
if (in_interrupt())
panic("Fatal exception in interrupt");
-
if (panic_on_oops)
panic("Fatal exception");
+ do_exit(signr);
+}
- oops_exit();
- do_exit(err);
+static int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+{
+ printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
+#ifdef CONFIG_PREEMPT
+ printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+ printk("SMP NR_CPUS=%d ", NR_CPUS);
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk("DEBUG_PAGEALLOC ");
+#endif
+#ifdef CONFIG_NUMA
+ printk("NUMA ");
+#endif
+ printk("%s\n", ppc_md.name ? ppc_md.name : "");
+
+ if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP)
+ return 1;
+
+ print_modules();
+ show_regs(regs);
return 0;
}
+void die(const char *str, struct pt_regs *regs, long err)
+{
+ unsigned long flags = oops_begin(regs);
+
+ if (__die(str, regs, err))
+ err = 0;
+ oops_end(flags, regs, err);
+}
+
void user_single_step_siginfo(struct task_struct *tsk,
struct pt_regs *regs, siginfo_t *info)
{
@@ -194,16 +247,20 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
"at %016lx nip %016lx lr %016lx code %x\n";
if (!user_mode(regs)) {
- if (die("Exception in kernel mode", regs, signr))
- return;
- } else if (show_unhandled_signals &&
- unhandled_signal(current, signr) &&
- printk_ratelimit()) {
- printk(regs->msr & MSR_SF ? fmt64 : fmt32,
- current->comm, current->pid, signr,
- addr, regs->nip, regs->link, code);
- }
+ die("Exception in kernel mode", regs, signr);
+ return;
+ }
+
+ if (show_unhandled_signals && unhandled_signal(current, signr)) {
+ printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32,
+ current->comm, current->pid, signr,
+ addr, regs->nip, regs->link, code);
+ }
+ if (arch_irqs_disabled() && !arch_irq_disabled_regs(regs))
+ local_irq_enable();
+
+ current->thread.trap_nr = code;
memset(&info, 0, sizeof(info));
info.si_signo = signr;
info.si_code = code;
@@ -220,31 +277,31 @@ void system_reset_exception(struct pt_regs *regs)
return;
}
-#ifdef CONFIG_KEXEC
- cpu_set(smp_processor_id(), cpus_in_sr);
-#endif
-
die("System Reset", regs, SIGABRT);
- /*
- * Some CPUs when released from the debugger will execute this path.
- * These CPUs entered the debugger via a soft-reset. If the CPU was
- * hung before entering the debugger it will return to the hung
- * state when exiting this function. This causes a problem in
- * kdump since the hung CPU(s) will not respond to the IPI sent
- * from kdump. To prevent the problem we call crash_kexec_secondary()
- * here. If a kdump had not been initiated or we exit the debugger
- * with the "exit and recover" command (x) crash_kexec_secondary()
- * will return after 5ms and the CPU returns to its previous state.
- */
- crash_kexec_secondary(regs);
-
/* Must die if the interrupt is not recoverable */
if (!(regs->msr & MSR_RI))
panic("Unrecoverable System Reset");
/* What should we do here? We could issue a shutdown or hard reset. */
}
+
+/*
+ * This function is called in real mode. Strictly no printk's please.
+ *
+ * regs->nip and regs->msr contains srr0 and ssr1.
+ */
+long machine_check_early(struct pt_regs *regs)
+{
+ long handled = 0;
+
+ __get_cpu_var(irq_stat).mce_exceptions++;
+
+ if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
+ handled = cur_cpu_spec->machine_check_early(regs);
+ return handled;
+}
+
#endif
/*
@@ -311,14 +368,15 @@ static inline int check_io_access(struct pt_regs *regs)
#define REASON_TRAP ESR_PTR
/* single-step stuff */
-#define single_stepping(regs) (current->thread.dbcr0 & DBCR0_IC)
-#define clear_single_step(regs) (current->thread.dbcr0 &= ~DBCR0_IC)
+#define single_stepping(regs) (current->thread.debug.dbcr0 & DBCR0_IC)
+#define clear_single_step(regs) (current->thread.debug.dbcr0 &= ~DBCR0_IC)
#else
/* On non-4xx, the reason for the machine check or program
exception is in the MSR. */
#define get_reason(regs) ((regs)->msr)
#define get_mc_reason(regs) ((regs)->msr)
+#define REASON_TM 0x200000
#define REASON_FP 0x100000
#define REASON_ILLEGAL 0x80000
#define REASON_PRIVILEGED 0x40000
@@ -425,6 +483,12 @@ int machine_check_e500mc(struct pt_regs *regs)
unsigned long reason = mcsr;
int recoverable = 1;
+ if (reason & MCSR_LD) {
+ recoverable = fsl_rio_mcheck_exception(regs);
+ if (recoverable == 1)
+ goto silent_out;
+ }
+
printk("Machine check in kernel mode.\n");
printk("Caused by (from MCSR=%lx): ", reason);
@@ -451,7 +515,14 @@ int machine_check_e500mc(struct pt_regs *regs)
if (reason & MCSR_DCPERR_MC) {
printk("Data Cache Parity Error\n");
- recoverable = 0;
+
+ /*
+ * In write shadow mode we auto-recover from the error, but it
+ * may still get logged and cause a machine check. We should
+ * only treat the non-write shadow case as non-recoverable.
+ */
+ if (!(mfspr(SPRN_L1CSR2) & L1CSR2_DCWS))
+ recoverable = 0;
}
if (reason & MCSR_L2MMU_MHIT) {
@@ -500,6 +571,7 @@ int machine_check_e500mc(struct pt_regs *regs)
reason & MCSR_MEA ? "Effective" : "Physical", addr);
}
+silent_out:
mtspr(SPRN_MCSR, mcsr);
return mfspr(SPRN_MCSR) == 0 && recoverable;
}
@@ -508,6 +580,13 @@ int machine_check_e500(struct pt_regs *regs)
{
unsigned long reason = get_mc_reason(regs);
+ if (reason & MCSR_BUS_RBERR) {
+ if (fsl_rio_mcheck_exception(regs))
+ return 1;
+ if (fsl_pci_mcheck_exception(regs))
+ return 1;
+ }
+
printk("Machine check in kernel mode.\n");
printk("Caused by (from MCSR=%lx): ", reason);
@@ -608,6 +687,7 @@ int machine_check_generic(struct pt_regs *regs)
void machine_check_exception(struct pt_regs *regs)
{
+ enum ctx_state prev_state = exception_enter();
int recover = 0;
__get_cpu_var(irq_stat).mce_exceptions++;
@@ -624,13 +704,7 @@ void machine_check_exception(struct pt_regs *regs)
recover = cur_cpu_spec->machine_check(regs);
if (recover > 0)
- return;
-
- if (user_mode(regs)) {
- regs->msr |= MSR_RI;
- _exception(SIGBUS, regs, BUS_ADRERR, regs->nip);
- return;
- }
+ goto bail;
#if defined(CONFIG_8xx) && defined(CONFIG_PCI)
/* the qspan pci read routines can cause machine checks -- Cort
@@ -640,24 +714,23 @@ void machine_check_exception(struct pt_regs *regs)
* -- BenH
*/
bad_page_fault(regs, regs->dar, SIGBUS);
- return;
+ goto bail;
#endif
- if (debugger_fault_handler(regs)) {
- regs->msr |= MSR_RI;
- return;
- }
+ if (debugger_fault_handler(regs))
+ goto bail;
if (check_io_access(regs))
- return;
+ goto bail;
- if (debugger_fault_handler(regs))
- return;
die("Machine check", regs, SIGBUS);
/* Must die if the interrupt is not recoverable */
if (!(regs->msr & MSR_RI))
panic("Unrecoverable Machine check");
+
+bail:
+ exception_exit(prev_state);
}
void SMIException(struct pt_regs *regs)
@@ -667,20 +740,29 @@ void SMIException(struct pt_regs *regs)
void unknown_exception(struct pt_regs *regs)
{
+ enum ctx_state prev_state = exception_enter();
+
printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n",
regs->nip, regs->msr, regs->trap);
_exception(SIGTRAP, regs, 0, 0);
+
+ exception_exit(prev_state);
}
void instruction_breakpoint_exception(struct pt_regs *regs)
{
+ enum ctx_state prev_state = exception_enter();
+
if (notify_die(DIE_IABR_MATCH, "iabr_match", regs, 5,
5, SIGTRAP) == NOTIFY_STOP)
- return;
+ goto bail;
if (debugger_iabr_match(regs))
- return;
+ goto bail;
_exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip);
+
+bail:
+ exception_exit(prev_state);
}
void RunModeException(struct pt_regs *regs)
@@ -690,15 +772,20 @@ void RunModeException(struct pt_regs *regs)
void __kprobes single_step_exception(struct pt_regs *regs)
{
+ enum ctx_state prev_state = exception_enter();
+
clear_single_step(regs);
if (notify_die(DIE_SSTEP, "single_step", regs, 5,
5, SIGTRAP) == NOTIFY_STOP)
- return;
+ goto bail;
if (debugger_sstep(regs))
- return;
+ goto bail;
_exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
+
+bail:
+ exception_exit(prev_state);
}
/*
@@ -746,7 +833,7 @@ static void parse_fpe(struct pt_regs *regs)
flush_fp_to_thread(current);
- code = __parse_fpscr(current->thread.fpscr.val);
+ code = __parse_fpscr(current->thread.fp_state.fpscr);
_exception(SIGFPE, regs, code, regs->nip);
}
@@ -797,6 +884,10 @@ static int emulate_string_inst(struct pt_regs *regs, u32 instword)
u8 val;
u32 shift = 8 * (3 - (pos & 0x3));
+ /* if process is 32-bit, clear upper 32 bits of EA */
+ if ((regs->msr & MSR_64BIT) == 0)
+ EA &= 0xFFFFFFFF;
+
switch ((instword & PPC_INST_STRING_MASK)) {
case PPC_INST_LSWX:
case PPC_INST_LSWI:
@@ -864,12 +955,34 @@ static int emulate_isel(struct pt_regs *regs, u32 instword)
return 0;
}
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static inline bool tm_abort_check(struct pt_regs *regs, int cause)
+{
+ /* If we're emulating a load/store in an active transaction, we cannot
+ * emulate it as the kernel operates in transaction suspended context.
+ * We need to abort the transaction. This creates a persistent TM
+ * abort so tell the user what caused it with a new code.
+ */
+ if (MSR_TM_TRANSACTIONAL(regs->msr)) {
+ tm_enable();
+ tm_abort(cause);
+ return true;
+ }
+ return false;
+}
+#else
+static inline bool tm_abort_check(struct pt_regs *regs, int reason)
+{
+ return false;
+}
+#endif
+
static int emulate_instruction(struct pt_regs *regs)
{
u32 instword;
u32 rd;
- if (!user_mode(regs) || (regs->msr & MSR_LE))
+ if (!user_mode(regs))
return -EINVAL;
CHECK_FULL_REGS(regs);
@@ -903,6 +1016,9 @@ static int emulate_instruction(struct pt_regs *regs)
/* Emulate load/store string insn. */
if ((instword & PPC_INST_STRING_GEN_MASK) == PPC_INST_STRING) {
+ if (tm_abort_check(regs,
+ TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT))
+ return -EINVAL;
PPC_WARN_EMULATED(string, regs);
return emulate_string_inst(regs, instword);
}
@@ -919,6 +1035,40 @@ static int emulate_instruction(struct pt_regs *regs)
return emulate_isel(regs, instword);
}
+ /* Emulate sync instruction variants */
+ if ((instword & PPC_INST_SYNC_MASK) == PPC_INST_SYNC) {
+ PPC_WARN_EMULATED(sync, regs);
+ asm volatile("sync");
+ return 0;
+ }
+
+#ifdef CONFIG_PPC64
+ /* Emulate the mfspr rD, DSCR. */
+ if ((((instword & PPC_INST_MFSPR_DSCR_USER_MASK) ==
+ PPC_INST_MFSPR_DSCR_USER) ||
+ ((instword & PPC_INST_MFSPR_DSCR_MASK) ==
+ PPC_INST_MFSPR_DSCR)) &&
+ cpu_has_feature(CPU_FTR_DSCR)) {
+ PPC_WARN_EMULATED(mfdscr, regs);
+ rd = (instword >> 21) & 0x1f;
+ regs->gpr[rd] = mfspr(SPRN_DSCR);
+ return 0;
+ }
+ /* Emulate the mtspr DSCR, rD. */
+ if ((((instword & PPC_INST_MTSPR_DSCR_USER_MASK) ==
+ PPC_INST_MTSPR_DSCR_USER) ||
+ ((instword & PPC_INST_MTSPR_DSCR_MASK) ==
+ PPC_INST_MTSPR_DSCR)) &&
+ cpu_has_feature(CPU_FTR_DSCR)) {
+ PPC_WARN_EMULATED(mtdscr, regs);
+ rd = (instword >> 21) & 0x1f;
+ current->thread.dscr = regs->gpr[rd];
+ current->thread.dscr_inherit = 1;
+ mtspr(SPRN_DSCR, current->thread.dscr);
+ return 0;
+ }
+#endif
+
return -EINVAL;
}
@@ -927,10 +1077,41 @@ int is_valid_bugaddr(unsigned long addr)
return is_kernel_addr(addr);
}
+#ifdef CONFIG_MATH_EMULATION
+static int emulate_math(struct pt_regs *regs)
+{
+ int ret;
+ extern int do_mathemu(struct pt_regs *regs);
+
+ ret = do_mathemu(regs);
+ if (ret >= 0)
+ PPC_WARN_EMULATED(math, regs);
+
+ switch (ret) {
+ case 0:
+ emulate_single_step(regs);
+ return 0;
+ case 1: {
+ int code = 0;
+ code = __parse_fpscr(current->thread.fp_state.fpscr);
+ _exception(SIGFPE, regs, code, regs->nip);
+ return 0;
+ }
+ case -EFAULT:
+ _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
+ return 0;
+ }
+
+ return -1;
+}
+#else
+static inline int emulate_math(struct pt_regs *regs) { return -1; }
+#endif
+
void __kprobes program_check_exception(struct pt_regs *regs)
{
+ enum ctx_state prev_state = exception_enter();
unsigned int reason = get_reason(regs);
- extern int do_mathemu(struct pt_regs *regs);
/* We can now get here via a FP Unavailable exception if the core
* has no FPU, in that case the reason flags will be 0 */
@@ -938,54 +1119,84 @@ void __kprobes program_check_exception(struct pt_regs *regs)
if (reason & REASON_FP) {
/* IEEE FP exception */
parse_fpe(regs);
- return;
+ goto bail;
}
if (reason & REASON_TRAP) {
/* Debugger is first in line to stop recursive faults in
* rcu_lock, notify_die, or atomic_notifier_call_chain */
if (debugger_bpt(regs))
- return;
+ goto bail;
/* trap exception */
if (notify_die(DIE_BPT, "breakpoint", regs, 5, 5, SIGTRAP)
== NOTIFY_STOP)
- return;
+ goto bail;
if (!(regs->msr & MSR_PR) && /* not user-mode */
report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) {
regs->nip += 4;
- return;
+ goto bail;
}
_exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip);
- return;
+ goto bail;
}
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ if (reason & REASON_TM) {
+ /* This is a TM "Bad Thing Exception" program check.
+ * This occurs when:
+ * - An rfid/hrfid/mtmsrd attempts to cause an illegal
+ * transition in TM states.
+ * - A trechkpt is attempted when transactional.
+ * - A treclaim is attempted when non transactional.
+ * - A tend is illegally attempted.
+ * - writing a TM SPR when transactional.
+ */
+ if (!user_mode(regs) &&
+ report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) {
+ regs->nip += 4;
+ goto bail;
+ }
+ /* If usermode caused this, it's done something illegal and
+ * gets a SIGILL slap on the wrist. We call it an illegal
+ * operand to distinguish from the instruction just being bad
+ * (e.g. executing a 'tend' on a CPU without TM!); it's an
+ * illegal /placement/ of a valid instruction.
+ */
+ if (user_mode(regs)) {
+ _exception(SIGILL, regs, ILL_ILLOPN, regs->nip);
+ goto bail;
+ } else {
+ printk(KERN_EMERG "Unexpected TM Bad Thing exception "
+ "at %lx (msr 0x%x)\n", regs->nip, reason);
+ die("Unrecoverable exception", regs, SIGABRT);
+ }
+ }
+#endif
- local_irq_enable();
+ /*
+ * If we took the program check in the kernel skip down to sending a
+ * SIGILL. The subsequent cases all relate to emulating instructions
+ * which we should only do for userspace. We also do not want to enable
+ * interrupts for kernel faults because that might lead to further
+ * faults, and loose the context of the original exception.
+ */
+ if (!user_mode(regs))
+ goto sigill;
+
+ /* We restore the interrupt state now */
+ if (!arch_irq_disabled_regs(regs))
+ local_irq_enable();
-#ifdef CONFIG_MATH_EMULATION
/* (reason & REASON_ILLEGAL) would be the obvious thing here,
* but there seems to be a hardware bug on the 405GP (RevD)
* that means ESR is sometimes set incorrectly - either to
* ESR_DST (!?) or 0. In the process of chasing this with the
* hardware people - not sure if it can happen on any illegal
* instruction or only on FP instructions, whether there is a
- * pattern to occurences etc. -dgibson 31/Mar/2003 */
- switch (do_mathemu(regs)) {
- case 0:
- emulate_single_step(regs);
- return;
- case 1: {
- int code = 0;
- code = __parse_fpscr(current->thread.fpscr.val);
- _exception(SIGFPE, regs, code, regs->nip);
- return;
- }
- case -EFAULT:
- _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
- return;
- }
- /* fall through on any other errors */
-#endif /* CONFIG_MATH_EMULATION */
+ * pattern to occurrences etc. -dgibson 31/Mar/2003
+ */
+ if (!emulate_math(regs))
+ goto bail;
/* Try to emulate it if we should. */
if (reason & (REASON_ILLEGAL | REASON_PRIVILEGED)) {
@@ -993,23 +1204,45 @@ void __kprobes program_check_exception(struct pt_regs *regs)
case 0:
regs->nip += 4;
emulate_single_step(regs);
- return;
+ goto bail;
case -EFAULT:
_exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
- return;
+ goto bail;
}
}
+sigill:
if (reason & REASON_PRIVILEGED)
_exception(SIGILL, regs, ILL_PRVOPC, regs->nip);
else
_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
+
+bail:
+ exception_exit(prev_state);
+}
+
+/*
+ * This occurs when running in hypervisor mode on POWER6 or later
+ * and an illegal instruction is encountered.
+ */
+void __kprobes emulation_assist_interrupt(struct pt_regs *regs)
+{
+ regs->msr |= REASON_ILLEGAL;
+ program_check_exception(regs);
}
void alignment_exception(struct pt_regs *regs)
{
+ enum ctx_state prev_state = exception_enter();
int sig, code, fixed = 0;
+ /* We restore the interrupt state now */
+ if (!arch_irq_disabled_regs(regs))
+ local_irq_enable();
+
+ if (tm_abort_check(regs, TM_CAUSE_ALIGNMENT | TM_CAUSE_PERSISTENT))
+ goto bail;
+
/* we don't implement logging of alignment exceptions */
if (!(current->thread.align_ctl & PR_UNALIGN_SIGBUS))
fixed = fix_alignment(regs);
@@ -1017,7 +1250,7 @@ void alignment_exception(struct pt_regs *regs)
if (fixed == 1) {
regs->nip += 4; /* skip over emulated instruction */
emulate_single_step(regs);
- return;
+ goto bail;
}
/* Operand address was bad */
@@ -1032,6 +1265,9 @@ void alignment_exception(struct pt_regs *regs)
_exception(sig, regs, code, regs->dar);
else
bad_page_fault(regs, regs->dar, sig);
+
+bail:
+ exception_exit(prev_state);
}
void StackOverflow(struct pt_regs *regs)
@@ -1060,23 +1296,32 @@ void trace_syscall(struct pt_regs *regs)
void kernel_fp_unavailable_exception(struct pt_regs *regs)
{
+ enum ctx_state prev_state = exception_enter();
+
printk(KERN_EMERG "Unrecoverable FP Unavailable Exception "
"%lx at %lx\n", regs->trap, regs->nip);
die("Unrecoverable FP Unavailable Exception", regs, SIGABRT);
+
+ exception_exit(prev_state);
}
void altivec_unavailable_exception(struct pt_regs *regs)
{
+ enum ctx_state prev_state = exception_enter();
+
if (user_mode(regs)) {
/* A user program has executed an altivec instruction,
but this kernel doesn't support altivec. */
_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
- return;
+ goto bail;
}
printk(KERN_EMERG "Unrecoverable VMX/Altivec Unavailable Exception "
"%lx at %lx\n", regs->trap, regs->nip);
die("Unrecoverable VMX/Altivec Unavailable Exception", regs, SIGABRT);
+
+bail:
+ exception_exit(prev_state);
}
void vsx_unavailable_exception(struct pt_regs *regs)
@@ -1093,6 +1338,161 @@ void vsx_unavailable_exception(struct pt_regs *regs)
die("Unrecoverable VSX Unavailable Exception", regs, SIGABRT);
}
+#ifdef CONFIG_PPC64
+void facility_unavailable_exception(struct pt_regs *regs)
+{
+ static char *facility_strings[] = {
+ [FSCR_FP_LG] = "FPU",
+ [FSCR_VECVSX_LG] = "VMX/VSX",
+ [FSCR_DSCR_LG] = "DSCR",
+ [FSCR_PM_LG] = "PMU SPRs",
+ [FSCR_BHRB_LG] = "BHRB",
+ [FSCR_TM_LG] = "TM",
+ [FSCR_EBB_LG] = "EBB",
+ [FSCR_TAR_LG] = "TAR",
+ };
+ char *facility = "unknown";
+ u64 value;
+ u8 status;
+ bool hv;
+
+ hv = (regs->trap == 0xf80);
+ if (hv)
+ value = mfspr(SPRN_HFSCR);
+ else
+ value = mfspr(SPRN_FSCR);
+
+ status = value >> 56;
+ if (status == FSCR_DSCR_LG) {
+ /* User is acessing the DSCR. Set the inherit bit and allow
+ * the user to set it directly in future by setting via the
+ * FSCR DSCR bit. We always leave HFSCR DSCR set.
+ */
+ current->thread.dscr_inherit = 1;
+ mtspr(SPRN_FSCR, value | FSCR_DSCR);
+ return;
+ }
+
+ if ((status < ARRAY_SIZE(facility_strings)) &&
+ facility_strings[status])
+ facility = facility_strings[status];
+
+ /* We restore the interrupt state now */
+ if (!arch_irq_disabled_regs(regs))
+ local_irq_enable();
+
+ pr_err_ratelimited(
+ "%sFacility '%s' unavailable, exception at 0x%lx, MSR=%lx\n",
+ hv ? "Hypervisor " : "", facility, regs->nip, regs->msr);
+
+ if (user_mode(regs)) {
+ _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
+ return;
+ }
+
+ die("Unexpected facility unavailable exception", regs, SIGABRT);
+}
+#endif
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+
+void fp_unavailable_tm(struct pt_regs *regs)
+{
+ /* Note: This does not handle any kind of FP laziness. */
+
+ TM_DEBUG("FP Unavailable trap whilst transactional at 0x%lx, MSR=%lx\n",
+ regs->nip, regs->msr);
+
+ /* We can only have got here if the task started using FP after
+ * beginning the transaction. So, the transactional regs are just a
+ * copy of the checkpointed ones. But, we still need to recheckpoint
+ * as we're enabling FP for the process; it will return, abort the
+ * transaction, and probably retry but now with FP enabled. So the
+ * checkpointed FP registers need to be loaded.
+ */
+ tm_reclaim_current(TM_CAUSE_FAC_UNAV);
+ /* Reclaim didn't save out any FPRs to transact_fprs. */
+
+ /* Enable FP for the task: */
+ regs->msr |= (MSR_FP | current->thread.fpexc_mode);
+
+ /* This loads and recheckpoints the FP registers from
+ * thread.fpr[]. They will remain in registers after the
+ * checkpoint so we don't need to reload them after.
+ * If VMX is in use, the VRs now hold checkpointed values,
+ * so we don't want to load the VRs from the thread_struct.
+ */
+ tm_recheckpoint(&current->thread, MSR_FP);
+
+ /* If VMX is in use, get the transactional values back */
+ if (regs->msr & MSR_VEC) {
+ do_load_up_transact_altivec(&current->thread);
+ /* At this point all the VSX state is loaded, so enable it */
+ regs->msr |= MSR_VSX;
+ }
+}
+
+void altivec_unavailable_tm(struct pt_regs *regs)
+{
+ /* See the comments in fp_unavailable_tm(). This function operates
+ * the same way.
+ */
+
+ TM_DEBUG("Vector Unavailable trap whilst transactional at 0x%lx,"
+ "MSR=%lx\n",
+ regs->nip, regs->msr);
+ tm_reclaim_current(TM_CAUSE_FAC_UNAV);
+ regs->msr |= MSR_VEC;
+ tm_recheckpoint(&current->thread, MSR_VEC);
+ current->thread.used_vr = 1;
+
+ if (regs->msr & MSR_FP) {
+ do_load_up_transact_fpu(&current->thread);
+ regs->msr |= MSR_VSX;
+ }
+}
+
+void vsx_unavailable_tm(struct pt_regs *regs)
+{
+ unsigned long orig_msr = regs->msr;
+
+ /* See the comments in fp_unavailable_tm(). This works similarly,
+ * though we're loading both FP and VEC registers in here.
+ *
+ * If FP isn't in use, load FP regs. If VEC isn't in use, load VEC
+ * regs. Either way, set MSR_VSX.
+ */
+
+ TM_DEBUG("VSX Unavailable trap whilst transactional at 0x%lx,"
+ "MSR=%lx\n",
+ regs->nip, regs->msr);
+
+ current->thread.used_vsr = 1;
+
+ /* If FP and VMX are already loaded, we have all the state we need */
+ if ((orig_msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC)) {
+ regs->msr |= MSR_VSX;
+ return;
+ }
+
+ /* This reclaims FP and/or VR regs if they're already enabled */
+ tm_reclaim_current(TM_CAUSE_FAC_UNAV);
+
+ regs->msr |= MSR_VEC | MSR_FP | current->thread.fpexc_mode |
+ MSR_VSX;
+
+ /* This loads & recheckpoints FP and VRs; but we have
+ * to be sure not to overwrite previously-valid state.
+ */
+ tm_recheckpoint(&current->thread, regs->msr & ~orig_msr);
+
+ if (orig_msr & MSR_FP)
+ do_load_up_transact_fpu(&current->thread);
+ if (orig_msr & MSR_VEC)
+ do_load_up_transact_altivec(&current->thread);
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
void performance_monitor_exception(struct pt_regs *regs)
{
__get_cpu_var(irq_stat).pmu_irqs++;
@@ -1103,61 +1503,18 @@ void performance_monitor_exception(struct pt_regs *regs)
#ifdef CONFIG_8xx
void SoftwareEmulation(struct pt_regs *regs)
{
- extern int do_mathemu(struct pt_regs *);
- extern int Soft_emulate_8xx(struct pt_regs *);
-#if defined(CONFIG_MATH_EMULATION) || defined(CONFIG_8XX_MINIMAL_FPEMU)
- int errcode;
-#endif
-
CHECK_FULL_REGS(regs);
if (!user_mode(regs)) {
debugger(regs);
- die("Kernel Mode Software FPU Emulation", regs, SIGFPE);
+ die("Kernel Mode Unimplemented Instruction or SW FPU Emulation",
+ regs, SIGFPE);
}
-#ifdef CONFIG_MATH_EMULATION
- errcode = do_mathemu(regs);
- if (errcode >= 0)
- PPC_WARN_EMULATED(math, regs);
-
- switch (errcode) {
- case 0:
- emulate_single_step(regs);
+ if (!emulate_math(regs))
return;
- case 1: {
- int code = 0;
- code = __parse_fpscr(current->thread.fpscr.val);
- _exception(SIGFPE, regs, code, regs->nip);
- return;
- }
- case -EFAULT:
- _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
- return;
- default:
- _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
- return;
- }
-#elif defined(CONFIG_8XX_MINIMAL_FPEMU)
- errcode = Soft_emulate_8xx(regs);
- if (errcode >= 0)
- PPC_WARN_EMULATED(8xx, regs);
-
- switch (errcode) {
- case 0:
- emulate_single_step(regs);
- return;
- case 1:
- _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
- return;
- case -EFAULT:
- _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
- return;
- }
-#else
_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
-#endif
}
#endif /* CONFIG_8xx */
@@ -1172,7 +1529,7 @@ static void handle_debug(struct pt_regs *regs, unsigned long debug_status)
if (debug_status & (DBSR_DAC1R | DBSR_DAC1W)) {
dbcr_dac(current) &= ~(DBCR_DAC1R | DBCR_DAC1W);
#ifdef CONFIG_PPC_ADV_DEBUG_DAC_RANGE
- current->thread.dbcr2 &= ~DBCR2_DAC12MODE;
+ current->thread.debug.dbcr2 &= ~DBCR2_DAC12MODE;
#endif
do_send_trap(regs, mfspr(SPRN_DAC1), debug_status, TRAP_HWBKPT,
5);
@@ -1183,24 +1540,24 @@ static void handle_debug(struct pt_regs *regs, unsigned long debug_status)
6);
changed |= 0x01;
} else if (debug_status & DBSR_IAC1) {
- current->thread.dbcr0 &= ~DBCR0_IAC1;
+ current->thread.debug.dbcr0 &= ~DBCR0_IAC1;
dbcr_iac_range(current) &= ~DBCR_IAC12MODE;
do_send_trap(regs, mfspr(SPRN_IAC1), debug_status, TRAP_HWBKPT,
1);
changed |= 0x01;
} else if (debug_status & DBSR_IAC2) {
- current->thread.dbcr0 &= ~DBCR0_IAC2;
+ current->thread.debug.dbcr0 &= ~DBCR0_IAC2;
do_send_trap(regs, mfspr(SPRN_IAC2), debug_status, TRAP_HWBKPT,
2);
changed |= 0x01;
} else if (debug_status & DBSR_IAC3) {
- current->thread.dbcr0 &= ~DBCR0_IAC3;
+ current->thread.debug.dbcr0 &= ~DBCR0_IAC3;
dbcr_iac_range(current) &= ~DBCR_IAC34MODE;
do_send_trap(regs, mfspr(SPRN_IAC3), debug_status, TRAP_HWBKPT,
3);
changed |= 0x01;
} else if (debug_status & DBSR_IAC4) {
- current->thread.dbcr0 &= ~DBCR0_IAC4;
+ current->thread.debug.dbcr0 &= ~DBCR0_IAC4;
do_send_trap(regs, mfspr(SPRN_IAC4), debug_status, TRAP_HWBKPT,
4);
changed |= 0x01;
@@ -1210,19 +1567,20 @@ static void handle_debug(struct pt_regs *regs, unsigned long debug_status)
* Check all other debug flags and see if that bit needs to be turned
* back on or not.
*/
- if (DBCR_ACTIVE_EVENTS(current->thread.dbcr0, current->thread.dbcr1))
+ if (DBCR_ACTIVE_EVENTS(current->thread.debug.dbcr0,
+ current->thread.debug.dbcr1))
regs->msr |= MSR_DE;
else
/* Make sure the IDM flag is off */
- current->thread.dbcr0 &= ~DBCR0_IDM;
+ current->thread.debug.dbcr0 &= ~DBCR0_IDM;
if (changed & 0x01)
- mtspr(SPRN_DBCR0, current->thread.dbcr0);
+ mtspr(SPRN_DBCR0, current->thread.debug.dbcr0);
}
void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status)
{
- current->thread.dbsr = debug_status;
+ current->thread.debug.dbsr = debug_status;
/* Hack alert: On BookE, Branch Taken stops on the branch itself, while
* on server, it stops on the target of the branch. In order to simulate
@@ -1239,8 +1597,8 @@ void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status)
/* Do the single step trick only when coming from userspace */
if (user_mode(regs)) {
- current->thread.dbcr0 &= ~DBCR0_BT;
- current->thread.dbcr0 |= DBCR0_IDM | DBCR0_IC;
+ current->thread.debug.dbcr0 &= ~DBCR0_BT;
+ current->thread.debug.dbcr0 |= DBCR0_IDM | DBCR0_IC;
regs->msr |= MSR_DE;
return;
}
@@ -1268,15 +1626,13 @@ void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status)
return;
if (user_mode(regs)) {
- current->thread.dbcr0 &= ~DBCR0_IC;
-#ifdef CONFIG_PPC_ADV_DEBUG_REGS
- if (DBCR_ACTIVE_EVENTS(current->thread.dbcr0,
- current->thread.dbcr1))
+ current->thread.debug.dbcr0 &= ~DBCR0_IC;
+ if (DBCR_ACTIVE_EVENTS(current->thread.debug.dbcr0,
+ current->thread.debug.dbcr1))
regs->msr |= MSR_DE;
else
/* Make sure the IDM bit is off */
- current->thread.dbcr0 &= ~DBCR0_IDM;
-#endif
+ current->thread.debug.dbcr0 &= ~DBCR0_IDM;
}
_exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
@@ -1320,10 +1676,9 @@ void altivec_assist_exception(struct pt_regs *regs)
} else {
/* didn't recognize the instruction */
/* XXX quick hack for now: set the non-Java bit in the VSCR */
- if (printk_ratelimit())
- printk(KERN_ERR "Unrecognized altivec instruction "
- "in %s at %lx\n", current->comm, regs->nip);
- current->thread.vscr.u[3] |= 0x10000;
+ printk_ratelimited(KERN_ERR "Unrecognized altivec instruction "
+ "in %s at %lx\n", current->comm, regs->nip);
+ current->thread.vr_state.vscr.u[3] |= 0x10000;
}
}
#endif /* CONFIG_ALTIVEC */
@@ -1366,10 +1721,7 @@ void SPEFloatingPointException(struct pt_regs *regs)
int code = 0;
int err;
- preempt_disable();
- if (regs->msr & MSR_SPE)
- giveup_spe(current);
- preempt_enable();
+ flush_spe_to_thread(current);
spefscr = current->thread.spefscr;
fpexc_mode = current->thread.fpexc_mode;
@@ -1454,7 +1806,7 @@ void unrecoverable_exception(struct pt_regs *regs)
die("Unrecoverable exception", regs, SIGABRT);
}
-#ifdef CONFIG_BOOKE_WDT
+#if defined(CONFIG_BOOKE_WDT) || defined(CONFIG_40x)
/*
* Default handler for a Watchdog exception,
* spins until a reboot occurs
@@ -1507,24 +1859,27 @@ struct ppc_emulated ppc_emulated = {
WARN_EMULATED_SETUP(popcntb),
WARN_EMULATED_SETUP(spe),
WARN_EMULATED_SETUP(string),
+ WARN_EMULATED_SETUP(sync),
WARN_EMULATED_SETUP(unaligned),
#ifdef CONFIG_MATH_EMULATION
WARN_EMULATED_SETUP(math),
-#elif defined(CONFIG_8XX_MINIMAL_FPEMU)
- WARN_EMULATED_SETUP(8xx),
#endif
#ifdef CONFIG_VSX
WARN_EMULATED_SETUP(vsx),
#endif
+#ifdef CONFIG_PPC64
+ WARN_EMULATED_SETUP(mfdscr),
+ WARN_EMULATED_SETUP(mtdscr),
+ WARN_EMULATED_SETUP(lq_stq),
+#endif
};
u32 ppc_warn_emulated;
void ppc_warn_emulated_print(const char *type)
{
- if (printk_ratelimit())
- pr_warning("%s used emulated %s instruction\n", current->comm,
- type);
+ pr_warn_ratelimited("%s used emulated %s instruction\n", current->comm,
+ type);
}
static int __init ppc_warn_emulated_init(void)
diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c
index e39cad83c88..b7aa07279a6 100644
--- a/arch/powerpc/kernel/udbg.c
+++ b/arch/powerpc/kernel/udbg.c
@@ -31,6 +31,9 @@ void __init udbg_early_init(void)
#if defined(CONFIG_PPC_EARLY_DEBUG_LPAR)
/* For LPAR machines that have an HVC console on vterm 0 */
udbg_init_debug_lpar();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_LPAR_HVSI)
+ /* For LPAR machines that have an HVSI console on vterm 0 */
+ udbg_init_debug_lpar_hvsi();
#elif defined(CONFIG_PPC_EARLY_DEBUG_G5)
/* For use on Apple G5 machines */
udbg_init_pmac_realmode();
@@ -43,14 +46,11 @@ void __init udbg_early_init(void)
#elif defined(CONFIG_PPC_EARLY_DEBUG_MAPLE)
/* Maple real mode debug */
udbg_init_maple_realmode();
-#elif defined(CONFIG_PPC_EARLY_DEBUG_ISERIES)
- /* For iSeries - hit Ctrl-x Ctrl-x to see the output */
- udbg_init_iseries();
#elif defined(CONFIG_PPC_EARLY_DEBUG_BEAT)
udbg_init_debug_beat();
#elif defined(CONFIG_PPC_EARLY_DEBUG_PAS_REALMODE)
udbg_init_pas_realmode();
-#elif defined(CONFIG_BOOTX_TEXT)
+#elif defined(CONFIG_PPC_EARLY_DEBUG_BOOTX)
udbg_init_btext();
#elif defined(CONFIG_PPC_EARLY_DEBUG_44x)
/* PPC44x debug */
@@ -62,10 +62,23 @@ void __init udbg_early_init(void)
udbg_init_cpm();
#elif defined(CONFIG_PPC_EARLY_DEBUG_USBGECKO)
udbg_init_usbgecko();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_MEMCONS)
+ /* In memory console */
+ udbg_init_memcons();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_EHV_BC)
+ udbg_init_ehv_bc();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_PS3GELIC)
+ udbg_init_ps3gelic();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_OPAL_RAW)
+ udbg_init_debug_opal_raw();
+#elif defined(CONFIG_PPC_EARLY_DEBUG_OPAL_HVSI)
+ udbg_init_debug_opal_hvsi();
#endif
#ifdef CONFIG_PPC_EARLY_DEBUG
console_loglevel = 10;
+
+ register_early_udbg_console();
#endif
}
@@ -110,29 +123,6 @@ int udbg_write(const char *s, int n)
return n - remain;
}
-int udbg_read(char *buf, int buflen)
-{
- char *p = buf;
- int i, c;
-
- if (!udbg_getc)
- return 0;
-
- for (i = 0; i < buflen; ++i) {
- do {
- c = udbg_getc();
- if (c == -1 && i == 0)
- return -1;
-
- } while (c == 0x11 || c == 0x13);
- if (c == 0 || c == -1)
- break;
- *p++ = c;
- }
-
- return i;
-}
-
#define UDBG_BUFSIZE 256
void udbg_printf(const char *fmt, ...)
{
@@ -167,15 +157,13 @@ static struct console udbg_console = {
.index = 0,
};
-static int early_console_initialized;
-
/*
* Called by setup_system after ppc_md->probe and ppc_md->early_init.
* Call it again after setting udbg_putc in ppc_md->setup_arch.
*/
void __init register_early_udbg_console(void)
{
- if (early_console_initialized)
+ if (early_console)
return;
if (!udbg_putc)
@@ -185,7 +173,7 @@ void __init register_early_udbg_console(void)
printk(KERN_INFO "early console immortal !\n");
udbg_console.flags &= ~CON_BOOT;
}
- early_console_initialized = 1;
+ early_console = &udbg_console;
register_console(&udbg_console);
}
diff --git a/arch/powerpc/kernel/udbg_16550.c b/arch/powerpc/kernel/udbg_16550.c
index b4b167b3364..6e7c4923b5e 100644
--- a/arch/powerpc/kernel/udbg_16550.c
+++ b/arch/powerpc/kernel/udbg_16550.c
@@ -1,5 +1,5 @@
/*
- * udbg for NS16550 compatable serial ports
+ * udbg for NS16550 compatible serial ports
*
* Copyright (C) 2001-2005 PPC 64 Team, IBM Corp
*
@@ -11,29 +11,26 @@
#include <linux/types.h>
#include <asm/udbg.h>
#include <asm/io.h>
+#include <asm/reg_a2.h>
extern u8 real_readb(volatile u8 __iomem *addr);
extern void real_writeb(u8 data, volatile u8 __iomem *addr);
extern u8 real_205_readb(volatile u8 __iomem *addr);
extern void real_205_writeb(u8 data, volatile u8 __iomem *addr);
-struct NS16550 {
- /* this struct must be packed */
- unsigned char rbr; /* 0 */
- unsigned char ier; /* 1 */
- unsigned char fcr; /* 2 */
- unsigned char lcr; /* 3 */
- unsigned char mcr; /* 4 */
- unsigned char lsr; /* 5 */
- unsigned char msr; /* 6 */
- unsigned char scr; /* 7 */
-};
-
-#define thr rbr
-#define iir fcr
-#define dll rbr
-#define dlm ier
-#define dlab lcr
+#define UART_RBR 0
+#define UART_IER 1
+#define UART_FCR 2
+#define UART_LCR 3
+#define UART_MCR 4
+#define UART_LSR 5
+#define UART_MSR 6
+#define UART_SCR 7
+#define UART_THR UART_RBR
+#define UART_IIR UART_FCR
+#define UART_DLL UART_RBR
+#define UART_DLM UART_IER
+#define UART_DLAB UART_LCR
#define LSR_DR 0x01 /* Data ready */
#define LSR_OE 0x02 /* Overrun */
@@ -46,52 +43,62 @@ struct NS16550 {
#define LCR_DLAB 0x80
-static struct NS16550 __iomem *udbg_comport;
+static u8 (*udbg_uart_in)(unsigned int reg);
+static void (*udbg_uart_out)(unsigned int reg, u8 data);
-static void udbg_550_flush(void)
+static void udbg_uart_flush(void)
{
- if (udbg_comport) {
- while ((in_8(&udbg_comport->lsr) & LSR_THRE) == 0)
- /* wait for idle */;
- }
+ if (!udbg_uart_in)
+ return;
+
+ /* wait for idle */
+ while ((udbg_uart_in(UART_LSR) & LSR_THRE) == 0)
+ cpu_relax();
}
-static void udbg_550_putc(char c)
+static void udbg_uart_putc(char c)
{
- if (udbg_comport) {
- if (c == '\n')
- udbg_550_putc('\r');
- udbg_550_flush();
- out_8(&udbg_comport->thr, c);
- }
+ if (!udbg_uart_out)
+ return;
+
+ if (c == '\n')
+ udbg_uart_putc('\r');
+ udbg_uart_flush();
+ udbg_uart_out(UART_THR, c);
}
-static int udbg_550_getc_poll(void)
+static int udbg_uart_getc_poll(void)
{
- if (udbg_comport) {
- if ((in_8(&udbg_comport->lsr) & LSR_DR) != 0)
- return in_8(&udbg_comport->rbr);
- else
- return -1;
- }
+ if (!udbg_uart_in || !(udbg_uart_in(UART_LSR) & LSR_DR))
+ return udbg_uart_in(UART_RBR);
return -1;
}
-static int udbg_550_getc(void)
+static int udbg_uart_getc(void)
{
- if (udbg_comport) {
- while ((in_8(&udbg_comport->lsr) & LSR_DR) == 0)
- /* wait for char */;
- return in_8(&udbg_comport->rbr);
- }
- return -1;
+ if (!udbg_uart_in)
+ return -1;
+ /* wait for char */
+ while (!(udbg_uart_in(UART_LSR) & LSR_DR))
+ cpu_relax();
+ return udbg_uart_in(UART_RBR);
}
-void udbg_init_uart(void __iomem *comport, unsigned int speed,
- unsigned int clock)
+static void udbg_use_uart(void)
+{
+ udbg_putc = udbg_uart_putc;
+ udbg_flush = udbg_uart_flush;
+ udbg_getc = udbg_uart_getc;
+ udbg_getc_poll = udbg_uart_getc_poll;
+}
+
+void udbg_uart_setup(unsigned int speed, unsigned int clock)
{
unsigned int dll, base_bauds;
+ if (!udbg_uart_out)
+ return;
+
if (clock == 0)
clock = 1843200;
if (speed == 0)
@@ -100,51 +107,43 @@ void udbg_init_uart(void __iomem *comport, unsigned int speed,
base_bauds = clock / 16;
dll = base_bauds / speed;
- if (comport) {
- udbg_comport = (struct NS16550 __iomem *)comport;
- out_8(&udbg_comport->lcr, 0x00);
- out_8(&udbg_comport->ier, 0xff);
- out_8(&udbg_comport->ier, 0x00);
- out_8(&udbg_comport->lcr, LCR_DLAB);
- out_8(&udbg_comport->dll, dll & 0xff);
- out_8(&udbg_comport->dlm, dll >> 8);
- /* 8 data, 1 stop, no parity */
- out_8(&udbg_comport->lcr, 0x03);
- /* RTS/DTR */
- out_8(&udbg_comport->mcr, 0x03);
- /* Clear & enable FIFOs */
- out_8(&udbg_comport->fcr ,0x07);
- udbg_putc = udbg_550_putc;
- udbg_flush = udbg_550_flush;
- udbg_getc = udbg_550_getc;
- udbg_getc_poll = udbg_550_getc_poll;
- }
+ udbg_uart_out(UART_LCR, 0x00);
+ udbg_uart_out(UART_IER, 0xff);
+ udbg_uart_out(UART_IER, 0x00);
+ udbg_uart_out(UART_LCR, LCR_DLAB);
+ udbg_uart_out(UART_DLL, dll & 0xff);
+ udbg_uart_out(UART_DLM, dll >> 8);
+ /* 8 data, 1 stop, no parity */
+ udbg_uart_out(UART_LCR, 0x3);
+ /* RTS/DTR */
+ udbg_uart_out(UART_MCR, 0x3);
+ /* Clear & enable FIFOs */
+ udbg_uart_out(UART_FCR, 0x7);
}
-unsigned int udbg_probe_uart_speed(void __iomem *comport, unsigned int clock)
+unsigned int udbg_probe_uart_speed(unsigned int clock)
{
unsigned int dll, dlm, divisor, prescaler, speed;
u8 old_lcr;
- struct NS16550 __iomem *port = comport;
- old_lcr = in_8(&port->lcr);
+ old_lcr = udbg_uart_in(UART_LCR);
/* select divisor latch registers. */
- out_8(&port->lcr, LCR_DLAB);
+ udbg_uart_out(UART_LCR, old_lcr | LCR_DLAB);
/* now, read the divisor */
- dll = in_8(&port->dll);
- dlm = in_8(&port->dlm);
+ dll = udbg_uart_in(UART_DLL);
+ dlm = udbg_uart_in(UART_DLM);
divisor = dlm << 8 | dll;
/* check prescaling */
- if (in_8(&port->mcr) & 0x80)
+ if (udbg_uart_in(UART_MCR) & 0x80)
prescaler = 4;
else
prescaler = 1;
/* restore the LCR */
- out_8(&port->lcr, old_lcr);
+ udbg_uart_out(UART_LCR, old_lcr);
/* calculate speed */
speed = (clock / prescaler) / (divisor * 16);
@@ -156,145 +155,144 @@ unsigned int udbg_probe_uart_speed(void __iomem *comport, unsigned int clock)
return speed;
}
-#ifdef CONFIG_PPC_MAPLE
-void udbg_maple_real_flush(void)
+static union {
+ unsigned char __iomem *mmio_base;
+ unsigned long pio_base;
+} udbg_uart;
+
+static unsigned int udbg_uart_stride = 1;
+
+static u8 udbg_uart_in_pio(unsigned int reg)
{
- if (udbg_comport) {
- while ((real_readb(&udbg_comport->lsr) & LSR_THRE) == 0)
- /* wait for idle */;
- }
+ return inb(udbg_uart.pio_base + (reg * udbg_uart_stride));
}
-void udbg_maple_real_putc(char c)
+static void udbg_uart_out_pio(unsigned int reg, u8 data)
{
- if (udbg_comport) {
- if (c == '\n')
- udbg_maple_real_putc('\r');
- udbg_maple_real_flush();
- real_writeb(c, &udbg_comport->thr); eieio();
- }
+ outb(data, udbg_uart.pio_base + (reg * udbg_uart_stride));
}
-void __init udbg_init_maple_realmode(void)
+void udbg_uart_init_pio(unsigned long port, unsigned int stride)
{
- udbg_comport = (struct NS16550 __iomem *)0xf40003f8;
+ if (!port)
+ return;
+ udbg_uart.pio_base = port;
+ udbg_uart_stride = stride;
+ udbg_uart_in = udbg_uart_in_pio;
+ udbg_uart_out = udbg_uart_out_pio;
+ udbg_use_uart();
+}
- udbg_putc = udbg_maple_real_putc;
- udbg_flush = udbg_maple_real_flush;
- udbg_getc = NULL;
- udbg_getc_poll = NULL;
+static u8 udbg_uart_in_mmio(unsigned int reg)
+{
+ return in_8(udbg_uart.mmio_base + (reg * udbg_uart_stride));
}
-#endif /* CONFIG_PPC_MAPLE */
-#ifdef CONFIG_PPC_PASEMI
-void udbg_pas_real_flush(void)
+static void udbg_uart_out_mmio(unsigned int reg, u8 data)
+{
+ out_8(udbg_uart.mmio_base + (reg * udbg_uart_stride), data);
+}
+
+
+void udbg_uart_init_mmio(void __iomem *addr, unsigned int stride)
{
- if (udbg_comport) {
- while ((real_205_readb(&udbg_comport->lsr) & LSR_THRE) == 0)
- /* wait for idle */;
- }
+ if (!addr)
+ return;
+ udbg_uart.mmio_base = addr;
+ udbg_uart_stride = stride;
+ udbg_uart_in = udbg_uart_in_mmio;
+ udbg_uart_out = udbg_uart_out_mmio;
+ udbg_use_uart();
}
-void udbg_pas_real_putc(char c)
+#ifdef CONFIG_PPC_MAPLE
+
+#define UDBG_UART_MAPLE_ADDR ((void __iomem *)0xf40003f8)
+
+static u8 udbg_uart_in_maple(unsigned int reg)
{
- if (udbg_comport) {
- if (c == '\n')
- udbg_pas_real_putc('\r');
- udbg_pas_real_flush();
- real_205_writeb(c, &udbg_comport->thr); eieio();
- }
+ return real_readb(UDBG_UART_MAPLE_ADDR + reg);
}
-void udbg_init_pas_realmode(void)
+static void udbg_uart_out_maple(unsigned int reg, u8 val)
{
- udbg_comport = (struct NS16550 __iomem *)0xfcff03f8UL;
+ real_writeb(val, UDBG_UART_MAPLE_ADDR + reg);
+}
- udbg_putc = udbg_pas_real_putc;
- udbg_flush = udbg_pas_real_flush;
- udbg_getc = NULL;
- udbg_getc_poll = NULL;
+void __init udbg_init_maple_realmode(void)
+{
+ udbg_uart_in = udbg_uart_in_maple;
+ udbg_uart_out = udbg_uart_out_maple;
+ udbg_use_uart();
}
+
#endif /* CONFIG_PPC_MAPLE */
-#ifdef CONFIG_PPC_EARLY_DEBUG_44x
-#include <platforms/44x/44x.h>
+#ifdef CONFIG_PPC_PASEMI
+
+#define UDBG_UART_PAS_ADDR ((void __iomem *)0xfcff03f8UL)
-static void udbg_44x_as1_flush(void)
+static u8 udbg_uart_in_pas(unsigned int reg)
{
- if (udbg_comport) {
- while ((as1_readb(&udbg_comport->lsr) & LSR_THRE) == 0)
- /* wait for idle */;
- }
+ return real_205_readb(UDBG_UART_PAS_ADDR + reg);
}
-static void udbg_44x_as1_putc(char c)
+static void udbg_uart_out_pas(unsigned int reg, u8 val)
{
- if (udbg_comport) {
- if (c == '\n')
- udbg_44x_as1_putc('\r');
- udbg_44x_as1_flush();
- as1_writeb(c, &udbg_comport->thr); eieio();
- }
+ real_205_writeb(val, UDBG_UART_PAS_ADDR + reg);
}
-static int udbg_44x_as1_getc(void)
+void __init udbg_init_pas_realmode(void)
{
- if (udbg_comport) {
- while ((as1_readb(&udbg_comport->lsr) & LSR_DR) == 0)
- ; /* wait for char */
- return as1_readb(&udbg_comport->rbr);
- }
- return -1;
+ udbg_uart_in = udbg_uart_in_pas;
+ udbg_uart_out = udbg_uart_out_pas;
+ udbg_use_uart();
}
-void __init udbg_init_44x_as1(void)
+#endif /* CONFIG_PPC_PASEMI */
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_44x
+
+#include <platforms/44x/44x.h>
+
+static u8 udbg_uart_in_44x_as1(unsigned int reg)
{
- udbg_comport =
- (struct NS16550 __iomem *)PPC44x_EARLY_DEBUG_VIRTADDR;
+ return as1_readb((void __iomem *)PPC44x_EARLY_DEBUG_VIRTADDR + reg);
+}
- udbg_putc = udbg_44x_as1_putc;
- udbg_flush = udbg_44x_as1_flush;
- udbg_getc = udbg_44x_as1_getc;
+static void udbg_uart_out_44x_as1(unsigned int reg, u8 val)
+{
+ as1_writeb(val, (void __iomem *)PPC44x_EARLY_DEBUG_VIRTADDR + reg);
}
-#endif /* CONFIG_PPC_EARLY_DEBUG_44x */
-#ifdef CONFIG_PPC_EARLY_DEBUG_40x
-static void udbg_40x_real_flush(void)
+void __init udbg_init_44x_as1(void)
{
- if (udbg_comport) {
- while ((real_readb(&udbg_comport->lsr) & LSR_THRE) == 0)
- /* wait for idle */;
- }
+ udbg_uart_in = udbg_uart_in_44x_as1;
+ udbg_uart_out = udbg_uart_out_44x_as1;
+ udbg_use_uart();
}
-static void udbg_40x_real_putc(char c)
+#endif /* CONFIG_PPC_EARLY_DEBUG_44x */
+
+#ifdef CONFIG_PPC_EARLY_DEBUG_40x
+
+static u8 udbg_uart_in_40x(unsigned int reg)
{
- if (udbg_comport) {
- if (c == '\n')
- udbg_40x_real_putc('\r');
- udbg_40x_real_flush();
- real_writeb(c, &udbg_comport->thr); eieio();
- }
+ return real_readb((void __iomem *)CONFIG_PPC_EARLY_DEBUG_40x_PHYSADDR
+ + reg);
}
-static int udbg_40x_real_getc(void)
+static void udbg_uart_out_40x(unsigned int reg, u8 val)
{
- if (udbg_comport) {
- while ((real_readb(&udbg_comport->lsr) & LSR_DR) == 0)
- ; /* wait for char */
- return real_readb(&udbg_comport->rbr);
- }
- return -1;
+ real_writeb(val, (void __iomem *)CONFIG_PPC_EARLY_DEBUG_40x_PHYSADDR
+ + reg);
}
void __init udbg_init_40x_realmode(void)
{
- udbg_comport = (struct NS16550 __iomem *)
- CONFIG_PPC_EARLY_DEBUG_40x_PHYSADDR;
-
- udbg_putc = udbg_40x_real_putc;
- udbg_flush = udbg_40x_real_flush;
- udbg_getc = udbg_40x_real_getc;
- udbg_getc_poll = NULL;
+ udbg_uart_in = udbg_uart_in_40x;
+ udbg_uart_out = udbg_uart_out_40x;
+ udbg_use_uart();
}
+
#endif /* CONFIG_PPC_EARLY_DEBUG_40x */
diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c
new file mode 100644
index 00000000000..003b20964ea
--- /dev/null
+++ b/arch/powerpc/kernel/uprobes.c
@@ -0,0 +1,207 @@
+/*
+ * User-space Probes (UProbes) for powerpc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2007-2012
+ *
+ * Adapted from the x86 port by Ananth N Mavinakayanahalli <ananth@in.ibm.com>
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/uprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+
+#include <asm/sstep.h>
+
+#define UPROBE_TRAP_NR UINT_MAX
+
+/**
+ * is_trap_insn - check if the instruction is a trap variant
+ * @insn: instruction to be checked.
+ * Returns true if @insn is a trap variant.
+ */
+bool is_trap_insn(uprobe_opcode_t *insn)
+{
+ return (is_trap(*insn));
+}
+
+/**
+ * arch_uprobe_analyze_insn
+ * @mm: the probed address space.
+ * @arch_uprobe: the probepoint information.
+ * @addr: vaddr to probe.
+ * Return 0 on success or a -ve number on error.
+ */
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe,
+ struct mm_struct *mm, unsigned long addr)
+{
+ if (addr & 0x03)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * arch_uprobe_pre_xol - prepare to execute out of line.
+ * @auprobe: the probepoint information.
+ * @regs: reflects the saved user state of current task.
+ */
+int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct arch_uprobe_task *autask = &current->utask->autask;
+
+ autask->saved_trap_nr = current->thread.trap_nr;
+ current->thread.trap_nr = UPROBE_TRAP_NR;
+ regs->nip = current->utask->xol_vaddr;
+
+ user_enable_single_step(current);
+ return 0;
+}
+
+/**
+ * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
+ * @regs: Reflects the saved state of the task after it has hit a breakpoint
+ * instruction.
+ * Return the address of the breakpoint instruction.
+ */
+unsigned long uprobe_get_swbp_addr(struct pt_regs *regs)
+{
+ return instruction_pointer(regs);
+}
+
+/*
+ * If xol insn itself traps and generates a signal (SIGILL/SIGSEGV/etc),
+ * then detect the case where a singlestepped instruction jumps back to its
+ * own address. It is assumed that anything like do_page_fault/do_trap/etc
+ * sets thread.trap_nr != UINT_MAX.
+ *
+ * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
+ * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
+ * UPROBE_TRAP_NR == UINT_MAX set by arch_uprobe_pre_xol().
+ */
+bool arch_uprobe_xol_was_trapped(struct task_struct *t)
+{
+ if (t->thread.trap_nr != UPROBE_TRAP_NR)
+ return true;
+
+ return false;
+}
+
+/*
+ * Called after single-stepping. To avoid the SMP problems that can
+ * occur when we temporarily put back the original opcode to
+ * single-step, we single-stepped a copy of the instruction.
+ *
+ * This function prepares to resume execution after the single-step.
+ */
+int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
+
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+
+ /*
+ * On powerpc, except for loads and stores, most instructions
+ * including ones that alter code flow (branches, calls, returns)
+ * are emulated in the kernel. We get here only if the emulation
+ * support doesn't exist and have to fix-up the next instruction
+ * to be executed.
+ */
+ regs->nip = utask->vaddr + MAX_UINSN_BYTES;
+
+ user_disable_single_step(current);
+ return 0;
+}
+
+/* callback routine for handling exceptions. */
+int arch_uprobe_exception_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct die_args *args = data;
+ struct pt_regs *regs = args->regs;
+
+ /* regs == NULL is a kernel bug */
+ if (WARN_ON(!regs))
+ return NOTIFY_DONE;
+
+ /* We are only interested in userspace traps */
+ if (!user_mode(regs))
+ return NOTIFY_DONE;
+
+ switch (val) {
+ case DIE_BPT:
+ if (uprobe_pre_sstep_notifier(regs))
+ return NOTIFY_STOP;
+ break;
+ case DIE_SSTEP:
+ if (uprobe_post_sstep_notifier(regs))
+ return NOTIFY_STOP;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+/*
+ * This function gets called when XOL instruction either gets trapped or
+ * the thread has a fatal signal, so reset the instruction pointer to its
+ * probed address.
+ */
+void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+ instruction_pointer_set(regs, utask->vaddr);
+
+ user_disable_single_step(current);
+}
+
+/*
+ * See if the instruction can be emulated.
+ * Returns true if instruction was emulated, false otherwise.
+ */
+bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ int ret;
+
+ /*
+ * emulate_step() returns 1 if the insn was successfully emulated.
+ * For all other cases, we need to single-step in hardware.
+ */
+ ret = emulate_step(regs, auprobe->insn);
+ if (ret > 0)
+ return true;
+
+ return false;
+}
+
+unsigned long
+arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
+{
+ unsigned long orig_ret_vaddr;
+
+ orig_ret_vaddr = regs->link;
+
+ /* Replace the return addr with trampoline addr */
+ regs->link = trampoline_vaddr;
+
+ return orig_ret_vaddr;
+}
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index fd8728729ab..ce74c335a6a 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -9,7 +9,6 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/module.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
@@ -25,7 +24,6 @@
#include <linux/memblock.h>
#include <asm/pgtable.h>
-#include <asm/system.h>
#include <asm/processor.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
@@ -36,8 +34,7 @@
#include <asm/firmware.h>
#include <asm/vdso.h>
#include <asm/vdso_datapage.h>
-
-#include "setup.h"
+#include <asm/setup.h>
#undef DEBUG
@@ -115,6 +112,10 @@ static struct vdso_patch_def vdso_patches[] = {
CPU_FTR_USE_TB, 0,
"__kernel_get_tbfreq", NULL
},
+ {
+ CPU_FTR_USE_TB, 0,
+ "__kernel_time", NULL
+ },
};
/*
@@ -264,17 +265,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
* the "data" page of the vDSO or you'll stop getting kernel updates
* and your nice userland gettimeofday will be totally dead.
* It's fine to use that for setting breakpoints in the vDSO code
- * pages though
- *
- * Make sure the vDSO gets into every core dump.
- * Dumping its contents makes post-mortem fully interpretable later
- * without matching up the same kernel and hardware config to see
- * what PC values meant.
+ * pages though.
*/
rc = install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT,
VM_READ|VM_EXEC|
- VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
- VM_ALWAYSDUMP,
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
vdso_pagelist);
if (rc) {
current->mm->context.vdso_base = 0;
@@ -714,6 +709,32 @@ static void __init vdso_setup_syscall_map(void)
}
}
+#ifdef CONFIG_PPC64
+int vdso_getcpu_init(void)
+{
+ unsigned long cpu, node, val;
+
+ /*
+ * SPRG_VDSO contains the CPU in the bottom 16 bits and the NUMA node
+ * in the next 16 bits. The VDSO uses this to implement getcpu().
+ */
+ cpu = get_cpu();
+ WARN_ON_ONCE(cpu > 0xffff);
+
+ node = cpu_to_node(cpu);
+ WARN_ON_ONCE(node > 0xffff);
+
+ val = (cpu & 0xfff) | ((node & 0xffff) << 16);
+ mtspr(SPRN_SPRG_VDSO_WRITE, val);
+ get_paca()->sprg_vdso = val;
+
+ put_cpu();
+
+ return 0;
+}
+/* We need to call this before SMP init */
+early_initcall(vdso_getcpu_init);
+#endif
static int __init vdso_init(void)
{
@@ -728,10 +749,10 @@ static int __init vdso_init(void)
vdso_data->version.minor = SYSTEMCFG_MINOR;
vdso_data->processor = mfspr(SPRN_PVR);
/*
- * Fake the old platform number for pSeries and iSeries and add
+ * Fake the old platform number for pSeries and add
* in LPAR bit if necessary
*/
- vdso_data->platform = machine_is(iseries) ? 0x200 : 0x100;
+ vdso_data->platform = 0x100;
if (firmware_has_feature(FW_FEATURE_LPAR))
vdso_data->platform |= 1;
vdso_data->physicalMemorySize = memblock_phys_mem_size();
@@ -820,17 +841,17 @@ static int __init vdso_init(void)
}
arch_initcall(vdso_init);
-int in_gate_area_no_task(unsigned long addr)
+int in_gate_area_no_mm(unsigned long addr)
{
return 0;
}
-int in_gate_area(struct task_struct *task, unsigned long addr)
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
return 0;
}
-struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
return NULL;
}
diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
index 9a7946c4173..53e6c9b979e 100644
--- a/arch/powerpc/kernel/vdso32/Makefile
+++ b/arch/powerpc/kernel/vdso32/Makefile
@@ -1,7 +1,9 @@
# List of files in the vdso, has to be asm only for now
-obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o
+obj-vdso32-$(CONFIG_PPC64) = getcpu.o
+obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o \
+ $(obj-vdso32-y)
# Build rules
diff --git a/arch/powerpc/kernel/vdso32/getcpu.S b/arch/powerpc/kernel/vdso32/getcpu.S
new file mode 100644
index 00000000000..23eb9a9441b
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/getcpu.S
@@ -0,0 +1,45 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+
+ .text
+/*
+ * Exact prototype of getcpu
+ *
+ * int __kernel_getcpu(unsigned *cpu, unsigned *node);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_getcpu)
+ .cfi_startproc
+ mfspr r5,SPRN_SPRG_VDSO_READ
+ cmpdi cr0,r3,0
+ cmpdi cr1,r4,0
+ clrlwi r6,r5,16
+ rlwinm r7,r5,16,31-15,31-0
+ beq cr0,1f
+ stw r6,0(r3)
+1: beq cr1,2f
+ stw r7,0(r4)
+2: crclr cr0*4+so
+ li r3,0 /* always success */
+ blr
+ .cfi_endproc
+V_FUNCTION_END(__kernel_getcpu)
diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S
index 4ee09ee2e83..6b2b69616e7 100644
--- a/arch/powerpc/kernel/vdso32/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso32/gettimeofday.S
@@ -181,6 +181,32 @@ V_FUNCTION_END(__kernel_clock_getres)
/*
+ * Exact prototype of time()
+ *
+ * time_t time(time *t);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_time)
+ .cfi_startproc
+ mflr r12
+ .cfi_register lr,r12
+
+ mr r11,r3 /* r11 holds t */
+ bl __get_datapage@local
+ mr r9, r3 /* datapage ptr in r9 */
+
+ lwz r3,STAMP_XTIME+TSPEC_TV_SEC(r9)
+
+ cmplwi r11,0 /* check if t is NULL */
+ beq 2f
+ stw r3,0(r11) /* store result at *t */
+2: mtlr r12
+ crclr cr0*4+so
+ blr
+ .cfi_endproc
+V_FUNCTION_END(__kernel_time)
+
+/*
* This is the core of clock_gettime() and gettimeofday(),
* it returns the current time in r3 (seconds) and r4.
* On entry, r7 gives the resolution of r4, either USEC_PER_SEC
@@ -206,9 +232,15 @@ __do_get_tspec:
lwz r6,(CFG_TB_ORIG_STAMP+4)(r9)
/* Get a stable TB value */
+#ifdef CONFIG_8xx
2: mftbu r3
mftbl r4
mftbu r0
+#else
+2: mfspr r3, SPRN_TBRU
+ mfspr r4, SPRN_TBRL
+ mfspr r0, SPRN_TBRU
+#endif
cmplw cr0,r3,r0
bne- 2b
diff --git a/arch/powerpc/kernel/vdso32/sigtramp.S b/arch/powerpc/kernel/vdso32/sigtramp.S
index 68d49dd71dc..cf0c9c9c24f 100644
--- a/arch/powerpc/kernel/vdso32/sigtramp.S
+++ b/arch/powerpc/kernel/vdso32/sigtramp.S
@@ -19,7 +19,7 @@
/* The nop here is a hack. The dwarf2 unwind routines subtract 1 from
the return address to get an address in the middle of the presumed
- call instruction. Since we don't have a call here, we artifically
+ call instruction. Since we don't have a call here, we artificially
extend the range covered by the unwind info by adding a nop before
the real start. */
nop
diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S
index 0546bcd49cd..e58ee10fa5c 100644
--- a/arch/powerpc/kernel/vdso32/vdso32.lds.S
+++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
@@ -4,7 +4,11 @@
*/
#include <asm/vdso.h>
+#ifdef __LITTLE_ENDIAN__
+OUTPUT_FORMAT("elf32-powerpcle", "elf32-powerpcle", "elf32-powerpcle")
+#else
OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", "elf32-powerpc")
+#endif
OUTPUT_ARCH(powerpc:common)
ENTRY(_start)
@@ -147,6 +151,10 @@ VERSION
__kernel_sync_dicache_p5;
__kernel_sigtramp32;
__kernel_sigtramp_rt32;
+#ifdef CONFIG_PPC64
+ __kernel_getcpu;
+#endif
+ __kernel_time;
local: *;
};
diff --git a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
index 6e8f507ed32..6ac107ac402 100644
--- a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
+++ b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
@@ -1,4 +1,3 @@
-#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/page.h>
@@ -7,7 +6,7 @@
.globl vdso32_start, vdso32_end
.balign PAGE_SIZE
vdso32_start:
- .incbin "arch/powerpc/kernel/vdso32/vdso32.so"
+ .incbin "arch/powerpc/kernel/vdso32/vdso32.so.dbg"
.balign PAGE_SIZE
vdso32_end:
diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile
index 8c500d8622e..effca9404b1 100644
--- a/arch/powerpc/kernel/vdso64/Makefile
+++ b/arch/powerpc/kernel/vdso64/Makefile
@@ -1,6 +1,6 @@
# List of files in the vdso, has to be asm only for now
-obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o
+obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o
# Build rules
diff --git a/arch/powerpc/kernel/vdso64/getcpu.S b/arch/powerpc/kernel/vdso64/getcpu.S
new file mode 100644
index 00000000000..23eb9a9441b
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/getcpu.S
@@ -0,0 +1,45 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/ppc_asm.h>
+#include <asm/vdso.h>
+
+ .text
+/*
+ * Exact prototype of getcpu
+ *
+ * int __kernel_getcpu(unsigned *cpu, unsigned *node);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_getcpu)
+ .cfi_startproc
+ mfspr r5,SPRN_SPRG_VDSO_READ
+ cmpdi cr0,r3,0
+ cmpdi cr1,r4,0
+ clrlwi r6,r5,16
+ rlwinm r7,r5,16,31-15,31-0
+ beq cr0,1f
+ stw r6,0(r3)
+1: beq cr1,2f
+ stw r7,0(r4)
+2: crclr cr0*4+so
+ li r3,0 /* always success */
+ blr
+ .cfi_endproc
+V_FUNCTION_END(__kernel_getcpu)
diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
index e97a9a0dc4a..a76b4af37ef 100644
--- a/arch/powerpc/kernel/vdso64/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
@@ -164,6 +164,32 @@ V_FUNCTION_BEGIN(__kernel_clock_getres)
.cfi_endproc
V_FUNCTION_END(__kernel_clock_getres)
+/*
+ * Exact prototype of time()
+ *
+ * time_t time(time *t);
+ *
+ */
+V_FUNCTION_BEGIN(__kernel_time)
+ .cfi_startproc
+ mflr r12
+ .cfi_register lr,r12
+
+ mr r11,r3 /* r11 holds t */
+ bl V_LOCAL_FUNC(__get_datapage)
+
+ ld r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
+
+ cmpldi r11,0 /* check if t is NULL */
+ beq 2f
+ std r4,0(r11) /* store result at *t */
+2: mtlr r12
+ crclr cr0*4+so
+ mr r3,r4
+ blr
+ .cfi_endproc
+V_FUNCTION_END(__kernel_time)
+
/*
* This is the core of clock_gettime() and gettimeofday(),
diff --git a/arch/powerpc/kernel/vdso64/sigtramp.S b/arch/powerpc/kernel/vdso64/sigtramp.S
index 59eb59bb408..542c6f422e4 100644
--- a/arch/powerpc/kernel/vdso64/sigtramp.S
+++ b/arch/powerpc/kernel/vdso64/sigtramp.S
@@ -20,7 +20,7 @@
/* The nop here is a hack. The dwarf2 unwind routines subtract 1 from
the return address to get an address in the middle of the presumed
- call instruction. Since we don't have a call here, we artifically
+ call instruction. Since we don't have a call here, we artificially
extend the range covered by the unwind info by padding before the
real start. */
nop
@@ -142,6 +142,13 @@ V_FUNCTION_END(__kernel_sigtramp_rt64)
/* Size of CR reg in DWARF unwind info. */
#define CRSIZE 4
+/* Offset of CR reg within a full word. */
+#ifdef __LITTLE_ENDIAN__
+#define CROFF 0
+#else
+#define CROFF (RSIZE - CRSIZE)
+#endif
+
/* This is the offset of the VMX reg pointer. */
#define VREGS 48*RSIZE+33*8
@@ -181,7 +188,14 @@ V_FUNCTION_END(__kernel_sigtramp_rt64)
rsave (31, 31*RSIZE); \
rsave (67, 32*RSIZE); /* ap, used as temp for nip */ \
rsave (65, 36*RSIZE); /* lr */ \
- rsave (70, 38*RSIZE + (RSIZE - CRSIZE)) /* cr */
+ rsave (68, 38*RSIZE + CROFF); /* cr fields */ \
+ rsave (69, 38*RSIZE + CROFF); \
+ rsave (70, 38*RSIZE + CROFF); \
+ rsave (71, 38*RSIZE + CROFF); \
+ rsave (72, 38*RSIZE + CROFF); \
+ rsave (73, 38*RSIZE + CROFF); \
+ rsave (74, 38*RSIZE + CROFF); \
+ rsave (75, 38*RSIZE + CROFF)
/* Describe where the FP regs are saved. */
#define EH_FRAME_FP \
diff --git a/arch/powerpc/kernel/vdso64/vdso64.lds.S b/arch/powerpc/kernel/vdso64/vdso64.lds.S
index 0e615404e24..64fb183a47c 100644
--- a/arch/powerpc/kernel/vdso64/vdso64.lds.S
+++ b/arch/powerpc/kernel/vdso64/vdso64.lds.S
@@ -4,7 +4,11 @@
*/
#include <asm/vdso.h>
+#ifdef __LITTLE_ENDIAN__
+OUTPUT_FORMAT("elf64-powerpcle", "elf64-powerpcle", "elf64-powerpcle")
+#else
OUTPUT_FORMAT("elf64-powerpc", "elf64-powerpc", "elf64-powerpc")
+#endif
OUTPUT_ARCH(powerpc:common64)
ENTRY(_start)
@@ -146,6 +150,8 @@ VERSION
__kernel_sync_dicache;
__kernel_sync_dicache_p5;
__kernel_sigtramp_rt64;
+ __kernel_getcpu;
+ __kernel_time;
local: *;
};
diff --git a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
index b8553d62b79..df60fca6a13 100644
--- a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
+++ b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
@@ -1,4 +1,3 @@
-#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/page.h>
@@ -7,7 +6,7 @@
.globl vdso64_start, vdso64_end
.balign PAGE_SIZE
vdso64_start:
- .incbin "arch/powerpc/kernel/vdso64/vdso64.so"
+ .incbin "arch/powerpc/kernel/vdso64/vdso64.so.dbg"
.balign PAGE_SIZE
vdso64_end:
diff --git a/arch/powerpc/kernel/vecemu.c b/arch/powerpc/kernel/vecemu.c
index 604d0947cb2..c4bfadb2606 100644
--- a/arch/powerpc/kernel/vecemu.c
+++ b/arch/powerpc/kernel/vecemu.c
@@ -271,7 +271,7 @@ int emulate_altivec(struct pt_regs *regs)
vb = (instr >> 11) & 0x1f;
vc = (instr >> 6) & 0x1f;
- vrs = current->thread.vr;
+ vrs = current->thread.vr_state.vr;
switch (instr & 0x3f) {
case 10:
switch (vc) {
@@ -320,12 +320,12 @@ int emulate_altivec(struct pt_regs *regs)
case 14: /* vctuxs */
for (i = 0; i < 4; ++i)
vrs[vd].u[i] = ctuxs(vrs[vb].u[i], va,
- &current->thread.vscr.u[3]);
+ &current->thread.vr_state.vscr.u[3]);
break;
case 15: /* vctsxs */
for (i = 0; i < 4; ++i)
vrs[vd].u[i] = ctsxs(vrs[vb].u[i], va,
- &current->thread.vscr.u[3]);
+ &current->thread.vr_state.vscr.u[3]);
break;
default:
return -EINVAL;
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index fe460482fa6..74f8050518d 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -5,14 +5,78 @@
#include <asm/cputable.h>
#include <asm/thread_info.h>
#include <asm/page.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/* void do_load_up_transact_altivec(struct thread_struct *thread)
+ *
+ * This is similar to load_up_altivec but for the transactional version of the
+ * vector regs. It doesn't mess with the task MSR or valid flags.
+ * Furthermore, VEC laziness is not supported with TM currently.
+ */
+_GLOBAL(do_load_up_transact_altivec)
+ mfmsr r6
+ oris r5,r6,MSR_VEC@h
+ MTMSRD(r5)
+ isync
+
+ li r4,1
+ stw r4,THREAD_USED_VR(r3)
+
+ li r10,THREAD_TRANSACT_VRSTATE+VRSTATE_VSCR
+ lvx vr0,r10,r3
+ mtvscr vr0
+ addi r10,r3,THREAD_TRANSACT_VRSTATE
+ REST_32VRS(0,r4,r10)
+
+ /* Disable VEC again. */
+ MTMSRD(r6)
+ isync
+
+ blr
+#endif
+
+/*
+ * Enable use of VMX/Altivec for the caller.
+ */
+_GLOBAL(vec_enable)
+ mfmsr r3
+ oris r3,r3,MSR_VEC@h
+ MTMSRD(r3)
+ isync
+ blr
+
+/*
+ * Load state from memory into VMX registers including VSCR.
+ * Assumes the caller has enabled VMX in the MSR.
+ */
+_GLOBAL(load_vr_state)
+ li r4,VRSTATE_VSCR
+ lvx vr0,r4,r3
+ mtvscr vr0
+ REST_32VRS(0,r4,r3)
+ blr
+
+/*
+ * Store VMX state into memory, including VSCR.
+ * Assumes the caller has enabled VMX in the MSR.
+ */
+_GLOBAL(store_vr_state)
+ SAVE_32VRS(0, r4, r3)
+ mfvscr vr0
+ li r4, VRSTATE_VSCR
+ stvx vr0, r4, r3
+ blr
/*
- * load_up_altivec(unused, unused, tsk)
* Disable VMX for the task which had it previously,
* and save its vector registers in its thread_struct.
* Enables the VMX for use in the kernel on return.
* On SMP we know the VMX is free, since we give it up every
* switch (ie, no lazy save of the vector registers).
+ *
+ * Note that on 32-bit this can only use registers that will be
+ * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
*/
_GLOBAL(load_up_altivec)
mfmsr r5 /* grab the current MSR */
@@ -38,10 +102,11 @@ _GLOBAL(load_up_altivec)
/* Save VMX state to last_task_used_altivec's THREAD struct */
toreal(r4)
addi r4,r4,THREAD
- SAVE_32VRS(0,r5,r4)
+ addi r6,r4,THREAD_VRSTATE
+ SAVE_32VRS(0,r5,r6)
mfvscr vr0
- li r10,THREAD_VSCR
- stvx vr0,r10,r4
+ li r10,VRSTATE_VSCR
+ stvx vr0,r10,r6
/* Disable VMX for last_task_used_altivec */
PPC_LL r5,PT_REGS(r4)
toreal(r5)
@@ -73,12 +138,13 @@ _GLOBAL(load_up_altivec)
oris r12,r12,MSR_VEC@h
std r12,_MSR(r1)
#endif
+ addi r6,r5,THREAD_VRSTATE
li r4,1
- li r10,THREAD_VSCR
+ li r10,VRSTATE_VSCR
stw r4,THREAD_USED_VR(r5)
- lvx vr0,r10,r5
+ lvx vr0,r10,r6
mtvscr vr0
- REST_32VRS(0,r4,r5)
+ REST_32VRS(0,r4,r6)
#ifndef CONFIG_SMP
/* Update last_task_used_altivec to 'current' */
subi r4,r5,THREAD /* Back to 'current' */
@@ -88,6 +154,16 @@ _GLOBAL(load_up_altivec)
/* restore registers and return */
blr
+_GLOBAL(giveup_altivec_notask)
+ mfmsr r3
+ andis. r4,r3,MSR_VEC@h
+ bnelr /* Already enabled? */
+ oris r3,r3,MSR_VEC@h
+ SYNC
+ MTMSRD(r3) /* enable use of VMX now */
+ isync
+ blr
+
/*
* giveup_altivec(tsk)
* Disable VMX for the task given as the argument,
@@ -101,14 +177,18 @@ _GLOBAL(giveup_altivec)
MTMSRD(r5) /* enable use of VMX now */
isync
PPC_LCMPI 0,r3,0
- beqlr- /* if no previous owner, done */
+ beqlr /* if no previous owner, done */
addi r3,r3,THREAD /* want THREAD of task */
+ PPC_LL r7,THREAD_VRSAVEAREA(r3)
PPC_LL r5,PT_REGS(r3)
- PPC_LCMPI 0,r5,0
- SAVE_32VRS(0,r4,r3)
+ PPC_LCMPI 0,r7,0
+ bne 2f
+ addi r7,r3,THREAD_VRSTATE
+2: PPC_LCMPI 0,r5,0
+ SAVE_32VRS(0,r4,r7)
mfvscr vr0
- li r4,THREAD_VSCR
- stvx vr0,r4,r3
+ li r4,VRSTATE_VSCR
+ stvx vr0,r4,r7
beq 1f
PPC_LL r4,_MSR-STACK_FRAME_OVERHEAD(r5)
#ifdef CONFIG_VSX
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 441d2a722f0..904c66128fa 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -14,12 +14,15 @@
* 2 of the License, or (at your option) any later version.
*/
+#include <linux/cpu.h>
#include <linux/types.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
#include <linux/device.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/console.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/kobject.h>
@@ -30,16 +33,8 @@
#include <asm/prom.h>
#include <asm/firmware.h>
#include <asm/tce.h>
-#include <asm/abs_addr.h>
#include <asm/page.h>
#include <asm/hvcall.h>
-#include <asm/iseries/vio.h>
-#include <asm/iseries/hv_types.h>
-#include <asm/iseries/hv_lp_config.h>
-#include <asm/iseries/hv_call_xm.h>
-#include <asm/iseries/iommu.h>
-
-static struct bus_type vio_bus_type;
static struct vio_dev vio_bus_device = { /* fake "parent" device */
.name = "vio",
@@ -486,7 +481,8 @@ static void vio_cmo_balance(struct work_struct *work)
}
static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag)
+ dma_addr_t *dma_handle, gfp_t flag,
+ struct dma_attrs *attrs)
{
struct vio_dev *viodev = to_vio_dev(dev);
void *ret;
@@ -496,7 +492,7 @@ static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size,
return NULL;
}
- ret = dma_iommu_ops.alloc_coherent(dev, size, dma_handle, flag);
+ ret = dma_iommu_ops.alloc(dev, size, dma_handle, flag, attrs);
if (unlikely(ret == NULL)) {
vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
atomic_inc(&viodev->cmo.allocs_failed);
@@ -506,11 +502,12 @@ static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size,
}
static void vio_dma_iommu_free_coherent(struct device *dev, size_t size,
- void *vaddr, dma_addr_t dma_handle)
+ void *vaddr, dma_addr_t dma_handle,
+ struct dma_attrs *attrs)
{
struct vio_dev *viodev = to_vio_dev(dev);
- dma_iommu_ops.free_coherent(dev, size, vaddr, dma_handle);
+ dma_iommu_ops.free(dev, size, vaddr, dma_handle, attrs);
vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
}
@@ -521,16 +518,18 @@ static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page,
struct dma_attrs *attrs)
{
struct vio_dev *viodev = to_vio_dev(dev);
+ struct iommu_table *tbl;
dma_addr_t ret = DMA_ERROR_CODE;
- if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
+ tbl = get_iommu_table_base(dev);
+ if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) {
atomic_inc(&viodev->cmo.allocs_failed);
return ret;
}
ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs);
if (unlikely(dma_mapping_error(dev, ret))) {
- vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+ vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
atomic_inc(&viodev->cmo.allocs_failed);
}
@@ -543,10 +542,12 @@ static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
struct dma_attrs *attrs)
{
struct vio_dev *viodev = to_vio_dev(dev);
+ struct iommu_table *tbl;
+ tbl = get_iommu_table_base(dev);
dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs);
- vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+ vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
}
static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -554,12 +555,14 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
struct dma_attrs *attrs)
{
struct vio_dev *viodev = to_vio_dev(dev);
+ struct iommu_table *tbl;
struct scatterlist *sgl;
int ret, count = 0;
size_t alloc_size = 0;
+ tbl = get_iommu_table_base(dev);
for (sgl = sglist; count < nelems; count++, sgl++)
- alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE);
+ alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
if (vio_cmo_alloc(viodev, alloc_size)) {
atomic_inc(&viodev->cmo.allocs_failed);
@@ -575,7 +578,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
}
for (sgl = sglist, count = 0; count < ret; count++, sgl++)
- alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+ alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
if (alloc_size)
vio_cmo_dealloc(viodev, alloc_size);
@@ -588,33 +591,47 @@ static void vio_dma_iommu_unmap_sg(struct device *dev,
struct dma_attrs *attrs)
{
struct vio_dev *viodev = to_vio_dev(dev);
+ struct iommu_table *tbl;
struct scatterlist *sgl;
size_t alloc_size = 0;
int count = 0;
+ tbl = get_iommu_table_base(dev);
for (sgl = sglist; count < nelems; count++, sgl++)
- alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+ alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
vio_cmo_dealloc(viodev, alloc_size);
}
-struct dma_map_ops vio_dma_mapping_ops = {
- .alloc_coherent = vio_dma_iommu_alloc_coherent,
- .free_coherent = vio_dma_iommu_free_coherent,
- .map_sg = vio_dma_iommu_map_sg,
- .unmap_sg = vio_dma_iommu_unmap_sg,
- .map_page = vio_dma_iommu_map_page,
- .unmap_page = vio_dma_iommu_unmap_page,
+static int vio_dma_iommu_dma_supported(struct device *dev, u64 mask)
+{
+ return dma_iommu_ops.dma_supported(dev, mask);
+}
+static u64 vio_dma_get_required_mask(struct device *dev)
+{
+ return dma_iommu_ops.get_required_mask(dev);
+}
+
+struct dma_map_ops vio_dma_mapping_ops = {
+ .alloc = vio_dma_iommu_alloc_coherent,
+ .free = vio_dma_iommu_free_coherent,
+ .mmap = dma_direct_mmap_coherent,
+ .map_sg = vio_dma_iommu_map_sg,
+ .unmap_sg = vio_dma_iommu_unmap_sg,
+ .map_page = vio_dma_iommu_map_page,
+ .unmap_page = vio_dma_iommu_unmap_page,
+ .dma_supported = vio_dma_iommu_dma_supported,
+ .get_required_mask = vio_dma_get_required_mask,
};
/**
* vio_cmo_set_dev_desired - Set desired entitlement for a device
*
* @viodev: struct vio_dev for device to alter
- * @new_desired: new desired entitlement level in bytes
+ * @desired: new desired entitlement level in bytes
*
* For use by devices to request a change to their entitlement at runtime or
* through sysfs. The desired entitlement level is changed and a balancing
@@ -697,16 +714,32 @@ static int vio_cmo_bus_probe(struct vio_dev *viodev)
{
struct vio_cmo_dev_entry *dev_ent;
struct device *dev = &viodev->dev;
+ struct iommu_table *tbl;
struct vio_driver *viodrv = to_vio_driver(dev->driver);
unsigned long flags;
size_t size;
+ bool dma_capable = false;
+
+ tbl = get_iommu_table_base(dev);
+
+ /* A device requires entitlement if it has a DMA window property */
+ switch (viodev->family) {
+ case VDEVICE:
+ if (of_get_property(viodev->dev.of_node,
+ "ibm,my-dma-window", NULL))
+ dma_capable = true;
+ break;
+ case PFO:
+ dma_capable = false;
+ break;
+ default:
+ dev_warn(dev, "unknown device family: %d\n", viodev->family);
+ BUG();
+ break;
+ }
- /*
- * Check to see that device has a DMA window and configure
- * entitlement for the device.
- */
- if (of_get_property(viodev->dev.of_node,
- "ibm,my-dma-window", NULL)) {
+ /* Configure entitlement for the device. */
+ if (dma_capable) {
/* Check that the driver is CMO enabled and get desired DMA */
if (!viodrv->get_desired_dma) {
dev_err(dev, "%s: device driver does not support CMO\n",
@@ -714,7 +747,8 @@ static int vio_cmo_bus_probe(struct vio_dev *viodev)
return -EINVAL;
}
- viodev->cmo.desired = IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev));
+ viodev->cmo.desired =
+ IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev), tbl);
if (viodev->cmo.desired < VIO_CMO_MIN_ENT)
viodev->cmo.desired = VIO_CMO_MIN_ENT;
size = VIO_CMO_MIN_ENT;
@@ -858,8 +892,7 @@ static void vio_cmo_bus_remove(struct vio_dev *viodev)
static void vio_cmo_set_dma_ops(struct vio_dev *viodev)
{
- vio_dma_mapping_ops.dma_supported = dma_iommu_ops.dma_supported;
- viodev->dev.archdata.dma_ops = &vio_dma_mapping_ops;
+ set_dma_ops(&viodev->dev, &vio_dma_mapping_ops);
}
/**
@@ -976,21 +1009,36 @@ static struct device_attribute vio_cmo_dev_attrs[] = {
/* sysfs bus functions and data structures for CMO */
#define viobus_cmo_rd_attr(name) \
-static ssize_t \
-viobus_cmo_##name##_show(struct bus_type *bt, char *buf) \
+static ssize_t cmo_##name##_show(struct bus_type *bt, char *buf) \
{ \
return sprintf(buf, "%lu\n", vio_cmo.name); \
-}
+} \
+static BUS_ATTR_RO(cmo_##name)
#define viobus_cmo_pool_rd_attr(name, var) \
static ssize_t \
-viobus_cmo_##name##_pool_show_##var(struct bus_type *bt, char *buf) \
+cmo_##name##_##var##_show(struct bus_type *bt, char *buf) \
{ \
return sprintf(buf, "%lu\n", vio_cmo.name.var); \
+} \
+static BUS_ATTR_RO(cmo_##name##_##var)
+
+viobus_cmo_rd_attr(entitled);
+viobus_cmo_rd_attr(spare);
+viobus_cmo_rd_attr(min);
+viobus_cmo_rd_attr(desired);
+viobus_cmo_rd_attr(curr);
+viobus_cmo_pool_rd_attr(reserve, size);
+viobus_cmo_pool_rd_attr(excess, size);
+viobus_cmo_pool_rd_attr(excess, free);
+
+static ssize_t cmo_high_show(struct bus_type *bt, char *buf)
+{
+ return sprintf(buf, "%lu\n", vio_cmo.high);
}
-static ssize_t viobus_cmo_high_reset(struct bus_type *bt, const char *buf,
- size_t count)
+static ssize_t cmo_high_store(struct bus_type *bt, const char *buf,
+ size_t count)
{
unsigned long flags;
@@ -1000,38 +1048,28 @@ static ssize_t viobus_cmo_high_reset(struct bus_type *bt, const char *buf,
return count;
}
-
-viobus_cmo_rd_attr(entitled);
-viobus_cmo_pool_rd_attr(reserve, size);
-viobus_cmo_pool_rd_attr(excess, size);
-viobus_cmo_pool_rd_attr(excess, free);
-viobus_cmo_rd_attr(spare);
-viobus_cmo_rd_attr(min);
-viobus_cmo_rd_attr(desired);
-viobus_cmo_rd_attr(curr);
-viobus_cmo_rd_attr(high);
-
-static struct bus_attribute vio_cmo_bus_attrs[] = {
- __ATTR(cmo_entitled, S_IRUGO, viobus_cmo_entitled_show, NULL),
- __ATTR(cmo_reserve_size, S_IRUGO, viobus_cmo_reserve_pool_show_size, NULL),
- __ATTR(cmo_excess_size, S_IRUGO, viobus_cmo_excess_pool_show_size, NULL),
- __ATTR(cmo_excess_free, S_IRUGO, viobus_cmo_excess_pool_show_free, NULL),
- __ATTR(cmo_spare, S_IRUGO, viobus_cmo_spare_show, NULL),
- __ATTR(cmo_min, S_IRUGO, viobus_cmo_min_show, NULL),
- __ATTR(cmo_desired, S_IRUGO, viobus_cmo_desired_show, NULL),
- __ATTR(cmo_curr, S_IRUGO, viobus_cmo_curr_show, NULL),
- __ATTR(cmo_high, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
- viobus_cmo_high_show, viobus_cmo_high_reset),
- __ATTR_NULL
+static BUS_ATTR_RW(cmo_high);
+
+static struct attribute *vio_bus_attrs[] = {
+ &bus_attr_cmo_entitled.attr,
+ &bus_attr_cmo_spare.attr,
+ &bus_attr_cmo_min.attr,
+ &bus_attr_cmo_desired.attr,
+ &bus_attr_cmo_curr.attr,
+ &bus_attr_cmo_high.attr,
+ &bus_attr_cmo_reserve_size.attr,
+ &bus_attr_cmo_excess_size.attr,
+ &bus_attr_cmo_excess_free.attr,
+ NULL,
};
+ATTRIBUTE_GROUPS(vio_bus);
static void vio_cmo_sysfs_init(void)
{
vio_bus_type.dev_attrs = vio_cmo_dev_attrs;
- vio_bus_type.bus_attrs = vio_cmo_bus_attrs;
+ vio_bus_type.bus_groups = vio_bus_groups;
}
#else /* CONFIG_PPC_SMLPAR */
-/* Dummy functions for iSeries platform */
int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; }
void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {}
static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; }
@@ -1043,15 +1081,100 @@ static void vio_cmo_sysfs_init(void) { }
EXPORT_SYMBOL(vio_cmo_entitlement_update);
EXPORT_SYMBOL(vio_cmo_set_dev_desired);
+
+/*
+ * Platform Facilities Option (PFO) support
+ */
+
+/**
+ * vio_h_cop_sync - Perform a synchronous PFO co-processor operation
+ *
+ * @vdev - Pointer to a struct vio_dev for device
+ * @op - Pointer to a struct vio_pfo_op for the operation parameters
+ *
+ * Calls the hypervisor to synchronously perform the PFO operation
+ * described in @op. In the case of a busy response from the hypervisor,
+ * the operation will be re-submitted indefinitely unless a non-zero timeout
+ * is specified or an error occurs. The timeout places a limit on when to
+ * stop re-submitting a operation, the total time can be exceeded if an
+ * operation is in progress.
+ *
+ * If op->hcall_ret is not NULL, this will be set to the return from the
+ * last h_cop_op call or it will be 0 if an error not involving the h_call
+ * was encountered.
+ *
+ * Returns:
+ * 0 on success,
+ * -EINVAL if the h_call fails due to an invalid parameter,
+ * -E2BIG if the h_call can not be performed synchronously,
+ * -EBUSY if a timeout is specified and has elapsed,
+ * -EACCES if the memory area for data/status has been rescinded, or
+ * -EPERM if a hardware fault has been indicated
+ */
+int vio_h_cop_sync(struct vio_dev *vdev, struct vio_pfo_op *op)
+{
+ struct device *dev = &vdev->dev;
+ unsigned long deadline = 0;
+ long hret = 0;
+ int ret = 0;
+
+ if (op->timeout)
+ deadline = jiffies + msecs_to_jiffies(op->timeout);
+
+ while (true) {
+ hret = plpar_hcall_norets(H_COP, op->flags,
+ vdev->resource_id,
+ op->in, op->inlen, op->out,
+ op->outlen, op->csbcpb);
+
+ if (hret == H_SUCCESS ||
+ (hret != H_NOT_ENOUGH_RESOURCES &&
+ hret != H_BUSY && hret != H_RESOURCE) ||
+ (op->timeout && time_after(deadline, jiffies)))
+ break;
+
+ dev_dbg(dev, "%s: hcall ret(%ld), retrying.\n", __func__, hret);
+ }
+
+ switch (hret) {
+ case H_SUCCESS:
+ ret = 0;
+ break;
+ case H_OP_MODE:
+ case H_TOO_BIG:
+ ret = -E2BIG;
+ break;
+ case H_RESCINDED:
+ ret = -EACCES;
+ break;
+ case H_HARDWARE:
+ ret = -EPERM;
+ break;
+ case H_NOT_ENOUGH_RESOURCES:
+ case H_RESOURCE:
+ case H_BUSY:
+ ret = -EBUSY;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if (ret)
+ dev_dbg(dev, "%s: Sync h_cop_op failure (ret:%d) (hret:%ld)\n",
+ __func__, ret, hret);
+
+ op->hcall_err = hret;
+ return ret;
+}
+EXPORT_SYMBOL(vio_h_cop_sync);
+
static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
{
- const unsigned char *dma_window;
+ const __be32 *dma_window;
struct iommu_table *tbl;
unsigned long offset, size;
- if (firmware_has_feature(FW_FEATURE_ISERIES))
- return vio_build_iommu_table_iseries(dev);
-
dma_window = of_get_property(dev->dev.of_node,
"ibm,my-dma-window", NULL);
if (!dma_window)
@@ -1065,9 +1188,10 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
&tbl->it_index, &offset, &size);
/* TCE table size - measured in tce entries */
- tbl->it_size = size >> IOMMU_PAGE_SHIFT;
+ tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
+ tbl->it_size = size >> tbl->it_page_shift;
/* offset for VIO should always be 0 */
- tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
+ tbl->it_offset = offset >> tbl->it_page_shift;
tbl->it_busno = 0;
tbl->it_type = TCE_VB;
tbl->it_blocksize = 16;
@@ -1155,23 +1279,27 @@ static int vio_bus_remove(struct device *dev)
/**
* vio_register_driver: - Register a new vio driver
- * @drv: The vio_driver structure to be registered.
+ * @viodrv: The vio_driver structure to be registered.
*/
-int vio_register_driver(struct vio_driver *viodrv)
+int __vio_register_driver(struct vio_driver *viodrv, struct module *owner,
+ const char *mod_name)
{
- printk(KERN_DEBUG "%s: driver %s registering\n", __func__,
- viodrv->driver.name);
+ pr_debug("%s: driver %s registering\n", __func__, viodrv->name);
/* fill in 'struct driver' fields */
+ viodrv->driver.name = viodrv->name;
+ viodrv->driver.pm = viodrv->pm;
viodrv->driver.bus = &vio_bus_type;
+ viodrv->driver.owner = owner;
+ viodrv->driver.mod_name = mod_name;
return driver_register(&viodrv->driver);
}
-EXPORT_SYMBOL(vio_register_driver);
+EXPORT_SYMBOL(__vio_register_driver);
/**
* vio_unregister_driver - Remove registration of vio driver.
- * @driver: The vio_driver struct to be removed form registration
+ * @viodrv: The vio_driver struct to be removed form registration
*/
void vio_unregister_driver(struct vio_driver *viodrv)
{
@@ -1180,14 +1308,12 @@ void vio_unregister_driver(struct vio_driver *viodrv)
EXPORT_SYMBOL(vio_unregister_driver);
/* vio_dev refcount hit 0 */
-static void __devinit vio_dev_release(struct device *dev)
+static void vio_dev_release(struct device *dev)
{
struct iommu_table *tbl = get_iommu_table_base(dev);
- /* iSeries uses a common table for all vio devices */
- if (!firmware_has_feature(FW_FEATURE_ISERIES) && tbl)
- iommu_free_table(tbl, dev->of_node ?
- dev->of_node->full_name : dev_name(dev));
+ if (tbl)
+ iommu_free_table(tbl, of_node_full_name(dev->of_node));
of_node_put(dev->of_node);
kfree(to_vio_dev(dev));
}
@@ -1204,48 +1330,90 @@ static void __devinit vio_dev_release(struct device *dev)
struct vio_dev *vio_register_device_node(struct device_node *of_node)
{
struct vio_dev *viodev;
- const unsigned int *unit_address;
+ struct device_node *parent_node;
+ const __be32 *prop;
+ enum vio_dev_family family;
+ const char *of_node_name = of_node->name ? of_node->name : "<unknown>";
- /* we need the 'device_type' property, in order to match with drivers */
- if (of_node->type == NULL) {
- printk(KERN_WARNING "%s: node %s missing 'device_type'\n",
- __func__,
- of_node->name ? of_node->name : "<unknown>");
+ /*
+ * Determine if this node is a under the /vdevice node or under the
+ * /ibm,platform-facilities node. This decides the device's family.
+ */
+ parent_node = of_get_parent(of_node);
+ if (parent_node) {
+ if (!strcmp(parent_node->full_name, "/ibm,platform-facilities"))
+ family = PFO;
+ else if (!strcmp(parent_node->full_name, "/vdevice"))
+ family = VDEVICE;
+ else {
+ pr_warn("%s: parent(%s) of %s not recognized.\n",
+ __func__,
+ parent_node->full_name,
+ of_node_name);
+ of_node_put(parent_node);
+ return NULL;
+ }
+ of_node_put(parent_node);
+ } else {
+ pr_warn("%s: could not determine the parent of node %s.\n",
+ __func__, of_node_name);
return NULL;
}
- unit_address = of_get_property(of_node, "reg", NULL);
- if (unit_address == NULL) {
- printk(KERN_WARNING "%s: node %s missing 'reg'\n",
- __func__,
- of_node->name ? of_node->name : "<unknown>");
- return NULL;
+ if (family == PFO) {
+ if (of_get_property(of_node, "interrupt-controller", NULL)) {
+ pr_debug("%s: Skipping the interrupt controller %s.\n",
+ __func__, of_node_name);
+ return NULL;
+ }
}
/* allocate a vio_dev for this node */
viodev = kzalloc(sizeof(struct vio_dev), GFP_KERNEL);
- if (viodev == NULL)
+ if (viodev == NULL) {
+ pr_warn("%s: allocation failure for VIO device.\n", __func__);
return NULL;
+ }
- viodev->irq = irq_of_parse_and_map(of_node, 0);
+ /* we need the 'device_type' property, in order to match with drivers */
+ viodev->family = family;
+ if (viodev->family == VDEVICE) {
+ unsigned int unit_address;
+
+ if (of_node->type != NULL)
+ viodev->type = of_node->type;
+ else {
+ pr_warn("%s: node %s is missing the 'device_type' "
+ "property.\n", __func__, of_node_name);
+ goto out;
+ }
- dev_set_name(&viodev->dev, "%x", *unit_address);
- viodev->name = of_node->name;
- viodev->type = of_node->type;
- viodev->unit_address = *unit_address;
- if (firmware_has_feature(FW_FEATURE_ISERIES)) {
- unit_address = of_get_property(of_node,
- "linux,unit_address", NULL);
- if (unit_address != NULL)
- viodev->unit_address = *unit_address;
+ prop = of_get_property(of_node, "reg", NULL);
+ if (prop == NULL) {
+ pr_warn("%s: node %s missing 'reg'\n",
+ __func__, of_node_name);
+ goto out;
+ }
+ unit_address = of_read_number(prop, 1);
+ dev_set_name(&viodev->dev, "%x", unit_address);
+ viodev->irq = irq_of_parse_and_map(of_node, 0);
+ viodev->unit_address = unit_address;
+ } else {
+ /* PFO devices need their resource_id for submitting COP_OPs
+ * This is an optional field for devices, but is required when
+ * performing synchronous ops */
+ prop = of_get_property(of_node, "ibm,resource-id", NULL);
+ if (prop != NULL)
+ viodev->resource_id = of_read_number(prop, 1);
+
+ dev_set_name(&viodev->dev, "%s", of_node_name);
+ viodev->type = of_node_name;
+ viodev->irq = 0;
}
+
+ viodev->name = of_node->name;
viodev->dev.of_node = of_node_get(of_node);
- if (firmware_has_feature(FW_FEATURE_CMO))
- vio_cmo_set_dma_ops(viodev);
- else
- viodev->dev.archdata.dma_ops = &dma_iommu_ops;
- set_iommu_table_base(&viodev->dev, vio_build_iommu_table(viodev));
set_dev_node(&viodev->dev, of_node_to_nid(of_node));
/* init generic 'struct device' fields: */
@@ -1253,6 +1421,21 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node)
viodev->dev.bus = &vio_bus_type;
viodev->dev.release = vio_dev_release;
+ if (of_get_property(viodev->dev.of_node, "ibm,my-dma-window", NULL)) {
+ if (firmware_has_feature(FW_FEATURE_CMO))
+ vio_cmo_set_dma_ops(viodev);
+ else
+ set_dma_ops(&viodev->dev, &dma_iommu_ops);
+
+ set_iommu_table_base(&viodev->dev,
+ vio_build_iommu_table(viodev));
+
+ /* needed to ensure proper operation of coherent allocations
+ * later, in case driver doesn't set it explicitly */
+ viodev->dev.coherent_dma_mask = DMA_BIT_MASK(64);
+ viodev->dev.dma_mask = &viodev->dev.coherent_dma_mask;
+ }
+
/* register with generic device framework */
if (device_register(&viodev->dev)) {
printk(KERN_ERR "%s: failed to register device %s\n",
@@ -1262,16 +1445,51 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node)
}
return viodev;
+
+out: /* Use this exit point for any return prior to device_register */
+ kfree(viodev);
+
+ return NULL;
}
EXPORT_SYMBOL(vio_register_device_node);
+/*
+ * vio_bus_scan_for_devices - Scan OF and register each child device
+ * @root_name - OF node name for the root of the subtree to search.
+ * This must be non-NULL
+ *
+ * Starting from the root node provide, register the device node for
+ * each child beneath the root.
+ */
+static void vio_bus_scan_register_devices(char *root_name)
+{
+ struct device_node *node_root, *node_child;
+
+ if (!root_name)
+ return;
+
+ node_root = of_find_node_by_name(NULL, root_name);
+ if (node_root) {
+
+ /*
+ * Create struct vio_devices for each virtual device in
+ * the device tree. Drivers will associate with them later.
+ */
+ node_child = of_get_next_child(node_root, NULL);
+ while (node_child) {
+ vio_register_device_node(node_child);
+ node_child = of_get_next_child(node_root, node_child);
+ }
+ of_node_put(node_root);
+ }
+}
+
/**
* vio_bus_init: - Initialize the virtual IO bus
*/
static int __init vio_bus_init(void)
{
int err;
- struct device_node *node_vroot;
if (firmware_has_feature(FW_FEATURE_CMO))
vio_cmo_sysfs_init();
@@ -1296,23 +1514,18 @@ static int __init vio_bus_init(void)
if (firmware_has_feature(FW_FEATURE_CMO))
vio_cmo_bus_init();
- node_vroot = of_find_node_by_name(NULL, "vdevice");
- if (node_vroot) {
- struct device_node *of_node;
+ return 0;
+}
+postcore_initcall(vio_bus_init);
- /*
- * Create struct vio_devices for each virtual device in
- * the device tree. Drivers will associate with them later.
- */
- for (of_node = node_vroot->child; of_node != NULL;
- of_node = of_node->sibling)
- vio_register_device_node(of_node);
- of_node_put(node_vroot);
- }
+static int __init vio_device_init(void)
+{
+ vio_bus_scan_register_devices("vdevice");
+ vio_bus_scan_register_devices("ibm,platform-facilities");
return 0;
}
-__initcall(vio_bus_init);
+device_initcall(vio_device_init);
static ssize_t name_show(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -1325,7 +1538,7 @@ static ssize_t devspec_show(struct device *dev,
{
struct device_node *of_node = dev->of_node;
- return sprintf(buf, "%s\n", of_node ? of_node->full_name : "none");
+ return sprintf(buf, "%s\n", of_node_full_name(of_node));
}
static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
@@ -1336,11 +1549,15 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
const char *cp;
dn = dev->of_node;
- if (!dn)
- return -ENODEV;
+ if (!dn) {
+ strcpy(buf, "\n");
+ return strlen(buf);
+ }
cp = of_get_property(dn, "compatible", NULL);
- if (!cp)
- return -ENODEV;
+ if (!cp) {
+ strcpy(buf, "\n");
+ return strlen(buf);
+ }
return sprintf(buf, "vio:T%sS%s\n", vio_dev->type, cp);
}
@@ -1352,7 +1569,7 @@ static struct device_attribute vio_dev_attrs[] = {
__ATTR_NULL
};
-void __devinit vio_unregister_device(struct vio_dev *viodev)
+void vio_unregister_device(struct vio_dev *viodev)
{
device_unregister(&viodev->dev);
}
@@ -1384,14 +1601,13 @@ static int vio_hotplug(struct device *dev, struct kobj_uevent_env *env)
return 0;
}
-static struct bus_type vio_bus_type = {
+struct bus_type vio_bus_type = {
.name = "vio",
.dev_attrs = vio_dev_attrs,
.uevent = vio_hotplug,
.match = vio_bus_match,
.probe = vio_bus_probe,
.remove = vio_bus_remove,
- .pm = GENERIC_SUBSYS_PM_OPS,
};
/**
@@ -1430,14 +1646,32 @@ static struct vio_dev *vio_find_name(const char *name)
*/
struct vio_dev *vio_find_node(struct device_node *vnode)
{
- const uint32_t *unit_address;
char kobj_name[20];
+ struct device_node *vnode_parent;
+ const char *dev_type;
+
+ vnode_parent = of_get_parent(vnode);
+ if (!vnode_parent)
+ return NULL;
+
+ dev_type = of_get_property(vnode_parent, "device_type", NULL);
+ of_node_put(vnode_parent);
+ if (!dev_type)
+ return NULL;
/* construct the kobject name from the device node */
- unit_address = of_get_property(vnode, "reg", NULL);
- if (!unit_address)
+ if (!strcmp(dev_type, "vdevice")) {
+ const __be32 *prop;
+
+ prop = of_get_property(vnode, "reg", NULL);
+ if (!prop)
+ return NULL;
+ snprintf(kobj_name, sizeof(kobj_name), "%x",
+ (uint32_t)of_read_number(prop, 1));
+ } else if (!strcmp(dev_type, "ibm,platform-facilities"))
+ snprintf(kobj_name, sizeof(kobj_name), "%s", vnode->name);
+ else
return NULL;
- snprintf(kobj_name, sizeof(kobj_name), "%x", *unit_address);
return vio_find_name(kobj_name);
}
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 8a0deefac08..f096e72262f 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -38,9 +38,6 @@ jiffies = jiffies_64 + 4;
#endif
SECTIONS
{
- . = 0;
- reloc_start = .;
-
. = KERNELBASE;
/*
@@ -109,11 +106,6 @@ SECTIONS
__ptov_table_begin = .;
*(.ptov_fixup);
__ptov_table_end = .;
-#ifdef CONFIG_PPC_ISERIES
- __dt_strings_start = .;
- *(.dt_strings);
- __dt_strings_end = .;
-#endif
}
.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
@@ -160,7 +152,7 @@ SECTIONS
INIT_RAM_FS
}
- PERCPU(PAGE_SIZE)
+ PERCPU_SECTION(L1_CACHE_BYTES)
. = ALIGN(8);
.machine.desc : AT(ADDR(.machine.desc) - LOAD_OFFSET) {
@@ -170,7 +162,13 @@ SECTIONS
}
#ifdef CONFIG_RELOCATABLE
. = ALIGN(8);
- .dynsym : AT(ADDR(.dynsym) - LOAD_OFFSET) { *(.dynsym) }
+ .dynsym : AT(ADDR(.dynsym) - LOAD_OFFSET)
+ {
+#ifdef CONFIG_RELOCATABLE_PPC32
+ __dynamic_symtab = .;
+#endif
+ *(.dynsym)
+ }
.dynstr : AT(ADDR(.dynstr) - LOAD_OFFSET) { *(.dynstr) }
.dynamic : AT(ADDR(.dynamic) - LOAD_OFFSET)
{
@@ -217,6 +215,11 @@ SECTIONS
.got : AT(ADDR(.got) - LOAD_OFFSET) {
__toc_start = .;
+#ifndef CONFIG_RELOCATABLE
+ __prom_init_toc_start = .;
+ arch/powerpc/kernel/prom_init.o*(.toc .got)
+ __prom_init_toc_end = .;
+#endif
*(.got)
*(.toc)
}