aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile40
-rw-r--r--arch/x86/kernel/acpi/Makefile9
-rw-r--r--arch/x86/kernel/acpi/boot.c200
-rw-r--r--arch/x86/kernel/acpi/cstate.c28
-rw-r--r--arch/x86/kernel/acpi/realmode/.gitignore3
-rw-r--r--arch/x86/kernel/acpi/realmode/Makefile59
-rw-r--r--arch/x86/kernel/acpi/realmode/bioscall.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/copy.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/regs.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-bios.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-mode.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-vesa.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-vga.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/wakemain.c81
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S170
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.h48
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.lds.S62
-rw-r--r--arch/x86/kernel/acpi/sleep.c83
-rw-r--r--arch/x86/kernel/acpi/sleep.h8
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S7
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S2
-rw-r--r--arch/x86/kernel/acpi/wakeup_rm.S12
-rw-r--r--arch/x86/kernel/alternative.c297
-rw-r--r--arch/x86/kernel/amd_gart_64.c18
-rw-r--r--arch/x86/kernel/amd_nb.c33
-rw-r--r--arch/x86/kernel/apb_timer.c20
-rw-r--r--arch/x86/kernel/aperture_64.c87
-rw-r--r--arch/x86/kernel/apic/Makefile3
-rw-r--r--arch/x86/kernel/apic/apic.c411
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c89
-rw-r--r--arch/x86/kernel/apic/apic_noop.c15
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c76
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c53
-rw-r--r--arch/x86/kernel/apic/es7000_32.c755
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c24
-rw-r--r--arch/x86/kernel/apic/io_apic.c1280
-rw-r--r--arch/x86/kernel/apic/ipi.c3
-rw-r--r--arch/x86/kernel/apic/numaq_32.c541
-rw-r--r--arch/x86/kernel/apic/probe_32.c28
-rw-r--r--arch/x86/kernel/apic/probe_64.c11
-rw-r--r--arch/x86/kernel/apic/summit_32.c556
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c89
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c61
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c423
-rw-r--r--arch/x86/kernel/apm_32.c122
-rw-r--r--arch/x86/kernel/asm-offsets.c4
-rw-r--r--arch/x86/kernel/asm-offsets_32.c4
-rw-r--r--arch/x86/kernel/asm-offsets_64.c7
-rw-r--r--arch/x86/kernel/check.c22
-rw-r--r--arch/x86/kernel/cpu/Makefile19
-rw-r--r--arch/x86/kernel/cpu/amd.c309
-rw-r--r--arch/x86/kernel/cpu/bugs.c126
-rw-r--r--arch/x86/kernel/cpu/centaur.c291
-rw-r--r--arch/x86/kernel/cpu/common.c321
-rw-r--r--arch/x86/kernel/cpu/cpu.h29
-rw-r--r--arch/x86/kernel/cpu/cyrix.c48
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c27
-rw-r--r--arch/x86/kernel/cpu/intel.c350
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c236
-rw-r--r--arch/x86/kernel/cpu/match.c49
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c12
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h17
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c46
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c792
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c393
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c282
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c158
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c24
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/Makefile7
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c492
-rw-r--r--arch/x86/kernel/cpu/microcode/amd_early.c395
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c (renamed from arch/x86/kernel/microcode_core.c)179
-rw-r--r--arch/x86/kernel/cpu/microcode/core_early.c178
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c (renamed from arch/x86/kernel/microcode_intel.c)217
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_early.c787
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_lib.c174
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.pl32
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh41
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c85
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c16
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c34
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c10
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c98
-rw-r--r--arch/x86/kernel/cpu/perf_event.c728
-rw-r--r--arch/x86/kernel/cpu/perf_event.h259
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c353
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c683
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_iommu.c502
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_iommu.h40
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_uncore.c547
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c1160
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c507
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c609
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c714
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c4298
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h696
-rw-r--r--arch/x86/kernel/cpu/perf_event_knc.c319
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c74
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c196
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/cpu/powerflags.c8
-rw-r--r--arch/x86/kernel/cpu/proc.c37
-rw-r--r--arch/x86/kernel/cpu/rdrand.c17
-rw-r--r--arch/x86/kernel/cpu/scattered.c10
-rw-r--r--arch/x86/kernel/cpu/sched.c55
-rw-r--r--arch/x86/kernel/cpu/topology.c2
-rw-r--r--arch/x86/kernel/cpu/transmeta.c7
-rw-r--r--arch/x86/kernel/cpu/umc.c7
-rw-r--r--arch/x86/kernel/cpu/vmware.c23
-rw-r--r--arch/x86/kernel/cpuid.c22
-rw-r--r--arch/x86/kernel/crash.c41
-rw-r--r--arch/x86/kernel/crash_dump_32.c6
-rw-r--r--arch/x86/kernel/devicetree.c194
-rw-r--r--arch/x86/kernel/doublefault.c (renamed from arch/x86/kernel/doublefault_32.c)18
-rw-r--r--arch/x86/kernel/dumpstack.c87
-rw-r--r--arch/x86/kernel/dumpstack_32.c73
-rw-r--r--arch/x86/kernel/dumpstack_64.c143
-rw-r--r--arch/x86/kernel/e820.c81
-rw-r--r--arch/x86/kernel/early-quirks.c388
-rw-r--r--arch/x86/kernel/early_printk.c30
-rw-r--r--arch/x86/kernel/entry_32.S380
-rw-r--r--arch/x86/kernel/entry_64.S826
-rw-r--r--arch/x86/kernel/espfix_64.c208
-rw-r--r--arch/x86/kernel/ftrace.c699
-rw-r--r--arch/x86/kernel/head.c53
-rw-r--r--arch/x86/kernel/head32.c28
-rw-r--r--arch/x86/kernel/head64.c158
-rw-r--r--arch/x86/kernel/head_32.S377
-rw-r--r--arch/x86/kernel/head_64.S326
-rw-r--r--arch/x86/kernel/hpet.c92
-rw-r--r--arch/x86/kernel/hw_breakpoint.c9
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c8
-rw-r--r--arch/x86/kernel/i387.c428
-rw-r--r--arch/x86/kernel/i8259.c26
-rw-r--r--arch/x86/kernel/init_task.c42
-rw-r--r--arch/x86/kernel/ioport.c3
-rw-r--r--arch/x86/kernel/iosf_mbi.c237
-rw-r--r--arch/x86/kernel/irq.c190
-rw-r--r--arch/x86/kernel/irq_32.c120
-rw-r--r--arch/x86/kernel/irq_64.c21
-rw-r--r--arch/x86/kernel/irq_work.c24
-rw-r--r--arch/x86/kernel/irqinit.c128
-rw-r--r--arch/x86/kernel/jump_label.c95
-rw-r--r--arch/x86/kernel/kdebugfs.c15
-rw-r--r--arch/x86/kernel/kgdb.c78
-rw-r--r--arch/x86/kernel/kprobes/Makefile7
-rw-r--r--arch/x86/kernel/kprobes/common.h108
-rw-r--r--arch/x86/kernel/kprobes/core.c (renamed from arch/x86/kernel/kprobes.c)817
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c96
-rw-r--r--arch/x86/kernel/kprobes/opt.c445
-rw-r--r--arch/x86/kernel/ksysfs.c340
-rw-r--r--arch/x86/kernel/kvm.c447
-rw-r--r--arch/x86/kernel/kvmclock.c133
-rw-r--r--arch/x86/kernel/ldt.c6
-rw-r--r--arch/x86/kernel/machine_kexec_32.c2
-rw-r--r--arch/x86/kernel/machine_kexec_64.c173
-rw-r--r--arch/x86/kernel/mca_32.c477
-rw-r--r--arch/x86/kernel/mcount_64.S217
-rw-r--r--arch/x86/kernel/microcode_amd.c394
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c12
-rw-r--r--arch/x86/kernel/module.c81
-rw-r--r--arch/x86/kernel/mpparse.c22
-rw-r--r--arch/x86/kernel/msr.c32
-rw-r--r--arch/x86/kernel/nmi.c215
-rw-r--r--arch/x86/kernel/nmi_selftest.c55
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c18
-rw-r--r--arch/x86/kernel/paravirt.c61
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c2
-rw-r--r--arch/x86/kernel/pci-calgary_64.c83
-rw-r--r--arch/x86/kernel/pci-dma.c51
-rw-r--r--arch/x86/kernel/pci-nommu.c11
-rw-r--r--arch/x86/kernel/pci-swiotlb.c22
-rw-r--r--arch/x86/kernel/perf_regs.c105
-rw-r--r--arch/x86/kernel/preempt.S25
-rw-r--r--arch/x86/kernel/probe_roms.c3
-rw-r--r--arch/x86/kernel/process.c424
-rw-r--r--arch/x86/kernel/process_32.c148
-rw-r--r--arch/x86/kernel/process_64.c247
-rw-r--r--arch/x86/kernel/ptrace.c355
-rw-r--r--arch/x86/kernel/pvclock.c112
-rw-r--r--arch/x86/kernel/quirks.c61
-rw-r--r--arch/x86/kernel/reboot.c696
-rw-r--r--arch/x86/kernel/reboot_32.S135
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S2
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S34
-rw-r--r--arch/x86/kernel/rtc.c107
-rw-r--r--arch/x86/kernel/setup.c547
-rw-r--r--arch/x86/kernel/setup_percpu.c16
-rw-r--r--arch/x86/kernel/signal.c429
-rw-r--r--arch/x86/kernel/smp.c172
-rw-r--r--arch/x86/kernel/smpboot.c635
-rw-r--r--arch/x86/kernel/step.c54
-rw-r--r--arch/x86/kernel/sys_i386_32.c40
-rw-r--r--arch/x86/kernel/sys_x86_64.c157
-rw-r--r--arch/x86/kernel/syscall_32.c2
-rw-r--r--arch/x86/kernel/syscall_64.c13
-rw-r--r--arch/x86/kernel/sysfb.c74
-rw-r--r--arch/x86/kernel/sysfb_efi.c214
-rw-r--r--arch/x86/kernel/sysfb_simplefb.c95
-rw-r--r--arch/x86/kernel/tboot.c105
-rw-r--r--arch/x86/kernel/tce_64.c1
-rw-r--r--arch/x86/kernel/test_rodata.c10
-rw-r--r--arch/x86/kernel/time.c13
-rw-r--r--arch/x86/kernel/tls.c19
-rw-r--r--arch/x86/kernel/topology.c104
-rw-r--r--arch/x86/kernel/trace_clock.c21
-rw-r--r--arch/x86/kernel/tracepoint.c59
-rw-r--r--arch/x86/kernel/trampoline.c42
-rw-r--r--arch/x86/kernel/trampoline_32.S83
-rw-r--r--arch/x86/kernel/trampoline_64.S171
-rw-r--r--arch/x86/kernel/traps.c507
-rw-r--r--arch/x86/kernel/tsc.c418
-rw-r--r--arch/x86/kernel/tsc_msr.c127
-rw-r--r--arch/x86/kernel/tsc_sync.c46
-rw-r--r--arch/x86/kernel/uprobes.c928
-rw-r--r--arch/x86/kernel/vm86_32.c60
-rw-r--r--arch/x86/kernel/vmlinux.lds.S33
-rw-r--r--arch/x86/kernel/vsmp_64.c101
-rw-r--r--arch/x86/kernel/vsyscall_64.c166
-rw-r--r--arch/x86/kernel/vsyscall_gtod.c69
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c17
-rw-r--r--arch/x86/kernel/x86_init.c78
-rw-r--r--arch/x86/kernel/xsave.c543
227 files changed, 28682 insertions, 13926 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5369059c07a..047f9ff2e36 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,14 +2,13 @@
# Makefile for the linux kernel.
#
-extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
+extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_tsc.o = -pg
-CFLAGS_REMOVE_rtc.o = -pg
CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
CFLAGS_REMOVE_pvclock.o = -pg
CFLAGS_REMOVE_kvmclock.o = -pg
@@ -17,25 +16,31 @@ CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
endif
+CFLAGS_irq.o := -I$(src)/../include/asm/trace
+
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
-obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
+obj-$(CONFIG_X86_32) += i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
-obj-y += syscall_$(BITS).o
+obj-$(CONFIG_X86_64) += mcount_64.o
+obj-y += syscall_$(BITS).o vsyscall_gtod.o
obj-$(CONFIG_X86_64) += vsyscall_64.o
obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
+obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
+obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
-obj-y += tsc.o io_delay.o rtc.o
+obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
obj-y += resource.o
-obj-y += trampoline.o trampoline_$(BITS).o
+obj-$(CONFIG_PREEMPT) += preempt.o
+
obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
@@ -48,8 +53,6 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += cpu/
obj-y += acpi/
obj-y += reboot.o
-obj-$(CONFIG_X86_32) += reboot_32.o
-obj-$(CONFIG_MCA) += mca_32.o
obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_PCI) += early-quirks.o
@@ -65,12 +68,13 @@ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
+obj-$(CONFIG_X86_TSC) += trace_clock.o
obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
-obj-$(CONFIG_KPROBES) += kprobes.o
+obj-y += kprobes/
obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
+obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
@@ -83,23 +87,25 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
-obj-$(CONFIG_KVM_GUEST) += kvm.o
-obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
+obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
-microcode-y := microcode_core.o
-microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
-microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
-obj-$(CONFIG_MICROCODE) += microcode.o
-
obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
obj-$(CONFIG_OF) += devicetree.o
+obj-$(CONFIG_UPROBES) += uprobes.o
+obj-y += sysfb.o
+obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o
+obj-$(CONFIG_EFI) += sysfb_efi.o
+
+obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
+obj-$(CONFIG_TRACING) += tracepoint.o
+obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
###
# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 6f35260bb3e..163b2258147 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,14 +1,7 @@
-subdir- := realmode
-
obj-$(CONFIG_ACPI) += boot.o
-obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
endif
-$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
-
-$(obj)/realmode/wakeup.bin: FORCE
- $(Q)$(MAKE) $(build)=$(obj)/realmode
-
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ce664f33ea8..86281ffb96d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -44,20 +44,15 @@
#include <asm/mpspec.h>
#include <asm/smp.h>
+#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
static int __initdata acpi_force = 0;
-u32 acpi_rsdt_forced;
int acpi_disabled;
EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
# include <asm/proto.h>
-# include <asm/numa_64.h>
#endif /* X86 */
-#define BAD_MADT_ENTRY(entry, end) ( \
- (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
- ((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
-
#define PREFIX "ACPI: "
int acpi_noirq; /* skip ACPI IRQ initialization */
@@ -67,6 +62,7 @@ EXPORT_SYMBOL(acpi_pci_disabled);
int acpi_lapic;
int acpi_ioapic;
int acpi_strict;
+int acpi_disable_cmcff;
u8 acpi_sci_flags __initdata;
int acpi_sci_override_gsi __initdata;
@@ -141,16 +137,8 @@ static u32 irq_to_gsi(int irq)
}
/*
- * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
- * to map the target physical address. The problem is that set_fixmap()
- * provides a single page, and it is possible that the page is not
- * sufficient.
- * By using this area, we can map up to MAX_IO_APICS pages temporarily,
- * i.e. until the next __va_range() call.
- *
- * Important Safety Note: The fixed I/O APIC page numbers are *subtracted*
- * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and
- * count idx down while incrementing the phys address.
+ * This is just a simple wrapper around early_ioremap(),
+ * with sanity checks for phys == 0 and size == 0.
*/
char *__init __acpi_map_table(unsigned long phys, unsigned long size)
{
@@ -160,6 +148,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
return early_ioremap(phys, size);
}
+
void __init __acpi_unmap_table(char *map, unsigned long size)
{
if (!map || !size)
@@ -195,24 +184,31 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
return 0;
}
-static void __cpuinit acpi_register_lapic(int id, u8 enabled)
+/**
+ * acpi_register_lapic - register a local apic and generates a logic cpu number
+ * @id: local apic id to register
+ * @enabled: this cpu is enabled or not
+ *
+ * Returns the logic cpu number which maps to the local apic
+ */
+static int acpi_register_lapic(int id, u8 enabled)
{
unsigned int ver = 0;
- if (id >= (MAX_LOCAL_APIC-1)) {
+ if (id >= MAX_LOCAL_APIC) {
printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
- return;
+ return -EINVAL;
}
if (!enabled) {
++disabled_cpus;
- return;
+ return -EINVAL;
}
if (boot_cpu_physical_apicid != -1U)
ver = apic_version[boot_cpu_physical_apicid];
- generic_processor_info(id, ver);
+ return generic_processor_info(id, ver);
}
static int __init
@@ -239,7 +235,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
* to not preallocating memory for all NR_CPUS
* when we use CPU hotplug.
*/
- if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled)
+ if (!apic->apic_id_valid(apic_id) && enabled)
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
else
acpi_register_lapic(apic_id, enabled);
@@ -422,12 +418,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
return 0;
}
- if (intsrc->source_irq == 0 && intsrc->global_irq == 2) {
+ if (intsrc->source_irq == 0) {
if (acpi_skip_timer_override) {
- printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
+ printk(PREFIX "BIOS IRQ0 override ignored.\n");
return 0;
}
- if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
+
+ if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity
+ && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
}
@@ -558,6 +556,12 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
int trigger, int polarity) = acpi_register_gsi_pic;
+#ifdef CONFIG_ACPI_SLEEP
+int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel;
+#else
+int (*acpi_suspend_lowlevel)(void);
+#endif
+
/*
* success: return IRQ number (>=0)
* failure: return < 0
@@ -572,6 +576,12 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
return irq;
}
+EXPORT_SYMBOL_GPL(acpi_register_gsi);
+
+void acpi_unregister_gsi(u32 gsi)
+{
+}
+EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
void __init acpi_set_irq_model_pic(void)
{
@@ -599,95 +609,43 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
int nid;
nid = acpi_get_node(handle);
- if (nid == -1 || !node_online(nid))
- return;
- set_apicid_to_node(physid, nid);
- numa_set_node(cpu, nid);
+ if (nid != -1) {
+ set_apicid_to_node(physid, nid);
+ numa_set_node(cpu, nid);
+ }
#endif
}
-static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
+static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
{
- struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
- union acpi_object *obj;
- struct acpi_madt_local_apic *lapic;
- cpumask_var_t tmp_map, new_map;
- u8 physid;
int cpu;
- int retval = -ENOMEM;
-
- if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
- return -EINVAL;
-
- if (!buffer.length || !buffer.pointer)
- return -EINVAL;
-
- obj = buffer.pointer;
- if (obj->type != ACPI_TYPE_BUFFER ||
- obj->buffer.length < sizeof(*lapic)) {
- kfree(buffer.pointer);
- return -EINVAL;
- }
-
- lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer;
-
- if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC ||
- !(lapic->lapic_flags & ACPI_MADT_ENABLED)) {
- kfree(buffer.pointer);
- return -EINVAL;
- }
-
- physid = lapic->id;
- kfree(buffer.pointer);
- buffer.length = ACPI_ALLOCATE_BUFFER;
- buffer.pointer = NULL;
-
- if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL))
- goto out;
-
- if (!alloc_cpumask_var(&new_map, GFP_KERNEL))
- goto free_tmp_map;
-
- cpumask_copy(tmp_map, cpu_present_mask);
- acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
-
- /*
- * If mp_register_lapic successfully generates a new logical cpu
- * number, then the following will get us exactly what was mapped
- */
- cpumask_andnot(new_map, cpu_present_mask, tmp_map);
- if (cpumask_empty(new_map)) {
- printk ("Unable to map lapic to logical cpu number\n");
- retval = -EINVAL;
- goto free_new_map;
+ cpu = acpi_register_lapic(physid, ACPI_MADT_ENABLED);
+ if (cpu < 0) {
+ pr_info(PREFIX "Unable to map lapic to logical cpu number\n");
+ return cpu;
}
acpi_processor_set_pdc(handle);
-
- cpu = cpumask_first(new_map);
acpi_map_cpu2node(handle, cpu, physid);
*pcpu = cpu;
- retval = 0;
-
-free_new_map:
- free_cpumask_var(new_map);
-free_tmp_map:
- free_cpumask_var(tmp_map);
-out:
- return retval;
+ return 0;
}
/* wrapper to silence section mismatch warning */
-int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu)
+int __ref acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
{
- return _acpi_map_lsapic(handle, pcpu);
+ return _acpi_map_lsapic(handle, physid, pcpu);
}
EXPORT_SYMBOL(acpi_map_lsapic);
int acpi_unmap_lsapic(int cpu)
{
+#ifdef CONFIG_ACPI_NUMA
+ set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
+#endif
+
per_cpu(x86_cpu_to_apicid, cpu) = -1;
set_cpu_present(cpu, false);
num_processors--;
@@ -732,7 +690,7 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table)
#ifdef CONFIG_HPET_TIMER
#include <asm/hpet.h>
-static struct __initdata resource *hpet_res;
+static struct resource *hpet_res __initdata;
static int __init acpi_parse_hpet(struct acpi_table_header *table)
{
@@ -945,10 +903,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
#ifdef CONFIG_X86_IO_APIC
#define MP_ISA_BUS 0
-#ifdef CONFIG_X86_ES7000
-extern int es7000_plat;
-#endif
-
void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
{
int ioapic;
@@ -989,7 +943,7 @@ void __init mp_config_acpi_legacy_irqs(void)
int i;
struct mpc_intsrc mp_irq;
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+#ifdef CONFIG_EISA
/*
* Fabricate the legacy ISA bus (bus #31).
*/
@@ -998,14 +952,6 @@ void __init mp_config_acpi_legacy_irqs(void)
set_bit(MP_ISA_BUS, mp_bus_not_pci);
pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
-#ifdef CONFIG_X86_ES7000
- /*
- * Older generations of ES7000 have no legacy identity mappings
- */
- if (es7000_plat == 1)
- return;
-#endif
-
/*
* Use the default configuration for the IRQs 0-15. Unless
* overridden by (MADT) interrupt source override entries.
@@ -1071,9 +1017,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
if (!acpi_ioapic)
return 0;
- if (!dev)
- return 0;
- if (dev->bus != &pci_bus_type)
+ if (!dev || !dev_is_pci(dev))
return 0;
pdev = to_pci_dev(dev);
@@ -1101,6 +1045,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
int ioapic;
int ioapic_pin;
struct io_apic_irq_attr irq_attr;
+ int ret;
if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
return gsi;
@@ -1130,7 +1075,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
+ ret = io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
+ if (ret < 0)
+ gsi = INT_MIN;
return gsi;
}
@@ -1333,17 +1280,12 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
}
/*
- * Force ignoring BIOS IRQ0 pin2 override
+ * Force ignoring BIOS IRQ0 override
*/
static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
{
- /*
- * The ati_ixp4x0_rev() early PCI quirk should have set
- * the acpi_skip_timer_override flag already:
- */
if (!acpi_skip_timer_override) {
- WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
- pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
+ pr_notice("%s detected: Ignoring BIOS IRQ0 override\n",
d->ident);
acpi_skip_timer_override = 1;
}
@@ -1437,7 +1379,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
* is enabled. This input is incorrectly designated the
* ISA IRQ 0 via an interrupt source override even though
* it is wired to the output of the master 8259A and INTIN0
- * is not connected at all. Force ignoring BIOS IRQ0 pin2
+ * is not connected at all. Force ignoring BIOS IRQ0
* override in that cases.
*/
{
@@ -1472,6 +1414,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
},
},
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "FUJITSU SIEMENS",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"),
+ },
+ },
{}
};
@@ -1595,7 +1545,7 @@ static int __init parse_acpi(char *arg)
}
/* acpi=rsdt use RSDT instead of XSDT */
else if (strcmp(arg, "rsdt") == 0) {
- acpi_rsdt_forced = 1;
+ acpi_gbl_do_not_use_xsdt = TRUE;
}
/* "acpi=noirq" disables ACPI interrupt routing */
else if (strcmp(arg, "noirq") == 0) {
@@ -1604,6 +1554,10 @@ static int __init parse_acpi(char *arg)
/* "acpi=copy_dsdt" copys DSDT */
else if (strcmp(arg, "copy_dsdt") == 0) {
acpi_gbl_copy_dsdt_locally = 1;
+ }
+ /* "acpi=nocmcff" disables FF mode for corrected errors */
+ else if (strcmp(arg, "nocmcff") == 0) {
+ acpi_disable_cmcff = 1;
} else {
/* Core will printk when we return error. */
return -EINVAL;
@@ -1694,3 +1648,9 @@ int __acpi_release_global_lock(unsigned int *lock)
} while (unlikely (val != old));
return old & 0x1;
}
+
+void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size)
+{
+ e820_add_region(addr, size, E820_ACPI);
+ update_e820();
+}
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index f50e7fb2a20..4b28159e042 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -14,6 +14,7 @@
#include <acpi/processor.h>
#include <asm/acpi.h>
#include <asm/mwait.h>
+#include <asm/special_insns.h>
/*
* Initialize bm_flags based on the CPU cache properties
@@ -86,7 +87,9 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
retval = 0;
- if (num_cstate_subtype < (cx->address & MWAIT_SUBSTATE_MASK)) {
+ /* If the HW does not support any sub-states in this C-state */
+ if (num_cstate_subtype == 0) {
+ pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part);
retval = -1;
goto out;
}
@@ -149,29 +152,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
- if (!need_resched()) {
- if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
- clflush((void *)&current_thread_info()->flags);
-
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __mwait(ax, cx);
- }
-}
-
void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
unsigned int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore
deleted file mode 100644
index 58f1f48a58f..00000000000
--- a/arch/x86/kernel/acpi/realmode/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-wakeup.bin
-wakeup.elf
-wakeup.lds
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
deleted file mode 100644
index 6a564ac67ef..00000000000
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ /dev/null
@@ -1,59 +0,0 @@
-#
-# arch/x86/kernel/acpi/realmode/Makefile
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License. See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-
-always := wakeup.bin
-targets := wakeup.elf wakeup.lds
-
-wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
-
-# The link order of the video-*.o modules can matter. In particular,
-# video-vga.o *must* be listed first, followed by video-vesa.o.
-# Hardware-specific drivers should follow in the order they should be
-# probed, and video-bios.o should typically be last.
-wakeup-y += video-vga.o
-wakeup-y += video-vesa.o
-wakeup-y += video-bios.o
-
-targets += $(wakeup-y)
-
-bootsrc := $(src)/../../../boot
-
-# ---------------------------------------------------------------------------
-
-# How to compile the 16-bit code. Note we always compile for -march=i386,
-# that way we can complain to the user if the CPU is insufficient.
-# Compile with _SETUP since this is similar to the boot-time setup code.
-KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
- -I$(srctree)/$(bootsrc) \
- $(cflags-y) \
- -Wall -Wstrict-prototypes \
- -march=i386 -mregparm=3 \
- -include $(srctree)/$(bootsrc)/code16gcc.h \
- -fno-strict-aliasing -fomit-frame-pointer \
- $(call cc-option, -ffreestanding) \
- $(call cc-option, -fno-toplevel-reorder,\
- $(call cc-option, -fno-unit-at-a-time)) \
- $(call cc-option, -fno-stack-protector) \
- $(call cc-option, -mpreferred-stack-boundary=2)
-KBUILD_CFLAGS += $(call cc-option, -m32)
-KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
-GCOV_PROFILE := n
-
-WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
-
-LDFLAGS_wakeup.elf := -T
-
-CPPFLAGS_wakeup.lds += -P -C
-
-$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE
- $(call if_changed,ld)
-
-OBJCOPYFLAGS_wakeup.bin := -O binary
-
-$(obj)/wakeup.bin: $(obj)/wakeup.elf FORCE
- $(call if_changed,objcopy)
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
deleted file mode 100644
index f51eb0bb56c..00000000000
--- a/arch/x86/kernel/acpi/realmode/bioscall.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/copy.S b/arch/x86/kernel/acpi/realmode/copy.S
deleted file mode 100644
index dc59ebee69d..00000000000
--- a/arch/x86/kernel/acpi/realmode/copy.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/copy.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
deleted file mode 100644
index 6206033ba20..00000000000
--- a/arch/x86/kernel/acpi/realmode/regs.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-bios.c b/arch/x86/kernel/acpi/realmode/video-bios.c
deleted file mode 100644
index 7deabc144a2..00000000000
--- a/arch/x86/kernel/acpi/realmode/video-bios.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-bios.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-mode.c b/arch/x86/kernel/acpi/realmode/video-mode.c
deleted file mode 100644
index 328ad209f11..00000000000
--- a/arch/x86/kernel/acpi/realmode/video-mode.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-mode.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vesa.c b/arch/x86/kernel/acpi/realmode/video-vesa.c
deleted file mode 100644
index 9dbb9672226..00000000000
--- a/arch/x86/kernel/acpi/realmode/video-vesa.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vesa.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vga.c b/arch/x86/kernel/acpi/realmode/video-vga.c
deleted file mode 100644
index bcc81255f37..00000000000
--- a/arch/x86/kernel/acpi/realmode/video-vga.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vga.c"
diff --git a/arch/x86/kernel/acpi/realmode/wakemain.c b/arch/x86/kernel/acpi/realmode/wakemain.c
deleted file mode 100644
index 883962d9eef..00000000000
--- a/arch/x86/kernel/acpi/realmode/wakemain.c
+++ /dev/null
@@ -1,81 +0,0 @@
-#include "wakeup.h"
-#include "boot.h"
-
-static void udelay(int loops)
-{
- while (loops--)
- io_delay(); /* Approximately 1 us */
-}
-
-static void beep(unsigned int hz)
-{
- u8 enable;
-
- if (!hz) {
- enable = 0x00; /* Turn off speaker */
- } else {
- u16 div = 1193181/hz;
-
- outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */
- io_delay();
- outb(div, 0x42); /* LSB of counter */
- io_delay();
- outb(div >> 8, 0x42); /* MSB of counter */
- io_delay();
-
- enable = 0x03; /* Turn on speaker */
- }
- inb(0x61); /* Dummy read of System Control Port B */
- io_delay();
- outb(enable, 0x61); /* Enable timer 2 output to speaker */
- io_delay();
-}
-
-#define DOT_HZ 880
-#define DASH_HZ 587
-#define US_PER_DOT 125000
-
-/* Okay, this is totally silly, but it's kind of fun. */
-static void send_morse(const char *pattern)
-{
- char s;
-
- while ((s = *pattern++)) {
- switch (s) {
- case '.':
- beep(DOT_HZ);
- udelay(US_PER_DOT);
- beep(0);
- udelay(US_PER_DOT);
- break;
- case '-':
- beep(DASH_HZ);
- udelay(US_PER_DOT * 3);
- beep(0);
- udelay(US_PER_DOT);
- break;
- default: /* Assume it's a space */
- udelay(US_PER_DOT * 3);
- break;
- }
- }
-}
-
-void main(void)
-{
- /* Kill machine if structures are wrong */
- if (wakeup_header.real_magic != 0x12345678)
- while (1);
-
- if (wakeup_header.realmode_flags & 4)
- send_morse("...-");
-
- if (wakeup_header.realmode_flags & 1)
- asm volatile("lcallw $0xc000,$3");
-
- if (wakeup_header.realmode_flags & 2) {
- /* Need to call BIOS */
- probe_cards(0);
- set_mode(wakeup_header.video_mode);
- }
-}
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
deleted file mode 100644
index b4fd836e405..00000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * ACPI wakeup real mode startup stub
- */
-#include <asm/segment.h>
-#include <asm/msr-index.h>
-#include <asm/page_types.h>
-#include <asm/pgtable_types.h>
-#include <asm/processor-flags.h>
-#include "wakeup.h"
-
- .code16
- .section ".jump", "ax"
- .globl _start
-_start:
- cli
- jmp wakeup_code
-
-/* This should match the structure in wakeup.h */
- .section ".header", "a"
- .globl wakeup_header
-wakeup_header:
-video_mode: .short 0 /* Video mode number */
-pmode_return: .byte 0x66, 0xea /* ljmpl */
- .long 0 /* offset goes here */
- .short __KERNEL_CS
-pmode_cr0: .long 0 /* Saved %cr0 */
-pmode_cr3: .long 0 /* Saved %cr3 */
-pmode_cr4: .long 0 /* Saved %cr4 */
-pmode_efer: .quad 0 /* Saved EFER */
-pmode_gdt: .quad 0
-pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */
-pmode_behavior: .long 0 /* Wakeup behavior flags */
-realmode_flags: .long 0
-real_magic: .long 0
-trampoline_segment: .word 0
-_pad1: .byte 0
-wakeup_jmp: .byte 0xea /* ljmpw */
-wakeup_jmp_off: .word 3f
-wakeup_jmp_seg: .word 0
-wakeup_gdt: .quad 0, 0, 0
-signature: .long WAKEUP_HEADER_SIGNATURE
-
- .text
- .code16
-wakeup_code:
- cld
-
- /* Apparently some dimwit BIOS programmers don't know how to
- program a PM to RM transition, and we might end up here with
- junk in the data segment descriptor registers. The only way
- to repair that is to go into PM and fix it ourselves... */
- movw $16, %cx
- lgdtl %cs:wakeup_gdt
- movl %cr0, %eax
- orb $X86_CR0_PE, %al
- movl %eax, %cr0
- jmp 1f
-1: ljmpw $8, $2f
-2:
- movw %cx, %ds
- movw %cx, %es
- movw %cx, %ss
- movw %cx, %fs
- movw %cx, %gs
-
- andb $~X86_CR0_PE, %al
- movl %eax, %cr0
- jmp wakeup_jmp
-3:
- /* Set up segments */
- movw %cs, %ax
- movw %ax, %ds
- movw %ax, %es
- movw %ax, %ss
- lidtl wakeup_idt
-
- movl $wakeup_stack_end, %esp
-
- /* Clear the EFLAGS */
- pushl $0
- popfl
-
- /* Check header signature... */
- movl signature, %eax
- cmpl $WAKEUP_HEADER_SIGNATURE, %eax
- jne bogus_real_magic
-
- /* Check we really have everything... */
- movl end_signature, %eax
- cmpl $WAKEUP_END_SIGNATURE, %eax
- jne bogus_real_magic
-
- /* Call the C code */
- calll main
-
- /* Restore MISC_ENABLE before entering protected mode, in case
- BIOS decided to clear XD_DISABLE during S3. */
- movl pmode_behavior, %eax
- btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
- jnc 1f
-
- movl pmode_misc_en, %eax
- movl pmode_misc_en + 4, %edx
- movl $MSR_IA32_MISC_ENABLE, %ecx
- wrmsr
-1:
-
- /* Do any other stuff... */
-
-#ifndef CONFIG_64BIT
- /* This could also be done in C code... */
- movl pmode_cr3, %eax
- movl %eax, %cr3
-
- movl pmode_cr4, %ecx
- jecxz 1f
- movl %ecx, %cr4
-1:
- movl pmode_efer, %eax
- movl pmode_efer + 4, %edx
- movl %eax, %ecx
- orl %edx, %ecx
- jz 1f
- movl $MSR_EFER, %ecx
- wrmsr
-1:
-
- lgdtl pmode_gdt
-
- /* This really couldn't... */
- movl pmode_cr0, %eax
- movl %eax, %cr0
- jmp pmode_return
-#else
- pushw $0
- pushw trampoline_segment
- pushw $0
- lret
-#endif
-
-bogus_real_magic:
-1:
- hlt
- jmp 1b
-
- .data
- .balign 8
-
- /* This is the standard real-mode IDT */
-wakeup_idt:
- .word 0xffff /* limit */
- .long 0 /* address */
- .word 0
-
- .globl HEAP, heap_end
-HEAP:
- .long wakeup_heap
-heap_end:
- .long wakeup_stack
-
- .bss
-wakeup_heap:
- .space 2048
-wakeup_stack:
- .space 2048
-wakeup_stack_end:
-
- .section ".signature","a"
-end_signature:
- .long WAKEUP_END_SIGNATURE
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
deleted file mode 100644
index 97a29e1430e..00000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Definitions for the wakeup data structure at the head of the
- * wakeup code.
- */
-
-#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
-#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
-
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-
-/* This must match data at wakeup.S */
-struct wakeup_header {
- u16 video_mode; /* Video mode number */
- u16 _jmp1; /* ljmpl opcode, 32-bit only */
- u32 pmode_entry; /* Protected mode resume point, 32-bit only */
- u16 _jmp2; /* CS value, 32-bit only */
- u32 pmode_cr0; /* Protected mode cr0 */
- u32 pmode_cr3; /* Protected mode cr3 */
- u32 pmode_cr4; /* Protected mode cr4 */
- u32 pmode_efer_low; /* Protected mode EFER */
- u32 pmode_efer_high;
- u64 pmode_gdt;
- u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */
- u32 pmode_misc_en_high;
- u32 pmode_behavior; /* Wakeup routine behavior flags */
- u32 realmode_flags;
- u32 real_magic;
- u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
- u8 _pad1;
- u8 wakeup_jmp;
- u16 wakeup_jmp_off;
- u16 wakeup_jmp_seg;
- u64 wakeup_gdt[3];
- u32 signature; /* To check we have correct structure */
-} __attribute__((__packed__));
-
-extern struct wakeup_header wakeup_header;
-#endif
-
-#define WAKEUP_HEADER_OFFSET 8
-#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
-#define WAKEUP_END_SIGNATURE 0x65a22c82
-
-/* Wakeup behavior bits */
-#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0
-
-#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
deleted file mode 100644
index d4f8010a5b1..00000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * wakeup.ld
- *
- * Linker script for the real-mode wakeup code
- */
-#undef i386
-#include "wakeup.h"
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(_start)
-
-SECTIONS
-{
- . = 0;
- .jump : {
- *(.jump)
- } = 0x90909090
-
- . = WAKEUP_HEADER_OFFSET;
- .header : {
- *(.header)
- }
-
- . = ALIGN(16);
- .text : {
- *(.text*)
- } = 0x90909090
-
- . = ALIGN(16);
- .rodata : {
- *(.rodata*)
- }
-
- .videocards : {
- video_cards = .;
- *(.videocards)
- video_cards_end = .;
- }
-
- . = ALIGN(16);
- .data : {
- *(.data*)
- }
-
- . = ALIGN(16);
- .bss : {
- __bss_start = .;
- *(.bss)
- __bss_end = .;
- }
-
- .signature : {
- *(.signature)
- }
-
- _end = .;
-
- /DISCARD/ : {
- *(.note*)
- }
-}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 103b6ab368d..31368207837 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -14,8 +14,9 @@
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
+#include <asm/realmode.h>
-#include "realmode/wakeup.h"
+#include "../../realmode/rm/wakeup.h"
#include "sleep.h"
unsigned long acpi_realmode_flags;
@@ -25,20 +26,27 @@ static char temp_stack[4096];
#endif
/**
- * acpi_suspend_lowlevel - save kernel state
+ * x86_acpi_enter_sleep_state - enter sleep state
+ * @state: Sleep state to enter.
+ *
+ * Wrapper around acpi_enter_sleep_state() to be called by assmebly.
+ */
+acpi_status asmlinkage __visible x86_acpi_enter_sleep_state(u8 state)
+{
+ return acpi_enter_sleep_state(state);
+}
+
+/**
+ * x86_acpi_suspend_lowlevel - save kernel state
*
* Create an identity mapped page table and copy the wakeup routine to
* low memory.
*/
-int acpi_suspend_lowlevel(void)
+int x86_acpi_suspend_lowlevel(void)
{
- struct wakeup_header *header;
- /* address in low memory of the wakeup routine. */
- char *acpi_realmode;
+ struct wakeup_header *header =
+ (struct wakeup_header *) __va(real_mode_header->wakeup_header);
- acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
-
- header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
if (header->signature != WAKEUP_HEADER_SIGNATURE) {
printk(KERN_ERR "wakeup header does not match\n");
return -EINVAL;
@@ -46,41 +54,39 @@ int acpi_suspend_lowlevel(void)
header->video_mode = saved_video_mode;
- header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
-
- /*
- * Set up the wakeup GDT. We set these up as Big Real Mode,
- * that is, with limits set to 4 GB. At least the Lenovo
- * Thinkpad X61 is known to need this for the video BIOS
- * initialization quirk to work; this is likely to also
- * be the case for other laptops or integrated video devices.
- */
-
- /* GDT[0]: GDT self-pointer */
- header->wakeup_gdt[0] =
- (u64)(sizeof(header->wakeup_gdt) - 1) +
- ((u64)__pa(&header->wakeup_gdt) << 16);
- /* GDT[1]: big real mode-like code segment */
- header->wakeup_gdt[1] =
- GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
- /* GDT[2]: big real mode-like data segment */
- header->wakeup_gdt[2] =
- GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
+ header->pmode_behavior = 0;
#ifndef CONFIG_64BIT
- store_gdt((struct desc_ptr *)&header->pmode_gdt);
+ native_store_gdt((struct desc_ptr *)&header->pmode_gdt);
- if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
- &header->pmode_efer_high))
- header->pmode_efer_low = header->pmode_efer_high = 0;
+ /*
+ * We have to check that we can write back the value, and not
+ * just read it. At least on 90 nm Pentium M (Family 6, Model
+ * 13), reading an invalid MSR is not guaranteed to trap, see
+ * Erratum X4 in "Intel Pentium M Processor on 90 nm Process
+ * with 2-MB L2 Cache and Intel® Processor A100 and A110 on 90
+ * nm process with 512-KB L2 Cache Specification Update".
+ */
+ if (!rdmsr_safe(MSR_EFER,
+ &header->pmode_efer_low,
+ &header->pmode_efer_high) &&
+ !wrmsr_safe(MSR_EFER,
+ header->pmode_efer_low,
+ header->pmode_efer_high))
+ header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER);
#endif /* !CONFIG_64BIT */
header->pmode_cr0 = read_cr0();
- header->pmode_cr4 = read_cr4_safe();
- header->pmode_behavior = 0;
+ if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
+ header->pmode_cr4 = read_cr4();
+ header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
+ }
if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
&header->pmode_misc_en_low,
- &header->pmode_misc_en_high))
+ &header->pmode_misc_en_high) &&
+ !wrmsr_safe(MSR_IA32_MISC_ENABLE,
+ header->pmode_misc_en_low,
+ header->pmode_misc_en_high))
header->pmode_behavior |=
(1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
header->realmode_flags = acpi_realmode_flags;
@@ -88,10 +94,9 @@ int acpi_suspend_lowlevel(void)
#ifndef CONFIG_64BIT
header->pmode_entry = (u32)&wakeup_pmode_return;
- header->pmode_cr3 = (u32)__pa(&initial_page_table);
+ header->pmode_cr3 = (u32)__pa_symbol(initial_page_table);
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
- header->trampoline_segment = trampoline_address() >> 4;
#ifdef CONFIG_SMP
stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
@@ -121,6 +126,8 @@ static int __init acpi_sleep_setup(char *str)
#endif
if (strncmp(str, "nonvs", 5) == 0)
acpi_nvs_nosave();
+ if (strncmp(str, "nonvs_s3", 8) == 0)
+ acpi_nvs_nosave_s3();
if (strncmp(str, "old_ordering", 12) == 0)
acpi_old_suspend_ordering();
str = strchr(str, ',');
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index 416d4be13fe..65c7b606b60 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -2,14 +2,20 @@
* Variables and functions used by the code in sleep.c
*/
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
extern unsigned long saved_video_mode;
extern long saved_magic;
extern int wakeup_pmode_return;
+extern u8 wake_sleep_flags;
+
extern unsigned long acpi_copy_wakeup_routine(unsigned long);
extern void wakeup_long64(void);
extern void do_suspend_lowlevel(void);
+
+extern int x86_acpi_suspend_lowlevel(void);
+
+acpi_status asmlinkage x86_acpi_enter_sleep_state(u8 state);
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 13ab720573e..665c6b7d2ea 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,4 +1,4 @@
- .section .text..page_aligned
+ .text
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page_types.h>
@@ -18,7 +18,6 @@ wakeup_pmode_return:
movw %ax, %gs
# reload the gdt, as we need the full 32 bit address
- lgdt saved_gdt
lidt saved_idt
lldt saved_ldt
ljmp $(__KERNEL_CS), $1f
@@ -44,7 +43,6 @@ bogus_magic:
save_registers:
- sgdt saved_gdt
sidt saved_idt
sldt saved_ldt
str saved_tss
@@ -75,7 +73,7 @@ ENTRY(do_suspend_lowlevel)
call save_processor_state
call save_registers
pushl $3
- call acpi_enter_sleep_state
+ call x86_acpi_enter_sleep_state
addl $4, %esp
# In case of S3 failure, we'll emerge here. Jump
@@ -93,7 +91,6 @@ ENTRY(saved_magic) .long 0
ENTRY(saved_eip) .long 0
# saved registers
-saved_gdt: .long 0,0
saved_idt: .long 0,0
saved_ldt: .long 0
saved_tss: .long 0
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 8ea5164cbd0..ae693b51ed8 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -73,7 +73,7 @@ ENTRY(do_suspend_lowlevel)
addq $8, %rsp
movl $3, %edi
xorl %eax, %eax
- call acpi_enter_sleep_state
+ call x86_acpi_enter_sleep_state
/* in case something went wrong, restore the machine status and go on */
jmp resume_point
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
deleted file mode 100644
index 63b8ab524f2..00000000000
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Wrapper script for the realmode binary as a transport object
- * before copying to low memory.
- */
-#include <asm/page_types.h>
-
- .section ".x86_trampoline","a"
- .balign PAGE_SIZE
- .globl acpi_wakeup_code
-acpi_wakeup_code:
- .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin"
- .size acpi_wakeup_code, .-acpi_wakeup_code
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1f84794f075..703130f469e 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,14 +1,16 @@
+#define pr_fmt(fmt) "SMP alternatives: " fmt
+
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
-#include <linux/kprobes.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
+#include <linux/kdebug.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
@@ -21,19 +23,6 @@
#define MAX_PATCH_LEN (255-1)
-#ifdef CONFIG_HOTPLUG_CPU
-static int smp_alt_once;
-
-static int __init bootonly(char *str)
-{
- smp_alt_once = 1;
- return 1;
-}
-__setup("smp-alt-boot", bootonly);
-#else
-#define smp_alt_once 1
-#endif
-
static int __initdata_or_module debug_alternative;
static int __init debug_alt(char *str)
@@ -63,8 +52,11 @@ static int __init setup_noreplace_paravirt(char *str)
__setup("noreplace-paravirt", setup_noreplace_paravirt);
#endif
-#define DPRINTK(fmt, args...) if (debug_alternative) \
- printk(KERN_DEBUG fmt, args)
+#define DPRINTK(fmt, ...) \
+do { \
+ if (debug_alternative) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+} while (0)
/*
* Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
@@ -160,7 +152,7 @@ static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
#endif
#ifdef P6_NOP1
-static const unsigned char __initconst_or_module p6nops[] =
+static const unsigned char p6nops[] =
{
P6_NOP1,
P6_NOP2,
@@ -219,7 +211,7 @@ void __init arch_init_ideal_nops(void)
ideal_nops = intel_nops;
#endif
}
-
+ break;
default:
#ifdef CONFIG_X86_64
ideal_nops = k8_nops;
@@ -279,7 +271,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
replacement = (u8 *)&a->repl_offset + a->repl_offset;
BUG_ON(a->replacementlen > a->instrlen);
BUG_ON(a->instrlen > sizeof(insnbuf));
- BUG_ON(a->cpuid >= NCAPINTS*32);
+ BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
if (!boot_cpu_has(a->cpuid))
continue;
@@ -312,7 +304,7 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
/* turn DS segment override prefix into lock prefix */
if (*ptr == 0x3e)
text_poke(ptr, ((unsigned char []){0xf0}), 1);
- };
+ }
mutex_unlock(&text_mutex);
}
@@ -321,9 +313,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
{
const s32 *poff;
- if (noreplace_smp)
- return;
-
mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
u8 *ptr = (u8 *)poff + *poff;
@@ -333,7 +322,7 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
/* turn lock prefix into DS segment override prefix */
if (*ptr == 0xf0)
text_poke(ptr, ((unsigned char []){0x3E}), 1);
- };
+ }
mutex_unlock(&text_mutex);
}
@@ -354,7 +343,7 @@ struct smp_alt_module {
};
static LIST_HEAD(smp_alt_modules);
static DEFINE_MUTEX(smp_alt);
-static int smp_mode = 1; /* protected by smp_alt */
+static bool uniproc_patched = false; /* protected by smp_alt */
void __init_or_module alternatives_smp_module_add(struct module *mod,
char *name,
@@ -363,19 +352,18 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
{
struct smp_alt_module *smp;
- if (noreplace_smp)
- return;
+ mutex_lock(&smp_alt);
+ if (!uniproc_patched)
+ goto unlock;
- if (smp_alt_once) {
- if (boot_cpu_has(X86_FEATURE_UP))
- alternatives_smp_unlock(locks, locks_end,
- text, text_end);
- return;
- }
+ if (num_possible_cpus() == 1)
+ /* Don't bother remembering, we'll never have to undo it. */
+ goto smp_unlock;
smp = kzalloc(sizeof(*smp), GFP_KERNEL);
if (NULL == smp)
- return; /* we'll run the (safe but slow) SMP code then ... */
+ /* we'll run the (safe but slow) SMP code then ... */
+ goto unlock;
smp->mod = mod;
smp->name = name;
@@ -387,11 +375,10 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
__func__, smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
- mutex_lock(&smp_alt);
list_add_tail(&smp->next, &smp_alt_modules);
- if (boot_cpu_has(X86_FEATURE_UP))
- alternatives_smp_unlock(smp->locks, smp->locks_end,
- smp->text, smp->text_end);
+smp_unlock:
+ alternatives_smp_unlock(locks, locks_end, text, text_end);
+unlock:
mutex_unlock(&smp_alt);
}
@@ -399,66 +386,36 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
{
struct smp_alt_module *item;
- if (smp_alt_once || noreplace_smp)
- return;
-
mutex_lock(&smp_alt);
list_for_each_entry(item, &smp_alt_modules, next) {
if (mod != item->mod)
continue;
list_del(&item->next);
- mutex_unlock(&smp_alt);
- DPRINTK("%s: %s\n", __func__, item->name);
kfree(item);
- return;
+ break;
}
mutex_unlock(&smp_alt);
}
-bool skip_smp_alternatives;
-void alternatives_smp_switch(int smp)
+void alternatives_enable_smp(void)
{
struct smp_alt_module *mod;
-#ifdef CONFIG_LOCKDEP
- /*
- * Older binutils section handling bug prevented
- * alternatives-replacement from working reliably.
- *
- * If this still occurs then you should see a hang
- * or crash shortly after this line:
- */
- printk("lockdep: fixing up alternatives.\n");
-#endif
-
- if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
- return;
- BUG_ON(!smp && (num_online_cpus() > 1));
+ /* Why bother if there are no other CPUs? */
+ BUG_ON(num_possible_cpus() == 1);
mutex_lock(&smp_alt);
- /*
- * Avoid unnecessary switches because it forces JIT based VMs to
- * throw away all cached translations, which can be quite costly.
- */
- if (smp == smp_mode) {
- /* nothing */
- } else if (smp) {
- printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
+ if (uniproc_patched) {
+ pr_info("switching to SMP code\n");
+ BUG_ON(num_online_cpus() != 1);
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
list_for_each_entry(mod, &smp_alt_modules, next)
alternatives_smp_lock(mod->locks, mod->locks_end,
mod->text, mod->text_end);
- } else {
- printk(KERN_INFO "SMP alternatives: switching to UP code\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
- set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
- list_for_each_entry(mod, &smp_alt_modules, next)
- alternatives_smp_unlock(mod->locks, mod->locks_end,
- mod->text, mod->text_end);
+ uniproc_patched = false;
}
- smp_mode = smp;
mutex_unlock(&smp_alt);
}
@@ -535,40 +492,22 @@ void __init alternative_instructions(void)
apply_alternatives(__alt_instructions, __alt_instructions_end);
- /* switch to patch-once-at-boottime-only mode and free the
- * tables in case we know the number of CPUs will never ever
- * change */
-#ifdef CONFIG_HOTPLUG_CPU
- if (num_possible_cpus() < 2)
- smp_alt_once = 1;
-#endif
-
#ifdef CONFIG_SMP
- if (smp_alt_once) {
- if (1 == num_possible_cpus()) {
- printk(KERN_INFO "SMP alternatives: switching to UP code\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
- set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
-
- alternatives_smp_unlock(__smp_locks, __smp_locks_end,
- _text, _etext);
- }
- } else {
+ /* Patch to UP if other cpus not imminent. */
+ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
+ uniproc_patched = true;
alternatives_smp_module_add(NULL, "core kernel",
__smp_locks, __smp_locks_end,
_text, _etext);
-
- /* Only switch to UP mode if we don't immediately boot others */
- if (num_present_cpus() == 1 || setup_max_cpus <= 1)
- alternatives_smp_switch(0);
}
-#endif
- apply_paravirt(__parainstructions, __parainstructions_end);
- if (smp_alt_once)
+ if (!uniproc_patched || num_possible_cpus() == 1)
free_init_pages("SMP alternatives",
(unsigned long)__smp_locks,
(unsigned long)__smp_locks_end);
+#endif
+
+ apply_paravirt(__parainstructions, __parainstructions_end);
restart_nmi();
}
@@ -611,7 +550,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
*
* Note: Must be called under text_mutex.
*/
-void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
+void *text_poke(void *addr, const void *opcode, size_t len)
{
unsigned long flags;
char *vaddr;
@@ -646,97 +585,93 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
return addr;
}
-/*
- * Cross-modifying kernel text with stop_machine().
- * This code originally comes from immediate value.
- */
-static atomic_t stop_machine_first;
-static int wrote_text;
+static void do_sync_core(void *info)
+{
+ sync_core();
+}
-struct text_poke_params {
- struct text_poke_param *params;
- int nparams;
-};
+static bool bp_patching_in_progress;
+static void *bp_int3_handler, *bp_int3_addr;
-static int __kprobes stop_machine_text_poke(void *data)
+int poke_int3_handler(struct pt_regs *regs)
{
- struct text_poke_params *tpp = data;
- struct text_poke_param *p;
- int i;
+ /* bp_patching_in_progress */
+ smp_rmb();
- if (atomic_dec_and_test(&stop_machine_first)) {
- for (i = 0; i < tpp->nparams; i++) {
- p = &tpp->params[i];
- text_poke(p->addr, p->opcode, p->len);
- }
- smp_wmb(); /* Make sure other cpus see that this has run */
- wrote_text = 1;
- } else {
- while (!wrote_text)
- cpu_relax();
- smp_mb(); /* Load wrote_text before following execution */
- }
+ if (likely(!bp_patching_in_progress))
+ return 0;
- for (i = 0; i < tpp->nparams; i++) {
- p = &tpp->params[i];
- flush_icache_range((unsigned long)p->addr,
- (unsigned long)p->addr + p->len);
- }
- /*
- * Intel Archiecture Software Developer's Manual section 7.1.3 specifies
- * that a core serializing instruction such as "cpuid" should be
- * executed on _each_ core before the new instruction is made visible.
- */
- sync_core();
- return 0;
-}
+ if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr)
+ return 0;
+
+ /* set up the specified breakpoint handler */
+ regs->ip = (unsigned long) bp_int3_handler;
+
+ return 1;
-/**
- * text_poke_smp - Update instructions on a live kernel on SMP
- * @addr: address to modify
- * @opcode: source of the copy
- * @len: length to copy
- *
- * Modify multi-byte instruction by using stop_machine() on SMP. This allows
- * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
- * should be allowed, since stop_machine() does _not_ protect code against
- * NMI and MCE.
- *
- * Note: Must be called under get_online_cpus() and text_mutex.
- */
-void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
-{
- struct text_poke_params tpp;
- struct text_poke_param p;
-
- p.addr = addr;
- p.opcode = opcode;
- p.len = len;
- tpp.params = &p;
- tpp.nparams = 1;
- atomic_set(&stop_machine_first, 1);
- wrote_text = 0;
- /* Use __stop_machine() because the caller already got online_cpus. */
- __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
- return addr;
}
/**
- * text_poke_smp_batch - Update instructions on a live kernel on SMP
- * @params: an array of text_poke parameters
- * @n: the number of elements in params.
+ * text_poke_bp() -- update instructions on live kernel on SMP
+ * @addr: address to patch
+ * @opcode: opcode of new instruction
+ * @len: length to copy
+ * @handler: address to jump to when the temporary breakpoint is hit
*
- * Modify multi-byte instruction by using stop_machine() on SMP. Since the
- * stop_machine() is heavy task, it is better to aggregate text_poke requests
- * and do it once if possible.
+ * Modify multi-byte instruction by using int3 breakpoint on SMP.
+ * We completely avoid stop_machine() here, and achieve the
+ * synchronization using int3 breakpoint.
*
- * Note: Must be called under get_online_cpus() and text_mutex.
+ * The way it is done:
+ * - add a int3 trap to the address that will be patched
+ * - sync cores
+ * - update all but the first byte of the patched range
+ * - sync cores
+ * - replace the first byte (int3) by the first byte of
+ * replacing opcode
+ * - sync cores
+ *
+ * Note: must be called under text_mutex.
*/
-void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
+void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
- struct text_poke_params tpp = {.params = params, .nparams = n};
+ unsigned char int3 = 0xcc;
+
+ bp_int3_handler = handler;
+ bp_int3_addr = (u8 *)addr + sizeof(int3);
+ bp_patching_in_progress = true;
+ /*
+ * Corresponding read barrier in int3 notifier for
+ * making sure the in_progress flags is correctly ordered wrt.
+ * patching
+ */
+ smp_wmb();
+
+ text_poke(addr, &int3, sizeof(int3));
- atomic_set(&stop_machine_first, 1);
- wrote_text = 0;
- __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
+ on_each_cpu(do_sync_core, NULL, 1);
+
+ if (len - sizeof(int3) > 0) {
+ /* patch all but the first byte */
+ text_poke((char *)addr + sizeof(int3),
+ (const char *) opcode + sizeof(int3),
+ len - sizeof(int3));
+ /*
+ * According to Intel, this core syncing is very likely
+ * not necessary and we'd be safe even without it. But
+ * better safe than sorry (plus there's not only Intel).
+ */
+ on_each_cpu(do_sync_core, NULL, 1);
+ }
+
+ /* patch the first byte */
+ text_poke(addr, opcode, sizeof(int3));
+
+ on_each_cpu(do_sync_core, NULL, 1);
+
+ bp_patching_in_progress = false;
+ smp_wmb();
+
+ return addr;
}
+
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index b1e7c7f7a0a..8e3842fc8be 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -477,7 +477,7 @@ error:
/* allocate and map a coherent mapping */
static void *
gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
- gfp_t flag)
+ gfp_t flag, struct dma_attrs *attrs)
{
dma_addr_t paddr;
unsigned long align_mask;
@@ -500,7 +500,8 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
}
__free_pages(page, get_order(size));
} else
- return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
+ return dma_generic_alloc_coherent(dev, size, dma_addr, flag,
+ attrs);
return NULL;
}
@@ -508,10 +509,10 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
/* free a coherent mapping */
static void
gart_free_coherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_addr)
+ dma_addr_t dma_addr, struct dma_attrs *attrs)
{
gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
- free_pages((unsigned long)vaddr, get_order(size));
+ dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
}
static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
@@ -700,8 +701,8 @@ static struct dma_map_ops gart_dma_ops = {
.unmap_sg = gart_unmap_sg,
.map_page = gart_map_page,
.unmap_page = gart_unmap_page,
- .alloc_coherent = gart_alloc_coherent,
- .free_coherent = gart_free_coherent,
+ .alloc = gart_alloc_coherent,
+ .free = gart_free_coherent,
.mapping_error = gart_mapping_error,
};
@@ -767,10 +768,9 @@ int __init gart_iommu_init(void)
aper_base = info.aper_base;
end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
- if (end_pfn > max_low_pfn_mapped) {
- start_pfn = (aper_base>>PAGE_SHIFT);
+ start_pfn = PFN_DOWN(aper_base);
+ if (!pfn_range_is_mapped(start_pfn, end_pfn))
init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
- }
pr_info("PCI-DMA: using GART IOMMU.\n");
iommu_size = check_iommu_size(info.aper_base, aper_size);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index be16854591c..f04dbb3069b 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -2,6 +2,9 @@
* Shared support code for AMD K8 northbridges and derivates.
* Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
@@ -16,12 +19,19 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
{}
};
EXPORT_SYMBOL(amd_nb_misc_ids);
-static struct pci_device_id amd_nb_link_ids[] = {
+static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
{}
};
@@ -75,14 +85,20 @@ int amd_cache_northbridges(void)
next_northbridge(misc, amd_nb_misc_ids);
node_to_amd_nb(i)->link = link =
next_northbridge(link, amd_nb_link_ids);
- }
+ }
- /* some CPU families (e.g. family 0x11) do not support GART */
+ /* GART present only on Fam15h upto model 0fh */
if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
- boot_cpu_data.x86 == 0x15)
+ (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
amd_northbridges.flags |= AMD_NB_GART;
/*
+ * Check for L3 cache presence.
+ */
+ if (!cpuid_edx(0x80000006))
+ return 0;
+
+ /*
* Some CPU families support L3 Cache Index Disable. There are some
* limitations because of E382 and E388 on family 0x10.
*/
@@ -165,7 +181,7 @@ int amd_get_subcaches(int cpu)
return (mask >> (4 * cuid)) & 0xf;
}
-int amd_set_subcaches(int cpu, int mask)
+int amd_set_subcaches(int cpu, unsigned long mask)
{
static unsigned int reset, ban;
struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
@@ -258,7 +274,7 @@ void amd_flush_garts(void)
}
spin_unlock_irqrestore(&gart_lock, flags);
if (!flushed)
- printk("nothing to flush?\n");
+ pr_notice("nothing to flush?\n");
}
EXPORT_SYMBOL_GPL(amd_flush_garts);
@@ -269,11 +285,10 @@ static __init int init_amd_nbs(void)
err = amd_cache_northbridges();
if (err < 0)
- printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
+ pr_notice("Cannot enumerate AMD northbridges\n");
if (amd_cache_gart() < 0)
- printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
- "GART support disabled.\n");
+ pr_notice("Cannot initialize GART flush words, GART support disabled\n");
return err;
}
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index afdc3f756de..af5b08ab3b7 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -40,7 +40,7 @@
#include <asm/fixmap.h>
#include <asm/apb_timer.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
#include <asm/time.h>
#define APBT_CLOCKEVENT_RATING 110
@@ -157,13 +157,13 @@ static int __init apbt_clockevent_register(void)
adev->num = smp_processor_id();
adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
- mrst_timer_options == MRST_TIMER_LAPIC_APBT ?
+ intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ?
APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
adev_virt_addr(adev), 0, apbt_freq);
/* Firmware does EOI handling for us. */
adev->timer->eoi = NULL;
- if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
+ if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
global_clock_event = &adev->timer->ced;
printk(KERN_DEBUG "%s clockevent registered as global\n",
global_clock_event->name);
@@ -240,7 +240,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
dw_apb_clockevent_pause(adev->timer);
if (system_state == SYSTEM_RUNNING) {
pr_debug("skipping APBT CPU %lu offline\n", cpu);
- } else if (adev) {
+ } else {
pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
dw_apb_clockevent_stop(adev->timer);
}
@@ -253,7 +253,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
static __init int apbt_late_init(void)
{
- if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
+ if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ||
!apb_timer_block_enabled)
return 0;
/* This notifier should be called after workqueue is ready */
@@ -311,7 +311,6 @@ void __init apbt_time_init(void)
#ifdef CONFIG_SMP
int i;
struct sfi_timer_table_entry *p_mtmr;
- unsigned int percpu_timer;
struct apbt_dev *adev;
#endif
@@ -341,18 +340,15 @@ void __init apbt_time_init(void)
}
#ifdef CONFIG_SMP
/* kernel cmdline disable apb timer, so we will use lapic timers */
- if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
+ if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
printk(KERN_INFO "apbt: disabled per cpu timer\n");
return;
}
pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
- if (num_possible_cpus() <= sfi_mtimer_num) {
- percpu_timer = 1;
+ if (num_possible_cpus() <= sfi_mtimer_num)
apbt_num_timers_used = num_possible_cpus();
- } else {
- percpu_timer = 0;
+ else
apbt_num_timers_used = 1;
- }
pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
/* here we set up per CPU timer data structure */
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 6e76c191a83..76164e173a2 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -10,6 +10,8 @@
*
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
+#define pr_fmt(fmt) "AGP: " fmt
+
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
@@ -18,9 +20,7 @@
#include <linux/pci_ids.h>
#include <linux/pci.h>
#include <linux/bitops.h>
-#include <linux/ioport.h>
#include <linux/suspend.h>
-#include <linux/kmemleak.h>
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/iommu.h>
@@ -55,18 +55,6 @@ int fallback_aper_force __initdata;
int fix_aperture __initdata = 1;
-static struct resource gart_resource = {
- .name = "GART",
- .flags = IORESOURCE_MEM,
-};
-
-static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
-{
- gart_resource.start = aper_base;
- gart_resource.end = aper_base + aper_size - 1;
- insert_resource(&iomem_resource, &gart_resource);
-}
-
/* This code runs before the PCI subsystem is initialized, so just
access the northbridge directly. */
@@ -88,21 +76,14 @@ static u32 __init allocate_aperture(void)
*/
addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
aper_size, aper_size);
- if (!addr || addr + aper_size > GART_MAX_ADDR) {
- printk(KERN_ERR
- "Cannot allocate aperture memory hole (%lx,%uK)\n",
- addr, aper_size>>10);
+ if (!addr) {
+ pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
return 0;
}
memblock_reserve(addr, aper_size);
- /*
- * Kmemleak should not scan this block as it may not be mapped via the
- * kernel direct mapping.
- */
- kmemleak_ignore(phys_to_virt(addr));
- printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
- aper_size >> 10, addr);
- insert_aperture_resource((u32)addr, aper_size);
+ pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
register_nosave_region(addr >> PAGE_SHIFT,
(addr+aper_size) >> PAGE_SHIFT);
@@ -146,10 +127,11 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
u64 aper;
u32 old_order;
- printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func);
+ pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func);
apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
if (apsizereg == 0xffffffff) {
- printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
+ pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n",
+ bus, slot, func);
return 0;
}
@@ -173,16 +155,18 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
* On some sick chips, APSIZE is 0. It means it wants 4G
* so let double check that order, and lets trust AMD NB settings:
*/
- printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n",
- aper, 32 << old_order);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n",
+ bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1,
+ 32 << old_order);
if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
- printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
- 32 << *order, apsizereg);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n",
+ bus, slot, func, 32 << *order, apsizereg);
*order = old_order;
}
- printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
- aper, 32 << *order, apsizereg);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n",
+ bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1,
+ 32 << *order, apsizereg);
if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
return 0;
@@ -238,7 +222,7 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
}
}
}
- printk(KERN_INFO "No AGP bridge found\n");
+ pr_info("No AGP bridge found\n");
return 0;
}
@@ -330,7 +314,8 @@ void __init early_gart_iommu_check(void)
if (e820_any_mapped(aper_base, aper_base + aper_size,
E820_RAM)) {
/* reserve it, so we can reuse it in second kernel */
- printk(KERN_INFO "update e820 for GART\n");
+ pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n",
+ aper_base, aper_base + aper_size - 1);
e820_add_region(aper_base, aper_size, E820_RESERVED);
update_e820();
}
@@ -374,7 +359,7 @@ int __init gart_iommu_hole_init(void)
!early_pci_allowed())
return -ENODEV;
- printk(KERN_INFO "Checking aperture...\n");
+ pr_info("Checking aperture...\n");
if (!fallback_aper_force)
agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
@@ -415,8 +400,9 @@ int __init gart_iommu_hole_init(void)
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
aper_base <<= 25;
- printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
- node, aper_base, aper_size >> 20);
+ pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n",
+ node, aper_base, aper_base + aper_size - 1,
+ aper_size >> 20);
node++;
if (!aperture_valid(aper_base, aper_size, 64<<20)) {
@@ -427,9 +413,9 @@ int __init gart_iommu_hole_init(void)
if (!no_iommu &&
max_pfn > MAX_DMA32_PFN &&
!printed_gart_size_msg) {
- printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
- printk(KERN_ERR "please increase GART size in your BIOS setup\n");
- printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
+ pr_err("you are using iommu with agp, but GART size is less than 64MB\n");
+ pr_err("please increase GART size in your BIOS setup\n");
+ pr_err("if BIOS doesn't have that option, contact your HW vendor!\n");
printed_gart_size_msg = 1;
}
} else {
@@ -450,12 +436,8 @@ int __init gart_iommu_hole_init(void)
out:
if (!fix && !fallback_aper_force) {
- if (last_aper_base) {
- unsigned long n = (32 * 1024 * 1024) << last_aper_order;
-
- insert_aperture_resource((u32)last_aper_base, n);
+ if (last_aper_base)
return 1;
- }
return 0;
}
@@ -470,13 +452,10 @@ out:
force_iommu ||
valid_agp ||
fallback_aper_force) {
- printk(KERN_INFO
- "Your BIOS doesn't leave a aperture memory hole\n");
- printk(KERN_INFO
- "Please enable the IOMMU option in the BIOS setup\n");
- printk(KERN_INFO
- "This costs you %d MB of RAM\n",
- 32 << fallback_aper_order);
+ pr_info("Your BIOS doesn't leave a aperture memory hole\n");
+ pr_info("Please enable the IOMMU option in the BIOS setup\n");
+ pr_info("This costs you %dMB of RAM\n",
+ 32 << fallback_aper_order);
aper_order = fallback_aper_order;
aper_alloc = allocate_aperture();
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 0ae0323b1f9..dcb5b15401c 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -18,10 +18,7 @@ obj-y += apic_flat_64.o
endif
# APIC probe will depend on the listing order here
-obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
-obj-$(CONFIG_X86_SUMMIT) += summit_32.o
obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
-obj-$(CONFIG_X86_ES7000) += es7000_32.o
# For 32bit, probe_32 need to be listed last
obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2eec05b6d1b..ad28db7e6bd 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -35,6 +35,8 @@
#include <linux/smp.h>
#include <linux/mm.h>
+#include <asm/trace/irq_vectors.h>
+#include <asm/irq_remapping.h>
#include <asm/perf_event.h>
#include <asm/x86_init.h>
#include <asm/pgalloc.h>
@@ -56,10 +58,11 @@
unsigned int num_processors;
-unsigned disabled_cpus __cpuinitdata;
+unsigned disabled_cpus;
/* Processor that is doing the boot up */
unsigned int boot_cpu_physical_apicid = -1U;
+EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
/*
* The highest APIC ID seen during enumeration.
@@ -72,10 +75,17 @@ unsigned int max_physical_apicid;
physid_mask_t phys_cpu_present_map;
/*
+ * Processor to be disabled specified by kernel parameter
+ * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
+ * avoid undefined behaviour caused by sending INIT from AP to BSP.
+ */
+static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID;
+
+/*
* Map cpu index to physical APIC ID
*/
-DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
-DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
@@ -87,23 +97,8 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
* used for the mapping. This is where the behaviors of x86_64 and 32
* actually diverge. Let's keep it ugly for now.
*/
-DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID);
-/*
- * Knob to control our willingness to enable the local APIC.
- *
- * +1=force-enable
- */
-static int force_enable_local_apic __initdata;
-/*
- * APIC command line parameters
- */
-static int __init parse_lapic(char *arg)
-{
- force_enable_local_apic = 1;
- return 0;
-}
-early_param("lapic", parse_lapic);
/* Local APIC was disabled by the BIOS and enabled by the kernel */
static int enabled_via_apicbase;
@@ -132,6 +127,29 @@ static inline void imcr_apic_to_pic(void)
}
#endif
+/*
+ * Knob to control our willingness to enable the local APIC.
+ *
+ * +1=force-enable
+ */
+static int force_enable_local_apic __initdata;
+
+/* Control whether x2APIC mode is enabled or not */
+static bool nox2apic __initdata;
+
+/*
+ * APIC command line parameters
+ */
+static int __init parse_lapic(char *arg)
+{
+ if (config_enabled(CONFIG_X86_32) && !arg)
+ force_enable_local_apic = 1;
+ else if (arg && !strncmp(arg, "notscdeadline", 13))
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return 0;
+}
+early_param("lapic", parse_lapic);
+
#ifdef CONFIG_X86_64
static int apic_calibrate_pmtmr __initdata;
static __init int setup_apicpmtimer(char *s)
@@ -148,8 +166,7 @@ int x2apic_mode;
/* x2apic enabled before OS handover */
int x2apic_preenabled;
static int x2apic_disabled;
-static int nox2apic;
-static __init int setup_nox2apic(char *str)
+static int __init setup_nox2apic(char *str)
{
if (x2apic_enabled()) {
int apicid = native_apic_msr_read(APIC_ID);
@@ -164,7 +181,7 @@ static __init int setup_nox2apic(char *str)
} else
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
- nox2apic = 1;
+ nox2apic = true;
return 0;
}
@@ -269,8 +286,12 @@ u32 native_safe_apic_wait_icr_idle(void)
void native_apic_icr_write(u32 low, u32 id)
{
+ unsigned long flags;
+
+ local_irq_save(flags);
apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
apic_write(APIC_ICR, low);
+ local_irq_restore(flags);
}
u64 native_apic_icr_read(void)
@@ -314,6 +335,7 @@ int lapic_get_maxlvt(void)
/* Clock divisor */
#define APIC_DIVISOR 16
+#define TSC_DIVISOR 32
/*
* This function sets up the local APIC timer, with a timeout of
@@ -332,6 +354,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
lvtt_value = LOCAL_TIMER_VECTOR;
if (!oneshot)
lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+ else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE;
+
if (!lapic_is_integrated())
lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
@@ -340,6 +365,11 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
apic_write(APIC_LVTT, lvtt_value);
+ if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) {
+ printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
+ return;
+ }
+
/*
* Divide PICLK by 16
*/
@@ -383,20 +413,25 @@ static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
{
- unsigned int rsvd; /* 0: uninitialized */
+ unsigned int rsvd, vector;
if (offset >= APIC_EILVT_NR_MAX)
return ~0;
- rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
+ rsvd = atomic_read(&eilvt_offsets[offset]);
do {
- if (rsvd &&
- !eilvt_entry_is_changeable(rsvd, new))
+ vector = rsvd & ~APIC_EILVT_MASKED; /* 0: unassigned */
+ if (vector && !eilvt_entry_is_changeable(vector, new))
/* may not change if vectors are different */
return rsvd;
rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
} while (rsvd != new);
+ rsvd &= ~APIC_EILVT_MASKED;
+ if (rsvd && rsvd != vector)
+ pr_info("LVT offset %d assigned for vector 0x%02x\n",
+ offset, rsvd);
+
return new;
}
@@ -447,6 +482,16 @@ static int lapic_next_event(unsigned long delta,
return 0;
}
+static int lapic_next_deadline(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ u64 tsc;
+
+ rdtscll(tsc);
+ wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
+ return 0;
+}
+
/*
* Setup the lapic timer in periodic or oneshot mode
*/
@@ -514,7 +559,7 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
* Setup the local APIC timer for this CPU. Copy the initialized values
* of the boot CPU and register the clock event in the framework.
*/
-static void __cpuinit setup_APIC_timer(void)
+static void setup_APIC_timer(void)
{
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
@@ -527,7 +572,15 @@ static void __cpuinit setup_APIC_timer(void)
memcpy(levt, &lapic_clockevent, sizeof(*levt));
levt->cpumask = cpumask_of(smp_processor_id());
- clockevents_register_device(levt);
+ if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
+ levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_DUMMY);
+ levt->set_next_event = lapic_next_deadline;
+ clockevents_config_and_register(levt,
+ (tsc_khz / TSC_DIVISOR) * 1000,
+ 0xF, ~0UL);
+ } else
+ clockevents_register_device(levt);
}
/*
@@ -655,7 +708,9 @@ static int __init calibrate_APIC_clock(void)
* in the clockevent structure and return.
*/
- if (lapic_timer_frequency) {
+ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
+ return 0;
+ } else if (lapic_timer_frequency) {
apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n",
lapic_timer_frequency);
lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR,
@@ -668,6 +723,9 @@ static int __init calibrate_APIC_clock(void)
return 0;
}
+ apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+ "calibrating APIC timer ...\n");
+
local_irq_disable();
/* Replace the global interrupt handler */
@@ -805,9 +863,6 @@ void __init setup_boot_APIC_clock(void)
return;
}
- apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
- "calibrating APIC timer ...\n");
-
if (calibrate_APIC_clock()) {
/* No broadcast on UP ! */
if (num_possible_cpus() > 1)
@@ -826,7 +881,7 @@ void __init setup_boot_APIC_clock(void)
setup_APIC_timer();
}
-void __cpuinit setup_secondary_APIC_clock(void)
+void setup_secondary_APIC_clock(void)
{
setup_APIC_timer();
}
@@ -873,24 +928,42 @@ static void local_apic_timer_interrupt(void)
* [ if a single-CPU system runs an SMP kernel then we call the local
* interrupt as well. Thus we cannot inline the local irq ... ]
*/
-void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
/*
* NOTE! We'd better ACK the irq immediately,
* because timer handling can be slow.
+ *
+ * update_process_times() expects us to have done irq_enter().
+ * Besides, if we don't timer interrupts ignore the global
+ * interrupt lock, which is the WrongThing (tm) to do.
*/
- ack_APIC_irq();
+ entering_ack_irq();
+ local_apic_timer_interrupt();
+ exiting_irq();
+
+ set_irq_regs(old_regs);
+}
+
+__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
/*
+ * NOTE! We'd better ACK the irq immediately,
+ * because timer handling can be slow.
+ *
* update_process_times() expects us to have done irq_enter().
* Besides, if we don't timer interrupts ignore the global
* interrupt lock, which is the WrongThing (tm) to do.
*/
- irq_enter();
- exit_idle();
+ entering_ack_irq();
+ trace_local_timer_entry(LOCAL_TIMER_VECTOR);
local_apic_timer_interrupt();
- irq_exit();
+ trace_local_timer_exit(LOCAL_TIMER_VECTOR);
+ exiting_irq();
set_irq_regs(old_regs);
}
@@ -1171,7 +1244,7 @@ void __init init_bsp_APIC(void)
apic_write(APIC_LVT1, value);
}
-static void __cpuinit lapic_setup_esr(void)
+static void lapic_setup_esr(void)
{
unsigned int oldvalue, value, maxlvt;
@@ -1218,7 +1291,7 @@ static void __cpuinit lapic_setup_esr(void)
* Used to setup local APIC while initializing BSP or bringin up APs.
* Always called with preemption disabled.
*/
-void __cpuinit setup_local_APIC(void)
+void setup_local_APIC(void)
{
int cpu = smp_processor_id();
unsigned int value, queued;
@@ -1320,11 +1393,13 @@ void __cpuinit setup_local_APIC(void)
acked);
break;
}
- if (cpu_has_tsc) {
- rdtscll(ntsc);
- max_loops = (cpu_khz << 10) - (ntsc - tsc);
- } else
- max_loops--;
+ if (queued) {
+ if (cpu_has_tsc) {
+ rdtscll(ntsc);
+ max_loops = (cpu_khz << 10) - (ntsc - tsc);
+ } else
+ max_loops--;
+ }
} while (queued && max_loops > 0);
WARN_ON(max_loops <= 0);
@@ -1411,7 +1486,7 @@ void __cpuinit setup_local_APIC(void)
#endif
}
-void __cpuinit end_local_APIC_setup(void)
+void end_local_APIC_setup(void)
{
lapic_setup_esr();
@@ -1436,8 +1511,7 @@ void __init bsp_end_local_APIC_setup(void)
* Now that local APIC setup is completed for BP, configure the fault
* handling for interrupt remapping.
*/
- if (intr_remapping_enabled)
- enable_drhd_fault_handling();
+ irq_remap_enable_fault_handling();
}
@@ -1512,7 +1586,7 @@ void enable_x2apic(void)
int __init enable_IR(void)
{
#ifdef CONFIG_IRQ_REMAP
- if (!intr_remapping_supported()) {
+ if (!irq_remapping_supported()) {
pr_debug("intr-remapping not supported\n");
return -1;
}
@@ -1523,7 +1597,7 @@ int __init enable_IR(void)
return -1;
}
- return enable_intr_remapping();
+ return irq_remapping_enable();
#endif
return -1;
}
@@ -1532,10 +1606,13 @@ void __init enable_IR_x2apic(void)
{
unsigned long flags;
int ret, x2apic_enabled = 0;
- int dmar_table_init_ret;
+ int hardware_init_ret;
+
+ /* Make sure irq_remap_ops are initialized */
+ setup_irq_remapping_ops();
- dmar_table_init_ret = dmar_table_init();
- if (dmar_table_init_ret && !x2apic_supported())
+ hardware_init_ret = irq_remapping_prepare();
+ if (hardware_init_ret && !x2apic_supported())
return;
ret = save_ioapic_entries();
@@ -1551,7 +1628,7 @@ void __init enable_IR_x2apic(void)
if (x2apic_preenabled && nox2apic)
disable_x2apic();
- if (dmar_table_init_ret)
+ if (hardware_init_ret)
ret = -1;
else
ret = enable_IR();
@@ -1632,9 +1709,11 @@ static int __init apic_verify(void)
mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
/* The BIOS may have set up the APIC at some other address */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (l & MSR_IA32_APICBASE_ENABLE)
- mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (l & MSR_IA32_APICBASE_ENABLE)
+ mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+ }
pr_info("Found and enabled local APIC!\n");
return 0;
@@ -1652,13 +1731,15 @@ int __init apic_force_enable(unsigned long addr)
* MSR. This can only be done in software for Intel P6 or later
* and AMD K7 (Model > 1) or later.
*/
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (!(l & MSR_IA32_APICBASE_ENABLE)) {
- pr_info("Local APIC disabled by BIOS -- reenabling.\n");
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
- enabled_via_apicbase = 1;
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+ pr_info("Local APIC disabled by BIOS -- reenabling.\n");
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ enabled_via_apicbase = 1;
+ }
}
return apic_verify();
}
@@ -1860,12 +1941,10 @@ int __init APIC_init_uniprocessor(void)
/*
* This interrupt should _never_ happen with our APIC/SMP architecture
*/
-void smp_spurious_interrupt(struct pt_regs *regs)
+static inline void __smp_spurious_interrupt(void)
{
u32 v;
- irq_enter();
- exit_idle();
/*
* Check if this really is a spurious interrupt and ACK it
* if it is a vectored one. Just in case...
@@ -1880,15 +1959,30 @@ void smp_spurious_interrupt(struct pt_regs *regs)
/* see sw-dev-man vol 3, chapter 7.4.13.5 */
pr_info("spurious APIC interrupt on CPU#%d, "
"should never happen.\n", smp_processor_id());
- irq_exit();
+}
+
+__visible void smp_spurious_interrupt(struct pt_regs *regs)
+{
+ entering_irq();
+ __smp_spurious_interrupt();
+ exiting_irq();
+}
+
+__visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
+{
+ entering_irq();
+ trace_spurious_apic_entry(SPURIOUS_APIC_VECTOR);
+ __smp_spurious_interrupt();
+ trace_spurious_apic_exit(SPURIOUS_APIC_VECTOR);
+ exiting_irq();
}
/*
* This interrupt should never happen with our APIC/SMP architecture
*/
-void smp_error_interrupt(struct pt_regs *regs)
+static inline void __smp_error_interrupt(struct pt_regs *regs)
{
- u32 v0, v1;
+ u32 v;
u32 i = 0;
static const char * const error_interrupt_reason[] = {
"Send CS error", /* APIC Error Bit 0 */
@@ -1901,29 +1995,42 @@ void smp_error_interrupt(struct pt_regs *regs)
"Illegal register address", /* APIC Error Bit 7 */
};
- irq_enter();
- exit_idle();
/* First tickle the hardware, only then report what went on. -- REW */
- v0 = apic_read(APIC_ESR);
- apic_write(APIC_ESR, 0);
- v1 = apic_read(APIC_ESR);
+ if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ v = apic_read(APIC_ESR);
ack_APIC_irq();
atomic_inc(&irq_err_count);
- apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
- smp_processor_id(), v0 , v1);
+ apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x",
+ smp_processor_id(), v);
- v1 = v1 & 0xff;
- while (v1) {
- if (v1 & 0x1)
+ v &= 0xff;
+ while (v) {
+ if (v & 0x1)
apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
i++;
- v1 >>= 1;
- };
+ v >>= 1;
+ }
apic_printk(APIC_DEBUG, KERN_CONT "\n");
- irq_exit();
+}
+
+__visible void smp_error_interrupt(struct pt_regs *regs)
+{
+ entering_irq();
+ __smp_error_interrupt(regs);
+ exiting_irq();
+}
+
+__visible void smp_trace_error_interrupt(struct pt_regs *regs)
+{
+ entering_irq();
+ trace_error_apic_entry(ERROR_APIC_VECTOR);
+ __smp_error_interrupt(regs);
+ trace_error_apic_exit(ERROR_APIC_VECTOR);
+ exiting_irq();
}
/**
@@ -2015,13 +2122,45 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT1, value);
}
-void __cpuinit generic_processor_info(int apicid, int version)
+int generic_processor_info(int apicid, int version)
{
int cpu, max = nr_cpu_ids;
bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
phys_cpu_present_map);
/*
+ * boot_cpu_physical_apicid is designed to have the apicid
+ * returned by read_apic_id(), i.e, the apicid of the
+ * currently booting-up processor. However, on some platforms,
+ * it is temporarily modified by the apicid reported as BSP
+ * through MP table. Concretely:
+ *
+ * - arch/x86/kernel/mpparse.c: MP_processor_info()
+ * - arch/x86/mm/amdtopology.c: amd_numa_init()
+ *
+ * This function is executed with the modified
+ * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel
+ * parameter doesn't work to disable APs on kdump 2nd kernel.
+ *
+ * Since fixing handling of boot_cpu_physical_apicid requires
+ * another discussion and tests on each platform, we leave it
+ * for now and here we use read_apic_id() directly in this
+ * function, generic_processor_info().
+ */
+ if (disabled_cpu_apicid != BAD_APICID &&
+ disabled_cpu_apicid != read_apic_id() &&
+ disabled_cpu_apicid == apicid) {
+ int thiscpu = num_processors + disabled_cpus;
+
+ pr_warning("APIC: Disabling requested cpu."
+ " Processor %d/0x%x ignored.\n",
+ thiscpu, apicid);
+
+ disabled_cpus++;
+ return -ENODEV;
+ }
+
+ /*
* If boot cpu has not been detected yet, then only allow upto
* nr_cpu_ids - 1 processors and keep one slot free for boot cpu
*/
@@ -2035,7 +2174,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
" Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
disabled_cpus++;
- return;
+ return -ENODEV;
}
if (num_processors >= nr_cpu_ids) {
@@ -2046,7 +2185,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
" Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
disabled_cpus++;
- return;
+ return -EINVAL;
}
num_processors++;
@@ -2091,6 +2230,8 @@ void __cpuinit generic_processor_info(int apicid, int version)
#endif
set_cpu_possible(cpu, true);
set_cpu_present(cpu, true);
+
+ return cpu;
}
int hard_smp_processor_id(void)
@@ -2108,6 +2249,42 @@ void default_init_apic_ldr(void)
apic_write(APIC_LDR, val);
}
+int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask,
+ unsigned int *apicid)
+{
+ unsigned int cpu;
+
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+
+ if (likely(cpu < nr_cpu_ids)) {
+ *apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+/*
+ * Override the generic EOI implementation with an optimized version.
+ * Only called during early boot when only one CPU is active and with
+ * interrupts disabled, so we know this does not race with actual APIC driver
+ * use.
+ */
+void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
+{
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ /* Should happen once for each apic */
+ WARN_ON((*drv)->eoi_write == eoi_write);
+ (*drv)->eoi_write = eoi_write;
+ }
+}
+
/*
* Power management
*/
@@ -2167,8 +2344,7 @@ static int lapic_suspend(void)
local_irq_save(flags);
disable_local_APIC();
- if (intr_remapping_enabled)
- disable_intr_remapping();
+ irq_remapping_disable();
local_irq_restore(flags);
return 0;
@@ -2184,16 +2360,15 @@ static void lapic_resume(void)
return;
local_irq_save(flags);
- if (intr_remapping_enabled) {
- /*
- * IO-APIC and PIC have their own resume routines.
- * We just mask them here to make sure the interrupt
- * subsystem is completely quiet while we enable x2apic
- * and interrupt-remapping.
- */
- mask_ioapic_entries();
- legacy_pic->mask_all();
- }
+
+ /*
+ * IO-APIC and PIC have their own resume routines.
+ * We just mask them here to make sure the interrupt
+ * subsystem is completely quiet while we enable x2apic
+ * and interrupt-remapping.
+ */
+ mask_ioapic_entries();
+ legacy_pic->mask_all();
if (x2apic_mode)
enable_x2apic();
@@ -2204,10 +2379,12 @@ static void lapic_resume(void)
* FIXME! This will be wrong if we ever support suspend on
* SMP! We'll need to do this as part of the CPU restore!
*/
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ }
}
maxlvt = lapic_get_maxlvt();
@@ -2219,7 +2396,7 @@ static void lapic_resume(void)
apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
+#if defined(CONFIG_X86_MCE_INTEL)
if (maxlvt >= 5)
apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
#endif
@@ -2234,8 +2411,7 @@ static void lapic_resume(void)
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
- if (intr_remapping_enabled)
- reenable_intr_remapping(x2apic_mode);
+ irq_remapping_reenable(x2apic_mode);
local_irq_restore(flags);
}
@@ -2250,7 +2426,7 @@ static struct syscore_ops lapic_syscore_ops = {
.suspend = lapic_suspend,
};
-static void __cpuinit apic_pm_activate(void)
+static void apic_pm_activate(void)
{
apic_pm_state.active = 1;
}
@@ -2275,7 +2451,7 @@ static void apic_pm_activate(void) { }
#ifdef CONFIG_X86_64
-static int __cpuinit apic_cluster_num(void)
+static int apic_cluster_num(void)
{
int i, clusters, zeros;
unsigned id;
@@ -2320,10 +2496,10 @@ static int __cpuinit apic_cluster_num(void)
return clusters;
}
-static int __cpuinitdata multi_checked;
-static int __cpuinitdata multi;
+static int multi_checked;
+static int multi;
-static int __cpuinit set_multi(const struct dmi_system_id *d)
+static int set_multi(const struct dmi_system_id *d)
{
if (multi)
return 0;
@@ -2332,7 +2508,7 @@ static int __cpuinit set_multi(const struct dmi_system_id *d)
return 0;
}
-static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
+static const struct dmi_system_id multi_dmi_table[] = {
{
.callback = set_multi,
.ident = "IBM System Summit2",
@@ -2344,7 +2520,7 @@ static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
{}
};
-static void __cpuinit dmi_check_multi(void)
+static void dmi_check_multi(void)
{
if (multi_checked)
return;
@@ -2361,7 +2537,7 @@ static void __cpuinit dmi_check_multi(void)
* multi-chassis.
* Use DMI to check them
*/
-__cpuinit int apic_is_clustered_box(void)
+int apic_is_clustered_box(void)
{
dmi_check_multi();
if (multi)
@@ -2462,3 +2638,12 @@ static int __init lapic_insert_resource(void)
* that is using request_resource
*/
late_initcall(lapic_insert_resource);
+
+static int __init apic_set_disabled_cpu_apicid(char *arg)
+{
+ if (!arg || !get_option(&arg, &disabled_cpu_apicid))
+ return -EINVAL;
+
+ return 0;
+}
+early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 8c3cdded6f2..7c1b2947951 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -14,16 +14,13 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
-#include <linux/init.h>
#include <linux/hardirq.h>
#include <linux/module.h>
#include <asm/smp.h>
#include <asm/apic.h>
#include <asm/ipi.h>
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
+#include <linux/acpi.h>
static struct apic apic_physflat;
static struct apic apic_flat;
@@ -36,25 +33,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 1;
}
-static const struct cpumask *flat_target_cpus(void)
-{
- return cpu_online_mask;
-}
-
-static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt desitination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
/*
* Set up the logical destination ID.
*
@@ -92,7 +70,7 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
}
static void
- flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
+flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
{
unsigned long mask = cpumask_bits(cpumask)[0];
int cpu = smp_processor_id();
@@ -180,12 +158,13 @@ static struct apic apic_flat = {
.name = "flat",
.probe = flat_probe,
.acpi_madt_oem_check = flat_acpi_madt_oem_check,
+ .apic_id_valid = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
.irq_dest_mode = 1, /* logical */
- .target_cpus = flat_target_cpus,
+ .target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
@@ -209,8 +188,7 @@ static struct apic apic_flat = {
.set_apic_id = set_apic_id,
.apic_id_mask = 0xFFu << 24,
- .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
.send_IPI_mask = flat_send_IPI_mask,
.send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
@@ -220,12 +198,13 @@ static struct apic apic_flat = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
@@ -260,17 +239,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
-static const struct cpumask *physflat_target_cpus(void)
-{
- return cpu_online_mask;
-}
-
-static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
{
default_send_IPI_mask_sequence_phys(cpumask, vector);
@@ -292,38 +260,6 @@ static void physflat_send_IPI_all(int vector)
physflat_send_IPI_mask(cpu_online_mask, vector);
}
-static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- cpu = cpumask_first(cpumask);
- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
- else
- return BAD_APICID;
-}
-
-static unsigned int
-physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
- return per_cpu(x86_cpu_to_apicid, cpu);
-}
-
static int physflat_probe(void)
{
if (apic == &apic_physflat || num_possible_cpus() > 8)
@@ -337,18 +273,19 @@ static struct apic apic_physflat = {
.name = "physical flat",
.probe = physflat_probe,
.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
+ .apic_id_valid = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 0, /* physical */
- .target_cpus = physflat_target_cpus,
+ .target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
.check_apicid_present = NULL,
- .vector_allocation_domain = physflat_vector_allocation_domain,
+ .vector_allocation_domain = default_vector_allocation_domain,
/* not needed, but shouldn't hurt: */
.init_apic_ldr = flat_init_apic_ldr,
@@ -367,8 +304,7 @@ static struct apic apic_physflat = {
.set_apic_id = set_apic_id,
.apic_id_mask = 0xFFu << 24,
- .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
.send_IPI_mask = physflat_send_IPI_mask,
.send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
@@ -378,12 +314,13 @@ static struct apic apic_physflat = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 775b82bc655..8c7c98249c2 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -15,7 +15,6 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
-#include <linux/init.h>
#include <linux/errno.h>
#include <asm/fixmap.h>
#include <asm/mpspec.h>
@@ -100,12 +99,12 @@ static unsigned long noop_check_apicid_present(int bit)
return physid_isset(bit, phys_cpu_present_map);
}
-static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask,
+ const struct cpumask *mask)
{
if (cpu != 0)
pr_warning("APIC: Vector allocated for non-BSP cpu\n");
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
+ cpumask_copy(retmask, cpumask_of(cpu));
}
static u32 noop_apic_read(u32 reg)
@@ -124,6 +123,7 @@ struct apic apic_noop = {
.probe = noop_probe,
.acpi_madt_oem_check = NULL,
+ .apic_id_valid = default_apic_id_valid,
.apic_id_registered = noop_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
@@ -158,8 +158,7 @@ struct apic apic_noop = {
.set_apic_id = NULL,
.apic_id_mask = 0x0F << 24,
- .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
.send_IPI_mask = noop_send_IPI_mask,
.send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
@@ -173,13 +172,13 @@ struct apic apic_noop = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
-
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
.read = noop_apic_read,
.write = noop_apic_write,
+ .eoi_write = noop_apic_write,
.icr_read = noop_apic_icr_read,
.icr_write = noop_apic_icr_write,
.wait_icr_idle = noop_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 09d3d8c1cd9..a5b45df8bc8 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -22,15 +22,17 @@
#include <linux/hardirq.h>
#include <linux/delay.h>
+#include <asm/numachip/numachip.h>
#include <asm/numachip/numachip_csr.h>
#include <asm/smp.h>
#include <asm/apic.h>
#include <asm/ipi.h>
#include <asm/apic_flat_64.h>
+#include <asm/pgtable.h>
static int numachip_system __read_mostly;
-static struct apic apic_numachip __read_mostly;
+static const struct apic apic_numachip __read_mostly;
static unsigned int get_apic_id(unsigned long x)
{
@@ -56,6 +58,12 @@ static unsigned int read_xapic_id(void)
return get_apic_id(apic_read(APIC_ID));
}
+static int numachip_apic_id_valid(int apicid)
+{
+ /* Trust what bootloader passes in MADT */
+ return 1;
+}
+
static int numachip_apic_id_registered(void)
{
return physid_isset(read_xapic_id(), phys_cpu_present_map);
@@ -66,18 +74,7 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
return initial_apic_id >> index_msb;
}
-static const struct cpumask *numachip_target_cpus(void)
-{
- return cpu_online_mask;
-}
-
-static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
-static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
{
union numachip_csr_g3_ext_irq_gen int_gen;
@@ -151,38 +148,6 @@ static void numachip_send_IPI_self(int vector)
__default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
}
-static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- cpu = cpumask_first(cpumask);
- if (likely((unsigned)cpu < nr_cpu_ids))
- return per_cpu(x86_cpu_to_apicid, cpu);
-
- return BAD_APICID;
-}
-
-static unsigned int
-numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
- return per_cpu(x86_cpu_to_apicid, cpu);
-}
-
static int __init numachip_probe(void)
{
return apic == &apic_numachip;
@@ -201,8 +166,11 @@ static void __init map_csrs(void)
static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
{
- c->phys_proc_id = node;
- per_cpu(cpu_llc_id, smp_processor_id()) = node;
+
+ if (c->phys_proc_id != node) {
+ c->phys_proc_id = node;
+ per_cpu(cpu_llc_id, smp_processor_id()) = node;
+ }
}
static int __init numachip_system_init(void)
@@ -213,6 +181,7 @@ static int __init numachip_system_init(void)
return 0;
x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+ x86_init.pci.arch_init = pci_numachip_init;
map_csrs();
@@ -233,23 +202,24 @@ static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
-static struct apic apic_numachip __refconst = {
+static const struct apic apic_numachip __refconst = {
.name = "NumaConnect system",
.probe = numachip_probe,
.acpi_madt_oem_check = numachip_acpi_madt_oem_check,
+ .apic_id_valid = numachip_apic_id_valid,
.apic_id_registered = numachip_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 0, /* physical */
- .target_cpus = numachip_target_cpus,
+ .target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
.check_apicid_present = NULL,
- .vector_allocation_domain = numachip_vector_allocation_domain,
+ .vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = flat_init_apic_ldr,
.ioapic_phys_id_map = NULL,
@@ -267,8 +237,7 @@ static struct apic apic_numachip __refconst = {
.set_apic_id = set_apic_id,
.apic_id_mask = 0xffU << 24,
- .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
.send_IPI_mask = numachip_send_IPI_mask,
.send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
@@ -279,12 +248,13 @@ static struct apic apic_numachip __refconst = {
.wakeup_secondary_cpu = numachip_wakeup_secondary,
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL, /* REMRD not supported */
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 521bead0113..e4840aa7a25 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -26,15 +26,6 @@ static int bigsmp_apic_id_registered(void)
return 1;
}
-static const struct cpumask *bigsmp_target_cpus(void)
-{
-#ifdef CONFIG_SMP
- return cpu_online_mask;
-#else
- return cpumask_of(0);
-#endif
-}
-
static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
{
return 0;
@@ -105,32 +96,6 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
return 1;
}
-/* As we are using single CPU as destination, pick only one CPU here */
-static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- int cpu = cpumask_first(cpumask);
-
- if (cpu < nr_cpu_ids)
- return cpu_physical_id(cpu);
- return BAD_APICID;
-}
-
-static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- return cpu_physical_id(cpu);
- }
- return BAD_APICID;
-}
-
static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
{
return cpuid_apic >> index_msb;
@@ -177,12 +142,6 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
{ } /* NULL entry stops DMI scanning */
};
-static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
static int probe_bigsmp(void)
{
if (def_to_bigsmp)
@@ -198,19 +157,20 @@ static struct apic apic_bigsmp = {
.name = "bigsmp",
.probe = probe_bigsmp,
.acpi_madt_oem_check = NULL,
+ .apic_id_valid = default_apic_id_valid,
.apic_id_registered = bigsmp_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
/* phys delivery to target CPU: */
.irq_dest_mode = 0,
- .target_cpus = bigsmp_target_cpus,
+ .target_cpus = default_target_cpus,
.disable_esr = 1,
.dest_logical = 0,
.check_apicid_used = bigsmp_check_apicid_used,
.check_apicid_present = bigsmp_check_apicid_present,
- .vector_allocation_domain = bigsmp_vector_allocation_domain,
+ .vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = bigsmp_init_apic_ldr,
.ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
@@ -228,8 +188,7 @@ static struct apic apic_bigsmp = {
.set_apic_id = NULL,
.apic_id_mask = 0xFF << 24,
- .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
.send_IPI_mask = bigsmp_send_IPI_mask,
.send_IPI_mask_allbutself = NULL,
@@ -240,13 +199,13 @@ static struct apic apic_bigsmp = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
+ .wait_for_init_deassert = true,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
deleted file mode 100644
index 5d513bc47b6..00000000000
--- a/arch/x86/kernel/apic/es7000_32.c
+++ /dev/null
@@ -1,755 +0,0 @@
-/*
- * Written by: Garry Forsgren, Unisys Corporation
- * Natalie Protasevich, Unisys Corporation
- *
- * This file contains the code to configure and interface
- * with Unisys ES7000 series hardware system manager.
- *
- * Copyright (c) 2003 Unisys Corporation.
- * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
- *
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Unisys Corporation, Township Line & Union Meeting
- * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
- *
- * http://www.unisys.com
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/notifier.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/threads.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/reboot.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/acpi.h>
-#include <linux/init.h>
-#include <linux/gfp.h>
-#include <linux/nmi.h>
-#include <linux/smp.h>
-#include <linux/io.h>
-
-#include <asm/apicdef.h>
-#include <linux/atomic.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
-
-/*
- * ES7000 chipsets
- */
-
-#define NON_UNISYS 0
-#define ES7000_CLASSIC 1
-#define ES7000_ZORRO 2
-
-#define MIP_REG 1
-#define MIP_PSAI_REG 4
-
-#define MIP_BUSY 1
-#define MIP_SPIN 0xf0000
-#define MIP_VALID 0x0100000000000000ULL
-#define MIP_SW_APIC 0x1020b
-
-#define MIP_PORT(val) ((val >> 32) & 0xffff)
-
-#define MIP_RD_LO(val) (val & 0xffffffff)
-
-struct mip_reg {
- unsigned long long off_0x00;
- unsigned long long off_0x08;
- unsigned long long off_0x10;
- unsigned long long off_0x18;
- unsigned long long off_0x20;
- unsigned long long off_0x28;
- unsigned long long off_0x30;
- unsigned long long off_0x38;
-};
-
-struct mip_reg_info {
- unsigned long long mip_info;
- unsigned long long delivery_info;
- unsigned long long host_reg;
- unsigned long long mip_reg;
-};
-
-struct psai {
- unsigned long long entry_type;
- unsigned long long addr;
- unsigned long long bep_addr;
-};
-
-#ifdef CONFIG_ACPI
-
-struct es7000_oem_table {
- struct acpi_table_header Header;
- u32 OEMTableAddr;
- u32 OEMTableSize;
-};
-
-static unsigned long oem_addrX;
-static unsigned long oem_size;
-
-#endif
-
-/*
- * ES7000 Globals
- */
-
-static volatile unsigned long *psai;
-static struct mip_reg *mip_reg;
-static struct mip_reg *host_reg;
-static int mip_port;
-static unsigned long mip_addr;
-static unsigned long host_addr;
-
-int es7000_plat;
-
-/*
- * GSI override for ES7000 platforms.
- */
-
-
-static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
-{
- unsigned long vect = 0, psaival = 0;
-
- if (psai == NULL)
- return -1;
-
- vect = ((unsigned long)__pa(eip)/0x1000) << 16;
- psaival = (0x1000000 | vect | cpu);
-
- while (*psai & 0x1000000)
- ;
-
- *psai = psaival;
-
- return 0;
-}
-
-static int es7000_apic_is_cluster(void)
-{
- /* MPENTIUMIII */
- if (boot_cpu_data.x86 == 6 &&
- (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11))
- return 1;
-
- return 0;
-}
-
-static void setup_unisys(void)
-{
- /*
- * Determine the generation of the ES7000 currently running.
- *
- * es7000_plat = 1 if the machine is a 5xx ES7000 box
- * es7000_plat = 2 if the machine is a x86_64 ES7000 box
- *
- */
- if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
- es7000_plat = ES7000_ZORRO;
- else
- es7000_plat = ES7000_CLASSIC;
-}
-
-/*
- * Parse the OEM Table:
- */
-static int parse_unisys_oem(char *oemptr)
-{
- int i;
- int success = 0;
- unsigned char type, size;
- unsigned long val;
- char *tp = NULL;
- struct psai *psaip = NULL;
- struct mip_reg_info *mi;
- struct mip_reg *host, *mip;
-
- tp = oemptr;
-
- tp += 8;
-
- for (i = 0; i <= 6; i++) {
- type = *tp++;
- size = *tp++;
- tp -= 2;
- switch (type) {
- case MIP_REG:
- mi = (struct mip_reg_info *)tp;
- val = MIP_RD_LO(mi->host_reg);
- host_addr = val;
- host = (struct mip_reg *)val;
- host_reg = __va(host);
- val = MIP_RD_LO(mi->mip_reg);
- mip_port = MIP_PORT(mi->mip_info);
- mip_addr = val;
- mip = (struct mip_reg *)val;
- mip_reg = __va(mip);
- pr_debug("host_reg = 0x%lx\n",
- (unsigned long)host_reg);
- pr_debug("mip_reg = 0x%lx\n",
- (unsigned long)mip_reg);
- success++;
- break;
- case MIP_PSAI_REG:
- psaip = (struct psai *)tp;
- if (tp != NULL) {
- if (psaip->addr)
- psai = __va(psaip->addr);
- else
- psai = NULL;
- success++;
- }
- break;
- default:
- break;
- }
- tp += size;
- }
-
- if (success < 2)
- es7000_plat = NON_UNISYS;
- else
- setup_unisys();
-
- return es7000_plat;
-}
-
-#ifdef CONFIG_ACPI
-static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
-{
- struct acpi_table_header *header = NULL;
- struct es7000_oem_table *table;
- acpi_size tbl_size;
- acpi_status ret;
- int i = 0;
-
- for (;;) {
- ret = acpi_get_table_with_size("OEM1", i++, &header, &tbl_size);
- if (!ACPI_SUCCESS(ret))
- return -1;
-
- if (!memcmp((char *) &header->oem_id, "UNISYS", 6))
- break;
-
- early_acpi_os_unmap_memory(header, tbl_size);
- }
-
- table = (void *)header;
-
- oem_addrX = table->OEMTableAddr;
- oem_size = table->OEMTableSize;
-
- early_acpi_os_unmap_memory(header, tbl_size);
-
- *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, oem_size);
-
- return 0;
-}
-
-static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
-{
- if (!oem_addr)
- return;
-
- __acpi_unmap_table((char *)oem_addr, oem_size);
-}
-
-static int es7000_check_dsdt(void)
-{
- struct acpi_table_header header;
-
- if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
- !strncmp(header.oem_id, "UNISYS", 6))
- return 1;
- return 0;
-}
-
-static int es7000_acpi_ret;
-
-/* Hook from generic ACPI tables.c */
-static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- unsigned long oem_addr = 0;
- int check_dsdt;
- int ret = 0;
-
- /* check dsdt at first to avoid clear fix_map for oem_addr */
- check_dsdt = es7000_check_dsdt();
-
- if (!find_unisys_acpi_oem_table(&oem_addr)) {
- if (check_dsdt) {
- ret = parse_unisys_oem((char *)oem_addr);
- } else {
- setup_unisys();
- ret = 1;
- }
- /*
- * we need to unmap it
- */
- unmap_unisys_acpi_oem_table(oem_addr);
- }
-
- es7000_acpi_ret = ret;
-
- return ret && !es7000_apic_is_cluster();
-}
-
-static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
-{
- int ret = es7000_acpi_ret;
-
- return ret && es7000_apic_is_cluster();
-}
-
-#else /* !CONFIG_ACPI: */
-static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- return 0;
-}
-
-static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
-{
- return 0;
-}
-#endif /* !CONFIG_ACPI */
-
-static void es7000_spin(int n)
-{
- int i = 0;
-
- while (i++ < n)
- rep_nop();
-}
-
-static int es7000_mip_write(struct mip_reg *mip_reg)
-{
- int status = 0;
- int spin;
-
- spin = MIP_SPIN;
- while ((host_reg->off_0x38 & MIP_VALID) != 0) {
- if (--spin <= 0) {
- WARN(1, "Timeout waiting for Host Valid Flag\n");
- return -1;
- }
- es7000_spin(MIP_SPIN);
- }
-
- memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
- outb(1, mip_port);
-
- spin = MIP_SPIN;
-
- while ((mip_reg->off_0x38 & MIP_VALID) == 0) {
- if (--spin <= 0) {
- WARN(1, "Timeout waiting for MIP Valid Flag\n");
- return -1;
- }
- es7000_spin(MIP_SPIN);
- }
-
- status = (mip_reg->off_0x00 & 0xffff0000000000ULL) >> 48;
- mip_reg->off_0x38 &= ~MIP_VALID;
-
- return status;
-}
-
-static void es7000_enable_apic_mode(void)
-{
- struct mip_reg es7000_mip_reg;
- int mip_status;
-
- if (!es7000_plat)
- return;
-
- pr_info("Enabling APIC mode.\n");
- memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
- es7000_mip_reg.off_0x00 = MIP_SW_APIC;
- es7000_mip_reg.off_0x38 = MIP_VALID;
-
- while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
- WARN(1, "Command failed, status = %x\n", mip_status);
-}
-
-static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt desitination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
-
-static void es7000_wait_for_init_deassert(atomic_t *deassert)
-{
- while (!atomic_read(deassert))
- cpu_relax();
-}
-
-static unsigned int es7000_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0xFF;
-}
-
-static void es7000_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_phys(mask, vector);
-}
-
-static void es7000_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
-}
-
-static void es7000_send_IPI_all(int vector)
-{
- es7000_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static int es7000_apic_id_registered(void)
-{
- return 1;
-}
-
-static const struct cpumask *target_cpus_cluster(void)
-{
- return cpu_all_mask;
-}
-
-static const struct cpumask *es7000_target_cpus(void)
-{
- return cpumask_of(smp_processor_id());
-}
-
-static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return 0;
-}
-
-static unsigned long es7000_check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
-static int es7000_early_logical_apicid(int cpu)
-{
- /* on es7000, logical apicid is the same as physical */
- return early_per_cpu(x86_bios_cpu_apicid, cpu);
-}
-
-static unsigned long calculate_ldr(int cpu)
-{
- unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
-
- return SET_APIC_LOGICAL_ID(id);
-}
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LdR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-static void es7000_init_apic_ldr_cluster(void)
-{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_CLUSTER);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
-}
-
-static void es7000_init_apic_ldr(void)
-{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_FLAT);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
-}
-
-static void es7000_setup_apic_routing(void)
-{
- int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
-
- pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
- (apic_version[apic] == 0x14) ?
- "Physical Cluster" : "Logical Cluster",
- nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
-}
-
-static int es7000_cpu_present_to_apicid(int mps_cpu)
-{
- if (!mps_cpu)
- return boot_cpu_physical_apicid;
- else if (mps_cpu < nr_cpu_ids)
- return per_cpu(x86_bios_cpu_apicid, mps_cpu);
- else
- return BAD_APICID;
-}
-
-static int cpu_id;
-
-static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
-{
- physid_set_mask_of_physid(cpu_id, retmap);
- ++cpu_id;
-}
-
-static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- physids_promote(0xFFL, retmap);
-}
-
-static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
-{
- boot_cpu_physical_apicid = read_apic_id();
- return 1;
-}
-
-static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- unsigned int round = 0;
- int cpu, uninitialized_var(apicid);
-
- /*
- * The cpus in the mask must all be on the apic cluster.
- */
- for_each_cpu(cpu, cpumask) {
- int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
-
- if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
- WARN(1, "Not a valid mask!");
-
- return BAD_APICID;
- }
- apicid = new_apicid;
- round++;
- }
- return apicid;
-}
-
-static unsigned int
-es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
- const struct cpumask *andmask)
-{
- int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
- cpumask_var_t cpumask;
-
- if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
- return apicid;
-
- cpumask_and(cpumask, inmask, andmask);
- cpumask_and(cpumask, cpumask, cpu_online_mask);
- apicid = es7000_cpu_mask_to_apicid(cpumask);
-
- free_cpumask_var(cpumask);
-
- return apicid;
-}
-
-static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-static int probe_es7000(void)
-{
- /* probed later in mptable/ACPI hooks */
- return 0;
-}
-
-static int es7000_mps_ret;
-static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem,
- char *productid)
-{
- int ret = 0;
-
- if (mpc->oemptr) {
- struct mpc_oemtable *oem_table =
- (struct mpc_oemtable *)mpc->oemptr;
-
- if (!strncmp(oem, "UNISYS", 6))
- ret = parse_unisys_oem((char *)oem_table);
- }
-
- es7000_mps_ret = ret;
-
- return ret && !es7000_apic_is_cluster();
-}
-
-static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
- char *productid)
-{
- int ret = es7000_mps_ret;
-
- return ret && es7000_apic_is_cluster();
-}
-
-/* We've been warned by a false positive warning.Use __refdata to keep calm. */
-static struct apic __refdata apic_es7000_cluster = {
-
- .name = "es7000",
- .probe = probe_es7000,
- .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster,
- .apic_id_registered = es7000_apic_id_registered,
-
- .irq_delivery_mode = dest_LowestPrio,
- /* logical delivery broadcast to all procs: */
- .irq_dest_mode = 1,
-
- .target_cpus = target_cpus_cluster,
- .disable_esr = 1,
- .dest_logical = 0,
- .check_apicid_used = es7000_check_apicid_used,
- .check_apicid_present = es7000_check_apicid_present,
-
- .vector_allocation_domain = es7000_vector_allocation_domain,
- .init_apic_ldr = es7000_init_apic_ldr_cluster,
-
- .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
- .setup_apic_routing = es7000_setup_apic_routing,
- .multi_timer_check = NULL,
- .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
- .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = es7000_check_phys_apicid_present,
- .enable_apic_mode = es7000_enable_apic_mode,
- .phys_pkg_id = es7000_phys_pkg_id,
- .mps_oem_check = es7000_mps_oem_check_cluster,
-
- .get_apic_id = es7000_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
-
- .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = es7000_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = es7000_send_IPI_allbutself,
- .send_IPI_all = es7000_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip,
-
- .trampoline_phys_low = 0x467,
- .trampoline_phys_high = 0x469,
-
- .wait_for_init_deassert = NULL,
-
- /* Nothing to do for most platforms, since cleared by the INIT cycle: */
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-
- .x86_32_early_logical_apicid = es7000_early_logical_apicid,
-};
-
-static struct apic __refdata apic_es7000 = {
-
- .name = "es7000",
- .probe = probe_es7000,
- .acpi_madt_oem_check = es7000_acpi_madt_oem_check,
- .apic_id_registered = es7000_apic_id_registered,
-
- .irq_delivery_mode = dest_Fixed,
- /* phys delivery to target CPUs: */
- .irq_dest_mode = 0,
-
- .target_cpus = es7000_target_cpus,
- .disable_esr = 1,
- .dest_logical = 0,
- .check_apicid_used = es7000_check_apicid_used,
- .check_apicid_present = es7000_check_apicid_present,
-
- .vector_allocation_domain = es7000_vector_allocation_domain,
- .init_apic_ldr = es7000_init_apic_ldr,
-
- .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
- .setup_apic_routing = es7000_setup_apic_routing,
- .multi_timer_check = NULL,
- .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
- .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = es7000_check_phys_apicid_present,
- .enable_apic_mode = es7000_enable_apic_mode,
- .phys_pkg_id = es7000_phys_pkg_id,
- .mps_oem_check = es7000_mps_oem_check,
-
- .get_apic_id = es7000_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
-
- .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = es7000_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = es7000_send_IPI_allbutself,
- .send_IPI_all = es7000_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .trampoline_phys_low = 0x467,
- .trampoline_phys_high = 0x469,
-
- .wait_for_init_deassert = es7000_wait_for_init_deassert,
-
- /* Nothing to do for most platforms, since cleared by the INIT cycle: */
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-
- .x86_32_early_logical_apicid = es7000_early_logical_apicid,
-};
-
-/*
- * Need to check for es7000 followed by es7000_cluster, so this order
- * in apic_drivers is important.
- */
-apic_drivers(apic_es7000, apic_es7000_cluster);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 31cb9ae992b..6a1e71bde32 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -9,6 +9,7 @@
*
*/
#include <asm/apic.h>
+#include <asm/nmi.h>
#include <linux/cpumask.h>
#include <linux/kdebug.h>
@@ -32,34 +33,44 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;
-void arch_trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(bool include_self)
{
int i;
+ int cpu = get_cpu();
- if (test_and_set_bit(0, &backtrace_flag))
+ if (test_and_set_bit(0, &backtrace_flag)) {
/*
* If there is already a trigger_all_cpu_backtrace() in progress
* (backtrace_flag == 1), don't output double cpu dump infos.
*/
+ put_cpu();
return;
+ }
cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+ if (!include_self)
+ cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
- printk(KERN_INFO "sending NMI to all CPUs:\n");
- apic->send_IPI_all(NMI_VECTOR);
+ if (!cpumask_empty(to_cpumask(backtrace_mask))) {
+ pr_info("sending NMI to %s CPUs:\n",
+ (include_self ? "all" : "other"));
+ apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
+ }
/* Wait for up to 10 seconds for all CPUs to do the backtrace */
for (i = 0; i < 10 * 1000; i++) {
if (cpumask_empty(to_cpumask(backtrace_mask)))
break;
mdelay(1);
+ touch_softlockup_watchdog();
}
clear_bit(0, &backtrace_flag);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
+ put_cpu();
}
-static int __kprobes
+static int
arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
int cpu;
@@ -79,6 +90,7 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
return NMI_DONE;
}
+NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler);
static int __init register_trigger_all_cpu_backtrace(void)
{
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index fb072754bc1..81e08eff05e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -37,9 +37,6 @@
#include <linux/kthread.h>
#include <linux/jiffies.h> /* time_after() */
#include <linux/slab.h>
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
#include <linux/bootmem.h>
#include <linux/dmar.h>
#include <linux/hpet.h>
@@ -64,6 +61,7 @@
#include <asm/apic.h>
#define __apicdebuginit(type) static type __init
+
#define for_each_irq_pin(entry, head) \
for (entry = head; entry; entry = entry->next)
@@ -123,7 +121,7 @@ int mp_irq_entries;
/* GSI interrupts */
static int nr_irqs_gsi = NR_IRQS_LEGACY;
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+#ifdef CONFIG_EISA
int mp_bus_id_to_type[MAX_MP_BUSSES];
#endif
@@ -208,20 +206,17 @@ int __init arch_early_irq_init(void)
count = ARRAY_SIZE(irq_cfgx);
node = cpu_to_node(0);
- /* Make sure the legacy interrupts are marked in the bitmap */
- irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
-
for (i = 0; i < count; i++) {
irq_set_chip_data(i, &cfg[i]);
zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
/*
* For legacy IRQ's, start with assigning irq0 to irq15 to
- * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
+ * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
*/
if (i < legacy_pic->nr_legacy_irqs) {
cfg[i].vector = IRQ0_VECTOR + i;
- cpumask_set_cpu(0, cfg[i].domain);
+ cpumask_setall(cfg[i].domain);
}
}
@@ -283,17 +278,6 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
return cfg;
}
-static int alloc_irq_from(unsigned int from, int node)
-{
- return irq_alloc_desc_from(from, node);
-}
-
-static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
-{
- free_irq_cfg(at, cfg);
- irq_free_desc(at);
-}
-
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -308,22 +292,23 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+ (mpc_ioapic_addr(idx) & ~PAGE_MASK);
}
-static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+void io_apic_eoi(unsigned int apic, unsigned int vector)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
writel(vector, &io_apic->eoi);
}
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
writel(reg, &io_apic->index);
return readl(&io_apic->data);
}
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
+
writel(reg, &io_apic->index);
writel(value, &io_apic->data);
}
@@ -334,7 +319,7 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
*
* Older SiS APIC requires we rewrite the index register
*/
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -343,29 +328,6 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
writel(value, &io_apic->data);
}
-static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
-{
- struct irq_pin_list *entry;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- for_each_irq_pin(entry, cfg->irq_2_pin) {
- unsigned int reg;
- int pin;
-
- pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin*2);
- /* Is the remote IRR bit set? */
- if (reg & IO_APIC_REDIR_REMOTE_IRR) {
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- return true;
- }
- }
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return false;
-}
-
union entry_union {
struct { u32 w1, w2; };
struct IO_APIC_route_entry entry;
@@ -377,6 +339,7 @@ static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+
return eu.entry;
}
@@ -384,9 +347,11 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
union entry_union eu;
unsigned long flags;
+
raw_spin_lock_irqsave(&ioapic_lock, flags);
eu.entry = __ioapic_read_entry(apic, pin);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
return eu.entry;
}
@@ -396,8 +361,7 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
* the interrupt, and we need to make sure the entry is fully populated
* before that happens.
*/
-static void
-__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
union entry_union eu = {{0, 0}};
@@ -409,6 +373,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
unsigned long flags;
+
raw_spin_lock_irqsave(&ioapic_lock, flags);
__ioapic_write_entry(apic, pin, e);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -435,8 +400,7 @@ static void ioapic_mask_entry(int apic, int pin)
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static int
-__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
{
struct irq_pin_list **last, *entry;
@@ -450,8 +414,8 @@ __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
entry = alloc_irq_pin_list(node);
if (!entry) {
- printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
- node, apic, pin);
+ pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
+ node, apic, pin);
return -ENOMEM;
}
entry->apic = apic;
@@ -521,6 +485,7 @@ static void io_apic_sync(struct irq_pin_list *entry)
* a dummy read from the IO-APIC
*/
struct io_apic __iomem *io_apic;
+
io_apic = io_apic_base(entry->apic);
readl(&io_apic->data);
}
@@ -574,19 +539,10 @@ static void unmask_ioapic_irq(struct irq_data *data)
* Otherwise, we simulate the EOI message manually by changing the trigger
* mode to edge and then back to level, with RTE being masked during this.
*/
-static void __eoi_ioapic_pin(int apic, int pin, int vector, struct irq_cfg *cfg)
+void native_eoi_ioapic_pin(int apic, int pin, int vector)
{
if (mpc_ioapic_ver(apic) >= 0x20) {
- /*
- * Intr-remapping uses pin number as the virtual vector
- * in the RTE. Actual vector is programmed in
- * intr-remapping table entry. Hence for the io-apic
- * EOI we use the pin number.
- */
- if (cfg && irq_remapped(cfg))
- io_apic_eoi(apic, pin);
- else
- io_apic_eoi(apic, vector);
+ io_apic_eoi(apic, vector);
} else {
struct IO_APIC_route_entry entry, entry1;
@@ -607,14 +563,15 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector, struct irq_cfg *cfg)
}
}
-static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
{
struct irq_pin_list *entry;
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
for_each_irq_pin(entry, cfg->irq_2_pin)
- __eoi_ioapic_pin(entry->apic, entry->pin, cfg->vector, cfg);
+ x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin,
+ cfg->vector);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -651,7 +608,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
}
raw_spin_lock_irqsave(&ioapic_lock, flags);
- __eoi_ioapic_pin(apic, pin, entry.vector, NULL);
+ x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -662,7 +619,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
ioapic_mask_entry(apic, pin);
entry = ioapic_read_entry(apic, pin);
if (entry.irr)
- printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n",
+ pr_err("Unable to reset IRR for apic: %d, pin :%d\n",
mpc_ioapic_id(apic), pin);
}
@@ -836,7 +793,7 @@ static int __init find_isa_irq_apic(int irq, int type)
return -1;
}
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
/*
* EISA Edge/Level control register, ELCR
*/
@@ -873,12 +830,6 @@ static int EISA_ELCR(unsigned int irq)
#define default_PCI_trigger(idx) (1)
#define default_PCI_polarity(idx) (1)
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx) (1)
-#define default_MCA_polarity(idx) default_ISA_polarity(idx)
-
static int irq_polarity(int idx)
{
int bus = mp_irqs[idx].srcbus;
@@ -902,7 +853,7 @@ static int irq_polarity(int idx)
}
case 2: /* reserved */
{
- printk(KERN_WARNING "broken BIOS!!\n");
+ pr_warn("broken BIOS!!\n");
polarity = 1;
break;
}
@@ -913,7 +864,7 @@ static int irq_polarity(int idx)
}
default: /* invalid */
{
- printk(KERN_WARNING "broken BIOS!!\n");
+ pr_warn("broken BIOS!!\n");
polarity = 1;
break;
}
@@ -936,7 +887,7 @@ static int irq_trigger(int idx)
trigger = default_ISA_trigger(idx);
else
trigger = default_PCI_trigger(idx);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
switch (mp_bus_id_to_type[bus]) {
case MP_BUS_ISA: /* ISA pin */
{
@@ -953,14 +904,9 @@ static int irq_trigger(int idx)
/* set before the switch */
break;
}
- case MP_BUS_MCA: /* MCA pin */
- {
- trigger = default_MCA_trigger(idx);
- break;
- }
default:
{
- printk(KERN_WARNING "broken BIOS!!\n");
+ pr_warn("broken BIOS!!\n");
trigger = 1;
break;
}
@@ -974,7 +920,7 @@ static int irq_trigger(int idx)
}
case 2: /* reserved */
{
- printk(KERN_WARNING "broken BIOS!!\n");
+ pr_warn("broken BIOS!!\n");
trigger = 1;
break;
}
@@ -985,7 +931,7 @@ static int irq_trigger(int idx)
}
default: /* invalid */
{
- printk(KERN_WARNING "broken BIOS!!\n");
+ pr_warn("broken BIOS!!\n");
trigger = 0;
break;
}
@@ -1003,7 +949,7 @@ static int pin_2_irq(int idx, int apic, int pin)
* Debugging check, we are in big trouble if this message pops up!
*/
if (mp_irqs[idx].dstirq != pin)
- printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
+ pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci)) {
irq = mp_irqs[idx].srcbusirq;
@@ -1124,8 +1070,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
- static int current_offset = VECTOR_OFFSET_START % 8;
- unsigned int old_vector;
+ static int current_offset = VECTOR_OFFSET_START % 16;
int cpu, err;
cpumask_var_t tmp_mask;
@@ -1135,48 +1080,61 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
return -ENOMEM;
- old_vector = cfg->vector;
- if (old_vector) {
- cpumask_and(tmp_mask, mask, cpu_online_mask);
- cpumask_and(tmp_mask, cfg->domain, tmp_mask);
- if (!cpumask_empty(tmp_mask)) {
- free_cpumask_var(tmp_mask);
- return 0;
- }
- }
-
/* Only try and allocate irqs on cpus that are present */
err = -ENOSPC;
- for_each_cpu_and(cpu, mask, cpu_online_mask) {
- int new_cpu;
- int vector, offset;
+ cpumask_clear(cfg->old_domain);
+ cpu = cpumask_first_and(mask, cpu_online_mask);
+ while (cpu < nr_cpu_ids) {
+ int new_cpu, vector, offset;
- apic->vector_allocation_domain(cpu, tmp_mask);
+ apic->vector_allocation_domain(cpu, tmp_mask, mask);
+
+ if (cpumask_subset(tmp_mask, cfg->domain)) {
+ err = 0;
+ if (cpumask_equal(tmp_mask, cfg->domain))
+ break;
+ /*
+ * New cpumask using the vector is a proper subset of
+ * the current in use mask. So cleanup the vector
+ * allocation for the members that are not used anymore.
+ */
+ cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
+ cfg->move_in_progress =
+ cpumask_intersects(cfg->old_domain, cpu_online_mask);
+ cpumask_and(cfg->domain, cfg->domain, tmp_mask);
+ break;
+ }
vector = current_vector;
offset = current_offset;
next:
- vector += 8;
+ vector += 16;
if (vector >= first_system_vector) {
- /* If out of vectors on large boxen, must share them. */
- offset = (offset + 1) % 8;
+ offset = (offset + 1) % 16;
vector = FIRST_EXTERNAL_VECTOR + offset;
}
- if (unlikely(current_vector == vector))
+
+ if (unlikely(current_vector == vector)) {
+ cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
+ cpumask_andnot(tmp_mask, mask, cfg->old_domain);
+ cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
continue;
+ }
if (test_bit(vector, used_vectors))
goto next;
- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
- if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+ if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED)
goto next;
+ }
/* Found one! */
current_vector = vector;
current_offset = offset;
- if (old_vector) {
- cfg->move_in_progress = 1;
+ if (cfg->vector) {
cpumask_copy(cfg->old_domain, cfg->domain);
+ cfg->move_in_progress =
+ cpumask_intersects(cfg->old_domain, cpu_online_mask);
}
for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq;
@@ -1208,7 +1166,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
vector = cfg->vector;
for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
- per_cpu(vector_irq, cpu)[vector] = -1;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
cfg->vector = 0;
cpumask_clear(cfg->domain);
@@ -1216,11 +1174,10 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
if (likely(!cfg->move_in_progress))
return;
for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
- vector++) {
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
if (per_cpu(vector_irq, cpu)[vector] != irq)
continue;
- per_cpu(vector_irq, cpu)[vector] = -1;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
break;
}
}
@@ -1244,12 +1201,6 @@ void __setup_vector_irq(int cpu)
cfg = irq_get_chip_data(irq);
if (!cfg)
continue;
- /*
- * If it is a legacy IRQ handled by the legacy PIC, this cpu
- * will be part of the irq_cfg's domain.
- */
- if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq))
- cpumask_set_cpu(cpu, cfg->domain);
if (!cpumask_test_cpu(cpu, cfg->domain))
continue;
@@ -1259,12 +1210,12 @@ void __setup_vector_irq(int cpu)
/* Mark the free vectors */
for (vector = 0; vector < NR_VECTORS; ++vector) {
irq = per_cpu(vector_irq, cpu)[vector];
- if (irq < 0)
+ if (irq <= VECTOR_UNDEFINED)
continue;
cfg = irq_cfg(irq);
if (!cpumask_test_cpu(cpu, cfg->domain))
- per_cpu(vector_irq, cpu)[vector] = -1;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
}
raw_spin_unlock(&vector_lock);
}
@@ -1311,89 +1262,18 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
fasteoi = false;
}
- if (irq_remapped(cfg)) {
- irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
- irq_remap_modify_chip_defaults(chip);
+ if (setup_remapped_irq(irq, cfg, chip))
fasteoi = trigger != 0;
- }
hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
irq_set_chip_and_handler_name(irq, chip, hdl,
fasteoi ? "fasteoi" : "edge");
}
-
-static int setup_ir_ioapic_entry(int irq,
- struct IR_IO_APIC_route_entry *entry,
+int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
unsigned int destination, int vector,
struct io_apic_irq_attr *attr)
{
- int index;
- struct irte irte;
- int ioapic_id = mpc_ioapic_id(attr->ioapic);
- struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id);
-
- if (!iommu) {
- pr_warn("No mapping iommu for ioapic %d\n", ioapic_id);
- return -ENODEV;
- }
-
- index = alloc_irte(iommu, irq, 1);
- if (index < 0) {
- pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id);
- return -ENOMEM;
- }
-
- prepare_irte(&irte, vector, destination);
-
- /* Set source-id of interrupt request */
- set_ioapic_sid(&irte, ioapic_id);
-
- modify_irte(irq, &irte);
-
- apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
- "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
- "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
- "Avail:%X Vector:%02X Dest:%08X "
- "SID:%04X SQ:%X SVT:%X)\n",
- attr->ioapic, irte.present, irte.fpd, irte.dst_mode,
- irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
- irte.avail, irte.vector, irte.dest_id,
- irte.sid, irte.sq, irte.svt);
-
- memset(entry, 0, sizeof(*entry));
-
- entry->index2 = (index >> 15) & 0x1;
- entry->zero = 0;
- entry->format = 1;
- entry->index = (index & 0x7fff);
- /*
- * IO-APIC RTE will be configured with virtual vector.
- * irq handler will do the explicit EOI to the io-apic.
- */
- entry->vector = attr->ioapic_pin;
- entry->mask = 0; /* enable IRQ */
- entry->trigger = attr->trigger;
- entry->polarity = attr->polarity;
-
- /* Mask level triggered irqs.
- * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
- */
- if (attr->trigger)
- entry->mask = 1;
-
- return 0;
-}
-
-static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
- unsigned int destination, int vector,
- struct io_apic_irq_attr *attr)
-{
- if (intr_remapping_enabled)
- return setup_ir_ioapic_entry(irq,
- (struct IR_IO_APIC_route_entry *)entry,
- destination, vector, attr);
-
memset(entry, 0, sizeof(*entry));
entry->delivery_mode = apic->irq_delivery_mode;
@@ -1422,18 +1302,18 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
if (!IO_APIC_IRQ(irq))
return;
- /*
- * For legacy irqs, cfg->domain starts with cpu 0 for legacy
- * controllers like 8259. Now that IO-APIC can handle this irq, update
- * the cfg->domain.
- */
- if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
- apic->vector_allocation_domain(0, cfg->domain);
if (assign_irq_vector(irq, cfg, apic->target_cpus()))
return;
- dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
+ if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(),
+ &dest)) {
+ pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n",
+ mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
+ __clear_irq_vector(irq, cfg);
+
+ return;
+ }
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
@@ -1441,7 +1321,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin,
cfg->vector, irq, attr->trigger, attr->polarity, dest);
- if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) {
+ if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) {
pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",
mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
__clear_irq_vector(irq, cfg);
@@ -1545,12 +1425,10 @@ void setup_IO_APIC_irq_extra(u32 gsi)
* Set up the timer pin, possibly with the 8259A-master behind.
*/
static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
- unsigned int pin, int vector)
+ unsigned int pin, int vector)
{
struct IO_APIC_route_entry entry;
-
- if (intr_remapping_enabled)
- return;
+ unsigned int dest;
memset(&entry, 0, sizeof(entry));
@@ -1558,9 +1436,13 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
+ if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(),
+ apic->target_cpus(), &dest)))
+ dest = BAD_APICID;
+
entry.dest_mode = apic->irq_dest_mode;
entry.mask = 0; /* don't mask IRQ for edge */
- entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
+ entry.dest = dest;
entry.delivery_mode = apic->irq_delivery_mode;
entry.polarity = 0;
entry.trigger = 0;
@@ -1579,9 +1461,68 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
ioapic_write_entry(ioapic_idx, pin, entry);
}
-__apicdebuginit(void) print_IO_APIC(int ioapic_idx)
+void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
+{
+ int i;
+
+ pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n");
+
+ for (i = 0; i <= nr_entries; i++) {
+ struct IO_APIC_route_entry entry;
+
+ entry = ioapic_read_entry(apic, i);
+
+ pr_debug(" %02x %02X ", i, entry.dest);
+ pr_cont("%1d %1d %1d %1d %1d "
+ "%1d %1d %02X\n",
+ entry.mask,
+ entry.trigger,
+ entry.irr,
+ entry.polarity,
+ entry.delivery_status,
+ entry.dest_mode,
+ entry.delivery_mode,
+ entry.vector);
+ }
+}
+
+void intel_ir_io_apic_print_entries(unsigned int apic,
+ unsigned int nr_entries)
{
int i;
+
+ pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n");
+
+ for (i = 0; i <= nr_entries; i++) {
+ struct IR_IO_APIC_route_entry *ir_entry;
+ struct IO_APIC_route_entry entry;
+
+ entry = ioapic_read_entry(apic, i);
+
+ ir_entry = (struct IR_IO_APIC_route_entry *)&entry;
+
+ pr_debug(" %02x %04X ", i, ir_entry->index);
+ pr_cont("%1d %1d %1d %1d %1d "
+ "%1d %1d %X %02X\n",
+ ir_entry->format,
+ ir_entry->mask,
+ ir_entry->trigger,
+ ir_entry->irr,
+ ir_entry->polarity,
+ ir_entry->delivery_status,
+ ir_entry->index2,
+ ir_entry->zero,
+ ir_entry->vector);
+ }
+}
+
+void ioapic_zap_locks(void)
+{
+ raw_spin_lock_init(&ioapic_lock);
+}
+
+__apicdebuginit(void) print_IO_APIC(int ioapic_idx)
+{
union IO_APIC_reg_00 reg_00;
union IO_APIC_reg_01 reg_01;
union IO_APIC_reg_02 reg_02;
@@ -1597,7 +1538,6 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
reg_03.raw = io_apic_read(ioapic_idx, 3);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- printk("\n");
printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
@@ -1635,58 +1575,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
printk(KERN_DEBUG ".... IRQ redirection table:\n");
- if (intr_remapping_enabled) {
- printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
- " Pol Stat Indx2 Zero Vect:\n");
- } else {
- printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
- " Stat Dmod Deli Vect:\n");
- }
-
- for (i = 0; i <= reg_01.bits.entries; i++) {
- if (intr_remapping_enabled) {
- struct IO_APIC_route_entry entry;
- struct IR_IO_APIC_route_entry *ir_entry;
-
- entry = ioapic_read_entry(ioapic_idx, i);
- ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
- printk(KERN_DEBUG " %02x %04X ",
- i,
- ir_entry->index
- );
- printk("%1d %1d %1d %1d %1d "
- "%1d %1d %X %02X\n",
- ir_entry->format,
- ir_entry->mask,
- ir_entry->trigger,
- ir_entry->irr,
- ir_entry->polarity,
- ir_entry->delivery_status,
- ir_entry->index2,
- ir_entry->zero,
- ir_entry->vector
- );
- } else {
- struct IO_APIC_route_entry entry;
-
- entry = ioapic_read_entry(ioapic_idx, i);
- printk(KERN_DEBUG " %02x %02X ",
- i,
- entry.dest
- );
- printk("%1d %1d %1d %1d %1d "
- "%1d %1d %02X\n",
- entry.mask,
- entry.trigger,
- entry.irr,
- entry.polarity,
- entry.delivery_status,
- entry.dest_mode,
- entry.delivery_mode,
- entry.vector
- );
- }
- }
+ x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries);
}
__apicdebuginit(void) print_IO_APICs(void)
@@ -1727,8 +1616,8 @@ __apicdebuginit(void) print_IO_APICs(void)
continue;
printk(KERN_DEBUG "IRQ%d ", irq);
for_each_irq_pin(entry, cfg->irq_2_pin)
- printk("-> %d:%d", entry->apic, entry->pin);
- printk("\n");
+ pr_cont("-> %d:%d", entry->apic, entry->pin);
+ pr_cont("\n");
}
printk(KERN_INFO ".................................... done.\n");
@@ -1741,9 +1630,9 @@ __apicdebuginit(void) print_APIC_field(int base)
printk(KERN_DEBUG);
for (i = 0; i < 8; i++)
- printk(KERN_CONT "%08x", apic_read(base + i*0x10));
+ pr_cont("%08x", apic_read(base + i*0x10));
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
__apicdebuginit(void) print_local_APIC(void *dummy)
@@ -1845,7 +1734,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
}
}
- printk("\n");
+ pr_cont("\n");
}
__apicdebuginit(void) print_local_APICs(int maxcpu)
@@ -1988,30 +1877,14 @@ void __init enable_IO_APIC(void)
clear_IO_APIC();
}
-/*
- * Not an __init, needed by the reboot code
- */
-void disable_IO_APIC(void)
+void native_disable_io_apic(void)
{
/*
- * Clear the IO-APIC before rebooting:
- */
- clear_IO_APIC();
-
- if (!legacy_pic->nr_legacy_irqs)
- return;
-
- /*
* If the i8259 is routed through an IOAPIC
* Put that IOAPIC in virtual wire mode
* so legacy interrupts can be delivered.
- *
- * With interrupt-remapping, for now we will use virtual wire A mode,
- * as virtual wire B is little complex (need to configure both
- * IOAPIC RTE as well as interrupt-remapping table entry).
- * As this gets called during crash dump, keep this simple for now.
*/
- if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
+ if (ioapic_i8259.pin != -1) {
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
@@ -2031,12 +1904,25 @@ void disable_IO_APIC(void)
ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
+ if (cpu_has_apic || apic_from_smp_config())
+ disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+
+}
+
+/*
+ * Not an __init, needed by the reboot code
+ */
+void disable_IO_APIC(void)
+{
/*
- * Use virtual wire A mode when interrupt remapping is enabled.
+ * Clear the IO-APIC before rebooting:
*/
- if (cpu_has_apic || apic_from_smp_config())
- disconnect_bsp_APIC(!intr_remapping_enabled &&
- ioapic_i8259.pin != -1);
+ clear_IO_APIC();
+
+ if (!legacy_pic->nr_legacy_irqs)
+ return;
+
+ x86_io_apic_ops.disable();
}
#ifdef CONFIG_X86_32
@@ -2141,7 +2027,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
reg_00.raw = io_apic_read(ioapic_idx, 0);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
- printk("could not set ID!\n");
+ pr_cont("could not set ID!\n");
else
apic_printk(APIC_VERBOSE, " ok.\n");
}
@@ -2252,9 +2138,11 @@ static int ioapic_retrigger_irq(struct irq_data *data)
{
struct irq_cfg *cfg = data->chip_data;
unsigned long flags;
+ int cpu;
raw_spin_lock_irqsave(&vector_lock, flags);
- apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
+ cpu = cpumask_first_and(cfg->domain, cpu_online_mask);
+ apic->send_IPI_mask(cpumask_of(cpu), cfg->vector);
raw_spin_unlock_irqrestore(&vector_lock, flags);
return 1;
@@ -2286,137 +2174,7 @@ void send_cleanup_vector(struct irq_cfg *cfg)
cfg->move_in_progress = 0;
}
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
- int apic, pin;
- struct irq_pin_list *entry;
- u8 vector = cfg->vector;
-
- for_each_irq_pin(entry, cfg->irq_2_pin) {
- unsigned int reg;
-
- apic = entry->apic;
- pin = entry->pin;
- /*
- * With interrupt-remapping, destination information comes
- * from interrupt-remapping table entry.
- */
- if (!irq_remapped(cfg))
- io_apic_write(apic, 0x11 + pin*2, dest);
- reg = io_apic_read(apic, 0x10 + pin*2);
- reg &= ~IO_APIC_REDIR_VECTOR_MASK;
- reg |= vector;
- io_apic_modify(apic, 0x10 + pin*2, reg);
- }
-}
-
-/*
- * Either sets data->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
- * leaves data->affinity untouched.
- */
-int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
- unsigned int *dest_id)
-{
- struct irq_cfg *cfg = data->chip_data;
-
- if (!cpumask_intersects(mask, cpu_online_mask))
- return -1;
-
- if (assign_irq_vector(data->irq, data->chip_data, mask))
- return -1;
-
- cpumask_copy(data->affinity, mask);
-
- *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
- return 0;
-}
-
-static int
-ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
- bool force)
-{
- unsigned int dest, irq = data->irq;
- unsigned long flags;
- int ret;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- ret = __ioapic_set_affinity(data, mask, &dest);
- if (!ret) {
- /* Only the high 8 bits are valid. */
- dest = SET_APIC_LOGICAL_ID(dest);
- __target_IO_APIC_irq(irq, dest, data->chip_data);
- }
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- return ret;
-}
-
-#ifdef CONFIG_IRQ_REMAP
-
-/*
- * Migrate the IO-APIC irq in the presence of intr-remapping.
- *
- * For both level and edge triggered, irq migration is a simple atomic
- * update(of vector and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we eliminate the io-apic RTE modification (with the
- * updated vector information), by using a virtual vector (io-apic pin number).
- * Real vector that is used for interrupting cpu will be coming from
- * the interrupt-remapping table entry.
- *
- * As the migration is a simple atomic update of IRTE, the same mechanism
- * is used to migrate MSI irq's in the presence of interrupt-remapping.
- */
-static int
-ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
- bool force)
-{
- struct irq_cfg *cfg = data->chip_data;
- unsigned int dest, irq = data->irq;
- struct irte irte;
-
- if (!cpumask_intersects(mask, cpu_online_mask))
- return -EINVAL;
-
- if (get_irte(irq, &irte))
- return -EBUSY;
-
- if (assign_irq_vector(irq, cfg, mask))
- return -EBUSY;
-
- dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
-
- irte.vector = cfg->vector;
- irte.dest_id = IRTE_DEST(dest);
-
- /*
- * Atomically updates the IRTE with the new destination, vector
- * and flushes the interrupt entry cache.
- */
- modify_irte(irq, &irte);
-
- /*
- * After this point, all the interrupts will start arriving
- * at the new destination. So, time to cleanup the previous
- * vector allocation.
- */
- if (cfg->move_in_progress)
- send_cleanup_vector(cfg);
-
- cpumask_copy(data->affinity, mask);
- return 0;
-}
-
-#else
-static inline int
-ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
- bool force)
-{
- return 0;
-}
-#endif
-
-asmlinkage void smp_irq_move_cleanup_interrupt(void)
+asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;
@@ -2426,13 +2184,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
me = smp_processor_id();
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- unsigned int irq;
+ int irq;
unsigned int irr;
struct irq_desc *desc;
struct irq_cfg *cfg;
irq = __this_cpu_read(vector_irq[vector]);
- if (irq == -1)
+ if (irq <= VECTOR_UNDEFINED)
continue;
desc = irq_to_desc(irq);
@@ -2440,6 +2198,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
continue;
cfg = irq_cfg(irq);
+ if (!cfg)
+ continue;
+
raw_spin_lock(&desc->lock);
/*
@@ -2503,6 +2264,84 @@ void irq_force_complete_move(int irq)
static inline void irq_complete_move(struct irq_cfg *cfg) { }
#endif
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+{
+ int apic, pin;
+ struct irq_pin_list *entry;
+ u8 vector = cfg->vector;
+
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ unsigned int reg;
+
+ apic = entry->apic;
+ pin = entry->pin;
+
+ io_apic_write(apic, 0x11 + pin*2, dest);
+ reg = io_apic_read(apic, 0x10 + pin*2);
+ reg &= ~IO_APIC_REDIR_VECTOR_MASK;
+ reg |= vector;
+ io_apic_modify(apic, 0x10 + pin*2, reg);
+ }
+}
+
+/*
+ * Either sets data->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
+ * leaves data->affinity untouched.
+ */
+int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ unsigned int *dest_id)
+{
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned int irq = data->irq;
+ int err;
+
+ if (!config_enabled(CONFIG_SMP))
+ return -EPERM;
+
+ if (!cpumask_intersects(mask, cpu_online_mask))
+ return -EINVAL;
+
+ err = assign_irq_vector(irq, cfg, mask);
+ if (err)
+ return err;
+
+ err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
+ if (err) {
+ if (assign_irq_vector(irq, cfg, data->affinity))
+ pr_err("Failed to recover vector for irq %d\n", irq);
+ return err;
+ }
+
+ cpumask_copy(data->affinity, mask);
+
+ return 0;
+}
+
+
+int native_ioapic_set_affinity(struct irq_data *data,
+ const struct cpumask *mask,
+ bool force)
+{
+ unsigned int dest, irq = data->irq;
+ unsigned long flags;
+ int ret;
+
+ if (!config_enabled(CONFIG_SMP))
+ return -EPERM;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (!ret) {
+ /* Only the high 8 bits are valid. */
+ dest = SET_APIC_LOGICAL_ID(dest);
+ __target_IO_APIC_irq(irq, dest, data->chip_data);
+ ret = IRQ_SET_MASK_OK_NOCOPY;
+ }
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ return ret;
+}
+
static void ack_apic_edge(struct irq_data *data)
{
irq_complete_move(data->chip_data);
@@ -2512,21 +2351,96 @@ static void ack_apic_edge(struct irq_data *data)
atomic_t irq_mis_count;
-static void ack_apic_level(struct irq_data *data)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
{
- struct irq_cfg *cfg = data->chip_data;
- int i, do_unmask_irq = 0, irq = data->irq;
- unsigned long v;
+ struct irq_pin_list *entry;
+ unsigned long flags;
- irq_complete_move(cfg);
-#ifdef CONFIG_GENERIC_PENDING_IRQ
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ unsigned int reg;
+ int pin;
+
+ pin = entry->pin;
+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ /* Is the remote IRR bit set? */
+ if (reg & IO_APIC_REDIR_REMOTE_IRR) {
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ return true;
+ }
+ }
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return false;
+}
+
+static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+{
/* If we are moving the irq we need to mask it */
if (unlikely(irqd_is_setaffinity_pending(data))) {
- do_unmask_irq = 1;
mask_ioapic(cfg);
+ return true;
}
+ return false;
+}
+
+static inline void ioapic_irqd_unmask(struct irq_data *data,
+ struct irq_cfg *cfg, bool masked)
+{
+ if (unlikely(masked)) {
+ /* Only migrate the irq if the ack has been received.
+ *
+ * On rare occasions the broadcast level triggered ack gets
+ * delayed going to ioapics, and if we reprogram the
+ * vector while Remote IRR is still set the irq will never
+ * fire again.
+ *
+ * To prevent this scenario we read the Remote IRR bit
+ * of the ioapic. This has two effects.
+ * - On any sane system the read of the ioapic will
+ * flush writes (and acks) going to the ioapic from
+ * this cpu.
+ * - We get to see if the ACK has actually been delivered.
+ *
+ * Based on failed experiments of reprogramming the
+ * ioapic entry from outside of irq context starting
+ * with masking the ioapic entry and then polling until
+ * Remote IRR was clear before reprogramming the
+ * ioapic I don't trust the Remote IRR bit to be
+ * completey accurate.
+ *
+ * However there appears to be no other way to plug
+ * this race, so if the Remote IRR bit is not
+ * accurate and is causing problems then it is a hardware bug
+ * and you can go talk to the chipset vendor about it.
+ */
+ if (!io_apic_level_ack_pending(cfg))
+ irq_move_masked_irq(data);
+ unmask_ioapic(cfg);
+ }
+}
+#else
+static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+{
+ return false;
+}
+static inline void ioapic_irqd_unmask(struct irq_data *data,
+ struct irq_cfg *cfg, bool masked)
+{
+}
#endif
+static void ack_apic_level(struct irq_data *data)
+{
+ struct irq_cfg *cfg = data->chip_data;
+ int i, irq = data->irq;
+ unsigned long v;
+ bool masked;
+
+ irq_complete_move(cfg);
+ masked = ioapic_irqd_mask(data, cfg);
+
/*
* It appears there is an erratum which affects at least version 0x11
* of I/O APIC (that's the 82093AA and cores integrated into various
@@ -2581,69 +2495,9 @@ static void ack_apic_level(struct irq_data *data)
eoi_ioapic_irq(irq, cfg);
}
- /* Now we can move and renable the irq */
- if (unlikely(do_unmask_irq)) {
- /* Only migrate the irq if the ack has been received.
- *
- * On rare occasions the broadcast level triggered ack gets
- * delayed going to ioapics, and if we reprogram the
- * vector while Remote IRR is still set the irq will never
- * fire again.
- *
- * To prevent this scenario we read the Remote IRR bit
- * of the ioapic. This has two effects.
- * - On any sane system the read of the ioapic will
- * flush writes (and acks) going to the ioapic from
- * this cpu.
- * - We get to see if the ACK has actually been delivered.
- *
- * Based on failed experiments of reprogramming the
- * ioapic entry from outside of irq context starting
- * with masking the ioapic entry and then polling until
- * Remote IRR was clear before reprogramming the
- * ioapic I don't trust the Remote IRR bit to be
- * completey accurate.
- *
- * However there appears to be no other way to plug
- * this race, so if the Remote IRR bit is not
- * accurate and is causing problems then it is a hardware bug
- * and you can go talk to the chipset vendor about it.
- */
- if (!io_apic_level_ack_pending(cfg))
- irq_move_masked_irq(data);
- unmask_ioapic(cfg);
- }
-}
-
-#ifdef CONFIG_IRQ_REMAP
-static void ir_ack_apic_edge(struct irq_data *data)
-{
- ack_APIC_irq();
-}
-
-static void ir_ack_apic_level(struct irq_data *data)
-{
- ack_APIC_irq();
- eoi_ioapic_irq(data->irq, data->chip_data);
+ ioapic_irqd_unmask(data, cfg, masked);
}
-static void ir_print_prefix(struct irq_data *data, struct seq_file *p)
-{
- seq_printf(p, " IR-%s", data->chip->name);
-}
-
-static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
-{
- chip->irq_print_chip = ir_print_prefix;
- chip->irq_ack = ir_ack_apic_edge;
- chip->irq_eoi = ir_ack_apic_level;
-
-#ifdef CONFIG_SMP
- chip->irq_set_affinity = ir_ioapic_set_affinity;
-#endif
-}
-#endif /* CONFIG_IRQ_REMAP */
-
static struct irq_chip ioapic_chip __read_mostly = {
.name = "IO-APIC",
.irq_startup = startup_ioapic_irq,
@@ -2651,9 +2505,7 @@ static struct irq_chip ioapic_chip __read_mostly = {
.irq_unmask = unmask_ioapic_irq,
.irq_ack = ack_apic_edge,
.irq_eoi = ack_apic_level,
-#ifdef CONFIG_SMP
- .irq_set_affinity = ioapic_set_affinity,
-#endif
+ .irq_set_affinity = native_ioapic_set_affinity,
.irq_retrigger = ioapic_retrigger_irq,
};
@@ -2852,8 +2704,7 @@ static inline void __init check_timer(void)
* 8259A.
*/
if (pin1 == -1) {
- if (intr_remapping_enabled)
- panic("BIOS bug: timer not connected to IO-APIC");
+ panic_if_irq_remap("BIOS bug: timer not connected to IO-APIC");
pin1 = pin2;
apic1 = apic2;
no_pin1 = 1;
@@ -2885,8 +2736,7 @@ static inline void __init check_timer(void)
clear_IO_APIC_pin(0, pin1);
goto out;
}
- if (intr_remapping_enabled)
- panic("timer doesn't work through Interrupt-remapped IO-APIC");
+ panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC");
local_irq_disable();
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
@@ -3051,74 +2901,74 @@ static int __init ioapic_init_ops(void)
device_initcall(ioapic_init_ops);
/*
- * Dynamic irq allocate and deallocation
+ * Dynamic irq allocate and deallocation. Should be replaced by irq domains!
*/
-unsigned int create_irq_nr(unsigned int from, int node)
+int arch_setup_hwirq(unsigned int irq, int node)
{
struct irq_cfg *cfg;
unsigned long flags;
- unsigned int ret = 0;
- int irq;
-
- if (from < nr_irqs_gsi)
- from = nr_irqs_gsi;
+ int ret;
- irq = alloc_irq_from(from, node);
- if (irq < 0)
- return 0;
cfg = alloc_irq_cfg(irq, node);
- if (!cfg) {
- free_irq_at(irq, NULL);
- return 0;
- }
+ if (!cfg)
+ return -ENOMEM;
raw_spin_lock_irqsave(&vector_lock, flags);
- if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
- ret = irq;
+ ret = __assign_irq_vector(irq, cfg, apic->target_cpus());
raw_spin_unlock_irqrestore(&vector_lock, flags);
- if (ret) {
+ if (!ret)
irq_set_chip_data(irq, cfg);
- irq_clear_status_flags(irq, IRQ_NOREQUEST);
- } else {
- free_irq_at(irq, cfg);
- }
+ else
+ free_irq_cfg(irq, cfg);
return ret;
}
-int create_irq(void)
-{
- int node = cpu_to_node(0);
- unsigned int irq_want;
- int irq;
-
- irq_want = nr_irqs_gsi;
- irq = create_irq_nr(irq_want, node);
-
- if (irq == 0)
- irq = -1;
-
- return irq;
-}
-
-void destroy_irq(unsigned int irq)
+void arch_teardown_hwirq(unsigned int irq)
{
struct irq_cfg *cfg = irq_get_chip_data(irq);
unsigned long flags;
- irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
-
- if (irq_remapped(cfg))
- free_irte(irq);
+ free_remapped_irq(irq);
raw_spin_lock_irqsave(&vector_lock, flags);
__clear_irq_vector(irq, cfg);
raw_spin_unlock_irqrestore(&vector_lock, flags);
- free_irq_at(irq, cfg);
+ free_irq_cfg(irq, cfg);
}
/*
* MSI message composition
*/
+void native_compose_msi_msg(struct pci_dev *pdev,
+ unsigned int irq, unsigned int dest,
+ struct msi_msg *msg, u8 hpet_id)
+{
+ struct irq_cfg *cfg = irq_cfg(irq);
+
+ msg->address_hi = MSI_ADDR_BASE_HI;
+
+ if (x2apic_enabled())
+ msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest);
+
+ msg->address_lo =
+ MSI_ADDR_BASE_LO |
+ ((apic->irq_dest_mode == 0) ?
+ MSI_ADDR_DEST_MODE_PHYSICAL:
+ MSI_ADDR_DEST_MODE_LOGICAL) |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ MSI_ADDR_REDIRECTION_CPU:
+ MSI_ADDR_REDIRECTION_LOWPRI) |
+ MSI_ADDR_DEST_ID(dest);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ MSI_DATA_DELIVERY_FIXED:
+ MSI_DATA_DELIVERY_LOWPRI) |
+ MSI_DATA_VECTOR(cfg->vector);
+}
+
#ifdef CONFIG_PCI_MSI
static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
struct msi_msg *msg, u8 hpet_id)
@@ -3135,70 +2985,27 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
if (err)
return err;
- dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
-
- if (irq_remapped(cfg)) {
- struct irte irte;
- int ir_index;
- u16 sub_handle;
-
- ir_index = map_irq_to_irte_handle(irq, &sub_handle);
- BUG_ON(ir_index == -1);
-
- prepare_irte(&irte, cfg->vector, dest);
-
- /* Set source-id of interrupt request */
- if (pdev)
- set_msi_sid(&irte, pdev);
- else
- set_hpet_sid(&irte, hpet_id);
+ err = apic->cpu_mask_to_apicid_and(cfg->domain,
+ apic->target_cpus(), &dest);
+ if (err)
+ return err;
- modify_irte(irq, &irte);
+ x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id);
- msg->address_hi = MSI_ADDR_BASE_HI;
- msg->data = sub_handle;
- msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
- MSI_ADDR_IR_SHV |
- MSI_ADDR_IR_INDEX1(ir_index) |
- MSI_ADDR_IR_INDEX2(ir_index);
- } else {
- if (x2apic_enabled())
- msg->address_hi = MSI_ADDR_BASE_HI |
- MSI_ADDR_EXT_DEST_ID(dest);
- else
- msg->address_hi = MSI_ADDR_BASE_HI;
-
- msg->address_lo =
- MSI_ADDR_BASE_LO |
- ((apic->irq_dest_mode == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL:
- MSI_ADDR_DEST_MODE_LOGICAL) |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_ADDR_REDIRECTION_CPU:
- MSI_ADDR_REDIRECTION_LOWPRI) |
- MSI_ADDR_DEST_ID(dest);
-
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED:
- MSI_DATA_DELIVERY_LOWPRI) |
- MSI_DATA_VECTOR(cfg->vector);
- }
- return err;
+ return 0;
}
-#ifdef CONFIG_SMP
static int
msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
{
struct irq_cfg *cfg = data->chip_data;
struct msi_msg msg;
unsigned int dest;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
__get_cached_msi_msg(data->msi_desc, &msg);
@@ -3209,9 +3016,8 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
__write_msi_msg(data->msi_desc, &msg);
- return 0;
+ return IRQ_SET_MASK_OK_NOCOPY;
}
-#endif /* CONFIG_SMP */
/*
* IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
@@ -3222,56 +3028,32 @@ static struct irq_chip msi_chip = {
.irq_unmask = unmask_msi_irq,
.irq_mask = mask_msi_irq,
.irq_ack = ack_apic_edge,
-#ifdef CONFIG_SMP
.irq_set_affinity = msi_set_affinity,
-#endif
.irq_retrigger = ioapic_retrigger_irq,
};
-/*
- * Map the PCI dev to the corresponding remapping hardware unit
- * and allocate 'nvec' consecutive interrupt-remapping table entries
- * in it.
- */
-static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
-{
- struct intel_iommu *iommu;
- int index;
-
- iommu = map_dev_to_ir(dev);
- if (!iommu) {
- printk(KERN_ERR
- "Unable to map PCI %s to iommu\n", pci_name(dev));
- return -ENOENT;
- }
-
- index = alloc_irte(iommu, irq, nvec);
- if (index < 0) {
- printk(KERN_ERR
- "Unable to allocate %d IRTE for PCI %s\n", nvec,
- pci_name(dev));
- return -ENOSPC;
- }
- return index;
-}
-
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+ unsigned int irq_base, unsigned int irq_offset)
{
struct irq_chip *chip = &msi_chip;
struct msi_msg msg;
+ unsigned int irq = irq_base + irq_offset;
int ret;
ret = msi_compose_msg(dev, irq, &msg, -1);
if (ret < 0)
return ret;
- irq_set_msi_desc(irq, msidesc);
- write_msi_msg(irq, &msg);
+ irq_set_msi_desc_off(irq_base, irq_offset, msidesc);
- if (irq_remapped(irq_get_chip_data(irq))) {
- irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
- irq_remap_modify_chip_defaults(chip);
- }
+ /*
+ * MSI-X message is written per-IRQ, the offset is always 0.
+ * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
+ */
+ if (!irq_offset)
+ write_msi_msg(irq, &msg);
+
+ setup_remapped_irq(irq, irq_get_chip_data(irq), chip);
irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
@@ -3282,69 +3064,37 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
- int node, ret, sub_handle, index = 0;
- unsigned int irq, irq_want;
struct msi_desc *msidesc;
- struct intel_iommu *iommu = NULL;
+ unsigned int irq;
+ int node, ret;
- /* x86 doesn't support multiple MSI yet */
+ /* Multiple MSI vectors only supported with interrupt remapping */
if (type == PCI_CAP_ID_MSI && nvec > 1)
return 1;
node = dev_to_node(&dev->dev);
- irq_want = nr_irqs_gsi;
- sub_handle = 0;
- list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = create_irq_nr(irq_want, node);
- if (irq == 0)
- return -1;
- irq_want = irq + 1;
- if (!intr_remapping_enabled)
- goto no_ir;
- if (!sub_handle) {
- /*
- * allocate the consecutive block of IRTE's
- * for 'nvec'
- */
- index = msi_alloc_irte(dev, irq, nvec);
- if (index < 0) {
- ret = index;
- goto error;
- }
- } else {
- iommu = map_dev_to_ir(dev);
- if (!iommu) {
- ret = -ENOENT;
- goto error;
- }
- /*
- * setup the mapping between the irq and the IRTE
- * base index, the sub_handle pointing to the
- * appropriate interrupt remap table entry.
- */
- set_irte_irq(irq, iommu, index, sub_handle);
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ irq = irq_alloc_hwirq(node);
+ if (!irq)
+ return -ENOSPC;
+
+ ret = setup_msi_irq(dev, msidesc, irq, 0);
+ if (ret < 0) {
+ irq_free_hwirq(irq);
+ return ret;
}
-no_ir:
- ret = setup_msi_irq(dev, msidesc, irq);
- if (ret < 0)
- goto error;
- sub_handle++;
+
}
return 0;
-
-error:
- destroy_irq(irq);
- return ret;
}
void native_teardown_msi_irq(unsigned int irq)
{
- destroy_irq(irq);
+ irq_free_hwirq(irq);
}
#ifdef CONFIG_DMAR_TABLE
-#ifdef CONFIG_SMP
static int
dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
@@ -3352,9 +3102,11 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
struct irq_cfg *cfg = data->chip_data;
unsigned int dest, irq = data->irq;
struct msi_msg msg;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
dmar_msi_read(irq, &msg);
@@ -3366,19 +3118,15 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
dmar_msi_write(irq, &msg);
- return 0;
+ return IRQ_SET_MASK_OK_NOCOPY;
}
-#endif /* CONFIG_SMP */
-
static struct irq_chip dmar_msi_type = {
.name = "DMAR_MSI",
.irq_unmask = dmar_msi_unmask,
.irq_mask = dmar_msi_mask,
.irq_ack = ack_apic_edge,
-#ifdef CONFIG_SMP
.irq_set_affinity = dmar_msi_set_affinity,
-#endif
.irq_retrigger = ioapic_retrigger_irq,
};
@@ -3399,16 +3147,17 @@ int arch_setup_dmar_msi(unsigned int irq)
#ifdef CONFIG_HPET_TIMER
-#ifdef CONFIG_SMP
static int hpet_msi_set_affinity(struct irq_data *data,
const struct cpumask *mask, bool force)
{
struct irq_cfg *cfg = data->chip_data;
struct msi_msg msg;
unsigned int dest;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
hpet_msi_read(data->handler_data, &msg);
@@ -3419,48 +3168,31 @@ static int hpet_msi_set_affinity(struct irq_data *data,
hpet_msi_write(data->handler_data, &msg);
- return 0;
+ return IRQ_SET_MASK_OK_NOCOPY;
}
-#endif /* CONFIG_SMP */
-
static struct irq_chip hpet_msi_type = {
.name = "HPET_MSI",
.irq_unmask = hpet_msi_unmask,
.irq_mask = hpet_msi_mask,
.irq_ack = ack_apic_edge,
-#ifdef CONFIG_SMP
.irq_set_affinity = hpet_msi_set_affinity,
-#endif
.irq_retrigger = ioapic_retrigger_irq,
};
-int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
+int default_setup_hpet_msi(unsigned int irq, unsigned int id)
{
struct irq_chip *chip = &hpet_msi_type;
struct msi_msg msg;
int ret;
- if (intr_remapping_enabled) {
- struct intel_iommu *iommu = map_hpet_to_ir(id);
- int index;
-
- if (!iommu)
- return -1;
-
- index = alloc_irte(iommu, irq, 1);
- if (index < 0)
- return -1;
- }
-
ret = msi_compose_msg(NULL, irq, &msg, id);
if (ret < 0)
return ret;
hpet_msi_write(irq_get_handler_data(irq), &msg);
irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
- if (irq_remapped(irq_get_chip_data(irq)))
- irq_remap_modify_chip_defaults(chip);
+ setup_remapped_irq(irq, irq_get_chip_data(irq), chip);
irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
return 0;
@@ -3473,8 +3205,6 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
*/
#ifdef CONFIG_HT_IRQ
-#ifdef CONFIG_SMP
-
static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
{
struct ht_irq_msg msg;
@@ -3494,30 +3224,30 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
{
struct irq_cfg *cfg = data->chip_data;
unsigned int dest;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
target_ht_irq(data->irq, dest, cfg->vector);
- return 0;
+ return IRQ_SET_MASK_OK_NOCOPY;
}
-#endif
-
static struct irq_chip ht_irq_chip = {
.name = "PCI-HT",
.irq_mask = mask_ht_irq,
.irq_unmask = unmask_ht_irq,
.irq_ack = ack_apic_edge,
-#ifdef CONFIG_SMP
.irq_set_affinity = ht_set_affinity,
-#endif
.irq_retrigger = ioapic_retrigger_irq,
};
int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
{
struct irq_cfg *cfg;
+ struct ht_irq_msg msg;
+ unsigned dest;
int err;
if (disable_apic)
@@ -3525,36 +3255,37 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
cfg = irq_cfg(irq);
err = assign_irq_vector(irq, cfg, apic->target_cpus());
- if (!err) {
- struct ht_irq_msg msg;
- unsigned dest;
+ if (err)
+ return err;
- dest = apic->cpu_mask_to_apicid_and(cfg->domain,
- apic->target_cpus());
+ err = apic->cpu_mask_to_apicid_and(cfg->domain,
+ apic->target_cpus(), &dest);
+ if (err)
+ return err;
- msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+ msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+
+ msg.address_lo =
+ HT_IRQ_LOW_BASE |
+ HT_IRQ_LOW_DEST_ID(dest) |
+ HT_IRQ_LOW_VECTOR(cfg->vector) |
+ ((apic->irq_dest_mode == 0) ?
+ HT_IRQ_LOW_DM_PHYSICAL :
+ HT_IRQ_LOW_DM_LOGICAL) |
+ HT_IRQ_LOW_RQEOI_EDGE |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ HT_IRQ_LOW_MT_FIXED :
+ HT_IRQ_LOW_MT_ARBITRATED) |
+ HT_IRQ_LOW_IRQ_MASKED;
- msg.address_lo =
- HT_IRQ_LOW_BASE |
- HT_IRQ_LOW_DEST_ID(dest) |
- HT_IRQ_LOW_VECTOR(cfg->vector) |
- ((apic->irq_dest_mode == 0) ?
- HT_IRQ_LOW_DM_PHYSICAL :
- HT_IRQ_LOW_DM_LOGICAL) |
- HT_IRQ_LOW_RQEOI_EDGE |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- HT_IRQ_LOW_MT_FIXED :
- HT_IRQ_LOW_MT_ARBITRATED) |
- HT_IRQ_LOW_IRQ_MASKED;
+ write_ht_irq_msg(irq, &msg);
- write_ht_irq_msg(irq, &msg);
+ irq_set_chip_and_handler_name(irq, &ht_irq_chip,
+ handle_edge_irq, "edge");
- irq_set_chip_and_handler_name(irq, &ht_irq_chip,
- handle_edge_irq, "edge");
+ dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
- dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
- }
- return err;
+ return 0;
}
#endif /* CONFIG_HT_IRQ */
@@ -3577,12 +3308,15 @@ int io_apic_setup_irq_pin_once(unsigned int irq, int node,
{
unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin;
int ret;
+ struct IO_APIC_route_entry orig_entry;
/* Avoid redundant programming */
if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) {
- pr_debug("Pin %d-%d already programmed\n",
- mpc_ioapic_id(ioapic_idx), pin);
- return 0;
+ pr_debug("Pin %d-%d already programmed\n", mpc_ioapic_id(ioapic_idx), pin);
+ orig_entry = ioapic_read_entry(attr->ioapic, pin);
+ if (attr->trigger == orig_entry.trigger && attr->polarity == orig_entry.polarity)
+ return 0;
+ return -EBUSY;
}
ret = io_apic_setup_irq_pin(irq, node, attr);
if (!ret)
@@ -3617,9 +3351,9 @@ static void __init probe_nr_irqs_gsi(void)
printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
}
-int get_nr_irqs_gsi(void)
+unsigned int arch_dynirq_lower_bound(unsigned int from)
{
- return nr_irqs_gsi;
+ return from < nr_irqs_gsi ? nr_irqs_gsi : from;
}
int __init arch_probe_nr_irqs(void)
@@ -3722,7 +3456,8 @@ static int __init io_apic_get_unique_id(int ioapic, int apic_id)
/* Sanity check */
if (reg_00.bits.ID != apic_id) {
- printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+ pr_err("IOAPIC[%d]: Unable to change apic_id!\n",
+ ioapic);
return -1;
}
}
@@ -3828,10 +3563,7 @@ void __init setup_ioapic_dest(void)
else
mask = apic->target_cpus();
- if (intr_remapping_enabled)
- ir_ioapic_set_affinity(idata, mask, false);
- else
- ioapic_set_affinity(idata, mask, false);
+ x86_io_apic_ops.set_affinity(idata, mask, false);
}
}
@@ -3871,7 +3603,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
return res;
}
-void __init ioapic_and_gsi_init(void)
+void __init native_io_apic_init_mappings(void)
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
struct resource *ioapic_res;
@@ -3967,15 +3699,33 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
static __init int bad_ioapic(unsigned long address)
{
if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
- "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
+ pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n",
+ MAX_IO_APICS, nr_ioapics);
return 1;
}
if (!address) {
- printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address"
- " found in table, skipping!\n");
+ pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n");
+ return 1;
+ }
+ return 0;
+}
+
+static __init int bad_ioapic_register(int idx)
+{
+ union IO_APIC_reg_00 reg_00;
+ union IO_APIC_reg_01 reg_01;
+ union IO_APIC_reg_02 reg_02;
+
+ reg_00.raw = io_apic_read(idx, 0);
+ reg_01.raw = io_apic_read(idx, 1);
+ reg_02.raw = io_apic_read(idx, 2);
+
+ if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) {
+ pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n",
+ mpc_ioapic_addr(idx));
return 1;
}
+
return 0;
}
@@ -3995,6 +3745,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
ioapics[idx].mp_config.apicaddr = address;
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+
+ if (bad_ioapic_register(idx)) {
+ clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+ return;
+ }
+
ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
@@ -4015,10 +3771,10 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
if (gsi_cfg->gsi_end >= gsi_top)
gsi_top = gsi_cfg->gsi_end + 1;
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
- "GSI %d-%d\n", idx, mpc_ioapic_id(idx),
- mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
- gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+ pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n",
+ idx, mpc_ioapic_id(idx),
+ mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+ gsi_cfg->gsi_base, gsi_cfg->gsi_end);
nr_ioapics++;
}
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index cce91bf2667..62071569bd5 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -1,6 +1,5 @@
#include <linux/cpumask.h>
#include <linux/interrupt.h>
-#include <linux/init.h>
#include <linux/mm.h>
#include <linux/delay.h>
@@ -106,7 +105,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
unsigned long mask = cpumask_bits(cpumask)[0];
unsigned long flags;
- if (WARN_ONCE(!mask, "empty IPI mask"))
+ if (!mask)
return;
local_irq_save(flags);
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
deleted file mode 100644
index c4a61ca1349..00000000000
--- a/arch/x86/kernel/apic/numaq_32.c
+++ /dev/null
@@ -1,541 +0,0 @@
-/*
- * Written by: Patricia Gaughen, IBM Corporation
- *
- * Copyright (C) 2002, IBM Corp.
- * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <gone@us.ibm.com>
- */
-#include <linux/nodemask.h>
-#include <linux/topology.h>
-#include <linux/bootmem.h>
-#include <linux/memblock.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/kernel.h>
-#include <linux/mmzone.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/numa.h>
-#include <linux/smp.h>
-#include <linux/io.h>
-#include <linux/mm.h>
-
-#include <asm/processor.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/numaq.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-#include <asm/e820.h>
-#include <asm/ipi.h>
-
-int found_numaq;
-
-/*
- * Have to match translation table entries to main table entries by counter
- * hence the mpc_record variable .... can't see a less disgusting way of
- * doing this ....
- */
-struct mpc_trans {
- unsigned char mpc_type;
- unsigned char trans_len;
- unsigned char trans_type;
- unsigned char trans_quad;
- unsigned char trans_global;
- unsigned char trans_local;
- unsigned short trans_reserved;
-};
-
-static int mpc_record;
-
-static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
-
-int mp_bus_id_to_node[MAX_MP_BUSSES];
-int mp_bus_id_to_local[MAX_MP_BUSSES];
-int quad_local_to_mp_bus_id[NR_CPUS/4][4];
-
-
-static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
-{
- struct eachquadmem *eq = scd->eq + node;
- u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20;
- u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20;
- int ret;
-
- node_set(node, numa_nodes_parsed);
- ret = numa_add_memblk(node, start, end);
- BUG_ON(ret < 0);
-}
-
-/*
- * Function: smp_dump_qct()
- *
- * Description: gets memory layout from the quad config table. This
- * function also updates numa_nodes_parsed with the nodes (quads) present.
- */
-static void __init smp_dump_qct(void)
-{
- struct sys_cfg_data *scd;
- int node;
-
- scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
-
- for_each_node(node) {
- if (scd->quads_present31_0 & (1 << node))
- numaq_register_node(node, scd);
- }
-}
-
-void __cpuinit numaq_tsc_disable(void)
-{
- if (!found_numaq)
- return;
-
- if (num_online_nodes() > 1) {
- printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
- setup_clear_cpu_cap(X86_FEATURE_TSC);
- }
-}
-
-static void __init numaq_tsc_init(void)
-{
- numaq_tsc_disable();
-}
-
-static inline int generate_logical_apicid(int quad, int phys_apicid)
-{
- return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
-}
-
-/* x86_quirks member */
-static int mpc_apic_id(struct mpc_cpu *m)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int logical_apicid = generate_logical_apicid(quad, m->apicid);
-
- printk(KERN_DEBUG
- "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
- m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
- (m->cpufeature & CPU_MODEL_MASK) >> 4,
- m->apicver, quad, logical_apicid);
-
- return logical_apicid;
-}
-
-/* x86_quirks member */
-static void mpc_oem_bus_info(struct mpc_bus *m, char *name)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int local = translation_table[mpc_record]->trans_local;
-
- mp_bus_id_to_node[m->busid] = quad;
- mp_bus_id_to_local[m->busid] = local;
-
- printk(KERN_INFO "Bus #%d is %s (node %d)\n", m->busid, name, quad);
-}
-
-/* x86_quirks member */
-static void mpc_oem_pci_bus(struct mpc_bus *m)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int local = translation_table[mpc_record]->trans_local;
-
- quad_local_to_mp_bus_id[quad][local] = m->busid;
-}
-
-/*
- * Called from mpparse code.
- * mode = 0: prescan
- * mode = 1: one mpc entry scanned
- */
-static void numaq_mpc_record(unsigned int mode)
-{
- if (!mode)
- mpc_record = 0;
- else
- mpc_record++;
-}
-
-static void __init MP_translation_info(struct mpc_trans *m)
-{
- printk(KERN_INFO
- "Translation: record %d, type %d, quad %d, global %d, local %d\n",
- mpc_record, m->trans_type, m->trans_quad, m->trans_global,
- m->trans_local);
-
- if (mpc_record >= MAX_MPC_ENTRY)
- printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
- else
- translation_table[mpc_record] = m; /* stash this for later */
-
- if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
- node_set_online(m->trans_quad);
-}
-
-static int __init mpf_checksum(unsigned char *mp, int len)
-{
- int sum = 0;
-
- while (len--)
- sum += *mp++;
-
- return sum & 0xFF;
-}
-
-/*
- * Read/parse the MPC oem tables
- */
-static void __init smp_read_mpc_oem(struct mpc_table *mpc)
-{
- struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr;
- int count = sizeof(*oemtable); /* the header size */
- unsigned char *oemptr = ((unsigned char *)oemtable) + count;
-
- mpc_record = 0;
- printk(KERN_INFO
- "Found an OEM MPC table at %8p - parsing it...\n", oemtable);
-
- if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
- printk(KERN_WARNING
- "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
- oemtable->signature[0], oemtable->signature[1],
- oemtable->signature[2], oemtable->signature[3]);
- return;
- }
-
- if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) {
- printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
- return;
- }
-
- while (count < oemtable->length) {
- switch (*oemptr) {
- case MP_TRANSLATION:
- {
- struct mpc_trans *m = (void *)oemptr;
-
- MP_translation_info(m);
- oemptr += sizeof(*m);
- count += sizeof(*m);
- ++mpc_record;
- break;
- }
- default:
- printk(KERN_WARNING
- "Unrecognised OEM table entry type! - %d\n",
- (int)*oemptr);
- return;
- }
- }
-}
-
-static __init void early_check_numaq(void)
-{
- /*
- * get boot-time SMP configuration:
- */
- if (smp_found_config)
- early_get_smp_config();
-
- if (found_numaq) {
- x86_init.mpparse.mpc_record = numaq_mpc_record;
- x86_init.mpparse.setup_ioapic_ids = x86_init_noop;
- x86_init.mpparse.mpc_apic_id = mpc_apic_id;
- x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem;
- x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
- x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
- x86_init.timers.tsc_pre_init = numaq_tsc_init;
- x86_init.pci.init = pci_numaq_init;
- }
-}
-
-int __init numaq_numa_init(void)
-{
- early_check_numaq();
- if (!found_numaq)
- return -ENOENT;
- smp_dump_qct();
-
- return 0;
-}
-
-#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
-
-static inline unsigned int numaq_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0x0F;
-}
-
-static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_logical(mask, vector);
-}
-
-static inline void numaq_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
-}
-
-static inline void numaq_send_IPI_all(int vector)
-{
- numaq_send_IPI_mask(cpu_online_mask, vector);
-}
-
-#define NUMAQ_TRAMPOLINE_PHYS_LOW (0x8)
-#define NUMAQ_TRAMPOLINE_PHYS_HIGH (0xa)
-
-/*
- * Because we use NMIs rather than the INIT-STARTUP sequence to
- * bootstrap the CPUs, the APIC may be in a weird state. Kick it:
- */
-static inline void numaq_smp_callin_clear_local_apic(void)
-{
- clear_local_APIC();
-}
-
-static inline const struct cpumask *numaq_target_cpus(void)
-{
- return cpu_all_mask;
-}
-
-static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return physid_isset(apicid, *map);
-}
-
-static inline unsigned long numaq_check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
-static inline int numaq_apic_id_registered(void)
-{
- return 1;
-}
-
-static inline void numaq_init_apic_ldr(void)
-{
- /* Already done in NUMA-Q firmware */
-}
-
-static inline void numaq_setup_apic_routing(void)
-{
- printk(KERN_INFO
- "Enabling APIC mode: NUMA-Q. Using %d I/O APICs\n",
- nr_ioapics);
-}
-
-/*
- * Skip adding the timer int on secondary nodes, which causes
- * a small but painful rift in the time-space continuum.
- */
-static inline int numaq_multi_timer_check(int apic, int irq)
-{
- return apic != 0 && irq == 0;
-}
-
-static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
-{
- /* We don't have a good way to do this yet - hack */
- return physids_promote(0xFUL, retmap);
-}
-
-/*
- * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
- * cpu to APIC ID relation to properly interact with the intelligent
- * mode of the cluster controller.
- */
-static inline int numaq_cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < 60)
- return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
- else
- return BAD_APICID;
-}
-
-static inline int numaq_apicid_to_node(int logical_apicid)
-{
- return logical_apicid >> 4;
-}
-
-static int numaq_numa_cpu_node(int cpu)
-{
- int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
-
- if (logical_apicid != BAD_APICID)
- return numaq_apicid_to_node(logical_apicid);
- return NUMA_NO_NODE;
-}
-
-static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
-{
- int node = numaq_apicid_to_node(logical_apicid);
- int cpu = __ffs(logical_apicid & 0xf);
-
- physid_set_mask_of_physid(cpu + 4*node, retmap);
-}
-
-/* Where the IO area was mapped on multiquad, always 0 otherwise */
-void *xquad_portio;
-
-static inline int numaq_check_phys_apicid_present(int phys_apicid)
-{
- return 1;
-}
-
-/*
- * We use physical apicids here, not logical, so just return the default
- * physical broadcast to stop people from breaking us
- */
-static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- return 0x0F;
-}
-
-static inline unsigned int
-numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- return 0x0F;
-}
-
-/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
-static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-static int
-numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
-{
- if (strncmp(oem, "IBM NUMA", 8))
- printk(KERN_ERR "Warning! Not a NUMA-Q system!\n");
- else
- found_numaq = 1;
-
- return found_numaq;
-}
-
-static int probe_numaq(void)
-{
- /* already know from get_memcfg_numaq() */
- return found_numaq;
-}
-
-static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt desitination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
-static void numaq_setup_portio_remap(void)
-{
- int num_quads = num_online_nodes();
-
- if (num_quads <= 1)
- return;
-
- printk(KERN_INFO
- "Remapping cross-quad port I/O for %d quads\n", num_quads);
-
- xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
-
- printk(KERN_INFO
- "xquad_portio vaddr 0x%08lx, len %08lx\n",
- (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
-}
-
-/* Use __refdata to keep false positive warning calm. */
-static struct apic __refdata apic_numaq = {
-
- .name = "NUMAQ",
- .probe = probe_numaq,
- .acpi_madt_oem_check = NULL,
- .apic_id_registered = numaq_apic_id_registered,
-
- .irq_delivery_mode = dest_LowestPrio,
- /* physical delivery on LOCAL quad: */
- .irq_dest_mode = 0,
-
- .target_cpus = numaq_target_cpus,
- .disable_esr = 1,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = numaq_check_apicid_used,
- .check_apicid_present = numaq_check_apicid_present,
-
- .vector_allocation_domain = numaq_vector_allocation_domain,
- .init_apic_ldr = numaq_init_apic_ldr,
-
- .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
- .setup_apic_routing = numaq_setup_apic_routing,
- .multi_timer_check = numaq_multi_timer_check,
- .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
- .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
- .setup_portio_remap = numaq_setup_portio_remap,
- .check_phys_apicid_present = numaq_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = numaq_phys_pkg_id,
- .mps_oem_check = numaq_mps_oem_check,
-
- .get_apic_id = numaq_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0x0F << 24,
-
- .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = numaq_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = numaq_send_IPI_allbutself,
- .send_IPI_all = numaq_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi,
- .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH,
-
- /* We don't do anything here because we use NMI's to boot instead */
- .wait_for_init_deassert = NULL,
-
- .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic,
- .inquire_remote_apic = NULL,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-
- .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
- .x86_32_numa_cpu_node = numaq_numa_cpu_node,
-};
-
-apic_driver(apic_numaq);
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0787bb3412f..cceb352c968 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -66,21 +66,6 @@ static void setup_apic_flat_routing(void)
#endif
}
-static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /*
- * Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt desitination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
/* should be called last. */
static int probe_default(void)
{
@@ -92,6 +77,7 @@ static struct apic apic_default = {
.name = "default",
.probe = probe_default,
.acpi_madt_oem_check = NULL,
+ .apic_id_valid = default_apic_id_valid,
.apic_id_registered = default_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
@@ -104,7 +90,7 @@ static struct apic apic_default = {
.check_apicid_used = default_check_apicid_used,
.check_apicid_present = default_check_apicid_present,
- .vector_allocation_domain = default_vector_allocation_domain,
+ .vector_allocation_domain = flat_vector_allocation_domain,
.init_apic_ldr = default_init_apic_ldr,
.ioapic_phys_id_map = default_ioapic_phys_id_map,
@@ -122,8 +108,7 @@ static struct apic apic_default = {
.set_apic_id = NULL,
.apic_id_mask = 0x0F << 24,
- .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
.send_IPI_mask = default_send_IPI_mask_logical,
.send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
@@ -134,13 +119,13 @@ static struct apic apic_default = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
+ .wait_for_init_deassert = true,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
@@ -206,6 +191,9 @@ void __init default_setup_apic_routing(void)
if (apic->setup_apic_routing)
apic->setup_apic_routing();
+
+ if (x86_platform.apic_post_init)
+ x86_platform.apic_post_init();
}
void __init generic_apic_probe(void)
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 3fe98669892..1793dba7a74 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,11 +23,6 @@
#include <asm/ipi.h>
#include <asm/setup.h>
-static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
-{
- return hard_smp_processor_id() >> index_msb;
-}
-
/*
* Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
*/
@@ -48,10 +43,8 @@ void __init default_setup_apic_routing(void)
}
}
- if (is_vsmp_box()) {
- /* need to update phys_pkg_id */
- apic->phys_pkg_id = apicid_phys_pkg_id;
- }
+ if (x86_platform.apic_post_init)
+ x86_platform.apic_post_init();
}
/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
deleted file mode 100644
index 19114423c58..00000000000
--- a/arch/x86/kernel/apic/summit_32.c
+++ /dev/null
@@ -1,556 +0,0 @@
-/*
- * IBM Summit-Specific Code
- *
- * Written By: Matthew Dobson, IBM Corporation
- *
- * Copyright (c) 2003 IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- *
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <asm/io.h>
-#include <asm/bios_ebda.h>
-
-/*
- * APIC driver for the IBM "Summit" chipset.
- */
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <asm/smp.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <asm/ipi.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/gfp.h>
-#include <linux/smp.h>
-
-static unsigned summit_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0xFF;
-}
-
-static inline void summit_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_logical(mask, vector);
-}
-
-static void summit_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
-}
-
-static void summit_send_IPI_all(int vector)
-{
- summit_send_IPI_mask(cpu_online_mask, vector);
-}
-
-#include <asm/tsc.h>
-
-extern int use_cyclone;
-
-#ifdef CONFIG_X86_SUMMIT_NUMA
-static void setup_summit(void);
-#else
-static inline void setup_summit(void) {}
-#endif
-
-static int summit_mps_oem_check(struct mpc_table *mpc, char *oem,
- char *productid)
-{
- if (!strncmp(oem, "IBM ENSW", 8) &&
- (!strncmp(productid, "VIGIL SMP", 9)
- || !strncmp(productid, "EXA", 3)
- || !strncmp(productid, "RUTHLESS SMP", 12))){
- mark_tsc_unstable("Summit based system");
- use_cyclone = 1; /*enable cyclone-timer*/
- setup_summit();
- return 1;
- }
- return 0;
-}
-
-/* Hook from generic ACPI tables.c */
-static int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- if (!strncmp(oem_id, "IBM", 3) &&
- (!strncmp(oem_table_id, "SERVIGIL", 8)
- || !strncmp(oem_table_id, "EXA", 3))){
- mark_tsc_unstable("Summit based system");
- use_cyclone = 1; /*enable cyclone-timer*/
- setup_summit();
- return 1;
- }
- return 0;
-}
-
-struct rio_table_hdr {
- unsigned char version; /* Version number of this data structure */
- /* Version 3 adds chassis_num & WP_index */
- unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
- unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
-} __attribute__((packed));
-
-struct scal_detail {
- unsigned char node_id; /* Scalability Node ID */
- unsigned long CBAR; /* Address of 1MB register space */
- unsigned char port0node; /* Node ID port connected to: 0xFF=None */
- unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port1node; /* Node ID port connected to: 0xFF = None */
- unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port2node; /* Node ID port connected to: 0xFF = None */
- unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
-} __attribute__((packed));
-
-struct rio_detail {
- unsigned char node_id; /* RIO Node ID */
- unsigned long BBAR; /* Address of 1MB register space */
- unsigned char type; /* Type of device */
- unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
- /* For CYC: Node ID of Twister that owns this CYC */
- unsigned char port0node; /* Node ID port connected to: 0xFF=None */
- unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port1node; /* Node ID port connected to: 0xFF=None */
- unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
- /* For CYC: 0 */
- unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
- /* = 0 : the XAPIC is not used, ie:*/
- /* ints fwded to another XAPIC */
- /* Bits1:7 Reserved */
- /* For CYC: Bits0:7 Reserved */
- unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
- /* lower slot numbers/PCI bus numbers */
- /* For CYC: No meaning */
- unsigned char chassis_num; /* 1 based Chassis number */
- /* For LookOut WPEGs this field indicates the */
- /* Expansion Chassis #, enumerated from Boot */
- /* Node WPEG external port, then Boot Node CYC */
- /* external port, then Next Vigil chassis WPEG */
- /* external port, etc. */
- /* Shared Lookouts have only 1 chassis number (the */
- /* first one assigned) */
-} __attribute__((packed));
-
-
-typedef enum {
- CompatTwister = 0, /* Compatibility Twister */
- AltTwister = 1, /* Alternate Twister of internal 8-way */
- CompatCyclone = 2, /* Compatibility Cyclone */
- AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
- CompatWPEG = 4, /* Compatibility WPEG */
- AltWPEG = 5, /* Second Planar WPEG */
- LookOutAWPEG = 6, /* LookOut WPEG */
- LookOutBWPEG = 7, /* LookOut WPEG */
-} node_type;
-
-static inline int is_WPEG(struct rio_detail *rio){
- return (rio->type == CompatWPEG || rio->type == AltWPEG ||
- rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
-}
-
-#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
-
-static const struct cpumask *summit_target_cpus(void)
-{
- /* CPU_MASK_ALL (0xff) has undefined behaviour with
- * dest_LowestPrio mode logical clustered apic interrupt routing
- * Just start on cpu 0. IRQ balancing will spread load
- */
- return cpumask_of(0);
-}
-
-static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return 0;
-}
-
-/* we don't use the phys_cpu_present_map to indicate apicid presence */
-static unsigned long summit_check_apicid_present(int bit)
-{
- return 1;
-}
-
-static int summit_early_logical_apicid(int cpu)
-{
- int count = 0;
- u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu);
- u8 my_cluster = APIC_CLUSTER(my_id);
-#ifdef CONFIG_SMP
- u8 lid;
- int i;
-
- /* Create logical APIC IDs by counting CPUs already in cluster. */
- for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
- lid = early_per_cpu(x86_cpu_to_logical_apicid, i);
- if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
- ++count;
- }
-#endif
- /* We only have a 4 wide bitmap in cluster mode. If a deranged
- * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
- BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
- return my_cluster | (1UL << count);
-}
-
-static void summit_init_apic_ldr(void)
-{
- int cpu = smp_processor_id();
- unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
- unsigned long val;
-
- apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(id);
- apic_write(APIC_LDR, val);
-}
-
-static int summit_apic_id_registered(void)
-{
- return 1;
-}
-
-static void summit_setup_apic_routing(void)
-{
- printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
- nr_ioapics);
-}
-
-static int summit_cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < nr_cpu_ids)
- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
- else
- return BAD_APICID;
-}
-
-static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- physids_promote(0x0FL, retmap);
-}
-
-static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
-{
- physid_set_mask_of_physid(0, retmap);
-}
-
-static int summit_check_phys_apicid_present(int physical_apicid)
-{
- return 1;
-}
-
-static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- unsigned int round = 0;
- int cpu, apicid = 0;
-
- /*
- * The cpus in the mask must all be on the apic cluster.
- */
- for_each_cpu(cpu, cpumask) {
- int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
-
- if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
- printk("%s: Not a valid mask!\n", __func__);
- return BAD_APICID;
- }
- apicid |= new_apicid;
- round++;
- }
- return apicid;
-}
-
-static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
- const struct cpumask *andmask)
-{
- int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
- cpumask_var_t cpumask;
-
- if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
- return apicid;
-
- cpumask_and(cpumask, inmask, andmask);
- cpumask_and(cpumask, cpumask, cpu_online_mask);
- apicid = summit_cpu_mask_to_apicid(cpumask);
-
- free_cpumask_var(cpumask);
-
- return apicid;
-}
-
-/*
- * cpuid returns the value latched in the HW at reset, not the APIC ID
- * register's value. For any box whose BIOS changes APIC IDs, like
- * clustered APIC systems, we must use hard_smp_processor_id.
- *
- * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
- */
-static int summit_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return hard_smp_processor_id() >> index_msb;
-}
-
-static int probe_summit(void)
-{
- /* probed later in mptable/ACPI hooks */
- return 0;
-}
-
-static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt desitination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
-#ifdef CONFIG_X86_SUMMIT_NUMA
-static struct rio_table_hdr *rio_table_hdr;
-static struct scal_detail *scal_devs[MAX_NUMNODES];
-static struct rio_detail *rio_devs[MAX_NUMNODES*4];
-
-#ifndef CONFIG_X86_NUMAQ
-static int mp_bus_id_to_node[MAX_MP_BUSSES];
-#endif
-
-static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
-{
- int twister = 0, node = 0;
- int i, bus, num_buses;
-
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
- if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) {
- twister = rio_devs[i]->owner_id;
- break;
- }
- }
- if (i == rio_table_hdr->num_rio_dev) {
- printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
- return last_bus;
- }
-
- for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
- if (scal_devs[i]->node_id == twister) {
- node = scal_devs[i]->node_id;
- break;
- }
- }
- if (i == rio_table_hdr->num_scal_dev) {
- printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
- return last_bus;
- }
-
- switch (rio_devs[wpeg_num]->type) {
- case CompatWPEG:
- /*
- * The Compatibility Winnipeg controls the 2 legacy buses,
- * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
- * a PCI-PCI bridge card is used in either slot: total 5 buses.
- */
- num_buses = 5;
- break;
- case AltWPEG:
- /*
- * The Alternate Winnipeg controls the 2 133MHz buses [1 slot
- * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
- * the "extra" buses for each of those slots: total 7 buses.
- */
- num_buses = 7;
- break;
- case LookOutAWPEG:
- case LookOutBWPEG:
- /*
- * A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
- * & the "extra" buses for each of those slots: total 9 buses.
- */
- num_buses = 9;
- break;
- default:
- printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
- return last_bus;
- }
-
- for (bus = last_bus; bus < last_bus + num_buses; bus++)
- mp_bus_id_to_node[bus] = node;
- return bus;
-}
-
-static int build_detail_arrays(void)
-{
- unsigned long ptr;
- int i, scal_detail_size, rio_detail_size;
-
- if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
- printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
- return 0;
- }
-
- switch (rio_table_hdr->version) {
- default:
- printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version);
- return 0;
- case 2:
- scal_detail_size = 11;
- rio_detail_size = 13;
- break;
- case 3:
- scal_detail_size = 12;
- rio_detail_size = 15;
- break;
- }
-
- ptr = (unsigned long)rio_table_hdr + 3;
- for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
- scal_devs[i] = (struct scal_detail *)ptr;
-
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
- rio_devs[i] = (struct rio_detail *)ptr;
-
- return 1;
-}
-
-void setup_summit(void)
-{
- unsigned long ptr;
- unsigned short offset;
- int i, next_wpeg, next_bus = 0;
-
- /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
- ptr = get_bios_ebda();
- ptr = (unsigned long)phys_to_virt(ptr);
-
- rio_table_hdr = NULL;
- offset = 0x180;
- while (offset) {
- /* The block id is stored in the 2nd word */
- if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) {
- /* set the pointer past the offset & block id */
- rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
- break;
- }
- /* The next offset is stored in the 1st word. 0 means no more */
- offset = *((unsigned short *)(ptr + offset));
- }
- if (!rio_table_hdr) {
- printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__);
- return;
- }
-
- if (!build_detail_arrays())
- return;
-
- /* The first Winnipeg we're looking for has an index of 0 */
- next_wpeg = 0;
- do {
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
- if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) {
- /* It's the Winnipeg we're looking for! */
- next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
- next_wpeg++;
- break;
- }
- }
- /*
- * If we go through all Rio devices and don't find one with
- * the next index, it means we've found all the Winnipegs,
- * and thus all the PCI buses.
- */
- if (i == rio_table_hdr->num_rio_dev)
- next_wpeg = 0;
- } while (next_wpeg != 0);
-}
-#endif
-
-static struct apic apic_summit = {
-
- .name = "summit",
- .probe = probe_summit,
- .acpi_madt_oem_check = summit_acpi_madt_oem_check,
- .apic_id_registered = summit_apic_id_registered,
-
- .irq_delivery_mode = dest_LowestPrio,
- /* logical delivery broadcast to all CPUs: */
- .irq_dest_mode = 1,
-
- .target_cpus = summit_target_cpus,
- .disable_esr = 1,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = summit_check_apicid_used,
- .check_apicid_present = summit_check_apicid_present,
-
- .vector_allocation_domain = summit_vector_allocation_domain,
- .init_apic_ldr = summit_init_apic_ldr,
-
- .ioapic_phys_id_map = summit_ioapic_phys_id_map,
- .setup_apic_routing = summit_setup_apic_routing,
- .multi_timer_check = NULL,
- .cpu_present_to_apicid = summit_cpu_present_to_apicid,
- .apicid_to_cpu_present = summit_apicid_to_cpu_present,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = summit_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = summit_phys_pkg_id,
- .mps_oem_check = summit_mps_oem_check,
-
- .get_apic_id = summit_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
-
- .cpu_mask_to_apicid = summit_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = summit_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = summit_send_IPI_allbutself,
- .send_IPI_all = summit_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-
- .x86_32_early_logical_apicid = summit_early_logical_apicid,
-};
-
-apic_driver(apic_summit);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 50079587582..e66766bf164 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -3,7 +3,6 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
-#include <linux/init.h>
#include <linux/dmar.h>
#include <linux/cpu.h>
@@ -81,7 +80,7 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
}
static void
- x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
@@ -96,36 +95,37 @@ static void x2apic_send_IPI_all(int vector)
__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
}
-static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
+static int
+x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask,
+ unsigned int *apicid)
{
- /*
- * We're using fixed IRQ delivery, can only return one logical APIC ID.
- * May as well be the first.
- */
- int cpu = cpumask_first(cpumask);
+ u32 dest = 0;
+ u16 cluster;
+ int i;
- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
- else
- return BAD_APICID;
-}
+ for_each_cpu_and(i, cpumask, andmask) {
+ if (!cpumask_test_cpu(i, cpu_online_mask))
+ continue;
+ dest = per_cpu(x86_cpu_to_logical_apicid, i);
+ cluster = x2apic_cluster(i);
+ break;
+ }
-static unsigned int
-x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- int cpu;
+ if (!dest)
+ return -EINVAL;
- /*
- * We're using fixed IRQ delivery, can only return one logical APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
+ for_each_cpu_and(i, cpumask, andmask) {
+ if (!cpumask_test_cpu(i, cpu_online_mask))
+ continue;
+ if (cluster != x2apic_cluster(i))
+ continue;
+ dest |= per_cpu(x86_cpu_to_logical_apicid, i);
}
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ *apicid = dest;
+
+ return 0;
}
static void init_x2apic_ldr(void)
@@ -147,7 +147,7 @@ static void init_x2apic_ldr(void)
/*
* At CPU state changes, update the x2apic cluster sibling info.
*/
-static int __cpuinit
+static int
update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
unsigned int this_cpu = (unsigned long)hcpu;
@@ -208,23 +208,50 @@ static int x2apic_cluster_probe(void)
return 0;
}
+static const struct cpumask *x2apic_cluster_target_cpus(void)
+{
+ return cpu_all_mask;
+}
+
+/*
+ * Each x2apic cluster is an allocation domain.
+ */
+static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask,
+ const struct cpumask *mask)
+{
+ /*
+ * To minimize vector pressure, default case of boot, device bringup
+ * etc will use a single cpu for the interrupt destination.
+ *
+ * On explicit migration requests coming from irqbalance etc,
+ * interrupts will be routed to the x2apic cluster (cluster-id
+ * derived from the first cpu in the mask) members specified
+ * in the mask.
+ */
+ if (mask == x2apic_cluster_target_cpus())
+ cpumask_copy(retmask, cpumask_of(cpu));
+ else
+ cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
+}
+
static struct apic apic_x2apic_cluster = {
.name = "cluster x2apic",
.probe = x2apic_cluster_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .apic_id_valid = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio,
.irq_dest_mode = 1, /* logical */
- .target_cpus = x2apic_target_cpus,
+ .target_cpus = x2apic_cluster_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
.check_apicid_present = NULL,
- .vector_allocation_domain = x2apic_vector_allocation_domain,
+ .vector_allocation_domain = cluster_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,
.ioapic_phys_id_map = NULL,
@@ -242,7 +269,6 @@ static struct apic apic_x2apic_cluster = {
.set_apic_id = x2apic_set_apic_id,
.apic_id_mask = 0xFFFFFFFFu,
- .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
.cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
.send_IPI_mask = x2apic_send_IPI_mask,
@@ -253,12 +279,13 @@ static struct apic apic_x2apic_cluster = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index f5373dfde21..6d600ebf6c1 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -3,7 +3,6 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
-#include <linux/init.h>
#include <linux/dmar.h>
#include <asm/smp.h>
@@ -20,12 +19,19 @@ static int set_x2apic_phys_mode(char *arg)
}
early_param("x2apic_phys", set_x2apic_phys_mode);
+static bool x2apic_fadt_phys(void)
+{
+ if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
+ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
+ printk(KERN_DEBUG "System requires x2apic physical mode\n");
+ return true;
+ }
+ return false;
+}
+
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- if (x2apic_phys)
- return x2apic_enabled();
- else
- return 0;
+ return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys());
}
static void
@@ -70,45 +76,13 @@ static void x2apic_send_IPI_all(int vector)
__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
}
-static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- int cpu = cpumask_first(cpumask);
-
- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
- else
- return BAD_APICID;
-}
-
-static unsigned int
-x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
-
- return per_cpu(x86_cpu_to_apicid, cpu);
-}
-
static void init_x2apic_ldr(void)
{
}
static int x2apic_phys_probe(void)
{
- if (x2apic_mode && x2apic_phys)
+ if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys()))
return 1;
return apic == &apic_x2apic_phys;
@@ -119,18 +93,19 @@ static struct apic apic_x2apic_phys = {
.name = "physical x2apic",
.probe = x2apic_phys_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .apic_id_valid = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 0, /* physical */
- .target_cpus = x2apic_target_cpus,
+ .target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
.check_apicid_present = NULL,
- .vector_allocation_domain = x2apic_vector_allocation_domain,
+ .vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,
.ioapic_phys_id_map = NULL,
@@ -148,8 +123,7 @@ static struct apic apic_x2apic_phys = {
.set_apic_id = x2apic_set_apic_id,
.apic_id_mask = 0xFFFFFFFFu,
- .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
@@ -159,12 +133,13 @@ static struct apic apic_x2apic_phys = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 79b05b88aa1..293b41df54e 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
*
* SGI UV APIC functions (note: not an Intel compatible APIC)
*
- * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved.
*/
#include <linux/cpumask.h>
#include <linux/hardirq.h>
@@ -25,6 +25,7 @@
#include <linux/kdebug.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
+#include <linux/reboot.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
@@ -36,27 +37,21 @@
#include <asm/ipi.h>
#include <asm/smp.h>
#include <asm/x86_init.h>
-#include <asm/emergency-restart.h>
#include <asm/nmi.h>
-/* BMC sets a bit this MMR non-zero before sending an NMI */
-#define UVH_NMI_MMR UVH_SCRATCH5
-#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8)
-#define UV_NMI_PENDING_MASK (1UL << 63)
-DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count);
-
DEFINE_PER_CPU(int, x2apic_extra_bits);
#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args)
static enum uv_system_type uv_system_type;
static u64 gru_start_paddr, gru_end_paddr;
+static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr;
+static u64 gru_dist_lmask, gru_dist_umask;
static union uvh_apicid uvh_apicid;
int uv_min_hub_revision_id;
EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
unsigned int uv_apicid_hibits;
EXPORT_SYMBOL_GPL(uv_apicid_hibits);
-static DEFINE_SPINLOCK(uv_nmi_lock);
static struct apic apic_x2apic_uv_x;
@@ -72,7 +67,20 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr)
static inline bool is_GRU_range(u64 start, u64 end)
{
- return start >= gru_start_paddr && end <= gru_end_paddr;
+ if (gru_dist_base) {
+ u64 su = start & gru_dist_umask; /* upper (incl pnode) bits */
+ u64 sl = start & gru_dist_lmask; /* base offset bits */
+ u64 eu = end & gru_dist_umask;
+ u64 el = end & gru_dist_lmask;
+
+ /* Must reside completely within a single GRU range */
+ return (sl == gru_dist_base && el == gru_dist_base &&
+ su >= gru_first_node_paddr &&
+ su <= gru_last_node_paddr &&
+ eu == su);
+ } else {
+ return start >= gru_start_paddr && end <= gru_end_paddr;
+ }
}
static bool uv_is_untracked_pat_range(u64 start, u64 end)
@@ -91,10 +99,16 @@ static int __init early_get_pnodeid(void)
m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
uv_min_hub_revision_id = node_id.s.revision;
- if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
- uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
- if (node_id.s.part_number == UV2_HUB_PART_NUMBER_X)
+ switch (node_id.s.part_number) {
+ case UV2_HUB_PART_NUMBER:
+ case UV2_HUB_PART_NUMBER_X:
uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
+ break;
+ case UV3_HUB_PART_NUMBER:
+ case UV3_HUB_PART_NUMBER_X:
+ uv_min_hub_revision_id += UV3_HUB_REVISION_BASE;
+ break;
+ }
uv_hub_info->hub_revision = uv_min_hub_revision_id;
pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
@@ -130,13 +144,16 @@ static void __init uv_set_apicid_hibit(void)
static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- int pnodeid, is_uv1, is_uv2;
+ int pnodeid, is_uv1, is_uv2, is_uv3;
is_uv1 = !strcmp(oem_id, "SGI");
is_uv2 = !strcmp(oem_id, "SGI2");
- if (is_uv1 || is_uv2) {
+ is_uv3 = !strncmp(oem_id, "SGI3", 4); /* there are varieties of UV3 */
+ if (is_uv1 || is_uv2 || is_uv3) {
uv_hub_info->hub_revision =
- is_uv1 ? UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE;
+ (is_uv1 ? UV1_HUB_REVISION_BASE :
+ (is_uv2 ? UV2_HUB_REVISION_BASE :
+ UV3_HUB_REVISION_BASE));
pnodeid = early_get_pnodeid();
early_get_apic_pnode_shift();
x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
@@ -185,18 +202,7 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
unsigned long sn_rtc_cycles_per_second;
EXPORT_SYMBOL(sn_rtc_cycles_per_second);
-static const struct cpumask *uv_target_cpus(void)
-{
- return cpu_online_mask;
-}
-
-static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
-static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
{
#ifdef CONFIG_SMP
unsigned long val;
@@ -266,34 +272,26 @@ static void uv_send_IPI_all(int vector)
uv_send_IPI_mask(cpu_online_mask, vector);
}
-static int uv_apic_id_registered(void)
+static int uv_apic_id_valid(int apicid)
{
return 1;
}
-static void uv_init_apic_ldr(void)
+static int uv_apic_id_registered(void)
{
+ return 1;
}
-static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
+static void uv_init_apic_ldr(void)
{
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- int cpu = cpumask_first(cpumask);
-
- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
- else
- return BAD_APICID;
}
-static unsigned int
+static int
uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
+ const struct cpumask *andmask,
+ unsigned int *apicid)
{
- int cpu;
+ int unsigned cpu;
/*
* We're using fixed IRQ delivery, can only return one phys APIC ID.
@@ -303,7 +301,13 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
if (cpumask_test_cpu(cpu, cpu_online_mask))
break;
}
- return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
+
+ if (likely(cpu < nr_cpu_ids)) {
+ *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
+ return 0;
+ }
+
+ return -EINVAL;
}
static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -351,18 +355,19 @@ static struct apic __refdata apic_x2apic_uv_x = {
.name = "UV large system",
.probe = uv_probe,
.acpi_madt_oem_check = uv_acpi_madt_oem_check,
+ .apic_id_valid = uv_apic_id_valid,
.apic_id_registered = uv_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 0, /* physical */
- .target_cpus = uv_target_cpus,
+ .target_cpus = online_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
.check_apicid_present = NULL,
- .vector_allocation_domain = uv_vector_allocation_domain,
+ .vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = uv_init_apic_ldr,
.ioapic_phys_id_map = NULL,
@@ -380,7 +385,6 @@ static struct apic __refdata apic_x2apic_uv_x = {
.set_apic_id = set_apic_id,
.apic_id_mask = 0xFFFFFFFFu,
- .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
.cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
.send_IPI_mask = uv_send_IPI_mask,
@@ -392,19 +396,20 @@ static struct apic __refdata apic_x2apic_uv_x = {
.wakeup_secondary_cpu = uv_wakeup_secondary,
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
-static __cpuinit void set_x2apic_extra_bits(int pnode)
+static void set_x2apic_extra_bits(int pnode)
{
__this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift);
}
@@ -435,6 +440,20 @@ static __initdata struct redir_addr redir_addrs[] = {
{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR},
};
+static unsigned char get_n_lshift(int m_val)
+{
+ union uv3h_gr0_gam_gr_config_u m_gr_config;
+
+ if (is_uv1_hub())
+ return m_val;
+
+ if (is_uv2_hub())
+ return m_val == 40 ? 40 : 39;
+
+ m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG);
+ return m_gr_config.s3.m_skt;
+}
+
static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
{
union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias;
@@ -462,26 +481,67 @@ static __init void map_high(char *id, unsigned long base, int pshift,
paddr = base << pshift;
bytes = (1UL << bshift) * (max_pnode + 1);
- printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
- paddr + bytes);
+ if (!paddr) {
+ pr_info("UV: Map %s_HI base address NULL\n", id);
+ return;
+ }
+ pr_debug("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes);
if (map_type == map_uc)
init_extra_mapping_uc(paddr, bytes);
else
init_extra_mapping_wb(paddr, bytes);
+}
+static __init void map_gru_distributed(unsigned long c)
+{
+ union uvh_rh_gam_gru_overlay_config_mmr_u gru;
+ u64 paddr;
+ unsigned long bytes;
+ int nid;
+
+ gru.v = c;
+ /* only base bits 42:28 relevant in dist mode */
+ gru_dist_base = gru.v & 0x000007fff0000000UL;
+ if (!gru_dist_base) {
+ pr_info("UV: Map GRU_DIST base address NULL\n");
+ return;
+ }
+ bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1);
+ gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1);
+ gru_dist_base &= gru_dist_lmask; /* Clear bits above M */
+ for_each_online_node(nid) {
+ paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) |
+ gru_dist_base;
+ init_extra_mapping_wb(paddr, bytes);
+ gru_first_node_paddr = min(paddr, gru_first_node_paddr);
+ gru_last_node_paddr = max(paddr, gru_last_node_paddr);
+ }
+ /* Save upper (63:M) bits of address only for is_GRU_range */
+ gru_first_node_paddr &= gru_dist_umask;
+ gru_last_node_paddr &= gru_dist_umask;
+ pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n",
+ gru_dist_base, gru_first_node_paddr, gru_last_node_paddr);
}
+
static __init void map_gru_high(int max_pnode)
{
union uvh_rh_gam_gru_overlay_config_mmr_u gru;
int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
- if (gru.s.enable) {
- map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb);
- gru_start_paddr = ((u64)gru.s.base << shift);
- gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
+ if (!gru.s.enable) {
+ pr_info("UV: GRU disabled\n");
+ return;
+ }
+ if (is_uv3_hub() && gru.s3.mode) {
+ map_gru_distributed(gru.v);
+ return;
}
+ map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb);
+ gru_start_paddr = ((u64)gru.s.base << shift);
+ gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
}
static __init void map_mmr_high(int max_pnode)
@@ -492,23 +552,146 @@ static __init void map_mmr_high(int max_pnode)
mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
if (mmr.s.enable)
map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
+ else
+ pr_info("UV: MMR disabled\n");
+}
+
+/*
+ * This commonality works because both 0 & 1 versions of the MMIOH OVERLAY
+ * and REDIRECT MMR regs are exactly the same on UV3.
+ */
+struct mmioh_config {
+ unsigned long overlay;
+ unsigned long redirect;
+ char *id;
+};
+
+static __initdata struct mmioh_config mmiohs[] = {
+ {
+ UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR,
+ UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR,
+ "MMIOH0"
+ },
+ {
+ UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR,
+ UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR,
+ "MMIOH1"
+ },
+};
+
+static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
+{
+ union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay;
+ unsigned long mmr;
+ unsigned long base;
+ int i, n, shift, m_io, max_io;
+ int nasid, lnasid, fi, li;
+ char *id;
+
+ id = mmiohs[index].id;
+ overlay.v = uv_read_local_mmr(mmiohs[index].overlay);
+ pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n",
+ id, overlay.v, overlay.s3.base, overlay.s3.m_io);
+ if (!overlay.s3.enable) {
+ pr_info("UV: %s disabled\n", id);
+ return;
+ }
+
+ shift = UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT;
+ base = (unsigned long)overlay.s3.base;
+ m_io = overlay.s3.m_io;
+ mmr = mmiohs[index].redirect;
+ n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH;
+ min_pnode *= 2; /* convert to NASID */
+ max_pnode *= 2;
+ max_io = lnasid = fi = li = -1;
+
+ for (i = 0; i < n; i++) {
+ union uv3h_rh_gam_mmioh_redirect_config0_mmr_u redirect;
+
+ redirect.v = uv_read_local_mmr(mmr + i * 8);
+ nasid = redirect.s3.nasid;
+ if (nasid < min_pnode || max_pnode < nasid)
+ nasid = -1; /* invalid NASID */
+
+ if (nasid == lnasid) {
+ li = i;
+ if (i != n-1) /* last entry check */
+ continue;
+ }
+
+ /* check if we have a cached (or last) redirect to print */
+ if (lnasid != -1 || (i == n-1 && nasid != -1)) {
+ unsigned long addr1, addr2;
+ int f, l;
+
+ if (lnasid == -1) {
+ f = l = i;
+ lnasid = nasid;
+ } else {
+ f = fi;
+ l = li;
+ }
+ addr1 = (base << shift) +
+ f * (unsigned long)(1 << m_io);
+ addr2 = (base << shift) +
+ (l + 1) * (unsigned long)(1 << m_io);
+ pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n",
+ id, fi, li, lnasid, addr1, addr2);
+ if (max_io < l)
+ max_io = l;
+ }
+ fi = li = i;
+ lnasid = nasid;
+ }
+
+ pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n",
+ id, base, shift, m_io, max_io);
+
+ if (max_io >= 0)
+ map_high(id, base, shift, m_io, max_io, map_uc);
}
-static __init void map_mmioh_high(int max_pnode)
+static __init void map_mmioh_high(int min_pnode, int max_pnode)
{
union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
- int shift;
+ unsigned long mmr, base;
+ int shift, enable, m_io, n_io;
- mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
- if (is_uv1_hub() && mmioh.s1.enable) {
- shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
- map_high("MMIOH", mmioh.s1.base, shift, mmioh.s1.m_io,
- max_pnode, map_uc);
+ if (is_uv3_hub()) {
+ /* Map both MMIOH Regions */
+ map_mmioh_high_uv3(0, min_pnode, max_pnode);
+ map_mmioh_high_uv3(1, min_pnode, max_pnode);
+ return;
}
- if (is_uv2_hub() && mmioh.s2.enable) {
+
+ if (is_uv1_hub()) {
+ mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
+ shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ mmioh.v = uv_read_local_mmr(mmr);
+ enable = !!mmioh.s1.enable;
+ base = mmioh.s1.base;
+ m_io = mmioh.s1.m_io;
+ n_io = mmioh.s1.n_io;
+ } else if (is_uv2_hub()) {
+ mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
- map_high("MMIOH", mmioh.s2.base, shift, mmioh.s2.m_io,
- max_pnode, map_uc);
+ mmioh.v = uv_read_local_mmr(mmr);
+ enable = !!mmioh.s2.enable;
+ base = mmioh.s2.base;
+ m_io = mmioh.s2.m_io;
+ n_io = mmioh.s2.n_io;
+ } else
+ return;
+
+ if (enable) {
+ max_pnode &= (1 << n_io) - 1;
+ pr_info(
+ "UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n",
+ base, shift, m_io, n_io, max_pnode);
+ map_high("MMIOH", base, shift, m_io, max_pnode, map_uc);
+ } else {
+ pr_info("UV: MMIOH disabled\n");
}
}
@@ -559,7 +742,7 @@ static void uv_heartbeat(unsigned long ignored)
mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
}
-static void __cpuinit uv_heartbeat_enable(int cpu)
+static void uv_heartbeat_enable(int cpu)
{
while (!uv_cpu_hub_info(cpu)->scir.enabled) {
struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
@@ -576,7 +759,7 @@ static void __cpuinit uv_heartbeat_enable(int cpu)
}
#ifdef CONFIG_HOTPLUG_CPU
-static void __cpuinit uv_heartbeat_disable(int cpu)
+static void uv_heartbeat_disable(int cpu)
{
if (uv_cpu_hub_info(cpu)->scir.enabled) {
uv_cpu_hub_info(cpu)->scir.enabled = 0;
@@ -588,8 +771,8 @@ static void __cpuinit uv_heartbeat_disable(int cpu)
/*
* cpu hotplug notifier
*/
-static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action,
+ void *hcpu)
{
long cpu = (long)hcpu;
@@ -659,7 +842,7 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode,
* Called on each cpu to initialize the per_cpu UV data area.
* FIXME: hotplug not supported yet
*/
-void __cpuinit uv_cpu_init(void)
+void uv_cpu_init(void)
{
/* CPU 0 initilization will be done via uv_system_init. */
if (!uv_blade_info)
@@ -671,107 +854,47 @@ void __cpuinit uv_cpu_init(void)
set_x2apic_extra_bits(uv_hub_info->pnode);
}
-/*
- * When NMI is received, print a stack trace.
- */
-int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
-{
- unsigned long real_uv_nmi;
- int bid;
-
- /*
- * Each blade has an MMR that indicates when an NMI has been sent
- * to cpus on the blade. If an NMI is detected, atomically
- * clear the MMR and update a per-blade NMI count used to
- * cause each cpu on the blade to notice a new NMI.
- */
- bid = uv_numa_blade_id();
- real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
-
- if (unlikely(real_uv_nmi)) {
- spin_lock(&uv_blade_info[bid].nmi_lock);
- real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
- if (real_uv_nmi) {
- uv_blade_info[bid].nmi_count++;
- uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK);
- }
- spin_unlock(&uv_blade_info[bid].nmi_lock);
- }
-
- if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
- return NMI_DONE;
-
- __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
-
- /*
- * Use a lock so only one cpu prints at a time.
- * This prevents intermixed output.
- */
- spin_lock(&uv_nmi_lock);
- pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id());
- dump_stack();
- spin_unlock(&uv_nmi_lock);
-
- return NMI_HANDLED;
-}
-
-void uv_register_nmi_notifier(void)
-{
- if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv"))
- printk(KERN_WARNING "UV NMI handler failed to register\n");
-}
-
-void uv_nmi_init(void)
-{
- unsigned int value;
-
- /*
- * Unmask NMI on all cpus
- */
- value = apic_read(APIC_LVT1) | APIC_DM_NMI;
- value &= ~APIC_LVT_MASKED;
- apic_write(APIC_LVT1, value);
-}
-
void __init uv_system_init(void)
{
union uvh_rh_gam_config_mmr_u m_n_config;
- union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
union uvh_node_id_u node_id;
unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
- int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io;
- int gnode_extra, max_pnode = 0;
+ int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
+ int gnode_extra, min_pnode = 999999, max_pnode = -1;
unsigned long mmr_base, present, paddr;
- unsigned short pnode_mask, pnode_io_mask;
+ unsigned short pnode_mask;
+ unsigned char n_lshift;
+ char *hub = (is_uv1_hub() ? "UV1" :
+ (is_uv2_hub() ? "UV2" :
+ "UV3"));
- printk(KERN_INFO "UV: Found %s hub\n", is_uv1_hub() ? "UV1" : "UV2");
+ pr_info("UV: Found %s hub\n", hub);
map_low_mmrs();
m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
m_val = m_n_config.s.m_skt;
n_val = m_n_config.s.n_skt;
- mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
- n_io = is_uv1_hub() ? mmioh.s1.n_io : mmioh.s2.n_io;
+ pnode_mask = (1 << n_val) - 1;
+ n_lshift = get_n_lshift(m_val);
mmr_base =
uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
~UV_MMR_ENABLE;
- pnode_mask = (1 << n_val) - 1;
- pnode_io_mask = (1 << n_io) - 1;
node_id.v = uv_read_local_mmr(UVH_NODE_ID);
gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
gnode_upper = ((unsigned long)gnode_extra << m_val);
- printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n",
- n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask);
+ pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x n_lshift 0x%x\n",
+ n_val, m_val, pnode_mask, gnode_upper, gnode_extra,
+ n_lshift);
- printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
+ pr_info("UV: global MMR base 0x%lx\n", mmr_base);
for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
uv_possible_blades +=
hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
/* uv_num_possible_blades() is really the hub count */
- printk(KERN_INFO "UV: Found %d blades, %d hubs\n",
+ pr_info("UV: Found %d blades, %d hubs\n",
is_uv1_hub() ? uv_num_possible_blades() :
(uv_num_possible_blades() + 1) / 2,
uv_num_possible_blades());
@@ -806,6 +929,7 @@ void __init uv_system_init(void)
uv_blade_info[blade].nr_possible_cpus = 0;
uv_blade_info[blade].nr_online_cpus = 0;
spin_lock_init(&uv_blade_info[blade].nmi_lock);
+ min_pnode = min(pnode, min_pnode);
max_pnode = max(pnode, max_pnode);
blade++;
}
@@ -828,8 +952,7 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision;
uv_cpu_hub_info(cpu)->m_shift = 64 - m_val;
- uv_cpu_hub_info(cpu)->n_lshift = is_uv2_1_hub() ?
- (m_val == 40 ? 40 : 39) : m_val;
+ uv_cpu_hub_info(cpu)->n_lshift = n_lshift;
pnode = uv_apicid_to_pnode(apicid);
blade = boot_pnode_to_blade(pnode);
@@ -868,11 +991,11 @@ void __init uv_system_init(void)
map_gru_high(max_pnode);
map_mmr_high(max_pnode);
- map_mmioh_high(max_pnode & pnode_io_mask);
+ map_mmioh_high(min_pnode, max_pnode);
+ uv_nmi_setup();
uv_cpu_init();
uv_scir_register_cpu_notifier();
- uv_register_nmi_notifier();
proc_mkdir("sgi_uv", NULL);
/* register Legacy VGA I/O redirection handler */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index f76623cbe26..58487445141 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -201,6 +201,8 @@
* http://www.microsoft.com/whdc/archive/amp_12.mspx]
*/
+#define pr_fmt(fmt) "apm: " fmt
+
#include <linux/module.h>
#include <linux/poll.h>
@@ -230,8 +232,8 @@
#include <linux/acpi.h>
#include <linux/syscore_ops.h>
#include <linux/i8253.h>
+#include <linux/cpuidle.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
#include <asm/olpc.h>
@@ -359,24 +361,44 @@ struct apm_user {
* idle percentage above which bios idle calls are done
*/
#ifdef CONFIG_APM_CPU_IDLE
-#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012
#define DEFAULT_IDLE_THRESHOLD 95
#else
#define DEFAULT_IDLE_THRESHOLD 100
#endif
#define DEFAULT_IDLE_PERIOD (100 / 3)
+static int apm_cpu_idle(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index);
+
+static struct cpuidle_driver apm_idle_driver = {
+ .name = "apm_idle",
+ .owner = THIS_MODULE,
+ .states = {
+ { /* entry 0 is for polling */ },
+ { /* entry 1 is for APM idle */
+ .name = "APM",
+ .desc = "APM idle",
+ .flags = CPUIDLE_FLAG_TIME_VALID,
+ .exit_latency = 250, /* WAG */
+ .target_residency = 500, /* WAG */
+ .enter = &apm_cpu_idle
+ },
+ },
+ .state_count = 2,
+};
+
+static struct cpuidle_device apm_cpuidle_device;
+
/*
* Local variables
*/
-static struct {
+__visible struct {
unsigned long offset;
unsigned short segment;
} apm_bios_entry;
static int clock_slowed;
static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
-static int set_pm_idle;
static int suspends_pending;
static int standbys_pending;
static int ignore_sys_suspend;
@@ -486,11 +508,11 @@ static void apm_error(char *str, int err)
if (error_table[i].key == err)
break;
if (i < ERROR_COUNT)
- printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
+ pr_notice("%s: %s\n", str, error_table[i].msg);
else if (err < 0)
- printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err);
+ pr_notice("%s: linux error code %i\n", str, err);
else
- printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
+ pr_notice("%s: unknown error code %#2.2x\n",
str, err);
}
@@ -819,24 +841,12 @@ static int apm_do_idle(void)
u32 eax;
u8 ret = 0;
int idled = 0;
- int polling;
int err = 0;
- polling = !!(current_thread_info()->status & TS_POLLING);
- if (polling) {
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
- }
if (!need_resched()) {
idled = 1;
ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
}
- if (polling)
- current_thread_info()->status |= TS_POLLING;
if (!idled)
return 0;
@@ -883,8 +893,6 @@ static void apm_do_busy(void)
#define IDLE_CALC_LIMIT (HZ * 100)
#define IDLE_LEAKY_MAX 16
-static void (*original_pm_idle)(void) __read_mostly;
-
/**
* apm_cpu_idle - cpu idling for APM capable Linux
*
@@ -893,35 +901,36 @@ static void (*original_pm_idle)(void) __read_mostly;
* Furthermore it calls the system default idle routine.
*/
-static void apm_cpu_idle(void)
+static int apm_cpu_idle(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index)
{
static int use_apm_idle; /* = 0 */
static unsigned int last_jiffies; /* = 0 */
static unsigned int last_stime; /* = 0 */
+ cputime_t stime;
int apm_idle_done = 0;
unsigned int jiffies_since_last_check = jiffies - last_jiffies;
unsigned int bucket;
- WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012");
recalc:
+ task_cputime(current, NULL, &stime);
if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
use_apm_idle = 0;
- last_jiffies = jiffies;
- last_stime = current->stime;
} else if (jiffies_since_last_check > idle_period) {
unsigned int idle_percentage;
- idle_percentage = current->stime - last_stime;
+ idle_percentage = stime - last_stime;
idle_percentage *= 100;
idle_percentage /= jiffies_since_last_check;
use_apm_idle = (idle_percentage > idle_threshold);
if (apm_info.forbid_idle)
use_apm_idle = 0;
- last_jiffies = jiffies;
- last_stime = current->stime;
}
+ last_jiffies = jiffies;
+ last_stime = stime;
+
bucket = IDLE_LEAKY_MAX;
while (!need_resched()) {
@@ -949,10 +958,7 @@ recalc:
break;
}
}
- if (original_pm_idle)
- original_pm_idle();
- else
- default_idle();
+ default_idle();
local_irq_disable();
jiffies_since_last_check = jiffies - last_jiffies;
if (jiffies_since_last_check > idle_period)
@@ -962,7 +968,7 @@ recalc:
if (apm_idle_done)
apm_do_busy();
- local_irq_enable();
+ return index;
}
/**
@@ -1185,7 +1191,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
static int notified;
if (notified++ == 0)
- printk(KERN_ERR "apm: an event queue overflowed\n");
+ pr_err("an event queue overflowed\n");
if (++as->event_tail >= APM_MAX_EVENTS)
as->event_tail = 0;
}
@@ -1234,8 +1240,7 @@ static int suspend(int vetoable)
struct apm_user *as;
dpm_suspend_start(PMSG_SUSPEND);
-
- dpm_suspend_noirq(PMSG_SUSPEND);
+ dpm_suspend_end(PMSG_SUSPEND);
local_irq_disable();
syscore_suspend();
@@ -1259,9 +1264,9 @@ static int suspend(int vetoable)
syscore_resume();
local_irq_enable();
- dpm_resume_noirq(PMSG_RESUME);
-
+ dpm_resume_start(PMSG_RESUME);
dpm_resume_end(PMSG_RESUME);
+
queue_event(APM_NORMAL_RESUME, NULL);
spin_lock(&user_list_lock);
for (as = user_list; as != NULL; as = as->next) {
@@ -1277,7 +1282,7 @@ static void standby(void)
{
int err;
- dpm_suspend_noirq(PMSG_SUSPEND);
+ dpm_suspend_end(PMSG_SUSPEND);
local_irq_disable();
syscore_suspend();
@@ -1291,7 +1296,7 @@ static void standby(void)
syscore_resume();
local_irq_enable();
- dpm_resume_noirq(PMSG_RESUME);
+ dpm_resume_start(PMSG_RESUME);
}
static apm_event_t get_event(void)
@@ -1449,7 +1454,7 @@ static void apm_mainloop(void)
static int check_apm_user(struct apm_user *as, const char *func)
{
if (as == NULL || as->magic != APM_BIOS_MAGIC) {
- printk(KERN_ERR "apm: %s passed bad filp\n", func);
+ pr_err("%s passed bad filp\n", func);
return 1;
}
return 0;
@@ -1588,7 +1593,7 @@ static int do_release(struct inode *inode, struct file *filp)
as1 = as1->next)
;
if (as1 == NULL)
- printk(KERN_ERR "apm: filp not in user list\n");
+ pr_err("filp not in user list\n");
else
as1->next = as->next;
}
@@ -1602,11 +1607,9 @@ static int do_open(struct inode *inode, struct file *filp)
struct apm_user *as;
as = kmalloc(sizeof(*as), GFP_KERNEL);
- if (as == NULL) {
- printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
- sizeof(*as));
+ if (as == NULL)
return -ENOMEM;
- }
+
as->magic = APM_BIOS_MAGIC;
as->event_tail = as->event_head = 0;
as->suspends_pending = as->standbys_pending = 0;
@@ -2315,16 +2318,16 @@ static int __init apm_init(void)
}
if (apm_info.disabled) {
- printk(KERN_NOTICE "apm: disabled on user request.\n");
+ pr_notice("disabled on user request.\n");
return -ENODEV;
}
if ((num_online_cpus() > 1) && !power_off && !smp) {
- printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n");
+ pr_notice("disabled - APM is not SMP safe.\n");
apm_info.disabled = 1;
return -ENODEV;
}
if (!acpi_disabled) {
- printk(KERN_NOTICE "apm: overridden by ACPI.\n");
+ pr_notice("overridden by ACPI.\n");
apm_info.disabled = 1;
return -ENODEV;
}
@@ -2358,8 +2361,7 @@ static int __init apm_init(void)
kapmd_task = kthread_create(apm, NULL, "kapmd");
if (IS_ERR(kapmd_task)) {
- printk(KERN_ERR "apm: disabled - Unable to start kernel "
- "thread.\n");
+ pr_err("disabled - Unable to start kernel thread\n");
err = PTR_ERR(kapmd_task);
kapmd_task = NULL;
remove_proc_entry("apm", NULL);
@@ -2384,9 +2386,9 @@ static int __init apm_init(void)
if (HZ != 100)
idle_period = (idle_period * HZ) / 100;
if (idle_threshold < 100) {
- original_pm_idle = pm_idle;
- pm_idle = apm_cpu_idle;
- set_pm_idle = 1;
+ if (!cpuidle_register_driver(&apm_idle_driver))
+ if (cpuidle_register_device(&apm_cpuidle_device))
+ cpuidle_unregister_driver(&apm_idle_driver);
}
return 0;
@@ -2396,15 +2398,9 @@ static void __exit apm_exit(void)
{
int error;
- if (set_pm_idle) {
- pm_idle = original_pm_idle;
- /*
- * We are about to unload the current idle thread pm callback
- * (pm_idle), Wait for all processors to update cached/local
- * copies of pm_idle before proceeding.
- */
- cpu_idle_wait();
- }
+ cpuidle_unregister_device(&apm_cpuidle_device);
+ cpuidle_unregister_driver(&apm_idle_driver);
+
if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
&& (apm_info.connection_version > 0x0100)) {
error = apm_engage_power_management(APM_DEVICE_ALL, 0);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 68de2dc962e..9f6b9341950 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -32,7 +32,6 @@ void common(void) {
OFFSET(TI_flags, thread_info, flags);
OFFSET(TI_status, thread_info, status);
OFFSET(TI_addr_limit, thread_info, addr_limit);
- OFFSET(TI_preempt_count, thread_info, preempt_count);
BLANK();
OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
@@ -69,4 +68,7 @@ void common(void) {
OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
OFFSET(BP_pref_address, boot_params, hdr.pref_address);
OFFSET(BP_code32_start, boot_params, hdr.code32_start);
+
+ BLANK();
+ DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 85d98ab15cd..d67c4be3e8b 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -28,7 +28,6 @@ void foo(void)
OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
- OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math);
OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
@@ -60,6 +59,9 @@ void foo(void)
OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
BLANK();
+ OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
+ BLANK();
+
/* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
sizeof(struct tss_struct));
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 834e897b1e2..e7c798b354f 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,6 +1,12 @@
#include <asm/ia32.h>
#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
+#ifdef CONFIG_X86_X32_ABI
+# define __SYSCALL_X32(nr, sym, compat) [nr] = 1,
+#else
+# define __SYSCALL_X32(nr, sym, compat) /* nothing */
+#endif
static char syscalls_64[] = {
#include <asm/syscalls_64.h>
};
@@ -67,6 +73,7 @@ int main(void)
ENTRY(cr3);
ENTRY(cr4);
ENTRY(cr8);
+ ENTRY(gdt_desc);
BLANK();
#undef ENTRY
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 5da1269e8dd..83a7995625a 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -27,21 +27,29 @@ static int num_scan_areas;
static __init int set_corruption_check(char *arg)
{
- char *end;
+ ssize_t ret;
+ unsigned long val;
- memory_corruption_check = simple_strtol(arg, &end, 10);
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
- return (*end == 0) ? 0 : -EINVAL;
+ memory_corruption_check = val;
+ return 0;
}
early_param("memory_corruption_check", set_corruption_check);
static __init int set_corruption_check_period(char *arg)
{
- char *end;
+ ssize_t ret;
+ unsigned long val;
- corruption_check_period = simple_strtoul(arg, &end, 10);
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
- return (*end == 0) ? 0 : -EINVAL;
+ corruption_check_period = val;
+ return 0;
}
early_param("memory_corruption_check_period", set_corruption_check_period);
@@ -83,7 +91,7 @@ void __init setup_bios_corruption_check(void)
corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
- for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
PAGE_SIZE, corruption_check_size);
end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 25f24dccdcf..7fd54f09b01 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,8 +14,8 @@ CFLAGS_common.o := $(nostackp)
obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += proc.o capflags.o powerflags.o common.o
-obj-y += vmware.o hypervisor.o sched.o mshyperv.o
obj-y += rdrand.o
+obj-y += match.o
obj-$(CONFIG_X86_32) += bugs.o
obj-$(CONFIG_X86_64) += bugs_64.o
@@ -30,20 +30,29 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
ifdef CONFIG_PERF_EVENTS
-obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
+obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o
+ifdef CONFIG_AMD_IOMMU
+obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
endif
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o
+endif
+
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
+obj-$(CONFIG_MICROCODE) += microcode/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o
+obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
+
quiet_cmd_mkcapflags = MKCAP $@
- cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
+ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
cpufeature = $(src)/../../include/asm/cpufeature.h
targets += capflags.c
-$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE
+$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f4773f4aae3..ce8b8ff0e0e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,23 +1,55 @@
#include <linux/export.h>
-#include <linux/init.h>
#include <linux/bitops.h>
#include <linux/elf.h>
#include <linux/mm.h>
#include <linux/io.h>
+#include <linux/sched.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
#include <asm/pci-direct.h>
#ifdef CONFIG_X86_64
-# include <asm/numa_64.h>
# include <asm/mmconfig.h>
# include <asm/cacheflush.h>
#endif
#include "cpu.h"
+static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
+{
+ u32 gprs[8] = { 0 };
+ int err;
+
+ WARN_ONCE((boot_cpu_data.x86 != 0xf),
+ "%s should only be used on K8!\n", __func__);
+
+ gprs[1] = msr;
+ gprs[7] = 0x9c5a203a;
+
+ err = rdmsr_safe_regs(gprs);
+
+ *p = gprs[0] | ((u64)gprs[2] << 32);
+
+ return err;
+}
+
+static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
+{
+ u32 gprs[8] = { 0 };
+
+ WARN_ONCE((boot_cpu_data.x86 != 0xf),
+ "%s should only be used on K8!\n", __func__);
+
+ gprs[0] = (u32)val;
+ gprs[1] = msr;
+ gprs[2] = val >> 32;
+ gprs[7] = 0x9c5a203a;
+
+ return wrmsr_safe_regs(gprs);
+}
+
#ifdef CONFIG_X86_32
/*
* B step AMD K6 before B 9730xxxx have hardware bugs that can cause
@@ -25,17 +57,18 @@
* contact AMD for precise details and a CPU swap.
*
* See http://www.multimania.com/poulot/k6bug.html
- * http://www.amd.com/K6/k6docs/revgd.html
+ * and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6"
+ * (Publication # 21266 Issue Date: August 1998)
*
* The following test is erm.. interesting. AMD neglected to up
* the chip setting when fixing the bug but they also tweaked some
* performance at the same time..
*/
-extern void vide(void);
-__asm__(".align 4\nvide: ret");
+extern __visible void vide(void);
+__asm__(".globl vide\n\t.align 4\nvide: ret");
-static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
+static void init_amd_k5(struct cpuinfo_x86 *c)
{
/*
* General Systems BIOSen alias the cpu frequency registers
@@ -53,10 +86,10 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
}
-static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
+static void init_amd_k6(struct cpuinfo_x86 *c)
{
u32 l, h;
- int mbytes = num_physpages >> (20-PAGE_SHIFT);
+ int mbytes = get_num_physpages() >> (20-PAGE_SHIFT);
if (c->x86_model < 6) {
/* Based on AMD doc 20734R - June 2000 */
@@ -93,7 +126,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
"system stability may be impaired when more than 32 MB are used.\n");
else
printk(KERN_CONT "probably OK (after B9730xxxx).\n");
- printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
}
/* K6 with old style WHCR */
@@ -146,7 +178,7 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
}
}
-static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+static void amd_k7_smp_check(struct cpuinfo_x86 *c)
{
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
@@ -159,11 +191,11 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
/* Athlon 660/661 is valid. */
if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
(c->x86_mask == 1)))
- goto valid_k7;
+ return;
/* Duron 670 is valid */
if ((c->x86_model == 7) && (c->x86_mask == 0))
- goto valid_k7;
+ return;
/*
* Athlon 662, Duron 671, and Athlon >model 7 have capability
@@ -176,7 +208,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
((c->x86_model == 7) && (c->x86_mask >= 1)) ||
(c->x86_model > 7))
if (cpu_has_mp)
- goto valid_k7;
+ return;
/* If we get here, not a certified SMP capable AMD system. */
@@ -186,14 +218,10 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
*/
WARN_ONCE(1, "WARNING: This combination of AMD"
" processors is not suitable for SMP.\n");
- if (!test_taint(TAINT_UNSAFE_SMP))
- add_taint(TAINT_UNSAFE_SMP);
-
-valid_k7:
- ;
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
}
-static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
+static void init_amd_k7(struct cpuinfo_x86 *c)
{
u32 l, h;
@@ -205,9 +233,7 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
if (c->x86_model >= 6 && c->x86_model <= 10) {
if (!cpu_has(c, X86_FEATURE_XMM)) {
printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
- rdmsr(MSR_K7_HWCR, l, h);
- l &= ~0x00008000;
- wrmsr(MSR_K7_HWCR, l, h);
+ msr_clear_bit(MSR_K7_HWCR, 15);
set_cpu_cap(c, X86_FEATURE_XMM);
}
}
@@ -238,7 +264,7 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
* To workaround broken NUMA config. Read the comment in
* srat_detect_node().
*/
-static int __cpuinit nearby_node(int apicid)
+static int nearby_node(int apicid)
{
int i, node;
@@ -263,14 +289,14 @@ static int __cpuinit nearby_node(int apicid)
* (2) AMD processors supporting compute units
*/
#ifdef CONFIG_X86_HT
-static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
+static void amd_get_topology(struct cpuinfo_x86 *c)
{
u32 nodes, cores_per_cu = 1;
u8 node_id;
int cpu = smp_processor_id();
/* get information required for multi-node processors */
- if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+ if (cpu_has_topoext) {
u32 eax, ebx, ecx, edx;
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
@@ -310,10 +336,10 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
#endif
/*
- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
+ * On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
* Assumes number of cores is a power of two.
*/
-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
+static void amd_detect_cmp(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_HT
unsigned bits;
@@ -330,9 +356,9 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
#endif
}
-int amd_get_nb_id(int cpu)
+u16 amd_get_nb_id(int cpu)
{
- int id = 0;
+ u16 id = 0;
#ifdef CONFIG_SMP
id = per_cpu(cpu_llc_id, cpu);
#endif
@@ -340,7 +366,7 @@ int amd_get_nb_id(int cpu)
}
EXPORT_SYMBOL_GPL(amd_get_nb_id);
-static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
+static void srat_detect_node(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_NUMA
int cpu = smp_processor_id();
@@ -352,10 +378,11 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
node = per_cpu(cpu_llc_id, cpu);
/*
- * If core numbers are inconsistent, it's likely a multi-fabric platform,
- * so invoke platform-specific handler
+ * On multi-fabric platform (e.g. Numascale NumaChip) a
+ * platform-specific handler needs to be called to fixup some
+ * IDs of the CPU.
*/
- if (c->phys_proc_id != node)
+ if (x86_cpuinit.fixup_cpu_id)
x86_cpuinit.fixup_cpu_id(c, node);
if (!node_online(node)) {
@@ -391,7 +418,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
+static void early_init_amd_mc(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_HT
unsigned bits, ecx;
@@ -417,7 +444,7 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
#endif
}
-static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c)
+static void bsp_init_amd(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
@@ -445,7 +472,7 @@ static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c)
}
}
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+static void early_init_amd(struct cpuinfo_x86 *c)
{
early_init_amd_mc(c);
@@ -456,6 +483,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ if (!check_tsc_unstable())
+ set_sched_clock_stable();
}
#ifdef CONFIG_X86_64
@@ -476,15 +505,22 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
}
#endif
+
+ /* F16h erratum 793, CVE-2013-6885 */
+ if (c->x86 == 0x16 && c->x86_model <= 0xf)
+ msr_set_bit(MSR_AMD64_LS_CFG, 15);
}
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
+static const int amd_erratum_383[];
+static const int amd_erratum_400[];
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
+
+static void init_amd(struct cpuinfo_x86 *c)
{
u32 dummy;
-
-#ifdef CONFIG_SMP
unsigned long long value;
+#ifdef CONFIG_SMP
/*
* Disable TLB flush filter by setting HWCR.FFDIS on K8
* bit 6 of msr C001_0015
@@ -492,11 +528,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
* Errata 63 for SH-B3 steppings
* Errata 122 for all steppings (F+ have it disabled by default)
*/
- if (c->x86 == 0xf) {
- rdmsrl(MSR_K7_HWCR, value);
- value |= 1 << 6;
- wrmsrl(MSR_K7_HWCR, value);
- }
+ if (c->x86 == 0xf)
+ msr_set_bit(MSR_K7_HWCR, 6);
#endif
early_init_amd(c);
@@ -522,12 +555,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
* (AMD Erratum #110, docId: 25759).
*/
if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
- u64 val;
-
clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
- if (!rdmsrl_amd_safe(0xc001100d, &val)) {
- val &= ~(1ULL << 32);
- wrmsrl_amd_safe(0xc001100d, val);
+ if (!rdmsrl_amd_safe(0xc001100d, &value)) {
+ value &= ~(1ULL << 32);
+ wrmsrl_amd_safe(0xc001100d, value);
}
}
@@ -576,6 +607,33 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
}
}
+ /* re-enable TopologyExtensions if switched off by BIOS */
+ if ((c->x86 == 0x15) &&
+ (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
+ !cpu_has(c, X86_FEATURE_TOPOEXT)) {
+
+ if (msr_set_bit(0xc0011005, 54) > 0) {
+ rdmsrl(0xc0011005, value);
+ if (value & BIT_64(54)) {
+ set_cpu_cap(c, X86_FEATURE_TOPOEXT);
+ pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
+ }
+ }
+ }
+
+ /*
+ * The way access filter has a performance penalty on some workloads.
+ * Disable it on the affected CPUs.
+ */
+ if ((c->x86 == 0x15) &&
+ (c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
+
+ if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) {
+ value |= 0x1E;
+ wrmsrl_safe(0xc0011021, value);
+ }
+ }
+
cpu_detect_cache_sizes(c);
/* Multi core CPU? */
@@ -588,12 +646,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
detect_ht(c);
#endif
- if (c->extended_cpuid_level >= 0x80000006) {
- if (cpuid_edx(0x80000006) & 0xf000)
- num_cache_leaves = 4;
- else
- num_cache_leaves = 3;
- }
+ init_amd_cacheinfo(c);
if (c->x86 >= 0xf)
set_cpu_cap(c, X86_FEATURE_K8);
@@ -621,12 +674,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
* benefit in doing so.
*/
if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+ unsigned long pfn = tseg >> PAGE_SHIFT;
+
printk(KERN_DEBUG "tseg: %010llx\n", tseg);
- if ((tseg>>PMD_SHIFT) <
- (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
- ((tseg>>PMD_SHIFT) <
- (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
- (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
+ if (pfn_range_is_mapped(pfn, pfn + 1))
set_memory_4k((unsigned long)__va(tseg), 1);
}
}
@@ -639,34 +690,42 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
- /*
- * Disable GART TLB Walk Errors on Fam10h. We do this here
- * because this is always needed when GART is enabled, even in a
- * kernel which has no MCE support built in.
- */
if (c->x86 == 0x10) {
/*
- * BIOS should disable GartTlbWlk Errors themself. If
- * it doesn't do it here as suggested by the BKDG.
+ * Disable GART TLB Walk Errors on Fam10h. We do this here
+ * because this is always needed when GART is enabled, even in a
+ * kernel which has no MCE support built in.
+ * BIOS should disable GartTlbWlk Errors already. If
+ * it doesn't, do it here as suggested by the BKDG.
*
* Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
*/
- u64 mask;
- int err;
+ msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
- err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
- if (err == 0) {
- mask |= (1 << 10);
- checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
- }
+ /*
+ * On family 10h BIOS may not have properly enabled WC+ support,
+ * causing it to be converted to CD memtype. This may result in
+ * performance degradation for certain nested-paging guests.
+ * Prevent this conversion by clearing bit 24 in
+ * MSR_AMD64_BU_CFG2.
+ *
+ * NOTE: we want to use the _safe accessors so as not to #GP kvm
+ * guests on older kvm hosts.
+ */
+ msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
+
+ if (cpu_has_amd_erratum(c, amd_erratum_383))
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
}
+ if (cpu_has_amd_erratum(c, amd_erratum_400))
+ set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
+
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
}
#ifdef CONFIG_X86_32
-static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c,
- unsigned int size)
+static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/* AMD errata T13 (order #21922) */
if ((c->x86 == 6)) {
@@ -682,12 +741,68 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c,
}
#endif
-static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
+static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
+{
+ tlb_flushall_shift = 6;
+}
+
+static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
+{
+ u32 ebx, eax, ecx, edx;
+ u16 mask = 0xfff;
+
+ if (c->x86 < 0xf)
+ return;
+
+ if (c->extended_cpuid_level < 0x80000006)
+ return;
+
+ cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+
+ tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
+ tlb_lli_4k[ENTRIES] = ebx & mask;
+
+ /*
+ * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
+ * characteristics from the CPUID function 0x80000005 instead.
+ */
+ if (c->x86 == 0xf) {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ mask = 0xff;
+ }
+
+ /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!((eax >> 16) & mask))
+ tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ else
+ tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+
+ /* a 4M entry uses two 2M entries */
+ tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+
+ /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!(eax & mask)) {
+ /* Erratum 658 */
+ if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
+ tlb_lli_2m[ENTRIES] = 1024;
+ } else {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ tlb_lli_2m[ENTRIES] = eax & 0xff;
+ }
+ } else
+ tlb_lli_2m[ENTRIES] = eax & mask;
+
+ tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+
+ cpu_set_tlb_flushall_shift(c);
+}
+
+static const struct cpu_dev amd_cpu_dev = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
#ifdef CONFIG_X86_32
- .c_models = {
- { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[3] = "486 DX/2",
[7] = "486 DX/2-WB",
@@ -698,9 +813,10 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
}
},
},
- .c_size_cache = amd_size_cache,
+ .legacy_cache_size = amd_size_cache,
#endif
.c_early_init = early_init_amd,
+ .c_detect_tlb = cpu_detect_tlb_amd,
.c_bsp_init = bsp_init_amd,
.c_init = init_amd,
.c_x86_vendor = X86_VENDOR_AMD,
@@ -715,8 +831,7 @@ cpu_dev_register(amd_cpu_dev);
* AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
* have an OSVW id assigned, which it takes as first argument. Both take a
* variable number of family-specific model-stepping ranges created by
- * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const
- * int[] in arch/x86/include/asm/processor.h.
+ * AMD_MODEL_RANGE().
*
* Example:
*
@@ -726,32 +841,28 @@ cpu_dev_register(amd_cpu_dev);
* AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
*/
-const int amd_erratum_400[] =
+#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
+#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
+#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
+ ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
+#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
+#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
+#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
+
+static const int amd_erratum_400[] =
AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
-EXPORT_SYMBOL_GPL(amd_erratum_400);
-const int amd_erratum_383[] =
+static const int amd_erratum_383[] =
AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
-EXPORT_SYMBOL_GPL(amd_erratum_383);
-bool cpu_has_amd_erratum(const int *erratum)
+
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
{
- struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info);
int osvw_id = *erratum++;
u32 range;
u32 ms;
- /*
- * If called early enough that current_cpu_data hasn't been initialized
- * yet, fall back to boot_cpu_data.
- */
- if (cpu->x86 == 0)
- cpu = &boot_cpu_data;
-
- if (cpu->x86_vendor != X86_VENDOR_AMD)
- return false;
-
if (osvw_id >= 0 && osvw_id < 65536 &&
cpu_has(cpu, X86_FEATURE_OSVW)) {
u64 osvw_len;
@@ -776,5 +887,3 @@ bool cpu_has_amd_erratum(const int *erratum)
return false;
}
-
-EXPORT_SYMBOL_GPL(cpu_has_amd_erratum);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 46674fbb62b..03445346ee0 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -17,24 +17,6 @@
#include <asm/paravirt.h>
#include <asm/alternative.h>
-static int __init no_halt(char *s)
-{
- WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n");
- boot_cpu_data.hlt_works_ok = 0;
- return 1;
-}
-
-__setup("no-hlt", no_halt);
-
-static int __init no_387(char *s)
-{
- boot_cpu_data.hard_math = 0;
- write_cr0(X86_CR0_TS | X86_CR0_EM | X86_CR0_MP | read_cr0());
- return 1;
-}
-
-__setup("no387", no_387);
-
static double __initdata x = 4195835.0;
static double __initdata y = 3145727.0;
@@ -53,22 +35,13 @@ static void __init check_fpu(void)
{
s32 fdiv_bug;
- if (!boot_cpu_data.hard_math) {
-#ifndef CONFIG_MATH_EMULATION
- printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
- printk(KERN_EMERG "Giving up.\n");
- for (;;) ;
-#endif
- return;
- }
-
kernel_fpu_begin();
/*
* trap_init() enabled FXSR and company _before_ testing for FP
* problems here.
*
- * Test for the divl bug..
+ * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug
*/
__asm__("fninit\n\t"
"fldl %1\n\t"
@@ -84,91 +57,38 @@ static void __init check_fpu(void)
kernel_fpu_end();
- boot_cpu_data.fdiv_bug = fdiv_bug;
- if (boot_cpu_data.fdiv_bug)
- printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
-}
-
-static void __init check_hlt(void)
-{
- if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
- return;
-
- printk(KERN_INFO "Checking 'hlt' instruction... ");
- if (!boot_cpu_data.hlt_works_ok) {
- printk("disabled\n");
- return;
+ if (fdiv_bug) {
+ set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
+ pr_warn("Hmm, FPU with FDIV bug\n");
}
- halt();
- halt();
- halt();
- halt();
- printk(KERN_CONT "OK.\n");
-}
-
-/*
- * Most 386 processors have a bug where a POPAD can lock the
- * machine even from user space.
- */
-
-static void __init check_popad(void)
-{
-#ifndef CONFIG_X86_POPAD_OK
- int res, inp = (int) &res;
-
- printk(KERN_INFO "Checking for popad bug... ");
- __asm__ __volatile__(
- "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
- : "=&a" (res)
- : "d" (inp)
- : "ecx", "edi");
- /*
- * If this fails, it means that any user program may lock the
- * CPU hard. Too bad.
- */
- if (res != 12345678)
- printk(KERN_CONT "Buggy.\n");
- else
- printk(KERN_CONT "OK.\n");
-#endif
}
-/*
- * Check whether we are able to run this kernel safely on SMP.
- *
- * - In order to run on a i386, we need to be compiled for i386
- * (for due to lack of "invlpg" and working WP on a i386)
- * - In order to run on anything without a TSC, we need to be
- * compiled for a i486.
- */
-
-static void __init check_config(void)
-{
-/*
- * We'd better not be a i386 if we're configured to use some
- * i486+ only features! (WP works in supervisor mode and the
- * new "invlpg" and "bswap" instructions)
- */
-#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \
- defined(CONFIG_X86_BSWAP)
- if (boot_cpu_data.x86 == 3)
- panic("Kernel requires i486+ for 'invlpg' and other features");
-#endif
-}
-
-
void __init check_bugs(void)
{
identify_boot_cpu();
#ifndef CONFIG_SMP
- printk(KERN_INFO "CPU: ");
+ pr_info("CPU: ");
print_cpu_info(&boot_cpu_data);
#endif
- check_config();
- check_fpu();
- check_hlt();
- check_popad();
+
+ /*
+ * Check whether we are able to run this kernel safely on SMP.
+ *
+ * - i386 is no longer supported.
+ * - In order to run on anything without a TSC, we need to be
+ * compiled for a i486.
+ */
+ if (boot_cpu_data.x86 < 4)
+ panic("Kernel requires i486+ for 'invlpg' and other features");
+
init_utsname()->machine[1] =
'0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
alternative_instructions();
+
+ /*
+ * kernel_fpu_begin/end() in check_fpu() relies on the patched
+ * alternative instructions.
+ */
+ if (cpu_has_fpu)
+ check_fpu();
}
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 159103c0b1f..d8fba5c15fb 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,6 +1,5 @@
#include <linux/bitops.h>
#include <linux/kernel.h>
-#include <linux/init.h>
#include <asm/processor.h>
#include <asm/e820.h>
@@ -9,236 +8,6 @@
#include "cpu.h"
-#ifdef CONFIG_X86_OOSTORE
-
-static u32 __cpuinit power2(u32 x)
-{
- u32 s = 1;
-
- while (s <= x)
- s <<= 1;
-
- return s >>= 1;
-}
-
-
-/*
- * Set up an actual MCR
- */
-static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key)
-{
- u32 lo, hi;
-
- hi = base & ~0xFFF;
- lo = ~(size-1); /* Size is a power of 2 so this makes a mask */
- lo &= ~0xFFF; /* Remove the ctrl value bits */
- lo |= key; /* Attribute we wish to set */
- wrmsr(reg+MSR_IDT_MCR0, lo, hi);
- mtrr_centaur_report_mcr(reg, lo, hi); /* Tell the mtrr driver */
-}
-
-/*
- * Figure what we can cover with MCR's
- *
- * Shortcut: We know you can't put 4Gig of RAM on a winchip
- */
-static u32 __cpuinit ramtop(void)
-{
- u32 clip = 0xFFFFFFFFUL;
- u32 top = 0;
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long start, end;
-
- if (e820.map[i].addr > 0xFFFFFFFFUL)
- continue;
- /*
- * Don't MCR over reserved space. Ignore the ISA hole
- * we frob around that catastrophe already
- */
- if (e820.map[i].type == E820_RESERVED) {
- if (e820.map[i].addr >= 0x100000UL &&
- e820.map[i].addr < clip)
- clip = e820.map[i].addr;
- continue;
- }
- start = e820.map[i].addr;
- end = e820.map[i].addr + e820.map[i].size;
- if (start >= end)
- continue;
- if (end > top)
- top = end;
- }
- /*
- * Everything below 'top' should be RAM except for the ISA hole.
- * Because of the limited MCR's we want to map NV/ACPI into our
- * MCR range for gunk in RAM
- *
- * Clip might cause us to MCR insufficient RAM but that is an
- * acceptable failure mode and should only bite obscure boxes with
- * a VESA hole at 15Mb
- *
- * The second case Clip sometimes kicks in is when the EBDA is marked
- * as reserved. Again we fail safe with reasonable results
- */
- if (top > clip)
- top = clip;
-
- return top;
-}
-
-/*
- * Compute a set of MCR's to give maximum coverage
- */
-static int __cpuinit centaur_mcr_compute(int nr, int key)
-{
- u32 mem = ramtop();
- u32 root = power2(mem);
- u32 base = root;
- u32 top = root;
- u32 floor = 0;
- int ct = 0;
-
- while (ct < nr) {
- u32 fspace = 0;
- u32 high;
- u32 low;
-
- /*
- * Find the largest block we will fill going upwards
- */
- high = power2(mem-top);
-
- /*
- * Find the largest block we will fill going downwards
- */
- low = base/2;
-
- /*
- * Don't fill below 1Mb going downwards as there
- * is an ISA hole in the way.
- */
- if (base <= 1024*1024)
- low = 0;
-
- /*
- * See how much space we could cover by filling below
- * the ISA hole
- */
-
- if (floor == 0)
- fspace = 512*1024;
- else if (floor == 512*1024)
- fspace = 128*1024;
-
- /* And forget ROM space */
-
- /*
- * Now install the largest coverage we get
- */
- if (fspace > high && fspace > low) {
- centaur_mcr_insert(ct, floor, fspace, key);
- floor += fspace;
- } else if (high > low) {
- centaur_mcr_insert(ct, top, high, key);
- top += high;
- } else if (low > 0) {
- base -= low;
- centaur_mcr_insert(ct, base, low, key);
- } else
- break;
- ct++;
- }
- /*
- * We loaded ct values. We now need to set the mask. The caller
- * must do this bit.
- */
- return ct;
-}
-
-static void __cpuinit centaur_create_optimal_mcr(void)
-{
- int used;
- int i;
-
- /*
- * Allocate up to 6 mcrs to mark as much of ram as possible
- * as write combining and weak write ordered.
- *
- * To experiment with: Linux never uses stack operations for
- * mmio spaces so we could globally enable stack operation wc
- *
- * Load the registers with type 31 - full write combining, all
- * writes weakly ordered.
- */
- used = centaur_mcr_compute(6, 31);
-
- /*
- * Wipe unused MCRs
- */
- for (i = used; i < 8; i++)
- wrmsr(MSR_IDT_MCR0+i, 0, 0);
-}
-
-static void __cpuinit winchip2_create_optimal_mcr(void)
-{
- u32 lo, hi;
- int used;
- int i;
-
- /*
- * Allocate up to 6 mcrs to mark as much of ram as possible
- * as write combining, weak store ordered.
- *
- * Load the registers with type 25
- * 8 - weak write ordering
- * 16 - weak read ordering
- * 1 - write combining
- */
- used = centaur_mcr_compute(6, 25);
-
- /*
- * Mark the registers we are using.
- */
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- for (i = 0; i < used; i++)
- lo |= 1<<(9+i);
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-
- /*
- * Wipe unused MCRs
- */
-
- for (i = used; i < 8; i++)
- wrmsr(MSR_IDT_MCR0+i, 0, 0);
-}
-
-/*
- * Handle the MCR key on the Winchip 2.
- */
-static void __cpuinit winchip2_unprotect_mcr(void)
-{
- u32 lo, hi;
- u32 key;
-
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- lo &= ~0x1C0; /* blank bits 8-6 */
- key = (lo>>17) & 7;
- lo |= key<<6; /* replace with unlock key */
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-}
-
-static void __cpuinit winchip2_protect_mcr(void)
-{
- u32 lo, hi;
-
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- lo &= ~0x1C0; /* blank bits 8-6 */
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-}
-#endif /* CONFIG_X86_OOSTORE */
-
#define ACE_PRESENT (1 << 6)
#define ACE_ENABLED (1 << 7)
#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */
@@ -247,7 +16,7 @@ static void __cpuinit winchip2_protect_mcr(void)
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
-static void __cpuinit init_c3(struct cpuinfo_x86 *c)
+static void init_c3(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -318,7 +87,7 @@ enum {
EAMD3D = 1<<20,
};
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
+static void early_init_centaur(struct cpuinfo_x86 *c)
{
switch (c->x86) {
#ifdef CONFIG_X86_32
@@ -337,7 +106,7 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
#endif
}
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+static void init_centaur(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
char *name;
@@ -363,20 +132,6 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
fcr_clr = DPDC;
printk(KERN_NOTICE "Disabling bugged TSC.\n");
clear_cpu_cap(c, X86_FEATURE_TSC);
-#ifdef CONFIG_X86_OOSTORE
- centaur_create_optimal_mcr();
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- *
- * The C6 original lacks weak read order
- *
- * Note 0x120 is write only on Winchip 1
- */
- wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0);
-#endif
break;
case 8:
switch (c->x86_mask) {
@@ -393,40 +148,12 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
E2MMX|EAMD3D;
fcr_clr = DPDC;
-#ifdef CONFIG_X86_OOSTORE
- winchip2_unprotect_mcr();
- winchip2_create_optimal_mcr();
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- */
- lo |= 31;
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
- winchip2_protect_mcr();
-#endif
break;
case 9:
name = "3";
fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
E2MMX|EAMD3D;
fcr_clr = DPDC;
-#ifdef CONFIG_X86_OOSTORE
- winchip2_unprotect_mcr();
- winchip2_create_optimal_mcr();
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- */
- lo |= 31;
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
- winchip2_protect_mcr();
-#endif
break;
default:
name = "??";
@@ -468,10 +195,10 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
#endif
}
-static unsigned int __cpuinit
+#ifdef CONFIG_X86_32
+static unsigned int
centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
-#ifdef CONFIG_X86_32
/* VIA C3 CPUs (670-68F) need further shifting. */
if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
size >>= 8;
@@ -484,16 +211,18 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
if ((c->x86 == 6) && (c->x86_model == 9) &&
(c->x86_mask == 1) && (size == 65))
size -= 1;
-#endif
return size;
}
+#endif
-static const struct cpu_dev __cpuinitconst centaur_cpu_dev = {
+static const struct cpu_dev centaur_cpu_dev = {
.c_vendor = "Centaur",
.c_ident = { "CentaurHauls" },
.c_early_init = early_init_centaur,
.c_init = init_centaur,
- .c_size_cache = centaur_size_cache,
+#ifdef CONFIG_X86_32
+ .legacy_cache_size = centaur_size_cache,
+#endif
.c_x86_vendor = X86_VENDOR_CENTAUR,
};
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d43cad74f16..ef1b93f18ed 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -8,6 +8,7 @@
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/init.h>
+#include <linux/kprobes.h>
#include <linux/kgdb.h>
#include <linux/smp.h>
#include <linux/io.h>
@@ -18,7 +19,9 @@
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
+#include <asm/debugreg.h>
#include <asm/sections.h>
+#include <asm/vsyscall.h>
#include <linux/topology.h>
#include <linux/cpumask.h>
#include <asm/pgtable.h>
@@ -28,6 +31,7 @@
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/mtrr.h>
#include <linux/numa.h>
#include <asm/asm.h>
@@ -35,6 +39,8 @@
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/pat.h>
+#include <asm/microcode.h>
+#include <asm/microcode_intel.h>
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/uv/uv.h>
@@ -59,7 +65,7 @@ void __init setup_cpu_local_masks(void)
alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
}
-static void __cpuinit default_init(struct cpuinfo_x86 *c)
+static void default_init(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
cpu_detect_cache_sizes(c);
@@ -76,13 +82,13 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
#endif
}
-static const struct cpu_dev __cpuinitconst default_cpu = {
+static const struct cpu_dev default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
.c_x86_vendor = X86_VENDOR_UNKNOWN,
};
-static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+static const struct cpu_dev *this_cpu = &default_cpu;
DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
@@ -142,6 +148,8 @@ static int __init x86_xsave_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ setup_clear_cpu_cap(X86_FEATURE_AVX2);
return 1;
}
__setup("noxsave", x86_xsave_setup);
@@ -154,8 +162,8 @@ static int __init x86_xsaveopt_setup(char *s)
__setup("noxsaveopt", x86_xsaveopt_setup);
#ifdef CONFIG_X86_32
-static int cachesize_override __cpuinitdata = -1;
-static int disable_x86_serial_nr __cpuinitdata = 1;
+static int cachesize_override = -1;
+static int disable_x86_serial_nr = 1;
static int __init cachesize_setup(char *str)
{
@@ -209,12 +217,12 @@ static inline int flag_is_changeable_p(u32 flag)
}
/* Probe for the CPUID instruction */
-static int __cpuinit have_cpuid_p(void)
+int have_cpuid_p(void)
{
return flag_is_changeable_p(X86_EFLAGS_ID);
}
-static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
unsigned long lo, hi;
@@ -245,32 +253,45 @@ static inline int flag_is_changeable_p(u32 flag)
{
return 1;
}
-/* Probe for the CPUID instruction */
-static inline int have_cpuid_p(void)
-{
- return 1;
-}
static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
}
#endif
-static int disable_smep __cpuinitdata;
static __init int setup_disable_smep(char *arg)
{
- disable_smep = 1;
+ setup_clear_cpu_cap(X86_FEATURE_SMEP);
return 1;
}
__setup("nosmep", setup_disable_smep);
-static __cpuinit void setup_smep(struct cpuinfo_x86 *c)
+static __always_inline void setup_smep(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_SMEP))
+ set_in_cr4(X86_CR4_SMEP);
+}
+
+static __init int setup_disable_smap(char *arg)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SMAP);
+ return 1;
+}
+__setup("nosmap", setup_disable_smap);
+
+static __always_inline void setup_smap(struct cpuinfo_x86 *c)
{
- if (cpu_has(c, X86_FEATURE_SMEP)) {
- if (unlikely(disable_smep)) {
- setup_clear_cpu_cap(X86_FEATURE_SMEP);
- clear_in_cr4(X86_CR4_SMEP);
- } else
- set_in_cr4(X86_CR4_SMEP);
+ unsigned long eflags;
+
+ /* This should have been cleared long ago */
+ raw_local_save_flags(eflags);
+ BUG_ON(eflags & X86_EFLAGS_AC);
+
+ if (cpu_has(c, X86_FEATURE_SMAP)) {
+#ifdef CONFIG_X86_SMAP
+ set_in_cr4(X86_CR4_SMAP);
+#else
+ clear_in_cr4(X86_CR4_SMAP);
+#endif
}
}
@@ -284,7 +305,7 @@ struct cpuid_dependent_feature {
u32 level;
};
-static const struct cpuid_dependent_feature __cpuinitconst
+static const struct cpuid_dependent_feature
cpuid_dependent_features[] = {
{ X86_FEATURE_MWAIT, 0x00000005 },
{ X86_FEATURE_DCA, 0x00000009 },
@@ -292,7 +313,7 @@ cpuid_dependent_features[] = {
{ 0, 0 }
};
-static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
+static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
{
const struct cpuid_dependent_feature *df;
@@ -330,9 +351,10 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
*/
/* Look up CPU names by table lookup. */
-static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
+static const char *table_lookup_model(struct cpuinfo_x86 *c)
{
- const struct cpu_model_info *info;
+#ifdef CONFIG_X86_32
+ const struct legacy_cpu_model_info *info;
if (c->x86_model >= 16)
return NULL; /* Range check */
@@ -340,18 +362,19 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
if (!this_cpu)
return NULL;
- info = this_cpu->c_models;
+ info = this_cpu->legacy_models;
- while (info && info->family) {
+ while (info->family) {
if (info->family == c->x86)
return info->model_names[c->x86_model];
info++;
}
+#endif
return NULL; /* Not found */
}
-__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
-__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_cleared[NCAPINTS];
+__u32 cpu_caps_set[NCAPINTS];
void load_percpu_segment(int cpu)
{
@@ -380,9 +403,9 @@ void switch_to_new_gdt(int cpu)
load_percpu_segment(cpu);
}
-static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
+static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
-static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
+static void get_model_name(struct cpuinfo_x86 *c)
{
unsigned int *v;
char *p, *q;
@@ -411,7 +434,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
}
}
-void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
+void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
{
unsigned int n, dummy, ebx, ecx, edx, l2size;
@@ -436,8 +459,8 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
#else
/* do processor-specific cache resizing */
- if (this_cpu->c_size_cache)
- l2size = this_cpu->c_size_cache(c, l2size);
+ if (this_cpu->legacy_cache_size)
+ l2size = this_cpu->legacy_cache_size(c, l2size);
/* Allow user to override all this if necessary. */
if (cachesize_override != -1)
@@ -450,7 +473,37 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
c->x86_cache_size = l2size;
}
-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+u16 __read_mostly tlb_lli_4k[NR_INFO];
+u16 __read_mostly tlb_lli_2m[NR_INFO];
+u16 __read_mostly tlb_lli_4m[NR_INFO];
+u16 __read_mostly tlb_lld_4k[NR_INFO];
+u16 __read_mostly tlb_lld_2m[NR_INFO];
+u16 __read_mostly tlb_lld_4m[NR_INFO];
+u16 __read_mostly tlb_lld_1g[NR_INFO];
+
+/*
+ * tlb_flushall_shift shows the balance point in replacing cr3 write
+ * with multiple 'invlpg'. It will do this replacement when
+ * flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
+ * If tlb_flushall_shift is -1, means the replacement will be disabled.
+ */
+s8 __read_mostly tlb_flushall_shift = -1;
+
+void cpu_detect_tlb(struct cpuinfo_x86 *c)
+{
+ if (this_cpu->c_detect_tlb)
+ this_cpu->c_detect_tlb(c);
+
+ printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
+ "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"
+ "tlb_flushall_shift: %d\n",
+ tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
+ tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
+ tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
+ tlb_lld_1g[ENTRIES], tlb_flushall_shift);
+}
+
+void detect_ht(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_HT
u32 eax, ebx, ecx, edx;
@@ -501,7 +554,7 @@ out:
#endif
}
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+static void get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
int i;
@@ -528,7 +581,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
this_cpu = &default_cpu;
}
-void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
+void cpu_detect(struct cpuinfo_x86 *c)
{
/* Get vendor name */
cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
@@ -558,7 +611,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
}
}
-void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
+void get_cpu_cap(struct cpuinfo_x86 *c)
{
u32 tfms, xlvl;
u32 ebx;
@@ -609,7 +662,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
init_scattered_cpuid_features(c);
}
-static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
int i;
@@ -668,10 +721,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
return;
cpu_detect(c);
-
get_cpu_vendor(c);
-
get_cpu_cap(c);
+ fpu_detect(c);
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
@@ -679,10 +731,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
c->cpu_index = 0;
filter_cpuid_features(c, false);
- setup_smep(c);
-
if (this_cpu->c_bsp_init)
this_cpu->c_bsp_init(c);
+
+ setup_force_cpu_cap(X86_FEATURE_ALWAYS);
}
void __init early_cpu_init(void)
@@ -727,7 +779,7 @@ void __init early_cpu_init(void)
* unless we can find a reliable way to detect all the broken cases.
* Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
*/
-static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
+static void detect_nopl(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
clear_cpu_cap(c, X86_FEATURE_NOPL);
@@ -736,7 +788,7 @@ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
#endif
}
-static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
+static void generic_identify(struct cpuinfo_x86 *c)
{
c->extended_cpuid_level = 0;
@@ -765,8 +817,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
c->phys_proc_id = c->initial_apicid;
}
- setup_smep(c);
-
get_model_name(c); /* Default name */
detect_nopl(c);
@@ -775,7 +825,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
-static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+static void identify_cpu(struct cpuinfo_x86 *c)
{
int i;
@@ -831,6 +881,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
+ /* Set up SMEP/SMAP */
+ setup_smep(c);
+ setup_smap(c);
+
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
@@ -877,6 +931,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* AND the already accumulated flags with these */
for (i = 0; i < NCAPINTS; i++)
boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+
+ /* OR, i.e. replicate the bug flags */
+ for (i = NCAPINTS; i < NCAPINTS + NBUGINTS; i++)
+ c->x86_capability[i] |= boot_cpu_data.x86_capability[i];
}
/* Init Machine Check Exception if available. */
@@ -897,6 +955,38 @@ static void vgetcpu_set_mode(void)
else
vgetcpu_mode = VGETCPU_LSL;
}
+
+/* May not be __init: called during resume */
+static void syscall32_cpu_init(void)
+{
+ /* Load these always in case some future AMD CPU supports
+ SYSENTER from compat mode too. */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+
+ wrmsrl(MSR_CSTAR, ia32_cstar_target);
+}
+#endif
+
+#ifdef CONFIG_X86_32
+void enable_sep_cpu(void)
+{
+ int cpu = get_cpu();
+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
+
+ if (!boot_cpu_has(X86_FEATURE_SEP)) {
+ put_cpu();
+ return;
+ }
+
+ tss->x86_tss.ss1 = __KERNEL_CS;
+ tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
+ wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+ wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
+ put_cpu();
+}
#endif
void __init identify_boot_cpu(void)
@@ -909,9 +999,10 @@ void __init identify_boot_cpu(void)
#else
vgetcpu_set_mode();
#endif
+ cpu_detect_tlb(&boot_cpu_data);
}
-void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+void identify_secondary_cpu(struct cpuinfo_x86 *c)
{
BUG_ON(c == &boot_cpu_data);
identify_cpu(c);
@@ -926,14 +1017,14 @@ struct msr_range {
unsigned max;
};
-static const struct msr_range msr_range_array[] __cpuinitconst = {
+static const struct msr_range msr_range_array[] = {
{ 0x00000000, 0x00000418},
{ 0xc0000000, 0xc000040b},
{ 0xc0010000, 0xc0010142},
{ 0xc0011000, 0xc001103b},
};
-static void __cpuinit print_cpu_msr(void)
+static void __print_cpu_msr(void)
{
unsigned index_min, index_max;
unsigned index;
@@ -945,14 +1036,14 @@ static void __cpuinit print_cpu_msr(void)
index_max = msr_range_array[i].max;
for (index = index_min; index < index_max; index++) {
- if (rdmsrl_amd_safe(index, &val))
+ if (rdmsrl_safe(index, &val))
continue;
printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
}
}
}
-static int show_msr __cpuinitdata;
+static int show_msr;
static __init int setup_show_msr(char *arg)
{
@@ -968,12 +1059,13 @@ __setup("show_msr=", setup_show_msr);
static __init int setup_noclflush(char *arg)
{
- setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+ setup_clear_cpu_cap(X86_FEATURE_CLFLUSH);
+ setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT);
return 1;
}
__setup("noclflush", setup_noclflush);
-void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+void print_cpu_info(struct cpuinfo_x86 *c)
{
const char *vendor = NULL;
@@ -988,22 +1080,24 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
printk(KERN_CONT "%s ", vendor);
if (c->x86_model_id[0])
- printk(KERN_CONT "%s", c->x86_model_id);
+ printk(KERN_CONT "%s", strim(c->x86_model_id));
else
printk(KERN_CONT "%d86", c->x86);
+ printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model);
+
if (c->x86_mask || c->cpuid_level >= 0)
- printk(KERN_CONT " stepping %02x\n", c->x86_mask);
+ printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask);
else
- printk(KERN_CONT "\n");
+ printk(KERN_CONT ")\n");
-#ifdef CONFIG_SMP
+ print_cpu_msr(c);
+}
+
+void print_cpu_msr(struct cpuinfo_x86 *c)
+{
if (c->cpu_index < show_msr)
- print_cpu_msr();
-#else
- if (show_msr)
- print_cpu_msr();
-#endif
+ __print_cpu_msr();
}
static __init int setup_disablecpuid(char *arg)
@@ -1019,13 +1113,17 @@ static __init int setup_disablecpuid(char *arg)
}
__setup("clearcpuid=", setup_disablecpuid);
+DEFINE_PER_CPU(unsigned long, kernel_stack) =
+ (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(kernel_stack);
+
#ifdef CONFIG_X86_64
struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
-struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
- (unsigned long) nmi_idt_table };
+struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,
+ (unsigned long) debug_idt_table };
DEFINE_PER_CPU_FIRST(union irq_stack_union,
- irq_stack_union) __aligned(PAGE_SIZE);
+ irq_stack_union) __aligned(PAGE_SIZE) __visible;
/*
* The following four percpu variables are hot. Align current_task to
@@ -1035,14 +1133,15 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
&init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
-DEFINE_PER_CPU(unsigned long, kernel_stack) =
- (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(kernel_stack);
-
DEFINE_PER_CPU(char *, irq_stack_ptr) =
init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
-DEFINE_PER_CPU(unsigned int, irq_count) = -1;
+DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
+
+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
+
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
/*
* Special IST stacks which the CPU switches to when it calls
@@ -1076,11 +1175,10 @@ void syscall_init(void)
/* Flags to clear on syscall */
wrmsrl(MSR_SYSCALL_MASK,
- X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
+ X86_EFLAGS_IOPL|X86_EFLAGS_AC);
}
-unsigned long kernel_eflags;
-
/*
* Copies of the original ist values from the tss are only accessed during
* debugging, no special alignment required.
@@ -1096,35 +1194,38 @@ int is_debug_stack(unsigned long addr)
(addr <= __get_cpu_var(debug_stack_addr) &&
addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
}
+NOKPROBE_SYMBOL(is_debug_stack);
+
+DEFINE_PER_CPU(u32, debug_idt_ctr);
void debug_stack_set_zero(void)
{
- load_idt((const struct desc_ptr *)&nmi_idt_descr);
+ this_cpu_inc(debug_idt_ctr);
+ load_current_idt();
}
+NOKPROBE_SYMBOL(debug_stack_set_zero);
void debug_stack_reset(void)
{
- load_idt((const struct desc_ptr *)&idt_descr);
+ if (WARN_ON(!this_cpu_read(debug_idt_ctr)))
+ return;
+ if (this_cpu_dec_return(debug_idt_ctr) == 0)
+ load_current_idt();
}
+NOKPROBE_SYMBOL(debug_stack_reset);
#else /* CONFIG_X86_64 */
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
#ifdef CONFIG_CC_STACKPROTECTOR
DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
#endif
-/* Make sure %fs and %gs are initialized properly in idle threads */
-struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
-{
- memset(regs, 0, sizeof(struct pt_regs));
- regs->fs = __KERNEL_PERCPU;
- regs->gs = __KERNEL_STACK_CANARY;
-
- return regs;
-}
#endif /* CONFIG_X86_64 */
/*
@@ -1158,15 +1259,6 @@ static void dbg_restore_debug_regs(void)
#endif /* ! CONFIG_KGDB */
/*
- * Prints an error where the NUMA and configured core-number mismatch and the
- * platform didn't override this to fix it up
- */
-void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node)
-{
- pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id);
-}
-
-/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
* and IDT. We reload them nevertheless, this function acts as a
@@ -1175,7 +1267,7 @@ void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node)
*/
#ifdef CONFIG_X86_64
-void __cpuinit cpu_init(void)
+void cpu_init(void)
{
struct orig_ist *oist;
struct task_struct *me;
@@ -1184,12 +1276,18 @@ void __cpuinit cpu_init(void)
int cpu;
int i;
+ /*
+ * Load microcode on this cpu if a valid microcode is available.
+ * This is early microcode loading procedure.
+ */
+ load_ucode_ap();
+
cpu = stack_smp_processor_id();
t = &per_cpu(init_tss, cpu);
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
- if (cpu != 0 && percpu_read(numa_node) == 0 &&
+ if (this_cpu_read(numa_node) == 0 &&
early_cpu_to_node(cpu) != NUMA_NO_NODE)
set_numa_node(early_cpu_to_node(cpu));
#endif
@@ -1211,7 +1309,7 @@ void __cpuinit cpu_init(void)
switch_to_new_gdt(cpu);
loadsegment(fs, 0);
- load_idt((const struct desc_ptr *)&idt_descr);
+ load_current_idt();
memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
syscall_init();
@@ -1221,8 +1319,7 @@ void __cpuinit cpu_init(void)
barrier();
x86_configure_nx();
- if (cpu != 0)
- enable_x2apic();
+ enable_x2apic();
/*
* set up and load the per-CPU TSS
@@ -1262,9 +1359,6 @@ void __cpuinit cpu_init(void)
dbg_restore_debug_regs();
fpu_init();
- xsave_init();
-
- raw_local_save_flags(kernel_eflags);
if (is_uv_system())
uv_cpu_init();
@@ -1272,13 +1366,15 @@ void __cpuinit cpu_init(void)
#else
-void __cpuinit cpu_init(void)
+void cpu_init(void)
{
int cpu = smp_processor_id();
struct task_struct *curr = current;
struct tss_struct *t = &per_cpu(init_tss, cpu);
struct thread_struct *thread = &curr->thread;
+ show_ucode_info_early();
+
if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
for (;;)
@@ -1290,7 +1386,7 @@ void __cpuinit cpu_init(void)
if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
- load_idt(&idt_descr);
+ load_current_idt();
switch_to_new_gdt(cpu);
/*
@@ -1317,6 +1413,19 @@ void __cpuinit cpu_init(void)
dbg_restore_debug_regs();
fpu_init();
- xsave_init();
}
#endif
+
+#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
+void warn_pre_alternatives(void)
+{
+ WARN(1, "You're using static_cpu_has before alternatives have run!\n");
+}
+EXPORT_SYMBOL_GPL(warn_pre_alternatives);
+#endif
+
+inline bool __static_cpu_has_safe(u16 bit)
+{
+ return boot_cpu_has(bit);
+}
+EXPORT_SYMBOL_GPL(__static_cpu_has_safe);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 8bacc7826fb..c37dc37e831 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,12 +1,6 @@
#ifndef ARCH_X86_CPU_H
#define ARCH_X86_CPU_H
-struct cpu_model_info {
- int vendor;
- int family;
- const char *model_names[16];
-};
-
/* attempt to consolidate cpu attributes */
struct cpu_dev {
const char *c_vendor;
@@ -14,14 +8,31 @@ struct cpu_dev {
/* some have two possibilities for cpuid string */
const char *c_ident[2];
- struct cpu_model_info c_models[4];
-
void (*c_early_init)(struct cpuinfo_x86 *);
void (*c_bsp_init)(struct cpuinfo_x86 *);
void (*c_init)(struct cpuinfo_x86 *);
void (*c_identify)(struct cpuinfo_x86 *);
- unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
+ void (*c_detect_tlb)(struct cpuinfo_x86 *);
int c_x86_vendor;
+#ifdef CONFIG_X86_32
+ /* Optional vendor specific routine to obtain the cache size. */
+ unsigned int (*legacy_cache_size)(struct cpuinfo_x86 *,
+ unsigned int);
+
+ /* Family/stepping-based lookup table for model names. */
+ struct legacy_cpu_model_info {
+ int family;
+ const char *model_names[16];
+ } legacy_models[5];
+#endif
+};
+
+struct _tlb_table {
+ unsigned char descriptor;
+ char tlb_type;
+ unsigned int entries;
+ /* unsigned int ways; */
+ char info[128];
};
#define cpu_dev_register(cpu_devX) \
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 4fbd384fb64..aaf152e7963 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -1,4 +1,3 @@
-#include <linux/init.h>
#include <linux/bitops.h>
#include <linux/delay.h>
#include <linux/pci.h>
@@ -15,7 +14,7 @@
/*
* Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
*/
-static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+static void __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned char ccr2, ccr3;
@@ -44,7 +43,7 @@ static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
}
}
-static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+static void do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned long flags;
@@ -59,25 +58,25 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
* Actually since bugs.h doesn't even reference this perhaps someone should
* fix the documentation ???
*/
-static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
+static unsigned char Cx86_dir0_msb = 0;
-static const char __cpuinitconst Cx86_model[][9] = {
+static const char Cx86_model[][9] = {
"Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
"M II ", "Unknown"
};
-static const char __cpuinitconst Cx486_name[][5] = {
+static const char Cx486_name[][5] = {
"SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
"SRx2", "DRx2"
};
-static const char __cpuinitconst Cx486S_name[][4] = {
+static const char Cx486S_name[][4] = {
"S", "S2", "Se", "S2e"
};
-static const char __cpuinitconst Cx486D_name[][4] = {
+static const char Cx486D_name[][4] = {
"DX", "DX2", "?", "?", "?", "DX4"
};
-static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
-static const char __cpuinitconst cyrix_model_mult1[] = "12??43";
-static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
+static char Cx86_cb[] = "?.5x Core/Bus Clock";
+static const char cyrix_model_mult1[] = "12??43";
+static const char cyrix_model_mult2[] = "12233445";
/*
* Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -87,7 +86,7 @@ static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
* FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP
*/
-static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
+static void check_cx686_slop(struct cpuinfo_x86 *c)
{
unsigned long flags;
@@ -112,7 +111,7 @@ static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
}
-static void __cpuinit set_cx86_reorder(void)
+static void set_cx86_reorder(void)
{
u8 ccr3;
@@ -127,7 +126,7 @@ static void __cpuinit set_cx86_reorder(void)
setCx86(CX86_CCR3, ccr3);
}
-static void __cpuinit set_cx86_memwb(void)
+static void set_cx86_memwb(void)
{
printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
@@ -143,7 +142,7 @@ static void __cpuinit set_cx86_memwb(void)
* Configure later MediaGX and/or Geode processor.
*/
-static void __cpuinit geode_configure(void)
+static void geode_configure(void)
{
unsigned long flags;
u8 ccr3;
@@ -166,7 +165,7 @@ static void __cpuinit geode_configure(void)
local_irq_restore(flags);
}
-static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
+static void early_init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir1 = 0;
@@ -185,7 +184,7 @@ static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
}
}
-static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
+static void init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
char *buf = c->x86_model_id;
@@ -249,7 +248,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
/* Emulate MTRRs using Cyrix's ARRs. */
set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
/* 6x86's contain this bug */
- c->coma_bug = 1;
+ set_cpu_bug(c, X86_BUG_COMA);
break;
case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
@@ -317,7 +316,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
/* Enable MMX extensions (App note 108) */
setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
} else {
- c->coma_bug = 1; /* 6x86MX, it has the bug. */
+ /* A 6x86MX - it has the bug. */
+ set_cpu_bug(c, X86_BUG_COMA);
}
tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0;
Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7];
@@ -332,7 +332,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
switch (dir0_lsn) {
case 0xd: /* either a 486SLC or DLC w/o DEVID */
dir0_msn = 0;
- p = Cx486_name[(c->hard_math) ? 1 : 0];
+ p = Cx486_name[(cpu_has_fpu ? 1 : 0)];
break;
case 0xe: /* a 486S A step */
@@ -355,7 +355,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
/*
* Handle National Semiconductor branded processors
*/
-static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
+static void init_nsc(struct cpuinfo_x86 *c)
{
/*
* There may be GX1 processors in the wild that are branded
@@ -404,7 +404,7 @@ static inline int test_cyrix_52div(void)
return (unsigned char) (test >> 8) == 0x02;
}
-static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
+static void cyrix_identify(struct cpuinfo_x86 *c)
{
/* Detect Cyrix with disabled CPUID */
if (c->x86 == 4 && test_cyrix_52div()) {
@@ -440,7 +440,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
}
}
-static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
+static const struct cpu_dev cyrix_cpu_dev = {
.c_vendor = "Cyrix",
.c_ident = { "CyrixInstead" },
.c_early_init = early_init_cyrix,
@@ -451,7 +451,7 @@ static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
cpu_dev_register(cyrix_cpu_dev);
-static const struct cpu_dev __cpuinitconst nsc_cpu_dev = {
+static const struct cpu_dev nsc_cpu_dev = {
.c_vendor = "NSC",
.c_ident = { "Geode by NSC" },
.c_init = init_nsc,
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 755f64fb074..36ce402a3fa 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -25,11 +25,6 @@
#include <asm/processor.h>
#include <asm/hypervisor.h>
-/*
- * Hypervisor detect order. This is specified explicitly here because
- * some hypervisors might implement compatibility modes for other
- * hypervisors and therefore need to be detected in specific sequence.
- */
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
#ifdef CONFIG_XEN_PVHVM
@@ -37,6 +32,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
#endif
&x86_hyper_vmware,
&x86_hyper_ms_hyperv,
+#ifdef CONFIG_KVM_GUEST
+ &x86_hyper_kvm,
+#endif
};
const struct hypervisor_x86 *x86_hyper;
@@ -46,18 +44,22 @@ static inline void __init
detect_hypervisor_vendor(void)
{
const struct hypervisor_x86 *h, * const *p;
+ uint32_t pri, max_pri = 0;
for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
h = *p;
- if (h->detect()) {
+ pri = h->detect();
+ if (pri != 0 && pri > max_pri) {
+ max_pri = pri;
x86_hyper = h;
- printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
- break;
}
}
+
+ if (max_pri)
+ printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name);
}
-void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
+void init_hypervisor(struct cpuinfo_x86 *c)
{
if (x86_hyper && x86_hyper->set_cpu_features)
x86_hyper->set_cpu_features(c);
@@ -76,3 +78,10 @@ void __init init_hypervisor_platform(void)
if (x86_hyper->init_platform)
x86_hyper->init_platform();
}
+
+bool __init hypervisor_x2apic_available(void)
+{
+ return x86_hyper &&
+ x86_hyper->x2apic_available &&
+ x86_hyper->x2apic_available();
+}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 3e6ff6cbf42..f9e4fdd3b87 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,4 +1,3 @@
-#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/string.h>
@@ -17,7 +16,6 @@
#ifdef CONFIG_X86_64
#include <linux/topology.h>
-#include <asm/numa_64.h>
#endif
#include "cpu.h"
@@ -27,17 +25,14 @@
#include <asm/apic.h>
#endif
-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
+static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
/* Unmask CPUID levels if masked: */
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-
- if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
- misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
- wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
c->cpuid_level = cpuid_eax(0);
get_cpu_cap(c);
}
@@ -94,7 +89,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
if (!check_tsc_unstable())
- sched_clock_stable = 1;
+ set_sched_clock_stable();
+ }
+
+ /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
+ if (c->x86 == 6) {
+ switch (c->x86_model) {
+ case 0x27: /* Penwell */
+ case 0x35: /* Cloverview */
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
+ break;
+ default:
+ break;
+ }
}
/*
@@ -119,16 +126,10 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
* Ingo Molnar reported a Pentium D (model 6) and a Xeon
* (model 2) with the same problem.
*/
- if (c->x86 == 15) {
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-
- if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
- printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
-
- misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
- wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
- }
- }
+ if (c->x86 == 15)
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0)
+ pr_info("kmemcheck: Disabling fast string operations\n");
#endif
/*
@@ -152,7 +153,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
* This is called before we do cpu ident work
*/
-int __cpuinit ppro_with_ram_bug(void)
+int ppro_with_ram_bug(void)
{
/* Uses data from early_cpu_detect now */
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
@@ -165,21 +166,7 @@ int __cpuinit ppro_with_ram_bug(void)
return 0;
}
-#ifdef CONFIG_X86_F00F_BUG
-static void __cpuinit trap_init_f00f_bug(void)
-{
- __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
-
- /*
- * Update the IDT descriptor and reload the IDT so that
- * it uses the read-only mapped virtual address.
- */
- idt_descr.address = fix_to_virt(FIX_F00F_IDT);
- load_idt(&idt_descr);
-}
-#endif
-
-static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
+static void intel_smp_check(struct cpuinfo_x86 *c)
{
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
@@ -199,24 +186,28 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
}
}
-static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
+static int forcepae;
+static int __init forcepae_setup(char *__unused)
{
- unsigned long lo, hi;
+ forcepae = 1;
+ return 1;
+}
+__setup("forcepae", forcepae_setup);
+static void intel_workarounds(struct cpuinfo_x86 *c)
+{
#ifdef CONFIG_X86_F00F_BUG
/*
* All current models of Pentium and Pentium with MMX technology CPUs
* have the F0 0F bug, which lets nonprivileged users lock up the
- * system.
- * Note that the workaround only should be initialized once...
+ * system. Announce that the fault handler will be checking for it.
*/
- c->f00f_bug = 0;
+ clear_cpu_bug(c, X86_BUG_F00F);
if (!paravirt_enabled() && c->x86 == 5) {
static int f00f_workaround_enabled;
- c->f00f_bug = 1;
+ set_cpu_bug(c, X86_BUG_F00F);
if (!f00f_workaround_enabled) {
- trap_init_f00f_bug();
printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
f00f_workaround_enabled = 1;
}
@@ -231,16 +222,26 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
clear_cpu_cap(c, X86_FEATURE_SEP);
/*
+ * PAE CPUID issue: many Pentium M report no PAE but may have a
+ * functionally usable PAE implementation.
+ * Forcefully enable PAE if kernel parameter "forcepae" is present.
+ */
+ if (forcepae) {
+ printk(KERN_WARNING "PAE forced!\n");
+ set_cpu_cap(c, X86_FEATURE_PAE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
+ }
+
+ /*
* P4 Xeon errata 037 workaround.
* Hardware prefetcher may cause stale data to be loaded into the cache.
*/
if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
- rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
- if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
- printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
- printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
- lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
- wrmsr(MSR_IA32_MISC_ENABLE, lo, hi);
+ if (msr_set_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT)
+ > 0) {
+ pr_info("CPU: C0 stepping P4 Xeon detected.\n");
+ pr_info("CPU: Disabling hardware prefetching (Errata 037)\n");
}
}
@@ -273,19 +274,15 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
}
#endif
-#ifdef CONFIG_X86_NUMAQ
- numaq_tsc_disable();
-#endif
-
intel_smp_check(c);
}
#else
-static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
+static void intel_workarounds(struct cpuinfo_x86 *c)
{
}
#endif
-static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
+static void srat_detect_node(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_NUMA
unsigned node;
@@ -305,7 +302,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
/*
* find out the number of processor cores on the die
*/
-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
+static int intel_num_cpu_cores(struct cpuinfo_x86 *c)
{
unsigned int eax, ebx, ecx, edx;
@@ -320,7 +317,7 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
return 1;
}
-static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
+static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
{
/* Intel VMX MSR indicated features */
#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
@@ -358,7 +355,7 @@ static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
}
}
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
+static void init_intel(struct cpuinfo_x86 *c)
{
unsigned int l2 = 0;
@@ -373,6 +370,17 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
*/
detect_extended_topology(c);
+ if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
+ /*
+ * let's use the legacy cpuid vector 0x1 and 0x4 for topology
+ * detection.
+ */
+ c->x86_max_cores = intel_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+ detect_ht(c);
+#endif
+ }
+
l2 = init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
@@ -392,7 +400,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_PEBS);
}
- if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
+ if (c->x86 == 6 && cpu_has_clflush &&
+ (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
#ifdef CONFIG_X86_64
@@ -440,17 +449,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_P3);
#endif
- if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
- /*
- * let's use the legacy cpuid vector 0x1 and 0x4 for topology
- * detection.
- */
- c->x86_max_cores = intel_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
- }
-
/* Work around errata */
srat_detect_node(c);
@@ -477,7 +475,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
}
#ifdef CONFIG_X86_32
-static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/*
* Intel PIII Tualatin. This comes in two flavours.
@@ -491,12 +489,209 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
}
#endif
-static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
+#define TLB_INST_4K 0x01
+#define TLB_INST_4M 0x02
+#define TLB_INST_2M_4M 0x03
+
+#define TLB_INST_ALL 0x05
+#define TLB_INST_1G 0x06
+
+#define TLB_DATA_4K 0x11
+#define TLB_DATA_4M 0x12
+#define TLB_DATA_2M_4M 0x13
+#define TLB_DATA_4K_4M 0x14
+
+#define TLB_DATA_1G 0x16
+
+#define TLB_DATA0_4K 0x21
+#define TLB_DATA0_4M 0x22
+#define TLB_DATA0_2M_4M 0x23
+
+#define STLB_4K 0x41
+#define STLB_4K_2M 0x42
+
+static const struct _tlb_table intel_tlb_table[] = {
+ { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
+ { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" },
+ { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" },
+ { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
+ { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
+ { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
+ { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
+ { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
+ { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
+ { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
+ { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
+ { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" },
+ { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" },
+ { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" },
+ { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
+ { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
+ { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
+ { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
+ { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
+ { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
+ { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
+ { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
+ { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
+ { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
+ { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
+ { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
+ { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set ssociative" },
+ { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set ssociative" },
+ { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
+ { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
+ { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
+ { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
+ { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
+ { 0x00, 0, 0 }
+};
+
+static void intel_tlb_lookup(const unsigned char desc)
+{
+ unsigned char k;
+ if (desc == 0)
+ return;
+
+ /* look up this descriptor in the table */
+ for (k = 0; intel_tlb_table[k].descriptor != desc && \
+ intel_tlb_table[k].descriptor != 0; k++)
+ ;
+
+ if (intel_tlb_table[k].tlb_type == 0)
+ return;
+
+ switch (intel_tlb_table[k].tlb_type) {
+ case STLB_4K:
+ if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case STLB_4K_2M:
+ if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_INST_ALL:
+ if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_INST_4K:
+ if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_INST_4M:
+ if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_INST_2M_4M:
+ if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_4K:
+ case TLB_DATA0_4K:
+ if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_4M:
+ case TLB_DATA0_4M:
+ if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_2M_4M:
+ case TLB_DATA0_2M_4M:
+ if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_4K_4M:
+ if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_1G:
+ if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ }
+}
+
+static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
+{
+ switch ((c->x86 << 8) + c->x86_model) {
+ case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+ case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+ case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+ case 0x61d: /* six-core 45 nm xeon "Dunnington" */
+ tlb_flushall_shift = -1;
+ break;
+ case 0x63a: /* Ivybridge */
+ tlb_flushall_shift = 2;
+ break;
+ case 0x61a: /* 45 nm nehalem, "Bloomfield" */
+ case 0x61e: /* 45 nm nehalem, "Lynnfield" */
+ case 0x625: /* 32 nm nehalem, "Clarkdale" */
+ case 0x62c: /* 32 nm nehalem, "Gulftown" */
+ case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
+ case 0x62f: /* 32 nm Xeon E7 */
+ case 0x62a: /* SandyBridge */
+ case 0x62d: /* SandyBridge, "Romely-EP" */
+ default:
+ tlb_flushall_shift = 6;
+ }
+}
+
+static void intel_detect_tlb(struct cpuinfo_x86 *c)
+{
+ int i, j, n;
+ unsigned int regs[4];
+ unsigned char *desc = (unsigned char *)regs;
+
+ if (c->cpuid_level < 2)
+ return;
+
+ /* Number of times to iterate */
+ n = cpuid_eax(2) & 0xFF;
+
+ for (i = 0 ; i < n ; i++) {
+ cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
+
+ /* If bit 31 is set, this is an unknown format */
+ for (j = 0 ; j < 3 ; j++)
+ if (regs[j] & (1 << 31))
+ regs[j] = 0;
+
+ /* Byte 0 is level count, not a descriptor */
+ for (j = 1 ; j < 16 ; j++)
+ intel_tlb_lookup(desc[j]);
+ }
+ intel_tlb_flushall_shift_set(c);
+}
+
+static const struct cpu_dev intel_cpu_dev = {
.c_vendor = "Intel",
.c_ident = { "GenuineIntel" },
#ifdef CONFIG_X86_32
- .c_models = {
- { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[0] = "486 DX-25/33",
[1] = "486 DX-50",
@@ -509,7 +704,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
[9] = "486 DX/4-WB"
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 5, .model_names =
+ { .family = 5, .model_names =
{
[0] = "Pentium 60/66 A-step",
[1] = "Pentium 60/66",
@@ -520,7 +715,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
[8] = "Mobile Pentium MMX"
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 6, .model_names =
+ { .family = 6, .model_names =
{
[0] = "Pentium Pro A-step",
[1] = "Pentium Pro",
@@ -534,7 +729,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
[11] = "Pentium III (Tualatin)",
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 15, .model_names =
+ { .family = 15, .model_names =
{
[0] = "Pentium 4 (Unknown)",
[1] = "Pentium 4 (Willamette)",
@@ -544,8 +739,9 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
}
},
},
- .c_size_cache = intel_size_cache,
+ .legacy_cache_size = intel_size_cache,
#endif
+ .c_detect_tlb = intel_detect_tlb,
.c_early_init = early_init_intel,
.c_init = init_intel,
.c_x86_vendor = X86_VENDOR_INTEL,
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 6b45e5e7a90..9c8f7394c61 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -1,5 +1,5 @@
/*
- * Routines to indentify caches on Intel CPU.
+ * Routines to identify caches on Intel CPU.
*
* Changes:
* Venkatesh Pallipadi : Adding cache identification through cpuid(4)
@@ -37,7 +37,7 @@ struct _cache_table {
/* All the cache descriptor types we care about (no TLB or
trace cache entries) */
-static const struct _cache_table __cpuinitconst cache_table[] =
+static const struct _cache_table cache_table[] =
{
{ 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
{ 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
@@ -203,7 +203,7 @@ union l3_cache {
unsigned val;
};
-static const unsigned short __cpuinitconst assocs[] = {
+static const unsigned short assocs[] = {
[1] = 1,
[2] = 2,
[4] = 4,
@@ -217,10 +217,10 @@ static const unsigned short __cpuinitconst assocs[] = {
[0xf] = 0xffff /* fully associative - no way to show this currently */
};
-static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
-static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 };
+static const unsigned char levels[] = { 1, 1, 2, 3 };
+static const unsigned char types[] = { 1, 2, 3, 3 };
-static void __cpuinit
+static void
amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
union _cpuid4_leaf_ebx *ebx,
union _cpuid4_leaf_ecx *ecx)
@@ -298,12 +298,11 @@ struct _cache_attr {
unsigned int);
};
-#ifdef CONFIG_AMD_NB
-
+#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
/*
* L3 cache descriptors
*/
-static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb)
+static void amd_calc_l3_indices(struct amd_northbridge *nb)
{
struct amd_l3_cache *l3 = &nb->l3_cache;
unsigned int sc0, sc1, sc2, sc3;
@@ -326,8 +325,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb)
l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
}
-static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
- int index)
+static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
{
int node;
@@ -434,14 +432,14 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot,
/* check if @slot is already used or the index is already disabled */
ret = amd_get_l3_disable_slot(nb, slot);
if (ret >= 0)
- return -EINVAL;
+ return -EEXIST;
if (index > nb->l3_cache.indices)
return -EINVAL;
/* check whether the other slot has disabled the same index already */
if (index == amd_get_l3_disable_slot(nb, !slot))
- return -EINVAL;
+ return -EEXIST;
amd_l3_disable_index(nb, cpu, slot, index);
@@ -469,8 +467,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
if (err) {
if (err == -EEXIST)
- printk(KERN_WARNING "L3 disable slot %d in use!\n",
- slot);
+ pr_warning("L3 slot %d in use/index already disabled!\n",
+ slot);
return err;
}
return count;
@@ -525,13 +523,12 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
static struct _cache_attr subcaches =
__ATTR(subcaches, 0644, show_subcaches, store_subcaches);
-#else /* CONFIG_AMD_NB */
+#else
#define amd_init_l3_cache(x, y)
-#endif /* CONFIG_AMD_NB */
+#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */
static int
-__cpuinit cpuid4_cache_lookup_regs(int index,
- struct _cpuid4_info_regs *this_leaf)
+cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
{
union _cpuid4_leaf_eax eax;
union _cpuid4_leaf_ebx ebx;
@@ -539,7 +536,11 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
unsigned edx;
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- amd_cpuid4(index, &eax, &ebx, &ecx);
+ if (cpu_has_topoext)
+ cpuid_count(0x8000001d, index, &eax.full,
+ &ebx.full, &ecx.full, &edx);
+ else
+ amd_cpuid4(index, &eax, &ebx, &ecx);
amd_init_l3_cache(this_leaf, index);
} else {
cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
@@ -558,22 +559,40 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
return 0;
}
-static int __cpuinit find_num_cache_leaves(void)
+static int find_num_cache_leaves(struct cpuinfo_x86 *c)
{
- unsigned int eax, ebx, ecx, edx;
+ unsigned int eax, ebx, ecx, edx, op;
union _cpuid4_leaf_eax cache_eax;
int i = -1;
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ op = 0x8000001d;
+ else
+ op = 4;
+
do {
++i;
- /* Do cpuid(4) loop to find out num_cache_leaves */
- cpuid_count(4, i, &eax, &ebx, &ecx, &edx);
+ /* Do cpuid(op) loop to find out num_cache_leaves */
+ cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
cache_eax.full = eax;
} while (cache_eax.split.type != CACHE_TYPE_NULL);
return i;
}
-unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
+void init_amd_cacheinfo(struct cpuinfo_x86 *c)
+{
+
+ if (cpu_has_topoext) {
+ num_cache_leaves = find_num_cache_leaves(c);
+ } else if (c->extended_cpuid_level >= 0x80000006) {
+ if (cpuid_edx(0x80000006) & 0xf000)
+ num_cache_leaves = 4;
+ else
+ num_cache_leaves = 3;
+ }
+}
+
+unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
{
/* Cache sizes */
unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
@@ -589,7 +608,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
if (is_initialized == 0) {
/* Init num_cache_leaves from boot CPU */
- num_cache_leaves = find_num_cache_leaves();
+ num_cache_leaves = find_num_cache_leaves(c);
is_initialized++;
}
@@ -598,36 +617,34 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
* parameters cpuid leaf to find the cache details
*/
for (i = 0; i < num_cache_leaves; i++) {
- struct _cpuid4_info_regs this_leaf;
+ struct _cpuid4_info_regs this_leaf = {};
int retval;
retval = cpuid4_cache_lookup_regs(i, &this_leaf);
- if (retval >= 0) {
- switch (this_leaf.eax.split.level) {
- case 1:
- if (this_leaf.eax.split.type ==
- CACHE_TYPE_DATA)
- new_l1d = this_leaf.size/1024;
- else if (this_leaf.eax.split.type ==
- CACHE_TYPE_INST)
- new_l1i = this_leaf.size/1024;
- break;
- case 2:
- new_l2 = this_leaf.size/1024;
- num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
- index_msb = get_count_order(num_threads_sharing);
- l2_id = c->apicid >> index_msb;
- break;
- case 3:
- new_l3 = this_leaf.size/1024;
- num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
- index_msb = get_count_order(
- num_threads_sharing);
- l3_id = c->apicid >> index_msb;
- break;
- default:
- break;
- }
+ if (retval < 0)
+ continue;
+
+ switch (this_leaf.eax.split.level) {
+ case 1:
+ if (this_leaf.eax.split.type == CACHE_TYPE_DATA)
+ new_l1d = this_leaf.size/1024;
+ else if (this_leaf.eax.split.type == CACHE_TYPE_INST)
+ new_l1i = this_leaf.size/1024;
+ break;
+ case 2:
+ new_l2 = this_leaf.size/1024;
+ num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ l2_id = c->apicid & ~((1 << index_msb) - 1);
+ break;
+ case 3:
+ new_l3 = this_leaf.size/1024;
+ num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ l3_id = c->apicid & ~((1 << index_msb) - 1);
+ break;
+ default:
+ break;
}
}
}
@@ -713,6 +730,18 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
#endif
}
+#ifdef CONFIG_X86_HT
+ /*
+ * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
+ * turns means that the only possibility is SMT (as indicated in
+ * cpuid1). Since cpuid2 doesn't specify shared caches, and we know
+ * that SMT shares all caches, we can unconditionally set cpu_llc_id to
+ * c->phys_proc_id.
+ */
+ if (per_cpu(cpu_llc_id, cpu) == BAD_APICID)
+ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+#endif
+
c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
return l2;
@@ -725,14 +754,40 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
#ifdef CONFIG_SMP
-static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+
+static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
- unsigned long num_threads_sharing;
- int index_msb, i, sibling;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct _cpuid4_info *this_leaf;
+ int i, sibling;
+
+ if (cpu_has_topoext) {
+ unsigned int apicid, nshared, first, last;
+
+ if (!per_cpu(ici_cpuid4_info, cpu))
+ return 0;
- if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
+ this_leaf = CPUID4_INFO_IDX(cpu, index);
+ nshared = this_leaf->base.eax.split.num_threads_sharing + 1;
+ apicid = cpu_data(cpu).apicid;
+ first = apicid - (apicid % nshared);
+ last = first + nshared - 1;
+
+ for_each_online_cpu(i) {
+ apicid = cpu_data(i).apicid;
+ if ((apicid < first) || (apicid > last))
+ continue;
+ if (!per_cpu(ici_cpuid4_info, i))
+ continue;
+ this_leaf = CPUID4_INFO_IDX(i, index);
+
+ for_each_online_cpu(sibling) {
+ apicid = cpu_data(sibling).apicid;
+ if ((apicid < first) || (apicid > last))
+ continue;
+ set_bit(sibling, this_leaf->shared_cpu_map);
+ }
+ }
+ } else if (index == 3) {
for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
if (!per_cpu(ici_cpuid4_info, i))
continue;
@@ -743,8 +798,24 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
set_bit(sibling, this_leaf->shared_cpu_map);
}
}
- return;
+ } else
+ return 0;
+
+ return 1;
+}
+
+static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
+{
+ struct _cpuid4_info *this_leaf, *sibling_leaf;
+ unsigned long num_threads_sharing;
+ int index_msb, i;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (cache_shared_amd_cpu_map_setup(cpu, index))
+ return;
}
+
this_leaf = CPUID4_INFO_IDX(cpu, index);
num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;
@@ -768,7 +839,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
}
}
}
-static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
+static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
{
struct _cpuid4_info *this_leaf, *sibling_leaf;
int sibling;
@@ -781,16 +852,16 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
}
}
#else
-static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
{
}
-static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
+static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
{
}
#endif
-static void __cpuinit free_cache_attributes(unsigned int cpu)
+static void free_cache_attributes(unsigned int cpu)
{
int i;
@@ -801,7 +872,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
per_cpu(ici_cpuid4_info, cpu) = NULL;
}
-static void __cpuinit get_cpu_leaves(void *_retval)
+static void get_cpu_leaves(void *_retval)
{
int j, *retval = _retval, cpu = smp_processor_id();
@@ -821,7 +892,7 @@ static void __cpuinit get_cpu_leaves(void *_retval)
}
}
-static int __cpuinit detect_cache_attributes(unsigned int cpu)
+static int detect_cache_attributes(unsigned int cpu)
{
int retval;
@@ -955,7 +1026,7 @@ static struct attribute *default_attrs[] = {
};
#ifdef CONFIG_AMD_NB
-static struct attribute ** __cpuinit amd_l3_attrs(void)
+static struct attribute **amd_l3_attrs(void)
{
static struct attribute **attrs;
int n;
@@ -963,7 +1034,7 @@ static struct attribute ** __cpuinit amd_l3_attrs(void)
if (attrs)
return attrs;
- n = sizeof (default_attrs) / sizeof (struct attribute *);
+ n = ARRAY_SIZE(default_attrs);
if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
n += 2;
@@ -1031,7 +1102,7 @@ static struct kobj_type ktype_percpu_entry = {
.sysfs_ops = &sysfs_ops,
};
-static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu)
+static void cpuid4_cache_sysfs_exit(unsigned int cpu)
{
kfree(per_cpu(ici_cache_kobject, cpu));
kfree(per_cpu(ici_index_kobject, cpu));
@@ -1040,7 +1111,7 @@ static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu)
free_cache_attributes(cpu);
}
-static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
+static int cpuid4_cache_sysfs_init(unsigned int cpu)
{
int err;
@@ -1072,7 +1143,7 @@ err_out:
static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
/* Add/Remove cache interface for CPU device */
-static int __cpuinit cache_add_dev(struct device *dev)
+static int cache_add_dev(struct device *dev)
{
unsigned int cpu = dev->id;
unsigned long i, j;
@@ -1123,7 +1194,7 @@ static int __cpuinit cache_add_dev(struct device *dev)
return 0;
}
-static void __cpuinit cache_remove_dev(struct device *dev)
+static void cache_remove_dev(struct device *dev)
{
unsigned int cpu = dev->id;
unsigned long i;
@@ -1140,8 +1211,8 @@ static void __cpuinit cache_remove_dev(struct device *dev)
cpuid4_cache_sysfs_exit(cpu);
}
-static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int cacheinfo_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct device *dev;
@@ -1160,27 +1231,30 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = {
+static struct notifier_block cacheinfo_cpu_notifier = {
.notifier_call = cacheinfo_cpu_callback,
};
-static int __cpuinit cache_sysfs_init(void)
+static int __init cache_sysfs_init(void)
{
- int i;
+ int i, err = 0;
if (num_cache_leaves == 0)
return 0;
+ cpu_notifier_register_begin();
for_each_online_cpu(i) {
- int err;
struct device *dev = get_cpu_device(i);
err = cache_add_dev(dev);
if (err)
- return err;
+ goto out;
}
- register_hotcpu_notifier(&cacheinfo_cpu_notifier);
- return 0;
+ __register_hotcpu_notifier(&cacheinfo_cpu_notifier);
+
+out:
+ cpu_notifier_register_done();
+ return err;
}
device_initcall(cache_sysfs_init);
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
new file mode 100644
index 00000000000..afa9f0d487e
--- /dev/null
+++ b/arch/x86/kernel/cpu/match.c
@@ -0,0 +1,49 @@
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+/**
+ * x86_match_cpu - match current CPU again an array of x86_cpu_ids
+ * @match: Pointer to array of x86_cpu_ids. Last entry terminated with
+ * {}.
+ *
+ * Return the entry if the current CPU matches the entries in the
+ * passed x86_cpu_id match table. Otherwise NULL. The match table
+ * contains vendor (X86_VENDOR_*), family, model and feature bits or
+ * respective wildcard entries.
+ *
+ * A typical table entry would be to match a specific CPU
+ * { X86_VENDOR_INTEL, 6, 0x12 }
+ * or to match a specific CPU feature
+ * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
+ *
+ * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
+ * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
+ *
+ * Arrays used to match for this should also be declared using
+ * MODULE_DEVICE_TABLE(x86cpu, ...)
+ *
+ * This always matches against the boot cpu, assuming models and features are
+ * consistent over all CPUs.
+ */
+const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
+{
+ const struct x86_cpu_id *m;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ for (m = match; m->vendor | m->family | m->model | m->feature; m++) {
+ if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
+ continue;
+ if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
+ continue;
+ if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
+ continue;
+ if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
+ continue;
+ return m;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(x86_match_cpu);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 507ea58688e..a1aef953315 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -33,22 +33,28 @@
#include <linux/acpi.h>
#include <linux/cper.h>
#include <acpi/apei.h>
+#include <acpi/ghes.h>
#include <asm/mce.h>
#include "mce-internal.h"
-void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
+void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
{
struct mce m;
- /* Only corrected MC is reported */
- if (!corrected)
+ if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
return;
mce_setup(&m);
m.bank = 1;
- /* Fake a memory read corrected error with unknown channel */
+ /* Fake a memory read error with unknown channel */
m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+
+ if (severity >= GHES_SEV_RECOVERABLE)
+ m.status |= MCI_STATUS_UC;
+ if (severity >= GHES_SEV_PANIC)
+ m.status |= MCI_STATUS_PCC;
+
m.addr = mem_err->physical_addr;
mce_log(&m);
mce_notify_irq();
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index fc4beb39357..5ac2d1fb28b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -78,6 +78,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
}
static cpumask_var_t mce_inject_cpumask;
+static DEFINE_MUTEX(mce_inject_mutex);
static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
{
@@ -152,7 +153,7 @@ static void raise_mce(struct mce *m)
return;
#ifdef CONFIG_X86_LOCAL_APIC
- if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) {
+ if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
unsigned long start;
int cpu;
@@ -166,7 +167,7 @@ static void raise_mce(struct mce *m)
cpumask_clear_cpu(cpu, mce_inject_cpumask);
}
if (!cpumask_empty(mce_inject_cpumask)) {
- if (m->inject_flags & MCJ_IRQ_BRAODCAST) {
+ if (m->inject_flags & MCJ_IRQ_BROADCAST) {
/*
* don't wait because mce_irq_ipi is necessary
* to be sync with following raise_local
@@ -194,7 +195,11 @@ static void raise_mce(struct mce *m)
put_online_cpus();
} else
#endif
+ {
+ preempt_disable();
raise_local();
+ preempt_enable();
+ }
}
/* Error injection interface */
@@ -225,7 +230,10 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
* so do it a jiffie or two later everywhere.
*/
schedule_timeout(2);
+
+ mutex_lock(&mce_inject_mutex);
raise_mce(&m);
+ mutex_unlock(&mce_inject_mutex);
return usize;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index ed44c8a6585..09edd0b65fe 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -24,9 +24,22 @@ struct mce_bank {
int mce_severity(struct mce *a, int tolerant, char **msg);
struct dentry *mce_get_debugfs_dir(void);
-extern int mce_ser;
-
extern struct mce_bank *mce_banks;
+extern mce_banks_t mce_banks_ce_disabled;
+
+#ifdef CONFIG_X86_MCE_INTEL
+unsigned long mce_intel_adjust_timer(unsigned long interval);
+void mce_intel_cmci_poll(void);
+void mce_intel_hcpu_update(unsigned long cpu);
+void cmci_disable_bank(int bank);
+#else
+# define mce_intel_adjust_timer mce_adjust_timer_default
+static inline void mce_intel_cmci_poll(void) { }
+static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+static inline void cmci_disable_bank(int bank) { }
+#endif
+
+void mce_timer_kick(unsigned long interval);
#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 7395d5f4272..c370e1c4468 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -54,7 +54,7 @@ static struct severity {
#define MASK(x, y) .mask = x, .result = y
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
-#define MCACOD 0xffff
+#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
MCESEV(
NO, "Invalid",
@@ -102,11 +102,29 @@ static struct severity {
SER, BITCLR(MCI_STATUS_S)
),
- /* AR add known MCACODs here */
MCESEV(
PANIC, "Action required with lost events",
SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
),
+
+ /* known AR MCACODs: */
+#ifdef CONFIG_MEMORY_FAILURE
+ MCESEV(
+ KEEP, "Action required but unaffected thread is continuable",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
+ MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
+ ),
+ MCESEV(
+ AR, "Action required: data load error in a user process",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ USER
+ ),
+ MCESEV(
+ AR, "Action required: instruction fetch error in a user process",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ USER
+ ),
+#endif
MCESEV(
PANIC, "Action required: unknown MCACOD",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
@@ -115,11 +133,11 @@ static struct severity {
/* known AO MCACODs: */
MCESEV(
AO, "Action optional: memory scrubbing error",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
),
MCESEV(
AO, "Action optional: last level cache writeback error",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
),
MCESEV(
SOME, "Action optional: unknown MCACOD",
@@ -145,15 +163,19 @@ static struct severity {
};
/*
- * If the EIPV bit is set, it means the saved IP is the
- * instruction which caused the MCE.
+ * If mcgstatus indicated that ip/cs on the stack were
+ * no good, then "m->cs" will be zero and we will have
+ * to assume the worst case (IN_KERNEL) as we actually
+ * have no idea what we were executing when the machine
+ * check hit.
+ * If we do have a good "m->cs" (or a faked one in the
+ * case we were executing in VM86 mode) we can use it to
+ * distinguish an exception taken in user from from one
+ * taken in the kernel.
*/
static int error_context(struct mce *m)
{
- if (m->mcgstatus & MCG_STATUS_EIPV)
- return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
- /* Unknown, assume kernel */
- return IN_KERNEL;
+ return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}
int mce_severity(struct mce *m, int tolerant, char **msg)
@@ -166,9 +188,9 @@ int mce_severity(struct mce *m, int tolerant, char **msg)
continue;
if ((m->mcgstatus & s->mcgmask) != s->mcgres)
continue;
- if (s->ser == SER_REQUIRED && !mce_ser)
+ if (s->ser == SER_REQUIRED && !mca_cfg.ser)
continue;
- if (s->ser == NO_SER && mce_ser)
+ if (s->ser == NO_SER && mca_cfg.ser)
continue;
if (s->context && ctx != s->context)
continue;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a11ae2e9e9..9a79c8dbd8e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -7,6 +7,9 @@
* Copyright 2008 Intel Corporation
* Author: Andi Kleen
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
@@ -55,35 +58,24 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
-int mce_disabled __read_mostly;
-
-#define MISC_MCELOG_MINOR 227
-
#define SPINUNIT 100 /* 100ns */
-atomic_t mce_entry;
-
DEFINE_PER_CPU(unsigned, mce_exception_count);
-/*
- * Tolerant levels:
- * 0: always panic on uncorrected errors, log corrected errors
- * 1: panic or SIGBUS on uncorrected errors, log corrected errors
- * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
- * 3: never panic or SIGBUS, log all errors (for testing only)
- */
-static int tolerant __read_mostly = 1;
-static int banks __read_mostly;
-static int rip_msr __read_mostly;
-static int mce_bootlog __read_mostly = -1;
-static int monarch_timeout __read_mostly = -1;
-static int mce_panic_timeout __read_mostly;
-static int mce_dont_log_ce __read_mostly;
-int mce_cmci_disabled __read_mostly;
-int mce_ignore_ce __read_mostly;
-int mce_ser __read_mostly;
-
-struct mce_bank *mce_banks __read_mostly;
+struct mce_bank *mce_banks __read_mostly;
+
+struct mca_config mca_cfg __read_mostly = {
+ .bootlog = -1,
+ /*
+ * Tolerant levels:
+ * 0: always panic on uncorrected errors, log corrected errors
+ * 1: panic or SIGBUS on uncorrected errors, log corrected errors
+ * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
+ * 3: never panic or SIGBUS, log all errors (for testing only)
+ */
+ .tolerant = 1,
+ .monarch_timeout = -1
+};
/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
@@ -95,13 +87,30 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
-/* MCA banks polled by the period polling timer for corrected events */
+/* CMCI storm detection filter */
+static DEFINE_PER_CPU(unsigned long, mce_polled_error);
+
+/*
+ * MCA banks polled by the period polling timer for corrected events.
+ * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
+ */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
+/*
+ * MCA banks controlled through firmware first for corrected errors.
+ * This is a global list of banks for which we won't enable CMCI and we
+ * won't poll. Firmware controls these banks and is responsible for
+ * reporting corrected errors through GHES. Uncorrected/recoverable
+ * errors are still notified through a machine check.
+ */
+mce_banks_t mce_banks_ce_disabled;
+
static DEFINE_PER_CPU(struct work_struct, mce_work);
+static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
@@ -191,7 +200,7 @@ static void drain_mcelog_buffer(void)
{
unsigned int next, i, prev = 0;
- next = rcu_dereference_check_mce(mcelog.next);
+ next = ACCESS_ONCE(mcelog.next);
do {
struct mce *m;
@@ -210,7 +219,7 @@ static void drain_mcelog_buffer(void)
cpu_relax();
if (!m->finished && retries >= 4) {
- pr_err("MCE: skipping error being logged currently!\n");
+ pr_err("skipping error being logged currently!\n");
break;
}
}
@@ -298,7 +307,7 @@ static void wait_for_panic(void)
while (timeout-- > 0)
udelay(1);
if (panic_timeout == 0)
- panic_timeout = mce_panic_timeout;
+ panic_timeout = mca_cfg.panic_timeout;
panic("Panicing machine check CPU died");
}
@@ -356,7 +365,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
pr_emerg(HW_ERR "Machine check: %s\n", exp);
if (!fake_panic) {
if (panic_timeout == 0)
- panic_timeout = mce_panic_timeout;
+ panic_timeout = mca_cfg.panic_timeout;
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
@@ -368,7 +377,7 @@ static int msr_to_offset(u32 msr)
{
unsigned bank = __this_cpu_read(injectm.bank);
- if (msr == rip_msr)
+ if (msr == mca_cfg.rip_msr)
return offsetof(struct mce, ip);
if (msr == MSR_IA32_MCx_STATUS(bank))
return offsetof(struct mce, status);
@@ -437,10 +446,18 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
m->ip = regs->ip;
m->cs = regs->cs;
+
+ /*
+ * When in VM86 mode make the cs look like ring 3
+ * always. This is a lie, but it's better than passing
+ * the additional vm86 bit around everywhere.
+ */
+ if (v8086_mode(regs))
+ m->cs |= 3;
}
/* Use accurate RIP reporting if available. */
- if (rip_msr)
- m->ip = mce_rdmsrl(rip_msr);
+ if (mca_cfg.rip_msr)
+ m->ip = mce_rdmsrl(mca_cfg.rip_msr);
}
}
@@ -501,18 +518,15 @@ static int mce_ring_add(unsigned long pfn)
int mce_available(struct cpuinfo_x86 *c)
{
- if (mce_disabled)
+ if (mca_cfg.disabled)
return 0;
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
static void mce_schedule_work(void)
{
- if (!mce_ring_empty()) {
- struct work_struct *work = &__get_cpu_var(mce_work);
- if (!work_pending(work))
- schedule_work(work);
- }
+ if (!mce_ring_empty())
+ schedule_work(&__get_cpu_var(mce_work));
}
DEFINE_PER_CPU(struct irq_work, mce_irq_work);
@@ -540,6 +554,27 @@ static void mce_report_event(struct pt_regs *regs)
irq_work_queue(&__get_cpu_var(mce_irq_work));
}
+/*
+ * Read ADDR and MISC registers.
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+ if (m->status & MCI_STATUS_MISCV)
+ m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+ if (m->status & MCI_STATUS_ADDRV) {
+ m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+
+ /*
+ * Mask the reported address by the reported granularity.
+ */
+ if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
+ u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+ m->addr >>= shift;
+ m->addr <<= shift;
+ }
+ }
+}
+
DEFINE_PER_CPU(unsigned, mce_poll_count);
/*
@@ -562,11 +597,11 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
struct mce m;
int i;
- percpu_inc(mce_poll_count);
+ this_cpu_inc(mce_poll_count);
mce_gather_info(&m, NULL);
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
@@ -580,6 +615,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!(m.status & MCI_STATUS_VAL))
continue;
+ this_cpu_write(mce_polled_error, 1);
/*
* Uncorrected or signalled events are handled by the exception
* handler when it is enabled, so don't process those here.
@@ -587,13 +623,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* TBD do the same check for MCI_STATUS_EN here?
*/
if (!(flags & MCP_UC) &&
- (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
+ (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
continue;
- if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
- if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+ mce_read_aux(&m, i);
if (!(flags & MCP_TIMESTAMP))
m.tsc = 0;
@@ -601,7 +634,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
+ if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
mce_log(&m);
/*
@@ -623,16 +656,22 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
-static int mce_no_way_out(struct mce *m, char **msg)
+static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+ struct pt_regs *regs)
{
- int i;
+ int i, ret = 0;
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
- if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
- return 1;
+ if (m->status & MCI_STATUS_VAL) {
+ __set_bit(i, validp);
+ if (quirk_no_way_out)
+ quirk_no_way_out(i, m, regs);
+ }
+ if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
+ ret = 1;
}
- return 0;
+ return ret;
}
/*
@@ -660,11 +699,10 @@ static int mce_timed_out(u64 *t)
rmb();
if (atomic_read(&mce_paniced))
wait_for_panic();
- if (!monarch_timeout)
+ if (!mca_cfg.monarch_timeout)
goto out;
if ((s64)*t < SPINUNIT) {
- /* CHECKME: Make panic default for 1 too? */
- if (tolerant < 1)
+ if (mca_cfg.tolerant <= 1)
mce_panic("Timeout synchronizing machine check over CPUs",
NULL, NULL);
cpu_missing = 1;
@@ -714,7 +752,8 @@ static void mce_reign(void)
* Grade the severity of the errors of all the CPUs.
*/
for_each_possible_cpu(cpu) {
- int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
+ int severity = mce_severity(&per_cpu(mces_seen, cpu),
+ mca_cfg.tolerant,
&nmsg);
if (severity > global_worst) {
msg = nmsg;
@@ -728,7 +767,7 @@ static void mce_reign(void)
* This dumps all the mces in the log buffer and stops the
* other CPUs.
*/
- if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
+ if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
mce_panic("Fatal Machine check", m, msg);
/*
@@ -741,7 +780,7 @@ static void mce_reign(void)
* No machine check event found. Must be some external
* source or one CPU is hung. Panic.
*/
- if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
+ if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
mce_panic("Machine check from unknown source", NULL, NULL);
/*
@@ -765,7 +804,7 @@ static int mce_start(int *no_way_out)
{
int order;
int cpus = num_online_cpus();
- u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
if (!timeout)
return -1;
@@ -829,7 +868,7 @@ static int mce_start(int *no_way_out)
static int mce_end(int order)
{
int ret = -1;
- u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
if (!timeout)
goto reset;
@@ -910,13 +949,58 @@ static void mce_clear_state(unsigned long *toclear)
{
int i;
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
if (test_bit(i, toclear))
mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
}
}
/*
+ * Need to save faulting physical address associated with a process
+ * in the machine check handler some place where we can grab it back
+ * later in mce_notify_process()
+ */
+#define MCE_INFO_MAX 16
+
+struct mce_info {
+ atomic_t inuse;
+ struct task_struct *t;
+ __u64 paddr;
+ int restartable;
+} mce_info[MCE_INFO_MAX];
+
+static void mce_save_info(__u64 addr, int c)
+{
+ struct mce_info *mi;
+
+ for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
+ if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
+ mi->t = current;
+ mi->paddr = addr;
+ mi->restartable = c;
+ return;
+ }
+ }
+
+ mce_panic("Too many concurrent recoverable errors", NULL, NULL);
+}
+
+static struct mce_info *mce_find_info(void)
+{
+ struct mce_info *mi;
+
+ for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
+ if (atomic_read(&mi->inuse) && mi->t == current)
+ return mi;
+ return NULL;
+}
+
+static void mce_clear_info(struct mce_info *mi)
+{
+ atomic_set(&mi->inuse, 0);
+}
+
+/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
*
@@ -930,6 +1014,7 @@ static void mce_clear_state(unsigned long *toclear)
*/
void do_machine_check(struct pt_regs *regs, long error_code)
{
+ struct mca_config *cfg = &mca_cfg;
struct mce m, *final;
int i;
int worst = 0;
@@ -941,7 +1026,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
int order;
/*
* If no_way_out gets set, there is no safe way to recover from this
- * MCE. If tolerant is cranked up, we'll try anyway.
+ * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
*/
int no_way_out = 0;
/*
@@ -950,13 +1035,12 @@ void do_machine_check(struct pt_regs *regs, long error_code)
*/
int kill_it = 0;
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
char *msg = "Unknown";
- atomic_inc(&mce_entry);
-
- percpu_inc(mce_exception_count);
+ this_cpu_inc(mce_exception_count);
- if (!banks)
+ if (!cfg->banks)
goto out;
mce_gather_info(&m, regs);
@@ -964,12 +1048,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
final = &__get_cpu_var(mces_seen);
*final = m;
- no_way_out = mce_no_way_out(&m, &msg);
+ memset(valid_banks, 0, sizeof(valid_banks));
+ no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
barrier();
/*
- * When no restart IP must always kill or panic.
+ * When no restart IP might need to kill or panic.
+ * Assume the worst for now, but if we find the
+ * severity is MCE_AR_SEVERITY we have other options.
*/
if (!(m.mcgstatus & MCG_STATUS_RIPV))
kill_it = 1;
@@ -980,8 +1067,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* because the first one to see it will clear it.
*/
order = mce_start(&no_way_out);
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < cfg->banks; i++) {
__clear_bit(i, toclear);
+ if (!test_bit(i, valid_banks))
+ continue;
if (!mce_banks[i].ctl)
continue;
@@ -997,16 +1086,16 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* Non uncorrected or non signaled errors are handled by
* machine_check_poll. Leave them alone, unless this panics.
*/
- if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+ if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
!no_way_out)
continue;
/*
* Set taint even when machine check was not enabled.
*/
- add_taint(TAINT_MACHINE_CHECK);
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
- severity = mce_severity(&m, tolerant, NULL);
+ severity = mce_severity(&m, cfg->tolerant, NULL);
/*
* When machine check was for corrected handler don't touch,
@@ -1023,23 +1112,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)
continue;
}
- /*
- * Kill on action required.
- */
- if (severity == MCE_AR_SEVERITY)
- kill_it = 1;
-
- if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
- if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+ mce_read_aux(&m, i);
/*
* Action optional error. Queue address for later processing.
* When the ring overflows we just ignore the AO error.
* RED-PEN add some logging mechanism when
* usable_address or mce_add_ring fails.
- * RED-PEN don't ignore overflow for tolerant == 0
+ * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
*/
if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
mce_ring_add(m.addr >> PAGE_SHIFT);
@@ -1052,6 +1132,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
}
}
+ /* mce_clear_state will clear *final, save locally for use later */
+ m = *final;
+
if (!no_way_out)
mce_clear_state(toclear);
@@ -1063,65 +1146,91 @@ void do_machine_check(struct pt_regs *regs, long error_code)
no_way_out = worst >= MCE_PANIC_SEVERITY;
/*
- * If we have decided that we just CAN'T continue, and the user
- * has not set tolerant to an insane level, give up and die.
- *
- * This is mainly used in the case when the system doesn't
- * support MCE broadcasting or it has been disabled.
+ * At insane "tolerant" levels we take no action. Otherwise
+ * we only die if we have no other choice. For less serious
+ * issues we try to recover, or limit damage to the current
+ * process.
*/
- if (no_way_out && tolerant < 3)
- mce_panic("Fatal machine check on current CPU", final, msg);
-
- /*
- * If the error seems to be unrecoverable, something should be
- * done. Try to kill as little as possible. If we can kill just
- * one task, do that. If the user has set the tolerance very
- * high, don't try to do anything at all.
- */
-
- if (kill_it && tolerant < 3)
- force_sig(SIGBUS, current);
-
- /* notify userspace ASAP */
- set_thread_flag(TIF_MCE_NOTIFY);
+ if (cfg->tolerant < 3) {
+ if (no_way_out)
+ mce_panic("Fatal machine check on current CPU", &m, msg);
+ if (worst == MCE_AR_SEVERITY) {
+ /* schedule action before return to userland */
+ mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
+ set_thread_flag(TIF_MCE_NOTIFY);
+ } else if (kill_it) {
+ force_sig(SIGBUS, current);
+ }
+ }
if (worst > 0)
mce_report_event(regs);
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
- atomic_dec(&mce_entry);
sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
-/* dummy to break dependency. actual code is in mm/memory-failure.c */
-void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int vector, int flags)
{
- printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+ /* mce_severity() should not hand us an ACTION_REQUIRED error */
+ BUG_ON(flags & MF_ACTION_REQUIRED);
+ pr_err("Uncorrected memory error in page 0x%lx ignored\n"
+ "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
+ pfn);
+
+ return 0;
}
+#endif
/*
- * Called after mce notification in process context. This code
- * is allowed to sleep. Call the high level VM handler to process
- * any corrupted pages.
- * Assume that the work queue code only calls this one at a time
- * per CPU.
- * Note we don't disable preemption, so this code might run on the wrong
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
+ * Called in process context that interrupted by MCE and marked with
+ * TIF_MCE_NOTIFY, just before returning to erroneous userland.
+ * This code is allowed to sleep.
+ * Attempt possible recovery such as calling the high level VM handler to
+ * process any corrupted pages, and kill/signal current process if required.
+ * Action required errors are handled here.
*/
void mce_notify_process(void)
{
unsigned long pfn;
- mce_notify_irq();
- while (mce_ring_get(&pfn))
- memory_failure(pfn, MCE_VECTOR);
+ struct mce_info *mi = mce_find_info();
+ int flags = MF_ACTION_REQUIRED;
+
+ if (!mi)
+ mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
+ pfn = mi->paddr >> PAGE_SHIFT;
+
+ clear_thread_flag(TIF_MCE_NOTIFY);
+
+ pr_err("Uncorrected hardware memory error in user-access at %llx",
+ mi->paddr);
+ /*
+ * We must call memory_failure() here even if the current process is
+ * doomed. We still need to mark the page as poisoned and alert any
+ * other users of the page.
+ */
+ if (!mi->restartable)
+ flags |= MF_MUST_KILL;
+ if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
+ pr_err("Memory error not recovered");
+ force_sig(SIGBUS, current);
+ }
+ mce_clear_info(mi);
}
+/*
+ * Action optional processing happens here (picking up
+ * from the list of faulting pages that do_machine_check()
+ * placed into the "ring").
+ */
static void mce_process_work(struct work_struct *dummy)
{
- mce_notify_process();
+ unsigned long pfn;
+
+ while (mce_ring_get(&pfn))
+ memory_failure(pfn, MCE_VECTOR, 0);
}
#ifdef CONFIG_X86_MCE_INTEL
@@ -1154,35 +1263,79 @@ void mce_log_therm_throt_event(__u64 status)
* poller finds an MCE, poll 2x faster. When the poller finds no more
* errors, poll 2x slower (up to check_interval seconds).
*/
-static int check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = 5 * 60; /* 5 minutes */
-static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static void mce_start_timer(unsigned long data)
+static unsigned long mce_adjust_timer_default(unsigned long interval)
+{
+ return interval;
+}
+
+static unsigned long (*mce_adjust_timer)(unsigned long interval) =
+ mce_adjust_timer_default;
+
+static int cmc_error_seen(void)
+{
+ unsigned long *v = &__get_cpu_var(mce_polled_error);
+
+ return test_and_clear_bit(0, v);
+}
+
+static void mce_timer_fn(unsigned long data)
{
- struct timer_list *t = &per_cpu(mce_timer, data);
- int *n;
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned long iv;
+ int notify;
WARN_ON(smp_processor_id() != data);
if (mce_available(__this_cpu_ptr(&cpu_info))) {
machine_check_poll(MCP_TIMESTAMP,
&__get_cpu_var(mce_poll_banks));
+ mce_intel_cmci_poll();
}
/*
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
- n = &__get_cpu_var(mce_next_interval);
- if (mce_notify_irq())
- *n = max(*n/2, HZ/100);
- else
- *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
+ iv = __this_cpu_read(mce_next_interval);
+ notify = mce_notify_irq();
+ notify |= cmc_error_seen();
+ if (notify) {
+ iv = max(iv / 2, (unsigned long) HZ/100);
+ } else {
+ iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+ iv = mce_adjust_timer(iv);
+ }
+ __this_cpu_write(mce_next_interval, iv);
+ /* Might have become 0 after CMCI storm subsided */
+ if (iv) {
+ t->expires = jiffies + iv;
+ add_timer_on(t, smp_processor_id());
+ }
+}
- t->expires = jiffies + *n;
- add_timer_on(t, smp_processor_id());
+/*
+ * Ensure that the timer is firing in @interval from now.
+ */
+void mce_timer_kick(unsigned long interval)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned long when = jiffies + interval;
+ unsigned long iv = __this_cpu_read(mce_next_interval);
+
+ if (timer_pending(t)) {
+ if (time_before(when, t->expires))
+ mod_timer_pinned(t, when);
+ } else {
+ t->expires = round_jiffies(when);
+ add_timer_on(t, smp_processor_id());
+ }
+ if (interval < iv)
+ __this_cpu_write(mce_next_interval, interval);
}
/* Must not be called in IRQ context where del_timer_sync() can deadlock */
@@ -1211,18 +1364,11 @@ int mce_notify_irq(void)
/* Not more than two messages every minute */
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
- clear_thread_flag(TIF_MCE_NOTIFY);
-
if (test_and_clear_bit(0, &mce_need_notify)) {
/* wake processes polling /dev/mcelog */
wake_up_interruptible(&mce_chrdev_wait);
- /*
- * There is no risk of missing notifications because
- * work_pending is always cleared before the function is
- * executed.
- */
- if (mce_helper[0] && !work_pending(&mce_trigger_work))
+ if (mce_helper[0])
schedule_work(&mce_trigger_work);
if (__ratelimit(&ratelimit))
@@ -1234,14 +1380,16 @@ int mce_notify_irq(void)
}
EXPORT_SYMBOL_GPL(mce_notify_irq);
-static int __cpuinit __mcheck_cpu_mce_banks_init(void)
+static int __mcheck_cpu_mce_banks_init(void)
{
int i;
+ u8 num_banks = mca_cfg.banks;
- mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
+ mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
if (!mce_banks)
return -ENOMEM;
- for (i = 0; i < banks; i++) {
+
+ for (i = 0; i < num_banks; i++) {
struct mce_bank *b = &mce_banks[i];
b->ctl = -1ULL;
@@ -1253,7 +1401,7 @@ static int __cpuinit __mcheck_cpu_mce_banks_init(void)
/*
* Initialize Machine Checks for a CPU.
*/
-static int __cpuinit __mcheck_cpu_cap_init(void)
+static int __mcheck_cpu_cap_init(void)
{
unsigned b;
u64 cap;
@@ -1261,19 +1409,19 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
rdmsrl(MSR_IA32_MCG_CAP, cap);
b = cap & MCG_BANKCNT_MASK;
- if (!banks)
- printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
+ if (!mca_cfg.banks)
+ pr_info("CPU supports %d MCE banks\n", b);
if (b > MAX_NR_BANKS) {
- printk(KERN_WARNING
- "MCE: Using only %u machine check banks out of %u\n",
+ pr_warn("Using only %u machine check banks out of %u\n",
MAX_NR_BANKS, b);
b = MAX_NR_BANKS;
}
/* Don't support asymmetric configurations today */
- WARN_ON(banks != 0 && b != banks);
- banks = b;
+ WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
+ mca_cfg.banks = b;
+
if (!mce_banks) {
int err = __mcheck_cpu_mce_banks_init();
@@ -1283,25 +1431,29 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
/* Use accurate RIP reporting if available. */
if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
- rip_msr = MSR_IA32_MCG_EIP;
+ mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
if (cap & MCG_SER_P)
- mce_ser = 1;
+ mca_cfg.ser = true;
return 0;
}
static void __mcheck_cpu_init_generic(void)
{
+ enum mcp_flags m_fl = 0;
mce_banks_t all_banks;
u64 cap;
int i;
+ if (!mca_cfg.bootlog)
+ m_fl = MCP_DONTLOG;
+
/*
* Log the machine checks left over from the previous reset.
*/
bitmap_fill(all_banks, MAX_NR_BANKS);
- machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
+ machine_check_poll(MCP_UC | m_fl, &all_banks);
set_in_cr4(X86_CR4_MCE);
@@ -1309,7 +1461,7 @@ static void __mcheck_cpu_init_generic(void)
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
if (!b->init)
@@ -1319,17 +1471,47 @@ static void __mcheck_cpu_init_generic(void)
}
}
+/*
+ * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ */
+static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 0)
+ return;
+ if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+ return;
+ if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+ MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+ MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+ MCACOD)) !=
+ (MCI_STATUS_UC|MCI_STATUS_EN|
+ MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+ MCI_STATUS_AR|MCACOD_INSTR))
+ return;
+
+ m->mcgstatus |= MCG_STATUS_EIPV;
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+}
+
/* Add per CPU specific workarounds here */
-static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
+static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
+ struct mca_config *cfg = &mca_cfg;
+
if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
- pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
+ pr_info("unknown CPU type - not enabling MCE support\n");
return -EOPNOTSUPP;
}
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD) {
- if (c->x86 == 15 && banks > 4) {
+ if (c->x86 == 15 && cfg->banks > 4) {
/*
* disable GART TBL walk error reporting, which
* trips off incorrectly with the IOMMU & 3ware
@@ -1337,19 +1519,56 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
*/
clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
}
- if (c->x86 <= 17 && mce_bootlog < 0) {
+ if (c->x86 <= 17 && cfg->bootlog < 0) {
/*
* Lots of broken BIOS around that don't clear them
* by default and leave crap in there. Don't log:
*/
- mce_bootlog = 0;
+ cfg->bootlog = 0;
}
/*
* Various K7s with broken bank 0 around. Always disable
* by default.
*/
- if (c->x86 == 6 && banks > 0)
+ if (c->x86 == 6 && cfg->banks > 0)
mce_banks[0].ctl = 0;
+
+ /*
+ * Turn off MC4_MISC thresholding banks on those models since
+ * they're not supported there.
+ */
+ if (c->x86 == 0x15 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+ int i;
+ u64 val, hwcr;
+ bool need_toggle;
+ u32 msrs[] = {
+ 0x00000413, /* MC4_MISC0 */
+ 0xc0000408, /* MC4_MISC1 */
+ };
+
+ rdmsrl(MSR_K7_HWCR, hwcr);
+
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
+
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+
+ for (i = 0; i < ARRAY_SIZE(msrs); i++) {
+ rdmsrl(msrs[i], val);
+
+ /* CntP bit set? */
+ if (val & BIT_64(62)) {
+ val &= ~BIT_64(62);
+ wrmsrl(msrs[i], val);
+ }
+ }
+
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr);
+ }
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1362,7 +1581,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* valid event later, merely don't write CTL0.
*/
- if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
+ if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
mce_banks[0].init = 0;
/*
@@ -1370,25 +1589,28 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* synchronization with a one second timeout.
*/
if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
- monarch_timeout < 0)
- monarch_timeout = USEC_PER_SEC;
+ cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = USEC_PER_SEC;
/*
* There are also broken BIOSes on some Pentium M and
* earlier systems:
*/
- if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
- mce_bootlog = 0;
+ if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
+ cfg->bootlog = 0;
+
+ if (c->x86 == 6 && c->x86_model == 45)
+ quirk_no_way_out = quirk_sandybridge_ifu;
}
- if (monarch_timeout < 0)
- monarch_timeout = 0;
- if (mce_bootlog != 0)
- mce_panic_timeout = 30;
+ if (cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = 0;
+ if (cfg->bootlog != 0)
+ cfg->panic_timeout = 30;
return 0;
}
-static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
return 0;
@@ -1412,6 +1634,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
+ mce_adjust_timer = mce_intel_adjust_timer;
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
@@ -1421,27 +1644,32 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
}
}
-static void __mcheck_cpu_init_timer(void)
+static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
- struct timer_list *t = &__get_cpu_var(mce_timer);
- int *n = &__get_cpu_var(mce_next_interval);
+ unsigned long iv = check_interval * HZ;
- setup_timer(t, mce_start_timer, smp_processor_id());
-
- if (mce_ignore_ce)
+ if (mca_cfg.ignore_ce || !iv)
return;
- *n = check_interval * HZ;
- if (!*n)
- return;
- t->expires = round_jiffies(jiffies + *n);
- add_timer_on(t, smp_processor_id());
+ per_cpu(mce_next_interval, cpu) = iv;
+
+ t->expires = round_jiffies(jiffies + iv);
+ add_timer_on(t, cpu);
+}
+
+static void __mcheck_cpu_init_timer(void)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned int cpu = smp_processor_id();
+
+ setup_timer(t, mce_timer_fn, cpu);
+ mce_start_timer(cpu, t);
}
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
- printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+ pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
smp_processor_id());
}
@@ -1453,9 +1681,9 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off:
*/
-void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
+void mcheck_cpu_init(struct cpuinfo_x86 *c)
{
- if (mce_disabled)
+ if (mca_cfg.disabled)
return;
if (__mcheck_cpu_ancient_init(c))
@@ -1465,7 +1693,7 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
return;
if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
- mce_disabled = 1;
+ mca_cfg.disabled = true;
return;
}
@@ -1541,6 +1769,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
/* Error or no more MCE record */
if (rc <= 0) {
mce_apei_read_done = 1;
+ /*
+ * When ERST is disabled, mce_chrdev_read() should return
+ * "no record" instead of "no device."
+ */
+ if (rc == -ENODEV)
+ return 0;
return rc;
}
rc = -EFAULT;
@@ -1718,6 +1952,25 @@ static struct miscdevice mce_chrdev_device = {
&mce_chrdev_ops,
};
+static void __mce_disable_bank(void *arg)
+{
+ int bank = *((int *)arg);
+ __clear_bit(bank, __get_cpu_var(mce_poll_banks));
+ cmci_disable_bank(bank);
+}
+
+void mce_disable_bank(int bank)
+{
+ if (bank >= mca_cfg.banks) {
+ pr_warn(FW_BUG
+ "Ignoring request to disable invalid MCA bank %d.\n",
+ bank);
+ return;
+ }
+ set_bit(bank, mce_banks_ce_disabled);
+ on_each_cpu(__mce_disable_bank, &bank, 1);
+}
+
/*
* mce=off Disables machine check
* mce=no_cmci Disables CMCI
@@ -1728,9 +1981,12 @@ static struct miscdevice mce_chrdev_device = {
* check, or 0 to not wait
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
+ * mce=bios_cmci_threshold Don't program the CMCI threshold
*/
static int __init mcheck_enable(char *str)
{
+ struct mca_config *cfg = &mca_cfg;
+
if (*str == 0) {
enable_p5_mce();
return 1;
@@ -1738,24 +1994,25 @@ static int __init mcheck_enable(char *str)
if (*str == '=')
str++;
if (!strcmp(str, "off"))
- mce_disabled = 1;
+ cfg->disabled = true;
else if (!strcmp(str, "no_cmci"))
- mce_cmci_disabled = 1;
+ cfg->cmci_disabled = true;
else if (!strcmp(str, "dont_log_ce"))
- mce_dont_log_ce = 1;
+ cfg->dont_log_ce = true;
else if (!strcmp(str, "ignore_ce"))
- mce_ignore_ce = 1;
+ cfg->ignore_ce = true;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
- mce_bootlog = (str[0] == 'b');
+ cfg->bootlog = (str[0] == 'b');
+ else if (!strcmp(str, "bios_cmci_threshold"))
+ cfg->bios_cmci_threshold = true;
else if (isdigit(str[0])) {
- get_option(&str, &tolerant);
+ get_option(&str, &(cfg->tolerant));
if (*str == ',') {
++str;
- get_option(&str, &monarch_timeout);
+ get_option(&str, &(cfg->monarch_timeout));
}
} else {
- printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
- str);
+ pr_info("mce argument %s ignored. Please use /sys\n", str);
return 0;
}
return 1;
@@ -1781,7 +2038,7 @@ static int mce_disable_error_reporting(void)
{
int i;
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
if (b->init)
@@ -1859,9 +2116,8 @@ static struct bus_type mce_subsys = {
.dev_name = "machinecheck",
};
-struct device *mce_device[CONFIG_NR_CPUS];
+DEFINE_PER_CPU(struct device *, mce_device);
-__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
@@ -1921,15 +2177,15 @@ static ssize_t set_ignore_ce(struct device *s,
if (strict_strtoull(buf, 0, &new) < 0)
return -EINVAL;
- if (mce_ignore_ce ^ !!new) {
+ if (mca_cfg.ignore_ce ^ !!new) {
if (new) {
/* disable ce features */
mce_timer_delete_all();
on_each_cpu(mce_disable_cmci, NULL, 1);
- mce_ignore_ce = 1;
+ mca_cfg.ignore_ce = true;
} else {
/* enable ce features */
- mce_ignore_ce = 0;
+ mca_cfg.ignore_ce = false;
on_each_cpu(mce_enable_ce, (void *)1, 1);
}
}
@@ -1945,14 +2201,14 @@ static ssize_t set_cmci_disabled(struct device *s,
if (strict_strtoull(buf, 0, &new) < 0)
return -EINVAL;
- if (mce_cmci_disabled ^ !!new) {
+ if (mca_cfg.cmci_disabled ^ !!new) {
if (new) {
/* disable cmci */
on_each_cpu(mce_disable_cmci, NULL, 1);
- mce_cmci_disabled = 1;
+ mca_cfg.cmci_disabled = true;
} else {
/* enable cmci */
- mce_cmci_disabled = 0;
+ mca_cfg.cmci_disabled = false;
on_each_cpu(mce_enable_ce, NULL, 1);
}
}
@@ -1969,9 +2225,9 @@ static ssize_t store_int_with_restart(struct device *s,
}
static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
-static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
-static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
-static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
+static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
+static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
static struct dev_ext_attribute dev_attr_check_interval = {
__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
@@ -1979,13 +2235,13 @@ static struct dev_ext_attribute dev_attr_check_interval = {
};
static struct dev_ext_attribute dev_attr_ignore_ce = {
- __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
- &mce_ignore_ce
+ __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
+ &mca_cfg.ignore_ce
};
static struct dev_ext_attribute dev_attr_cmci_disabled = {
- __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
- &mce_cmci_disabled
+ __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
+ &mca_cfg.cmci_disabled
};
static struct device_attribute *mce_device_attrs[] = {
@@ -2007,7 +2263,7 @@ static void mce_device_release(struct device *dev)
}
/* Per cpu device init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_device_create(unsigned int cpu)
+static int mce_device_create(unsigned int cpu)
{
struct device *dev;
int err;
@@ -2024,21 +2280,23 @@ static __cpuinit int mce_device_create(unsigned int cpu)
dev->release = &mce_device_release;
err = device_register(dev);
- if (err)
+ if (err) {
+ put_device(dev);
return err;
+ }
for (i = 0; mce_device_attrs[i]; i++) {
err = device_create_file(dev, mce_device_attrs[i]);
if (err)
goto error;
}
- for (j = 0; j < banks; j++) {
+ for (j = 0; j < mca_cfg.banks; j++) {
err = device_create_file(dev, &mce_banks[j].attr);
if (err)
goto error2;
}
cpumask_set_cpu(cpu, mce_device_initialized);
- mce_device[cpu] = dev;
+ per_cpu(mce_device, cpu) = dev;
return 0;
error2:
@@ -2053,9 +2311,9 @@ error:
return err;
}
-static __cpuinit void mce_device_remove(unsigned int cpu)
+static void mce_device_remove(unsigned int cpu)
{
- struct device *dev = mce_device[cpu];
+ struct device *dev = per_cpu(mce_device, cpu);
int i;
if (!cpumask_test_cpu(cpu, mce_device_initialized))
@@ -2064,16 +2322,16 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
for (i = 0; mce_device_attrs[i]; i++)
device_remove_file(dev, mce_device_attrs[i]);
- for (i = 0; i < banks; i++)
+ for (i = 0; i < mca_cfg.banks; i++)
device_remove_file(dev, &mce_banks[i].attr);
device_unregister(dev);
cpumask_clear_cpu(cpu, mce_device_initialized);
- mce_device[cpu] = NULL;
+ per_cpu(mce_device, cpu) = NULL;
}
/* Make sure there are no machine checks on offlined CPUs. */
-static void __cpuinit mce_disable_cpu(void *h)
+static void mce_disable_cpu(void *h)
{
unsigned long action = *(unsigned long *)h;
int i;
@@ -2083,7 +2341,7 @@ static void __cpuinit mce_disable_cpu(void *h)
if (!(action & CPU_TASKS_FROZEN))
cmci_clear();
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
if (b->init)
@@ -2091,7 +2349,7 @@ static void __cpuinit mce_disable_cpu(void *h)
}
}
-static void __cpuinit mce_reenable_cpu(void *h)
+static void mce_reenable_cpu(void *h)
{
unsigned long action = *(unsigned long *)h;
int i;
@@ -2101,7 +2359,7 @@ static void __cpuinit mce_reenable_cpu(void *h)
if (!(action & CPU_TASKS_FROZEN))
cmci_reenable();
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
if (b->init)
@@ -2110,48 +2368,43 @@ static void __cpuinit mce_reenable_cpu(void *h)
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int __cpuinit
+static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct timer_list *t = &per_cpu(mce_timer, cpu);
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
mce_device_create(cpu);
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
break;
case CPU_DEAD:
- case CPU_DEAD_FROZEN:
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
mce_device_remove(cpu);
+ mce_intel_hcpu_update(cpu);
break;
case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- del_timer_sync(t);
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ del_timer_sync(t);
break;
case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- if (!mce_ignore_ce && check_interval) {
- t->expires = round_jiffies(jiffies +
- __get_cpu_var(mce_next_interval));
- add_timer_on(t, cpu);
- }
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+ mce_start_timer(cpu, t);
break;
- case CPU_POST_DEAD:
+ }
+
+ if (action == CPU_POST_DEAD) {
/* intentionally ignoring frozen here */
- cmci_rediscover(cpu);
- break;
+ cmci_rediscover();
}
+
return NOTIFY_OK;
}
-static struct notifier_block mce_cpu_notifier __cpuinitdata = {
+static struct notifier_block mce_cpu_notifier = {
.notifier_call = mce_cpu_callback,
};
@@ -2159,7 +2412,7 @@ static __init void mce_init_banks(void)
{
int i;
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
struct device_attribute *a = &b->attr;
@@ -2178,39 +2431,78 @@ static __init int mcheck_init_device(void)
int err;
int i = 0;
- if (!mce_available(&boot_cpu_data))
- return -EIO;
+ if (!mce_available(&boot_cpu_data)) {
+ err = -EIO;
+ goto err_out;
+ }
- zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
+ if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
mce_init_banks();
err = subsys_system_register(&mce_subsys, NULL);
if (err)
- return err;
+ goto err_out_mem;
+ cpu_notifier_register_begin();
for_each_online_cpu(i) {
err = mce_device_create(i);
- if (err)
- return err;
+ if (err) {
+ /*
+ * Register notifier anyway (and do not unreg it) so
+ * that we don't leave undeleted timers, see notifier
+ * callback above.
+ */
+ __register_hotcpu_notifier(&mce_cpu_notifier);
+ cpu_notifier_register_done();
+ goto err_device_create;
+ }
}
+ __register_hotcpu_notifier(&mce_cpu_notifier);
+ cpu_notifier_register_done();
+
register_syscore_ops(&mce_syscore_ops);
- register_hotcpu_notifier(&mce_cpu_notifier);
/* register character device /dev/mcelog */
- misc_register(&mce_chrdev_device);
+ err = misc_register(&mce_chrdev_device);
+ if (err)
+ goto err_register;
+
+ return 0;
+
+err_register:
+ unregister_syscore_ops(&mce_syscore_ops);
+
+err_device_create:
+ /*
+ * We didn't keep track of which devices were created above, but
+ * even if we had, the set of online cpus might have changed.
+ * Play safe and remove for every possible cpu, since
+ * mce_device_remove() will do the right thing.
+ */
+ for_each_possible_cpu(i)
+ mce_device_remove(i);
+
+err_out_mem:
+ free_cpumask_var(mce_device_initialized);
+
+err_out:
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
return err;
}
-device_initcall(mcheck_init_device);
+device_initcall_sync(mcheck_init_device);
/*
* Old style boot options parsing. Only for compatibility.
*/
static int __init mcheck_disable(char *str)
{
- mce_disabled = 1;
+ mca_cfg.disabled = true;
return 1;
}
__setup("nomce", mcheck_disable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 786e76a8632..603df4f7464 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,15 +1,17 @@
/*
- * (c) 2005, 2006 Advanced Micro Devices, Inc.
+ * (c) 2005-2012 Advanced Micro Devices, Inc.
* Your use of this code is subject to the terms and conditions of the
* GNU general public license version 2. See "COPYING" or
* http://www.gnu.org/licenses/gpl.html
*
* Written by Jacob Shin - AMD, Inc.
*
- * Support : jacob.shin@amd.com
+ * Maintained by: Borislav Petkov <bp@alien8.de>
*
* April 2006
* - added support for AMD Family 0x10 processors
+ * May 2012
+ * - major scrubbing
*
* All MC4_MISCi registers are shared between multi-cores
*/
@@ -25,12 +27,12 @@
#include <linux/cpu.h>
#include <linux/smp.h>
+#include <asm/amd_nb.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
-#define NR_BANKS 6
#define NR_BLOCKS 9
#define THRESHOLD_MAX 0xFFF
#define INT_TYPE_APIC 0x00020000
@@ -45,28 +47,16 @@
#define MASK_BLKPTR_LO 0xFF000000
#define MCG_XBLK_ADDR 0xC0000400
-struct threshold_block {
- unsigned int block;
- unsigned int bank;
- unsigned int cpu;
- u32 address;
- u16 interrupt_enable;
- u16 threshold_limit;
- struct kobject kobj;
- struct list_head miscj;
-};
-
-struct threshold_bank {
- struct kobject *kobj;
- struct threshold_block *blocks;
- cpumask_var_t cpus;
-};
-static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
-
-static unsigned char shared_bank[NR_BANKS] = {
- 0, 0, 0, 0, 1
+static const char * const th_names[] = {
+ "load_store",
+ "insn_fetch",
+ "combined_unit",
+ "",
+ "northbridge",
+ "execution_unit",
};
+static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
static void amd_threshold_interrupt(void);
@@ -83,6 +73,47 @@ struct thresh_restart {
u16 old_limit;
};
+static inline bool is_shared_bank(int bank)
+{
+ /* Bank 4 is for northbridge reporting and is thus shared */
+ return (bank == 4);
+}
+
+static const char * const bank4_names(struct threshold_block *b)
+{
+ switch (b->address) {
+ /* MSR4_MISC0 */
+ case 0x00000413:
+ return "dram";
+
+ case 0xc0000408:
+ return "ht_links";
+
+ case 0xc0000409:
+ return "l3_cache";
+
+ default:
+ WARN(1, "Funny MSR: 0x%08x\n", b->address);
+ return "";
+ }
+};
+
+
+static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
+{
+ /*
+ * bank 4 supports APIC LVT interrupts implicitly since forever.
+ */
+ if (bank == 4)
+ return true;
+
+ /*
+ * IntP: interrupt present; if this bit is set, the thresholding
+ * bank can generate APIC LVT interrupts
+ */
+ return msr_high_bits & BIT(28);
+}
+
static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
{
int msr = (hi & MASK_LVTOFF_HI) >> 20;
@@ -104,8 +135,10 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
return 1;
};
-/* must be called with correct cpu affinity */
-/* Called via smp_call_function_single() */
+/*
+ * Called via smp_call_function_single(), must be called with correct
+ * cpu affinity.
+ */
static void threshold_restart_bank(void *_tr)
{
struct thresh_restart *tr = _tr;
@@ -128,6 +161,12 @@ static void threshold_restart_bank(void *_tr)
(new_count & THRESHOLD_MAX);
}
+ /* clear IntType */
+ hi &= ~MASK_INT_TYPE_HI;
+
+ if (!tr->b->interrupt_capable)
+ goto done;
+
if (tr->set_lvt_off) {
if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
/* set new lvt offset */
@@ -136,9 +175,10 @@ static void threshold_restart_bank(void *_tr)
}
}
- tr->b->interrupt_enable ?
- (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
- (hi &= ~MASK_INT_TYPE_HI);
+ if (tr->b->interrupt_enable)
+ hi |= INT_TYPE_APIC;
+
+ done:
hi |= MASK_COUNT_EN_HI;
wrmsr(tr->b->address, lo, hi);
@@ -174,7 +214,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
unsigned int bank, block;
int offset = -1;
- for (bank = 0; bank < NR_BANKS; ++bank) {
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
if (block == 0)
address = MSR_IA32_MC0_MISC + bank * 4;
@@ -199,17 +239,18 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (!block)
per_cpu(bank_map, cpu) |= (1 << bank);
- if (shared_bank[bank] && c->cpu_core_id)
- break;
-
- offset = setup_APIC_mce(offset,
- (high & MASK_LVTOFF_HI) >> 20);
memset(&b, 0, sizeof(b));
- b.cpu = cpu;
- b.bank = bank;
- b.block = block;
- b.address = address;
+ b.cpu = cpu;
+ b.bank = bank;
+ b.block = block;
+ b.address = address;
+ b.interrupt_capable = lvt_interrupt_supported(bank, high);
+
+ if (b.interrupt_capable) {
+ int new = (high & MASK_LVTOFF_HI) >> 20;
+ offset = setup_APIC_mce(offset, new);
+ }
mce_threshold_block_init(&b, offset);
mce_threshold_vector = amd_threshold_interrupt;
@@ -235,7 +276,7 @@ static void amd_threshold_interrupt(void)
mce_setup(&m);
/* assume first bank caused it */
- for (bank = 0; bank < NR_BANKS; ++bank) {
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
continue;
for (block = 0; block < NR_BLOCKS; ++block) {
@@ -298,7 +339,7 @@ struct threshold_attr {
#define SHOW_FIELDS(name) \
static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
{ \
- return sprintf(buf, "%lx\n", (unsigned long) b->name); \
+ return sprintf(buf, "%lu\n", (unsigned long) b->name); \
}
SHOW_FIELDS(interrupt_enable)
SHOW_FIELDS(threshold_limit)
@@ -309,6 +350,9 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
struct thresh_restart tr;
unsigned long new;
+ if (!b->interrupt_capable)
+ return -EINVAL;
+
if (strict_strtoul(buf, 0, &new) < 0)
return -EINVAL;
@@ -346,38 +390,21 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
return size;
}
-struct threshold_block_cross_cpu {
- struct threshold_block *tb;
- long retval;
-};
-
-static void local_error_count_handler(void *_tbcc)
-{
- struct threshold_block_cross_cpu *tbcc = _tbcc;
- struct threshold_block *b = tbcc->tb;
- u32 low, high;
-
- rdmsr(b->address, low, high);
- tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
-}
-
static ssize_t show_error_count(struct threshold_block *b, char *buf)
{
- struct threshold_block_cross_cpu tbcc = { .tb = b, };
+ u32 lo, hi;
- smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
- return sprintf(buf, "%lx\n", tbcc.retval);
-}
+ rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
-static ssize_t store_error_count(struct threshold_block *b,
- const char *buf, size_t count)
-{
- struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
-
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
- return 1;
+ return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
+ (THRESHOLD_MAX - b->threshold_limit)));
}
+static struct threshold_attr error_count = {
+ .attr = {.name = __stringify(error_count), .mode = 0444 },
+ .show = show_error_count,
+};
+
#define RW_ATTR(val) \
static struct threshold_attr val = { \
.attr = {.name = __stringify(val), .mode = 0644 }, \
@@ -387,13 +414,12 @@ static struct threshold_attr val = { \
RW_ATTR(interrupt_enable);
RW_ATTR(threshold_limit);
-RW_ATTR(error_count);
static struct attribute *default_attrs[] = {
- &interrupt_enable.attr,
&threshold_limit.attr,
&error_count.attr,
- NULL
+ NULL, /* possibly interrupt_enable if supported, see below */
+ NULL,
};
#define to_block(k) container_of(k, struct threshold_block, kobj)
@@ -432,16 +458,14 @@ static struct kobj_type threshold_ktype = {
.default_attrs = default_attrs,
};
-static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
- unsigned int bank,
- unsigned int block,
- u32 address)
+static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
+ unsigned int block, u32 address)
{
struct threshold_block *b = NULL;
u32 low, high;
int err;
- if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
+ if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
return 0;
if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
@@ -467,8 +491,14 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
b->cpu = cpu;
b->address = address;
b->interrupt_enable = 0;
+ b->interrupt_capable = lvt_interrupt_supported(bank, high);
b->threshold_limit = THRESHOLD_MAX;
+ if (b->interrupt_capable)
+ threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
+ else
+ threshold_ktype.default_attrs[2] = NULL;
+
INIT_LIST_HEAD(&b->miscj);
if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
@@ -480,7 +510,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
err = kobject_init_and_add(&b->kobj, &threshold_ktype,
per_cpu(threshold_banks, cpu)[bank]->kobj,
- "misc%i", block);
+ (bank == 4 ? bank4_names(b) : th_names[bank]));
if (err)
goto out_free;
recurse:
@@ -511,47 +541,56 @@ out_free:
return err;
}
-static __cpuinit long
-local_allocate_threshold_blocks(int cpu, unsigned int bank)
+static int __threshold_add_blocks(struct threshold_bank *b)
{
- return allocate_threshold_blocks(cpu, bank, 0,
- MSR_IA32_MC0_MISC + bank * 4);
-}
+ struct list_head *head = &b->blocks->miscj;
+ struct threshold_block *pos = NULL;
+ struct threshold_block *tmp = NULL;
+ int err = 0;
-/* symlinks sibling shared banks to first core. first core owns dir/files. */
-static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
-{
- int i, err = 0;
- struct threshold_bank *b = NULL;
- struct device *dev = mce_device[cpu];
- char name[32];
+ err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
+ if (err)
+ return err;
- sprintf(name, "threshold_bank%i", bank);
+ list_for_each_entry_safe(pos, tmp, head, miscj) {
- if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
- i = cpumask_first(cpu_llc_shared_mask(cpu));
+ err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
+ if (err) {
+ list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
+ kobject_del(&pos->kobj);
- /* first core not up yet */
- if (cpu_data(i).cpu_core_id)
- goto out;
+ return err;
+ }
+ }
+ return err;
+}
- /* already linked */
- if (per_cpu(threshold_banks, cpu)[bank])
- goto out;
+static int threshold_create_bank(unsigned int cpu, unsigned int bank)
+{
+ struct device *dev = per_cpu(mce_device, cpu);
+ struct amd_northbridge *nb = NULL;
+ struct threshold_bank *b = NULL;
+ const char *name = th_names[bank];
+ int err = 0;
- b = per_cpu(threshold_banks, i)[bank];
+ if (is_shared_bank(bank)) {
+ nb = node_to_amd_nb(amd_get_nb_id(cpu));
- if (!b)
- goto out;
+ /* threshold descriptor already initialized on this node? */
+ if (nb && nb->bank4) {
+ /* yes, use it */
+ b = nb->bank4;
+ err = kobject_add(b->kobj, &dev->kobj, name);
+ if (err)
+ goto out;
- err = sysfs_create_link(&dev->kobj, b->kobj, name);
- if (err)
- goto out;
+ per_cpu(threshold_banks, cpu)[bank] = b;
+ atomic_inc(&b->cpus);
- cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
- per_cpu(threshold_banks, cpu)[bank] = b;
+ err = __threshold_add_blocks(b);
- goto out;
+ goto out;
+ }
}
b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
@@ -559,58 +598,52 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
err = -ENOMEM;
goto out;
}
- if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
- kfree(b);
- err = -ENOMEM;
- goto out;
- }
b->kobj = kobject_create_and_add(name, &dev->kobj);
- if (!b->kobj)
+ if (!b->kobj) {
+ err = -EINVAL;
goto out_free;
-
-#ifndef CONFIG_SMP
- cpumask_setall(b->cpus);
-#else
- cpumask_set_cpu(cpu, b->cpus);
-#endif
+ }
per_cpu(threshold_banks, cpu)[bank] = b;
- err = local_allocate_threshold_blocks(cpu, bank);
- if (err)
- goto out_free;
-
- for_each_cpu(i, b->cpus) {
- if (i == cpu)
- continue;
-
- dev = mce_device[i];
- if (dev)
- err = sysfs_create_link(&dev->kobj,b->kobj, name);
- if (err)
- goto out;
+ if (is_shared_bank(bank)) {
+ atomic_set(&b->cpus, 1);
- per_cpu(threshold_banks, i)[bank] = b;
+ /* nb is already initialized, see above */
+ if (nb) {
+ WARN_ON(nb->bank4);
+ nb->bank4 = b;
+ }
}
- goto out;
+ err = allocate_threshold_blocks(cpu, bank, 0,
+ MSR_IA32_MC0_MISC + bank * 4);
+ if (!err)
+ goto out;
-out_free:
- per_cpu(threshold_banks, cpu)[bank] = NULL;
- free_cpumask_var(b->cpus);
+ out_free:
kfree(b);
-out:
+
+ out:
return err;
}
/* create dir/files for all valid threshold banks */
-static __cpuinit int threshold_create_device(unsigned int cpu)
+static int threshold_create_device(unsigned int cpu)
{
unsigned int bank;
+ struct threshold_bank **bp;
int err = 0;
- for (bank = 0; bank < NR_BANKS; ++bank) {
+ bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks,
+ GFP_KERNEL);
+ if (!bp)
+ return -ENOMEM;
+
+ per_cpu(threshold_banks, cpu) = bp;
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
err = threshold_create_bank(cpu, bank);
@@ -621,12 +654,6 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
return err;
}
-/*
- * let's be hotplug friendly.
- * in case of multiple core processors, the first core always takes ownership
- * of shared sysfs dir/files, and rest of the cores will be symlinked to it.
- */
-
static void deallocate_threshold_block(unsigned int cpu,
unsigned int bank)
{
@@ -647,40 +674,42 @@ static void deallocate_threshold_block(unsigned int cpu,
per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
}
+static void __threshold_remove_blocks(struct threshold_bank *b)
+{
+ struct threshold_block *pos = NULL;
+ struct threshold_block *tmp = NULL;
+
+ kobject_del(b->kobj);
+
+ list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
+ kobject_del(&pos->kobj);
+}
+
static void threshold_remove_bank(unsigned int cpu, int bank)
{
+ struct amd_northbridge *nb;
struct threshold_bank *b;
- struct device *dev;
- char name[32];
- int i = 0;
b = per_cpu(threshold_banks, cpu)[bank];
if (!b)
return;
+
if (!b->blocks)
goto free_out;
- sprintf(name, "threshold_bank%i", bank);
-
-#ifdef CONFIG_SMP
- /* sibling symlink */
- if (shared_bank[bank] && b->blocks->cpu != cpu) {
- sysfs_remove_link(&mce_device[cpu]->kobj, name);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
-
- return;
- }
-#endif
-
- /* remove all sibling symlinks before unregistering */
- for_each_cpu(i, b->cpus) {
- if (i == cpu)
- continue;
-
- dev = mce_device[i];
- if (dev)
- sysfs_remove_link(&dev->kobj, name);
- per_cpu(threshold_banks, i)[bank] = NULL;
+ if (is_shared_bank(bank)) {
+ if (!atomic_dec_and_test(&b->cpus)) {
+ __threshold_remove_blocks(b);
+ per_cpu(threshold_banks, cpu)[bank] = NULL;
+ return;
+ } else {
+ /*
+ * the last CPU on this node using the shared bank is
+ * going away, remove that bank now.
+ */
+ nb = node_to_amd_nb(amd_get_nb_id(cpu));
+ nb->bank4 = NULL;
+ }
}
deallocate_threshold_block(cpu, bank);
@@ -688,7 +717,6 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
free_out:
kobject_del(b->kobj);
kobject_put(b->kobj);
- free_cpumask_var(b->cpus);
kfree(b);
per_cpu(threshold_banks, cpu)[bank] = NULL;
}
@@ -697,15 +725,16 @@ static void threshold_remove_device(unsigned int cpu)
{
unsigned int bank;
- for (bank = 0; bank < NR_BANKS; ++bank) {
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
threshold_remove_bank(cpu, bank);
}
+ kfree(per_cpu(threshold_banks, cpu));
}
/* get notified when a cpu comes on/off */
-static void __cpuinit
+static void
amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
{
switch (action) {
@@ -737,4 +766,24 @@ static __init int threshold_init_device(void)
return 0;
}
-device_initcall(threshold_init_device);
+/*
+ * there are 3 funcs which need to be _initcalled in a logic sequence:
+ * 1. xen_late_init_mcelog
+ * 2. mcheck_init_device
+ * 3. threshold_init_device
+ *
+ * xen_late_init_mcelog must register xen_mce_chrdev_device before
+ * native mce_chrdev_device registration if running under xen platform;
+ *
+ * mcheck_init_device should be inited before threshold_init_device to
+ * initialize mce_device, otherwise a NULL ptr dereference will cause panic.
+ *
+ * so we use following _initcalls
+ * 1. device_initcall(xen_late_init_mcelog);
+ * 2. device_initcall_sync(mcheck_init_device);
+ * 3. late_initcall(threshold_init_device);
+ *
+ * when running under xen, the initcall order is 1,2,3;
+ * on baremetal, we skip 1 and we do only 2 and 3.
+ */
+late_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 38e49bc95ff..9a316b21df8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -6,15 +6,17 @@
*/
#include <linux/gfp.h>
-#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/sched.h>
+#include <linux/cpumask.h>
#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
+#include "mce-internal.h"
+
/*
* Support for Intel Correct Machine Check Interrupts. This allows
* the CPU to raise an interrupt when a corrected machine check happened.
@@ -22,21 +24,48 @@
* Also supports reliable discovery of shared banks.
*/
+/*
+ * CMCI can be delivered to multiple cpus that share a machine check bank
+ * so we need to designate a single cpu to process errors logged in each bank
+ * in the interrupt handler (otherwise we would have many races and potential
+ * double reporting of the same error).
+ * Note that this can change when a cpu is offlined or brought online since
+ * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear()
+ * disables CMCI on all banks owned by the cpu and clears this bitfield. At
+ * this point, cmci_rediscover() kicks in and a different cpu may end up
+ * taking ownership of some of the shared MCA banks that were previously
+ * owned by the offlined cpu.
+ */
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
-static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
+static DEFINE_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD 1
+#define CMCI_POLL_INTERVAL (30 * HZ)
+#define CMCI_STORM_INTERVAL (1 * HZ)
+#define CMCI_STORM_THRESHOLD 15
+
+static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
+
+enum {
+ CMCI_STORM_NONE,
+ CMCI_STORM_ACTIVE,
+ CMCI_STORM_SUBSIDED,
+};
-#define CMCI_THRESHOLD 1
+static atomic_t cmci_storm_on_cpus;
static int cmci_supported(int *banks)
{
u64 cap;
- if (mce_cmci_disabled || mce_ignore_ce)
+ if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
return 0;
/*
@@ -53,6 +82,109 @@ static int cmci_supported(int *banks)
return !!(cap & MCG_CMCI_P);
}
+void mce_intel_cmci_poll(void)
+{
+ if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
+ return;
+ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+}
+
+void mce_intel_hcpu_update(unsigned long cpu)
+{
+ if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
+ atomic_dec(&cmci_storm_on_cpus);
+
+ per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
+}
+
+unsigned long mce_intel_adjust_timer(unsigned long interval)
+{
+ int r;
+
+ if (interval < CMCI_POLL_INTERVAL)
+ return interval;
+
+ switch (__this_cpu_read(cmci_storm_state)) {
+ case CMCI_STORM_ACTIVE:
+ /*
+ * We switch back to interrupt mode once the poll timer has
+ * silenced itself. That means no events recorded and the
+ * timer interval is back to our poll interval.
+ */
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
+ r = atomic_sub_return(1, &cmci_storm_on_cpus);
+ if (r == 0)
+ pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+ /* FALLTHROUGH */
+
+ case CMCI_STORM_SUBSIDED:
+ /*
+ * We wait for all cpus to go back to SUBSIDED
+ * state. When that happens we switch back to
+ * interrupt mode.
+ */
+ if (!atomic_read(&cmci_storm_on_cpus)) {
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
+ cmci_reenable();
+ cmci_recheck();
+ }
+ return CMCI_POLL_INTERVAL;
+ default:
+ /*
+ * We have shiny weather. Let the poll do whatever it
+ * thinks.
+ */
+ return interval;
+ }
+}
+
+static void cmci_storm_disable_banks(void)
+{
+ unsigned long flags, *owned;
+ int bank;
+ u64 val;
+
+ spin_lock_irqsave(&cmci_discover_lock, flags);
+ owned = __get_cpu_var(mce_banks_owned);
+ for_each_set_bit(bank, owned, MAX_NR_BANKS) {
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_EN;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ }
+ spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+static bool cmci_storm_detect(void)
+{
+ unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
+ unsigned long ts = __this_cpu_read(cmci_time_stamp);
+ unsigned long now = jiffies;
+ int r;
+
+ if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
+ return true;
+
+ if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
+ cnt++;
+ } else {
+ cnt = 1;
+ __this_cpu_write(cmci_time_stamp, now);
+ }
+ __this_cpu_write(cmci_storm_cnt, cnt);
+
+ if (cnt <= CMCI_STORM_THRESHOLD)
+ return false;
+
+ cmci_storm_disable_banks();
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
+ r = atomic_add_return(1, &cmci_storm_on_cpus);
+ mce_timer_kick(CMCI_POLL_INTERVAL);
+
+ if (r == 1)
+ pr_notice("CMCI storm detected: switching to poll mode\n");
+ return true;
+}
+
/*
* The interrupt handler. This is called on every event.
* Just call the poller directly to log any events.
@@ -61,64 +193,86 @@ static int cmci_supported(int *banks)
*/
static void intel_threshold_interrupt(void)
{
+ if (cmci_storm_detect())
+ return;
machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
mce_notify_irq();
}
-static void print_update(char *type, int *hdr, int num)
-{
- if (*hdr == 0)
- printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
- *hdr = 1;
- printk(KERN_CONT " %s:%d", type, num);
-}
-
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
* banks.
*/
-static void cmci_discover(int banks, int boot)
+static void cmci_discover(int banks)
{
unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
unsigned long flags;
- int hdr = 0;
int i;
+ int bios_wrong_thresh = 0;
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
+ int bios_zero_thresh = 0;
if (test_bit(i, owned))
continue;
+ /* Skip banks in firmware first mode */
+ if (test_bit(i, mce_banks_ce_disabled))
+ continue;
+
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Already owned by someone else? */
if (val & MCI_CTL2_CMCI_EN) {
- if (test_and_clear_bit(i, owned) && !boot)
- print_update("SHD", &hdr, i);
+ clear_bit(i, owned);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
continue;
}
- val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
- val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
+ if (!mca_cfg.bios_cmci_threshold) {
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= CMCI_THRESHOLD;
+ } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
+ /*
+ * If bios_cmci_threshold boot option was specified
+ * but the threshold is zero, we'll try to initialize
+ * it to 1.
+ */
+ bios_zero_thresh = 1;
+ val |= CMCI_THRESHOLD;
+ }
+
+ val |= MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Did the enable bit stick? -- the bank supports CMCI */
if (val & MCI_CTL2_CMCI_EN) {
- if (!test_and_set_bit(i, owned) && !boot)
- print_update("CMCI", &hdr, i);
+ set_bit(i, owned);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
+ /*
+ * We are able to set thresholds for some banks that
+ * had a threshold of 0. This means the BIOS has not
+ * set the thresholds properly or does not work with
+ * this boot option. Note down now and report later.
+ */
+ if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
+ (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
+ bios_wrong_thresh = 1;
} else {
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
}
}
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
- if (hdr)
- printk(KERN_CONT "\n");
+ spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
+ pr_info_once(
+ "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
+ pr_info_once(
+ "bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
+ }
}
/*
@@ -137,6 +291,19 @@ void cmci_recheck(void)
local_irq_restore(flags);
}
+/* Caller must hold the lock on cmci_discover_lock */
+static void __cmci_disable_bank(int bank)
+{
+ u64 val;
+
+ if (!test_bit(bank, __get_cpu_var(mce_banks_owned)))
+ return;
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_EN;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ __clear_bit(bank, __get_cpu_var(mce_banks_owned));
+}
+
/*
* Disable CMCI on this CPU for all banks it owns when it goes down.
* This allows other CPUs to claim the banks on rediscovery.
@@ -146,51 +313,33 @@ void cmci_clear(void)
unsigned long flags;
int i;
int banks;
- u64 val;
if (!cmci_supported(&banks))
return;
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- for (i = 0; i < banks; i++) {
- if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
- continue;
- /* Disable CMCI */
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
- val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
- wrmsrl(MSR_IA32_MCx_CTL2(i), val);
- __clear_bit(i, __get_cpu_var(mce_banks_owned));
- }
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ spin_lock_irqsave(&cmci_discover_lock, flags);
+ for (i = 0; i < banks; i++)
+ __cmci_disable_bank(i);
+ spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
-/*
- * After a CPU went down cycle through all the others and rediscover
- * Must run in process context.
- */
-void cmci_rediscover(int dying)
+static void cmci_rediscover_work_func(void *arg)
+{
+ int banks;
+
+ /* Recheck banks in case CPUs don't all have the same */
+ if (cmci_supported(&banks))
+ cmci_discover(banks);
+}
+
+/* After a CPU went down cycle through all the others and rediscover */
+void cmci_rediscover(void)
{
int banks;
- int cpu;
- cpumask_var_t old;
if (!cmci_supported(&banks))
return;
- if (!alloc_cpumask_var(&old, GFP_KERNEL))
- return;
- cpumask_copy(old, &current->cpus_allowed);
-
- for_each_online_cpu(cpu) {
- if (cpu == dying)
- continue;
- if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
- continue;
- /* Recheck banks in case CPUs don't all have the same */
- if (cmci_supported(&banks))
- cmci_discover(banks, 0);
- }
- set_cpus_allowed_ptr(current, old);
- free_cpumask_var(old);
+ on_each_cpu(cmci_rediscover_work_func, NULL, 1);
}
/*
@@ -200,7 +349,20 @@ void cmci_reenable(void)
{
int banks;
if (cmci_supported(&banks))
- cmci_discover(banks, 0);
+ cmci_discover(banks);
+}
+
+void cmci_disable_bank(int bank)
+{
+ int banks;
+ unsigned long flags;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ spin_lock_irqsave(&cmci_discover_lock, flags);
+ __cmci_disable_bank(bank);
+ spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void intel_init_cmci(void)
@@ -211,7 +373,7 @@ static void intel_init_cmci(void)
return;
mce_threshold_vector = intel_threshold_interrupt;
- cmci_discover(banks, 1);
+ cmci_discover(banks);
/*
* For CPU #0 this runs with still disabled APIC, but that's
* ok because only the vector is set up. We still do another
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 5c0e6533d9b..a3042989398 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -5,11 +5,9 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
-#include <linux/init.h>
#include <linux/smp.h>
#include <asm/processor.h>
-#include <asm/system.h>
#include <asm/mce.h>
#include <asm/msr.h>
@@ -34,7 +32,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
smp_processor_id());
}
- add_taint(TAINT_MACHINE_CHECK);
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
}
/* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 67bb17a37a0..36a1bb6d1ee 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -25,11 +25,11 @@
#include <linux/cpu.h>
#include <asm/processor.h>
-#include <asm/system.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL (300 * HZ)
@@ -55,12 +55,24 @@ struct thermal_state {
struct _thermal_state package_power_limit;
struct _thermal_state core_thresh0;
struct _thermal_state core_thresh1;
+ struct _thermal_state pkg_thresh0;
+ struct _thermal_state pkg_thresh1;
};
/* Callback to handle core threshold interrupts */
int (*platform_thermal_notify)(__u64 msr_val);
EXPORT_SYMBOL(platform_thermal_notify);
+/* Callback to handle core package threshold_interrupts */
+int (*platform_thermal_package_notify)(__u64 msr_val);
+EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
+
+/* Callback support of rate control, return true, if
+ * callback has rate control */
+bool (*platform_thermal_package_rate_control)(void);
+EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
+
+
static DEFINE_PER_CPU(struct thermal_state, thermal_state);
static atomic_t therm_throt_en = ATOMIC_INIT(0);
@@ -182,11 +194,6 @@ static int therm_throt_process(bool new_event, int event, int level)
this_cpu,
level == CORE_LEVEL ? "Core" : "Package",
state->count);
- else
- printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
- this_cpu,
- level == CORE_LEVEL ? "Core" : "Package",
- state->count);
return 1;
}
if (old_event) {
@@ -194,36 +201,46 @@ static int therm_throt_process(bool new_event, int event, int level)
printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
this_cpu,
level == CORE_LEVEL ? "Core" : "Package");
- else
- printk(KERN_INFO "CPU%d: %s power limit normal\n",
- this_cpu,
- level == CORE_LEVEL ? "Core" : "Package");
return 1;
}
return 0;
}
-static int thresh_event_valid(int event)
+static int thresh_event_valid(int level, int event)
{
struct _thermal_state *state;
unsigned int this_cpu = smp_processor_id();
struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
u64 now = get_jiffies_64();
- state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1;
+ if (level == PACKAGE_LEVEL)
+ state = (event == 0) ? &pstate->pkg_thresh0 :
+ &pstate->pkg_thresh1;
+ else
+ state = (event == 0) ? &pstate->core_thresh0 :
+ &pstate->core_thresh1;
if (time_before64(now, state->next_check))
return 0;
state->next_check = now + CHECK_INTERVAL;
+
+ return 1;
+}
+
+static bool int_pln_enable;
+static int __init int_pln_enable_setup(char *s)
+{
+ int_pln_enable = true;
+
return 1;
}
+__setup("int_pln_enable", int_pln_enable_setup);
#ifdef CONFIG_SYSFS
/* Add/Remove thermal_throttle interface for CPU device: */
-static __cpuinit int thermal_throttle_add_dev(struct device *dev,
- unsigned int cpu)
+static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
{
int err;
struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -232,7 +249,7 @@ static __cpuinit int thermal_throttle_add_dev(struct device *dev,
if (err)
return err;
- if (cpu_has(c, X86_FEATURE_PLN))
+ if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_core_power_limit_count.attr,
thermal_attr_group.name);
@@ -240,7 +257,7 @@ static __cpuinit int thermal_throttle_add_dev(struct device *dev,
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_throttle_count.attr,
thermal_attr_group.name);
- if (cpu_has(c, X86_FEATURE_PLN))
+ if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_power_limit_count.attr,
thermal_attr_group.name);
@@ -249,16 +266,13 @@ static __cpuinit int thermal_throttle_add_dev(struct device *dev,
return err;
}
-static __cpuinit void thermal_throttle_remove_dev(struct device *dev)
+static void thermal_throttle_remove_dev(struct device *dev)
{
sysfs_remove_group(&dev->kobj, &thermal_attr_group);
}
-/* Mutex protecting device creation against CPU hotplug: */
-static DEFINE_MUTEX(therm_cpu_lock);
-
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static __cpuinit int
+static int
thermal_throttle_cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
@@ -272,24 +286,20 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- mutex_lock(&therm_cpu_lock);
err = thermal_throttle_add_dev(dev, cpu);
- mutex_unlock(&therm_cpu_lock);
WARN_ON(err);
break;
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- mutex_lock(&therm_cpu_lock);
thermal_throttle_remove_dev(dev);
- mutex_unlock(&therm_cpu_lock);
break;
}
return notifier_from_errno(err);
}
-static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata =
+static struct notifier_block thermal_throttle_cpu_notifier =
{
.notifier_call = thermal_throttle_cpu_callback,
};
@@ -302,19 +312,16 @@ static __init int thermal_throttle_init_device(void)
if (!atomic_read(&therm_throt_en))
return 0;
- register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
+ cpu_notifier_register_begin();
-#ifdef CONFIG_HOTPLUG_CPU
- mutex_lock(&therm_cpu_lock);
-#endif
/* connect live CPUs to sysfs */
for_each_online_cpu(cpu) {
err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
WARN_ON(err);
}
-#ifdef CONFIG_HOTPLUG_CPU
- mutex_unlock(&therm_cpu_lock);
-#endif
+
+ __register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
+ cpu_notifier_register_done();
return 0;
}
@@ -322,6 +329,39 @@ device_initcall(thermal_throttle_init_device);
#endif /* CONFIG_SYSFS */
+static void notify_package_thresholds(__u64 msr_val)
+{
+ bool notify_thres_0 = false;
+ bool notify_thres_1 = false;
+
+ if (!platform_thermal_package_notify)
+ return;
+
+ /* lower threshold check */
+ if (msr_val & THERM_LOG_THRESHOLD0)
+ notify_thres_0 = true;
+ /* higher threshold check */
+ if (msr_val & THERM_LOG_THRESHOLD1)
+ notify_thres_1 = true;
+
+ if (!notify_thres_0 && !notify_thres_1)
+ return;
+
+ if (platform_thermal_package_rate_control &&
+ platform_thermal_package_rate_control()) {
+ /* Rate control is implemented in callback */
+ platform_thermal_package_notify(msr_val);
+ return;
+ }
+
+ /* lower threshold reached */
+ if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0))
+ platform_thermal_package_notify(msr_val);
+ /* higher threshold reached */
+ if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1))
+ platform_thermal_package_notify(msr_val);
+}
+
static void notify_thresholds(__u64 msr_val)
{
/* check whether the interrupt handler is defined;
@@ -331,10 +371,12 @@ static void notify_thresholds(__u64 msr_val)
return;
/* lower threshold reached */
- if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0))
+ if ((msr_val & THERM_LOG_THRESHOLD0) &&
+ thresh_event_valid(CORE_LEVEL, 0))
platform_thermal_notify(msr_val);
/* higher threshold reached */
- if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1))
+ if ((msr_val & THERM_LOG_THRESHOLD1) &&
+ thresh_event_valid(CORE_LEVEL, 1))
platform_thermal_notify(msr_val);
}
@@ -353,17 +395,19 @@ static void intel_thermal_interrupt(void)
CORE_LEVEL) != 0)
mce_log_therm_throt_event(msr_val);
- if (this_cpu_has(X86_FEATURE_PLN))
+ if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
CORE_LEVEL);
if (this_cpu_has(X86_FEATURE_PTS)) {
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
+ /* check violations of package thermal thresholds */
+ notify_package_thresholds(msr_val);
therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
PACKAGE_LEVEL);
- if (this_cpu_has(X86_FEATURE_PLN))
+ if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
therm_throt_process(msr_val &
PACKAGE_THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
@@ -379,15 +423,26 @@ static void unexpected_thermal_interrupt(void)
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
+static inline void __smp_thermal_interrupt(void)
{
- irq_enter();
- exit_idle();
inc_irq_stat(irq_thermal_count);
smp_thermal_vector();
- irq_exit();
- /* Ack only at the end to avoid potential reentry */
- ack_APIC_irq();
+}
+
+asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)
+{
+ entering_irq();
+ __smp_thermal_interrupt();
+ exiting_ack_irq();
+}
+
+asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)
+{
+ entering_irq();
+ trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
+ __smp_thermal_interrupt();
+ trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
+ exiting_ack_irq();
}
/* Thermal monitoring depends on APIC, ACPI and clock modulation */
@@ -471,9 +526,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
apic_write(APIC_LVTTHMR, h);
rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- if (cpu_has(c, X86_FEATURE_PLN))
+ if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ (l | (THERM_INT_LOW_ENABLE
+ | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
+ else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE
+ l | (THERM_INT_LOW_ENABLE
| THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
else
wrmsr(MSR_IA32_THERM_INTERRUPT,
@@ -481,9 +540,14 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_PTS)) {
rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
- if (cpu_has(c, X86_FEATURE_PLN))
+ if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
- l | (PACKAGE_THERM_INT_LOW_ENABLE
+ (l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE))
+ & ~PACKAGE_THERM_INT_PLN_ENABLE, h);
+ else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ l | (PACKAGE_THERM_INT_LOW_ENABLE
| PACKAGE_THERM_INT_HIGH_ENABLE
| PACKAGE_THERM_INT_PLN_ENABLE), h);
else
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index aa578cadb94..7245980186e 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -8,6 +8,7 @@
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/mce.h>
+#include <asm/trace/irq_vectors.h>
static void default_threshold_interrupt(void)
{
@@ -17,13 +18,24 @@ static void default_threshold_interrupt(void)
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
-asmlinkage void smp_threshold_interrupt(void)
+static inline void __smp_threshold_interrupt(void)
{
- irq_enter();
- exit_idle();
inc_irq_stat(irq_threshold_count);
mce_threshold_vector();
- irq_exit();
- /* Ack only at the end to avoid potential reentry */
- ack_APIC_irq();
+}
+
+asmlinkage __visible void smp_threshold_interrupt(void)
+{
+ entering_irq();
+ __smp_threshold_interrupt();
+ exiting_ack_irq();
+}
+
+asmlinkage __visible void smp_trace_threshold_interrupt(void)
+{
+ entering_irq();
+ trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
+ __smp_threshold_interrupt();
+ trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
+ exiting_ack_irq();
}
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 54060f56597..7dc5564d0cd 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -5,10 +5,8 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
-#include <linux/init.h>
#include <asm/processor.h>
-#include <asm/system.h>
#include <asm/mce.h>
#include <asm/msr.h>
@@ -16,7 +14,7 @@
static void winchip_machine_check(struct pt_regs *regs, long error_code)
{
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
- add_taint(TAINT_MACHINE_CHECK);
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
}
/* Set up machine check reporting on the Winchip C6 series */
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
new file mode 100644
index 00000000000..285c85427c3
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -0,0 +1,7 @@
+microcode-y := core.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o
+microcode-$(CONFIG_MICROCODE_AMD) += amd.o
+obj-$(CONFIG_MICROCODE_EARLY) += core_early.o
+obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o
+obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
new file mode 100644
index 00000000000..8fffd845e22
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -0,0 +1,492 @@
+/*
+ * AMD CPU Microcode Update Driver for Linux
+ * Copyright (C) 2008-2011 Advanced Micro Devices Inc.
+ *
+ * Author: Peter Oruba <peter.oruba@amd.com>
+ *
+ * Based on work by:
+ * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *
+ * Maintainers:
+ * Andreas Herrmann <herrmann.der.user@googlemail.com>
+ * Borislav Petkov <bp@alien8.de>
+ *
+ * This driver allows to upgrade microcode on F10h AMD
+ * CPUs and later.
+ *
+ * Licensed under the terms of the GNU General Public
+ * License version 2. See file COPYING for details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/firmware.h>
+#include <linux/pci_ids.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/microcode_amd.h>
+
+MODULE_DESCRIPTION("AMD Microcode Update Driver");
+MODULE_AUTHOR("Peter Oruba");
+MODULE_LICENSE("GPL v2");
+
+static struct equiv_cpu_entry *equiv_cpu_table;
+
+struct ucode_patch {
+ struct list_head plist;
+ void *data;
+ u32 patch_id;
+ u16 equiv_cpu;
+};
+
+static LIST_HEAD(pcache);
+
+static u16 __find_equiv_id(unsigned int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ return find_equiv_id(equiv_cpu_table, uci->cpu_sig.sig);
+}
+
+static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu)
+{
+ int i = 0;
+
+ BUG_ON(!equiv_cpu_table);
+
+ while (equiv_cpu_table[i].equiv_cpu != 0) {
+ if (equiv_cpu == equiv_cpu_table[i].equiv_cpu)
+ return equiv_cpu_table[i].installed_cpu;
+ i++;
+ }
+ return 0;
+}
+
+/*
+ * a small, trivial cache of per-family ucode patches
+ */
+static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
+{
+ struct ucode_patch *p;
+
+ list_for_each_entry(p, &pcache, plist)
+ if (p->equiv_cpu == equiv_cpu)
+ return p;
+ return NULL;
+}
+
+static void update_cache(struct ucode_patch *new_patch)
+{
+ struct ucode_patch *p;
+
+ list_for_each_entry(p, &pcache, plist) {
+ if (p->equiv_cpu == new_patch->equiv_cpu) {
+ if (p->patch_id >= new_patch->patch_id)
+ /* we already have the latest patch */
+ return;
+
+ list_replace(&p->plist, &new_patch->plist);
+ kfree(p->data);
+ kfree(p);
+ return;
+ }
+ }
+ /* no patch found, add it */
+ list_add_tail(&new_patch->plist, &pcache);
+}
+
+static void free_cache(void)
+{
+ struct ucode_patch *p, *tmp;
+
+ list_for_each_entry_safe(p, tmp, &pcache, plist) {
+ __list_del(p->plist.prev, p->plist.next);
+ kfree(p->data);
+ kfree(p);
+ }
+}
+
+static struct ucode_patch *find_patch(unsigned int cpu)
+{
+ u16 equiv_id;
+
+ equiv_id = __find_equiv_id(cpu);
+ if (!equiv_id)
+ return NULL;
+
+ return cache_find_patch(equiv_id);
+}
+
+static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct ucode_patch *p;
+
+ csig->sig = cpuid_eax(0x00000001);
+ csig->rev = c->microcode;
+
+ /*
+ * a patch could have been loaded early, set uci->mc so that
+ * mc_bp_resume() can call apply_microcode()
+ */
+ p = find_patch(cpu);
+ if (p && (p->patch_id == csig->rev))
+ uci->mc = p->data;
+
+ pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
+
+ return 0;
+}
+
+static unsigned int verify_patch_size(u8 family, u32 patch_size,
+ unsigned int size)
+{
+ u32 max_size;
+
+#define F1XH_MPB_MAX_SIZE 2048
+#define F14H_MPB_MAX_SIZE 1824
+#define F15H_MPB_MAX_SIZE 4096
+#define F16H_MPB_MAX_SIZE 3458
+
+ switch (family) {
+ case 0x14:
+ max_size = F14H_MPB_MAX_SIZE;
+ break;
+ case 0x15:
+ max_size = F15H_MPB_MAX_SIZE;
+ break;
+ case 0x16:
+ max_size = F16H_MPB_MAX_SIZE;
+ break;
+ default:
+ max_size = F1XH_MPB_MAX_SIZE;
+ break;
+ }
+
+ if (patch_size > min_t(u32, size, max_size)) {
+ pr_err("patch size mismatch\n");
+ return 0;
+ }
+
+ return patch_size;
+}
+
+int __apply_microcode_amd(struct microcode_amd *mc_amd)
+{
+ u32 rev, dummy;
+
+ native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
+
+ /* verify patch application was successful */
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+ if (rev != mc_amd->hdr.patch_id)
+ return -1;
+
+ return 0;
+}
+
+int apply_microcode_amd(int cpu)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct microcode_amd *mc_amd;
+ struct ucode_cpu_info *uci;
+ struct ucode_patch *p;
+ u32 rev, dummy;
+
+ BUG_ON(raw_smp_processor_id() != cpu);
+
+ uci = ucode_cpu_info + cpu;
+
+ p = find_patch(cpu);
+ if (!p)
+ return 0;
+
+ mc_amd = p->data;
+ uci->mc = p->data;
+
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+ /* need to apply patch? */
+ if (rev >= mc_amd->hdr.patch_id) {
+ c->microcode = rev;
+ uci->cpu_sig.rev = rev;
+ return 0;
+ }
+
+ if (__apply_microcode_amd(mc_amd)) {
+ pr_err("CPU%d: update failed for patch_level=0x%08x\n",
+ cpu, mc_amd->hdr.patch_id);
+ return -1;
+ }
+ pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
+ mc_amd->hdr.patch_id);
+
+ uci->cpu_sig.rev = mc_amd->hdr.patch_id;
+ c->microcode = mc_amd->hdr.patch_id;
+
+ return 0;
+}
+
+static int install_equiv_cpu_table(const u8 *buf)
+{
+ unsigned int *ibuf = (unsigned int *)buf;
+ unsigned int type = ibuf[1];
+ unsigned int size = ibuf[2];
+
+ if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
+ pr_err("empty section/"
+ "invalid type field in container file section header\n");
+ return -EINVAL;
+ }
+
+ equiv_cpu_table = vmalloc(size);
+ if (!equiv_cpu_table) {
+ pr_err("failed to allocate equivalent CPU table\n");
+ return -ENOMEM;
+ }
+
+ memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
+
+ /* add header length */
+ return size + CONTAINER_HDR_SZ;
+}
+
+static void free_equiv_cpu_table(void)
+{
+ vfree(equiv_cpu_table);
+ equiv_cpu_table = NULL;
+}
+
+static void cleanup(void)
+{
+ free_equiv_cpu_table();
+ free_cache();
+}
+
+/*
+ * We return the current size even if some of the checks failed so that
+ * we can skip over the next patch. If we return a negative value, we
+ * signal a grave error like a memory allocation has failed and the
+ * driver cannot continue functioning normally. In such cases, we tear
+ * down everything we've used up so far and exit.
+ */
+static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
+{
+ struct microcode_header_amd *mc_hdr;
+ struct ucode_patch *patch;
+ unsigned int patch_size, crnt_size, ret;
+ u32 proc_fam;
+ u16 proc_id;
+
+ patch_size = *(u32 *)(fw + 4);
+ crnt_size = patch_size + SECTION_HDR_SIZE;
+ mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
+ proc_id = mc_hdr->processor_rev_id;
+
+ proc_fam = find_cpu_family_by_equiv_cpu(proc_id);
+ if (!proc_fam) {
+ pr_err("No patch family for equiv ID: 0x%04x\n", proc_id);
+ return crnt_size;
+ }
+
+ /* check if patch is for the current family */
+ proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff);
+ if (proc_fam != family)
+ return crnt_size;
+
+ if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
+ pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n",
+ mc_hdr->patch_id);
+ return crnt_size;
+ }
+
+ ret = verify_patch_size(family, patch_size, leftover);
+ if (!ret) {
+ pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id);
+ return crnt_size;
+ }
+
+ patch = kzalloc(sizeof(*patch), GFP_KERNEL);
+ if (!patch) {
+ pr_err("Patch allocation failure.\n");
+ return -EINVAL;
+ }
+
+ patch->data = kzalloc(patch_size, GFP_KERNEL);
+ if (!patch->data) {
+ pr_err("Patch data allocation failure.\n");
+ kfree(patch);
+ return -EINVAL;
+ }
+
+ /* All looks ok, copy patch... */
+ memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size);
+ INIT_LIST_HEAD(&patch->plist);
+ patch->patch_id = mc_hdr->patch_id;
+ patch->equiv_cpu = proc_id;
+
+ pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
+ __func__, patch->patch_id, proc_id);
+
+ /* ... and add to cache. */
+ update_cache(patch);
+
+ return crnt_size;
+}
+
+static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
+ size_t size)
+{
+ enum ucode_state ret = UCODE_ERROR;
+ unsigned int leftover;
+ u8 *fw = (u8 *)data;
+ int crnt_size = 0;
+ int offset;
+
+ offset = install_equiv_cpu_table(data);
+ if (offset < 0) {
+ pr_err("failed to create equivalent cpu table\n");
+ return ret;
+ }
+ fw += offset;
+ leftover = size - offset;
+
+ if (*(u32 *)fw != UCODE_UCODE_TYPE) {
+ pr_err("invalid type field in container file section header\n");
+ free_equiv_cpu_table();
+ return ret;
+ }
+
+ while (leftover) {
+ crnt_size = verify_and_add_patch(family, fw, leftover);
+ if (crnt_size < 0)
+ return ret;
+
+ fw += crnt_size;
+ leftover -= crnt_size;
+ }
+
+ return UCODE_OK;
+}
+
+enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
+{
+ enum ucode_state ret;
+
+ /* free old equiv table */
+ free_equiv_cpu_table();
+
+ ret = __load_microcode_amd(family, data, size);
+
+ if (ret != UCODE_OK)
+ cleanup();
+
+#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32)
+ /* save BSP's matching patch for early load */
+ if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) {
+ struct ucode_patch *p = find_patch(smp_processor_id());
+ if (p) {
+ memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
+ memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
+ PATCH_MAX_SIZE));
+ }
+ }
+#endif
+ return ret;
+}
+
+/*
+ * AMD microcode firmware naming convention, up to family 15h they are in
+ * the legacy file:
+ *
+ * amd-ucode/microcode_amd.bin
+ *
+ * This legacy file is always smaller than 2K in size.
+ *
+ * Beginning with family 15h, they are in family-specific firmware files:
+ *
+ * amd-ucode/microcode_amd_fam15h.bin
+ * amd-ucode/microcode_amd_fam16h.bin
+ * ...
+ *
+ * These might be larger than 2K.
+ */
+static enum ucode_state request_microcode_amd(int cpu, struct device *device,
+ bool refresh_fw)
+{
+ char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ enum ucode_state ret = UCODE_NFOUND;
+ const struct firmware *fw;
+
+ /* reload ucode container only on the boot cpu */
+ if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
+ return UCODE_OK;
+
+ if (c->x86 >= 0x15)
+ snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
+
+ if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
+ pr_debug("failed to load file %s\n", fw_name);
+ goto out;
+ }
+
+ ret = UCODE_ERROR;
+ if (*(u32 *)fw->data != UCODE_MAGIC) {
+ pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
+ goto fw_release;
+ }
+
+ ret = load_microcode_amd(c->x86, fw->data, fw->size);
+
+ fw_release:
+ release_firmware(fw);
+
+ out:
+ return ret;
+}
+
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
+{
+ return UCODE_ERROR;
+}
+
+static void microcode_fini_cpu_amd(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ uci->mc = NULL;
+}
+
+static struct microcode_ops microcode_amd_ops = {
+ .request_microcode_user = request_microcode_user,
+ .request_microcode_fw = request_microcode_amd,
+ .collect_cpu_info = collect_cpu_info_amd,
+ .apply_microcode = apply_microcode_amd,
+ .microcode_fini_cpu = microcode_fini_cpu_amd,
+};
+
+struct microcode_ops * __init init_amd_microcode(void)
+{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+
+ if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
+ pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+
+ return &microcode_amd_ops;
+}
+
+void __exit exit_amd_microcode(void)
+{
+ cleanup();
+}
diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
new file mode 100644
index 00000000000..617a9e28424
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ * Fixes: Borislav Petkov <bp@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/earlycpio.h>
+#include <linux/initrd.h>
+
+#include <asm/cpu.h>
+#include <asm/setup.h>
+#include <asm/microcode_amd.h>
+
+/*
+ * This points to the current valid container of microcode patches which we will
+ * save from the initrd before jettisoning its contents.
+ */
+static u8 *container;
+static size_t container_size;
+
+static u32 ucode_new_rev;
+u8 amd_ucode_patch[PATCH_MAX_SIZE];
+static u16 this_equiv_id;
+
+struct cpio_data ucode_cpio;
+
+/*
+ * Microcode patch container file is prepended to the initrd in cpio format.
+ * See Documentation/x86/early-microcode.txt
+ */
+static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
+
+static struct cpio_data __init find_ucode_in_initrd(void)
+{
+ long offset = 0;
+ char *path;
+ void *start;
+ size_t size;
+
+#ifdef CONFIG_X86_32
+ struct boot_params *p;
+
+ /*
+ * On 32-bit, early load occurs before paging is turned on so we need
+ * to use physical addresses.
+ */
+ p = (struct boot_params *)__pa_nodebug(&boot_params);
+ path = (char *)__pa_nodebug(ucode_path);
+ start = (void *)p->hdr.ramdisk_image;
+ size = p->hdr.ramdisk_size;
+#else
+ path = ucode_path;
+ start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
+ size = boot_params.hdr.ramdisk_size;
+#endif
+
+ return find_cpio_data(path, start, size, &offset);
+}
+
+static size_t compute_container_size(u8 *data, u32 total_size)
+{
+ size_t size = 0;
+ u32 *header = (u32 *)data;
+
+ if (header[0] != UCODE_MAGIC ||
+ header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+ header[2] == 0) /* size */
+ return size;
+
+ size = header[2] + CONTAINER_HDR_SZ;
+ total_size -= size;
+ data += size;
+
+ while (total_size) {
+ u16 patch_size;
+
+ header = (u32 *)data;
+
+ if (header[0] != UCODE_UCODE_TYPE)
+ break;
+
+ /*
+ * Sanity-check patch size.
+ */
+ patch_size = header[1];
+ if (patch_size > PATCH_MAX_SIZE)
+ break;
+
+ size += patch_size + SECTION_HDR_SIZE;
+ data += patch_size + SECTION_HDR_SIZE;
+ total_size -= patch_size + SECTION_HDR_SIZE;
+ }
+
+ return size;
+}
+
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ */
+static void apply_ucode_in_initrd(void *ucode, size_t size)
+{
+ struct equiv_cpu_entry *eq;
+ size_t *cont_sz;
+ u32 *header;
+ u8 *data, **cont;
+ u16 eq_id = 0;
+ int offset, left;
+ u32 rev, eax, ebx, ecx, edx;
+ u32 *new_rev;
+
+#ifdef CONFIG_X86_32
+ new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
+ cont_sz = (size_t *)__pa_nodebug(&container_size);
+ cont = (u8 **)__pa_nodebug(&container);
+#else
+ new_rev = &ucode_new_rev;
+ cont_sz = &container_size;
+ cont = &container;
+#endif
+
+ data = ucode;
+ left = size;
+ header = (u32 *)data;
+
+ /* find equiv cpu table */
+ if (header[0] != UCODE_MAGIC ||
+ header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+ header[2] == 0) /* size */
+ return;
+
+ eax = 0x00000001;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ while (left > 0) {
+ eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
+
+ *cont = data;
+
+ /* Advance past the container header */
+ offset = header[2] + CONTAINER_HDR_SZ;
+ data += offset;
+ left -= offset;
+
+ eq_id = find_equiv_id(eq, eax);
+ if (eq_id) {
+ this_equiv_id = eq_id;
+ *cont_sz = compute_container_size(*cont, left + offset);
+
+ /*
+ * truncate how much we need to iterate over in the
+ * ucode update loop below
+ */
+ left = *cont_sz - offset;
+ break;
+ }
+
+ /*
+ * support multiple container files appended together. if this
+ * one does not have a matching equivalent cpu entry, we fast
+ * forward to the next container file.
+ */
+ while (left > 0) {
+ header = (u32 *)data;
+ if (header[0] == UCODE_MAGIC &&
+ header[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
+ break;
+
+ offset = header[1] + SECTION_HDR_SIZE;
+ data += offset;
+ left -= offset;
+ }
+
+ /* mark where the next microcode container file starts */
+ offset = data - (u8 *)ucode;
+ ucode = data;
+ }
+
+ if (!eq_id) {
+ *cont = NULL;
+ *cont_sz = 0;
+ return;
+ }
+
+ /* find ucode and update if needed */
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
+
+ while (left > 0) {
+ struct microcode_amd *mc;
+
+ header = (u32 *)data;
+ if (header[0] != UCODE_UCODE_TYPE || /* type */
+ header[1] == 0) /* size */
+ break;
+
+ mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
+
+ if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
+
+ if (!__apply_microcode_amd(mc)) {
+ rev = mc->hdr.patch_id;
+ *new_rev = rev;
+
+ /* save ucode patch */
+ memcpy(amd_ucode_patch, mc,
+ min_t(u32, header[1], PATCH_MAX_SIZE));
+ }
+ }
+
+ offset = header[1] + SECTION_HDR_SIZE;
+ data += offset;
+ left -= offset;
+ }
+}
+
+void __init load_ucode_amd_bsp(void)
+{
+ struct cpio_data cp;
+ void **data;
+ size_t *size;
+
+#ifdef CONFIG_X86_32
+ data = (void **)__pa_nodebug(&ucode_cpio.data);
+ size = (size_t *)__pa_nodebug(&ucode_cpio.size);
+#else
+ data = &ucode_cpio.data;
+ size = &ucode_cpio.size;
+#endif
+
+ cp = find_ucode_in_initrd();
+ if (!cp.data)
+ return;
+
+ *data = cp.data;
+ *size = cp.size;
+
+ apply_ucode_in_initrd(cp.data, cp.size);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * On 32-bit, since AP's early load occurs before paging is turned on, we
+ * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
+ * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
+ * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
+ * which is used upon resume from suspend.
+ */
+void load_ucode_amd_ap(void)
+{
+ struct microcode_amd *mc;
+ size_t *usize;
+ void **ucode;
+
+ mc = (struct microcode_amd *)__pa(amd_ucode_patch);
+ if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
+ __apply_microcode_amd(mc);
+ return;
+ }
+
+ ucode = (void *)__pa_nodebug(&container);
+ usize = (size_t *)__pa_nodebug(&container_size);
+
+ if (!*ucode || !*usize)
+ return;
+
+ apply_ucode_in_initrd(*ucode, *usize);
+}
+
+static void __init collect_cpu_sig_on_bsp(void *arg)
+{
+ unsigned int cpu = smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ uci->cpu_sig.sig = cpuid_eax(0x00000001);
+}
+
+static void __init get_bsp_sig(void)
+{
+ unsigned int bsp = boot_cpu_data.cpu_index;
+ struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
+
+ if (!uci->cpu_sig.sig)
+ smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
+}
+#else
+void load_ucode_amd_ap(void)
+{
+ unsigned int cpu = smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct equiv_cpu_entry *eq;
+ struct microcode_amd *mc;
+ u32 rev, eax;
+ u16 eq_id;
+
+ /* Exit if called on the BSP. */
+ if (!cpu)
+ return;
+
+ if (!container)
+ return;
+
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
+
+ uci->cpu_sig.rev = rev;
+ uci->cpu_sig.sig = eax;
+
+ eax = cpuid_eax(0x00000001);
+ eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
+
+ eq_id = find_equiv_id(eq, eax);
+ if (!eq_id)
+ return;
+
+ if (eq_id == this_equiv_id) {
+ mc = (struct microcode_amd *)amd_ucode_patch;
+
+ if (mc && rev < mc->hdr.patch_id) {
+ if (!__apply_microcode_amd(mc))
+ ucode_new_rev = mc->hdr.patch_id;
+ }
+
+ } else {
+ if (!ucode_cpio.data)
+ return;
+
+ /*
+ * AP has a different equivalence ID than BSP, looks like
+ * mixed-steppings silicon so go through the ucode blob anew.
+ */
+ apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size);
+ }
+}
+#endif
+
+int __init save_microcode_in_initrd_amd(void)
+{
+ unsigned long cont;
+ enum ucode_state ret;
+ u32 eax;
+
+ if (!container)
+ return -EINVAL;
+
+#ifdef CONFIG_X86_32
+ get_bsp_sig();
+ cont = (unsigned long)container;
+#else
+ /*
+ * We need the physical address of the container for both bitness since
+ * boot_params.hdr.ramdisk_image is a physical address.
+ */
+ cont = __pa(container);
+#endif
+
+ /*
+ * Take into account the fact that the ramdisk might get relocated and
+ * therefore we need to recompute the container's position in virtual
+ * memory space.
+ */
+ if (relocated_ramdisk)
+ container = (u8 *)(__va(relocated_ramdisk) +
+ (cont - boot_params.hdr.ramdisk_image));
+
+ if (ucode_new_rev)
+ pr_info("microcode: updated early to new patch_level=0x%08x\n",
+ ucode_new_rev);
+
+ eax = cpuid_eax(0x00000001);
+ eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+
+ ret = load_microcode_amd(eax, container, container_size);
+ if (ret != UCODE_OK)
+ return -EINVAL;
+
+ /*
+ * This will be freed any msec now, stash patches for the current
+ * family and switch to patch cache for cpu hotplug, etc later.
+ */
+ container = NULL;
+ container_size = 0;
+
+ return 0;
+}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/cpu/microcode/core.c
index fda91c30710..dd9d6190b08 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -86,6 +86,8 @@
#include <asm/microcode.h>
#include <asm/processor.h>
+#include <asm/cpu_device_id.h>
+#include <asm/perf_event.h>
MODULE_DESCRIPTION("Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -95,6 +97,9 @@ MODULE_LICENSE("GPL");
static struct microcode_ops *microcode_ops;
+bool dis_ucode_ldr;
+module_param(dis_ucode_ldr, bool, 0);
+
/*
* Synchronization.
*
@@ -223,6 +228,9 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
if (do_microcode_update(buf, len) == 0)
ret = (ssize_t)len;
+ if (ret > 0)
+ perf_check_microcode();
+
mutex_unlock(&microcode_mutex);
put_online_cpus();
@@ -274,21 +282,18 @@ static struct platform_device *microcode_pdev;
static int reload_for_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ enum ucode_state ustate;
int err = 0;
- mutex_lock(&microcode_mutex);
- if (uci->valid) {
- enum ucode_state ustate;
-
- ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
- if (ustate == UCODE_OK)
- apply_microcode_on_target(cpu);
- else
- if (ustate == UCODE_ERROR)
- err = -EINVAL;
- }
- mutex_unlock(&microcode_mutex);
+ if (!uci->valid)
+ return err;
+ ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true);
+ if (ustate == UCODE_OK)
+ apply_microcode_on_target(cpu);
+ else
+ if (ustate == UCODE_ERROR)
+ err = -EINVAL;
return err;
}
@@ -297,20 +302,31 @@ static ssize_t reload_store(struct device *dev,
const char *buf, size_t size)
{
unsigned long val;
- int cpu = dev->id;
- int ret = 0;
- char *end;
+ int cpu;
+ ssize_t ret = 0, tmp_ret;
- val = simple_strtoul(buf, &end, 0);
- if (end == buf)
- return -EINVAL;
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val != 1)
+ return size;
+
+ get_online_cpus();
+ mutex_lock(&microcode_mutex);
+ for_each_online_cpu(cpu) {
+ tmp_ret = reload_for_cpu(cpu);
+ if (tmp_ret != 0)
+ pr_warn("Error reloading microcode on CPU %d\n", cpu);
- if (val == 1) {
- get_online_cpus();
- if (cpu_online(cpu))
- ret = reload_for_cpu(cpu);
- put_online_cpus();
+ /* save retval of the first encountered reload error */
+ if (!ret)
+ ret = tmp_ret;
}
+ if (!ret)
+ perf_check_microcode();
+ mutex_unlock(&microcode_mutex);
+ put_online_cpus();
if (!ret)
ret = size;
@@ -339,7 +355,6 @@ static DEVICE_ATTR(version, 0400, version_show, NULL);
static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
static struct attribute *mc_default_attrs[] = {
- &dev_attr_reload.attr,
&dev_attr_version.attr,
&dev_attr_processor_flags.attr,
NULL
@@ -352,28 +367,26 @@ static struct attribute_group mc_attr_group = {
static void microcode_fini_cpu(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
microcode_ops->microcode_fini_cpu(cpu);
- uci->valid = 0;
}
static enum ucode_state microcode_resume_cpu(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- if (!uci->mc)
- return UCODE_NFOUND;
-
pr_debug("CPU%d updated upon resume\n", cpu);
- apply_microcode_on_target(cpu);
+
+ if (apply_microcode_on_target(cpu))
+ return UCODE_ERROR;
return UCODE_OK;
}
-static enum ucode_state microcode_init_cpu(int cpu)
+static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
{
enum ucode_state ustate;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (uci && uci->valid)
+ return UCODE_OK;
if (collect_cpu_info(cpu))
return UCODE_ERROR;
@@ -382,7 +395,8 @@ static enum ucode_state microcode_init_cpu(int cpu)
if (system_state != SYSTEM_RUNNING)
return UCODE_NFOUND;
- ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
+ ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev,
+ refresh_fw);
if (ustate == UCODE_OK) {
pr_debug("CPU%d updated upon init\n", cpu);
@@ -395,14 +409,11 @@ static enum ucode_state microcode_init_cpu(int cpu)
static enum ucode_state microcode_update_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- enum ucode_state ustate;
if (uci->valid)
- ustate = microcode_resume_cpu(cpu);
- else
- ustate = microcode_init_cpu(cpu);
+ return microcode_resume_cpu(cpu);
- return ustate;
+ return microcode_init_cpu(cpu, false);
}
static int mc_device_add(struct device *dev, struct subsys_interface *sif)
@@ -418,10 +429,8 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif)
if (err)
return err;
- if (microcode_init_cpu(cpu) == UCODE_ERROR) {
- sysfs_remove_group(&dev->kobj, &mc_attr_group);
+ if (microcode_init_cpu(cpu, true) == UCODE_ERROR)
return -EINVAL;
- }
return err;
}
@@ -462,41 +471,48 @@ static struct syscore_ops mc_syscore_ops = {
.resume = mc_bp_resume,
};
-static __cpuinit int
+static int
mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct device *dev;
dev = get_cpu_device(cpu);
- switch (action) {
+
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
microcode_update_cpu(cpu);
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
pr_debug("CPU%d added\n", cpu);
+ /*
+ * "break" is missing on purpose here because we want to fall
+ * through in order to create the sysfs group.
+ */
+
+ case CPU_DOWN_FAILED:
if (sysfs_create_group(&dev->kobj, &mc_attr_group))
pr_err("Failed to create group for CPU%d\n", cpu);
break;
+
case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
/* Suspend is in progress, only remove the interface */
sysfs_remove_group(&dev->kobj, &mc_attr_group);
pr_debug("CPU%d removed\n", cpu);
break;
/*
+ * case CPU_DEAD:
+ *
* When a CPU goes offline, don't free up or invalidate the copy of
* the microcode in kernel memory, so that we can reuse it when the
* CPU comes back online without unnecessarily requesting the userspace
* for it again.
*/
- case CPU_UP_CANCELED_FROZEN:
- /* The CPU refused to come up during a system resume */
- microcode_fini_cpu(cpu);
- break;
}
+
+ /* The CPU refused to come up during a system resume */
+ if (action == CPU_UP_CANCELED_FROZEN)
+ microcode_fini_cpu(cpu);
+
return NOTIFY_OK;
}
@@ -504,20 +520,47 @@ static struct notifier_block __refdata mc_cpu_notifier = {
.notifier_call = mc_cpu_callback,
};
+#ifdef MODULE
+/* Autoload on Intel and AMD systems */
+static const struct x86_cpu_id __initconst microcode_id[] = {
+#ifdef CONFIG_MICROCODE_INTEL
+ { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
+#endif
+#ifdef CONFIG_MICROCODE_AMD
+ { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, },
+#endif
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, microcode_id);
+#endif
+
+static struct attribute *cpu_root_microcode_attrs[] = {
+ &dev_attr_reload.attr,
+ NULL
+};
+
+static struct attribute_group cpu_root_microcode_group = {
+ .name = "microcode",
+ .attrs = cpu_root_microcode_attrs,
+};
+
static int __init microcode_init(void)
{
struct cpuinfo_x86 *c = &cpu_data(0);
int error;
+ if (dis_ucode_ldr)
+ return 0;
+
if (c->x86_vendor == X86_VENDOR_INTEL)
microcode_ops = init_intel_microcode();
else if (c->x86_vendor == X86_VENDOR_AMD)
microcode_ops = init_amd_microcode();
-
- if (!microcode_ops) {
+ else
pr_err("no support for this CPU vendor\n");
+
+ if (!microcode_ops)
return -ENODEV;
- }
microcode_pdev = platform_device_register_simple("microcode", -1,
NULL, 0);
@@ -528,16 +571,25 @@ static int __init microcode_init(void)
mutex_lock(&microcode_mutex);
error = subsys_interface_register(&mc_cpu_interface);
-
+ if (!error)
+ perf_check_microcode();
mutex_unlock(&microcode_mutex);
put_online_cpus();
if (error)
goto out_pdev;
+ error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
+ &cpu_root_microcode_group);
+
+ if (error) {
+ pr_err("Error creating microcode group!\n");
+ goto out_driver;
+ }
+
error = microcode_dev_init();
if (error)
- goto out_driver;
+ goto out_ucode_group;
register_syscore_ops(&mc_syscore_ops);
register_hotcpu_notifier(&mc_cpu_notifier);
@@ -547,7 +599,11 @@ static int __init microcode_init(void)
return 0;
-out_driver:
+ out_ucode_group:
+ sysfs_remove_group(&cpu_subsys.dev_root->kobj,
+ &cpu_root_microcode_group);
+
+ out_driver:
get_online_cpus();
mutex_lock(&microcode_mutex);
@@ -556,7 +612,7 @@ out_driver:
mutex_unlock(&microcode_mutex);
put_online_cpus();
-out_pdev:
+ out_pdev:
platform_device_unregister(microcode_pdev);
return error;
@@ -572,6 +628,9 @@ static void __exit microcode_exit(void)
unregister_hotcpu_notifier(&mc_cpu_notifier);
unregister_syscore_ops(&mc_syscore_ops);
+ sysfs_remove_group(&cpu_subsys.dev_root->kobj,
+ &cpu_root_microcode_group);
+
get_online_cpus();
mutex_lock(&microcode_mutex);
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
new file mode 100644
index 00000000000..5f28a64e71e
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -0,0 +1,178 @@
+/*
+ * X86 CPU microcode early update for Linux
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ *
+ * This driver allows to early upgrade microcode on Intel processors
+ * belonging to IA-32 family - PentiumPro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
+ *
+ * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
+ * Software Developer's Manual.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <asm/microcode.h>
+#include <asm/microcode_intel.h>
+#include <asm/microcode_amd.h>
+#include <asm/processor.h>
+#include <asm/cmdline.h>
+
+#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
+#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
+#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
+#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
+#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
+#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
+#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
+
+#define CPUID_IS(a, b, c, ebx, ecx, edx) \
+ (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
+
+/*
+ * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
+ * x86_vendor() gets vendor id for BSP.
+ *
+ * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
+ * coding, we still use x86_vendor() to get vendor id for AP.
+ *
+ * x86_vendor() gets vendor information directly through cpuid.
+ */
+static int x86_vendor(void)
+{
+ u32 eax = 0x00000000;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
+ return X86_VENDOR_INTEL;
+
+ if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
+ return X86_VENDOR_AMD;
+
+ return X86_VENDOR_UNKNOWN;
+}
+
+static int x86_family(void)
+{
+ u32 eax = 0x00000001;
+ u32 ebx, ecx = 0, edx;
+ int x86;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ x86 = (eax >> 8) & 0xf;
+ if (x86 == 15)
+ x86 += (eax >> 20) & 0xff;
+
+ return x86;
+}
+
+static bool __init check_loader_disabled_bsp(void)
+{
+#ifdef CONFIG_X86_32
+ const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
+ const char *opt = "dis_ucode_ldr";
+ const char *option = (const char *)__pa_nodebug(opt);
+ bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
+
+#else /* CONFIG_X86_64 */
+ const char *cmdline = boot_command_line;
+ const char *option = "dis_ucode_ldr";
+ bool *res = &dis_ucode_ldr;
+#endif
+
+ if (cmdline_find_option_bool(cmdline, option))
+ *res = true;
+
+ return *res;
+}
+
+void __init load_ucode_bsp(void)
+{
+ int vendor, x86;
+
+ if (check_loader_disabled_bsp())
+ return;
+
+ if (!have_cpuid_p())
+ return;
+
+ vendor = x86_vendor();
+ x86 = x86_family();
+
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if (x86 >= 6)
+ load_ucode_intel_bsp();
+ break;
+ case X86_VENDOR_AMD:
+ if (x86 >= 0x10)
+ load_ucode_amd_bsp();
+ break;
+ default:
+ break;
+ }
+}
+
+static bool check_loader_disabled_ap(void)
+{
+#ifdef CONFIG_X86_32
+ return __pa_nodebug(dis_ucode_ldr);
+#else
+ return dis_ucode_ldr;
+#endif
+}
+
+void load_ucode_ap(void)
+{
+ int vendor, x86;
+
+ if (check_loader_disabled_ap())
+ return;
+
+ if (!have_cpuid_p())
+ return;
+
+ vendor = x86_vendor();
+ x86 = x86_family();
+
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if (x86 >= 6)
+ load_ucode_intel_ap();
+ break;
+ case X86_VENDOR_AMD:
+ if (x86 >= 0x10)
+ load_ucode_amd_ap();
+ break;
+ default:
+ break;
+ }
+}
+
+int __init save_microcode_in_initrd(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (c->x86 >= 6)
+ save_microcode_in_initrd_intel();
+ break;
+ case X86_VENDOR_AMD:
+ if (c->x86 >= 0x10)
+ save_microcode_in_initrd_amd();
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 3ca42d0e43a..a276fa75d9b 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -79,7 +79,7 @@
#include <linux/module.h>
#include <linux/vmalloc.h>
-#include <asm/microcode.h>
+#include <asm/microcode_intel.h>
#include <asm/processor.h>
#include <asm/msr.h>
@@ -87,59 +87,6 @@ MODULE_DESCRIPTION("Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
MODULE_LICENSE("GPL");
-struct microcode_header_intel {
- unsigned int hdrver;
- unsigned int rev;
- unsigned int date;
- unsigned int sig;
- unsigned int cksum;
- unsigned int ldrver;
- unsigned int pf;
- unsigned int datasize;
- unsigned int totalsize;
- unsigned int reserved[3];
-};
-
-struct microcode_intel {
- struct microcode_header_intel hdr;
- unsigned int bits[0];
-};
-
-/* microcode format is extended from prescott processors */
-struct extended_signature {
- unsigned int sig;
- unsigned int pf;
- unsigned int cksum;
-};
-
-struct extended_sigtable {
- unsigned int count;
- unsigned int cksum;
- unsigned int reserved[3];
- struct extended_signature sigs[0];
-};
-
-#define DEFAULT_UCODE_DATASIZE (2000)
-#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
-#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
-#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
-#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
-#define DWSIZE (sizeof(u32))
-
-#define get_totalsize(mc) \
- (((struct microcode_intel *)mc)->hdr.totalsize ? \
- ((struct microcode_intel *)mc)->hdr.totalsize : \
- DEFAULT_UCODE_TOTALSIZE)
-
-#define get_datasize(mc) \
- (((struct microcode_intel *)mc)->hdr.datasize ? \
- ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
-
-#define sigmatch(s1, s2, p1, p2) \
- (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
-
-#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu_num);
@@ -147,12 +94,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
memset(csig, 0, sizeof(*csig));
- if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
- cpu_has(c, X86_FEATURE_IA64)) {
- pr_err("CPU%d not a capable Intel processor\n", cpu_num);
- return -1;
- }
-
csig->sig = cpuid_eax(0x00000001);
if ((c->x86_model >= 5) || (c->x86 > 6)) {
@@ -168,128 +109,25 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
return 0;
}
-static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
-{
- return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
-}
-
-static inline int
-update_match_revision(struct microcode_header_intel *mc_header, int rev)
-{
- return (mc_header->rev <= rev) ? 0 : 1;
-}
-
-static int microcode_sanity_check(void *mc)
-{
- unsigned long total_size, data_size, ext_table_size;
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header = NULL;
- int sum, orig_sum, ext_sigcount = 0, i;
- struct extended_signature *ext_sig;
-
- total_size = get_totalsize(mc_header);
- data_size = get_datasize(mc_header);
-
- if (data_size + MC_HEADER_SIZE > total_size) {
- pr_err("error! Bad data size in microcode data file\n");
- return -EINVAL;
- }
-
- if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
- pr_err("error! Unknown microcode update format\n");
- return -EINVAL;
- }
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- pr_err("error! Small exttable size in microcode data file\n");
- return -EINVAL;
- }
- ext_header = mc + MC_HEADER_SIZE + data_size;
- if (ext_table_size != exttable_size(ext_header)) {
- pr_err("error! Bad exttable size in microcode data file\n");
- return -EFAULT;
- }
- ext_sigcount = ext_header->count;
- }
-
- /* check extended table checksum */
- if (ext_table_size) {
- int ext_table_sum = 0;
- int *ext_tablep = (int *)ext_header;
-
- i = ext_table_size / DWSIZE;
- while (i--)
- ext_table_sum += ext_tablep[i];
- if (ext_table_sum) {
- pr_warning("aborting, bad extended signature table checksum\n");
- return -EINVAL;
- }
- }
-
- /* calculate the checksum */
- orig_sum = 0;
- i = (MC_HEADER_SIZE + data_size) / DWSIZE;
- while (i--)
- orig_sum += ((int *)mc)[i];
- if (orig_sum) {
- pr_err("aborting, bad checksum\n");
- return -EINVAL;
- }
- if (!ext_table_size)
- return 0;
- /* check extended signature checksum */
- for (i = 0; i < ext_sigcount; i++) {
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
- EXT_SIGNATURE_SIZE * i;
- sum = orig_sum
- - (mc_header->sig + mc_header->pf + mc_header->cksum)
- + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
- if (sum) {
- pr_err("aborting, bad checksum\n");
- return -EINVAL;
- }
- }
- return 0;
-}
-
/*
* return 0 - no update found
* return 1 - found update
*/
-static int
-get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
+static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
{
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header;
- unsigned long total_size = get_totalsize(mc_header);
- int ext_sigcount, i;
- struct extended_signature *ext_sig;
+ struct cpu_signature cpu_sig;
+ unsigned int csig, cpf, crev;
- if (!update_match_revision(mc_header, rev))
- return 0;
-
- if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
- return 1;
-
- /* Look for ext. headers: */
- if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
- return 0;
+ collect_cpu_info(cpu, &cpu_sig);
- ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
- ext_sigcount = ext_header->count;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+ csig = cpu_sig.sig;
+ cpf = cpu_sig.pf;
+ crev = cpu_sig.rev;
- for (i = 0; i < ext_sigcount; i++) {
- if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
- return 1;
- ext_sig++;
- }
- return 0;
+ return get_matching_microcode(csig, cpf, mc_intel, crev);
}
-static int apply_microcode(int cpu)
+int apply_microcode(int cpu)
{
struct microcode_intel *mc_intel;
struct ucode_cpu_info *uci;
@@ -306,6 +144,14 @@ static int apply_microcode(int cpu)
if (mc_intel == NULL)
return 0;
+ /*
+ * Microcode on this CPU could be updated earlier. Only apply the
+ * microcode patch in mc_intel when it is newer than the one on this
+ * CPU.
+ */
+ if (get_matching_mc(mc_intel, cpu) == 0)
+ return 0;
+
/* write microcode via MSR 0x79 */
wrmsr(MSR_IA32_UCODE_WRITE,
(unsigned long) mc_intel->bits,
@@ -344,6 +190,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
unsigned int leftover = size;
enum ucode_state state = UCODE_OK;
unsigned int curr_mc_size = 0;
+ unsigned int csig, cpf;
while (leftover) {
struct microcode_header_intel mc_header;
@@ -368,11 +215,13 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
}
if (get_ucode_data(mc, ucode_ptr, mc_size) ||
- microcode_sanity_check(mc) < 0) {
+ microcode_sanity_check(mc, 1) < 0) {
break;
}
- if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
+ csig = uci->cpu_sig.sig;
+ cpf = uci->cpu_sig.pf;
+ if (get_matching_microcode(csig, cpf, mc, new_rev)) {
vfree(new_mc);
new_rev = mc_header.rev;
new_mc = mc;
@@ -399,6 +248,13 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
vfree(uci->mc);
uci->mc = (struct microcode_intel *)new_mc;
+ /*
+ * If early loading microcode is supported, save this mc into
+ * permanent memory. So it will be loaded early when a CPU is hot added
+ * or resumes.
+ */
+ save_mc_for_early(new_mc);
+
pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
out:
@@ -411,7 +267,8 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
return 0;
}
-static enum ucode_state request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device,
+ bool refresh_fw)
{
char name[30];
struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -421,7 +278,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
sprintf(name, "intel-ucode/%02x-%02x-%02x",
c->x86, c->x86_model, c->x86_mask);
- if (request_firmware(&firmware, name, device)) {
+ if (request_firmware_direct(&firmware, name, device)) {
pr_debug("data file %s load failed\n", name);
return UCODE_NFOUND;
}
@@ -463,6 +320,14 @@ static struct microcode_ops microcode_intel_ops = {
struct microcode_ops * __init init_intel_microcode(void)
{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+
+ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+ cpu_has(c, X86_FEATURE_IA64)) {
+ pr_err("Intel CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+
return &microcode_intel_ops;
}
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
new file mode 100644
index 00000000000..18f739129e7
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -0,0 +1,787 @@
+/*
+ * Intel CPU microcode early update for Linux
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ *
+ * This allows to early upgrade microcode on Intel processors
+ * belonging to IA-32 family - PentiumPro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
+ *
+ * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
+ * Software Developer's Manual.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/earlycpio.h>
+#include <linux/initrd.h>
+#include <linux/cpu.h>
+#include <asm/msr.h>
+#include <asm/microcode_intel.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
+
+unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
+struct mc_saved_data {
+ unsigned int mc_saved_count;
+ struct microcode_intel **mc_saved;
+} mc_saved_data;
+
+static enum ucode_state
+generic_load_microcode_early(struct microcode_intel **mc_saved_p,
+ unsigned int mc_saved_count,
+ struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *ucode_ptr, *new_mc = NULL;
+ int new_rev = uci->cpu_sig.rev;
+ enum ucode_state state = UCODE_OK;
+ unsigned int mc_size;
+ struct microcode_header_intel *mc_header;
+ unsigned int csig = uci->cpu_sig.sig;
+ unsigned int cpf = uci->cpu_sig.pf;
+ int i;
+
+ for (i = 0; i < mc_saved_count; i++) {
+ ucode_ptr = mc_saved_p[i];
+
+ mc_header = (struct microcode_header_intel *)ucode_ptr;
+ mc_size = get_totalsize(mc_header);
+ if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
+ new_rev = mc_header->rev;
+ new_mc = ucode_ptr;
+ }
+ }
+
+ if (!new_mc) {
+ state = UCODE_NFOUND;
+ goto out;
+ }
+
+ uci->mc = (struct microcode_intel *)new_mc;
+out:
+ return state;
+}
+
+static void
+microcode_pointer(struct microcode_intel **mc_saved,
+ unsigned long *mc_saved_in_initrd,
+ unsigned long initrd_start, int mc_saved_count)
+{
+ int i;
+
+ for (i = 0; i < mc_saved_count; i++)
+ mc_saved[i] = (struct microcode_intel *)
+ (mc_saved_in_initrd[i] + initrd_start);
+}
+
+#ifdef CONFIG_X86_32
+static void
+microcode_phys(struct microcode_intel **mc_saved_tmp,
+ struct mc_saved_data *mc_saved_data)
+{
+ int i;
+ struct microcode_intel ***mc_saved;
+
+ mc_saved = (struct microcode_intel ***)
+ __pa_nodebug(&mc_saved_data->mc_saved);
+ for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
+ struct microcode_intel *p;
+
+ p = *(struct microcode_intel **)
+ __pa_nodebug(mc_saved_data->mc_saved + i);
+ mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
+ }
+}
+#endif
+
+static enum ucode_state
+load_microcode(struct mc_saved_data *mc_saved_data,
+ unsigned long *mc_saved_in_initrd,
+ unsigned long initrd_start,
+ struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+ unsigned int count = mc_saved_data->mc_saved_count;
+
+ if (!mc_saved_data->mc_saved) {
+ microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
+ initrd_start, count);
+
+ return generic_load_microcode_early(mc_saved_tmp, count, uci);
+ } else {
+#ifdef CONFIG_X86_32
+ microcode_phys(mc_saved_tmp, mc_saved_data);
+ return generic_load_microcode_early(mc_saved_tmp, count, uci);
+#else
+ return generic_load_microcode_early(mc_saved_data->mc_saved,
+ count, uci);
+#endif
+ }
+}
+
+static u8 get_x86_family(unsigned long sig)
+{
+ u8 x86;
+
+ x86 = (sig >> 8) & 0xf;
+
+ if (x86 == 0xf)
+ x86 += (sig >> 20) & 0xff;
+
+ return x86;
+}
+
+static u8 get_x86_model(unsigned long sig)
+{
+ u8 x86, x86_model;
+
+ x86 = get_x86_family(sig);
+ x86_model = (sig >> 4) & 0xf;
+
+ if (x86 == 0x6 || x86 == 0xf)
+ x86_model += ((sig >> 16) & 0xf) << 4;
+
+ return x86_model;
+}
+
+/*
+ * Given CPU signature and a microcode patch, this function finds if the
+ * microcode patch has matching family and model with the CPU.
+ */
+static enum ucode_state
+matching_model_microcode(struct microcode_header_intel *mc_header,
+ unsigned long sig)
+{
+ u8 x86, x86_model;
+ u8 x86_ucode, x86_model_ucode;
+ struct extended_sigtable *ext_header;
+ unsigned long total_size = get_totalsize(mc_header);
+ unsigned long data_size = get_datasize(mc_header);
+ int ext_sigcount, i;
+ struct extended_signature *ext_sig;
+
+ x86 = get_x86_family(sig);
+ x86_model = get_x86_model(sig);
+
+ x86_ucode = get_x86_family(mc_header->sig);
+ x86_model_ucode = get_x86_model(mc_header->sig);
+
+ if (x86 == x86_ucode && x86_model == x86_model_ucode)
+ return UCODE_OK;
+
+ /* Look for ext. headers: */
+ if (total_size <= data_size + MC_HEADER_SIZE)
+ return UCODE_NFOUND;
+
+ ext_header = (struct extended_sigtable *)
+ mc_header + data_size + MC_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+ for (i = 0; i < ext_sigcount; i++) {
+ x86_ucode = get_x86_family(ext_sig->sig);
+ x86_model_ucode = get_x86_model(ext_sig->sig);
+
+ if (x86 == x86_ucode && x86_model == x86_model_ucode)
+ return UCODE_OK;
+
+ ext_sig++;
+ }
+
+ return UCODE_NFOUND;
+}
+
+static int
+save_microcode(struct mc_saved_data *mc_saved_data,
+ struct microcode_intel **mc_saved_src,
+ unsigned int mc_saved_count)
+{
+ int i, j;
+ struct microcode_intel **mc_saved_p;
+ int ret;
+
+ if (!mc_saved_count)
+ return -EINVAL;
+
+ /*
+ * Copy new microcode data.
+ */
+ mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *),
+ GFP_KERNEL);
+ if (!mc_saved_p)
+ return -ENOMEM;
+
+ for (i = 0; i < mc_saved_count; i++) {
+ struct microcode_intel *mc = mc_saved_src[i];
+ struct microcode_header_intel *mc_header = &mc->hdr;
+ unsigned long mc_size = get_totalsize(mc_header);
+ mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL);
+ if (!mc_saved_p[i]) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ if (!mc_saved_src[i]) {
+ ret = -EINVAL;
+ goto err;
+ }
+ memcpy(mc_saved_p[i], mc, mc_size);
+ }
+
+ /*
+ * Point to newly saved microcode.
+ */
+ mc_saved_data->mc_saved = mc_saved_p;
+ mc_saved_data->mc_saved_count = mc_saved_count;
+
+ return 0;
+
+err:
+ for (j = 0; j <= i; j++)
+ kfree(mc_saved_p[j]);
+ kfree(mc_saved_p);
+
+ return ret;
+}
+
+/*
+ * A microcode patch in ucode_ptr is saved into mc_saved
+ * - if it has matching signature and newer revision compared to an existing
+ * patch mc_saved.
+ * - or if it is a newly discovered microcode patch.
+ *
+ * The microcode patch should have matching model with CPU.
+ */
+static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
+ unsigned int *mc_saved_count_p)
+{
+ int i;
+ int found = 0;
+ unsigned int mc_saved_count = *mc_saved_count_p;
+ struct microcode_header_intel *mc_header;
+
+ mc_header = (struct microcode_header_intel *)ucode_ptr;
+ for (i = 0; i < mc_saved_count; i++) {
+ unsigned int sig, pf;
+ unsigned int new_rev;
+ struct microcode_header_intel *mc_saved_header =
+ (struct microcode_header_intel *)mc_saved[i];
+ sig = mc_saved_header->sig;
+ pf = mc_saved_header->pf;
+ new_rev = mc_header->rev;
+
+ if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
+ found = 1;
+ if (update_match_revision(mc_header, new_rev)) {
+ /*
+ * Found an older ucode saved before.
+ * Replace the older one with this newer
+ * one.
+ */
+ mc_saved[i] =
+ (struct microcode_intel *)ucode_ptr;
+ break;
+ }
+ }
+ }
+ if (i >= mc_saved_count && !found)
+ /*
+ * This ucode is first time discovered in ucode file.
+ * Save it to memory.
+ */
+ mc_saved[mc_saved_count++] =
+ (struct microcode_intel *)ucode_ptr;
+
+ *mc_saved_count_p = mc_saved_count;
+}
+
+/*
+ * Get microcode matching with BSP's model. Only CPUs with the same model as
+ * BSP can stay in the platform.
+ */
+static enum ucode_state __init
+get_matching_model_microcode(int cpu, unsigned long start,
+ void *data, size_t size,
+ struct mc_saved_data *mc_saved_data,
+ unsigned long *mc_saved_in_initrd,
+ struct ucode_cpu_info *uci)
+{
+ u8 *ucode_ptr = data;
+ unsigned int leftover = size;
+ enum ucode_state state = UCODE_OK;
+ unsigned int mc_size;
+ struct microcode_header_intel *mc_header;
+ struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+ unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
+ int i;
+
+ while (leftover) {
+ mc_header = (struct microcode_header_intel *)ucode_ptr;
+
+ mc_size = get_totalsize(mc_header);
+ if (!mc_size || mc_size > leftover ||
+ microcode_sanity_check(ucode_ptr, 0) < 0)
+ break;
+
+ leftover -= mc_size;
+
+ /*
+ * Since APs with same family and model as the BSP may boot in
+ * the platform, we need to find and save microcode patches
+ * with the same family and model as the BSP.
+ */
+ if (matching_model_microcode(mc_header, uci->cpu_sig.sig) !=
+ UCODE_OK) {
+ ucode_ptr += mc_size;
+ continue;
+ }
+
+ _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count);
+
+ ucode_ptr += mc_size;
+ }
+
+ if (leftover) {
+ state = UCODE_ERROR;
+ goto out;
+ }
+
+ if (mc_saved_count == 0) {
+ state = UCODE_NFOUND;
+ goto out;
+ }
+
+ for (i = 0; i < mc_saved_count; i++)
+ mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start;
+
+ mc_saved_data->mc_saved_count = mc_saved_count;
+out:
+ return state;
+}
+
+static int collect_cpu_info_early(struct ucode_cpu_info *uci)
+{
+ unsigned int val[2];
+ u8 x86, x86_model;
+ struct cpu_signature csig;
+ unsigned int eax, ebx, ecx, edx;
+
+ csig.sig = 0;
+ csig.pf = 0;
+ csig.rev = 0;
+
+ memset(uci, 0, sizeof(*uci));
+
+ eax = 0x00000001;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ csig.sig = eax;
+
+ x86 = get_x86_family(csig.sig);
+ x86_model = get_x86_model(csig.sig);
+
+ if ((x86_model >= 5) || (x86 > 6)) {
+ /* get processor flags from MSR 0x17 */
+ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ csig.pf = 1 << ((val[1] >> 18) & 7);
+ }
+ native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+ /* As documented in the SDM: Do a CPUID 1 here */
+ sync_core();
+
+ /* get the current revision from MSR 0x8B */
+ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+
+ csig.rev = val[1];
+
+ uci->cpu_sig = csig;
+ uci->valid = 1;
+
+ return 0;
+}
+
+#ifdef DEBUG
+static void __ref show_saved_mc(void)
+{
+ int i, j;
+ unsigned int sig, pf, rev, total_size, data_size, date;
+ struct ucode_cpu_info uci;
+
+ if (mc_saved_data.mc_saved_count == 0) {
+ pr_debug("no micorcode data saved.\n");
+ return;
+ }
+ pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
+
+ collect_cpu_info_early(&uci);
+
+ sig = uci.cpu_sig.sig;
+ pf = uci.cpu_sig.pf;
+ rev = uci.cpu_sig.rev;
+ pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n",
+ smp_processor_id(), sig, pf, rev);
+
+ for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
+ struct microcode_header_intel *mc_saved_header;
+ struct extended_sigtable *ext_header;
+ int ext_sigcount;
+ struct extended_signature *ext_sig;
+
+ mc_saved_header = (struct microcode_header_intel *)
+ mc_saved_data.mc_saved[i];
+ sig = mc_saved_header->sig;
+ pf = mc_saved_header->pf;
+ rev = mc_saved_header->rev;
+ total_size = get_totalsize(mc_saved_header);
+ data_size = get_datasize(mc_saved_header);
+ date = mc_saved_header->date;
+
+ pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n",
+ i, sig, pf, rev, total_size,
+ date & 0xffff,
+ date >> 24,
+ (date >> 16) & 0xff);
+
+ /* Look for ext. headers: */
+ if (total_size <= data_size + MC_HEADER_SIZE)
+ continue;
+
+ ext_header = (struct extended_sigtable *)
+ mc_saved_header + data_size + MC_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+ for (j = 0; j < ext_sigcount; j++) {
+ sig = ext_sig->sig;
+ pf = ext_sig->pf;
+
+ pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
+ j, sig, pf);
+
+ ext_sig++;
+ }
+
+ }
+}
+#else
+static inline void show_saved_mc(void)
+{
+}
+#endif
+
+#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU)
+static DEFINE_MUTEX(x86_cpu_microcode_mutex);
+/*
+ * Save this mc into mc_saved_data. So it will be loaded early when a CPU is
+ * hot added or resumes.
+ *
+ * Please make sure this mc should be a valid microcode patch before calling
+ * this function.
+ */
+int save_mc_for_early(u8 *mc)
+{
+ struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+ unsigned int mc_saved_count_init;
+ unsigned int mc_saved_count;
+ struct microcode_intel **mc_saved;
+ int ret = 0;
+ int i;
+
+ /*
+ * Hold hotplug lock so mc_saved_data is not accessed by a CPU in
+ * hotplug.
+ */
+ mutex_lock(&x86_cpu_microcode_mutex);
+
+ mc_saved_count_init = mc_saved_data.mc_saved_count;
+ mc_saved_count = mc_saved_data.mc_saved_count;
+ mc_saved = mc_saved_data.mc_saved;
+
+ if (mc_saved && mc_saved_count)
+ memcpy(mc_saved_tmp, mc_saved,
+ mc_saved_count * sizeof(struct mirocode_intel *));
+ /*
+ * Save the microcode patch mc in mc_save_tmp structure if it's a newer
+ * version.
+ */
+
+ _save_mc(mc_saved_tmp, mc, &mc_saved_count);
+
+ /*
+ * Save the mc_save_tmp in global mc_saved_data.
+ */
+ ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count);
+ if (ret) {
+ pr_err("Cannot save microcode patch.\n");
+ goto out;
+ }
+
+ show_saved_mc();
+
+ /*
+ * Free old saved microcod data.
+ */
+ if (mc_saved) {
+ for (i = 0; i < mc_saved_count_init; i++)
+ kfree(mc_saved[i]);
+ kfree(mc_saved);
+ }
+
+out:
+ mutex_unlock(&x86_cpu_microcode_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(save_mc_for_early);
+#endif
+
+static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
+static __init enum ucode_state
+scan_microcode(unsigned long start, unsigned long end,
+ struct mc_saved_data *mc_saved_data,
+ unsigned long *mc_saved_in_initrd,
+ struct ucode_cpu_info *uci)
+{
+ unsigned int size = end - start + 1;
+ struct cpio_data cd;
+ long offset = 0;
+#ifdef CONFIG_X86_32
+ char *p = (char *)__pa_nodebug(ucode_name);
+#else
+ char *p = ucode_name;
+#endif
+
+ cd.data = NULL;
+ cd.size = 0;
+
+ cd = find_cpio_data(p, (void *)start, size, &offset);
+ if (!cd.data)
+ return UCODE_ERROR;
+
+
+ return get_matching_model_microcode(0, start, cd.data, cd.size,
+ mc_saved_data, mc_saved_in_initrd,
+ uci);
+}
+
+/*
+ * Print ucode update info.
+ */
+static void
+print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
+{
+ int cpu = smp_processor_id();
+
+ pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
+ cpu,
+ uci->cpu_sig.rev,
+ date & 0xffff,
+ date >> 24,
+ (date >> 16) & 0xff);
+}
+
+#ifdef CONFIG_X86_32
+
+static int delay_ucode_info;
+static int current_mc_date;
+
+/*
+ * Print early updated ucode info after printk works. This is delayed info dump.
+ */
+void show_ucode_info_early(void)
+{
+ struct ucode_cpu_info uci;
+
+ if (delay_ucode_info) {
+ collect_cpu_info_early(&uci);
+ print_ucode_info(&uci, current_mc_date);
+ delay_ucode_info = 0;
+ }
+}
+
+/*
+ * At this point, we can not call printk() yet. Keep microcode patch number in
+ * mc_saved_data.mc_saved and delay printing microcode info in
+ * show_ucode_info_early() until printk() works.
+ */
+static void print_ucode(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc_intel;
+ int *delay_ucode_info_p;
+ int *current_mc_date_p;
+
+ mc_intel = uci->mc;
+ if (mc_intel == NULL)
+ return;
+
+ delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
+ current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
+
+ *delay_ucode_info_p = 1;
+ *current_mc_date_p = mc_intel->hdr.date;
+}
+#else
+
+/*
+ * Flush global tlb. We only do this in x86_64 where paging has been enabled
+ * already and PGE should be enabled as well.
+ */
+static inline void flush_tlb_early(void)
+{
+ __native_flush_tlb_global_irq_disabled();
+}
+
+static inline void print_ucode(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc_intel;
+
+ mc_intel = uci->mc;
+ if (mc_intel == NULL)
+ return;
+
+ print_ucode_info(uci, mc_intel->hdr.date);
+}
+#endif
+
+static int apply_microcode_early(struct mc_saved_data *mc_saved_data,
+ struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc_intel;
+ unsigned int val[2];
+
+ mc_intel = uci->mc;
+ if (mc_intel == NULL)
+ return 0;
+
+ /* write microcode via MSR 0x79 */
+ native_wrmsr(MSR_IA32_UCODE_WRITE,
+ (unsigned long) mc_intel->bits,
+ (unsigned long) mc_intel->bits >> 16 >> 16);
+ native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+ /* As documented in the SDM: Do a CPUID 1 here */
+ sync_core();
+
+ /* get the current revision from MSR 0x8B */
+ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+ if (val[1] != mc_intel->hdr.rev)
+ return -1;
+
+#ifdef CONFIG_X86_64
+ /* Flush global tlb. This is precaution. */
+ flush_tlb_early();
+#endif
+ uci->cpu_sig.rev = val[1];
+
+ print_ucode(uci);
+
+ return 0;
+}
+
+/*
+ * This function converts microcode patch offsets previously stored in
+ * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
+ */
+int __init save_microcode_in_initrd_intel(void)
+{
+ unsigned int count = mc_saved_data.mc_saved_count;
+ struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
+ int ret = 0;
+
+ if (count == 0)
+ return ret;
+
+ microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count);
+ ret = save_microcode(&mc_saved_data, mc_saved, count);
+ if (ret)
+ pr_err("Cannot save microcode patches from initrd.\n");
+
+ show_saved_mc();
+
+ return ret;
+}
+
+static void __init
+_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
+ unsigned long *mc_saved_in_initrd,
+ unsigned long initrd_start_early,
+ unsigned long initrd_end_early,
+ struct ucode_cpu_info *uci)
+{
+ collect_cpu_info_early(uci);
+ scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
+ mc_saved_in_initrd, uci);
+ load_microcode(mc_saved_data, mc_saved_in_initrd,
+ initrd_start_early, uci);
+ apply_microcode_early(mc_saved_data, uci);
+}
+
+void __init
+load_ucode_intel_bsp(void)
+{
+ u64 ramdisk_image, ramdisk_size;
+ unsigned long initrd_start_early, initrd_end_early;
+ struct ucode_cpu_info uci;
+#ifdef CONFIG_X86_32
+ struct boot_params *boot_params_p;
+
+ boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params);
+ ramdisk_image = boot_params_p->hdr.ramdisk_image;
+ ramdisk_size = boot_params_p->hdr.ramdisk_size;
+ initrd_start_early = ramdisk_image;
+ initrd_end_early = initrd_start_early + ramdisk_size;
+
+ _load_ucode_intel_bsp(
+ (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
+ (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
+ initrd_start_early, initrd_end_early, &uci);
+#else
+ ramdisk_image = boot_params.hdr.ramdisk_image;
+ ramdisk_size = boot_params.hdr.ramdisk_size;
+ initrd_start_early = ramdisk_image + PAGE_OFFSET;
+ initrd_end_early = initrd_start_early + ramdisk_size;
+
+ _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
+ initrd_start_early, initrd_end_early, &uci);
+#endif
+}
+
+void load_ucode_intel_ap(void)
+{
+ struct mc_saved_data *mc_saved_data_p;
+ struct ucode_cpu_info uci;
+ unsigned long *mc_saved_in_initrd_p;
+ unsigned long initrd_start_addr;
+#ifdef CONFIG_X86_32
+ unsigned long *initrd_start_p;
+
+ mc_saved_in_initrd_p =
+ (unsigned long *)__pa_nodebug(mc_saved_in_initrd);
+ mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
+ initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
+ initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p);
+#else
+ mc_saved_data_p = &mc_saved_data;
+ mc_saved_in_initrd_p = mc_saved_in_initrd;
+ initrd_start_addr = initrd_start;
+#endif
+
+ /*
+ * If there is no valid ucode previously saved in memory, no need to
+ * update ucode on this AP.
+ */
+ if (mc_saved_data_p->mc_saved_count == 0)
+ return;
+
+ collect_cpu_info_early(&uci);
+ load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
+ initrd_start_addr, &uci);
+ apply_microcode_early(mc_saved_data_p, &uci);
+}
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
new file mode 100644
index 00000000000..ce69320d017
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -0,0 +1,174 @@
+/*
+ * Intel CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ *
+ * This driver allows to upgrade microcode on Intel processors
+ * belonging to IA-32 family - PentiumPro, Pentium II,
+ * Pentium III, Xeon, Pentium 4, etc.
+ *
+ * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
+ * Software Developer's Manual
+ * Order Number 253668 or free download from:
+ *
+ * http://developer.intel.com/Assets/PDF/manual/253668.pdf
+ *
+ * For more information, go to http://www.urbanmyth.org/microcode
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+#include <linux/firmware.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/microcode_intel.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+
+static inline int
+update_match_cpu(unsigned int csig, unsigned int cpf,
+ unsigned int sig, unsigned int pf)
+{
+ return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
+}
+
+int
+update_match_revision(struct microcode_header_intel *mc_header, int rev)
+{
+ return (mc_header->rev <= rev) ? 0 : 1;
+}
+
+int microcode_sanity_check(void *mc, int print_err)
+{
+ unsigned long total_size, data_size, ext_table_size;
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ int sum, orig_sum, ext_sigcount = 0, i;
+ struct extended_signature *ext_sig;
+
+ total_size = get_totalsize(mc_header);
+ data_size = get_datasize(mc_header);
+
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ if (print_err)
+ pr_err("error! Bad data size in microcode data file\n");
+ return -EINVAL;
+ }
+
+ if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+ if (print_err)
+ pr_err("error! Unknown microcode update format\n");
+ return -EINVAL;
+ }
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ if ((ext_table_size < EXT_HEADER_SIZE)
+ || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ if (print_err)
+ pr_err("error! Small exttable size in microcode data file\n");
+ return -EINVAL;
+ }
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ if (print_err)
+ pr_err("error! Bad exttable size in microcode data file\n");
+ return -EFAULT;
+ }
+ ext_sigcount = ext_header->count;
+ }
+
+ /* check extended table checksum */
+ if (ext_table_size) {
+ int ext_table_sum = 0;
+ int *ext_tablep = (int *)ext_header;
+
+ i = ext_table_size / DWSIZE;
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+ if (ext_table_sum) {
+ if (print_err)
+ pr_warn("aborting, bad extended signature table checksum\n");
+ return -EINVAL;
+ }
+ }
+
+ /* calculate the checksum */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+ while (i--)
+ orig_sum += ((int *)mc)[i];
+ if (orig_sum) {
+ if (print_err)
+ pr_err("aborting, bad checksum\n");
+ return -EINVAL;
+ }
+ if (!ext_table_size)
+ return 0;
+ /* check extended signature checksum */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+ sum = orig_sum
+ - (mc_header->sig + mc_header->pf + mc_header->cksum)
+ + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ if (print_err)
+ pr_err("aborting, bad checksum\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(microcode_sanity_check);
+
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ */
+int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
+{
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header;
+ unsigned long total_size = get_totalsize(mc_header);
+ int ext_sigcount, i;
+ struct extended_signature *ext_sig;
+
+ if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf))
+ return 1;
+
+ /* Look for ext. headers: */
+ if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+ return 0;
+
+ ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+ for (i = 0; i < ext_sigcount; i++) {
+ if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf))
+ return 1;
+ ext_sig++;
+ }
+ return 0;
+}
+
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ */
+int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
+{
+ struct microcode_header_intel *mc_header = mc;
+
+ if (!update_match_revision(mc_header, rev))
+ return 0;
+
+ return get_matching_sig(csig, cpf, mc, rev);
+}
+EXPORT_SYMBOL_GPL(get_matching_microcode);
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
deleted file mode 100644
index dfea390e160..00000000000
--- a/arch/x86/kernel/cpu/mkcapflags.pl
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/perl
-#
-# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
-#
-
-($in, $out) = @ARGV;
-
-open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n";
-open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
-
-print OUT "#include <asm/cpufeature.h>\n\n";
-print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
-
-while (defined($line = <IN>)) {
- if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
- $macro = $1;
- $feature = $2;
- $tail = $3;
- if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
- $feature = $1;
- }
-
- if ($feature ne '') {
- printf OUT "\t%-32s = \"%s\",\n",
- "[$macro]", "\L$feature";
- }
- }
-}
-print OUT "};\n";
-
-close(IN);
-close(OUT);
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
new file mode 100644
index 00000000000..2bf61650549
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+#
+# Generate the x86_cap_flags[] array from include/asm/cpufeature.h
+#
+
+IN=$1
+OUT=$2
+
+TABS="$(printf '\t\t\t\t\t')"
+trap 'rm "$OUT"' EXIT
+
+(
+ echo "#ifndef _ASM_X86_CPUFEATURE_H"
+ echo "#include <asm/cpufeature.h>"
+ echo "#endif"
+ echo ""
+ echo "const char * const x86_cap_flags[NCAPINTS*32] = {"
+
+ # Iterate through any input lines starting with #define X86_FEATURE_
+ sed -n -e 's/\t/ /g' -e 's/^ *# *define *X86_FEATURE_//p' $IN |
+ while read i
+ do
+ # Name is everything up to the first whitespace
+ NAME="$(echo "$i" | sed 's/ .*//')"
+
+ # If the /* comment */ starts with a quote string, grab that.
+ VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
+ [ -z "$VALUE" ] && VALUE="\"$NAME\""
+ [ "$VALUE" == '""' ] && continue
+
+ # Name is uppercase, VALUE is all lowercase
+ VALUE="$(echo "$VALUE" | tr A-Z a-z)"
+
+ TABCOUNT=$(( ( 5*8 - 14 - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s]%.*s = %s,\n" \
+ "X86_FEATURE_$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ done
+ echo "};"
+) > $OUT
+
+trap - EXIT
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 0a630dd4b62..a450373e8e9 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -14,28 +14,80 @@
#include <linux/time.h>
#include <linux/clocksource.h>
#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/efi.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv.h>
#include <asm/mshyperv.h>
+#include <asm/desc.h>
+#include <asm/idle.h>
+#include <asm/irq_regs.h>
+#include <asm/i8259.h>
+#include <asm/apic.h>
+#include <asm/timer.h>
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
-static bool __init ms_hyperv_platform(void)
+#if IS_ENABLED(CONFIG_HYPERV)
+static void (*vmbus_handler)(void);
+
+void hyperv_vector_handler(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ irq_enter();
+ exit_idle();
+
+ inc_irq_stat(irq_hv_callback_count);
+ if (vmbus_handler)
+ vmbus_handler();
+
+ irq_exit();
+ set_irq_regs(old_regs);
+}
+
+void hv_setup_vmbus_irq(void (*handler)(void))
+{
+ vmbus_handler = handler;
+ /*
+ * Setup the IDT for hypervisor callback. Prevent reallocation
+ * at module reload.
+ */
+ if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
+ hyperv_callback_vector);
+}
+
+void hv_remove_vmbus_irq(void)
+{
+ /* We have no way to deallocate the interrupt gate */
+ vmbus_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
+EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
+#endif
+
+static uint32_t __init ms_hyperv_platform(void)
{
u32 eax;
u32 hyp_signature[3];
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return false;
+ return 0;
cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
&eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
- return eax >= HYPERV_CPUID_MIN &&
- eax <= HYPERV_CPUID_MAX &&
- !memcmp("Microsoft Hv", hyp_signature, 12);
+ if (eax >= HYPERV_CPUID_MIN &&
+ eax <= HYPERV_CPUID_MAX &&
+ !memcmp("Microsoft Hv", hyp_signature, 12))
+ return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
+
+ return 0;
}
static cycle_t read_hv_clock(struct clocksource *arg)
@@ -68,7 +120,28 @@ static void __init ms_hyperv_init_platform(void)
printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
ms_hyperv.features, ms_hyperv.hints);
- clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
+ /*
+ * Get the APIC frequency.
+ */
+ u64 hv_lapic_frequency;
+
+ rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
+ hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
+ lapic_timer_frequency = hv_lapic_frequency;
+ printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n",
+ lapic_timer_frequency);
+ }
+#endif
+
+ if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
+ clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
+
}
const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ac140c7be39..5f90b85ff22 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -258,15 +258,15 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
/* Compute the maximum size with which we can make a range: */
if (range_startk)
- max_align = ffs(range_startk) - 1;
+ max_align = __ffs(range_startk);
else
- max_align = 32;
+ max_align = BITS_PER_LONG - 1;
- align = fls(range_sizek) - 1;
+ align = __fls(range_sizek);
if (align > max_align)
align = max_align;
- sizek = 1 << align;
+ sizek = 1UL << align;
if (debug_print) {
char start_factor = 'K', size_factor = 'K';
unsigned long start_base, size_base;
@@ -714,15 +714,15 @@ int __init mtrr_cleanup(unsigned address_bits)
if (mtrr_tom2)
x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base;
- nr_range = x86_get_mtrr_mem_range(range, 0, x_remove_base, x_remove_size);
/*
* [0, 1M) should always be covered by var mtrr with WB
* and fixed mtrrs should take effect before var mtrr for it:
*/
- nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
+ nr_range = add_range_with_merge(range, RANGE_NUM, 0, 0,
1ULL<<(20 - PAGE_SHIFT));
- /* Sort the ranges: */
- sort_range(range, nr_range);
+ /* add from var mtrr at last */
+ nr_range = x86_get_mtrr_mem_range(range, nr_range,
+ x_remove_base, x_remove_size);
range_sums = sum_ranges(range, nr_range);
printk(KERN_INFO "total RAM covered: %ldM\n",
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 68a3343e579..9e451b0876b 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -167,7 +167,7 @@ static void post_set(void)
setCx86(CX86_CCR3, ccr3);
/* Enable caches */
- write_cr0(read_cr0() & 0xbfffffff);
+ write_cr0(read_cr0() & ~X86_CR0_CD);
/* Restore value of CR4 */
if (cpu_has_pge)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 97b26356e9e..0e25a1bc5ab 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -12,7 +12,6 @@
#include <asm/processor-flags.h>
#include <asm/cpufeature.h>
#include <asm/tlbflush.h>
-#include <asm/system.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include <asm/pat.h>
@@ -362,11 +361,7 @@ static void __init print_mtrr_state(void)
}
pr_debug("MTRR variable ranges %sabled:\n",
mtrr_state.enabled & 2 ? "en" : "dis");
- if (size_or_mask & 0xffffffffUL)
- high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
- else
- high_width = ffs(size_or_mask>>32) + 32 - 1;
- high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
+ high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
for (i = 0; i < num_var_ranges; ++i) {
if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
@@ -515,8 +510,9 @@ generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
static void generic_get_mtrr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type)
{
- unsigned int mask_lo, mask_hi, base_lo, base_hi;
- unsigned int tmp, hi;
+ u32 mask_lo, mask_hi, base_lo, base_hi;
+ unsigned int hi;
+ u64 tmp, mask;
/*
* get_mtrr doesn't need to update mtrr_state, also it could be called
@@ -537,18 +533,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
/* Work out the shifted address mask: */
- tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
- mask_lo = size_or_mask | tmp;
+ tmp = (u64)mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
+ mask = size_or_mask | tmp;
/* Expand tmp with high bits to all 1s: */
- hi = fls(tmp);
+ hi = fls64(tmp);
if (hi > 0) {
- tmp |= ~((1<<(hi - 1)) - 1);
+ tmp |= ~((1ULL<<(hi - 1)) - 1);
- if (tmp != mask_lo) {
+ if (tmp != mask) {
printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
- add_taint(TAINT_FIRMWARE_WORKAROUND);
- mask_lo = tmp;
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ mask = tmp;
}
}
@@ -556,8 +552,8 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
* This works correctly if size is a power of two, i.e. a
* contiguous range:
*/
- *size = -mask_lo;
- *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
+ *size = -mask;
+ *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
*type = base_lo & 0xff;
out_put_cpu:
@@ -687,6 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
}
/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Save MTRR state */
@@ -700,13 +697,14 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
static void post_set(void) __releases(set_atomicity_lock)
{
/* Flush TLBs (no need to flush caches - they are disabled) */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Intel (P6) standard MTRRs */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Enable caches */
- write_cr0(read_cr0() & 0xbfffffff);
+ write_cr0(read_cr0() & ~X86_CR0_CD);
/* Restore value of CR4 */
if (cpu_has_pge)
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 79289632cb2..a041e094b8b 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -167,6 +167,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
{
int err = 0;
mtrr_type type;
+ unsigned long base;
unsigned long size;
struct mtrr_sentry sentry;
struct mtrr_gentry gentry;
@@ -267,14 +268,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
- mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
+ mtrr_if->get(gentry.regnum, &base, &size, &type);
/* Hide entries that go above 4GB */
- if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
+ if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
|| size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
gentry.base = gentry.size = gentry.type = 0;
else {
- gentry.base <<= PAGE_SHIFT;
+ gentry.base = base << PAGE_SHIFT;
gentry.size = size << PAGE_SHIFT;
gentry.type = type;
}
@@ -321,11 +322,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
- mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
+ mtrr_if->get(gentry.regnum, &base, &size, &type);
/* Hide entries that would overflow */
if (size != (__typeof__(gentry.size))size)
gentry.base = gentry.size = gentry.type = 0;
else {
+ gentry.base = base;
gentry.size = size;
gentry.type = type;
}
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 6b96110bb0c..f961de9964c 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -51,9 +51,13 @@
#include <asm/e820.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
+#include <asm/pat.h>
#include "mtrr.h"
+/* arch_phys_wc_add returns an MTRR register index plus this offset. */
+#define MTRR_TO_PHYS_WC_OFFSET 1000
+
u32 num_var_ranges;
unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
@@ -305,7 +309,8 @@ int mtrr_add_page(unsigned long base, unsigned long size,
return -EINVAL;
}
- if (base & size_or_mask || size & size_or_mask) {
+ if ((base | (base + size - 1)) >>
+ (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
pr_warning("mtrr: base or size exceeds the MTRR width\n");
return -EINVAL;
}
@@ -524,6 +529,73 @@ int mtrr_del(int reg, unsigned long base, unsigned long size)
}
EXPORT_SYMBOL(mtrr_del);
+/**
+ * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
+ * @base: Physical base address
+ * @size: Size of region
+ *
+ * If PAT is available, this does nothing. If PAT is unavailable, it
+ * attempts to add a WC MTRR covering size bytes starting at base and
+ * logs an error if this fails.
+ *
+ * Drivers must store the return value to pass to mtrr_del_wc_if_needed,
+ * but drivers should not try to interpret that return value.
+ */
+int arch_phys_wc_add(unsigned long base, unsigned long size)
+{
+ int ret;
+
+ if (pat_enabled)
+ return 0; /* Success! (We don't need to do anything.) */
+
+ ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
+ if (ret < 0) {
+ pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.",
+ (void *)base, (void *)(base + size - 1));
+ return ret;
+ }
+ return ret + MTRR_TO_PHYS_WC_OFFSET;
+}
+EXPORT_SYMBOL(arch_phys_wc_add);
+
+/*
+ * arch_phys_wc_del - undoes arch_phys_wc_add
+ * @handle: Return value from arch_phys_wc_add
+ *
+ * This cleans up after mtrr_add_wc_if_needed.
+ *
+ * The API guarantees that mtrr_del_wc_if_needed(error code) and
+ * mtrr_del_wc_if_needed(0) do nothing.
+ */
+void arch_phys_wc_del(int handle)
+{
+ if (handle >= 1) {
+ WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET);
+ mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0);
+ }
+}
+EXPORT_SYMBOL(arch_phys_wc_del);
+
+/*
+ * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value
+ * @handle: Return value from arch_phys_wc_add
+ *
+ * This will turn the return value from arch_phys_wc_add into an mtrr
+ * index suitable for debugging.
+ *
+ * Note: There is no legitimate use for this function, except possibly
+ * in printk line. Alas there is an illegitimate use in some ancient
+ * drm ioctls.
+ */
+int phys_wc_to_mtrr_index(int handle)
+{
+ if (handle < MTRR_TO_PHYS_WC_OFFSET)
+ return -1;
+ else
+ return handle - MTRR_TO_PHYS_WC_OFFSET;
+}
+EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index);
+
/*
* HACK ALERT!
* These should be called implicitly, but we can't yet until all the initcall
@@ -583,6 +655,7 @@ static struct syscore_ops mtrr_syscore_ops = {
int __initdata changed_by_mtrr_cleanup;
+#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1))
/**
* mtrr_bp_init - initialize mtrrs on the boot CPU
*
@@ -600,13 +673,13 @@ void __init mtrr_bp_init(void)
if (cpu_has_mtrr) {
mtrr_if = &generic_mtrr_ops;
- size_or_mask = 0xff000000; /* 36 bits */
+ size_or_mask = SIZE_OR_MASK_BITS(36);
size_and_mask = 0x00f00000;
phys_addr = 36;
/*
* This is an AMD specific MSR, but we assume(hope?) that
- * Intel will implement it to when they extend the address
+ * Intel will implement it too when they extend the address
* bus of the Xeon.
*/
if (cpuid_eax(0x80000000) >= 0x80000008) {
@@ -619,7 +692,7 @@ void __init mtrr_bp_init(void)
boot_cpu_data.x86_mask == 0x4))
phys_addr = 36;
- size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1);
+ size_or_mask = SIZE_OR_MASK_BITS(phys_addr);
size_and_mask = ~size_or_mask & 0xfffff00000ULL;
} else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
boot_cpu_data.x86 == 6) {
@@ -627,7 +700,7 @@ void __init mtrr_bp_init(void)
* VIA C* family have Intel style MTRRs,
* but don't support PAE
*/
- size_or_mask = 0xfff00000; /* 32 bits */
+ size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
phys_addr = 32;
}
@@ -637,21 +710,21 @@ void __init mtrr_bp_init(void)
if (cpu_has_k6_mtrr) {
/* Pre-Athlon (K6) AMD CPU MTRRs */
mtrr_if = mtrr_ops[X86_VENDOR_AMD];
- size_or_mask = 0xfff00000; /* 32 bits */
+ size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
case X86_VENDOR_CENTAUR:
if (cpu_has_centaur_mcr) {
mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
- size_or_mask = 0xfff00000; /* 32 bits */
+ size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
case X86_VENDOR_CYRIX:
if (cpu_has_cyrix_arr) {
mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
- size_or_mask = 0xfff00000; /* 32 bits */
+ size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
@@ -695,11 +768,16 @@ void mtrr_ap_init(void)
}
/**
- * Save current fixed-range MTRR state of the BSP
+ * Save current fixed-range MTRR state of the first cpu in cpu_online_mask.
*/
void mtrr_save_state(void)
{
- smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
+ int first_cpu;
+
+ get_online_cpus();
+ first_cpu = cpumask_first(cpu_online_mask);
+ smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
+ put_online_cpus();
}
void set_mtrr_aps_delayed_init(void)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5adce1040b1..2879ecdaac4 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -24,27 +24,19 @@
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
+#include <linux/device.h>
#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
-#include <asm/compat.h>
#include <asm/smp.h>
#include <asm/alternative.h>
+#include <asm/timer.h>
+#include <asm/desc.h>
+#include <asm/ldt.h>
#include "perf_event.h"
-#if 0
-#undef wrmsrl
-#define wrmsrl(msr, val) \
-do { \
- trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
- (unsigned long)(val)); \
- native_write_msr((msr), (u32)((u64)(val)), \
- (u32)((u64)(val) >> 32)); \
-} while (0)
-#endif
-
struct x86_pmu x86_pmu __read_mostly;
DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -73,7 +65,7 @@ u64 x86_perf_event_update(struct perf_event *event)
int idx = hwc->idx;
s64 delta;
- if (idx == X86_PMC_IDX_FIXED_BTS)
+ if (idx == INTEL_PMC_IDX_FIXED_BTS)
return 0;
/*
@@ -85,7 +77,7 @@ u64 x86_perf_event_update(struct perf_event *event)
*/
again:
prev_raw_count = local64_read(&hwc->prev_count);
- rdmsrl(hwc->event_base, new_raw_count);
+ rdpmcl(hwc->event_base_rdpmc, new_raw_count);
if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count)
@@ -126,6 +118,9 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
continue;
if (event->attr.config1 & ~er->valid_mask)
return -EINVAL;
+ /* Check if the extra msrs can be safely accessed*/
+ if (!er->extra_msr_access)
+ return -ENXIO;
reg->idx = er->idx;
reg->config = event->attr.config1;
@@ -188,8 +183,9 @@ static void release_pmc_hardware(void) {}
static bool check_hw_exists(void)
{
- u64 val, val_new = 0;
- int i, reg, ret = 0;
+ u64 val, val_fail, val_new= ~0;
+ int i, reg, reg_fail, ret = 0;
+ int bios_fail = 0;
/*
* Check to see if the BIOS enabled any of the counters, if so
@@ -200,8 +196,11 @@ static bool check_hw_exists(void)
ret = rdmsrl_safe(reg, &val);
if (ret)
goto msr_fail;
- if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
- goto bios_fail;
+ if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
+ bios_fail = 1;
+ val_fail = val;
+ reg_fail = reg;
+ }
}
if (x86_pmu.num_counters_fixed) {
@@ -210,35 +209,41 @@ static bool check_hw_exists(void)
if (ret)
goto msr_fail;
for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
- if (val & (0x03 << i*4))
- goto bios_fail;
+ if (val & (0x03 << i*4)) {
+ bios_fail = 1;
+ val_fail = val;
+ reg_fail = reg;
+ }
}
}
/*
- * Now write a value and read it back to see if it matches,
- * this is needed to detect certain hardware emulators (qemu/kvm)
- * that don't trap on the MSR access and always return 0s.
+ * Read the current value, change it and read it back to see if it
+ * matches, this is needed to detect certain hardware emulators
+ * (qemu/kvm) that don't trap on the MSR access and always return 0s.
*/
- val = 0xabcdUL;
- ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
- ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
+ reg = x86_pmu_event_addr(0);
+ if (rdmsrl_safe(reg, &val))
+ goto msr_fail;
+ val ^= 0xffffUL;
+ ret = wrmsrl_safe(reg, val);
+ ret |= rdmsrl_safe(reg, &val_new);
if (ret || val != val_new)
goto msr_fail;
- return true;
-
-bios_fail:
/*
* We still allow the PMU driver to operate:
*/
- printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
- printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
+ if (bios_fail) {
+ printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
+ printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail);
+ }
return true;
msr_fail:
printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
+ printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new);
return false;
}
@@ -301,15 +306,6 @@ int x86_setup_perfctr(struct perf_event *event)
hwc->sample_period = x86_pmu.max_period;
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
- } else {
- /*
- * If we have a PMU initialized but no APIC
- * interrupts, we cannot sample hardware
- * events (user-space has to fall back and
- * sample via a hrtimer based software event):
- */
- if (!x86_pmu.apic)
- return -EOPNOTSUPP;
}
if (attr->type == PERF_TYPE_RAW)
@@ -351,13 +347,43 @@ int x86_setup_perfctr(struct perf_event *event)
return 0;
}
+/*
+ * check that branch_sample_type is compatible with
+ * settings needed for precise_ip > 1 which implies
+ * using the LBR to capture ALL taken branches at the
+ * priv levels of the measurement
+ */
+static inline int precise_br_compat(struct perf_event *event)
+{
+ u64 m = event->attr.branch_sample_type;
+ u64 b = 0;
+
+ /* must capture all branches */
+ if (!(m & PERF_SAMPLE_BRANCH_ANY))
+ return 0;
+
+ m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_user)
+ b |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_kernel)
+ b |= PERF_SAMPLE_BRANCH_KERNEL;
+
+ /*
+ * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
+ */
+
+ return m == b;
+}
+
int x86_pmu_hw_config(struct perf_event *event)
{
if (event->attr.precise_ip) {
int precise = 0;
/* Support for constant skid */
- if (x86_pmu.pebs_active) {
+ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
precise++;
/* Support for IP fixup */
@@ -367,6 +393,37 @@ int x86_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
+ /*
+ * check that PEBS LBR correction does not conflict with
+ * whatever the user is asking with attr->branch_sample_type
+ */
+ if (event->attr.precise_ip > 1 &&
+ x86_pmu.intel_cap.pebs_format < 2) {
+ u64 *br_type = &event->attr.branch_sample_type;
+
+ if (has_branch_stack(event)) {
+ if (!precise_br_compat(event))
+ return -EOPNOTSUPP;
+
+ /* branch_sample_type is compatible */
+
+ } else {
+ /*
+ * user did not specify branch_sample_type
+ *
+ * For PEBS fixups, we capture all
+ * the branches at the priv level of the
+ * event.
+ */
+ *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+ if (!event->attr.exclude_user)
+ *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_kernel)
+ *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
+ }
+ }
}
/*
@@ -423,6 +480,7 @@ static int __x86_pmu_event_init(struct perf_event *event)
/* mark unused */
event->hw.extra_reg.idx = EXTRA_REG_NONE;
+ event->hw.branch_reg.idx = EXTRA_REG_NONE;
return x86_pmu.hw_config(event);
}
@@ -505,7 +563,7 @@ struct sched_state {
struct perf_sched {
int max_weight;
int max_events;
- struct event_constraint **constraints;
+ struct perf_event **events;
struct sched_state state;
int saved_states;
struct sched_state saved[SCHED_STATES_MAX];
@@ -514,7 +572,7 @@ struct perf_sched {
/*
* Initialize interator that runs through all events and counters.
*/
-static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c,
+static void perf_sched_init(struct perf_sched *sched, struct perf_event **events,
int num, int wmin, int wmax)
{
int idx;
@@ -522,10 +580,10 @@ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **
memset(sched, 0, sizeof(*sched));
sched->max_events = num;
sched->max_weight = wmax;
- sched->constraints = c;
+ sched->events = events;
for (idx = 0; idx < num; idx++) {
- if (c[idx]->weight == wmin)
+ if (events[idx]->hw.constraint->weight == wmin)
break;
}
@@ -572,19 +630,18 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
if (sched->state.event >= sched->max_events)
return false;
- c = sched->constraints[sched->state.event];
-
+ c = sched->events[sched->state.event]->hw.constraint;
/* Prefer fixed purpose counters */
- if (x86_pmu.num_counters_fixed) {
- idx = X86_PMC_IDX_FIXED;
- for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+ if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
+ idx = INTEL_PMC_IDX_FIXED;
+ for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
if (!__test_and_set_bit(idx, sched->state.used))
goto done;
}
}
/* Grab the first unused counter starting with idx */
idx = sched->state.counter;
- for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
+ for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
if (!__test_and_set_bit(idx, sched->state.used))
goto done;
}
@@ -631,7 +688,7 @@ static bool perf_sched_next_event(struct perf_sched *sched)
if (sched->state.weight > sched->max_weight)
return false;
}
- c = sched->constraints[sched->state.event];
+ c = sched->events[sched->state.event]->hw.constraint;
} while (c->weight != sched->state.weight);
sched->state.counter = 0; /* start with first counter */
@@ -642,12 +699,12 @@ static bool perf_sched_next_event(struct perf_sched *sched)
/*
* Assign a counter for each event.
*/
-static int perf_assign_events(struct event_constraint **constraints, int n,
- int wmin, int wmax, int *assign)
+int perf_assign_events(struct perf_event **events, int n,
+ int wmin, int wmax, int *assign)
{
struct perf_sched sched;
- perf_sched_init(&sched, constraints, n, wmin, wmax);
+ perf_sched_init(&sched, events, n, wmin, wmax);
do {
if (!perf_sched_find_counter(&sched))
@@ -658,19 +715,23 @@ static int perf_assign_events(struct event_constraint **constraints, int n,
return sched.state.unassigned;
}
+EXPORT_SYMBOL_GPL(perf_assign_events);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
- struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
+ struct event_constraint *c;
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ struct perf_event *e;
int i, wmin, wmax, num = 0;
struct hw_perf_event *hwc;
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+ hwc = &cpuc->event_list[i]->hw;
c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
- constraints[i] = c;
+ hwc->constraint = c;
+
wmin = min(wmin, c->weight);
wmax = max(wmax, c->weight);
}
@@ -680,7 +741,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
*/
for (i = 0; i < n; i++) {
hwc = &cpuc->event_list[i]->hw;
- c = constraints[i];
+ c = hwc->constraint;
/* never assigned */
if (hwc->idx == -1)
@@ -701,16 +762,35 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
/* slow path */
if (i != n)
- num = perf_assign_events(constraints, n, wmin, wmax, assign);
+ num = perf_assign_events(cpuc->event_list, n, wmin,
+ wmax, assign);
/*
+ * Mark the event as committed, so we do not put_constraint()
+ * in case new events are added and fail scheduling.
+ */
+ if (!num && assign) {
+ for (i = 0; i < n; i++) {
+ e = cpuc->event_list[i];
+ e->hw.flags |= PERF_X86_EVENT_COMMITTED;
+ }
+ }
+ /*
* scheduling failed or is just a simulation,
* free resources if necessary
*/
if (!assign || num) {
for (i = 0; i < n; i++) {
+ e = cpuc->event_list[i];
+ /*
+ * do not put_constraint() on comitted events,
+ * because they are good to go
+ */
+ if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
+ continue;
+
if (x86_pmu.put_event_constraints)
- x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
+ x86_pmu.put_event_constraints(cpuc, e);
}
}
return num ? -EINVAL : 0;
@@ -762,15 +842,17 @@ static inline void x86_assign_hw_event(struct perf_event *event,
hwc->last_cpu = smp_processor_id();
hwc->last_tag = ++cpuc->tags[i];
- if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
+ if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
hwc->config_base = 0;
hwc->event_base = 0;
- } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
+ } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
- hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
+ hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
+ hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
} else {
hwc->config_base = x86_pmu_config_addr(hwc->idx);
hwc->event_base = x86_pmu_event_addr(hwc->idx);
+ hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
}
}
@@ -805,7 +887,6 @@ static void x86_pmu_enable(struct pmu *pmu)
* hw_perf_group_sched_in() or x86_pmu_enable()
*
* step1: save events moving to new counters
- * step2: reprogram moved events into new counters
*/
for (i = 0; i < n_running; i++) {
event = cpuc->event_list[i];
@@ -831,6 +912,9 @@ static void x86_pmu_enable(struct pmu *pmu)
x86_pmu_stop(event, PERF_EF_UPDATE);
}
+ /*
+ * step2: reprogram moved events into new counters
+ */
for (i = 0; i < cpuc->n_events; i++) {
event = cpuc->event_list[i];
hwc = &event->hw;
@@ -868,7 +952,7 @@ int x86_perf_event_set_period(struct perf_event *event)
s64 period = hwc->sample_period;
int ret = 0, idx = hwc->idx;
- if (idx == X86_PMC_IDX_FIXED_BTS)
+ if (idx == INTEL_PMC_IDX_FIXED_BTS)
return 0;
/*
@@ -956,7 +1040,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)
/*
* If group events scheduling transaction was started,
* skip the schedulability test here, it will be performed
- * at commit time (->commit_txn) as a whole
+ * at commit time (->commit_txn) as a whole.
*/
if (cpuc->group_flag & PERF_EVENT_TXN)
goto done_collect;
@@ -971,6 +1055,10 @@ static int x86_pmu_add(struct perf_event *event, int flags)
memcpy(cpuc->assign, assign, n*sizeof(int));
done_collect:
+ /*
+ * Commit the collect_events() state. See x86_pmu_del() and
+ * x86_pmu_*_txn().
+ */
cpuc->n_events = n;
cpuc->n_added += n - n0;
cpuc->n_txn += n - n0;
@@ -1088,28 +1176,46 @@ static void x86_pmu_del(struct perf_event *event, int flags)
int i;
/*
+ * event is descheduled
+ */
+ event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
+
+ /*
* If we're called during a txn, we don't need to do anything.
* The events never got scheduled and ->cancel_txn will truncate
* the event_list.
+ *
+ * XXX assumes any ->del() called during a TXN will only be on
+ * an event added during that same TXN.
*/
if (cpuc->group_flag & PERF_EVENT_TXN)
return;
+ /*
+ * Not a TXN, therefore cleanup properly.
+ */
x86_pmu_stop(event, PERF_EF_UPDATE);
for (i = 0; i < cpuc->n_events; i++) {
- if (event == cpuc->event_list[i]) {
+ if (event == cpuc->event_list[i])
+ break;
+ }
- if (x86_pmu.put_event_constraints)
- x86_pmu.put_event_constraints(cpuc, event);
+ if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
+ return;
- while (++i < cpuc->n_events)
- cpuc->event_list[i-1] = cpuc->event_list[i];
+ /* If we have a newly added event; make sure to decrease n_added. */
+ if (i >= cpuc->n_events - cpuc->n_added)
+ --cpuc->n_added;
+
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, event);
+
+ /* Delete the array entry. */
+ while (++i < cpuc->n_events)
+ cpuc->event_list[i-1] = cpuc->event_list[i];
+ --cpuc->n_events;
- --cpuc->n_events;
- break;
- }
- }
perf_event_update_userpage(event);
}
@@ -1121,8 +1227,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
- perf_sample_data_init(&data, 0);
-
cpuc = &__get_cpu_var(cpu_hw_events);
/*
@@ -1157,7 +1261,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
* event overflow
*/
handled++;
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
if (!x86_perf_event_set_period(event))
continue;
@@ -1183,19 +1287,30 @@ void perf_events_lapic_init(void)
apic_write(APIC_LVTPC, APIC_DM_NMI);
}
-static int __kprobes
+static int
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
+ u64 start_clock;
+ u64 finish_clock;
+ int ret;
+
if (!atomic_read(&active_events))
return NMI_DONE;
- return x86_pmu.handle_irq(regs);
+ start_clock = sched_clock();
+ ret = x86_pmu.handle_irq(regs);
+ finish_clock = sched_clock();
+
+ perf_sample_event_took(finish_clock - start_clock);
+
+ return ret;
}
+NOKPROBE_SYMBOL(perf_event_nmi_handler);
struct event_constraint emptyconstraint;
struct event_constraint unconstrained;
-static int __cpuinit
+static int
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
unsigned int cpu = (long)hcpu;
@@ -1210,6 +1325,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
break;
case CPU_STARTING:
+ if (x86_pmu.attr_rdpmc)
+ set_in_cr4(X86_CR4_PCE);
if (x86_pmu.cpu_starting)
x86_pmu.cpu_starting(cpu);
break;
@@ -1244,12 +1361,163 @@ static void __init pmu_check_apic(void)
x86_pmu.apic = 0;
pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
pr_info("no hardware sampling interrupt available.\n");
+
+ /*
+ * If we have a PMU initialized but no APIC
+ * interrupts, we cannot sample hardware
+ * events (user-space has to fall back and
+ * sample via a hrtimer based software event):
+ */
+ pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
+
+}
+
+static struct attribute_group x86_pmu_format_group = {
+ .name = "format",
+ .attrs = NULL,
+};
+
+/*
+ * Remove all undefined events (x86_pmu.event_map(id) == 0)
+ * out of events_attr attributes.
+ */
+static void __init filter_events(struct attribute **attrs)
+{
+ struct device_attribute *d;
+ struct perf_pmu_events_attr *pmu_attr;
+ int i, j;
+
+ for (i = 0; attrs[i]; i++) {
+ d = (struct device_attribute *)attrs[i];
+ pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
+ /* str trumps id */
+ if (pmu_attr->event_str)
+ continue;
+ if (x86_pmu.event_map(i))
+ continue;
+
+ for (j = i; attrs[j]; j++)
+ attrs[j] = attrs[j + 1];
+
+ /* Check the shifted attr. */
+ i--;
+ }
+}
+
+/* Merge two pointer arrays */
+static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
+{
+ struct attribute **new;
+ int j, i;
+
+ for (j = 0; a[j]; j++)
+ ;
+ for (i = 0; b[i]; i++)
+ j++;
+ j++;
+
+ new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ j = 0;
+ for (i = 0; a[i]; i++)
+ new[j++] = a[i];
+ for (i = 0; b[i]; i++)
+ new[j++] = b[i];
+ new[j] = NULL;
+
+ return new;
+}
+
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr = \
+ container_of(attr, struct perf_pmu_events_attr, attr);
+ u64 config = x86_pmu.event_map(pmu_attr->id);
+
+ /* string trumps id */
+ if (pmu_attr->event_str)
+ return sprintf(page, "%s", pmu_attr->event_str);
+
+ return x86_pmu.events_sysfs_show(page, config);
+}
+
+EVENT_ATTR(cpu-cycles, CPU_CYCLES );
+EVENT_ATTR(instructions, INSTRUCTIONS );
+EVENT_ATTR(cache-references, CACHE_REFERENCES );
+EVENT_ATTR(cache-misses, CACHE_MISSES );
+EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS );
+EVENT_ATTR(branch-misses, BRANCH_MISSES );
+EVENT_ATTR(bus-cycles, BUS_CYCLES );
+EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND );
+EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND );
+EVENT_ATTR(ref-cycles, REF_CPU_CYCLES );
+
+static struct attribute *empty_attrs;
+
+static struct attribute *events_attr[] = {
+ EVENT_PTR(CPU_CYCLES),
+ EVENT_PTR(INSTRUCTIONS),
+ EVENT_PTR(CACHE_REFERENCES),
+ EVENT_PTR(CACHE_MISSES),
+ EVENT_PTR(BRANCH_INSTRUCTIONS),
+ EVENT_PTR(BRANCH_MISSES),
+ EVENT_PTR(BUS_CYCLES),
+ EVENT_PTR(STALLED_CYCLES_FRONTEND),
+ EVENT_PTR(STALLED_CYCLES_BACKEND),
+ EVENT_PTR(REF_CPU_CYCLES),
+ NULL,
+};
+
+static struct attribute_group x86_pmu_events_group = {
+ .name = "events",
+ .attrs = events_attr,
+};
+
+ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
+{
+ u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
+ u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
+ bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE);
+ bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
+ bool any = (config & ARCH_PERFMON_EVENTSEL_ANY);
+ bool inv = (config & ARCH_PERFMON_EVENTSEL_INV);
+ ssize_t ret;
+
+ /*
+ * We have whole page size to spend and just little data
+ * to write, so we can safely use sprintf.
+ */
+ ret = sprintf(page, "event=0x%02llx", event);
+
+ if (umask)
+ ret += sprintf(page + ret, ",umask=0x%02llx", umask);
+
+ if (edge)
+ ret += sprintf(page + ret, ",edge");
+
+ if (pc)
+ ret += sprintf(page + ret, ",pc");
+
+ if (any)
+ ret += sprintf(page + ret, ",any");
+
+ if (inv)
+ ret += sprintf(page + ret, ",inv");
+
+ if (cmask)
+ ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
+
+ ret += sprintf(page + ret, "\n");
+
+ return ret;
}
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
- struct event_constraint *c;
int err;
pr_info("Performance Events: ");
@@ -1262,7 +1530,7 @@ static int __init init_hw_perf_events(void)
err = amd_pmu_init();
break;
default:
- return 0;
+ err = -ENOTSUPP;
}
if (err != 0) {
pr_cont("no PMU driver, software events only.\n");
@@ -1277,46 +1545,37 @@ static int __init init_hw_perf_events(void)
pr_cont("%s PMU driver.\n", x86_pmu.name);
+ x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
+
for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
quirk->func();
- if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
- WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
- x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
- x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
- }
- x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-
- if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
- WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
- x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
- x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
- }
-
- x86_pmu.intel_ctrl |=
- ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
+ if (!x86_pmu.intel_ctrl)
+ x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
perf_events_lapic_init();
register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
unconstrained = (struct event_constraint)
__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
- 0, x86_pmu.num_counters, 0);
+ 0, x86_pmu.num_counters, 0, 0);
- if (x86_pmu.event_constraints) {
- /*
- * event on fixed counter2 (REF_CYCLES) only works on this
- * counter, so do not extend mask to generic counters
- */
- for_each_event_constraint(c, x86_pmu.event_constraints) {
- if (c->cmask != X86_RAW_EVENT_MASK
- || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {
- continue;
- }
+ x86_pmu_format_group.attrs = x86_pmu.format_attrs;
- c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
- c->weight += x86_pmu.num_counters;
- }
+ if (x86_pmu.event_attrs)
+ x86_pmu_events_group.attrs = x86_pmu.event_attrs;
+
+ if (!x86_pmu.events_sysfs_show)
+ x86_pmu_events_group.attrs = &empty_attrs;
+ else
+ filter_events(x86_pmu_events_group.attrs);
+
+ if (x86_pmu.cpu_events) {
+ struct attribute **tmp;
+
+ tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
+ if (!WARN_ON(!tmp))
+ x86_pmu_events_group.attrs = tmp;
}
pr_info("... version: %d\n", x86_pmu.version);
@@ -1360,7 +1619,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)
{
__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
/*
- * Truncate the collected events.
+ * Truncate collected array by the number of events added in this
+ * transaction. See x86_pmu_add() and x86_pmu_*_txn().
*/
__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
@@ -1371,6 +1631,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)
* Commit group events scheduling transaction
* Perform the group schedulability test as a whole
* Return 0 if success
+ *
+ * Does not cancel the transaction on failure; expects the caller to do this.
*/
static int x86_pmu_commit_txn(struct pmu *pmu)
{
@@ -1426,6 +1688,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void)
if (!cpuc->shared_regs)
goto error;
}
+ cpuc->is_fake = 1;
return cpuc;
error:
free_fake_cpuc(cpuc);
@@ -1542,23 +1805,138 @@ static int x86_pmu_event_init(struct perf_event *event)
return err;
}
+static int x86_pmu_event_idx(struct perf_event *event)
+{
+ int idx = event->hw.idx;
+
+ if (!x86_pmu.attr_rdpmc)
+ return 0;
+
+ if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
+ idx -= INTEL_PMC_IDX_FIXED;
+ idx |= 1 << 30;
+ }
+
+ return idx + 1;
+}
+
+static ssize_t get_attr_rdpmc(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
+}
+
+static void change_rdpmc(void *info)
+{
+ bool enable = !!(unsigned long)info;
+
+ if (enable)
+ set_in_cr4(X86_CR4_PCE);
+ else
+ clear_in_cr4(X86_CR4_PCE);
+}
+
+static ssize_t set_attr_rdpmc(struct device *cdev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long val;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (x86_pmu.attr_rdpmc_broken)
+ return -ENOTSUPP;
+
+ if (!!val != !!x86_pmu.attr_rdpmc) {
+ x86_pmu.attr_rdpmc = !!val;
+ on_each_cpu(change_rdpmc, (void *)val, 1);
+ }
+
+ return count;
+}
+
+static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
+
+static struct attribute *x86_pmu_attrs[] = {
+ &dev_attr_rdpmc.attr,
+ NULL,
+};
+
+static struct attribute_group x86_pmu_attr_group = {
+ .attrs = x86_pmu_attrs,
+};
+
+static const struct attribute_group *x86_pmu_attr_groups[] = {
+ &x86_pmu_attr_group,
+ &x86_pmu_format_group,
+ &x86_pmu_events_group,
+ NULL,
+};
+
+static void x86_pmu_flush_branch_stack(void)
+{
+ if (x86_pmu.flush_branch_stack)
+ x86_pmu.flush_branch_stack();
+}
+
+void perf_check_microcode(void)
+{
+ if (x86_pmu.check_microcode)
+ x86_pmu.check_microcode();
+}
+EXPORT_SYMBOL_GPL(perf_check_microcode);
+
static struct pmu pmu = {
- .pmu_enable = x86_pmu_enable,
- .pmu_disable = x86_pmu_disable,
+ .pmu_enable = x86_pmu_enable,
+ .pmu_disable = x86_pmu_disable,
+
+ .attr_groups = x86_pmu_attr_groups,
+
+ .event_init = x86_pmu_event_init,
- .event_init = x86_pmu_event_init,
+ .add = x86_pmu_add,
+ .del = x86_pmu_del,
+ .start = x86_pmu_start,
+ .stop = x86_pmu_stop,
+ .read = x86_pmu_read,
- .add = x86_pmu_add,
- .del = x86_pmu_del,
- .start = x86_pmu_start,
- .stop = x86_pmu_stop,
- .read = x86_pmu_read,
+ .start_txn = x86_pmu_start_txn,
+ .cancel_txn = x86_pmu_cancel_txn,
+ .commit_txn = x86_pmu_commit_txn,
- .start_txn = x86_pmu_start_txn,
- .cancel_txn = x86_pmu_cancel_txn,
- .commit_txn = x86_pmu_commit_txn,
+ .event_idx = x86_pmu_event_idx,
+ .flush_branch_stack = x86_pmu_flush_branch_stack,
};
+void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+{
+ struct cyc2ns_data *data;
+
+ userpg->cap_user_time = 0;
+ userpg->cap_user_time_zero = 0;
+ userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
+ userpg->pmc_width = x86_pmu.cntval_bits;
+
+ if (!sched_clock_stable())
+ return;
+
+ data = cyc2ns_read_begin();
+
+ userpg->cap_user_time = 1;
+ userpg->time_mult = data->cyc2ns_mul;
+ userpg->time_shift = data->cyc2ns_shift;
+ userpg->time_offset = data->cyc2ns_offset - now;
+
+ userpg->cap_user_time_zero = 1;
+ userpg->time_zero = data->cyc2ns_offset;
+
+ cyc2ns_read_end(data);
+}
+
/*
* callchain support
*/
@@ -1594,32 +1972,68 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
}
+static inline int
+valid_user_frame(const void __user *fp, unsigned long size)
+{
+ return (__range_not_ok(fp, size, TASK_SIZE) == 0);
+}
+
+static unsigned long get_segment_base(unsigned int segment)
+{
+ struct desc_struct *desc;
+ int idx = segment >> 3;
+
+ if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+ if (idx > LDT_ENTRIES)
+ return 0;
+
+ if (idx > current->active_mm->context.size)
+ return 0;
+
+ desc = current->active_mm->context.ldt;
+ } else {
+ if (idx > GDT_ENTRIES)
+ return 0;
+
+ desc = __this_cpu_ptr(&gdt_page.gdt[0]);
+ }
+
+ return get_desc_base(desc + idx);
+}
+
#ifdef CONFIG_COMPAT
+
+#include <asm/compat.h>
+
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
/* 32-bit process in 64-bit kernel. */
+ unsigned long ss_base, cs_base;
struct stack_frame_ia32 frame;
const void __user *fp;
if (!test_thread_flag(TIF_IA32))
return 0;
- fp = compat_ptr(regs->bp);
+ cs_base = get_segment_base(regs->cs);
+ ss_base = get_segment_base(regs->ss);
+
+ fp = compat_ptr(ss_base + regs->bp);
while (entry->nr < PERF_MAX_STACK_DEPTH) {
unsigned long bytes;
frame.next_frame = 0;
frame.return_address = 0;
bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
- if (bytes != sizeof(frame))
+ if (bytes != 0)
break;
- if (fp < compat_ptr(regs->sp))
+ if (!valid_user_frame(fp, sizeof(frame)))
break;
- perf_callchain_store(entry, frame.return_address);
- fp = compat_ptr(frame.next_frame);
+ perf_callchain_store(entry, cs_base + frame.return_address);
+ fp = compat_ptr(ss_base + frame.next_frame);
}
return 1;
}
@@ -1642,6 +2056,12 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
return;
}
+ /*
+ * We don't know what to do with VM86 stacks.. ignore them for now.
+ */
+ if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
+ return;
+
fp = (void __user *)regs->bp;
perf_callchain_store(entry, regs->ip);
@@ -1658,10 +2078,10 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
frame.return_address = 0;
bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
- if (bytes != sizeof(frame))
+ if (bytes != 0)
break;
- if ((unsigned long)fp < regs->sp)
+ if (!valid_user_frame(fp, sizeof(frame)))
break;
perf_callchain_store(entry, frame.return_address);
@@ -1669,16 +2089,50 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
}
}
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
+/*
+ * Deal with code segment offsets for the various execution modes:
+ *
+ * VM86 - the good olde 16 bit days, where the linear address is
+ * 20 bits and we use regs->ip + 0x10 * regs->cs.
+ *
+ * IA32 - Where we need to look at GDT/LDT segment descriptor tables
+ * to figure out what the 32bit base address is.
+ *
+ * X32 - has TIF_X32 set, but is running in x86_64
+ *
+ * X86_64 - CS,DS,SS,ES are all zero based.
+ */
+static unsigned long code_segment_base(struct pt_regs *regs)
{
- unsigned long ip;
+ /*
+ * If we are in VM86 mode, add the segment offset to convert to a
+ * linear address.
+ */
+ if (regs->flags & X86_VM_MASK)
+ return 0x10 * regs->cs;
+
+ /*
+ * For IA32 we look at the GDT/LDT segment base to convert the
+ * effective IP to a linear address.
+ */
+#ifdef CONFIG_X86_32
+ if (user_mode(regs) && regs->cs != __USER_CS)
+ return get_segment_base(regs->cs);
+#else
+ if (test_thread_flag(TIF_IA32)) {
+ if (user_mode(regs) && regs->cs != __USER32_CS)
+ return get_segment_base(regs->cs);
+ }
+#endif
+ return 0;
+}
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
- ip = perf_guest_cbs->get_guest_ip();
- else
- ip = instruction_pointer(regs);
+ return perf_guest_cbs->get_guest_ip();
- return ip;
+ return regs->ip + code_segment_base(regs);
}
unsigned long perf_misc_flags(struct pt_regs *regs)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 8944062f46e..8ade93111e0 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -14,6 +14,18 @@
#include <linux/perf_event.h>
+#if 0
+#undef wrmsrl
+#define wrmsrl(msr, val) \
+do { \
+ unsigned int _msr = (msr); \
+ u64 _val = (val); \
+ trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \
+ (unsigned long long)(_val)); \
+ native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \
+} while (0)
+#endif
+
/*
* | NHM/WSM | SNB |
* register -------------------------------
@@ -33,6 +45,8 @@ enum extra_reg_type {
EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
+ EXTRA_REG_LBR = 2, /* lbr_select */
+ EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */
EXTRA_REG_MAX /* number of entries needed */
};
@@ -46,7 +60,15 @@ struct event_constraint {
u64 cmask;
int weight;
int overlap;
+ int flags;
};
+/*
+ * struct hw_perf_event.flags flags
+ */
+#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */
+#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style st data sampling */
+#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */
struct amd_nb {
int nb_id; /* NorthBridge id */
@@ -56,7 +78,7 @@ struct amd_nb {
};
/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS 4
+#define MAX_PEBS_EVENTS 8
/*
* A debug store configuration.
@@ -108,14 +130,17 @@ struct cpu_hw_events {
unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
int enabled;
- int n_events;
- int n_added;
- int n_txn;
+ int n_events; /* the # of events in the below arrays */
+ int n_added; /* the # last events in the below arrays;
+ they've never been enabled yet */
+ int n_txn; /* the # last events in the below arrays;
+ added in the current transaction */
int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
u64 tags[X86_PMC_IDX_MAX];
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
unsigned int group_flag;
+ int is_fake;
/*
* Intel DebugStore bits
@@ -130,6 +155,8 @@ struct cpu_hw_events {
void *lbr_context;
struct perf_branch_stack lbr_stack;
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
+ struct er_account *lbr_sel;
+ u64 br_sel;
/*
* Intel host/guest exclude bits
@@ -139,6 +166,11 @@ struct cpu_hw_events {
struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX];
/*
+ * Intel checkpoint mask
+ */
+ u64 intel_cp_status;
+
+ /*
* manage shared (per-core, per-cpu) registers
* used on Intel NHM/WSM/SNB
*/
@@ -147,21 +179,24 @@ struct cpu_hw_events {
/*
* AMD specific bits
*/
- struct amd_nb *amd_nb;
+ struct amd_nb *amd_nb;
+ /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
+ u64 perf_ctr_virt_mask;
void *kfree_on_online;
};
-#define __EVENT_CONSTRAINT(c, n, m, w, o) {\
+#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
{ .idxmsk64 = (n) }, \
.code = (c), \
.cmask = (m), \
.weight = (w), \
.overlap = (o), \
+ .flags = f, \
}
#define EVENT_CONSTRAINT(c, n, m) \
- __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0)
+ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
/*
* The overlap flag marks event constraints with overlapping counter
@@ -185,7 +220,7 @@ struct cpu_hw_events {
* and its counter masks must be kept at a minimum.
*/
#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \
- __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1)
+ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0)
/*
* Constraint on the Event code.
@@ -201,11 +236,14 @@ struct cpu_hw_events {
* - inv
* - edge
* - cnt-mask
+ * - in_tx
+ * - in_tx_checkpointed
* The other filters are supported by fixed counters.
* The any-thread option is supported starting with v3.
*/
+#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)
#define FIXED_EVENT_CONSTRAINT(c, n) \
- EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
+ EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS)
/*
* Constraint on the Event code + UMask
@@ -213,11 +251,33 @@ struct cpu_hw_events {
#define INTEL_UEVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
-#define EVENT_CONSTRAINT_END \
- EVENT_CONSTRAINT(0, 0, 0)
+#define INTEL_PLD_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
+#define INTEL_PST_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
+
+/* DataLA version of store sampling without extra enable bit. */
+#define INTEL_PST_HSW_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+
+/*
+ * We define the end marker as having a weight of -1
+ * to enable blacklisting of events using a counter bitmask
+ * of zero and thus a weight of zero.
+ * The end marker has a weight that cannot possibly be
+ * obtained from counting the bits in the bitmask.
+ */
+#define EVENT_CONSTRAINT_END { .weight = -1 }
+
+/*
+ * Check for end marker with weight == -1
+ */
#define for_each_event_constraint(e, c) \
- for ((e) = (c); (e)->weight; (e)++)
+ for ((e) = (c); (e)->weight != -1; (e)++)
/*
* Extra registers for specific events.
@@ -235,19 +295,31 @@ struct extra_reg {
u64 config_mask;
u64 valid_mask;
int idx; /* per_xxx->regs[] reg index */
+ bool extra_msr_access;
};
#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
- .event = (e), \
- .msr = (ms), \
- .config_mask = (m), \
- .valid_mask = (vm), \
- .idx = EXTRA_REG_##i \
+ .event = (e), \
+ .msr = (ms), \
+ .config_mask = (m), \
+ .valid_mask = (vm), \
+ .idx = EXTRA_REG_##i, \
+ .extra_msr_access = true, \
}
#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
+#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \
+ EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
+ ARCH_PERFMON_EVENTSEL_UMASK, vm, idx)
+
+#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \
+ INTEL_UEVENT_EXTRA_REG(c, \
+ MSR_PEBS_LD_LAT_THRESHOLD, \
+ 0xffff, \
+ LDLAT)
+
#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
union perf_capabilities {
@@ -257,6 +329,11 @@ union perf_capabilities {
u64 pebs_arch_reg:1;
u64 pebs_format:4;
u64 smm_freeze:1;
+ /*
+ * PMU supports separate counter range for writing
+ * values > 32bit.
+ */
+ u64 full_width_write:1;
};
u64 capabilities;
};
@@ -266,6 +343,29 @@ struct x86_pmu_quirk {
void (*func)(void);
};
+union x86_pmu_config {
+ struct {
+ u64 event:8,
+ umask:8,
+ usr:1,
+ os:1,
+ edge:1,
+ pc:1,
+ interrupt:1,
+ __reserved1:1,
+ en:1,
+ inv:1,
+ cmask:8,
+ event2:4,
+ __reserved2:4,
+ go:1,
+ ho:1;
+ } bits;
+ u64 value;
+};
+
+#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
+
/*
* struct x86_pmu - generic x86 pmu
*/
@@ -284,6 +384,8 @@ struct x86_pmu {
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
unsigned perfctr;
+ int (*addr_offset)(int index, bool eventsel);
+ int (*rdpmc_index)(int index);
u64 (*event_map)(int);
int max_events;
int num_counters;
@@ -306,12 +408,30 @@ struct x86_pmu {
struct event_constraint *event_constraints;
struct x86_pmu_quirk *quirks;
int perfctr_second_write;
+ bool late_ack;
+
+ /*
+ * sysfs attrs
+ */
+ int attr_rdpmc_broken;
+ int attr_rdpmc;
+ struct attribute **format_attrs;
+ struct attribute **event_attrs;
+ ssize_t (*events_sysfs_show)(char *page, u64 config);
+ struct attribute **cpu_events;
+
+ /*
+ * CPU Hotplug hooks
+ */
int (*cpu_prepare)(int cpu);
void (*cpu_starting)(int cpu);
void (*cpu_dying)(int cpu);
void (*cpu_dead)(int cpu);
+ void (*check_microcode)(void);
+ void (*flush_branch_stack)(void);
+
/*
* Intel Arch Perfmon v2+
*/
@@ -321,17 +441,25 @@ struct x86_pmu {
/*
* Intel DebugStore bits
*/
- int bts, pebs;
- int bts_active, pebs_active;
+ unsigned int bts :1,
+ bts_active :1,
+ pebs :1,
+ pebs_active :1,
+ pebs_broken :1;
int pebs_record_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
+ void (*pebs_aliases)(struct perf_event *event);
+ int max_pebs_events;
/*
* Intel LBR
*/
unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
int lbr_nr; /* hardware stack size */
+ u64 lbr_sel_mask; /* LBR_SELECT valid bits */
+ const int *lbr_sel_map; /* lbr_select mappings */
+ bool lbr_double_abort; /* duplicated lbr aborts */
/*
* Extra registers for events
@@ -357,6 +485,23 @@ do { \
#define ERF_NO_HT_SHARING 1
#define ERF_HAS_RSP_1 2
+#define EVENT_VAR(_id) event_attr_##_id
+#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
+
+#define EVENT_ATTR(_name, _id) \
+static struct perf_pmu_events_attr EVENT_VAR(_id) = { \
+ .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
+ .id = PERF_COUNT_HW_##_id, \
+ .event_str = NULL, \
+};
+
+#define EVENT_ATTR_STR(_name, v, str) \
+static struct perf_pmu_events_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
+ .id = 0, \
+ .event_str = str, \
+};
+
extern struct x86_pmu x86_pmu __read_mostly;
DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
@@ -384,28 +529,21 @@ extern u64 __read_mostly hw_cache_extra_regs
u64 x86_perf_event_update(struct perf_event *event);
-static inline int x86_pmu_addr_offset(int index)
+static inline unsigned int x86_pmu_config_addr(int index)
{
- int offset;
-
- /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
- alternative_io(ASM_NOP2,
- "shll $1, %%eax",
- X86_FEATURE_PERFCTR_CORE,
- "=a" (offset),
- "a" (index));
-
- return offset;
+ return x86_pmu.eventsel + (x86_pmu.addr_offset ?
+ x86_pmu.addr_offset(index, true) : index);
}
-static inline unsigned int x86_pmu_config_addr(int index)
+static inline unsigned int x86_pmu_event_addr(int index)
{
- return x86_pmu.eventsel + x86_pmu_addr_offset(index);
+ return x86_pmu.perfctr + (x86_pmu.addr_offset ?
+ x86_pmu.addr_offset(index, false) : index);
}
-static inline unsigned int x86_pmu_event_addr(int index)
+static inline int x86_pmu_rdpmc_index(int index)
{
- return x86_pmu.perfctr + x86_pmu_addr_offset(index);
+ return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
}
int x86_setup_perfctr(struct perf_event *event);
@@ -417,13 +555,17 @@ void x86_pmu_disable_all(void);
static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
u64 enable_mask)
{
+ u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
+
if (hwc->extra_reg.reg)
wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
- wrmsrl(hwc->config_base, hwc->config | enable_mask);
+ wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
}
void x86_pmu_enable_all(int added);
+int perf_assign_events(struct perf_event **events, int n,
+ int wmin, int wmax, int *assign);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
void x86_pmu_stop(struct perf_event *event, int flags);
@@ -443,6 +585,38 @@ extern struct event_constraint emptyconstraint;
extern struct event_constraint unconstrained;
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+ return ip > PAGE_OFFSET;
+#else
+ return (long)ip < 0;
+#endif
+}
+
+/*
+ * Not all PMUs provide the right context information to place the reported IP
+ * into full context. Specifically segment registers are typically not
+ * supplied.
+ *
+ * Assuming the address is a linear address (it is for IBS), we fake the CS and
+ * vm86 mode using the known zero-based code segment and 'fix up' the registers
+ * to reflect this.
+ *
+ * Intel PEBS/LBR appear to typically provide the effective address, nothing
+ * much we can do about that but pray and treat it like a linear address.
+ */
+static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
+{
+ regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
+ if (regs->flags & X86_VM_MASK)
+ regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
+ regs->ip = ip;
+}
+
+ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
+ssize_t intel_event_sysfs_show(char *page, u64 config);
+
#ifdef CONFIG_CPU_SUP_AMD
int amd_pmu_init(void);
@@ -487,12 +661,18 @@ extern struct event_constraint intel_core2_pebs_event_constraints[];
extern struct event_constraint intel_atom_pebs_event_constraints[];
+extern struct event_constraint intel_slm_pebs_event_constraints[];
+
extern struct event_constraint intel_nehalem_pebs_event_constraints[];
extern struct event_constraint intel_westmere_pebs_event_constraints[];
extern struct event_constraint intel_snb_pebs_event_constraints[];
+extern struct event_constraint intel_ivb_pebs_event_constraints[];
+
+extern struct event_constraint intel_hsw_pebs_event_constraints[];
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
void intel_pmu_pebs_enable(struct perf_event *event);
@@ -523,10 +703,19 @@ void intel_pmu_lbr_init_nhm(void);
void intel_pmu_lbr_init_atom(void);
+void intel_pmu_lbr_init_snb(void);
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event);
+
int p4_pmu_init(void);
int p6_pmu_init(void);
+int knc_pmu_init(void);
+
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
+ char *page);
+
#else /* CONFIG_CPU_SUP_INTEL */
static inline void reserve_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 0397b23be8e..beeb7cc0704 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,4 +1,5 @@
#include <linux/perf_event.h>
+#include <linux/export.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/slab.h>
@@ -131,13 +132,49 @@ static u64 amd_pmu_event_map(int hw_event)
return amd_perfmon_event_map[hw_event];
}
-static int amd_pmu_hw_config(struct perf_event *event)
+/*
+ * Previously calculated offsets
+ */
+static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
+static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
+
+/*
+ * Legacy CPUs:
+ * 4 counters starting at 0xc0010000 each offset by 1
+ *
+ * CPUs with core performance counter extensions:
+ * 6 counters starting at 0xc0010200 each offset by 2
+ */
+static inline int amd_pmu_addr_offset(int index, bool eventsel)
{
- int ret = x86_pmu_hw_config(event);
+ int offset;
- if (ret)
- return ret;
+ if (!index)
+ return index;
+
+ if (eventsel)
+ offset = event_offsets[index];
+ else
+ offset = count_offsets[index];
+ if (offset)
+ return offset;
+
+ if (!cpu_has_perfctr_core)
+ offset = index;
+ else
+ offset = index << 1;
+
+ if (eventsel)
+ event_offsets[index] = offset;
+ else
+ count_offsets[index] = offset;
+
+ return offset;
+}
+
+static int amd_core_hw_config(struct perf_event *event)
+{
if (event->attr.exclude_host && event->attr.exclude_guest)
/*
* When HO == GO == 1 the hardware treats that as GO == HO == 0
@@ -147,14 +184,9 @@ static int amd_pmu_hw_config(struct perf_event *event)
event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
ARCH_PERFMON_EVENTSEL_OS);
else if (event->attr.exclude_host)
- event->hw.config |= AMD_PERFMON_EVENTSEL_GUESTONLY;
+ event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
else if (event->attr.exclude_guest)
- event->hw.config |= AMD_PERFMON_EVENTSEL_HOSTONLY;
-
- if (event->attr.type != PERF_TYPE_RAW)
- return 0;
-
- event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+ event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
return 0;
}
@@ -179,20 +211,34 @@ static inline int amd_has_nb(struct cpu_hw_events *cpuc)
return nb && nb->nb_id != -1;
}
-static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
+static int amd_pmu_hw_config(struct perf_event *event)
+{
+ int ret;
+
+ /* pass precise event sampling to ibs: */
+ if (event->attr.precise_ip && get_ibs_caps())
+ return -ENOENT;
+
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ ret = x86_pmu_hw_config(event);
+ if (ret)
+ return ret;
+
+ if (event->attr.type == PERF_TYPE_RAW)
+ event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+
+ return amd_core_hw_config(event);
+}
+
+static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
{
- struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
int i;
/*
- * only care about NB events
- */
- if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
- return;
-
- /*
* need to scan whole list because event may not have
* been assigned during scheduling
*
@@ -201,10 +247,8 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
* when we come here
*/
for (i = 0; i < x86_pmu.num_counters; i++) {
- if (nb->owners[i] == event) {
- cmpxchg(nb->owners+i, event, NULL);
+ if (cmpxchg(nb->owners + i, event, NULL) == event)
break;
- }
}
}
@@ -240,24 +284,24 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
*
* Given that resources are allocated (cmpxchg), they must be
* eventually freed for others to use. This is accomplished by
- * calling amd_put_event_constraints().
+ * calling __amd_put_nb_event_constraints()
*
* Non NB events are not impacted by this restriction.
*/
static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+ struct event_constraint *c)
{
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
- struct perf_event *old = NULL;
- int max = x86_pmu.num_counters;
- int i, j, k = -1;
+ struct perf_event *old;
+ int idx, new = -1;
- /*
- * if not NB event or no NB, then no constraints
- */
- if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
- return &unconstrained;
+ if (!c)
+ c = &unconstrained;
+
+ if (cpuc->is_fake)
+ return c;
/*
* detect if already present, if so reuse
@@ -269,48 +313,33 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
* because of successive calls to x86_schedule_events() from
* hw_perf_group_sched_in() without hw_perf_enable()
*/
- for (i = 0; i < max; i++) {
- /*
- * keep track of first free slot
- */
- if (k == -1 && !nb->owners[i])
- k = i;
+ for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
+ if (new == -1 || hwc->idx == idx)
+ /* assign free slot, prefer hwc->idx */
+ old = cmpxchg(nb->owners + idx, NULL, event);
+ else if (nb->owners[idx] == event)
+ /* event already present */
+ old = event;
+ else
+ continue;
+
+ if (old && old != event)
+ continue;
+
+ /* reassign to this slot */
+ if (new != -1)
+ cmpxchg(nb->owners + new, event, NULL);
+ new = idx;
/* already present, reuse */
- if (nb->owners[i] == event)
- goto done;
- }
- /*
- * not present, so grab a new slot
- * starting either at:
- */
- if (hwc->idx != -1) {
- /* previous assignment */
- i = hwc->idx;
- } else if (k != -1) {
- /* start from free slot found */
- i = k;
- } else {
- /*
- * event not found, no slot found in
- * first pass, try again from the
- * beginning
- */
- i = 0;
- }
- j = i;
- do {
- old = cmpxchg(nb->owners+i, NULL, event);
- if (!old)
+ if (old == event)
break;
- if (++i == max)
- i = 0;
- } while (i != j);
-done:
- if (!old)
- return &nb->event_constraints[i];
-
- return &emptyconstraint;
+ }
+
+ if (new == -1)
+ return &emptyconstraint;
+
+ return &nb->event_constraints[new];
}
static struct amd_nb *amd_alloc_nb(int cpu)
@@ -318,8 +347,7 @@ static struct amd_nb *amd_alloc_nb(int cpu)
struct amd_nb *nb;
int i;
- nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO,
- cpu_to_node(cpu));
+ nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
if (!nb)
return NULL;
@@ -357,6 +385,8 @@ static void amd_pmu_cpu_starting(int cpu)
struct amd_nb *nb;
int i, nb_id;
+ cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+
if (boot_cpu_data.x86_max_cores < 2)
return;
@@ -398,31 +428,38 @@ static void amd_pmu_cpu_dead(int cpu)
}
}
-static __initconst const struct x86_pmu amd_pmu = {
- .name = "AMD",
- .handle_irq = x86_pmu_handle_irq,
- .disable_all = x86_pmu_disable_all,
- .enable_all = x86_pmu_enable_all,
- .enable = x86_pmu_enable_event,
- .disable = x86_pmu_disable_event,
- .hw_config = amd_pmu_hw_config,
- .schedule_events = x86_schedule_events,
- .eventsel = MSR_K7_EVNTSEL0,
- .perfctr = MSR_K7_PERFCTR0,
- .event_map = amd_pmu_event_map,
- .max_events = ARRAY_SIZE(amd_perfmon_event_map),
- .num_counters = AMD64_NUM_COUNTERS,
- .cntval_bits = 48,
- .cntval_mask = (1ULL << 48) - 1,
- .apic = 1,
- /* use highest bit to detect overflow */
- .max_period = (1ULL << 47) - 1,
- .get_event_constraints = amd_get_event_constraints,
- .put_event_constraints = amd_put_event_constraints,
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+ /*
+ * if not NB event or no NB, then no constraints
+ */
+ if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
+ return &unconstrained;
- .cpu_prepare = amd_pmu_cpu_prepare,
- .cpu_starting = amd_pmu_cpu_starting,
- .cpu_dead = amd_pmu_cpu_dead,
+ return __amd_get_nb_event_constraints(cpuc, event, NULL);
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
+ __amd_put_nb_event_constraints(cpuc, event);
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7,32-35");
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+
+static struct attribute *amd_format_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
};
/* AMD Family 15h */
@@ -470,6 +507,7 @@ static __initconst const struct x86_pmu amd_pmu = {
* 0x023 DE PERF_CTL[2:0]
* 0x02D LS PERF_CTL[3]
* 0x02E LS PERF_CTL[3,0]
+ * 0x031 LS PERF_CTL[2:0] (**)
* 0x043 CU PERF_CTL[2:0]
* 0x045 CU PERF_CTL[2:0]
* 0x046 CU PERF_CTL[2:0]
@@ -483,10 +521,12 @@ static __initconst const struct x86_pmu amd_pmu = {
* 0x0DD LS PERF_CTL[5:0]
* 0x0DE LS PERF_CTL[5:0]
* 0x0DF LS PERF_CTL[5:0]
+ * 0x1C0 EX PERF_CTL[5:3]
* 0x1D6 EX PERF_CTL[5:0]
* 0x1D8 EX PERF_CTL[5:0]
*
- * (*) depending on the umask all FPU counters may be used
+ * (*) depending on the umask all FPU counters may be used
+ * (**) only one unitmask enabled at a time
*/
static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
@@ -536,6 +576,12 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev
return &amd_f15_PMC3;
case 0x02E:
return &amd_f15_PMC30;
+ case 0x031:
+ if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+ return &amd_f15_PMC20;
+ return &emptyconstraint;
+ case 0x1C0:
+ return &amd_f15_PMC53;
default:
return &amd_f15_PMC50;
}
@@ -555,15 +601,23 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev
return &amd_f15_PMC20;
}
case AMD_EVENT_NB:
- /* not yet implemented */
+ /* moved to perf_event_amd_uncore.c */
return &emptyconstraint;
default:
return &emptyconstraint;
}
}
-static __initconst const struct x86_pmu amd_pmu_f15h = {
- .name = "AMD Family 15h",
+static ssize_t amd_event_sysfs_show(char *page, u64 config)
+{
+ u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
+ (config & AMD64_EVENTSEL_EVENT) >> 24;
+
+ return x86_event_sysfs_show(page, config, event);
+}
+
+static __initconst const struct x86_pmu amd_pmu = {
+ .name = "AMD",
.handle_irq = x86_pmu_handle_irq,
.disable_all = x86_pmu_disable_all,
.enable_all = x86_pmu_enable_all,
@@ -571,53 +625,104 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
.disable = x86_pmu_disable_event,
.hw_config = amd_pmu_hw_config,
.schedule_events = x86_schedule_events,
- .eventsel = MSR_F15H_PERF_CTL,
- .perfctr = MSR_F15H_PERF_CTR,
+ .eventsel = MSR_K7_EVNTSEL0,
+ .perfctr = MSR_K7_PERFCTR0,
+ .addr_offset = amd_pmu_addr_offset,
.event_map = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
- .num_counters = AMD64_NUM_COUNTERS_F15H,
+ .num_counters = AMD64_NUM_COUNTERS,
.cntval_bits = 48,
.cntval_mask = (1ULL << 48) - 1,
.apic = 1,
/* use highest bit to detect overflow */
.max_period = (1ULL << 47) - 1,
- .get_event_constraints = amd_get_event_constraints_f15h,
- /* nortbridge counters not yet implemented: */
-#if 0
+ .get_event_constraints = amd_get_event_constraints,
.put_event_constraints = amd_put_event_constraints,
+ .format_attrs = amd_format_attr,
+ .events_sysfs_show = amd_event_sysfs_show,
+
.cpu_prepare = amd_pmu_cpu_prepare,
.cpu_starting = amd_pmu_cpu_starting,
.cpu_dead = amd_pmu_cpu_dead,
-#endif
};
-__init int amd_pmu_init(void)
+static int __init amd_core_pmu_init(void)
{
- /* Performance-monitoring supported from K7 and later: */
- if (boot_cpu_data.x86 < 6)
- return -ENODEV;
+ if (!cpu_has_perfctr_core)
+ return 0;
- /*
- * If core performance counter extensions exists, it must be
- * family 15h, otherwise fail. See x86_pmu_addr_offset().
- */
switch (boot_cpu_data.x86) {
case 0x15:
- if (!cpu_has_perfctr_core)
- return -ENODEV;
- x86_pmu = amd_pmu_f15h;
+ pr_cont("Fam15h ");
+ x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
break;
+
default:
- if (cpu_has_perfctr_core)
- return -ENODEV;
- x86_pmu = amd_pmu;
- break;
+ pr_err("core perfctr but no constraints; unknown hardware!\n");
+ return -ENODEV;
}
+ /*
+ * If core performance counter extensions exists, we must use
+ * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
+ * amd_pmu_addr_offset().
+ */
+ x86_pmu.eventsel = MSR_F15H_PERF_CTL;
+ x86_pmu.perfctr = MSR_F15H_PERF_CTR;
+ x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE;
+
+ pr_cont("core perfctr, ");
+ return 0;
+}
+
+__init int amd_pmu_init(void)
+{
+ int ret;
+
+ /* Performance-monitoring supported from K7 and later: */
+ if (boot_cpu_data.x86 < 6)
+ return -ENODEV;
+
+ x86_pmu = amd_pmu;
+
+ ret = amd_core_pmu_init();
+ if (ret)
+ return ret;
+
/* Events are common for all AMDs */
memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
return 0;
}
+
+void amd_pmu_enable_virt(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ cpuc->perf_ctr_virt_mask = 0;
+
+ /* Reload all events */
+ x86_pmu_disable_all();
+ x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+
+void amd_pmu_disable_virt(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ /*
+ * We only mask out the Host-only bit so that host-only counting works
+ * when SVM is disabled. If someone sets up a guest-only counter when
+ * SVM is disabled the Guest-only bits still gets set and the counter
+ * will not count anything.
+ */
+ cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+
+ /* Reload all events */
+ x86_pmu_disable_all();
+ x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 3b8a2d30d14..cbb1be3ed9e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -9,43 +9,652 @@
#include <linux/perf_event.h>
#include <linux/module.h>
#include <linux/pci.h>
+#include <linux/ptrace.h>
+#include <linux/syscore_ops.h>
#include <asm/apic.h>
+#include "perf_event.h"
+
static u32 ibs_caps;
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
-static struct pmu perf_ibs;
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+
+#include <asm/nmi.h>
+
+#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
+#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
+
+enum ibs_states {
+ IBS_ENABLED = 0,
+ IBS_STARTED = 1,
+ IBS_STOPPING = 2,
+
+ IBS_MAX_STATES,
+};
+
+struct cpu_perf_ibs {
+ struct perf_event *event;
+ unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)];
+};
+
+struct perf_ibs {
+ struct pmu pmu;
+ unsigned int msr;
+ u64 config_mask;
+ u64 cnt_mask;
+ u64 enable_mask;
+ u64 valid_mask;
+ u64 max_period;
+ unsigned long offset_mask[1];
+ int offset_max;
+ struct cpu_perf_ibs __percpu *pcpu;
+
+ struct attribute **format_attrs;
+ struct attribute_group format_group;
+ const struct attribute_group *attr_groups[2];
+
+ u64 (*get_count)(u64 config);
+};
+
+struct perf_ibs_data {
+ u32 size;
+ union {
+ u32 data[0]; /* data buffer starts here */
+ u32 caps;
+ };
+ u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX];
+};
+
+static int
+perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
+{
+ s64 left = local64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ int overflow = 0;
+
+ /*
+ * If we are way outside a reasonable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ overflow = 1;
+ }
+
+ if (unlikely(left < (s64)min)) {
+ left += period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ overflow = 1;
+ }
+
+ /*
+ * If the hw period that triggers the sw overflow is too short
+ * we might hit the irq handler. This biases the results.
+ * Thus we shorten the next-to-last period and set the last
+ * period to the max period.
+ */
+ if (left > max) {
+ left -= max;
+ if (left > max)
+ left = max;
+ else if (left < min)
+ left = min;
+ }
+
+ *hw_period = (u64)left;
+
+ return overflow;
+}
+
+static int
+perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int shift = 64 - width;
+ u64 prev_raw_count;
+ u64 delta;
+
+ /*
+ * Careful: an NMI might modify the previous event value.
+ *
+ * Our tactic to handle this is to first atomically read and
+ * exchange a new raw count - then add that new-prev delta
+ * count to the generic event atomically:
+ */
+ prev_raw_count = local64_read(&hwc->prev_count);
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ return 0;
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
+
+ return 1;
+}
+
+static struct perf_ibs perf_ibs_fetch;
+static struct perf_ibs perf_ibs_op;
+
+static struct perf_ibs *get_ibs_pmu(int type)
+{
+ if (perf_ibs_fetch.pmu.type == type)
+ return &perf_ibs_fetch;
+ if (perf_ibs_op.pmu.type == type)
+ return &perf_ibs_op;
+ return NULL;
+}
+
+/*
+ * Use IBS for precise event sampling:
+ *
+ * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count
+ * perf record -a -e r076:p ... # same as -e cpu-cycles:p
+ * perf record -a -e r0C1:p ... # use ibs op counting micro-ops
+ *
+ * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
+ * MSRC001_1033) is used to select either cycle or micro-ops counting
+ * mode.
+ *
+ * The rip of IBS samples has skid 0. Thus, IBS supports precise
+ * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
+ * rip is invalid when IBS was not able to record the rip correctly.
+ * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
+ *
+ */
+static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
+{
+ switch (event->attr.precise_ip) {
+ case 0:
+ return -ENOENT;
+ case 1:
+ case 2:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ switch (event->attr.type) {
+ case PERF_TYPE_HARDWARE:
+ switch (event->attr.config) {
+ case PERF_COUNT_HW_CPU_CYCLES:
+ *config = 0;
+ return 0;
+ }
+ break;
+ case PERF_TYPE_RAW:
+ switch (event->attr.config) {
+ case 0x0076:
+ *config = 0;
+ return 0;
+ case 0x00C1:
+ *config = IBS_OP_CNT_CTL;
+ return 0;
+ }
+ break;
+ default:
+ return -ENOENT;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static const struct perf_event_attr ibs_notsupp = {
+ .exclude_user = 1,
+ .exclude_kernel = 1,
+ .exclude_hv = 1,
+ .exclude_idle = 1,
+ .exclude_host = 1,
+ .exclude_guest = 1,
+};
static int perf_ibs_init(struct perf_event *event)
{
- if (perf_ibs.type != event->attr.type)
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs;
+ u64 max_cnt, config;
+ int ret;
+
+ perf_ibs = get_ibs_pmu(event->attr.type);
+ if (perf_ibs) {
+ config = event->attr.config;
+ } else {
+ perf_ibs = &perf_ibs_op;
+ ret = perf_ibs_precise_event(event, &config);
+ if (ret)
+ return ret;
+ }
+
+ if (event->pmu != &perf_ibs->pmu)
return -ENOENT;
+
+ if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
+ return -EINVAL;
+
+ if (config & ~perf_ibs->config_mask)
+ return -EINVAL;
+
+ if (hwc->sample_period) {
+ if (config & perf_ibs->cnt_mask)
+ /* raw max_cnt may not be set */
+ return -EINVAL;
+ if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
+ /*
+ * lower 4 bits can not be set in ibs max cnt,
+ * but allowing it in case we adjust the
+ * sample period to set a frequency.
+ */
+ return -EINVAL;
+ hwc->sample_period &= ~0x0FULL;
+ if (!hwc->sample_period)
+ hwc->sample_period = 0x10;
+ } else {
+ max_cnt = config & perf_ibs->cnt_mask;
+ config &= ~perf_ibs->cnt_mask;
+ event->attr.sample_period = max_cnt << 4;
+ hwc->sample_period = event->attr.sample_period;
+ }
+
+ if (!hwc->sample_period)
+ return -EINVAL;
+
+ /*
+ * If we modify hwc->sample_period, we also need to update
+ * hwc->last_period and hwc->period_left.
+ */
+ hwc->last_period = hwc->sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+
+ hwc->config_base = perf_ibs->msr;
+ hwc->config = config;
+
return 0;
}
+static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 *period)
+{
+ int overflow;
+
+ /* ignore lower 4 bits in min count: */
+ overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
+ local64_set(&hwc->prev_count, 0);
+
+ return overflow;
+}
+
+static u64 get_ibs_fetch_count(u64 config)
+{
+ return (config & IBS_FETCH_CNT) >> 12;
+}
+
+static u64 get_ibs_op_count(u64 config)
+{
+ u64 count = 0;
+
+ if (config & IBS_OP_VAL)
+ count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
+
+ if (ibs_caps & IBS_CAPS_RDWROPCNT)
+ count += (config & IBS_OP_CUR_CNT) >> 32;
+
+ return count;
+}
+
+static void
+perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
+ u64 *config)
+{
+ u64 count = perf_ibs->get_count(*config);
+
+ /*
+ * Set width to 64 since we do not overflow on max width but
+ * instead on max count. In perf_ibs_set_period() we clear
+ * prev count manually on overflow.
+ */
+ while (!perf_event_try_update(event, count, 64)) {
+ rdmsrl(event->hw.config_base, *config);
+ count = perf_ibs->get_count(*config);
+ }
+}
+
+static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 config)
+{
+ wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
+}
+
+/*
+ * Erratum #420 Instruction-Based Sampling Engine May Generate
+ * Interrupt that Cannot Be Cleared:
+ *
+ * Must clear counter mask first, then clear the enable bit. See
+ * Revision Guide for AMD Family 10h Processors, Publication #41322.
+ */
+static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 config)
+{
+ config &= ~perf_ibs->cnt_mask;
+ wrmsrl(hwc->config_base, config);
+ config &= ~perf_ibs->enable_mask;
+ wrmsrl(hwc->config_base, config);
+}
+
+/*
+ * We cannot restore the ibs pmu state, so we always needs to update
+ * the event while stopping it and then reset the state when starting
+ * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
+ * perf_ibs_start()/perf_ibs_stop() and instead always do it.
+ */
+static void perf_ibs_start(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ u64 period;
+
+ if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ return;
+
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+ hwc->state = 0;
+
+ perf_ibs_set_period(perf_ibs, hwc, &period);
+ set_bit(IBS_STARTED, pcpu->state);
+ perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+ perf_event_update_userpage(event);
+}
+
+static void perf_ibs_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ u64 config;
+ int stopping;
+
+ stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
+
+ if (!stopping && (hwc->state & PERF_HES_UPTODATE))
+ return;
+
+ rdmsrl(hwc->config_base, config);
+
+ if (stopping) {
+ set_bit(IBS_STOPPING, pcpu->state);
+ perf_ibs_disable_event(perf_ibs, hwc, config);
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ if (hwc->state & PERF_HES_UPTODATE)
+ return;
+
+ /*
+ * Clear valid bit to not count rollovers on update, rollovers
+ * are only updated in the irq handler.
+ */
+ config &= ~perf_ibs->valid_mask;
+
+ perf_ibs_event_update(perf_ibs, event, &config);
+ hwc->state |= PERF_HES_UPTODATE;
+}
+
static int perf_ibs_add(struct perf_event *event, int flags)
{
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+ if (test_and_set_bit(IBS_ENABLED, pcpu->state))
+ return -ENOSPC;
+
+ event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ pcpu->event = event;
+
+ if (flags & PERF_EF_START)
+ perf_ibs_start(event, PERF_EF_RELOAD);
+
return 0;
}
static void perf_ibs_del(struct perf_event *event, int flags)
{
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+ if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
+ return;
+
+ perf_ibs_stop(event, PERF_EF_UPDATE);
+
+ pcpu->event = NULL;
+
+ perf_event_update_userpage(event);
}
-static struct pmu perf_ibs = {
- .event_init= perf_ibs_init,
- .add= perf_ibs_add,
- .del= perf_ibs_del,
+static void perf_ibs_read(struct perf_event *event) { }
+
+PMU_FORMAT_ATTR(rand_en, "config:57");
+PMU_FORMAT_ATTR(cnt_ctl, "config:19");
+
+static struct attribute *ibs_fetch_format_attrs[] = {
+ &format_attr_rand_en.attr,
+ NULL,
};
+static struct attribute *ibs_op_format_attrs[] = {
+ NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
+ NULL,
+};
+
+static struct perf_ibs perf_ibs_fetch = {
+ .pmu = {
+ .task_ctx_nr = perf_invalid_context,
+
+ .event_init = perf_ibs_init,
+ .add = perf_ibs_add,
+ .del = perf_ibs_del,
+ .start = perf_ibs_start,
+ .stop = perf_ibs_stop,
+ .read = perf_ibs_read,
+ },
+ .msr = MSR_AMD64_IBSFETCHCTL,
+ .config_mask = IBS_FETCH_CONFIG_MASK,
+ .cnt_mask = IBS_FETCH_MAX_CNT,
+ .enable_mask = IBS_FETCH_ENABLE,
+ .valid_mask = IBS_FETCH_VAL,
+ .max_period = IBS_FETCH_MAX_CNT << 4,
+ .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
+ .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
+ .format_attrs = ibs_fetch_format_attrs,
+
+ .get_count = get_ibs_fetch_count,
+};
+
+static struct perf_ibs perf_ibs_op = {
+ .pmu = {
+ .task_ctx_nr = perf_invalid_context,
+
+ .event_init = perf_ibs_init,
+ .add = perf_ibs_add,
+ .del = perf_ibs_del,
+ .start = perf_ibs_start,
+ .stop = perf_ibs_stop,
+ .read = perf_ibs_read,
+ },
+ .msr = MSR_AMD64_IBSOPCTL,
+ .config_mask = IBS_OP_CONFIG_MASK,
+ .cnt_mask = IBS_OP_MAX_CNT,
+ .enable_mask = IBS_OP_ENABLE,
+ .valid_mask = IBS_OP_VAL,
+ .max_period = IBS_OP_MAX_CNT << 4,
+ .offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
+ .offset_max = MSR_AMD64_IBSOP_REG_COUNT,
+ .format_attrs = ibs_op_format_attrs,
+
+ .get_count = get_ibs_op_count,
+};
+
+static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
+{
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ struct perf_event *event = pcpu->event;
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ struct perf_ibs_data ibs_data;
+ int offset, size, check_rip, offset_max, throttle = 0;
+ unsigned int msr;
+ u64 *buf, *config, period;
+
+ if (!test_bit(IBS_STARTED, pcpu->state)) {
+ /*
+ * Catch spurious interrupts after stopping IBS: After
+ * disabling IBS there could be still incoming NMIs
+ * with samples that even have the valid bit cleared.
+ * Mark all this NMIs as handled.
+ */
+ return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
+ }
+
+ msr = hwc->config_base;
+ buf = ibs_data.regs;
+ rdmsrl(msr, *buf);
+ if (!(*buf++ & perf_ibs->valid_mask))
+ return 0;
+
+ config = &ibs_data.regs[0];
+ perf_ibs_event_update(perf_ibs, event, config);
+ perf_sample_data_init(&data, 0, hwc->last_period);
+ if (!perf_ibs_set_period(perf_ibs, hwc, &period))
+ goto out; /* no sw counter overflow */
+
+ ibs_data.caps = ibs_caps;
+ size = 1;
+ offset = 1;
+ check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
+ if (event->attr.sample_type & PERF_SAMPLE_RAW)
+ offset_max = perf_ibs->offset_max;
+ else if (check_rip)
+ offset_max = 2;
+ else
+ offset_max = 1;
+ do {
+ rdmsrl(msr + offset, *buf++);
+ size++;
+ offset = find_next_bit(perf_ibs->offset_mask,
+ perf_ibs->offset_max,
+ offset + 1);
+ } while (offset < offset_max);
+ ibs_data.size = sizeof(u64) * size;
+
+ regs = *iregs;
+ if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
+ regs.flags &= ~PERF_EFLAGS_EXACT;
+ } else {
+ set_linear_ip(&regs, ibs_data.regs[1]);
+ regs.flags |= PERF_EFLAGS_EXACT;
+ }
+
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ raw.size = sizeof(u32) + ibs_data.size;
+ raw.data = ibs_data.data;
+ data.raw = &raw;
+ }
+
+ throttle = perf_event_overflow(event, &data, &regs);
+out:
+ if (throttle)
+ perf_ibs_disable_event(perf_ibs, hwc, *config);
+ else
+ perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+ perf_event_update_userpage(event);
+
+ return 1;
+}
+
+static int
+perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ int handled = 0;
+
+ handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
+ handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ return handled;
+}
+NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
+
+static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
+{
+ struct cpu_perf_ibs __percpu *pcpu;
+ int ret;
+
+ pcpu = alloc_percpu(struct cpu_perf_ibs);
+ if (!pcpu)
+ return -ENOMEM;
+
+ perf_ibs->pcpu = pcpu;
+
+ /* register attributes */
+ if (perf_ibs->format_attrs[0]) {
+ memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
+ perf_ibs->format_group.name = "format";
+ perf_ibs->format_group.attrs = perf_ibs->format_attrs;
+
+ memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
+ perf_ibs->attr_groups[0] = &perf_ibs->format_group;
+ perf_ibs->pmu.attr_groups = perf_ibs->attr_groups;
+ }
+
+ ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
+ if (ret) {
+ perf_ibs->pcpu = NULL;
+ free_percpu(pcpu);
+ }
+
+ return ret;
+}
+
static __init int perf_event_ibs_init(void)
{
+ struct attribute **attr = ibs_op_format_attrs;
+
if (!ibs_caps)
return -ENODEV; /* ibs not supported by the cpu */
- perf_pmu_register(&perf_ibs, "ibs", -1);
+ perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+
+ if (ibs_caps & IBS_CAPS_OPCNT) {
+ perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
+ *attr++ = &format_attr_cnt_ctl.attr;
+ }
+ perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+
+ register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
return 0;
@@ -209,6 +818,18 @@ out:
return ret;
}
+static void ibs_eilvt_setup(void)
+{
+ /*
+ * Force LVT offset assignment for family 10h: The offsets are
+ * not assigned by the BIOS for this family, so the OS is
+ * responsible for doing it. If the OS assignment fails, fall
+ * back to BIOS settings and try to setup this.
+ */
+ if (boot_cpu_data.x86 == 0x10)
+ force_ibs_eilvt_setup();
+}
+
static inline int get_ibs_lvt_offset(void)
{
u64 val;
@@ -244,7 +865,37 @@ static void clear_APIC_ibs(void *dummy)
setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
}
-static int __cpuinit
+#ifdef CONFIG_PM
+
+static int perf_ibs_suspend(void)
+{
+ clear_APIC_ibs(NULL);
+ return 0;
+}
+
+static void perf_ibs_resume(void)
+{
+ ibs_eilvt_setup();
+ setup_APIC_ibs(NULL);
+}
+
+static struct syscore_ops perf_ibs_syscore_ops = {
+ .resume = perf_ibs_resume,
+ .suspend = perf_ibs_suspend,
+};
+
+static void perf_ibs_pm_init(void)
+{
+ register_syscore_ops(&perf_ibs_syscore_ops);
+}
+
+#else
+
+static inline void perf_ibs_pm_init(void) { }
+
+#endif
+
+static int
perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
@@ -270,25 +921,19 @@ static __init int amd_ibs_init(void)
if (!caps)
return -ENODEV; /* ibs not supported by the cpu */
- /*
- * Force LVT offset assignment for family 10h: The offsets are
- * not assigned by the BIOS for this family, so the OS is
- * responsible for doing it. If the OS assignment fails, fall
- * back to BIOS settings and try to setup this.
- */
- if (boot_cpu_data.x86 == 0x10)
- force_ibs_eilvt_setup();
+ ibs_eilvt_setup();
if (!ibs_eilvt_valid())
goto out;
- get_online_cpus();
+ perf_ibs_pm_init();
+ cpu_notifier_register_begin();
ibs_caps = caps;
/* make ibs_caps visible to other cpus: */
smp_mb();
- perf_cpu_notifier(perf_ibs_cpu_notifier);
smp_call_function(setup_APIC_ibs, NULL, 1);
- put_online_cpus();
+ __perf_cpu_notifier(perf_ibs_cpu_notifier);
+ cpu_notifier_register_done();
ret = perf_event_ibs_init();
out:
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
new file mode 100644
index 00000000000..639d1289b1b
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
@@ -0,0 +1,502 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ *
+ * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <linux/slab.h>
+
+#include "perf_event.h"
+#include "perf_event_amd_iommu.h"
+
+#define COUNTER_SHIFT 16
+
+#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8))
+#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg))
+
+/* iommu pmu config masks */
+#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL))
+#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL)
+#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL)
+#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL)
+#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL)
+#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
+#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
+
+static struct perf_amd_iommu __perf_iommu;
+
+struct perf_amd_iommu {
+ struct pmu pmu;
+ u8 max_banks;
+ u8 max_counters;
+ u64 cntr_assign_mask;
+ raw_spinlock_t lock;
+ const struct attribute_group *attr_groups[4];
+};
+
+#define format_group attr_groups[0]
+#define cpumask_group attr_groups[1]
+#define events_group attr_groups[2]
+#define null_group attr_groups[3]
+
+/*---------------------------------------------
+ * sysfs format attributes
+ *---------------------------------------------*/
+PMU_FORMAT_ATTR(csource, "config:0-7");
+PMU_FORMAT_ATTR(devid, "config:8-23");
+PMU_FORMAT_ATTR(pasid, "config:24-39");
+PMU_FORMAT_ATTR(domid, "config:40-55");
+PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
+PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
+PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
+
+static struct attribute *iommu_format_attrs[] = {
+ &format_attr_csource.attr,
+ &format_attr_devid.attr,
+ &format_attr_pasid.attr,
+ &format_attr_domid.attr,
+ &format_attr_devid_mask.attr,
+ &format_attr_pasid_mask.attr,
+ &format_attr_domid_mask.attr,
+ NULL,
+};
+
+static struct attribute_group amd_iommu_format_group = {
+ .name = "format",
+ .attrs = iommu_format_attrs,
+};
+
+/*---------------------------------------------
+ * sysfs events attributes
+ *---------------------------------------------*/
+struct amd_iommu_event_desc {
+ struct kobj_attribute attr;
+ const char *event;
+};
+
+static ssize_t _iommu_event_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct amd_iommu_event_desc *event =
+ container_of(attr, struct amd_iommu_event_desc, attr);
+ return sprintf(buf, "%s\n", event->event);
+}
+
+#define AMD_IOMMU_EVENT_DESC(_name, _event) \
+{ \
+ .attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \
+ .event = _event, \
+}
+
+static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
+ AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"),
+ AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"),
+ AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"),
+ AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"),
+ AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"),
+ AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"),
+ AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"),
+ AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"),
+ AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"),
+ AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"),
+ AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"),
+ AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"),
+ AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"),
+ AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"),
+ AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"),
+ AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"),
+ AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"),
+ AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"),
+ AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"),
+ { /* end: all zeroes */ },
+};
+
+/*---------------------------------------------
+ * sysfs cpumask attributes
+ *---------------------------------------------*/
+static cpumask_t iommu_cpumask;
+
+static ssize_t _iommu_cpumask_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &iommu_cpumask);
+ buf[n++] = '\n';
+ buf[n] = '\0';
+ return n;
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
+
+static struct attribute *iommu_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group amd_iommu_cpumask_group = {
+ .attrs = iommu_cpumask_attrs,
+};
+
+/*---------------------------------------------*/
+
+static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
+{
+ unsigned long flags;
+ int shift, bank, cntr, retval;
+ int max_banks = perf_iommu->max_banks;
+ int max_cntrs = perf_iommu->max_counters;
+
+ raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+
+ for (bank = 0, shift = 0; bank < max_banks; bank++) {
+ for (cntr = 0; cntr < max_cntrs; cntr++) {
+ shift = bank + (bank*3) + cntr;
+ if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
+ continue;
+ } else {
+ perf_iommu->cntr_assign_mask |= (1ULL<<shift);
+ retval = ((u16)((u16)bank<<8) | (u8)(cntr));
+ goto out;
+ }
+ }
+ }
+ retval = -ENOSPC;
+out:
+ raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+ return retval;
+}
+
+static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
+ u8 bank, u8 cntr)
+{
+ unsigned long flags;
+ int max_banks, max_cntrs;
+ int shift = 0;
+
+ max_banks = perf_iommu->max_banks;
+ max_cntrs = perf_iommu->max_counters;
+
+ if ((bank > max_banks) || (cntr > max_cntrs))
+ return -EINVAL;
+
+ shift = bank + cntr + (bank*3);
+
+ raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+ perf_iommu->cntr_assign_mask &= ~(1ULL<<shift);
+ raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+
+ return 0;
+}
+
+static int perf_iommu_event_init(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_amd_iommu *perf_iommu;
+ u64 config, config1;
+
+ /* test the event attr type check for PMU enumeration */
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /*
+ * IOMMU counters are shared across all cores.
+ * Therefore, it does not support per-process mode.
+ * Also, it does not support event sampling mode.
+ */
+ if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+ return -EINVAL;
+
+ /* IOMMU counters do not have usr/os/guest/host bits */
+ if (event->attr.exclude_user || event->attr.exclude_kernel ||
+ event->attr.exclude_host || event->attr.exclude_guest)
+ return -EINVAL;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ perf_iommu = &__perf_iommu;
+
+ if (event->pmu != &perf_iommu->pmu)
+ return -ENOENT;
+
+ if (perf_iommu) {
+ config = event->attr.config;
+ config1 = event->attr.config1;
+ } else {
+ return -EINVAL;
+ }
+
+ /* integrate with iommu base devid (0000), assume one iommu */
+ perf_iommu->max_banks =
+ amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
+ perf_iommu->max_counters =
+ amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
+ if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
+ return -EINVAL;
+
+ /* update the hw_perf_event struct with the iommu config data */
+ hwc->config = config;
+ hwc->extra_reg.config = config1;
+
+ return 0;
+}
+
+static void perf_iommu_enable_event(struct perf_event *ev)
+{
+ u8 csource = _GET_CSOURCE(ev);
+ u16 devid = _GET_DEVID(ev);
+ u64 reg = 0ULL;
+
+ reg = csource;
+ amd_iommu_pc_get_set_reg_val(devid,
+ _GET_BANK(ev), _GET_CNTR(ev) ,
+ IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+
+ reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
+ if (reg)
+ reg |= (1UL << 31);
+ amd_iommu_pc_get_set_reg_val(devid,
+ _GET_BANK(ev), _GET_CNTR(ev) ,
+ IOMMU_PC_DEVID_MATCH_REG, &reg, true);
+
+ reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
+ if (reg)
+ reg |= (1UL << 31);
+ amd_iommu_pc_get_set_reg_val(devid,
+ _GET_BANK(ev), _GET_CNTR(ev) ,
+ IOMMU_PC_PASID_MATCH_REG, &reg, true);
+
+ reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
+ if (reg)
+ reg |= (1UL << 31);
+ amd_iommu_pc_get_set_reg_val(devid,
+ _GET_BANK(ev), _GET_CNTR(ev) ,
+ IOMMU_PC_DOMID_MATCH_REG, &reg, true);
+}
+
+static void perf_iommu_disable_event(struct perf_event *event)
+{
+ u64 reg = 0ULL;
+
+ amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+ _GET_BANK(event), _GET_CNTR(event),
+ IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+}
+
+static void perf_iommu_start(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ pr_debug("perf: amd_iommu:perf_iommu_start\n");
+ if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ return;
+
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+ hwc->state = 0;
+
+ if (flags & PERF_EF_RELOAD) {
+ u64 prev_raw_count = local64_read(&hwc->prev_count);
+ amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+ _GET_BANK(event), _GET_CNTR(event),
+ IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
+ }
+
+ perf_iommu_enable_event(event);
+ perf_event_update_userpage(event);
+
+}
+
+static void perf_iommu_read(struct perf_event *event)
+{
+ u64 count = 0ULL;
+ u64 prev_raw_count = 0ULL;
+ u64 delta = 0ULL;
+ struct hw_perf_event *hwc = &event->hw;
+ pr_debug("perf: amd_iommu:perf_iommu_read\n");
+
+ amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+ _GET_BANK(event), _GET_CNTR(event),
+ IOMMU_PC_COUNTER_REG, &count, false);
+
+ /* IOMMU pc counter register is only 48 bits */
+ count &= 0xFFFFFFFFFFFFULL;
+
+ prev_raw_count = local64_read(&hwc->prev_count);
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ count) != prev_raw_count)
+ return;
+
+ /* Handling 48-bit counter overflowing */
+ delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
+ delta >>= COUNTER_SHIFT;
+ local64_add(delta, &event->count);
+
+}
+
+static void perf_iommu_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 config;
+
+ pr_debug("perf: amd_iommu:perf_iommu_stop\n");
+
+ if (hwc->state & PERF_HES_UPTODATE)
+ return;
+
+ perf_iommu_disable_event(event);
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+
+ if (hwc->state & PERF_HES_UPTODATE)
+ return;
+
+ config = hwc->config;
+ perf_iommu_read(event);
+ hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int perf_iommu_add(struct perf_event *event, int flags)
+{
+ int retval;
+ struct perf_amd_iommu *perf_iommu =
+ container_of(event->pmu, struct perf_amd_iommu, pmu);
+
+ pr_debug("perf: amd_iommu:perf_iommu_add\n");
+ event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ /* request an iommu bank/counter */
+ retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
+ if (retval != -ENOSPC)
+ event->hw.extra_reg.reg = (u16)retval;
+ else
+ return retval;
+
+ if (flags & PERF_EF_START)
+ perf_iommu_start(event, PERF_EF_RELOAD);
+
+ return 0;
+}
+
+static void perf_iommu_del(struct perf_event *event, int flags)
+{
+ struct perf_amd_iommu *perf_iommu =
+ container_of(event->pmu, struct perf_amd_iommu, pmu);
+
+ pr_debug("perf: amd_iommu:perf_iommu_del\n");
+ perf_iommu_stop(event, PERF_EF_UPDATE);
+
+ /* clear the assigned iommu bank/counter */
+ clear_avail_iommu_bnk_cntr(perf_iommu,
+ _GET_BANK(event),
+ _GET_CNTR(event));
+
+ perf_event_update_userpage(event);
+}
+
+static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
+{
+ struct attribute **attrs;
+ struct attribute_group *attr_group;
+ int i = 0, j;
+
+ while (amd_iommu_v2_event_descs[i].attr.attr.name)
+ i++;
+
+ attr_group = kzalloc(sizeof(struct attribute *)
+ * (i + 1) + sizeof(*attr_group), GFP_KERNEL);
+ if (!attr_group)
+ return -ENOMEM;
+
+ attrs = (struct attribute **)(attr_group + 1);
+ for (j = 0; j < i; j++)
+ attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
+
+ attr_group->name = "events";
+ attr_group->attrs = attrs;
+ perf_iommu->events_group = attr_group;
+
+ return 0;
+}
+
+static __init void amd_iommu_pc_exit(void)
+{
+ if (__perf_iommu.events_group != NULL) {
+ kfree(__perf_iommu.events_group);
+ __perf_iommu.events_group = NULL;
+ }
+}
+
+static __init int _init_perf_amd_iommu(
+ struct perf_amd_iommu *perf_iommu, char *name)
+{
+ int ret;
+
+ raw_spin_lock_init(&perf_iommu->lock);
+
+ /* Init format attributes */
+ perf_iommu->format_group = &amd_iommu_format_group;
+
+ /* Init cpumask attributes to only core 0 */
+ cpumask_set_cpu(0, &iommu_cpumask);
+ perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
+
+ /* Init events attributes */
+ if (_init_events_attrs(perf_iommu) != 0)
+ pr_err("perf: amd_iommu: Only support raw events.\n");
+
+ /* Init null attributes */
+ perf_iommu->null_group = NULL;
+ perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
+
+ ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
+ if (ret) {
+ pr_err("perf: amd_iommu: Failed to initialized.\n");
+ amd_iommu_pc_exit();
+ } else {
+ pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
+ amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
+ amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
+ }
+
+ return ret;
+}
+
+static struct perf_amd_iommu __perf_iommu = {
+ .pmu = {
+ .event_init = perf_iommu_event_init,
+ .add = perf_iommu_add,
+ .del = perf_iommu_del,
+ .start = perf_iommu_start,
+ .stop = perf_iommu_stop,
+ .read = perf_iommu_read,
+ },
+ .max_banks = 0x00,
+ .max_counters = 0x00,
+ .cntr_assign_mask = 0ULL,
+ .format_group = NULL,
+ .cpumask_group = NULL,
+ .events_group = NULL,
+ .null_group = NULL,
+};
+
+static __init int amd_iommu_pc_init(void)
+{
+ /* Make sure the IOMMU PC resource is available */
+ if (!amd_iommu_pc_supported())
+ return -ENODEV;
+
+ _init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
+
+ return 0;
+}
+
+device_initcall(amd_iommu_pc_init);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.h b/arch/x86/kernel/cpu/perf_event_amd_iommu.h
new file mode 100644
index 00000000000..845d173278e
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _PERF_EVENT_AMD_IOMMU_H_
+#define _PERF_EVENT_AMD_IOMMU_H_
+
+/* iommu pc mmio region register indexes */
+#define IOMMU_PC_COUNTER_REG 0x00
+#define IOMMU_PC_COUNTER_SRC_REG 0x08
+#define IOMMU_PC_PASID_MATCH_REG 0x10
+#define IOMMU_PC_DOMID_MATCH_REG 0x18
+#define IOMMU_PC_DEVID_MATCH_REG 0x20
+#define IOMMU_PC_COUNTER_REPORT_REG 0x28
+
+/* maximun specified bank/counters */
+#define PC_MAX_SPEC_BNKS 64
+#define PC_MAX_SPEC_CNTRS 16
+
+/* iommu pc reg masks*/
+#define IOMMU_BASE_DEVID 0x0000
+
+/* amd_iommu_init.c external support functions */
+extern bool amd_iommu_pc_supported(void);
+
+extern u8 amd_iommu_pc_get_max_banks(u16 devid);
+
+extern u8 amd_iommu_pc_get_max_counters(u16 devid);
+
+extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
+ u8 fxn, u64 *value, bool is_write);
+
+#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
new file mode 100644
index 00000000000..3bbdf4cd38b
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/percpu.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+
+#include <asm/cpufeature.h>
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+
+#define NUM_COUNTERS_NB 4
+#define NUM_COUNTERS_L2 4
+#define MAX_COUNTERS NUM_COUNTERS_NB
+
+#define RDPMC_BASE_NB 6
+#define RDPMC_BASE_L2 10
+
+#define COUNTER_SHIFT 16
+
+struct amd_uncore {
+ int id;
+ int refcnt;
+ int cpu;
+ int num_counters;
+ int rdpmc_base;
+ u32 msr_base;
+ cpumask_t *active_mask;
+ struct pmu *pmu;
+ struct perf_event *events[MAX_COUNTERS];
+ struct amd_uncore *free_when_cpu_online;
+};
+
+static struct amd_uncore * __percpu *amd_uncore_nb;
+static struct amd_uncore * __percpu *amd_uncore_l2;
+
+static struct pmu amd_nb_pmu;
+static struct pmu amd_l2_pmu;
+
+static cpumask_t amd_nb_active_mask;
+static cpumask_t amd_l2_active_mask;
+
+static bool is_nb_event(struct perf_event *event)
+{
+ return event->pmu->type == amd_nb_pmu.type;
+}
+
+static bool is_l2_event(struct perf_event *event)
+{
+ return event->pmu->type == amd_l2_pmu.type;
+}
+
+static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
+{
+ if (is_nb_event(event) && amd_uncore_nb)
+ return *per_cpu_ptr(amd_uncore_nb, event->cpu);
+ else if (is_l2_event(event) && amd_uncore_l2)
+ return *per_cpu_ptr(amd_uncore_l2, event->cpu);
+
+ return NULL;
+}
+
+static void amd_uncore_read(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev, new;
+ s64 delta;
+
+ /*
+ * since we do not enable counter overflow interrupts,
+ * we do not have to worry about prev_count changing on us
+ */
+
+ prev = local64_read(&hwc->prev_count);
+ rdpmcl(hwc->event_base_rdpmc, new);
+ local64_set(&hwc->prev_count, new);
+ delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
+ delta >>= COUNTER_SHIFT;
+ local64_add(delta, &event->count);
+}
+
+static void amd_uncore_start(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (flags & PERF_EF_RELOAD)
+ wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count));
+
+ hwc->state = 0;
+ wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE));
+ perf_event_update_userpage(event);
+}
+
+static void amd_uncore_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, hwc->config);
+ hwc->state |= PERF_HES_STOPPED;
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ amd_uncore_read(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static int amd_uncore_add(struct perf_event *event, int flags)
+{
+ int i;
+ struct amd_uncore *uncore = event_to_amd_uncore(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* are we already assigned? */
+ if (hwc->idx != -1 && uncore->events[hwc->idx] == event)
+ goto out;
+
+ for (i = 0; i < uncore->num_counters; i++) {
+ if (uncore->events[i] == event) {
+ hwc->idx = i;
+ goto out;
+ }
+ }
+
+ /* if not, take the first available counter */
+ hwc->idx = -1;
+ for (i = 0; i < uncore->num_counters; i++) {
+ if (cmpxchg(&uncore->events[i], NULL, event) == NULL) {
+ hwc->idx = i;
+ break;
+ }
+ }
+
+out:
+ if (hwc->idx == -1)
+ return -EBUSY;
+
+ hwc->config_base = uncore->msr_base + (2 * hwc->idx);
+ hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx);
+ hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (flags & PERF_EF_START)
+ amd_uncore_start(event, PERF_EF_RELOAD);
+
+ return 0;
+}
+
+static void amd_uncore_del(struct perf_event *event, int flags)
+{
+ int i;
+ struct amd_uncore *uncore = event_to_amd_uncore(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ amd_uncore_stop(event, PERF_EF_UPDATE);
+
+ for (i = 0; i < uncore->num_counters; i++) {
+ if (cmpxchg(&uncore->events[i], event, NULL) == event)
+ break;
+ }
+
+ hwc->idx = -1;
+}
+
+static int amd_uncore_event_init(struct perf_event *event)
+{
+ struct amd_uncore *uncore;
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /*
+ * NB and L2 counters (MSRs) are shared across all cores that share the
+ * same NB / L2 cache. Interrupts can be directed to a single target
+ * core, however, event counts generated by processes running on other
+ * cores cannot be masked out. So we do not support sampling and
+ * per-thread events.
+ */
+ if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+ return -EINVAL;
+
+ /* NB and L2 counters do not have usr/os/guest/host bits */
+ if (event->attr.exclude_user || event->attr.exclude_kernel ||
+ event->attr.exclude_host || event->attr.exclude_guest)
+ return -EINVAL;
+
+ /* and we do not enable counter overflow interrupts */
+ hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
+ hwc->idx = -1;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ uncore = event_to_amd_uncore(event);
+ if (!uncore)
+ return -ENODEV;
+
+ /*
+ * since request can come in to any of the shared cores, we will remap
+ * to a single common cpu.
+ */
+ event->cpu = uncore->cpu;
+
+ return 0;
+}
+
+static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ int n;
+ cpumask_t *active_mask;
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ if (pmu->type == amd_nb_pmu.type)
+ active_mask = &amd_nb_active_mask;
+ else if (pmu->type == amd_l2_pmu.type)
+ active_mask = &amd_l2_active_mask;
+ else
+ return 0;
+
+ n = cpulist_scnprintf(buf, PAGE_SIZE - 2, active_mask);
+ buf[n++] = '\n';
+ buf[n] = '\0';
+ return n;
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
+
+static struct attribute *amd_uncore_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group amd_uncore_attr_group = {
+ .attrs = amd_uncore_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7,32-35");
+PMU_FORMAT_ATTR(umask, "config:8-15");
+
+static struct attribute *amd_uncore_format_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ NULL,
+};
+
+static struct attribute_group amd_uncore_format_group = {
+ .name = "format",
+ .attrs = amd_uncore_format_attr,
+};
+
+static const struct attribute_group *amd_uncore_attr_groups[] = {
+ &amd_uncore_attr_group,
+ &amd_uncore_format_group,
+ NULL,
+};
+
+static struct pmu amd_nb_pmu = {
+ .attr_groups = amd_uncore_attr_groups,
+ .name = "amd_nb",
+ .event_init = amd_uncore_event_init,
+ .add = amd_uncore_add,
+ .del = amd_uncore_del,
+ .start = amd_uncore_start,
+ .stop = amd_uncore_stop,
+ .read = amd_uncore_read,
+};
+
+static struct pmu amd_l2_pmu = {
+ .attr_groups = amd_uncore_attr_groups,
+ .name = "amd_l2",
+ .event_init = amd_uncore_event_init,
+ .add = amd_uncore_add,
+ .del = amd_uncore_del,
+ .start = amd_uncore_start,
+ .stop = amd_uncore_stop,
+ .read = amd_uncore_read,
+};
+
+static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
+{
+ return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL,
+ cpu_to_node(cpu));
+}
+
+static void amd_uncore_cpu_up_prepare(unsigned int cpu)
+{
+ struct amd_uncore *uncore;
+
+ if (amd_uncore_nb) {
+ uncore = amd_uncore_alloc(cpu);
+ uncore->cpu = cpu;
+ uncore->num_counters = NUM_COUNTERS_NB;
+ uncore->rdpmc_base = RDPMC_BASE_NB;
+ uncore->msr_base = MSR_F15H_NB_PERF_CTL;
+ uncore->active_mask = &amd_nb_active_mask;
+ uncore->pmu = &amd_nb_pmu;
+ *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
+ }
+
+ if (amd_uncore_l2) {
+ uncore = amd_uncore_alloc(cpu);
+ uncore->cpu = cpu;
+ uncore->num_counters = NUM_COUNTERS_L2;
+ uncore->rdpmc_base = RDPMC_BASE_L2;
+ uncore->msr_base = MSR_F16H_L2I_PERF_CTL;
+ uncore->active_mask = &amd_l2_active_mask;
+ uncore->pmu = &amd_l2_pmu;
+ *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
+ }
+}
+
+static struct amd_uncore *
+amd_uncore_find_online_sibling(struct amd_uncore *this,
+ struct amd_uncore * __percpu *uncores)
+{
+ unsigned int cpu;
+ struct amd_uncore *that;
+
+ for_each_online_cpu(cpu) {
+ that = *per_cpu_ptr(uncores, cpu);
+
+ if (!that)
+ continue;
+
+ if (this == that)
+ continue;
+
+ if (this->id == that->id) {
+ that->free_when_cpu_online = this;
+ this = that;
+ break;
+ }
+ }
+
+ this->refcnt++;
+ return this;
+}
+
+static void amd_uncore_cpu_starting(unsigned int cpu)
+{
+ unsigned int eax, ebx, ecx, edx;
+ struct amd_uncore *uncore;
+
+ if (amd_uncore_nb) {
+ uncore = *per_cpu_ptr(amd_uncore_nb, cpu);
+ cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+ uncore->id = ecx & 0xff;
+
+ uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb);
+ *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
+ }
+
+ if (amd_uncore_l2) {
+ unsigned int apicid = cpu_data(cpu).apicid;
+ unsigned int nshared;
+
+ uncore = *per_cpu_ptr(amd_uncore_l2, cpu);
+ cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
+ nshared = ((eax >> 14) & 0xfff) + 1;
+ uncore->id = apicid - (apicid % nshared);
+
+ uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2);
+ *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
+ }
+}
+
+static void uncore_online(unsigned int cpu,
+ struct amd_uncore * __percpu *uncores)
+{
+ struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+
+ kfree(uncore->free_when_cpu_online);
+ uncore->free_when_cpu_online = NULL;
+
+ if (cpu == uncore->cpu)
+ cpumask_set_cpu(cpu, uncore->active_mask);
+}
+
+static void amd_uncore_cpu_online(unsigned int cpu)
+{
+ if (amd_uncore_nb)
+ uncore_online(cpu, amd_uncore_nb);
+
+ if (amd_uncore_l2)
+ uncore_online(cpu, amd_uncore_l2);
+}
+
+static void uncore_down_prepare(unsigned int cpu,
+ struct amd_uncore * __percpu *uncores)
+{
+ unsigned int i;
+ struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
+
+ if (this->cpu != cpu)
+ return;
+
+ /* this cpu is going down, migrate to a shared sibling if possible */
+ for_each_online_cpu(i) {
+ struct amd_uncore *that = *per_cpu_ptr(uncores, i);
+
+ if (cpu == i)
+ continue;
+
+ if (this == that) {
+ perf_pmu_migrate_context(this->pmu, cpu, i);
+ cpumask_clear_cpu(cpu, that->active_mask);
+ cpumask_set_cpu(i, that->active_mask);
+ that->cpu = i;
+ break;
+ }
+ }
+}
+
+static void amd_uncore_cpu_down_prepare(unsigned int cpu)
+{
+ if (amd_uncore_nb)
+ uncore_down_prepare(cpu, amd_uncore_nb);
+
+ if (amd_uncore_l2)
+ uncore_down_prepare(cpu, amd_uncore_l2);
+}
+
+static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
+{
+ struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+
+ if (cpu == uncore->cpu)
+ cpumask_clear_cpu(cpu, uncore->active_mask);
+
+ if (!--uncore->refcnt)
+ kfree(uncore);
+ *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
+}
+
+static void amd_uncore_cpu_dead(unsigned int cpu)
+{
+ if (amd_uncore_nb)
+ uncore_dead(cpu, amd_uncore_nb);
+
+ if (amd_uncore_l2)
+ uncore_dead(cpu, amd_uncore_l2);
+}
+
+static int
+amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
+ void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ amd_uncore_cpu_up_prepare(cpu);
+ break;
+
+ case CPU_STARTING:
+ amd_uncore_cpu_starting(cpu);
+ break;
+
+ case CPU_ONLINE:
+ amd_uncore_cpu_online(cpu);
+ break;
+
+ case CPU_DOWN_PREPARE:
+ amd_uncore_cpu_down_prepare(cpu);
+ break;
+
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+ amd_uncore_cpu_dead(cpu);
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block amd_uncore_cpu_notifier_block = {
+ .notifier_call = amd_uncore_cpu_notifier,
+ .priority = CPU_PRI_PERF + 1,
+};
+
+static void __init init_cpu_already_online(void *dummy)
+{
+ unsigned int cpu = smp_processor_id();
+
+ amd_uncore_cpu_starting(cpu);
+ amd_uncore_cpu_online(cpu);
+}
+
+static int __init amd_uncore_init(void)
+{
+ unsigned int cpu;
+ int ret = -ENODEV;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return -ENODEV;
+
+ if (!cpu_has_topoext)
+ return -ENODEV;
+
+ if (cpu_has_perfctr_nb) {
+ amd_uncore_nb = alloc_percpu(struct amd_uncore *);
+ perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+
+ printk(KERN_INFO "perf: AMD NB counters detected\n");
+ ret = 0;
+ }
+
+ if (cpu_has_perfctr_l2) {
+ amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
+ perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+
+ printk(KERN_INFO "perf: AMD L2I counters detected\n");
+ ret = 0;
+ }
+
+ if (ret)
+ return -ENODEV;
+
+ cpu_notifier_register_begin();
+
+ /* init cpus already online before registering for hotplug notifier */
+ for_each_online_cpu(cpu) {
+ amd_uncore_cpu_up_prepare(cpu);
+ smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
+ }
+
+ __register_cpu_notifier(&amd_uncore_cpu_notifier_block);
+ cpu_notifier_register_done();
+
+ return 0;
+}
+device_initcall(amd_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3bd37bdf1b8..2502d0d9d24 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -5,12 +5,15 @@
* among events on a single PMU.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <asm/cpufeature.h>
#include <asm/hardirq.h>
#include <asm/apic.h>
@@ -21,14 +24,14 @@
*/
static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
- [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
- [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
+ [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
};
static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -78,7 +81,9 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
{
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
EVENT_EXTRA_END
};
@@ -99,16 +104,51 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+ INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */
+ INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
+ INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ /*
+ * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT
+ * siblings; disable these events because they can corrupt unrelated
+ * counters.
+ */
+ INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
EVENT_CONSTRAINT_END
};
static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
{
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
- INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
EVENT_EXTRA_END
};
@@ -125,17 +165,144 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
EVENT_CONSTRAINT_END
};
+static struct event_constraint intel_slm_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
+ EVENT_CONSTRAINT_END
+};
+
static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
- INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
EVENT_EXTRA_END
};
+static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ EVENT_EXTRA_END
+};
+
+EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
+EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
+
+struct attribute *nhm_events_attrs[] = {
+ EVENT_PTR(mem_ld_nhm),
+ NULL,
+};
+
+struct attribute *snb_events_attrs[] = {
+ EVENT_PTR(mem_ld_snb),
+ EVENT_PTR(mem_st_snb),
+ NULL,
+};
+
+static struct event_constraint intel_hsw_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */
+ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+ /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+ INTEL_EVENT_CONSTRAINT(0x08a3, 0x4),
+ /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+ INTEL_EVENT_CONSTRAINT(0x0ca3, 0x4),
+ /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
+ INTEL_EVENT_CONSTRAINT(0x04a3, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
static u64 intel_pmu_event_map(int hw_event)
{
return intel_perfmon_event_map[hw_event];
}
+#define SNB_DMND_DATA_RD (1ULL << 0)
+#define SNB_DMND_RFO (1ULL << 1)
+#define SNB_DMND_IFETCH (1ULL << 2)
+#define SNB_DMND_WB (1ULL << 3)
+#define SNB_PF_DATA_RD (1ULL << 4)
+#define SNB_PF_RFO (1ULL << 5)
+#define SNB_PF_IFETCH (1ULL << 6)
+#define SNB_LLC_DATA_RD (1ULL << 7)
+#define SNB_LLC_RFO (1ULL << 8)
+#define SNB_LLC_IFETCH (1ULL << 9)
+#define SNB_BUS_LOCKS (1ULL << 10)
+#define SNB_STRM_ST (1ULL << 11)
+#define SNB_OTHER (1ULL << 15)
+#define SNB_RESP_ANY (1ULL << 16)
+#define SNB_NO_SUPP (1ULL << 17)
+#define SNB_LLC_HITM (1ULL << 18)
+#define SNB_LLC_HITE (1ULL << 19)
+#define SNB_LLC_HITS (1ULL << 20)
+#define SNB_LLC_HITF (1ULL << 21)
+#define SNB_LOCAL (1ULL << 22)
+#define SNB_REMOTE (0xffULL << 23)
+#define SNB_SNP_NONE (1ULL << 31)
+#define SNB_SNP_NOT_NEEDED (1ULL << 32)
+#define SNB_SNP_MISS (1ULL << 33)
+#define SNB_NO_FWD (1ULL << 34)
+#define SNB_SNP_FWD (1ULL << 35)
+#define SNB_HITM (1ULL << 36)
+#define SNB_NON_DRAM (1ULL << 37)
+
+#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
+#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO)
+#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
+
+#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
+ SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
+ SNB_HITM)
+
+#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
+#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY)
+
+#define SNB_L3_ACCESS SNB_RESP_ANY
+#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM)
+
+static __initconst const u64 snb_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
+ [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
+ [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
+ [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
+ [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
+ [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
+ [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
+ },
+ },
+};
+
static __initconst const u64 snb_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -233,16 +400,16 @@ static __initconst const u64 snb_hw_cache_event_ids
},
[ C(NODE) ] = {
[ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
},
[ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
},
[ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
},
},
@@ -385,14 +552,15 @@ static __initconst const u64 westmere_hw_cache_event_ids
#define NHM_LOCAL_DRAM (1 << 14)
#define NHM_NON_DRAM (1 << 15)
-#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
+#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_REMOTE (NHM_REMOTE_DRAM)
#define NHM_DMND_READ (NHM_DMND_DATA_RD)
#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
-#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
static __initconst const u64 nehalem_hw_cache_extra_regs
@@ -416,16 +584,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
},
[ C(NODE) ] = {
[ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
- [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
+ [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
+ [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE,
},
[ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
- [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
+ [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
+ [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE,
},
[ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
- [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
+ [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
+ [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE,
},
},
};
@@ -727,13 +895,161 @@ static __initconst const u64 atom_hw_cache_event_ids
},
};
+static struct extra_reg intel_slm_extra_regs[] __read_mostly =
+{
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1),
+ EVENT_EXTRA_END
+};
+
+#define SLM_DMND_READ SNB_DMND_DATA_RD
+#define SLM_DMND_WRITE SNB_DMND_RFO
+#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
+
+#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM)
+#define SLM_LLC_ACCESS SNB_RESP_ANY
+#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM)
+
+static __initconst const u64 slm_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = SLM_DMND_READ|SLM_LLC_MISS,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS,
+ },
+ },
+};
+
+static __initconst const u64 slm_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */
+ [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+{
+ /* user explicitly requested branch sampling */
+ if (has_branch_stack(event))
+ return true;
+
+ /* implicit branch sampling to correct PEBS skid */
+ if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
+ x86_pmu.intel_cap.pebs_format < 2)
+ return true;
+
+ return false;
+}
+
static void intel_pmu_disable_all(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
- if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
intel_pmu_disable_bts();
intel_pmu_pebs_disable_all();
@@ -749,9 +1065,9 @@ static void intel_pmu_enable_all(int added)
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
- if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+ if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
struct perf_event *event =
- cpuc->events[X86_PMC_IDX_FIXED_BTS];
+ cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
if (WARN_ON_ONCE(!event))
return;
@@ -857,7 +1173,7 @@ static inline void intel_pmu_ack_status(u64 ack)
static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
{
- int idx = hwc->idx - X86_PMC_IDX_FIXED;
+ int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
u64 ctrl_val, mask;
mask = 0xfULL << (idx * 4);
@@ -867,12 +1183,17 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
wrmsrl(hwc->config_base, ctrl_val);
}
+static inline bool event_is_checkpointed(struct perf_event *event)
+{
+ return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
+}
+
static void intel_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
+ if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
intel_pmu_disable_bts();
intel_pmu_drain_bts_buffer();
return;
@@ -880,6 +1201,14 @@ static void intel_pmu_disable_event(struct perf_event *event)
cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
+ cpuc->intel_cp_status &= ~(1ull << hwc->idx);
+
+ /*
+ * must disable before any actual event
+ * because any event may be combined with LBR
+ */
+ if (intel_pmu_needs_lbr_smpl(event))
+ intel_pmu_lbr_disable(event);
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_disable_fixed(hwc);
@@ -894,7 +1223,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
{
- int idx = hwc->idx - X86_PMC_IDX_FIXED;
+ int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
u64 ctrl_val, bits, mask;
/*
@@ -928,19 +1257,28 @@ static void intel_pmu_enable_event(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
+ if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
if (!__this_cpu_read(cpu_hw_events.enabled))
return;
intel_pmu_enable_bts(hwc->config);
return;
}
+ /*
+ * must enabled before any actual event
+ * because any event may be combined with LBR
+ */
+ if (intel_pmu_needs_lbr_smpl(event))
+ intel_pmu_lbr_enable(event);
if (event->attr.exclude_host)
cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
if (event->attr.exclude_guest)
cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
+ if (unlikely(event_is_checkpointed(event)))
+ cpuc->intel_cp_status |= (1ull << hwc->idx);
+
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_enable_fixed(hwc);
return;
@@ -959,6 +1297,17 @@ static void intel_pmu_enable_event(struct perf_event *event)
int intel_pmu_save_and_restart(struct perf_event *event)
{
x86_perf_event_update(event);
+ /*
+ * For a checkpointed counter always reset back to 0. This
+ * avoids a situation where the counter overflows, aborts the
+ * transaction and is then set back to shortly before the
+ * overflow, and overflows and aborts again.
+ */
+ if (unlikely(event_is_checkpointed(event))) {
+ /* No race with NMIs because the counter should not be armed */
+ wrmsrl(event->hw.event_base, 0);
+ local64_set(&event->hw.prev_count, 0);
+ }
return x86_perf_event_set_period(event);
}
@@ -973,14 +1322,14 @@ static void intel_pmu_reset(void)
local_irq_save(flags);
- printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+ pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- checking_wrmsrl(x86_pmu_config_addr(idx), 0ull);
- checking_wrmsrl(x86_pmu_event_addr(idx), 0ull);
+ wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
+ wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
}
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
- checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+ wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
if (ds)
ds->bts_index = ds->bts_buffer_base;
@@ -1000,34 +1349,30 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
u64 status;
int handled;
- perf_sample_data_init(&data, 0);
-
cpuc = &__get_cpu_var(cpu_hw_events);
/*
- * Some chipsets need to unmask the LVTPC in a particular spot
- * inside the nmi handler. As a result, the unmasking was pushed
- * into all the nmi handlers.
- *
- * This handler doesn't seem to have any issues with the unmasking
- * so it was left at the top.
+ * No known reason to not always do late ACK,
+ * but just in case do it opt-in.
*/
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-
+ if (!x86_pmu.late_ack)
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
intel_pmu_disable_all();
handled = intel_pmu_drain_bts_buffer();
status = intel_pmu_get_status();
- if (!status) {
- intel_pmu_enable_all(0);
- return handled;
- }
+ if (!status)
+ goto done;
loops = 0;
again:
intel_pmu_ack_status(status);
if (++loops > 100) {
- WARN_ONCE(1, "perfevents: irq loop stuck!\n");
- perf_event_print_debug();
+ static bool warned = false;
+ if (!warned) {
+ WARN(1, "perfevents: irq loop stuck!\n");
+ perf_event_print_debug();
+ warned = true;
+ }
intel_pmu_reset();
goto done;
}
@@ -1037,6 +1382,15 @@ again:
intel_pmu_lbr_read();
/*
+ * CondChgd bit 63 doesn't mean any overflow status. Ignore
+ * and clear the bit.
+ */
+ if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+ if (!status)
+ goto done;
+ }
+
+ /*
* PEBS overflow sets bit 62 in the global status register
*/
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
@@ -1044,6 +1398,13 @@ again:
x86_pmu.drain_pebs(regs);
}
+ /*
+ * Checkpointed counters can lead to 'spurious' PMIs because the
+ * rollback caused by the PMI will have cleared the overflow status
+ * bit. Therefore always force probe these counters.
+ */
+ status |= cpuc->intel_cp_status;
+
for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
struct perf_event *event = cpuc->events[bit];
@@ -1055,7 +1416,10 @@ again:
if (!intel_pmu_save_and_restart(event))
continue;
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
+ if (has_branch_stack(event))
+ data.br_stack = &cpuc->lbr_stack;
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
@@ -1070,6 +1434,13 @@ again:
done:
intel_pmu_enable_all(0);
+ /*
+ * Only unmask the NMI after the overflow counters
+ * have been reset. This avoids spurious NMIs on
+ * Haswell CPUs.
+ */
+ if (x86_pmu.late_ack)
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
return handled;
}
@@ -1091,27 +1462,33 @@ intel_bts_constraints(struct perf_event *event)
return NULL;
}
-static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
+static int intel_alt_er(int idx)
{
if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
- return false;
+ return idx;
- if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
- event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
- event->hw.config |= 0x01bb;
- event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
- event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
- } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
+ if (idx == EXTRA_REG_RSP_0)
+ return EXTRA_REG_RSP_1;
+
+ if (idx == EXTRA_REG_RSP_1)
+ return EXTRA_REG_RSP_0;
+
+ return idx;
+}
+
+static void intel_fixup_er(struct perf_event *event, int idx)
+{
+ event->hw.extra_reg.idx = idx;
+
+ if (idx == EXTRA_REG_RSP_0) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
- event->hw.config |= 0x01b7;
- event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
+ event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+ } else if (idx == EXTRA_REG_RSP_1) {
+ event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+ event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
+ event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
}
-
- if (event->hw.extra_reg.idx == orig_idx)
- return false;
-
- return true;
}
/*
@@ -1123,20 +1500,24 @@ static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
*/
static struct event_constraint *
__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
+ struct perf_event *event,
+ struct hw_perf_event_extra *reg)
{
struct event_constraint *c = &emptyconstraint;
- struct hw_perf_event_extra *reg = &event->hw.extra_reg;
struct er_account *era;
unsigned long flags;
- int orig_idx = reg->idx;
+ int idx = reg->idx;
- /* already allocated shared msr */
- if (reg->alloc)
- return &unconstrained;
+ /*
+ * reg->alloc can be set due to existing state, so for fake cpuc we
+ * need to ignore this, otherwise we might fail to allocate proper fake
+ * state for this extra reg constraint. Also see the comment below.
+ */
+ if (reg->alloc && !cpuc->is_fake)
+ return NULL; /* call x86_get_event_constraint() */
again:
- era = &cpuc->shared_regs->regs[reg->idx];
+ era = &cpuc->shared_regs->regs[idx];
/*
* we use spin_lock_irqsave() to avoid lockdep issues when
* passing a fake cpuc
@@ -1145,6 +1526,29 @@ again:
if (!atomic_read(&era->ref) || era->config == reg->config) {
+ /*
+ * If its a fake cpuc -- as per validate_{group,event}() we
+ * shouldn't touch event state and we can avoid doing so
+ * since both will only call get_event_constraints() once
+ * on each event, this avoids the need for reg->alloc.
+ *
+ * Not doing the ER fixup will only result in era->reg being
+ * wrong, but since we won't actually try and program hardware
+ * this isn't a problem either.
+ */
+ if (!cpuc->is_fake) {
+ if (idx != reg->idx)
+ intel_fixup_er(event, idx);
+
+ /*
+ * x86_schedule_events() can call get_event_constraints()
+ * multiple times on events in the case of incremental
+ * scheduling(). reg->alloc ensures we only do the ER
+ * allocation once.
+ */
+ reg->alloc = 1;
+ }
+
/* lock in msr value */
era->config = reg->config;
era->reg = reg->reg;
@@ -1152,21 +1556,17 @@ again:
/* one more user */
atomic_inc(&era->ref);
- /* no need to reallocate during incremental event scheduling */
- reg->alloc = 1;
-
/*
- * All events using extra_reg are unconstrained.
- * Avoids calling x86_get_event_constraints()
- *
- * Must revisit if extra_reg controlling events
- * ever have constraints. Worst case we go through
- * the regular event constraint table.
+ * need to call x86_get_event_constraint()
+ * to check if associated event has constraints
*/
- c = &unconstrained;
- } else if (intel_try_alt_er(event, orig_idx)) {
- raw_spin_unlock_irqrestore(&era->lock, flags);
- goto again;
+ c = NULL;
+ } else {
+ idx = intel_alt_er(idx);
+ if (idx != reg->idx) {
+ raw_spin_unlock_irqrestore(&era->lock, flags);
+ goto again;
+ }
}
raw_spin_unlock_irqrestore(&era->lock, flags);
@@ -1180,11 +1580,14 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
struct er_account *era;
/*
- * only put constraint if extra reg was actually
- * allocated. Also takes care of event which do
- * not use an extra shared reg
+ * Only put constraint if extra reg was actually allocated. Also takes
+ * care of event which do not use an extra shared reg.
+ *
+ * Also, if this is a fake cpuc we shouldn't touch any event state
+ * (reg->alloc) and we don't care about leaving inconsistent cpuc state
+ * either since it'll be thrown out.
*/
- if (!reg->alloc)
+ if (!reg->alloc || cpuc->is_fake)
return;
era = &cpuc->shared_regs->regs[reg->idx];
@@ -1200,11 +1603,23 @@ static struct event_constraint *
intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
- struct event_constraint *c = NULL;
-
- if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
- c = __intel_shared_reg_get_constraints(cpuc, event);
-
+ struct event_constraint *c = NULL, *d;
+ struct hw_perf_event_extra *xreg, *breg;
+
+ xreg = &event->hw.extra_reg;
+ if (xreg->idx != EXTRA_REG_NONE) {
+ c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
+ if (c == &emptyconstraint)
+ return c;
+ }
+ breg = &event->hw.branch_reg;
+ if (breg->idx != EXTRA_REG_NONE) {
+ d = __intel_shared_reg_get_constraints(cpuc, event, breg);
+ if (d == &emptyconstraint) {
+ __intel_shared_reg_put_constraints(cpuc, xreg);
+ c = d;
+ }
+ }
return c;
}
@@ -1215,8 +1630,10 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
if (x86_pmu.event_constraints) {
for_each_event_constraint(c, x86_pmu.event_constraints) {
- if ((event->hw.config & c->cmask) == c->code)
+ if ((event->hw.config & c->cmask) == c->code) {
+ event->hw.flags |= c->flags;
return c;
+ }
}
}
@@ -1252,6 +1669,10 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
reg = &event->hw.extra_reg;
if (reg->idx != EXTRA_REG_NONE)
__intel_shared_reg_put_constraints(cpuc, reg);
+
+ reg = &event->hw.branch_reg;
+ if (reg->idx != EXTRA_REG_NONE)
+ __intel_shared_reg_put_constraints(cpuc, reg);
}
static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
@@ -1260,15 +1681,9 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
intel_put_shared_regs_event_constraints(cpuc, event);
}
-static int intel_pmu_hw_config(struct perf_event *event)
+static void intel_pebs_aliases_core2(struct perf_event *event)
{
- int ret = x86_pmu_hw_config(event);
-
- if (ret)
- return ret;
-
- if (event->attr.precise_ip &&
- (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
/*
* Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
* (0x003c) so that we can use it with PEBS.
@@ -1287,11 +1702,56 @@ static int intel_pmu_hw_config(struct perf_event *event)
*
* Thereby we gain a PEBS capable cycle counter.
*/
- u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
+ u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
+
+static void intel_pebs_aliases_snb(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use UOPS_RETIRED.ALL
+ * (0x01c2), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * UOPS_RETIRED.ALL counts the number of cycles that retires
+ * CNTMASK micro-ops. By setting CNTMASK to a value (16)
+ * larger than the maximum number of micro-ops that can be
+ * retired per cycle (4) and then inverting the condition, we
+ * count all cycles that retire 16 or less micro-ops, which
+ * is every cycle.
+ *
+ * Thereby we gain a PEBS capable cycle counter.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
event->hw.config = alt_config;
}
+}
+
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+ int ret = x86_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+
+ if (event->attr.precise_ip && x86_pmu.pebs_aliases)
+ x86_pmu.pebs_aliases(event);
+
+ if (intel_pmu_needs_lbr_smpl(event)) {
+ ret = intel_pmu_setup_lbr_filter(event);
+ if (ret)
+ return ret;
+ }
if (event->attr.type != PERF_TYPE_RAW)
return 0;
@@ -1327,8 +1787,16 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
+ /*
+ * If PMU counter has PEBS enabled it is not enough to disable counter
+ * on a guest entry since PEBS memory write can overshoot guest entry
+ * and corrupt guest memory. Disabling PEBS solves the problem.
+ */
+ arr[1].msr = MSR_IA32_PEBS_ENABLE;
+ arr[1].host = cpuc->pebs_enabled;
+ arr[1].guest = 0;
- *nr = 1;
+ *nr = 2;
return arr;
}
@@ -1382,6 +1850,88 @@ static void core_pmu_enable_all(int added)
}
}
+static int hsw_hw_config(struct perf_event *event)
+{
+ int ret = intel_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+ if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE))
+ return 0;
+ event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
+
+ /*
+ * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with
+ * PEBS or in ANY thread mode. Since the results are non-sensical forbid
+ * this combination.
+ */
+ if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) &&
+ ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) ||
+ event->attr.precise_ip > 0))
+ return -EOPNOTSUPP;
+
+ if (event_is_checkpointed(event)) {
+ /*
+ * Sampling of checkpointed events can cause situations where
+ * the CPU constantly aborts because of a overflow, which is
+ * then checkpointed back and ignored. Forbid checkpointing
+ * for sampling.
+ *
+ * But still allow a long sampling period, so that perf stat
+ * from KVM works.
+ */
+ if (event->attr.sample_period > 0 &&
+ event->attr.sample_period < 0x7fffffff)
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static struct event_constraint counter2_constraint =
+ EVENT_CONSTRAINT(0, 0x4, 0);
+
+static struct event_constraint *
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+ struct event_constraint *c = intel_get_event_constraints(cpuc, event);
+
+ /* Handle special quirk on in_tx_checkpointed only in counter 2 */
+ if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
+ if (c->idxmsk64 & (1U << 2))
+ return &counter2_constraint;
+ return &emptyconstraint;
+ }
+
+ return c;
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7" );
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(pc, "config:19" );
+PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+PMU_FORMAT_ATTR(in_tx, "config:32");
+PMU_FORMAT_ATTR(in_tx_cp, "config:33");
+
+static struct attribute *intel_arch_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_pc.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
+ssize_t intel_event_sysfs_show(char *page, u64 config)
+{
+ u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
+
+ return x86_event_sysfs_show(page, config, event);
+}
+
static __initconst const struct x86_pmu core_pmu = {
.name = "core",
.handle_irq = x86_pmu_handle_irq,
@@ -1406,6 +1956,8 @@ static __initconst const struct x86_pmu core_pmu = {
.put_event_constraints = intel_put_event_constraints,
.event_constraints = intel_core_event_constraints,
.guest_get_msrs = core_guest_get_msrs,
+ .format_attrs = intel_arch_formats_attr,
+ .events_sysfs_show = intel_event_sysfs_show,
};
struct intel_shared_regs *allocate_shared_regs(int cpu)
@@ -1431,7 +1983,7 @@ static int intel_pmu_cpu_prepare(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- if (!x86_pmu.extra_regs)
+ if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
return NOTIFY_OK;
cpuc->shared_regs = allocate_shared_regs(cpu);
@@ -1453,22 +2005,28 @@ static void intel_pmu_cpu_starting(int cpu)
*/
intel_pmu_lbr_reset();
- if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
+ cpuc->lbr_sel = NULL;
+
+ if (!cpuc->shared_regs)
return;
- for_each_cpu(i, topology_thread_cpumask(cpu)) {
- struct intel_shared_regs *pc;
+ if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
+ for_each_cpu(i, topology_thread_cpumask(cpu)) {
+ struct intel_shared_regs *pc;
- pc = per_cpu(cpu_hw_events, i).shared_regs;
- if (pc && pc->core_id == core_id) {
- cpuc->kfree_on_online = cpuc->shared_regs;
- cpuc->shared_regs = pc;
- break;
+ pc = per_cpu(cpu_hw_events, i).shared_regs;
+ if (pc && pc->core_id == core_id) {
+ cpuc->kfree_on_online = cpuc->shared_regs;
+ cpuc->shared_regs = pc;
+ break;
+ }
}
+ cpuc->shared_regs->core_id = core_id;
+ cpuc->shared_regs->refcnt++;
}
- cpuc->shared_regs->core_id = core_id;
- cpuc->shared_regs->refcnt++;
+ if (x86_pmu.lbr_sel_map)
+ cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
}
static void intel_pmu_cpu_dying(int cpu)
@@ -1486,6 +2044,38 @@ static void intel_pmu_cpu_dying(int cpu)
fini_debug_store_on_cpu(cpu);
}
+static void intel_pmu_flush_branch_stack(void)
+{
+ /*
+ * Intel LBR does not tag entries with the
+ * PID of the current task, then we need to
+ * flush it on ctxsw
+ * For now, we simply reset it
+ */
+ if (x86_pmu.lbr_nr)
+ intel_pmu_lbr_reset();
+}
+
+PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
+
+PMU_FORMAT_ATTR(ldlat, "config1:0-15");
+
+static struct attribute *intel_arch3_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_pc.attr,
+ &format_attr_any.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ &format_attr_in_tx.attr,
+ &format_attr_in_tx_cp.attr,
+
+ &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
+ &format_attr_ldlat.attr, /* PEBS load latency */
+ NULL,
+};
+
static __initconst const struct x86_pmu intel_pmu = {
.name = "Intel",
.handle_irq = intel_pmu_handle_irq,
@@ -1508,11 +2098,16 @@ static __initconst const struct x86_pmu intel_pmu = {
.max_period = (1ULL << 31) - 1,
.get_event_constraints = intel_get_event_constraints,
.put_event_constraints = intel_put_event_constraints,
+ .pebs_aliases = intel_pebs_aliases_core2,
+
+ .format_attrs = intel_arch3_formats_attr,
+ .events_sysfs_show = intel_event_sysfs_show,
.cpu_prepare = intel_pmu_cpu_prepare,
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.guest_get_msrs = intel_guest_get_msrs,
+ .flush_branch_stack = intel_pmu_flush_branch_stack,
};
static __init void intel_clovertown_quirk(void)
@@ -1536,16 +2131,96 @@ static __init void intel_clovertown_quirk(void)
* But taken together it might just make sense to not enable PEBS on
* these chips.
*/
- printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
+ pr_warn("PEBS disabled due to CPU errata\n");
x86_pmu.pebs = 0;
x86_pmu.pebs_constraints = NULL;
}
+static int intel_snb_pebs_broken(int cpu)
+{
+ u32 rev = UINT_MAX; /* default to broken for unknown models */
+
+ switch (cpu_data(cpu).x86_model) {
+ case 42: /* SNB */
+ rev = 0x28;
+ break;
+
+ case 45: /* SNB-EP */
+ switch (cpu_data(cpu).x86_mask) {
+ case 6: rev = 0x618; break;
+ case 7: rev = 0x70c; break;
+ }
+ }
+
+ return (cpu_data(cpu).microcode < rev);
+}
+
+static void intel_snb_check_microcode(void)
+{
+ int pebs_broken = 0;
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ if ((pebs_broken = intel_snb_pebs_broken(cpu)))
+ break;
+ }
+ put_online_cpus();
+
+ if (pebs_broken == x86_pmu.pebs_broken)
+ return;
+
+ /*
+ * Serialized by the microcode lock..
+ */
+ if (x86_pmu.pebs_broken) {
+ pr_info("PEBS enabled due to microcode update\n");
+ x86_pmu.pebs_broken = 0;
+ } else {
+ pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
+ x86_pmu.pebs_broken = 1;
+ }
+}
+
+/*
+ * Under certain circumstances, access certain MSR may cause #GP.
+ * The function tests if the input MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+ u64 val_old, val_new, val_tmp;
+
+ /*
+ * Read the current value, change it and read it back to see if it
+ * matches, this is needed to detect certain hardware emulators
+ * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+ */
+ if (rdmsrl_safe(msr, &val_old))
+ return false;
+
+ /*
+ * Only change the bits which can be updated by wrmsrl.
+ */
+ val_tmp = val_old ^ mask;
+ if (wrmsrl_safe(msr, val_tmp) ||
+ rdmsrl_safe(msr, &val_new))
+ return false;
+
+ if (val_new != val_tmp)
+ return false;
+
+ /* Here it's sure that the MSR can be safely accessed.
+ * Restore the old value and return.
+ */
+ wrmsrl(msr, val_old);
+
+ return true;
+}
+
static __init void intel_sandybridge_quirk(void)
{
- printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
- x86_pmu.pebs = 0;
- x86_pmu.pebs_constraints = NULL;
+ x86_pmu.check_microcode = intel_snb_check_microcode;
+ intel_snb_check_microcode();
}
static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
@@ -1565,8 +2240,8 @@ static __init void intel_arch_events_quirk(void)
/* disable event that reported as not presend by cpuid */
for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
- printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n",
- intel_arch_events_map[bit].name);
+ pr_warn("CPUID marked event: \'%s\' unavailable\n",
+ intel_arch_events_map[bit].name);
}
}
@@ -1585,22 +2260,61 @@ static __init void intel_nehalem_quirk(void)
intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
ebx.split.no_branch_misses_retired = 0;
x86_pmu.events_maskl = ebx.full;
- printk(KERN_INFO "CPU erratum AAJ80 worked around\n");
+ pr_info("CPU erratum AAJ80 worked around\n");
}
}
+EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")
+
+/* Haswell special events */
+EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1");
+EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2");
+EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4");
+EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2");
+EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1");
+EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1");
+EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2");
+EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4");
+EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2");
+EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1");
+EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1");
+EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1");
+
+static struct attribute *hsw_events_attrs[] = {
+ EVENT_PTR(tx_start),
+ EVENT_PTR(tx_commit),
+ EVENT_PTR(tx_abort),
+ EVENT_PTR(tx_capacity),
+ EVENT_PTR(tx_conflict),
+ EVENT_PTR(el_start),
+ EVENT_PTR(el_commit),
+ EVENT_PTR(el_abort),
+ EVENT_PTR(el_capacity),
+ EVENT_PTR(el_conflict),
+ EVENT_PTR(cycles_t),
+ EVENT_PTR(cycles_ct),
+ EVENT_PTR(mem_ld_hsw),
+ EVENT_PTR(mem_st_hsw),
+ NULL
+};
+
__init int intel_pmu_init(void)
{
union cpuid10_edx edx;
union cpuid10_eax eax;
union cpuid10_ebx ebx;
+ struct event_constraint *c;
unsigned int unused;
- int version;
+ struct extra_reg *er;
+ int version, i;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
switch (boot_cpu_data.x86) {
case 0x6:
return p6_pmu_init();
+ case 0xb:
+ return knc_pmu_init();
case 0xf:
return p4_pmu_init();
}
@@ -1629,6 +2343,8 @@ __init int intel_pmu_init(void)
x86_pmu.events_maskl = ebx.full;
x86_pmu.events_mask_len = eax.split.mask_length;
+ x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
+
/*
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events:
@@ -1636,10 +2352,7 @@ __init int intel_pmu_init(void)
if (version > 1)
x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
- /*
- * v2 and above have a perf capabilities MSR
- */
- if (version > 1) {
+ if (boot_cpu_has(X86_FEATURE_PDCM)) {
u64 capabilities;
rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
@@ -1688,10 +2401,14 @@ __init int intel_pmu_init(void)
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
x86_pmu.extra_regs = intel_nehalem_extra_regs;
+ x86_pmu.cpu_events = nhm_events_attrs;
+
/* UOPS_ISSUED.STALLED_CYCLES */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
x86_add_quirk(intel_nehalem_quirk);
@@ -1699,6 +2416,10 @@ __init int intel_pmu_init(void)
break;
case 28: /* Atom */
+ case 38: /* Lincroft */
+ case 39: /* Penwell */
+ case 53: /* Cloverview */
+ case 54: /* Cedarview */
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -1709,6 +2430,22 @@ __init int intel_pmu_init(void)
pr_cont("Atom events, ");
break;
+ case 55: /* Atom 22nm "Silvermont" */
+ case 77: /* Avoton "Silvermont" */
+ memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_atom();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_slm_extra_regs;
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ pr_cont("Silvermont events, ");
+ break;
+
case 37: /* 32 nm nehalem, "Clarkdale" */
case 44: /* 32 nm nehalem, "Gulftown" */
case 47: /* 32 nm Xeon E7 */
@@ -1725,36 +2462,108 @@ __init int intel_pmu_init(void)
x86_pmu.extra_regs = intel_westmere_extra_regs;
x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.cpu_events = nhm_events_attrs;
+
/* UOPS_ISSUED.STALLED_CYCLES */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
pr_cont("Westmere events, ");
break;
case 42: /* SandyBridge */
- x86_add_quirk(intel_sandybridge_quirk);
case 45: /* SandyBridge, "Romely-EP" */
+ x86_add_quirk(intel_sandybridge_quirk);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
- intel_pmu_lbr_init_nhm();
+ intel_pmu_lbr_init_snb();
x86_pmu.event_constraints = intel_snb_event_constraints;
x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
- x86_pmu.extra_regs = intel_snb_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ if (boot_cpu_data.x86_model == 45)
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ else
+ x86_pmu.extra_regs = intel_snb_extra_regs;
/* all extra regs are per-cpu when HT is on */
x86_pmu.er_flags |= ERF_HAS_RSP_1;
x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+ x86_pmu.cpu_events = snb_events_attrs;
+
/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
/* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
pr_cont("SandyBridge events, ");
break;
+ case 58: /* IvyBridge */
+ case 62: /* IvyBridge EP */
+ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ /* dTLB-load-misses on IVB is different than SNB */
+ hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
+ memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_snb();
+
+ x86_pmu.event_constraints = intel_ivb_event_constraints;
+ x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ if (boot_cpu_data.x86_model == 62)
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ else
+ x86_pmu.extra_regs = intel_snb_extra_regs;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+ x86_pmu.cpu_events = snb_events_attrs;
+
+ /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+
+ pr_cont("IvyBridge events, ");
+ break;
+
+
+ case 60: /* Haswell Client */
+ case 70:
+ case 71:
+ case 63:
+ case 69:
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_snb();
+
+ x86_pmu.event_constraints = intel_hsw_event_constraints;
+ x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_snb_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.cpu_events = hsw_events_attrs;
+ x86_pmu.lbr_double_abort = true;
+ pr_cont("Haswell events, ");
+ break;
default:
switch (x86_pmu.version) {
@@ -1772,5 +2581,72 @@ __init int intel_pmu_init(void)
}
}
+ if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
+ WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+ x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
+ x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
+ }
+ x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+
+ if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
+ WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+ x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
+ x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
+ }
+
+ x86_pmu.intel_ctrl |=
+ ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+
+ if (x86_pmu.event_constraints) {
+ /*
+ * event on fixed counter2 (REF_CYCLES) only works on this
+ * counter, so do not extend mask to generic counters
+ */
+ for_each_event_constraint(c, x86_pmu.event_constraints) {
+ if (c->cmask != FIXED_EVENT_FLAGS
+ || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) {
+ continue;
+ }
+
+ c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+ c->weight += x86_pmu.num_counters;
+ }
+ }
+
+ /*
+ * Access LBR MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support LBR MSR
+ * Check all LBT MSR here.
+ * Disable LBR access if any LBR MSRs can not be accessed.
+ */
+ if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+ x86_pmu.lbr_nr = 0;
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+ check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+ x86_pmu.lbr_nr = 0;
+ }
+
+ /*
+ * Access extra MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support offcore event
+ * Check all extra_regs here.
+ */
+ if (x86_pmu.extra_regs) {
+ for (er = x86_pmu.extra_regs; er->msr; er++) {
+ er->extra_msr_access = check_msr(er->msr, 0x1ffUL);
+ /* Disable LBR select mapping */
+ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+ x86_pmu.lbr_sel_map = NULL;
+ }
+ }
+
+ /* Support full width counters using alternative MSR range */
+ if (x86_pmu.intel_cap.full_width_write) {
+ x86_pmu.max_period = x86_pmu.cntval_mask;
+ x86_pmu.perfctr = MSR_IA32_PMC0;
+ pr_cont("full-width counters, ");
+ }
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index d6bd49faa40..696ade311de 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -3,6 +3,7 @@
#include <linux/slab.h>
#include <asm/perf_event.h>
+#include <asm/insn.h>
#include "perf_event.h"
@@ -11,6 +12,7 @@
#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_BUFFER_SIZE PAGE_SIZE
+#define PEBS_FIXUP_SIZE PAGE_SIZE
/*
* pebs_record_32 for p4 and core not supported
@@ -23,6 +25,159 @@ struct pebs_record_32 {
*/
+union intel_x86_pebs_dse {
+ u64 val;
+ struct {
+ unsigned int ld_dse:4;
+ unsigned int ld_stlb_miss:1;
+ unsigned int ld_locked:1;
+ unsigned int ld_reserved:26;
+ };
+ struct {
+ unsigned int st_l1d_hit:1;
+ unsigned int st_reserved1:3;
+ unsigned int st_stlb_miss:1;
+ unsigned int st_locked:1;
+ unsigned int st_reserved2:26;
+ };
+};
+
+
+/*
+ * Map PEBS Load Latency Data Source encodings to generic
+ * memory data source information
+ */
+#define P(a, b) PERF_MEM_S(a, b)
+#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
+#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
+
+static const u64 pebs_data_source[] = {
+ P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
+ OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */
+ OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
+ OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */
+ OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */
+ OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */
+ OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */
+ OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */
+ OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */
+ OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
+ OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */
+ OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */
+ OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
+ OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
+ OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */
+ OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
+};
+
+static u64 precise_store_data(u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2);
+
+ dse.val = status;
+
+ /*
+ * bit 4: TLB access
+ * 1 = stored missed 2nd level TLB
+ *
+ * so it either hit the walker or the OS
+ * otherwise hit 2nd level TLB
+ */
+ if (dse.st_stlb_miss)
+ val |= P(TLB, MISS);
+ else
+ val |= P(TLB, HIT);
+
+ /*
+ * bit 0: hit L1 data cache
+ * if not set, then all we know is that
+ * it missed L1D
+ */
+ if (dse.st_l1d_hit)
+ val |= P(LVL, HIT);
+ else
+ val |= P(LVL, MISS);
+
+ /*
+ * bit 5: Locked prefix
+ */
+ if (dse.st_locked)
+ val |= P(LOCK, LOCKED);
+
+ return val;
+}
+
+static u64 precise_store_data_hsw(struct perf_event *event, u64 status)
+{
+ union perf_mem_data_src dse;
+ u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK;
+
+ dse.val = 0;
+ dse.mem_op = PERF_MEM_OP_STORE;
+ dse.mem_lvl = PERF_MEM_LVL_NA;
+
+ /*
+ * L1 info only valid for following events:
+ *
+ * MEM_UOPS_RETIRED.STLB_MISS_STORES
+ * MEM_UOPS_RETIRED.LOCK_STORES
+ * MEM_UOPS_RETIRED.SPLIT_STORES
+ * MEM_UOPS_RETIRED.ALL_STORES
+ */
+ if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0)
+ return dse.mem_lvl;
+
+ if (status & 1)
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+ else
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
+
+ /* Nothing else supported. Sorry. */
+ return dse.val;
+}
+
+static u64 load_latency_data(u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ u64 val;
+ int model = boot_cpu_data.x86_model;
+ int fam = boot_cpu_data.x86;
+
+ dse.val = status;
+
+ /*
+ * use the mapping table for bit 0-3
+ */
+ val = pebs_data_source[dse.ld_dse];
+
+ /*
+ * Nehalem models do not support TLB, Lock infos
+ */
+ if (fam == 0x6 && (model == 26 || model == 30
+ || model == 31 || model == 46)) {
+ val |= P(TLB, NA) | P(LOCK, NA);
+ return val;
+ }
+ /*
+ * bit 4: TLB access
+ * 0 = did not miss 2nd level TLB
+ * 1 = missed 2nd level TLB
+ */
+ if (dse.ld_stlb_miss)
+ val |= P(TLB, MISS) | P(TLB, L2);
+ else
+ val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+ /*
+ * bit 5: locked prefix
+ */
+ if (dse.ld_locked)
+ val |= P(LOCK, LOCKED);
+
+ return val;
+}
+
struct pebs_record_core {
u64 flags, ip;
u64 ax, bx, cx, dx;
@@ -40,6 +195,36 @@ struct pebs_record_nhm {
u64 status, dla, dse, lat;
};
+/*
+ * Same as pebs_record_nhm, with two additional fields.
+ */
+struct pebs_record_hsw {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+ u64 real_ip, tsx_tuning;
+};
+
+union hsw_tsx_tuning {
+ struct {
+ u32 cycles_last_block : 32,
+ hle_abort : 1,
+ rtm_abort : 1,
+ instruction_abort : 1,
+ non_instruction_abort : 1,
+ retry : 1,
+ data_conflict : 1,
+ capacity_writes : 1,
+ capacity_reads : 1;
+ };
+ u64 value;
+};
+
+#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL
+
void init_debug_store_on_cpu(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -60,20 +245,35 @@ void fini_debug_store_on_cpu(int cpu)
wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
}
+static DEFINE_PER_CPU(void *, insn_buffer);
+
static int alloc_pebs_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
int node = cpu_to_node(cpu);
int max, thresh = 1; /* always use a single PEBS record */
- void *buffer;
+ void *buffer, *ibuffer;
if (!x86_pmu.pebs)
return 0;
- buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+ buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
if (unlikely(!buffer))
return -ENOMEM;
+ /*
+ * HSW+ already provides us the eventing ip; no need to allocate this
+ * buffer then.
+ */
+ if (x86_pmu.intel_cap.pebs_format < 2) {
+ ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+ if (!ibuffer) {
+ kfree(buffer);
+ return -ENOMEM;
+ }
+ per_cpu(insn_buffer, cpu) = ibuffer;
+ }
+
max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
ds->pebs_buffer_base = (u64)(unsigned long)buffer;
@@ -94,6 +294,9 @@ static void release_pebs_buffer(int cpu)
if (!ds || !x86_pmu.pebs)
return;
+ kfree(per_cpu(insn_buffer, cpu));
+ per_cpu(insn_buffer, cpu) = NULL;
+
kfree((void *)(unsigned long)ds->pebs_buffer_base);
ds->pebs_buffer_base = 0;
}
@@ -108,9 +311,11 @@ static int alloc_bts_buffer(int cpu)
if (!x86_pmu.bts)
return 0;
- buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
- if (unlikely(!buffer))
+ buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+ if (unlikely(!buffer)) {
+ WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM;
+ }
max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
thresh = max / 16;
@@ -141,7 +346,7 @@ static int alloc_ds_buffer(int cpu)
int node = cpu_to_node(cpu);
struct debug_store *ds;
- ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
+ ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
if (unlikely(!ds))
return -ENOMEM;
@@ -247,7 +452,7 @@ void reserve_ds_buffers(void)
*/
struct event_constraint bts_constraint =
- EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
+ EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
void intel_pmu_enable_bts(u64 config)
{
@@ -294,7 +499,7 @@ int intel_pmu_drain_bts_buffer(void)
u64 to;
u64 flags;
};
- struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
+ struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
struct bts_record *at, *top;
struct perf_output_handle handle;
struct perf_event_header header;
@@ -313,11 +518,11 @@ int intel_pmu_drain_bts_buffer(void)
if (top <= at)
return 0;
+ memset(&regs, 0, sizeof(regs));
+
ds->bts_index = ds->bts_buffer_base;
- perf_sample_data_init(&data, 0);
- data.period = event->hw.last_period;
- regs.ip = 0;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
/*
* Prepare a generic sample, i.e. fill in the invariant fields.
@@ -363,8 +568,34 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_slm_pebs_event_constraints[] = {
+ INTEL_UEVENT_CONSTRAINT(0x0103, 0x1), /* REHABQ.LD_BLOCK_ST_FORWARD_PS */
+ INTEL_UEVENT_CONSTRAINT(0x0803, 0x1), /* REHABQ.LD_SPLITS_PS */
+ INTEL_UEVENT_CONSTRAINT(0x0204, 0x1), /* MEM_UOPS_RETIRED.L2_HIT_LOADS_PS */
+ INTEL_UEVENT_CONSTRAINT(0x0404, 0x1), /* MEM_UOPS_RETIRED.L2_MISS_LOADS_PS */
+ INTEL_UEVENT_CONSTRAINT(0x0804, 0x1), /* MEM_UOPS_RETIRED.DTLB_MISS_LOADS_PS */
+ INTEL_UEVENT_CONSTRAINT(0x2004, 0x1), /* MEM_UOPS_RETIRED.HITM_PS */
+ INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY_PS */
+ INTEL_UEVENT_CONSTRAINT(0x00c4, 0x1), /* BR_INST_RETIRED.ALL_BRANCHES_PS */
+ INTEL_UEVENT_CONSTRAINT(0x7ec4, 0x1), /* BR_INST_RETIRED.JCC_PS */
+ INTEL_UEVENT_CONSTRAINT(0xbfc4, 0x1), /* BR_INST_RETIRED.FAR_BRANCH_PS */
+ INTEL_UEVENT_CONSTRAINT(0xebc4, 0x1), /* BR_INST_RETIRED.NON_RETURN_IND_PS */
+ INTEL_UEVENT_CONSTRAINT(0xf7c4, 0x1), /* BR_INST_RETIRED.RETURN_PS */
+ INTEL_UEVENT_CONSTRAINT(0xf9c4, 0x1), /* BR_INST_RETIRED.CALL_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfbc4, 0x1), /* BR_INST_RETIRED.IND_CALL_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfdc4, 0x1), /* BR_INST_RETIRED.REL_CALL_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfec4, 0x1), /* BR_INST_RETIRED.TAKEN_JCC_PS */
+ INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_MISP_RETIRED.ALL_BRANCHES_PS */
+ INTEL_UEVENT_CONSTRAINT(0x7ec5, 0x1), /* BR_INST_MISP_RETIRED.JCC_PS */
+ INTEL_UEVENT_CONSTRAINT(0xebc5, 0x1), /* BR_INST_MISP_RETIRED.NON_RETURN_IND_PS */
+ INTEL_UEVENT_CONSTRAINT(0xf7c5, 0x1), /* BR_INST_MISP_RETIRED.RETURN_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfbc5, 0x1), /* BR_INST_MISP_RETIRED.IND_CALL_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfec5, 0x1), /* BR_INST_MISP_RETIRED.TAKEN_JCC_PS */
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint intel_nehalem_pebs_event_constraints[] = {
- INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
+ INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */
@@ -379,7 +610,7 @@ struct event_constraint intel_nehalem_pebs_event_constraints[] = {
};
struct event_constraint intel_westmere_pebs_event_constraints[] = {
- INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
+ INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */
@@ -399,21 +630,67 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
- INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
- INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
- INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
+ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
+ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
+ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_ivb_pebs_event_constraints[] = {
+ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+ INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
+ INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
+ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
+ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
+ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_hsw_pebs_event_constraints[] = {
+ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_PST_HSW_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+ INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
+ INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
+ INTEL_UEVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */
+ INTEL_UEVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */
+ INTEL_UEVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.NEAR_TAKEN */
+ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.* */
+ /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+ INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf),
+ /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+ INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+ INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+ /* MEM_UOPS_RETIRED.SPLIT_STORES */
+ INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+ INTEL_PST_HSW_CONSTRAINT(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+ INTEL_UEVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */
+ INTEL_UEVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */
+ INTEL_UEVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L3_HIT */
+ /* MEM_LOAD_UOPS_RETIRED.HIT_LFB */
+ INTEL_UEVENT_CONSTRAINT(0x40d1, 0xf),
+ /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */
+ INTEL_UEVENT_CONSTRAINT(0x01d2, 0xf),
+ /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */
+ INTEL_UEVENT_CONSTRAINT(0x02d2, 0xf),
+ /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM */
+ INTEL_UEVENT_CONSTRAINT(0x01d3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x04c8, 0xf), /* HLE_RETIRED.Abort */
+ INTEL_UEVENT_CONSTRAINT(0x04c9, 0xf), /* RTM_RETIRED.Abort */
+
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event)
{
struct event_constraint *c;
@@ -423,8 +700,10 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
if (x86_pmu.pebs_constraints) {
for_each_event_constraint(c, x86_pmu.pebs_constraints) {
- if ((event->hw.config & c->cmask) == c->code)
+ if ((event->hw.config & c->cmask) == c->code) {
+ event->hw.flags |= c->flags;
return c;
+ }
}
}
@@ -440,8 +719,10 @@ void intel_pmu_pebs_enable(struct perf_event *event)
cpuc->pebs_enabled |= 1ULL << hwc->idx;
- if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
- intel_pmu_lbr_enable(event);
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
+ cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
+ else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
+ cpuc->pebs_enabled |= 1ULL << 63;
}
void intel_pmu_pebs_disable(struct perf_event *event)
@@ -450,13 +731,16 @@ void intel_pmu_pebs_disable(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
+
+ if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT)
+ cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
+ else if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_ST)
+ cpuc->pebs_enabled &= ~(1ULL << 63);
+
if (cpuc->enabled)
wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
-
- if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
- intel_pmu_lbr_disable(event);
}
void intel_pmu_pebs_enable_all(void)
@@ -475,17 +759,6 @@ void intel_pmu_pebs_disable_all(void)
wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
-#include <asm/insn.h>
-
-static inline bool kernel_ip(unsigned long ip)
-{
-#ifdef CONFIG_X86_32
- return ip > PAGE_OFFSET;
-#else
- return (long)ip < 0;
-#endif
-}
-
static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -493,6 +766,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
unsigned long old_to, to = cpuc->lbr_entries[0].to;
unsigned long ip = regs->ip;
int is_64bit = 0;
+ void *kaddr;
/*
* We don't need to fixup if the PEBS assist is fault like
@@ -516,44 +790,48 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
* unsigned math, either ip is before the start (impossible) or
* the basic block is larger than 1 page (sanity)
*/
- if ((ip - to) > PAGE_SIZE)
+ if ((ip - to) > PEBS_FIXUP_SIZE)
return 0;
/*
* We sampled a branch insn, rewind using the LBR stack
*/
if (ip == to) {
- regs->ip = from;
+ set_linear_ip(regs, from);
return 1;
}
+ if (!kernel_ip(ip)) {
+ int size, bytes;
+ u8 *buf = this_cpu_read(insn_buffer);
+
+ size = ip - to; /* Must fit our buffer, see above */
+ bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+ if (bytes != 0)
+ return 0;
+
+ kaddr = buf;
+ } else {
+ kaddr = (void *)to;
+ }
+
do {
struct insn insn;
- u8 buf[MAX_INSN_SIZE];
- void *kaddr;
old_to = to;
- if (!kernel_ip(ip)) {
- int bytes, size = MAX_INSN_SIZE;
-
- bytes = copy_from_user_nmi(buf, (void __user *)to, size);
- if (bytes != size)
- return 0;
-
- kaddr = buf;
- } else
- kaddr = (void *)to;
#ifdef CONFIG_X86_64
is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
#endif
insn_init(&insn, kaddr, is_64bit);
insn_get_length(&insn);
+
to += insn.length;
+ kaddr += insn.length;
} while (to < ip);
if (to == ip) {
- regs->ip = old_to;
+ set_linear_ip(regs, old_to);
return 1;
}
@@ -564,23 +842,74 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
return 0;
}
+static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
+{
+ if (pebs->tsx_tuning) {
+ union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
+ return tsx.cycles_last_block;
+ }
+ return 0;
+}
+
+static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
+{
+ u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
+
+ /* For RTM XABORTs also log the abort code from AX */
+ if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1))
+ txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
+ return txn;
+}
+
static void __intel_pmu_pebs_event(struct perf_event *event,
struct pt_regs *iregs, void *__pebs)
{
/*
- * We cast to pebs_record_core since that is a subset of
- * both formats and we don't use the other fields in this
- * routine.
+ * We cast to the biggest pebs_record but are careful not to
+ * unconditionally access the 'extra' entries.
*/
- struct pebs_record_core *pebs = __pebs;
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct pebs_record_hsw *pebs = __pebs;
struct perf_sample_data data;
struct pt_regs regs;
+ u64 sample_type;
+ int fll, fst;
if (!intel_pmu_save_and_restart(event))
return;
- perf_sample_data_init(&data, 0);
+ fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT;
+ fst = event->hw.flags & (PERF_X86_EVENT_PEBS_ST |
+ PERF_X86_EVENT_PEBS_ST_HSW);
+
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
data.period = event->hw.last_period;
+ sample_type = event->attr.sample_type;
+
+ /*
+ * if PEBS-LL or PreciseStore
+ */
+ if (fll || fst) {
+ /*
+ * Use latency for weight (only avail with PEBS-LL)
+ */
+ if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
+ data.weight = pebs->lat;
+
+ /*
+ * data.data_src encodes the data source
+ */
+ if (sample_type & PERF_SAMPLE_DATA_SRC) {
+ if (fll)
+ data.data_src.val = load_latency_data(pebs->dse);
+ else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+ data.data_src.val =
+ precise_store_data_hsw(event, pebs->dse);
+ else
+ data.data_src.val = precise_store_data(pebs->dse);
+ }
+ }
/*
* We use the interrupt regs as a base because the PEBS record
@@ -593,15 +922,35 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
* A possible PERF_SAMPLE_REGS will have to transfer all regs.
*/
regs = *iregs;
- regs.ip = pebs->ip;
+ regs.flags = pebs->flags;
+ set_linear_ip(&regs, pebs->ip);
regs.bp = pebs->bp;
regs.sp = pebs->sp;
- if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
+ if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
+ regs.ip = pebs->real_ip;
+ regs.flags |= PERF_EFLAGS_EXACT;
+ } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
regs.flags |= PERF_EFLAGS_EXACT;
else
regs.flags &= ~PERF_EFLAGS_EXACT;
+ if ((event->attr.sample_type & PERF_SAMPLE_ADDR) &&
+ x86_pmu.intel_cap.pebs_format >= 1)
+ data.addr = pebs->dla;
+
+ if (x86_pmu.intel_cap.pebs_format >= 2) {
+ /* Only set the TSX weight when no memory weight. */
+ if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) && !fll)
+ data.weight = intel_hsw_weight(pebs);
+
+ if (event->attr.sample_type & PERF_SAMPLE_TRANSACTION)
+ data.txn = intel_hsw_transaction(pebs);
+ }
+
+ if (has_branch_stack(event))
+ data.br_stack = &cpuc->lbr_stack;
+
if (perf_event_overflow(event, &data, &regs))
x86_pmu_stop(event, 0);
}
@@ -641,7 +990,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
* Should not happen, we program the threshold at 1 and do not
* set a reset value.
*/
- WARN_ON_ONCE(n > 1);
+ WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
at += n - 1;
__intel_pmu_pebs_event(event, iregs, at);
@@ -651,10 +1000,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct debug_store *ds = cpuc->ds;
- struct pebs_record_nhm *at, *top;
struct perf_event *event = NULL;
+ void *at, *top;
u64 status = 0;
- int bit, n;
+ int bit;
if (!x86_pmu.pebs_active)
return;
@@ -664,18 +1013,22 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
ds->pebs_index = ds->pebs_buffer_base;
- n = top - at;
- if (n <= 0)
+ if (unlikely(at > top))
return;
/*
* Should not happen, we program the threshold at 1 and do not
* set a reset value.
*/
- WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
+ WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size,
+ "Unexpected number of pebs records %ld\n",
+ (long)(top - at) / x86_pmu.pebs_record_size);
- for ( ; at < top; at++) {
- for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
+ for (; at < top; at += x86_pmu.pebs_record_size) {
+ struct pebs_record_nhm *p = at;
+
+ for_each_set_bit(bit, (unsigned long *)&p->status,
+ x86_pmu.max_pebs_events) {
event = cpuc->events[bit];
if (!test_bit(bit, cpuc->active_mask))
continue;
@@ -691,7 +1044,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
break;
}
- if (!event || bit >= MAX_PEBS_EVENTS)
+ if (!event || bit >= x86_pmu.max_pebs_events)
continue;
__intel_pmu_pebs_event(event, iregs, at);
@@ -729,9 +1082,25 @@ void intel_ds_init(void)
x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
break;
+ case 2:
+ pr_cont("PEBS fmt2%c, ", pebs_type);
+ x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+ break;
+
default:
printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
x86_pmu.pebs = 0;
}
}
}
+
+void perf_restore_debug_store(void)
+{
+ struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+
+ if (!x86_pmu.bts && !x86_pmu.pebs)
+ return;
+
+ wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds);
+}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 47a7e63bfe5..9dd2459a4c7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -3,6 +3,7 @@
#include <asm/perf_event.h>
#include <asm/msr.h>
+#include <asm/insn.h>
#include "perf_event.h"
@@ -11,8 +12,119 @@ enum {
LBR_FORMAT_LIP = 0x01,
LBR_FORMAT_EIP = 0x02,
LBR_FORMAT_EIP_FLAGS = 0x03,
+ LBR_FORMAT_EIP_FLAGS2 = 0x04,
+ LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2,
};
+static enum {
+ LBR_EIP_FLAGS = 1,
+ LBR_TSX = 2,
+} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
+ [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS,
+ [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
+};
+
+/*
+ * Intel LBR_SELECT bits
+ * Intel Vol3a, April 2011, Section 16.7 Table 16-10
+ *
+ * Hardware branch filter (not available on all CPUs)
+ */
+#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */
+#define LBR_USER_BIT 1 /* do not capture at ring > 0 */
+#define LBR_JCC_BIT 2 /* do not capture conditional branches */
+#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */
+#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */
+#define LBR_RETURN_BIT 5 /* do not capture near returns */
+#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
+#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
+#define LBR_FAR_BIT 8 /* do not capture far branches */
+
+#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
+#define LBR_USER (1 << LBR_USER_BIT)
+#define LBR_JCC (1 << LBR_JCC_BIT)
+#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT)
+#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT)
+#define LBR_RETURN (1 << LBR_RETURN_BIT)
+#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
+#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
+#define LBR_FAR (1 << LBR_FAR_BIT)
+
+#define LBR_PLM (LBR_KERNEL | LBR_USER)
+
+#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */
+#define LBR_NOT_SUPP -1 /* LBR filter not supported */
+#define LBR_IGN 0 /* ignored */
+
+#define LBR_ANY \
+ (LBR_JCC |\
+ LBR_REL_CALL |\
+ LBR_IND_CALL |\
+ LBR_RETURN |\
+ LBR_REL_JMP |\
+ LBR_IND_JMP |\
+ LBR_FAR)
+
+#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
+#define LBR_FROM_FLAG_IN_TX (1ULL << 62)
+#define LBR_FROM_FLAG_ABORT (1ULL << 61)
+
+#define for_each_branch_sample_type(x) \
+ for ((x) = PERF_SAMPLE_BRANCH_USER; \
+ (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
+
+/*
+ * x86control flow change classification
+ * x86control flow changes include branches, interrupts, traps, faults
+ */
+enum {
+ X86_BR_NONE = 0, /* unknown */
+
+ X86_BR_USER = 1 << 0, /* branch target is user */
+ X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
+
+ X86_BR_CALL = 1 << 2, /* call */
+ X86_BR_RET = 1 << 3, /* return */
+ X86_BR_SYSCALL = 1 << 4, /* syscall */
+ X86_BR_SYSRET = 1 << 5, /* syscall return */
+ X86_BR_INT = 1 << 6, /* sw interrupt */
+ X86_BR_IRET = 1 << 7, /* return from interrupt */
+ X86_BR_JCC = 1 << 8, /* conditional */
+ X86_BR_JMP = 1 << 9, /* jump */
+ X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
+ X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+ X86_BR_ABORT = 1 << 12,/* transaction abort */
+ X86_BR_IN_TX = 1 << 13,/* in transaction */
+ X86_BR_NO_TX = 1 << 14,/* not in transaction */
+};
+
+#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
+#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
+
+#define X86_BR_ANY \
+ (X86_BR_CALL |\
+ X86_BR_RET |\
+ X86_BR_SYSCALL |\
+ X86_BR_SYSRET |\
+ X86_BR_INT |\
+ X86_BR_IRET |\
+ X86_BR_JCC |\
+ X86_BR_JMP |\
+ X86_BR_IRQ |\
+ X86_BR_ABORT |\
+ X86_BR_IND_CALL)
+
+#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
+
+#define X86_BR_ANY_CALL \
+ (X86_BR_CALL |\
+ X86_BR_IND_CALL |\
+ X86_BR_SYSCALL |\
+ X86_BR_IRQ |\
+ X86_BR_INT)
+
+static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
+
/*
* We only support LBR implementations that have FREEZE_LBRS_ON_PMI
* otherwise it becomes near impossible to get a reliable stack.
@@ -21,6 +133,10 @@ enum {
static void __intel_pmu_lbr_enable(void)
{
u64 debugctl;
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (cpuc->lbr_sel)
+ wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
@@ -76,11 +192,11 @@ void intel_pmu_lbr_enable(struct perf_event *event)
* Reset the LBR stack if we changed task context to
* avoid data leaks.
*/
-
if (event->ctx->task && cpuc->lbr_context != event->ctx) {
intel_pmu_lbr_reset();
cpuc->lbr_context = event->ctx;
}
+ cpuc->br_sel = event->hw.branch_reg.reg;
cpuc->lbr_users++;
}
@@ -95,8 +211,11 @@ void intel_pmu_lbr_disable(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
- if (cpuc->enabled && !cpuc->lbr_users)
+ if (cpuc->enabled && !cpuc->lbr_users) {
__intel_pmu_lbr_disable();
+ /* avoid stale pointer */
+ cpuc->lbr_context = NULL;
+ }
}
void intel_pmu_lbr_enable_all(void)
@@ -115,6 +234,9 @@ void intel_pmu_lbr_disable_all(void)
__intel_pmu_lbr_disable();
}
+/*
+ * TOS = most recently recorded branch
+ */
static inline u64 intel_pmu_lbr_tos(void)
{
u64 tos;
@@ -142,15 +264,15 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
- cpuc->lbr_entries[i].from = msr_lastbranch.from;
- cpuc->lbr_entries[i].to = msr_lastbranch.to;
- cpuc->lbr_entries[i].flags = 0;
+ cpuc->lbr_entries[i].from = msr_lastbranch.from;
+ cpuc->lbr_entries[i].to = msr_lastbranch.to;
+ cpuc->lbr_entries[i].mispred = 0;
+ cpuc->lbr_entries[i].predicted = 0;
+ cpuc->lbr_entries[i].reserved = 0;
}
cpuc->lbr_stack.nr = i;
}
-#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
-
/*
* Due to lack of segmentation in Linux the effective address (offset)
* is the same as the linear address, allowing us to merge the LIP and EIP
@@ -162,24 +284,50 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
int lbr_format = x86_pmu.intel_cap.lbr_format;
u64 tos = intel_pmu_lbr_tos();
int i;
+ int out = 0;
for (i = 0; i < x86_pmu.lbr_nr; i++) {
unsigned long lbr_idx = (tos - i) & mask;
- u64 from, to, flags = 0;
+ u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
+ int skip = 0;
+ int lbr_flags = lbr_desc[lbr_format];
rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
- if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
- flags = !!(from & LBR_FROM_FLAG_MISPRED);
- from = (u64)((((s64)from) << 1) >> 1);
+ if (lbr_flags & LBR_EIP_FLAGS) {
+ mis = !!(from & LBR_FROM_FLAG_MISPRED);
+ pred = !mis;
+ skip = 1;
}
-
- cpuc->lbr_entries[i].from = from;
- cpuc->lbr_entries[i].to = to;
- cpuc->lbr_entries[i].flags = flags;
+ if (lbr_flags & LBR_TSX) {
+ in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
+ abort = !!(from & LBR_FROM_FLAG_ABORT);
+ skip = 3;
+ }
+ from = (u64)((((s64)from) << skip) >> skip);
+
+ /*
+ * Some CPUs report duplicated abort records,
+ * with the second entry not having an abort bit set.
+ * Skip them here. This loop runs backwards,
+ * so we need to undo the previous record.
+ * If the abort just happened outside the window
+ * the extra entry cannot be removed.
+ */
+ if (abort && x86_pmu.lbr_double_abort && out > 0)
+ out--;
+
+ cpuc->lbr_entries[out].from = from;
+ cpuc->lbr_entries[out].to = to;
+ cpuc->lbr_entries[out].mispred = mis;
+ cpuc->lbr_entries[out].predicted = pred;
+ cpuc->lbr_entries[out].in_tx = in_tx;
+ cpuc->lbr_entries[out].abort = abort;
+ cpuc->lbr_entries[out].reserved = 0;
+ out++;
}
- cpuc->lbr_stack.nr = i;
+ cpuc->lbr_stack.nr = out;
}
void intel_pmu_lbr_read(void)
@@ -193,28 +341,439 @@ void intel_pmu_lbr_read(void)
intel_pmu_lbr_read_32(cpuc);
else
intel_pmu_lbr_read_64(cpuc);
+
+ intel_pmu_lbr_filter(cpuc);
+}
+
+/*
+ * SW filter is used:
+ * - in case there is no HW filter
+ * - in case the HW filter has errata or limitations
+ */
+static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+{
+ u64 br_type = event->attr.branch_sample_type;
+ int mask = 0;
+
+ if (br_type & PERF_SAMPLE_BRANCH_USER)
+ mask |= X86_BR_USER;
+
+ if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
+ mask |= X86_BR_KERNEL;
+
+ /* we ignore BRANCH_HV here */
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY)
+ mask |= X86_BR_ANY;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
+ mask |= X86_BR_ANY_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+ mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
+
+ if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
+ mask |= X86_BR_IND_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
+ mask |= X86_BR_ABORT;
+
+ if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
+ mask |= X86_BR_IN_TX;
+
+ if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
+ mask |= X86_BR_NO_TX;
+
+ if (br_type & PERF_SAMPLE_BRANCH_COND)
+ mask |= X86_BR_JCC;
+
+ /*
+ * stash actual user request into reg, it may
+ * be used by fixup code for some CPU
+ */
+ event->hw.branch_reg.reg = mask;
+}
+
+/*
+ * setup the HW LBR filter
+ * Used only when available, may not be enough to disambiguate
+ * all branches, may need the help of the SW filter
+ */
+static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg;
+ u64 br_type = event->attr.branch_sample_type;
+ u64 mask = 0, m;
+ u64 v;
+
+ for_each_branch_sample_type(m) {
+ if (!(br_type & m))
+ continue;
+
+ v = x86_pmu.lbr_sel_map[m];
+ if (v == LBR_NOT_SUPP)
+ return -EOPNOTSUPP;
+
+ if (v != LBR_IGN)
+ mask |= v;
+ }
+ reg = &event->hw.branch_reg;
+ reg->idx = EXTRA_REG_LBR;
+
+ /* LBR_SELECT operates in suppress mode so invert mask */
+ reg->config = ~mask & x86_pmu.lbr_sel_mask;
+
+ return 0;
+}
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event)
+{
+ int ret = 0;
+
+ /*
+ * no LBR on this PMU
+ */
+ if (!x86_pmu.lbr_nr)
+ return -EOPNOTSUPP;
+
+ /*
+ * setup SW LBR filter
+ */
+ intel_pmu_setup_sw_lbr_filter(event);
+
+ /*
+ * setup HW LBR filter, if any
+ */
+ if (x86_pmu.lbr_sel_map)
+ ret = intel_pmu_setup_hw_lbr_filter(event);
+
+ return ret;
}
+/*
+ * return the type of control flow change at address "from"
+ * intruction is not necessarily a branch (in case of interrupt).
+ *
+ * The branch type returned also includes the priv level of the
+ * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
+ *
+ * If a branch type is unknown OR the instruction cannot be
+ * decoded (e.g., text page not present), then X86_BR_NONE is
+ * returned.
+ */
+static int branch_type(unsigned long from, unsigned long to, int abort)
+{
+ struct insn insn;
+ void *addr;
+ int bytes, size = MAX_INSN_SIZE;
+ int ret = X86_BR_NONE;
+ int ext, to_plm, from_plm;
+ u8 buf[MAX_INSN_SIZE];
+ int is64 = 0;
+
+ to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
+ from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
+
+ /*
+ * maybe zero if lbr did not fill up after a reset by the time
+ * we get a PMU interrupt
+ */
+ if (from == 0 || to == 0)
+ return X86_BR_NONE;
+
+ if (abort)
+ return X86_BR_ABORT | to_plm;
+
+ if (from_plm == X86_BR_USER) {
+ /*
+ * can happen if measuring at the user level only
+ * and we interrupt in a kernel thread, e.g., idle.
+ */
+ if (!current->mm)
+ return X86_BR_NONE;
+
+ /* may fail if text not present */
+ bytes = copy_from_user_nmi(buf, (void __user *)from, size);
+ if (bytes != 0)
+ return X86_BR_NONE;
+
+ addr = buf;
+ } else {
+ /*
+ * The LBR logs any address in the IP, even if the IP just
+ * faulted. This means userspace can control the from address.
+ * Ensure we don't blindy read any address by validating it is
+ * a known text address.
+ */
+ if (kernel_text_address(from))
+ addr = (void *)from;
+ else
+ return X86_BR_NONE;
+ }
+
+ /*
+ * decoder needs to know the ABI especially
+ * on 64-bit systems running 32-bit apps
+ */
+#ifdef CONFIG_X86_64
+ is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
+#endif
+ insn_init(&insn, addr, is64);
+ insn_get_opcode(&insn);
+
+ switch (insn.opcode.bytes[0]) {
+ case 0xf:
+ switch (insn.opcode.bytes[1]) {
+ case 0x05: /* syscall */
+ case 0x34: /* sysenter */
+ ret = X86_BR_SYSCALL;
+ break;
+ case 0x07: /* sysret */
+ case 0x35: /* sysexit */
+ ret = X86_BR_SYSRET;
+ break;
+ case 0x80 ... 0x8f: /* conditional */
+ ret = X86_BR_JCC;
+ break;
+ default:
+ ret = X86_BR_NONE;
+ }
+ break;
+ case 0x70 ... 0x7f: /* conditional */
+ ret = X86_BR_JCC;
+ break;
+ case 0xc2: /* near ret */
+ case 0xc3: /* near ret */
+ case 0xca: /* far ret */
+ case 0xcb: /* far ret */
+ ret = X86_BR_RET;
+ break;
+ case 0xcf: /* iret */
+ ret = X86_BR_IRET;
+ break;
+ case 0xcc ... 0xce: /* int */
+ ret = X86_BR_INT;
+ break;
+ case 0xe8: /* call near rel */
+ case 0x9a: /* call far absolute */
+ ret = X86_BR_CALL;
+ break;
+ case 0xe0 ... 0xe3: /* loop jmp */
+ ret = X86_BR_JCC;
+ break;
+ case 0xe9 ... 0xeb: /* jmp */
+ ret = X86_BR_JMP;
+ break;
+ case 0xff: /* call near absolute, call far absolute ind */
+ insn_get_modrm(&insn);
+ ext = (insn.modrm.bytes[0] >> 3) & 0x7;
+ switch (ext) {
+ case 2: /* near ind call */
+ case 3: /* far ind call */
+ ret = X86_BR_IND_CALL;
+ break;
+ case 4:
+ case 5:
+ ret = X86_BR_JMP;
+ break;
+ }
+ break;
+ default:
+ ret = X86_BR_NONE;
+ }
+ /*
+ * interrupts, traps, faults (and thus ring transition) may
+ * occur on any instructions. Thus, to classify them correctly,
+ * we need to first look at the from and to priv levels. If they
+ * are different and to is in the kernel, then it indicates
+ * a ring transition. If the from instruction is not a ring
+ * transition instr (syscall, systenter, int), then it means
+ * it was a irq, trap or fault.
+ *
+ * we have no way of detecting kernel to kernel faults.
+ */
+ if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
+ && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
+ ret = X86_BR_IRQ;
+
+ /*
+ * branch priv level determined by target as
+ * is done by HW when LBR_SELECT is implemented
+ */
+ if (ret != X86_BR_NONE)
+ ret |= to_plm;
+
+ return ret;
+}
+
+/*
+ * implement actual branch filter based on user demand.
+ * Hardware may not exactly satisfy that request, thus
+ * we need to inspect opcodes. Mismatched branches are
+ * discarded. Therefore, the number of branches returned
+ * in PERF_SAMPLE_BRANCH_STACK sample may vary.
+ */
+static void
+intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
+{
+ u64 from, to;
+ int br_sel = cpuc->br_sel;
+ int i, j, type;
+ bool compress = false;
+
+ /* if sampling all branches, then nothing to filter */
+ if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+ return;
+
+ for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+
+ from = cpuc->lbr_entries[i].from;
+ to = cpuc->lbr_entries[i].to;
+
+ type = branch_type(from, to, cpuc->lbr_entries[i].abort);
+ if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
+ if (cpuc->lbr_entries[i].in_tx)
+ type |= X86_BR_IN_TX;
+ else
+ type |= X86_BR_NO_TX;
+ }
+
+ /* if type does not correspond, then discard */
+ if (type == X86_BR_NONE || (br_sel & type) != type) {
+ cpuc->lbr_entries[i].from = 0;
+ compress = true;
+ }
+ }
+
+ if (!compress)
+ return;
+
+ /* remove all entries with from=0 */
+ for (i = 0; i < cpuc->lbr_stack.nr; ) {
+ if (!cpuc->lbr_entries[i].from) {
+ j = i;
+ while (++j < cpuc->lbr_stack.nr)
+ cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
+ cpuc->lbr_stack.nr--;
+ if (!cpuc->lbr_entries[i].from)
+ continue;
+ }
+ i++;
+ }
+}
+
+/*
+ * Map interface branch filters onto LBR filters
+ */
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
+ [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
+ | LBR_IND_JMP | LBR_FAR,
+ /*
+ * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
+ */
+ [PERF_SAMPLE_BRANCH_ANY_CALL] =
+ LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
+ /*
+ * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
+ */
+ [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
+};
+
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
+ [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
+};
+
+/* core */
void intel_pmu_lbr_init_core(void)
{
x86_pmu.lbr_nr = 4;
- x86_pmu.lbr_tos = 0x01c9;
- x86_pmu.lbr_from = 0x40;
- x86_pmu.lbr_to = 0x60;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
+ x86_pmu.lbr_to = MSR_LBR_CORE_TO;
+
+ /*
+ * SW branch filter usage:
+ * - compensate for lack of HW filter
+ */
+ pr_cont("4-deep LBR, ");
}
+/* nehalem/westmere */
void intel_pmu_lbr_init_nhm(void)
{
x86_pmu.lbr_nr = 16;
- x86_pmu.lbr_tos = 0x01c9;
- x86_pmu.lbr_from = 0x680;
- x86_pmu.lbr_to = 0x6c0;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
+
+ /*
+ * SW branch filter usage:
+ * - workaround LBR_SEL errata (see above)
+ * - support syscall, sysret capture.
+ * That requires LBR_FAR but that means far
+ * jmp need to be filtered out
+ */
+ pr_cont("16-deep LBR, ");
+}
+
+/* sandy bridge */
+void intel_pmu_lbr_init_snb(void)
+{
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = snb_lbr_sel_map;
+
+ /*
+ * SW branch filter usage:
+ * - support syscall, sysret capture.
+ * That requires LBR_FAR but that means far
+ * jmp need to be filtered out
+ */
+ pr_cont("16-deep LBR, ");
}
+/* atom */
void intel_pmu_lbr_init_atom(void)
{
+ /*
+ * only models starting at stepping 10 seems
+ * to have an operational LBR which can freeze
+ * on PMU interrupt
+ */
+ if (boot_cpu_data.x86_model == 28
+ && boot_cpu_data.x86_mask < 10) {
+ pr_cont("LBR disabled due to erratum");
+ return;
+ }
+
x86_pmu.lbr_nr = 8;
- x86_pmu.lbr_tos = 0x01c9;
- x86_pmu.lbr_from = 0x40;
- x86_pmu.lbr_to = 0x60;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
+ x86_pmu.lbr_to = MSR_LBR_CORE_TO;
+
+ /*
+ * SW branch filter usage:
+ * - compensate for lack of HW filter
+ */
+ pr_cont("8-deep LBR, ");
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
new file mode 100644
index 00000000000..619f7699487
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -0,0 +1,714 @@
+/*
+ * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
+ * Copyright (C) 2013 Google, Inc., Stephane Eranian
+ *
+ * Intel RAPL interface is specified in the IA-32 Manual Vol3b
+ * section 14.7.1 (September 2013)
+ *
+ * RAPL provides more controls than just reporting energy consumption
+ * however here we only expose the 3 energy consumption free running
+ * counters (pp0, pkg, dram).
+ *
+ * Each of those counters increments in a power unit defined by the
+ * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
+ * but it can vary.
+ *
+ * Counter to rapl events mappings:
+ *
+ * pp0 counter: consumption of all physical cores (power plane 0)
+ * event: rapl_energy_cores
+ * perf code: 0x1
+ *
+ * pkg counter: consumption of the whole processor package
+ * event: rapl_energy_pkg
+ * perf code: 0x2
+ *
+ * dram counter: consumption of the dram domain (servers only)
+ * event: rapl_energy_dram
+ * perf code: 0x3
+ *
+ * dram counter: consumption of the builtin-gpu domain (client only)
+ * event: rapl_energy_gpu
+ * perf code: 0x4
+ *
+ * We manage those counters as free running (read-only). They may be
+ * use simultaneously by other tools, such as turbostat.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it does not make sense and is not
+ * supported by the RAPL hardware.
+ *
+ * Because we want to avoid floating-point operations in the kernel,
+ * the events are all reported in fixed point arithmetic (32.32).
+ * Tools must adjust the counts to convert them to Watts using
+ * the duration of the measurement. Tools may use a function such as
+ * ldexp(raw_count, -32);
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "perf_event.h"
+
+/*
+ * RAPL energy status counters
+ */
+#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */
+#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */
+#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */
+#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */
+#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */
+#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */
+#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
+#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
+
+/* Clients have PP0, PKG */
+#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
+ 1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_PP1_NRG_STAT)
+
+/* Servers have PP0, PKG, RAM */
+#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\
+ 1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_RAM_NRG_STAT)
+
+/* Servers have PP0, PKG, RAM, PP1 */
+#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\
+ 1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_RAM_NRG_STAT|\
+ 1<<RAPL_IDX_PP1_NRG_STAT)
+
+/*
+ * event code: LSB 8 bits, passed in attr->config
+ * any other bit is reserved
+ */
+#define RAPL_EVENT_MASK 0xFFULL
+
+#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \
+static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ char *page) \
+{ \
+ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
+ return sprintf(page, _format "\n"); \
+} \
+static struct kobj_attribute format_attr_##_var = \
+ __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
+
+#define RAPL_EVENT_DESC(_name, _config) \
+{ \
+ .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \
+ .config = _config, \
+}
+
+#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
+
+struct rapl_pmu {
+ spinlock_t lock;
+ int hw_unit; /* 1/2^hw_unit Joule */
+ int n_active; /* number of active events */
+ struct list_head active_list;
+ struct pmu *pmu; /* pointer to rapl_pmu_class */
+ ktime_t timer_interval; /* in ktime_t unit */
+ struct hrtimer hrtimer;
+};
+
+static struct pmu rapl_pmu_class;
+static cpumask_t rapl_cpu_mask;
+static int rapl_cntr_mask;
+
+static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
+static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
+
+static inline u64 rapl_read_counter(struct perf_event *event)
+{
+ u64 raw;
+ rdmsrl(event->hw.event_base, raw);
+ return raw;
+}
+
+static inline u64 rapl_scale(u64 v)
+{
+ /*
+ * scale delta to smallest unit (1/2^32)
+ * users must then scale back: count * 1/(1e9*2^32) to get Joules
+ * or use ldexp(count, -32).
+ * Watts = Joules/Time delta
+ */
+ return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit);
+}
+
+static u64 rapl_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_raw_count, new_raw_count;
+ s64 delta, sdelta;
+ int shift = RAPL_CNTR_WIDTH;
+
+again:
+ prev_raw_count = local64_read(&hwc->prev_count);
+ rdmsrl(event->hw.event_base, new_raw_count);
+
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count) {
+ cpu_relax();
+ goto again;
+ }
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ sdelta = rapl_scale(delta);
+
+ local64_add(sdelta, &event->count);
+
+ return new_raw_count;
+}
+
+static void rapl_start_hrtimer(struct rapl_pmu *pmu)
+{
+ __hrtimer_start_range_ns(&pmu->hrtimer,
+ pmu->timer_interval, 0,
+ HRTIMER_MODE_REL_PINNED, 0);
+}
+
+static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
+{
+ hrtimer_cancel(&pmu->hrtimer);
+}
+
+static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+ struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ struct perf_event *event;
+ unsigned long flags;
+
+ if (!pmu->n_active)
+ return HRTIMER_NORESTART;
+
+ spin_lock_irqsave(&pmu->lock, flags);
+
+ list_for_each_entry(event, &pmu->active_list, active_entry) {
+ rapl_event_update(event);
+ }
+
+ spin_unlock_irqrestore(&pmu->lock, flags);
+
+ hrtimer_forward_now(hrtimer, pmu->timer_interval);
+
+ return HRTIMER_RESTART;
+}
+
+static void rapl_hrtimer_init(struct rapl_pmu *pmu)
+{
+ struct hrtimer *hr = &pmu->hrtimer;
+
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hr->function = rapl_hrtimer_handle;
+}
+
+static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
+ struct perf_event *event)
+{
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+
+ list_add_tail(&event->active_entry, &pmu->active_list);
+
+ local64_set(&event->hw.prev_count, rapl_read_counter(event));
+
+ pmu->n_active++;
+ if (pmu->n_active == 1)
+ rapl_start_hrtimer(pmu);
+}
+
+static void rapl_pmu_event_start(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&pmu->lock, flags);
+ __rapl_pmu_event_start(pmu, event);
+ spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static void rapl_pmu_event_stop(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pmu->lock, flags);
+
+ /* mark event as deactivated and stopped */
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ WARN_ON_ONCE(pmu->n_active <= 0);
+ pmu->n_active--;
+ if (pmu->n_active == 0)
+ rapl_stop_hrtimer(pmu);
+
+ list_del(&event->active_entry);
+
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ /* check if update of sw counter is necessary */
+ if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of a event
+ * that we are disabling:
+ */
+ rapl_event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+
+ spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static int rapl_pmu_event_add(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pmu->lock, flags);
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_START)
+ __rapl_pmu_event_start(pmu, event);
+
+ spin_unlock_irqrestore(&pmu->lock, flags);
+
+ return 0;
+}
+
+static void rapl_pmu_event_del(struct perf_event *event, int flags)
+{
+ rapl_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int rapl_pmu_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config & RAPL_EVENT_MASK;
+ int bit, msr, ret = 0;
+
+ /* only look at RAPL events */
+ if (event->attr.type != rapl_pmu_class.type)
+ return -ENOENT;
+
+ /* check only supported bits are set */
+ if (event->attr.config & ~RAPL_EVENT_MASK)
+ return -EINVAL;
+
+ /*
+ * check event is known (determines counter)
+ */
+ switch (cfg) {
+ case INTEL_RAPL_PP0:
+ bit = RAPL_IDX_PP0_NRG_STAT;
+ msr = MSR_PP0_ENERGY_STATUS;
+ break;
+ case INTEL_RAPL_PKG:
+ bit = RAPL_IDX_PKG_NRG_STAT;
+ msr = MSR_PKG_ENERGY_STATUS;
+ break;
+ case INTEL_RAPL_RAM:
+ bit = RAPL_IDX_RAM_NRG_STAT;
+ msr = MSR_DRAM_ENERGY_STATUS;
+ break;
+ case INTEL_RAPL_PP1:
+ bit = RAPL_IDX_PP1_NRG_STAT;
+ msr = MSR_PP1_ENERGY_STATUS;
+ break;
+ default:
+ return -EINVAL;
+ }
+ /* check event supported */
+ if (!(rapl_cntr_mask & (1 << bit)))
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ /* must be done before validate_group */
+ event->hw.event_base = msr;
+ event->hw.config = cfg;
+ event->hw.idx = bit;
+
+ return ret;
+}
+
+static void rapl_pmu_event_read(struct perf_event *event)
+{
+ rapl_event_update(event);
+}
+
+static ssize_t rapl_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);
+
+ buf[n++] = '\n';
+ buf[n] = '\0';
+ return n;
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
+
+static struct attribute *rapl_pmu_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_attr_group = {
+ .attrs = rapl_pmu_attrs,
+};
+
+EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
+EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
+EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
+EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
+
+EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
+EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
+EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
+EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
+
+/*
+ * we compute in 0.23 nJ increments regardless of MSR
+ */
+EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
+EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
+EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
+EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
+
+static struct attribute *rapl_events_srv_attr[] = {
+ EVENT_PTR(rapl_cores),
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_ram),
+
+ EVENT_PTR(rapl_cores_unit),
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_ram_unit),
+
+ EVENT_PTR(rapl_cores_scale),
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_ram_scale),
+ NULL,
+};
+
+static struct attribute *rapl_events_cln_attr[] = {
+ EVENT_PTR(rapl_cores),
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_gpu),
+
+ EVENT_PTR(rapl_cores_unit),
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_gpu_unit),
+
+ EVENT_PTR(rapl_cores_scale),
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_gpu_scale),
+ NULL,
+};
+
+static struct attribute *rapl_events_hsw_attr[] = {
+ EVENT_PTR(rapl_cores),
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_gpu),
+ EVENT_PTR(rapl_ram),
+
+ EVENT_PTR(rapl_cores_unit),
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_gpu_unit),
+ EVENT_PTR(rapl_ram_unit),
+
+ EVENT_PTR(rapl_cores_scale),
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_gpu_scale),
+ EVENT_PTR(rapl_ram_scale),
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_events_group = {
+ .name = "events",
+ .attrs = NULL, /* patched at runtime */
+};
+
+DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
+static struct attribute *rapl_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_format_group = {
+ .name = "format",
+ .attrs = rapl_formats_attr,
+};
+
+const struct attribute_group *rapl_attr_groups[] = {
+ &rapl_pmu_attr_group,
+ &rapl_pmu_format_group,
+ &rapl_pmu_events_group,
+ NULL,
+};
+
+static struct pmu rapl_pmu_class = {
+ .attr_groups = rapl_attr_groups,
+ .task_ctx_nr = perf_invalid_context, /* system-wide only */
+ .event_init = rapl_pmu_event_init,
+ .add = rapl_pmu_event_add, /* must have */
+ .del = rapl_pmu_event_del, /* must have */
+ .start = rapl_pmu_event_start,
+ .stop = rapl_pmu_event_stop,
+ .read = rapl_pmu_event_read,
+};
+
+static void rapl_cpu_exit(int cpu)
+{
+ struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
+ int i, phys_id = topology_physical_package_id(cpu);
+ int target = -1;
+
+ /* find a new cpu on same package */
+ for_each_online_cpu(i) {
+ if (i == cpu)
+ continue;
+ if (phys_id == topology_physical_package_id(i)) {
+ target = i;
+ break;
+ }
+ }
+ /*
+ * clear cpu from cpumask
+ * if was set in cpumask and still some cpu on package,
+ * then move to new cpu
+ */
+ if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
+ cpumask_set_cpu(target, &rapl_cpu_mask);
+
+ WARN_ON(cpumask_empty(&rapl_cpu_mask));
+ /*
+ * migrate events and context to new cpu
+ */
+ if (target >= 0)
+ perf_pmu_migrate_context(pmu->pmu, cpu, target);
+
+ /* cancel overflow polling timer for CPU */
+ rapl_stop_hrtimer(pmu);
+}
+
+static void rapl_cpu_init(int cpu)
+{
+ int i, phys_id = topology_physical_package_id(cpu);
+
+ /* check if phys_is is already covered */
+ for_each_cpu(i, &rapl_cpu_mask) {
+ if (phys_id == topology_physical_package_id(i))
+ return;
+ }
+ /* was not found, so add it */
+ cpumask_set_cpu(cpu, &rapl_cpu_mask);
+}
+
+static int rapl_cpu_prepare(int cpu)
+{
+ struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
+ int phys_id = topology_physical_package_id(cpu);
+ u64 ms;
+ u64 msr_rapl_power_unit_bits;
+
+ if (pmu)
+ return 0;
+
+ if (phys_id < 0)
+ return -1;
+
+ /* protect rdmsrl() to handle virtualization */
+ if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
+ return -1;
+
+ pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
+ if (!pmu)
+ return -1;
+
+ spin_lock_init(&pmu->lock);
+
+ INIT_LIST_HEAD(&pmu->active_list);
+
+ /*
+ * grab power unit as: 1/2^unit Joules
+ *
+ * we cache in local PMU instance
+ */
+ pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+ pmu->pmu = &rapl_pmu_class;
+
+ /*
+ * use reference of 200W for scaling the timeout
+ * to avoid missing counter overflows.
+ * 200W = 200 Joules/sec
+ * divide interval by 2 to avoid lockstep (2 * 100)
+ * if hw unit is 32, then we use 2 ms 1/200/2
+ */
+ if (pmu->hw_unit < 32)
+ ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
+ else
+ ms = 2;
+
+ pmu->timer_interval = ms_to_ktime(ms);
+
+ rapl_hrtimer_init(pmu);
+
+ /* set RAPL pmu for this cpu for now */
+ per_cpu(rapl_pmu, cpu) = pmu;
+ per_cpu(rapl_pmu_to_free, cpu) = NULL;
+
+ return 0;
+}
+
+static void rapl_cpu_kfree(int cpu)
+{
+ struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
+
+ kfree(pmu);
+
+ per_cpu(rapl_pmu_to_free, cpu) = NULL;
+}
+
+static int rapl_cpu_dying(int cpu)
+{
+ struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
+
+ if (!pmu)
+ return 0;
+
+ per_cpu(rapl_pmu, cpu) = NULL;
+
+ per_cpu(rapl_pmu_to_free, cpu) = pmu;
+
+ return 0;
+}
+
+static int rapl_cpu_notifier(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ rapl_cpu_prepare(cpu);
+ break;
+ case CPU_STARTING:
+ rapl_cpu_init(cpu);
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DYING:
+ rapl_cpu_dying(cpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ rapl_cpu_kfree(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ rapl_cpu_exit(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id rapl_cpu_match[] = {
+ [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
+ [1] = {},
+};
+
+static int __init rapl_pmu_init(void)
+{
+ struct rapl_pmu *pmu;
+ int cpu, ret;
+
+ /*
+ * check for Intel processor family 6
+ */
+ if (!x86_match_cpu(rapl_cpu_match))
+ return 0;
+
+ /* check supported CPU */
+ switch (boot_cpu_data.x86_model) {
+ case 42: /* Sandy Bridge */
+ case 58: /* Ivy Bridge */
+ rapl_cntr_mask = RAPL_IDX_CLN;
+ rapl_pmu_events_group.attrs = rapl_events_cln_attr;
+ break;
+ case 60: /* Haswell */
+ case 69: /* Haswell-Celeron */
+ rapl_cntr_mask = RAPL_IDX_HSW;
+ rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
+ break;
+ case 45: /* Sandy Bridge-EP */
+ case 62: /* IvyTown */
+ rapl_cntr_mask = RAPL_IDX_SRV;
+ rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+ break;
+
+ default:
+ /* unsupported */
+ return 0;
+ }
+
+ cpu_notifier_register_begin();
+
+ for_each_online_cpu(cpu) {
+ ret = rapl_cpu_prepare(cpu);
+ if (ret)
+ goto out;
+ rapl_cpu_init(cpu);
+ }
+
+ __perf_cpu_notifier(rapl_cpu_notifier);
+
+ ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
+ if (WARN_ON(ret)) {
+ pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
+ cpu_notifier_register_done();
+ return -1;
+ }
+
+ pmu = __get_cpu_var(rapl_pmu);
+
+ pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
+ " API unit is 2^-32 Joules,"
+ " %d fixed counters"
+ " %llu ms ovfl timer\n",
+ pmu->hw_unit,
+ hweight32(rapl_cntr_mask),
+ ktime_to_ms(pmu->timer_interval));
+
+out:
+ cpu_notifier_register_done();
+
+ return 0;
+}
+device_initcall(rapl_pmu_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
new file mode 100644
index 00000000000..ae6552a0701
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -0,0 +1,4298 @@
+#include "perf_event_intel_uncore.h"
+
+static struct intel_uncore_type *empty_uncore[] = { NULL, };
+static struct intel_uncore_type **msr_uncores = empty_uncore;
+static struct intel_uncore_type **pci_uncores = empty_uncore;
+/* pci bus to socket mapping */
+static int pcibus_to_physid[256] = { [0 ... 255] = -1, };
+
+static struct pci_dev *extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
+
+static DEFINE_RAW_SPINLOCK(uncore_box_lock);
+
+/* mask of cpus that collect uncore events */
+static cpumask_t uncore_cpu_mask;
+
+/* constraint for the fixed counter */
+static struct event_constraint constraint_fixed =
+ EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
+static struct event_constraint constraint_empty =
+ EVENT_CONSTRAINT(0, 0, 0);
+
+#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \
+ ((1ULL << (n)) - 1)))
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
+DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
+DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
+DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
+
+static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
+static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
+static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
+static void uncore_pmu_event_read(struct perf_event *event);
+
+static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
+{
+ return container_of(event->pmu, struct intel_uncore_pmu, pmu);
+}
+
+static struct intel_uncore_box *
+uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
+{
+ struct intel_uncore_box *box;
+
+ box = *per_cpu_ptr(pmu->box, cpu);
+ if (box)
+ return box;
+
+ raw_spin_lock(&uncore_box_lock);
+ list_for_each_entry(box, &pmu->box_list, list) {
+ if (box->phys_id == topology_physical_package_id(cpu)) {
+ atomic_inc(&box->refcnt);
+ *per_cpu_ptr(pmu->box, cpu) = box;
+ break;
+ }
+ }
+ raw_spin_unlock(&uncore_box_lock);
+
+ return *per_cpu_ptr(pmu->box, cpu);
+}
+
+static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
+{
+ /*
+ * perf core schedules event on the basis of cpu, uncore events are
+ * collected by one of the cpus inside a physical package.
+ */
+ return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
+}
+
+static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ u64 count;
+
+ rdmsrl(event->hw.event_base, count);
+
+ return count;
+}
+
+/*
+ * generic get constraint function for shared match/mask registers.
+ */
+static struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_extra_reg *er;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ unsigned long flags;
+ bool ok = false;
+
+ /*
+ * reg->alloc can be set due to existing state, so for fake box we
+ * need to ignore this, otherwise we might fail to allocate proper
+ * fake state for this extra reg constraint.
+ */
+ if (reg1->idx == EXTRA_REG_NONE ||
+ (!uncore_box_is_fake(box) && reg1->alloc))
+ return NULL;
+
+ er = &box->shared_regs[reg1->idx];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (!atomic_read(&er->ref) ||
+ (er->config1 == reg1->config && er->config2 == reg2->config)) {
+ atomic_inc(&er->ref);
+ er->config1 = reg1->config;
+ er->config2 = reg2->config;
+ ok = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ if (ok) {
+ if (!uncore_box_is_fake(box))
+ reg1->alloc = 1;
+ return NULL;
+ }
+
+ return &constraint_empty;
+}
+
+static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_extra_reg *er;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+
+ /*
+ * Only put constraint if extra reg was actually allocated. Also
+ * takes care of event which do not use an extra shared reg.
+ *
+ * Also, if this is a fake box we shouldn't touch any event state
+ * (reg->alloc) and we don't care about leaving inconsistent box
+ * state either since it will be thrown out.
+ */
+ if (uncore_box_is_fake(box) || !reg1->alloc)
+ return;
+
+ er = &box->shared_regs[reg1->idx];
+ atomic_dec(&er->ref);
+ reg1->alloc = 0;
+}
+
+static u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ u64 config;
+
+ er = &box->shared_regs[idx];
+
+ raw_spin_lock_irqsave(&er->lock, flags);
+ config = er->config;
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ return config;
+}
+
+/* Sandy Bridge-EP uncore support */
+static struct intel_uncore_type snbep_uncore_cbox;
+static struct intel_uncore_type snbep_uncore_pcu;
+
+static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+ u32 config = 0;
+
+ if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+ config |= SNBEP_PMON_BOX_CTL_FRZ;
+ pci_write_config_dword(pdev, box_ctl, config);
+ }
+}
+
+static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+ u32 config = 0;
+
+ if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+ config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+ pci_write_config_dword(pdev, box_ctl, config);
+ }
+}
+
+static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config);
+}
+
+static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
+ pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
+static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ u64 config;
+ unsigned msr;
+
+ msr = uncore_msr_box_ctl(box);
+ if (msr) {
+ rdmsrl(msr, config);
+ config |= SNBEP_PMON_BOX_CTL_FRZ;
+ wrmsrl(msr, config);
+ }
+}
+
+static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ u64 config;
+ unsigned msr;
+
+ msr = uncore_msr_box_ctl(box);
+ if (msr) {
+ rdmsrl(msr, config);
+ config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+ wrmsrl(msr, config);
+ }
+}
+
+static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE)
+ wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0));
+
+ wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, hwc->config);
+}
+
+static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+
+ if (msr)
+ wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static struct attribute *snbep_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid.attr,
+ &format_attr_filter_nid.attr,
+ &format_attr_filter_state.attr,
+ &format_attr_filter_opc.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_pcu_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge.attr,
+ &format_attr_filter_band0.attr,
+ &format_attr_filter_band1.attr,
+ &format_attr_filter_band2.attr,
+ &format_attr_filter_band3.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_qpi_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match_rds.attr,
+ &format_attr_match_rnid30.attr,
+ &format_attr_match_rnid4.attr,
+ &format_attr_match_dnid.attr,
+ &format_attr_match_mc.attr,
+ &format_attr_match_opc.attr,
+ &format_attr_match_vnw.attr,
+ &format_attr_match0.attr,
+ &format_attr_match1.attr,
+ &format_attr_mask_rds.attr,
+ &format_attr_mask_rnid30.attr,
+ &format_attr_mask_rnid4.attr,
+ &format_attr_mask_dnid.attr,
+ &format_attr_mask_mc.attr,
+ &format_attr_mask_opc.attr,
+ &format_attr_mask_vnw.attr,
+ &format_attr_mask0.attr,
+ &format_attr_mask1.attr,
+ NULL,
+};
+
+static struct uncore_event_desc snbep_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+ { /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc snbep_uncore_qpi_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
+ INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
+ INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"),
+ INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"),
+ { /* end: all zeroes */ },
+};
+
+static struct attribute_group snbep_uncore_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_ubox_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_cbox_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_pcu_formats_attr,
+};
+
+static struct attribute_group snbep_uncore_qpi_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_qpi_formats_attr,
+};
+
+#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
+ .init_box = snbep_uncore_msr_init_box, \
+ .disable_box = snbep_uncore_msr_disable_box, \
+ .enable_box = snbep_uncore_msr_enable_box, \
+ .disable_event = snbep_uncore_msr_disable_event, \
+ .enable_event = snbep_uncore_msr_enable_event, \
+ .read_counter = uncore_msr_read_counter
+
+static struct intel_uncore_ops snbep_uncore_msr_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \
+ .init_box = snbep_uncore_pci_init_box, \
+ .disable_box = snbep_uncore_pci_disable_box, \
+ .enable_box = snbep_uncore_pci_enable_box, \
+ .disable_event = snbep_uncore_pci_disable_event, \
+ .read_counter = snbep_uncore_pci_read_counter
+
+static struct intel_uncore_ops snbep_uncore_pci_ops = {
+ SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+ .enable_event = snbep_uncore_pci_enable_event, \
+};
+
+static struct event_constraint snbep_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
+ EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snbep_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &snbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
+ EVENT_EXTRA_END
+};
+
+static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+ int i;
+
+ if (uncore_box_is_fake(box))
+ return;
+
+ for (i = 0; i < 5; i++) {
+ if (reg1->alloc & (0x1 << i))
+ atomic_sub(1 << (i * 6), &er->ref);
+ }
+ reg1->alloc = 0;
+}
+
+static struct event_constraint *
+__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
+ u64 (*cbox_filter_mask)(int fields))
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+ int i, alloc = 0;
+ unsigned long flags;
+ u64 mask;
+
+ if (reg1->idx == EXTRA_REG_NONE)
+ return NULL;
+
+ raw_spin_lock_irqsave(&er->lock, flags);
+ for (i = 0; i < 5; i++) {
+ if (!(reg1->idx & (0x1 << i)))
+ continue;
+ if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+ continue;
+
+ mask = cbox_filter_mask(0x1 << i);
+ if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
+ !((reg1->config ^ er->config) & mask)) {
+ atomic_add(1 << (i * 6), &er->ref);
+ er->config &= ~mask;
+ er->config |= reg1->config & mask;
+ alloc |= (0x1 << i);
+ } else {
+ break;
+ }
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+ if (i < 5)
+ goto fail;
+
+ if (!uncore_box_is_fake(box))
+ reg1->alloc |= alloc;
+
+ return NULL;
+fail:
+ for (; i >= 0; i--) {
+ if (alloc & (0x1 << i))
+ atomic_sub(1 << (i * 6), &er->ref);
+ }
+ return &constraint_empty;
+}
+
+static u64 snbep_cbox_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+ if (fields & 0x4)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x8)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+
+ return mask;
+}
+
+static struct event_constraint *
+snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
+}
+
+static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+ SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_cbox_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_cbox_hw_config,
+ .get_constraint = snbep_cbox_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 44,
+ .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = SNBEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = snbep_uncore_cbox_constraints,
+ .ops = &snbep_uncore_cbox_ops,
+ .format_group = &snbep_uncore_cbox_format_group,
+};
+
+static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ u64 config = reg1->config;
+
+ if (new_idx > reg1->idx)
+ config <<= 8 * (new_idx - reg1->idx);
+ else
+ config >>= 8 * (reg1->idx - new_idx);
+
+ if (modify) {
+ hwc->config += new_idx - reg1->idx;
+ reg1->config = config;
+ reg1->idx = new_idx;
+ }
+ return config;
+}
+
+static struct event_constraint *
+snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+ unsigned long flags;
+ int idx = reg1->idx;
+ u64 mask, config1 = reg1->config;
+ bool ok = false;
+
+ if (reg1->idx == EXTRA_REG_NONE ||
+ (!uncore_box_is_fake(box) && reg1->alloc))
+ return NULL;
+again:
+ mask = 0xffULL << (idx * 8);
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
+ !((config1 ^ er->config) & mask)) {
+ atomic_add(1 << (idx * 8), &er->ref);
+ er->config &= ~mask;
+ er->config |= config1 & mask;
+ ok = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ if (!ok) {
+ idx = (idx + 1) % 4;
+ if (idx != reg1->idx) {
+ config1 = snbep_pcu_alter_er(event, idx, false);
+ goto again;
+ }
+ return &constraint_empty;
+ }
+
+ if (!uncore_box_is_fake(box)) {
+ if (idx != reg1->idx)
+ snbep_pcu_alter_er(event, idx, true);
+ reg1->alloc = 1;
+ }
+ return NULL;
+}
+
+static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+
+ if (uncore_box_is_fake(box) || !reg1->alloc)
+ return;
+
+ atomic_sub(1 << (reg1->idx * 8), &er->ref);
+ reg1->alloc = 0;
+}
+
+static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+ if (ev_sel >= 0xb && ev_sel <= 0xe) {
+ reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
+ reg1->idx = ev_sel - 0xb;
+ reg1->config = event->attr.config1 & (0xff << reg1->idx);
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_pcu_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_pcu_ops,
+ .format_group = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *snbep_msr_uncores[] = {
+ &snbep_uncore_ubox,
+ &snbep_uncore_cbox,
+ &snbep_uncore_pcu,
+ NULL,
+};
+
+enum {
+ SNBEP_PCI_QPI_PORT0_FILTER,
+ SNBEP_PCI_QPI_PORT1_FILTER,
+};
+
+static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
+ reg1->idx = 0;
+ reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
+ reg1->config = event->attr.config1;
+ reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
+ reg2->config = event->attr.config2;
+ }
+ return 0;
+}
+
+static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
+ struct pci_dev *filter_pdev = extra_pci_dev[box->phys_id][idx];
+ WARN_ON_ONCE(!filter_pdev);
+ if (filter_pdev) {
+ pci_write_config_dword(filter_pdev, reg1->reg,
+ (u32)reg1->config);
+ pci_write_config_dword(filter_pdev, reg1->reg + 4,
+ (u32)(reg1->config >> 32));
+ pci_write_config_dword(filter_pdev, reg2->reg,
+ (u32)reg2->config);
+ pci_write_config_dword(filter_pdev, reg2->reg + 4,
+ (u32)(reg2->config >> 32));
+ }
+ }
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops snbep_uncore_qpi_ops = {
+ SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+ .enable_event = snbep_qpi_enable_event,
+ .hw_config = snbep_qpi_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+#define SNBEP_UNCORE_PCI_COMMON_INIT() \
+ .perf_ctr = SNBEP_PCI_PMON_CTR0, \
+ .event_ctl = SNBEP_PCI_PMON_CTL0, \
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
+ .ops = &snbep_uncore_pci_ops, \
+ .format_group = &snbep_uncore_format_group
+
+static struct intel_uncore_type snbep_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = snbep_uncore_imc_events,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_qpi_ops,
+ .event_descs = snbep_uncore_qpi_events,
+ .format_group = &snbep_uncore_qpi_format_group,
+};
+
+
+static struct intel_uncore_type snbep_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r3qpi_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+ SNBEP_PCI_UNCORE_HA,
+ SNBEP_PCI_UNCORE_IMC,
+ SNBEP_PCI_UNCORE_QPI,
+ SNBEP_PCI_UNCORE_R2PCIE,
+ SNBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *snbep_pci_uncores[] = {
+ [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha,
+ [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc,
+ [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi,
+ [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie,
+ [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi,
+ NULL,
+};
+
+static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = {
+ { /* Home Agent */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
+ },
+ { /* MC Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
+ },
+ { /* QPI Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* R3QPI Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver snbep_uncore_pci_driver = {
+ .name = "snbep_uncore",
+ .id_table = snbep_uncore_pci_ids,
+};
+
+/*
+ * build pci bus to socket mapping
+ */
+static int snbep_pci2phy_map_init(int devid)
+{
+ struct pci_dev *ubox_dev = NULL;
+ int i, bus, nodeid;
+ int err = 0;
+ u32 config = 0;
+
+ while (1) {
+ /* find the UBOX device */
+ ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
+ if (!ubox_dev)
+ break;
+ bus = ubox_dev->bus->number;
+ /* get the Node ID of the local register */
+ err = pci_read_config_dword(ubox_dev, 0x40, &config);
+ if (err)
+ break;
+ nodeid = config;
+ /* get the Node ID mapping */
+ err = pci_read_config_dword(ubox_dev, 0x54, &config);
+ if (err)
+ break;
+ /*
+ * every three bits in the Node ID mapping register maps
+ * to a particular node.
+ */
+ for (i = 0; i < 8; i++) {
+ if (nodeid == ((config >> (3 * i)) & 0x7)) {
+ pcibus_to_physid[bus] = i;
+ break;
+ }
+ }
+ }
+
+ if (!err) {
+ /*
+ * For PCI bus with no UBOX device, find the next bus
+ * that has UBOX device and use its mapping.
+ */
+ i = -1;
+ for (bus = 255; bus >= 0; bus--) {
+ if (pcibus_to_physid[bus] >= 0)
+ i = pcibus_to_physid[bus];
+ else
+ pcibus_to_physid[bus] = i;
+ }
+ }
+
+ if (ubox_dev)
+ pci_dev_put(ubox_dev);
+
+ return err ? pcibios_err_to_errno(err) : 0;
+}
+/* end of Sandy Bridge-EP uncore support */
+
+/* IvyTown uncore support */
+static void ivt_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+ if (msr)
+ wrmsrl(msr, IVT_PMON_BOX_CTL_INT);
+}
+
+static void ivt_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVT_PMON_BOX_CTL_INT);
+}
+
+#define IVT_UNCORE_MSR_OPS_COMMON_INIT() \
+ .init_box = ivt_uncore_msr_init_box, \
+ .disable_box = snbep_uncore_msr_disable_box, \
+ .enable_box = snbep_uncore_msr_enable_box, \
+ .disable_event = snbep_uncore_msr_disable_event, \
+ .enable_event = snbep_uncore_msr_enable_event, \
+ .read_counter = uncore_msr_read_counter
+
+static struct intel_uncore_ops ivt_uncore_msr_ops = {
+ IVT_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+static struct intel_uncore_ops ivt_uncore_pci_ops = {
+ .init_box = ivt_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+#define IVT_UNCORE_PCI_COMMON_INIT() \
+ .perf_ctr = SNBEP_PCI_PMON_CTR0, \
+ .event_ctl = SNBEP_PCI_PMON_CTL0, \
+ .event_mask = IVT_PMON_RAW_EVENT_MASK, \
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
+ .ops = &ivt_uncore_pci_ops, \
+ .format_group = &ivt_uncore_format_group
+
+static struct attribute *ivt_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute *ivt_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+static struct attribute *ivt_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid.attr,
+ &format_attr_filter_link.attr,
+ &format_attr_filter_state2.attr,
+ &format_attr_filter_nid2.attr,
+ &format_attr_filter_opc2.attr,
+ NULL,
+};
+
+static struct attribute *ivt_uncore_pcu_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_thresh5.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge.attr,
+ &format_attr_filter_band0.attr,
+ &format_attr_filter_band1.attr,
+ &format_attr_filter_band2.attr,
+ &format_attr_filter_band3.attr,
+ NULL,
+};
+
+static struct attribute *ivt_uncore_qpi_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match_rds.attr,
+ &format_attr_match_rnid30.attr,
+ &format_attr_match_rnid4.attr,
+ &format_attr_match_dnid.attr,
+ &format_attr_match_mc.attr,
+ &format_attr_match_opc.attr,
+ &format_attr_match_vnw.attr,
+ &format_attr_match0.attr,
+ &format_attr_match1.attr,
+ &format_attr_mask_rds.attr,
+ &format_attr_mask_rnid30.attr,
+ &format_attr_mask_rnid4.attr,
+ &format_attr_mask_dnid.attr,
+ &format_attr_mask_mc.attr,
+ &format_attr_mask_opc.attr,
+ &format_attr_mask_vnw.attr,
+ &format_attr_mask0.attr,
+ &format_attr_mask1.attr,
+ NULL,
+};
+
+static struct attribute_group ivt_uncore_format_group = {
+ .name = "format",
+ .attrs = ivt_uncore_formats_attr,
+};
+
+static struct attribute_group ivt_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = ivt_uncore_ubox_formats_attr,
+};
+
+static struct attribute_group ivt_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = ivt_uncore_cbox_formats_attr,
+};
+
+static struct attribute_group ivt_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = ivt_uncore_pcu_formats_attr,
+};
+
+static struct attribute_group ivt_uncore_qpi_format_group = {
+ .name = "format",
+ .attrs = ivt_uncore_qpi_formats_attr,
+};
+
+static struct intel_uncore_type ivt_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_U_MSR_PMON_CTL0,
+ .event_mask = IVT_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &ivt_uncore_msr_ops,
+ .format_group = &ivt_uncore_ubox_format_group,
+};
+
+static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+ EVENT_EXTRA_END
+};
+
+static u64 ivt_cbox_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= IVT_CB0_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= IVT_CB0_MSR_PMON_BOX_FILTER_LINK;
+ if (fields & 0x4)
+ mask |= IVT_CB0_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x8)
+ mask |= IVT_CB0_MSR_PMON_BOX_FILTER_NID;
+ if (fields & 0x10)
+ mask |= IVT_CB0_MSR_PMON_BOX_FILTER_OPC;
+
+ return mask;
+}
+
+static struct event_constraint *
+ivt_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, ivt_cbox_filter_mask);
+}
+
+static int ivt_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = ivt_uncore_cbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+ SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & ivt_cbox_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static void ivt_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ u64 filter = uncore_shared_reg_config(box, 0);
+ wrmsrl(reg1->reg, filter & 0xffffffff);
+ wrmsrl(reg1->reg + 6, filter >> 32);
+ }
+
+ wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops ivt_uncore_cbox_ops = {
+ .init_box = ivt_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = ivt_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = ivt_cbox_hw_config,
+ .get_constraint = ivt_cbox_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type ivt_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 15,
+ .perf_ctr_bits = 44,
+ .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
+ .event_mask = IVT_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = SNBEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = snbep_uncore_cbox_constraints,
+ .ops = &ivt_uncore_cbox_ops,
+ .format_group = &ivt_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_ops ivt_uncore_pcu_ops = {
+ IVT_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type ivt_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
+ .event_mask = IVT_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivt_uncore_pcu_ops,
+ .format_group = &ivt_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *ivt_msr_uncores[] = {
+ &ivt_uncore_ubox,
+ &ivt_uncore_cbox,
+ &ivt_uncore_pcu,
+ NULL,
+};
+
+static struct intel_uncore_type ivt_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ IVT_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivt_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ IVT_UNCORE_PCI_COMMON_INIT(),
+};
+
+/* registers in IRP boxes are not properly aligned */
+static unsigned ivt_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
+static unsigned ivt_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
+
+static void ivt_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx],
+ hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void ivt_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], hwc->config);
+}
+
+static u64 ivt_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+ pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
+static struct intel_uncore_ops ivt_uncore_irp_ops = {
+ .init_box = ivt_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = ivt_uncore_irp_disable_event,
+ .enable_event = ivt_uncore_irp_enable_event,
+ .read_counter = ivt_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type ivt_uncore_irp = {
+ .name = "irp",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_mask = IVT_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivt_uncore_irp_ops,
+ .format_group = &ivt_uncore_format_group,
+};
+
+static struct intel_uncore_ops ivt_uncore_qpi_ops = {
+ .init_box = ivt_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_qpi_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+ .hw_config = snbep_qpi_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type ivt_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = IVT_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivt_uncore_qpi_ops,
+ .format_group = &ivt_uncore_qpi_format_group,
+};
+
+static struct intel_uncore_type ivt_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r2pcie_constraints,
+ IVT_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivt_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r3qpi_constraints,
+ IVT_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+ IVT_PCI_UNCORE_HA,
+ IVT_PCI_UNCORE_IMC,
+ IVT_PCI_UNCORE_IRP,
+ IVT_PCI_UNCORE_QPI,
+ IVT_PCI_UNCORE_R2PCIE,
+ IVT_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *ivt_pci_uncores[] = {
+ [IVT_PCI_UNCORE_HA] = &ivt_uncore_ha,
+ [IVT_PCI_UNCORE_IMC] = &ivt_uncore_imc,
+ [IVT_PCI_UNCORE_IRP] = &ivt_uncore_irp,
+ [IVT_PCI_UNCORE_QPI] = &ivt_uncore_qpi,
+ [IVT_PCI_UNCORE_R2PCIE] = &ivt_uncore_r2pcie,
+ [IVT_PCI_UNCORE_R3QPI] = &ivt_uncore_r3qpi,
+ NULL,
+};
+
+static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = {
+ { /* Home Agent 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0),
+ },
+ { /* Home Agent 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 1),
+ },
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC0 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC0 Channel 4 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 5),
+ },
+ { /* MC1 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 6),
+ },
+ { /* MC1 Channel 4 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IRP, 0),
+ },
+ { /* QPI0 Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI0 Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 1),
+ },
+ { /* QPI1 Port 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 2),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* R3QPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* R3QPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver ivt_uncore_pci_driver = {
+ .name = "ivt_uncore",
+ .id_table = ivt_uncore_pci_ids,
+};
+/* end of IvyTown uncore support */
+
+/* Sandy Bridge uncore support */
+static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+ wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+ else
+ wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
+}
+
+static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ wrmsrl(event->hw.config_base, 0);
+}
+
+static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0) {
+ wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
+ SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
+ }
+}
+
+static struct uncore_event_desc snb_uncore_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ { /* end: all zeroes */ },
+};
+
+static struct attribute *snb_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask5.attr,
+ NULL,
+};
+
+static struct attribute_group snb_uncore_format_group = {
+ .name = "format",
+ .attrs = snb_uncore_formats_attr,
+};
+
+static struct intel_uncore_ops snb_uncore_msr_ops = {
+ .init_box = snb_uncore_msr_init_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct event_constraint snb_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snb_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 2,
+ .num_boxes = 4,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
+ .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
+ .fixed_ctr = SNB_UNC_FIXED_CTR,
+ .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .constraints = snb_uncore_cbox_constraints,
+ .ops = &snb_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+ .event_descs = snb_uncore_events,
+};
+
+static struct intel_uncore_type *snb_msr_uncores[] = {
+ &snb_uncore_cbox,
+ NULL,
+};
+
+enum {
+ SNB_PCI_UNCORE_IMC,
+};
+
+static struct uncore_event_desc snb_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"),
+ INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
+
+ INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
+ INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
+
+ { /* end: all zeroes */ },
+};
+
+#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff
+#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48
+
+/* page size multiple covering all config regs */
+#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000
+
+#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1
+#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054
+#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE
+
+static struct attribute *snb_uncore_imc_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group snb_uncore_imc_format_group = {
+ .name = "format",
+ .attrs = snb_uncore_imc_formats_attr,
+};
+
+static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
+ resource_size_t addr;
+ u32 pci_dword;
+
+ pci_read_config_dword(pdev, where, &pci_dword);
+ addr = pci_dword;
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ pci_read_config_dword(pdev, where + 4, &pci_dword);
+ addr |= ((resource_size_t)pci_dword << 32);
+#endif
+
+ addr &= ~(PAGE_SIZE - 1);
+
+ box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+ box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
+}
+
+static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
+}
+
+/*
+ * custom event_init() function because we define our own fixed, free
+ * running counters, so we do not want to conflict with generic uncore
+ * logic. Also simplifies processing
+ */
+static int snb_uncore_imc_event_init(struct perf_event *event)
+{
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
+ int idx, base;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ pmu = uncore_event_to_pmu(event);
+ /* no device found for this pmu */
+ if (pmu->func_id < 0)
+ return -ENOENT;
+
+ /* Sampling not supported yet */
+ if (hwc->sample_period)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ /*
+ * Place all uncore events for a particular physical package
+ * onto a single cpu
+ */
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ /* check only supported bits are set */
+ if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
+ return -EINVAL;
+
+ box = uncore_pmu_to_box(pmu, event->cpu);
+ if (!box || box->cpu < 0)
+ return -EINVAL;
+
+ event->cpu = box->cpu;
+
+ event->hw.idx = -1;
+ event->hw.last_tag = ~0ULL;
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+ event->hw.branch_reg.idx = EXTRA_REG_NONE;
+ /*
+ * check event is known (whitelist, determines counter)
+ */
+ switch (cfg) {
+ case SNB_UNCORE_PCI_IMC_DATA_READS:
+ base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
+ idx = UNCORE_PMC_IDX_FIXED;
+ break;
+ case SNB_UNCORE_PCI_IMC_DATA_WRITES:
+ base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
+ idx = UNCORE_PMC_IDX_FIXED + 1;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* must be done before validate_group */
+ event->hw.event_base = base;
+ event->hw.config = cfg;
+ event->hw.idx = idx;
+
+ /* no group validation needed, we have free running counters */
+
+ return 0;
+}
+
+static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return 0;
+}
+
+static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ u64 count;
+
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+ box->n_active++;
+
+ list_add_tail(&event->active_entry, &box->active_list);
+
+ count = snb_uncore_imc_read_counter(box, event);
+ local64_set(&event->hw.prev_count, count);
+
+ if (box->n_active == 1)
+ uncore_pmu_start_hrtimer(box);
+}
+
+static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ box->n_active--;
+
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+
+ list_del(&event->active_entry);
+
+ if (box->n_active == 0)
+ uncore_pmu_cancel_hrtimer(box);
+ }
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of a event
+ * that we are disabling:
+ */
+ uncore_perf_event_update(box, event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box)
+ return -ENODEV;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_ARCH;
+
+ snb_uncore_imc_event_start(event, 0);
+
+ box->n_events++;
+
+ return 0;
+}
+
+static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ int i;
+
+ snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
+
+ for (i = 0; i < box->n_events; i++) {
+ if (event == box->event_list[i]) {
+ --box->n_events;
+ break;
+ }
+ }
+}
+
+static int snb_pci2phy_map_init(int devid)
+{
+ struct pci_dev *dev = NULL;
+ int bus;
+
+ dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
+ if (!dev)
+ return -ENOTTY;
+
+ bus = dev->bus->number;
+
+ pcibus_to_physid[bus] = 0;
+
+ pci_dev_put(dev);
+
+ return 0;
+}
+
+static struct pmu snb_uncore_imc_pmu = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = snb_uncore_imc_event_init,
+ .add = snb_uncore_imc_event_add,
+ .del = snb_uncore_imc_event_del,
+ .start = snb_uncore_imc_event_start,
+ .stop = snb_uncore_imc_event_stop,
+ .read = uncore_pmu_event_read,
+};
+
+static struct intel_uncore_ops snb_uncore_imc_ops = {
+ .init_box = snb_uncore_imc_init_box,
+ .enable_box = snb_uncore_imc_enable_box,
+ .disable_box = snb_uncore_imc_disable_box,
+ .disable_event = snb_uncore_imc_disable_event,
+ .enable_event = snb_uncore_imc_enable_event,
+ .hw_config = snb_uncore_imc_hw_config,
+ .read_counter = snb_uncore_imc_read_counter,
+};
+
+static struct intel_uncore_type snb_uncore_imc = {
+ .name = "imc",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .fixed_ctr_bits = 32,
+ .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE,
+ .event_descs = snb_uncore_imc_events,
+ .format_group = &snb_uncore_imc_format_group,
+ .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
+ .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK,
+ .ops = &snb_uncore_imc_ops,
+ .pmu = &snb_uncore_imc_pmu,
+};
+
+static struct intel_uncore_type *snb_pci_uncores[] = {
+ [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc,
+ NULL,
+};
+
+static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static struct pci_driver snb_uncore_pci_driver = {
+ .name = "snb_uncore",
+ .id_table = snb_uncore_pci_ids,
+};
+
+static struct pci_driver ivb_uncore_pci_driver = {
+ .name = "ivb_uncore",
+ .id_table = ivb_uncore_pci_ids,
+};
+
+static struct pci_driver hsw_uncore_pci_driver = {
+ .name = "hsw_uncore",
+ .id_table = hsw_uncore_pci_ids,
+};
+
+/* end of Sandy Bridge uncore support */
+
+/* Nehalem uncore support */
+static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
+}
+
+static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+ wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+ else
+ wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
+}
+
+static struct attribute *nhm_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask8.attr,
+ NULL,
+};
+
+static struct attribute_group nhm_uncore_format_group = {
+ .name = "format",
+ .attrs = nhm_uncore_formats_attr,
+};
+
+static struct uncore_event_desc nhm_uncore_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"),
+ INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhm_uncore_msr_ops = {
+ .disable_box = nhm_uncore_msr_disable_box,
+ .enable_box = nhm_uncore_msr_enable_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = nhm_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type nhm_uncore = {
+ .name = "",
+ .num_counters = 8,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .event_ctl = NHM_UNC_PERFEVTSEL0,
+ .perf_ctr = NHM_UNC_UNCORE_PMC0,
+ .fixed_ctr = NHM_UNC_FIXED_CTR,
+ .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL,
+ .event_mask = NHM_UNC_RAW_EVENT_MASK,
+ .event_descs = nhm_uncore_events,
+ .ops = &nhm_uncore_msr_ops,
+ .format_group = &nhm_uncore_format_group,
+};
+
+static struct intel_uncore_type *nhm_msr_uncores[] = {
+ &nhm_uncore,
+ NULL,
+};
+/* end of Nehalem uncore support */
+
+/* Nehalem-EX uncore support */
+DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
+DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
+DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
+
+static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
+}
+
+static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+ u64 config;
+
+ if (msr) {
+ rdmsrl(msr, config);
+ config &= ~((1ULL << uncore_num_counters(box)) - 1);
+ /* WBox has a fixed counter */
+ if (uncore_msr_fixed_ctl(box))
+ config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
+ wrmsrl(msr, config);
+ }
+}
+
+static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+ u64 config;
+
+ if (msr) {
+ rdmsrl(msr, config);
+ config |= (1ULL << uncore_num_counters(box)) - 1;
+ /* WBox has a fixed counter */
+ if (uncore_msr_fixed_ctl(box))
+ config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
+ wrmsrl(msr, config);
+ }
+}
+
+static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ wrmsrl(event->hw.config_base, 0);
+}
+
+static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
+ wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
+ else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
+ wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+ else
+ wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+#define NHMEX_UNCORE_OPS_COMMON_INIT() \
+ .init_box = nhmex_uncore_msr_init_box, \
+ .disable_box = nhmex_uncore_msr_disable_box, \
+ .enable_box = nhmex_uncore_msr_enable_box, \
+ .disable_event = nhmex_uncore_msr_disable_event, \
+ .read_counter = uncore_msr_read_counter
+
+static struct intel_uncore_ops nhmex_uncore_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_uncore_msr_enable_event,
+};
+
+static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_edge.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type nhmex_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_U_MSR_PMON_EV_SEL,
+ .perf_ctr = NHMEX_U_MSR_PMON_CTR,
+ .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_ubox_format_group
+};
+
+static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_cbox_formats_attr,
+};
+
+/* msr offset for each instance of cbox */
+static unsigned nhmex_cbox_msr_offsets[] = {
+ 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
+};
+
+static struct intel_uncore_type nhmex_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 6,
+ .num_boxes = 10,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0,
+ .perf_ctr = NHMEX_C0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
+ .msr_offsets = nhmex_cbox_msr_offsets,
+ .pair_ctr_ctl = 1,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_cbox_format_group
+};
+
+static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type nhmex_uncore_wbox = {
+ .name = "wbox",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_W_MSR_PMON_CNT0,
+ .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0,
+ .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR,
+ .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_W_MSR_GLOBAL_CTL,
+ .pair_ctr_ctl = 1,
+ .event_descs = nhmex_uncore_wbox_events,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_cbox_format_group
+};
+
+static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ int ctr, ev_sel;
+
+ ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
+ NHMEX_B_PMON_CTR_SHIFT;
+ ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
+ NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
+
+ /* events that do not use the match/mask registers */
+ if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
+ (ctr == 2 && ev_sel != 0x4) || ctr == 3)
+ return 0;
+
+ if (box->pmu->pmu_idx == 0)
+ reg1->reg = NHMEX_B0_MSR_MATCH;
+ else
+ reg1->reg = NHMEX_B1_MSR_MATCH;
+ reg1->idx = 0;
+ reg1->config = event->attr.config1;
+ reg2->config = event->attr.config2;
+ return 0;
+}
+
+static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ wrmsrl(reg1->reg, reg1->config);
+ wrmsrl(reg1->reg + 1, reg2->config);
+ }
+ wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+ (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
+}
+
+/*
+ * The Bbox has 4 counters, but each counter monitors different events.
+ * Use bits 6-7 in the event config to select counter.
+ */
+static struct event_constraint nhmex_uncore_bbox_constraints[] = {
+ EVENT_CONSTRAINT(0 , 1, 0xc0),
+ EVENT_CONSTRAINT(0x40, 2, 0xc0),
+ EVENT_CONSTRAINT(0x80, 4, 0xc0),
+ EVENT_CONSTRAINT(0xc0, 8, 0xc0),
+ EVENT_CONSTRAINT_END,
+};
+
+static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
+ &format_attr_event5.attr,
+ &format_attr_counter.attr,
+ &format_attr_match.attr,
+ &format_attr_mask.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_bbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_bbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_bbox_msr_enable_event,
+ .hw_config = nhmex_bbox_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_bbox = {
+ .name = "bbox",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_B0_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_B0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
+ .msr_offset = NHMEX_B_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 1,
+ .constraints = nhmex_uncore_bbox_constraints,
+ .ops = &nhmex_uncore_bbox_ops,
+ .format_group = &nhmex_uncore_bbox_format_group
+};
+
+static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ /* only TO_R_PROG_EV event uses the match/mask register */
+ if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
+ NHMEX_S_EVENT_TO_R_PROG_EV)
+ return 0;
+
+ if (box->pmu->pmu_idx == 0)
+ reg1->reg = NHMEX_S0_MSR_MM_CFG;
+ else
+ reg1->reg = NHMEX_S1_MSR_MM_CFG;
+ reg1->idx = 0;
+ reg1->config = event->attr.config1;
+ reg2->config = event->attr.config2;
+ return 0;
+}
+
+static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ wrmsrl(reg1->reg, 0);
+ wrmsrl(reg1->reg + 1, reg1->config);
+ wrmsrl(reg1->reg + 2, reg2->config);
+ wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
+ }
+ wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+}
+
+static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match.attr,
+ &format_attr_mask.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_sbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_sbox_msr_enable_event,
+ .hw_config = nhmex_sbox_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_sbox = {
+ .name = "sbox",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_S0_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_S0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
+ .msr_offset = NHMEX_S_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 1,
+ .ops = &nhmex_uncore_sbox_ops,
+ .format_group = &nhmex_uncore_sbox_format_group
+};
+
+enum {
+ EXTRA_REG_NHMEX_M_FILTER,
+ EXTRA_REG_NHMEX_M_DSP,
+ EXTRA_REG_NHMEX_M_ISS,
+ EXTRA_REG_NHMEX_M_MAP,
+ EXTRA_REG_NHMEX_M_MSC_THR,
+ EXTRA_REG_NHMEX_M_PGT,
+ EXTRA_REG_NHMEX_M_PLD,
+ EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
+};
+
+static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
+ MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
+ MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
+ MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
+ MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
+ /* event 0xa uses two extra registers */
+ MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
+ MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
+ MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
+ /* events 0xd ~ 0x10 use the same extra register */
+ MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
+ EVENT_EXTRA_END
+};
+
+/* Nehalem-EX or Westmere-EX ? */
+static bool uncore_nhmex;
+
+static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
+{
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ bool ret = false;
+ u64 mask;
+
+ if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+ er = &box->shared_regs[idx];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (!atomic_read(&er->ref) || er->config == config) {
+ atomic_inc(&er->ref);
+ er->config = config;
+ ret = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ return ret;
+ }
+ /*
+ * The ZDP_CTL_FVC MSR has 4 fields which are used to control
+ * events 0xd ~ 0x10. Besides these 4 fields, there are additional
+ * fields which are shared.
+ */
+ idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ if (WARN_ON_ONCE(idx >= 4))
+ return false;
+
+ /* mask of the shared fields */
+ if (uncore_nhmex)
+ mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
+ else
+ mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
+ er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+
+ raw_spin_lock_irqsave(&er->lock, flags);
+ /* add mask of the non-shared field if it's in use */
+ if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
+ if (uncore_nhmex)
+ mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ else
+ mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ }
+
+ if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
+ atomic_add(1 << (idx * 8), &er->ref);
+ if (uncore_nhmex)
+ mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
+ NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ else
+ mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
+ WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ er->config &= ~mask;
+ er->config |= (config & mask);
+ ret = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ return ret;
+}
+
+static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
+{
+ struct intel_uncore_extra_reg *er;
+
+ if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+ er = &box->shared_regs[idx];
+ atomic_dec(&er->ref);
+ return;
+ }
+
+ idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+ atomic_sub(1 << (idx * 8), &er->ref);
+}
+
+static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
+ u64 config = reg1->config;
+
+ /* get the non-shared control bits and shift them */
+ idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ if (uncore_nhmex)
+ config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ else
+ config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ if (new_idx > orig_idx) {
+ idx = new_idx - orig_idx;
+ config <<= 3 * idx;
+ } else {
+ idx = orig_idx - new_idx;
+ config >>= 3 * idx;
+ }
+
+ /* add the shared control bits back */
+ if (uncore_nhmex)
+ config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+ else
+ config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+ config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+ if (modify) {
+ /* adjust the main event selector */
+ if (new_idx > orig_idx)
+ hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+ else
+ hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+ reg1->config = config;
+ reg1->idx = ~0xff | new_idx;
+ }
+ return config;
+}
+
+static struct event_constraint *
+nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ int i, idx[2], alloc = 0;
+ u64 config1 = reg1->config;
+
+ idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
+ idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
+again:
+ for (i = 0; i < 2; i++) {
+ if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+ idx[i] = 0xff;
+
+ if (idx[i] == 0xff)
+ continue;
+
+ if (!nhmex_mbox_get_shared_reg(box, idx[i],
+ __BITS_VALUE(config1, i, 32)))
+ goto fail;
+ alloc |= (0x1 << i);
+ }
+
+ /* for the match/mask registers */
+ if (reg2->idx != EXTRA_REG_NONE &&
+ (uncore_box_is_fake(box) || !reg2->alloc) &&
+ !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
+ goto fail;
+
+ /*
+ * If it's a fake box -- as per validate_{group,event}() we
+ * shouldn't touch event state and we can avoid doing so
+ * since both will only call get_event_constraints() once
+ * on each event, this avoids the need for reg->alloc.
+ */
+ if (!uncore_box_is_fake(box)) {
+ if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
+ nhmex_mbox_alter_er(event, idx[0], true);
+ reg1->alloc |= alloc;
+ if (reg2->idx != EXTRA_REG_NONE)
+ reg2->alloc = 1;
+ }
+ return NULL;
+fail:
+ if (idx[0] != 0xff && !(alloc & 0x1) &&
+ idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+ /*
+ * events 0xd ~ 0x10 are functional identical, but are
+ * controlled by different fields in the ZDP_CTL_FVC
+ * register. If we failed to take one field, try the
+ * rest 3 choices.
+ */
+ BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
+ idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ idx[0] = (idx[0] + 1) % 4;
+ idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
+ config1 = nhmex_mbox_alter_er(event, idx[0], false);
+ goto again;
+ }
+ }
+
+ if (alloc & 0x1)
+ nhmex_mbox_put_shared_reg(box, idx[0]);
+ if (alloc & 0x2)
+ nhmex_mbox_put_shared_reg(box, idx[1]);
+ return &constraint_empty;
+}
+
+static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+
+ if (uncore_box_is_fake(box))
+ return;
+
+ if (reg1->alloc & 0x1)
+ nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
+ if (reg1->alloc & 0x2)
+ nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
+ reg1->alloc = 0;
+
+ if (reg2->alloc) {
+ nhmex_mbox_put_shared_reg(box, reg2->idx);
+ reg2->alloc = 0;
+ }
+}
+
+static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
+{
+ if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+ return er->idx;
+ return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
+}
+
+static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_type *type = box->pmu->type;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ struct extra_reg *er;
+ unsigned msr;
+ int reg_idx = 0;
+ /*
+ * The mbox events may require 2 extra MSRs at the most. But only
+ * the lower 32 bits in these MSRs are significant, so we can use
+ * config1 to pass two MSRs' config.
+ */
+ for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ if (event->attr.config1 & ~er->valid_mask)
+ return -EINVAL;
+
+ msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
+ if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
+ return -EINVAL;
+
+ /* always use the 32~63 bits to pass the PLD config */
+ if (er->idx == EXTRA_REG_NHMEX_M_PLD)
+ reg_idx = 1;
+ else if (WARN_ON_ONCE(reg_idx > 0))
+ return -EINVAL;
+
+ reg1->idx &= ~(0xff << (reg_idx * 8));
+ reg1->reg &= ~(0xffff << (reg_idx * 16));
+ reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
+ reg1->reg |= msr << (reg_idx * 16);
+ reg1->config = event->attr.config1;
+ reg_idx++;
+ }
+ /*
+ * The mbox only provides ability to perform address matching
+ * for the PLD events.
+ */
+ if (reg_idx == 2) {
+ reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
+ if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
+ reg2->config = event->attr.config2;
+ else
+ reg2->config = ~0ULL;
+ if (box->pmu->pmu_idx == 0)
+ reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
+ else
+ reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
+ }
+ return 0;
+}
+
+static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ u64 config;
+
+ if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+ return box->shared_regs[idx].config;
+
+ er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ config = er->config;
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+ return config;
+}
+
+static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ int idx;
+
+ idx = __BITS_VALUE(reg1->idx, 0, 8);
+ if (idx != 0xff)
+ wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
+ nhmex_mbox_shared_reg_config(box, idx));
+ idx = __BITS_VALUE(reg1->idx, 1, 8);
+ if (idx != 0xff)
+ wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
+ nhmex_mbox_shared_reg_config(box, idx));
+
+ if (reg2->idx != EXTRA_REG_NONE) {
+ wrmsrl(reg2->reg, 0);
+ if (reg2->config != ~0ULL) {
+ wrmsrl(reg2->reg + 1,
+ reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
+ wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
+ (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
+ wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
+ }
+ }
+
+ wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3");
+DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5");
+DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6");
+DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7");
+DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13");
+DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61");
+DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63");
+
+static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
+ &format_attr_count_mode.attr,
+ &format_attr_storage_mode.attr,
+ &format_attr_wrap_mode.attr,
+ &format_attr_flag_mode.attr,
+ &format_attr_inc_sel.attr,
+ &format_attr_set_flag_sel.attr,
+ &format_attr_filter_cfg_en.attr,
+ &format_attr_filter_match.attr,
+ &format_attr_filter_mask.attr,
+ &format_attr_dsp.attr,
+ &format_attr_thr.attr,
+ &format_attr_fvc.attr,
+ &format_attr_pgt.attr,
+ &format_attr_map.attr,
+ &format_attr_iss.attr,
+ &format_attr_pld.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_mbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_mbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
+ { /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_mbox_msr_enable_event,
+ .hw_config = nhmex_mbox_hw_config,
+ .get_constraint = nhmex_mbox_get_constraint,
+ .put_constraint = nhmex_mbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_mbox = {
+ .name = "mbox",
+ .num_counters = 6,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_M0_MSR_PMU_CTL0,
+ .perf_ctr = NHMEX_M0_MSR_PMU_CNT0,
+ .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL,
+ .msr_offset = NHMEX_M_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 8,
+ .event_descs = nhmex_uncore_mbox_events,
+ .ops = &nhmex_uncore_mbox_ops,
+ .format_group = &nhmex_uncore_mbox_format_group,
+};
+
+static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ /* adjust the main event selector and extra register index */
+ if (reg1->idx % 2) {
+ reg1->idx--;
+ hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+ } else {
+ reg1->idx++;
+ hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+ }
+
+ /* adjust extra register config */
+ switch (reg1->idx % 6) {
+ case 2:
+ /* shift the 8~15 bits to the 0~7 bits */
+ reg1->config >>= 8;
+ break;
+ case 3:
+ /* shift the 0~7 bits to the 8~15 bits */
+ reg1->config <<= 8;
+ break;
+ };
+}
+
+/*
+ * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7.
+ * An event set consists of 6 events, the 3rd and 4th events in
+ * an event set use the same extra register. So an event set uses
+ * 5 extra registers.
+ */
+static struct event_constraint *
+nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ int idx, er_idx;
+ u64 config1;
+ bool ok = false;
+
+ if (!uncore_box_is_fake(box) && reg1->alloc)
+ return NULL;
+
+ idx = reg1->idx % 6;
+ config1 = reg1->config;
+again:
+ er_idx = idx;
+ /* the 3rd and 4th events use the same extra register */
+ if (er_idx > 2)
+ er_idx--;
+ er_idx += (reg1->idx / 6) * 5;
+
+ er = &box->shared_regs[er_idx];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (idx < 2) {
+ if (!atomic_read(&er->ref) || er->config == reg1->config) {
+ atomic_inc(&er->ref);
+ er->config = reg1->config;
+ ok = true;
+ }
+ } else if (idx == 2 || idx == 3) {
+ /*
+ * these two events use different fields in a extra register,
+ * the 0~7 bits and the 8~15 bits respectively.
+ */
+ u64 mask = 0xff << ((idx - 2) * 8);
+ if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
+ !((er->config ^ config1) & mask)) {
+ atomic_add(1 << ((idx - 2) * 8), &er->ref);
+ er->config &= ~mask;
+ er->config |= config1 & mask;
+ ok = true;
+ }
+ } else {
+ if (!atomic_read(&er->ref) ||
+ (er->config == (hwc->config >> 32) &&
+ er->config1 == reg1->config &&
+ er->config2 == reg2->config)) {
+ atomic_inc(&er->ref);
+ er->config = (hwc->config >> 32);
+ er->config1 = reg1->config;
+ er->config2 = reg2->config;
+ ok = true;
+ }
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ if (!ok) {
+ /*
+ * The Rbox events are always in pairs. The paired
+ * events are functional identical, but use different
+ * extra registers. If we failed to take an extra
+ * register, try the alternative.
+ */
+ if (idx % 2)
+ idx--;
+ else
+ idx++;
+ if (idx != reg1->idx % 6) {
+ if (idx == 2)
+ config1 >>= 8;
+ else if (idx == 3)
+ config1 <<= 8;
+ goto again;
+ }
+ } else {
+ if (!uncore_box_is_fake(box)) {
+ if (idx != reg1->idx % 6)
+ nhmex_rbox_alter_er(box, event);
+ reg1->alloc = 1;
+ }
+ return NULL;
+ }
+ return &constraint_empty;
+}
+
+static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_extra_reg *er;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ int idx, er_idx;
+
+ if (uncore_box_is_fake(box) || !reg1->alloc)
+ return;
+
+ idx = reg1->idx % 6;
+ er_idx = idx;
+ if (er_idx > 2)
+ er_idx--;
+ er_idx += (reg1->idx / 6) * 5;
+
+ er = &box->shared_regs[er_idx];
+ if (idx == 2 || idx == 3)
+ atomic_sub(1 << ((idx - 2) * 8), &er->ref);
+ else
+ atomic_dec(&er->ref);
+
+ reg1->alloc = 0;
+}
+
+static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ int idx;
+
+ idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
+ NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+ if (idx >= 0x18)
+ return -EINVAL;
+
+ reg1->idx = idx;
+ reg1->config = event->attr.config1;
+
+ switch (idx % 6) {
+ case 4:
+ case 5:
+ hwc->config |= event->attr.config & (~0ULL << 32);
+ reg2->config = event->attr.config2;
+ break;
+ };
+ return 0;
+}
+
+static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ int idx, port;
+
+ idx = reg1->idx;
+ port = idx / 6 + box->pmu->pmu_idx * 4;
+
+ switch (idx % 6) {
+ case 0:
+ wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
+ break;
+ case 1:
+ wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
+ break;
+ case 2:
+ case 3:
+ wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
+ uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
+ break;
+ case 4:
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
+ hwc->config >> 32);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
+ break;
+ case 5:
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
+ hwc->config >> 32);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
+ break;
+ };
+
+ wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+ (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
+DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
+
+static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
+ &format_attr_event5.attr,
+ &format_attr_xbr_mm_cfg.attr,
+ &format_attr_xbr_match.attr,
+ &format_attr_xbr_mask.attr,
+ &format_attr_qlx_cfg.attr,
+ &format_attr_iperf_cfg.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_rbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_rbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_rbox_msr_enable_event,
+ .hw_config = nhmex_rbox_hw_config,
+ .get_constraint = nhmex_rbox_get_constraint,
+ .put_constraint = nhmex_rbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_rbox = {
+ .name = "rbox",
+ .num_counters = 8,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_R_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_R_MSR_PMON_CNT0,
+ .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_R_MSR_GLOBAL_CTL,
+ .msr_offset = NHMEX_R_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 20,
+ .event_descs = nhmex_uncore_rbox_events,
+ .ops = &nhmex_uncore_rbox_ops,
+ .format_group = &nhmex_uncore_rbox_format_group
+};
+
+static struct intel_uncore_type *nhmex_msr_uncores[] = {
+ &nhmex_uncore_ubox,
+ &nhmex_uncore_cbox,
+ &nhmex_uncore_bbox,
+ &nhmex_uncore_sbox,
+ &nhmex_uncore_mbox,
+ &nhmex_uncore_rbox,
+ &nhmex_uncore_wbox,
+ NULL,
+};
+/* end of Nehalem-EX uncore support */
+
+static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->idx = idx;
+ hwc->last_tag = ++box->tags[idx];
+
+ if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
+ hwc->event_base = uncore_fixed_ctr(box);
+ hwc->config_base = uncore_fixed_ctl(box);
+ return;
+ }
+
+ hwc->config_base = uncore_event_ctl(box, hwc->idx);
+ hwc->event_base = uncore_perf_ctr(box, hwc->idx);
+}
+
+static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
+{
+ u64 prev_count, new_count, delta;
+ int shift;
+
+ if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
+ shift = 64 - uncore_fixed_ctr_bits(box);
+ else
+ shift = 64 - uncore_perf_ctr_bits(box);
+
+ /* the hrtimer might modify the previous event value */
+again:
+ prev_count = local64_read(&event->hw.prev_count);
+ new_count = uncore_read_counter(box, event);
+ if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
+ goto again;
+
+ delta = (new_count << shift) - (prev_count << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+}
+
+/*
+ * The overflow interrupt is unavailable for SandyBridge-EP, is broken
+ * for SandyBridge. So we use hrtimer to periodically poll the counter
+ * to avoid overflow.
+ */
+static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
+{
+ struct intel_uncore_box *box;
+ struct perf_event *event;
+ unsigned long flags;
+ int bit;
+
+ box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
+ if (!box->n_active || box->cpu != smp_processor_id())
+ return HRTIMER_NORESTART;
+ /*
+ * disable local interrupt to prevent uncore_pmu_event_start/stop
+ * to interrupt the update process
+ */
+ local_irq_save(flags);
+
+ /*
+ * handle boxes with an active event list as opposed to active
+ * counters
+ */
+ list_for_each_entry(event, &box->active_list, active_entry) {
+ uncore_perf_event_update(box, event);
+ }
+
+ for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
+ uncore_perf_event_update(box, box->events[bit]);
+
+ local_irq_restore(flags);
+
+ hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
+ return HRTIMER_RESTART;
+}
+
+static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
+{
+ __hrtimer_start_range_ns(&box->hrtimer,
+ ns_to_ktime(box->hrtimer_duration), 0,
+ HRTIMER_MODE_REL_PINNED, 0);
+}
+
+static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
+{
+ hrtimer_cancel(&box->hrtimer);
+}
+
+static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
+{
+ hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ box->hrtimer.function = uncore_pmu_hrtimer;
+}
+
+static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node)
+{
+ struct intel_uncore_box *box;
+ int i, size;
+
+ size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg);
+
+ box = kzalloc_node(size, GFP_KERNEL, node);
+ if (!box)
+ return NULL;
+
+ for (i = 0; i < type->num_shared_regs; i++)
+ raw_spin_lock_init(&box->shared_regs[i].lock);
+
+ uncore_pmu_init_hrtimer(box);
+ atomic_set(&box->refcnt, 1);
+ box->cpu = -1;
+ box->phys_id = -1;
+
+ /* set default hrtimer timeout */
+ box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;
+
+ INIT_LIST_HEAD(&box->active_list);
+
+ return box;
+}
+
+static int
+uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp)
+{
+ struct perf_event *event;
+ int n, max_count;
+
+ max_count = box->pmu->type->num_counters;
+ if (box->pmu->type->fixed_ctl)
+ max_count++;
+
+ if (box->n_events >= max_count)
+ return -EINVAL;
+
+ n = box->n_events;
+ box->event_list[n] = leader;
+ n++;
+ if (!dogrp)
+ return n;
+
+ list_for_each_entry(event, &leader->sibling_list, group_entry) {
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ continue;
+
+ if (n >= max_count)
+ return -EINVAL;
+
+ box->event_list[n] = event;
+ n++;
+ }
+ return n;
+}
+
+static struct event_constraint *
+uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_type *type = box->pmu->type;
+ struct event_constraint *c;
+
+ if (type->ops->get_constraint) {
+ c = type->ops->get_constraint(box, event);
+ if (c)
+ return c;
+ }
+
+ if (event->attr.config == UNCORE_FIXED_EVENT)
+ return &constraint_fixed;
+
+ if (type->constraints) {
+ for_each_event_constraint(c, type->constraints) {
+ if ((event->hw.config & c->cmask) == c->code)
+ return c;
+ }
+ }
+
+ return &type->unconstrainted;
+}
+
+static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ if (box->pmu->type->ops->put_constraint)
+ box->pmu->type->ops->put_constraint(box, event);
+}
+
+static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
+{
+ unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
+ struct event_constraint *c;
+ int i, wmin, wmax, ret = 0;
+ struct hw_perf_event *hwc;
+
+ bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
+
+ for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+ hwc = &box->event_list[i]->hw;
+ c = uncore_get_event_constraint(box, box->event_list[i]);
+ hwc->constraint = c;
+ wmin = min(wmin, c->weight);
+ wmax = max(wmax, c->weight);
+ }
+
+ /* fastpath, try to reuse previous register */
+ for (i = 0; i < n; i++) {
+ hwc = &box->event_list[i]->hw;
+ c = hwc->constraint;
+
+ /* never assigned */
+ if (hwc->idx == -1)
+ break;
+
+ /* constraint still honored */
+ if (!test_bit(hwc->idx, c->idxmsk))
+ break;
+
+ /* not already used */
+ if (test_bit(hwc->idx, used_mask))
+ break;
+
+ __set_bit(hwc->idx, used_mask);
+ if (assign)
+ assign[i] = hwc->idx;
+ }
+ /* slow path */
+ if (i != n)
+ ret = perf_assign_events(box->event_list, n,
+ wmin, wmax, assign);
+
+ if (!assign || ret) {
+ for (i = 0; i < n; i++)
+ uncore_put_event_constraint(box, box->event_list[i]);
+ }
+ return ret ? -EINVAL : 0;
+}
+
+static void uncore_pmu_event_start(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ int idx = event->hw.idx;
+
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
+ return;
+
+ event->hw.state = 0;
+ box->events[idx] = event;
+ box->n_active++;
+ __set_bit(idx, box->active_mask);
+
+ local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
+ uncore_enable_event(box, event);
+
+ if (box->n_active == 1) {
+ uncore_enable_box(box);
+ uncore_pmu_start_hrtimer(box);
+ }
+}
+
+static void uncore_pmu_event_stop(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
+ uncore_disable_event(box, event);
+ box->n_active--;
+ box->events[hwc->idx] = NULL;
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+
+ if (box->n_active == 0) {
+ uncore_disable_box(box);
+ uncore_pmu_cancel_hrtimer(box);
+ }
+ }
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of a event
+ * that we are disabling:
+ */
+ uncore_perf_event_update(box, event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static int uncore_pmu_event_add(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+ int assign[UNCORE_PMC_IDX_MAX];
+ int i, n, ret;
+
+ if (!box)
+ return -ENODEV;
+
+ ret = n = uncore_collect_events(box, event, false);
+ if (ret < 0)
+ return ret;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_ARCH;
+
+ ret = uncore_assign_events(box, assign, n);
+ if (ret)
+ return ret;
+
+ /* save events moving to new counters */
+ for (i = 0; i < box->n_events; i++) {
+ event = box->event_list[i];
+ hwc = &event->hw;
+
+ if (hwc->idx == assign[i] &&
+ hwc->last_tag == box->tags[assign[i]])
+ continue;
+ /*
+ * Ensure we don't accidentally enable a stopped
+ * counter simply because we rescheduled.
+ */
+ if (hwc->state & PERF_HES_STOPPED)
+ hwc->state |= PERF_HES_ARCH;
+
+ uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+ }
+
+ /* reprogram moved events into new counters */
+ for (i = 0; i < n; i++) {
+ event = box->event_list[i];
+ hwc = &event->hw;
+
+ if (hwc->idx != assign[i] ||
+ hwc->last_tag != box->tags[assign[i]])
+ uncore_assign_hw_event(box, event, assign[i]);
+ else if (i < box->n_events)
+ continue;
+
+ if (hwc->state & PERF_HES_ARCH)
+ continue;
+
+ uncore_pmu_event_start(event, 0);
+ }
+ box->n_events = n;
+
+ return 0;
+}
+
+static void uncore_pmu_event_del(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ int i;
+
+ uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+
+ for (i = 0; i < box->n_events; i++) {
+ if (event == box->event_list[i]) {
+ uncore_put_event_constraint(box, event);
+
+ while (++i < box->n_events)
+ box->event_list[i - 1] = box->event_list[i];
+
+ --box->n_events;
+ break;
+ }
+ }
+
+ event->hw.idx = -1;
+ event->hw.last_tag = ~0ULL;
+}
+
+static void uncore_pmu_event_read(struct perf_event *event)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ uncore_perf_event_update(box, event);
+}
+
+/*
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
+static int uncore_validate_group(struct intel_uncore_pmu *pmu,
+ struct perf_event *event)
+{
+ struct perf_event *leader = event->group_leader;
+ struct intel_uncore_box *fake_box;
+ int ret = -EINVAL, n;
+
+ fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE);
+ if (!fake_box)
+ return -ENOMEM;
+
+ fake_box->pmu = pmu;
+ /*
+ * the event is not yet connected with its
+ * siblings therefore we must first collect
+ * existing siblings, then add the new event
+ * before we can simulate the scheduling
+ */
+ n = uncore_collect_events(fake_box, leader, true);
+ if (n < 0)
+ goto out;
+
+ fake_box->n_events = n;
+ n = uncore_collect_events(fake_box, event, false);
+ if (n < 0)
+ goto out;
+
+ fake_box->n_events = n;
+
+ ret = uncore_assign_events(fake_box, NULL, n);
+out:
+ kfree(fake_box);
+ return ret;
+}
+
+static int uncore_pmu_event_init(struct perf_event *event)
+{
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ struct hw_perf_event *hwc = &event->hw;
+ int ret;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ pmu = uncore_event_to_pmu(event);
+ /* no device found for this pmu */
+ if (pmu->func_id < 0)
+ return -ENOENT;
+
+ /*
+ * Uncore PMU does measure at all privilege level all the time.
+ * So it doesn't make sense to specify any exclude bits.
+ */
+ if (event->attr.exclude_user || event->attr.exclude_kernel ||
+ event->attr.exclude_hv || event->attr.exclude_idle)
+ return -EINVAL;
+
+ /* Sampling not supported yet */
+ if (hwc->sample_period)
+ return -EINVAL;
+
+ /*
+ * Place all uncore events for a particular physical package
+ * onto a single cpu
+ */
+ if (event->cpu < 0)
+ return -EINVAL;
+ box = uncore_pmu_to_box(pmu, event->cpu);
+ if (!box || box->cpu < 0)
+ return -EINVAL;
+ event->cpu = box->cpu;
+
+ event->hw.idx = -1;
+ event->hw.last_tag = ~0ULL;
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+ event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
+ if (event->attr.config == UNCORE_FIXED_EVENT) {
+ /* no fixed counter */
+ if (!pmu->type->fixed_ctl)
+ return -EINVAL;
+ /*
+ * if there is only one fixed counter, only the first pmu
+ * can access the fixed counter
+ */
+ if (pmu->type->single_fixed && pmu->pmu_idx > 0)
+ return -EINVAL;
+
+ /* fixed counters have event field hardcoded to zero */
+ hwc->config = 0ULL;
+ } else {
+ hwc->config = event->attr.config & pmu->type->event_mask;
+ if (pmu->type->ops->hw_config) {
+ ret = pmu->type->ops->hw_config(box, event);
+ if (ret)
+ return ret;
+ }
+ }
+
+ if (event->group_leader != event)
+ ret = uncore_validate_group(pmu, event);
+ else
+ ret = 0;
+
+ return ret;
+}
+
+static ssize_t uncore_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask);
+
+ buf[n++] = '\n';
+ buf[n] = '\0';
+ return n;
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
+
+static struct attribute *uncore_pmu_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group uncore_pmu_attr_group = {
+ .attrs = uncore_pmu_attrs,
+};
+
+static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
+{
+ int ret;
+
+ if (!pmu->type->pmu) {
+ pmu->pmu = (struct pmu) {
+ .attr_groups = pmu->type->attr_groups,
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = uncore_pmu_event_init,
+ .add = uncore_pmu_event_add,
+ .del = uncore_pmu_event_del,
+ .start = uncore_pmu_event_start,
+ .stop = uncore_pmu_event_stop,
+ .read = uncore_pmu_event_read,
+ };
+ } else {
+ pmu->pmu = *pmu->type->pmu;
+ pmu->pmu.attr_groups = pmu->type->attr_groups;
+ }
+
+ if (pmu->type->num_boxes == 1) {
+ if (strlen(pmu->type->name) > 0)
+ sprintf(pmu->name, "uncore_%s", pmu->type->name);
+ else
+ sprintf(pmu->name, "uncore");
+ } else {
+ sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
+ pmu->pmu_idx);
+ }
+
+ ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
+ return ret;
+}
+
+static void __init uncore_type_exit(struct intel_uncore_type *type)
+{
+ int i;
+
+ for (i = 0; i < type->num_boxes; i++)
+ free_percpu(type->pmus[i].box);
+ kfree(type->pmus);
+ type->pmus = NULL;
+ kfree(type->events_group);
+ type->events_group = NULL;
+}
+
+static void __init uncore_types_exit(struct intel_uncore_type **types)
+{
+ int i;
+ for (i = 0; types[i]; i++)
+ uncore_type_exit(types[i]);
+}
+
+static int __init uncore_type_init(struct intel_uncore_type *type)
+{
+ struct intel_uncore_pmu *pmus;
+ struct attribute_group *attr_group;
+ struct attribute **attrs;
+ int i, j;
+
+ pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
+ if (!pmus)
+ return -ENOMEM;
+
+ type->pmus = pmus;
+
+ type->unconstrainted = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
+ 0, type->num_counters, 0, 0);
+
+ for (i = 0; i < type->num_boxes; i++) {
+ pmus[i].func_id = -1;
+ pmus[i].pmu_idx = i;
+ pmus[i].type = type;
+ INIT_LIST_HEAD(&pmus[i].box_list);
+ pmus[i].box = alloc_percpu(struct intel_uncore_box *);
+ if (!pmus[i].box)
+ goto fail;
+ }
+
+ if (type->event_descs) {
+ i = 0;
+ while (type->event_descs[i].attr.attr.name)
+ i++;
+
+ attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
+ sizeof(*attr_group), GFP_KERNEL);
+ if (!attr_group)
+ goto fail;
+
+ attrs = (struct attribute **)(attr_group + 1);
+ attr_group->name = "events";
+ attr_group->attrs = attrs;
+
+ for (j = 0; j < i; j++)
+ attrs[j] = &type->event_descs[j].attr.attr;
+
+ type->events_group = attr_group;
+ }
+
+ type->pmu_group = &uncore_pmu_attr_group;
+ return 0;
+fail:
+ uncore_type_exit(type);
+ return -ENOMEM;
+}
+
+static int __init uncore_types_init(struct intel_uncore_type **types)
+{
+ int i, ret;
+
+ for (i = 0; types[i]; i++) {
+ ret = uncore_type_init(types[i]);
+ if (ret)
+ goto fail;
+ }
+ return 0;
+fail:
+ while (--i >= 0)
+ uncore_type_exit(types[i]);
+ return ret;
+}
+
+static struct pci_driver *uncore_pci_driver;
+static bool pcidrv_registered;
+
+/*
+ * add a pci uncore device
+ */
+static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ struct intel_uncore_type *type;
+ int phys_id;
+
+ phys_id = pcibus_to_physid[pdev->bus->number];
+ if (phys_id < 0)
+ return -ENODEV;
+
+ if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
+ extra_pci_dev[phys_id][UNCORE_PCI_DEV_IDX(id->driver_data)] = pdev;
+ pci_set_drvdata(pdev, NULL);
+ return 0;
+ }
+
+ type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
+ box = uncore_alloc_box(type, NUMA_NO_NODE);
+ if (!box)
+ return -ENOMEM;
+
+ /*
+ * for performance monitoring unit with multiple boxes,
+ * each box has a different function id.
+ */
+ pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
+ if (pmu->func_id < 0)
+ pmu->func_id = pdev->devfn;
+ else
+ WARN_ON_ONCE(pmu->func_id != pdev->devfn);
+
+ box->phys_id = phys_id;
+ box->pci_dev = pdev;
+ box->pmu = pmu;
+ uncore_box_init(box);
+ pci_set_drvdata(pdev, box);
+
+ raw_spin_lock(&uncore_box_lock);
+ list_add_tail(&box->list, &pmu->box_list);
+ raw_spin_unlock(&uncore_box_lock);
+
+ return 0;
+}
+
+static void uncore_pci_remove(struct pci_dev *pdev)
+{
+ struct intel_uncore_box *box = pci_get_drvdata(pdev);
+ struct intel_uncore_pmu *pmu;
+ int i, cpu, phys_id = pcibus_to_physid[pdev->bus->number];
+
+ box = pci_get_drvdata(pdev);
+ if (!box) {
+ for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
+ if (extra_pci_dev[phys_id][i] == pdev) {
+ extra_pci_dev[phys_id][i] = NULL;
+ break;
+ }
+ }
+ WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX);
+ return;
+ }
+
+ pmu = box->pmu;
+ if (WARN_ON_ONCE(phys_id != box->phys_id))
+ return;
+
+ pci_set_drvdata(pdev, NULL);
+
+ raw_spin_lock(&uncore_box_lock);
+ list_del(&box->list);
+ raw_spin_unlock(&uncore_box_lock);
+
+ for_each_possible_cpu(cpu) {
+ if (*per_cpu_ptr(pmu->box, cpu) == box) {
+ *per_cpu_ptr(pmu->box, cpu) = NULL;
+ atomic_dec(&box->refcnt);
+ }
+ }
+
+ WARN_ON_ONCE(atomic_read(&box->refcnt) != 1);
+ kfree(box);
+}
+
+static int __init uncore_pci_init(void)
+{
+ int ret;
+
+ switch (boot_cpu_data.x86_model) {
+ case 45: /* Sandy Bridge-EP */
+ ret = snbep_pci2phy_map_init(0x3ce0);
+ if (ret)
+ return ret;
+ pci_uncores = snbep_pci_uncores;
+ uncore_pci_driver = &snbep_uncore_pci_driver;
+ break;
+ case 62: /* IvyTown */
+ ret = snbep_pci2phy_map_init(0x0e1e);
+ if (ret)
+ return ret;
+ pci_uncores = ivt_pci_uncores;
+ uncore_pci_driver = &ivt_uncore_pci_driver;
+ break;
+ case 42: /* Sandy Bridge */
+ ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC);
+ if (ret)
+ return ret;
+ pci_uncores = snb_pci_uncores;
+ uncore_pci_driver = &snb_uncore_pci_driver;
+ break;
+ case 58: /* Ivy Bridge */
+ ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC);
+ if (ret)
+ return ret;
+ pci_uncores = snb_pci_uncores;
+ uncore_pci_driver = &ivb_uncore_pci_driver;
+ break;
+ case 60: /* Haswell */
+ case 69: /* Haswell Celeron */
+ ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC);
+ if (ret)
+ return ret;
+ pci_uncores = snb_pci_uncores;
+ uncore_pci_driver = &hsw_uncore_pci_driver;
+ break;
+ default:
+ return 0;
+ }
+
+ ret = uncore_types_init(pci_uncores);
+ if (ret)
+ return ret;
+
+ uncore_pci_driver->probe = uncore_pci_probe;
+ uncore_pci_driver->remove = uncore_pci_remove;
+
+ ret = pci_register_driver(uncore_pci_driver);
+ if (ret == 0)
+ pcidrv_registered = true;
+ else
+ uncore_types_exit(pci_uncores);
+
+ return ret;
+}
+
+static void __init uncore_pci_exit(void)
+{
+ if (pcidrv_registered) {
+ pcidrv_registered = false;
+ pci_unregister_driver(uncore_pci_driver);
+ uncore_types_exit(pci_uncores);
+ }
+}
+
+/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */
+static LIST_HEAD(boxes_to_free);
+
+static void uncore_kfree_boxes(void)
+{
+ struct intel_uncore_box *box;
+
+ while (!list_empty(&boxes_to_free)) {
+ box = list_entry(boxes_to_free.next,
+ struct intel_uncore_box, list);
+ list_del(&box->list);
+ kfree(box);
+ }
+}
+
+static void uncore_cpu_dying(int cpu)
+{
+ struct intel_uncore_type *type;
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ int i, j;
+
+ for (i = 0; msr_uncores[i]; i++) {
+ type = msr_uncores[i];
+ for (j = 0; j < type->num_boxes; j++) {
+ pmu = &type->pmus[j];
+ box = *per_cpu_ptr(pmu->box, cpu);
+ *per_cpu_ptr(pmu->box, cpu) = NULL;
+ if (box && atomic_dec_and_test(&box->refcnt))
+ list_add(&box->list, &boxes_to_free);
+ }
+ }
+}
+
+static int uncore_cpu_starting(int cpu)
+{
+ struct intel_uncore_type *type;
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box, *exist;
+ int i, j, k, phys_id;
+
+ phys_id = topology_physical_package_id(cpu);
+
+ for (i = 0; msr_uncores[i]; i++) {
+ type = msr_uncores[i];
+ for (j = 0; j < type->num_boxes; j++) {
+ pmu = &type->pmus[j];
+ box = *per_cpu_ptr(pmu->box, cpu);
+ /* called by uncore_cpu_init? */
+ if (box && box->phys_id >= 0) {
+ uncore_box_init(box);
+ continue;
+ }
+
+ for_each_online_cpu(k) {
+ exist = *per_cpu_ptr(pmu->box, k);
+ if (exist && exist->phys_id == phys_id) {
+ atomic_inc(&exist->refcnt);
+ *per_cpu_ptr(pmu->box, cpu) = exist;
+ if (box) {
+ list_add(&box->list,
+ &boxes_to_free);
+ box = NULL;
+ }
+ break;
+ }
+ }
+
+ if (box) {
+ box->phys_id = phys_id;
+ uncore_box_init(box);
+ }
+ }
+ }
+ return 0;
+}
+
+static int uncore_cpu_prepare(int cpu, int phys_id)
+{
+ struct intel_uncore_type *type;
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ int i, j;
+
+ for (i = 0; msr_uncores[i]; i++) {
+ type = msr_uncores[i];
+ for (j = 0; j < type->num_boxes; j++) {
+ pmu = &type->pmus[j];
+ if (pmu->func_id < 0)
+ pmu->func_id = j;
+
+ box = uncore_alloc_box(type, cpu_to_node(cpu));
+ if (!box)
+ return -ENOMEM;
+
+ box->pmu = pmu;
+ box->phys_id = phys_id;
+ *per_cpu_ptr(pmu->box, cpu) = box;
+ }
+ }
+ return 0;
+}
+
+static void
+uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu)
+{
+ struct intel_uncore_type *type;
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ int i, j;
+
+ for (i = 0; uncores[i]; i++) {
+ type = uncores[i];
+ for (j = 0; j < type->num_boxes; j++) {
+ pmu = &type->pmus[j];
+ if (old_cpu < 0)
+ box = uncore_pmu_to_box(pmu, new_cpu);
+ else
+ box = uncore_pmu_to_box(pmu, old_cpu);
+ if (!box)
+ continue;
+
+ if (old_cpu < 0) {
+ WARN_ON_ONCE(box->cpu != -1);
+ box->cpu = new_cpu;
+ continue;
+ }
+
+ WARN_ON_ONCE(box->cpu != old_cpu);
+ if (new_cpu >= 0) {
+ uncore_pmu_cancel_hrtimer(box);
+ perf_pmu_migrate_context(&pmu->pmu,
+ old_cpu, new_cpu);
+ box->cpu = new_cpu;
+ } else {
+ box->cpu = -1;
+ }
+ }
+ }
+}
+
+static void uncore_event_exit_cpu(int cpu)
+{
+ int i, phys_id, target;
+
+ /* if exiting cpu is used for collecting uncore events */
+ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
+ return;
+
+ /* find a new cpu to collect uncore events */
+ phys_id = topology_physical_package_id(cpu);
+ target = -1;
+ for_each_online_cpu(i) {
+ if (i == cpu)
+ continue;
+ if (phys_id == topology_physical_package_id(i)) {
+ target = i;
+ break;
+ }
+ }
+
+ /* migrate uncore events to the new cpu */
+ if (target >= 0)
+ cpumask_set_cpu(target, &uncore_cpu_mask);
+
+ uncore_change_context(msr_uncores, cpu, target);
+ uncore_change_context(pci_uncores, cpu, target);
+}
+
+static void uncore_event_init_cpu(int cpu)
+{
+ int i, phys_id;
+
+ phys_id = topology_physical_package_id(cpu);
+ for_each_cpu(i, &uncore_cpu_mask) {
+ if (phys_id == topology_physical_package_id(i))
+ return;
+ }
+
+ cpumask_set_cpu(cpu, &uncore_cpu_mask);
+
+ uncore_change_context(msr_uncores, -1, cpu);
+ uncore_change_context(pci_uncores, -1, cpu);
+}
+
+static int uncore_cpu_notifier(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ /* allocate/free data structure for uncore box */
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ uncore_cpu_prepare(cpu, -1);
+ break;
+ case CPU_STARTING:
+ uncore_cpu_starting(cpu);
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DYING:
+ uncore_cpu_dying(cpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ uncore_kfree_boxes();
+ break;
+ default:
+ break;
+ }
+
+ /* select the cpu that collects uncore events */
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_FAILED:
+ case CPU_STARTING:
+ uncore_event_init_cpu(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ uncore_event_exit_cpu(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block uncore_cpu_nb = {
+ .notifier_call = uncore_cpu_notifier,
+ /*
+ * to migrate uncore events, our notifier should be executed
+ * before perf core's notifier.
+ */
+ .priority = CPU_PRI_PERF + 1,
+};
+
+static void __init uncore_cpu_setup(void *dummy)
+{
+ uncore_cpu_starting(smp_processor_id());
+}
+
+static int __init uncore_cpu_init(void)
+{
+ int ret, max_cores;
+
+ max_cores = boot_cpu_data.x86_max_cores;
+ switch (boot_cpu_data.x86_model) {
+ case 26: /* Nehalem */
+ case 30:
+ case 37: /* Westmere */
+ case 44:
+ msr_uncores = nhm_msr_uncores;
+ break;
+ case 42: /* Sandy Bridge */
+ case 58: /* Ivy Bridge */
+ if (snb_uncore_cbox.num_boxes > max_cores)
+ snb_uncore_cbox.num_boxes = max_cores;
+ msr_uncores = snb_msr_uncores;
+ break;
+ case 45: /* Sandy Bridge-EP */
+ if (snbep_uncore_cbox.num_boxes > max_cores)
+ snbep_uncore_cbox.num_boxes = max_cores;
+ msr_uncores = snbep_msr_uncores;
+ break;
+ case 46: /* Nehalem-EX */
+ uncore_nhmex = true;
+ case 47: /* Westmere-EX aka. Xeon E7 */
+ if (!uncore_nhmex)
+ nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
+ if (nhmex_uncore_cbox.num_boxes > max_cores)
+ nhmex_uncore_cbox.num_boxes = max_cores;
+ msr_uncores = nhmex_msr_uncores;
+ break;
+ case 62: /* IvyTown */
+ if (ivt_uncore_cbox.num_boxes > max_cores)
+ ivt_uncore_cbox.num_boxes = max_cores;
+ msr_uncores = ivt_msr_uncores;
+ break;
+
+ default:
+ return 0;
+ }
+
+ ret = uncore_types_init(msr_uncores);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static int __init uncore_pmus_register(void)
+{
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_type *type;
+ int i, j;
+
+ for (i = 0; msr_uncores[i]; i++) {
+ type = msr_uncores[i];
+ for (j = 0; j < type->num_boxes; j++) {
+ pmu = &type->pmus[j];
+ uncore_pmu_register(pmu);
+ }
+ }
+
+ for (i = 0; pci_uncores[i]; i++) {
+ type = pci_uncores[i];
+ for (j = 0; j < type->num_boxes; j++) {
+ pmu = &type->pmus[j];
+ uncore_pmu_register(pmu);
+ }
+ }
+
+ return 0;
+}
+
+static void __init uncore_cpumask_init(void)
+{
+ int cpu;
+
+ /*
+ * ony invoke once from msr or pci init code
+ */
+ if (!cpumask_empty(&uncore_cpu_mask))
+ return;
+
+ cpu_notifier_register_begin();
+
+ for_each_online_cpu(cpu) {
+ int i, phys_id = topology_physical_package_id(cpu);
+
+ for_each_cpu(i, &uncore_cpu_mask) {
+ if (phys_id == topology_physical_package_id(i)) {
+ phys_id = -1;
+ break;
+ }
+ }
+ if (phys_id < 0)
+ continue;
+
+ uncore_cpu_prepare(cpu, phys_id);
+ uncore_event_init_cpu(cpu);
+ }
+ on_each_cpu(uncore_cpu_setup, NULL, 1);
+
+ __register_cpu_notifier(&uncore_cpu_nb);
+
+ cpu_notifier_register_done();
+}
+
+
+static int __init intel_uncore_init(void)
+{
+ int ret;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return -ENODEV;
+
+ if (cpu_has_hypervisor)
+ return -ENODEV;
+
+ ret = uncore_pci_init();
+ if (ret)
+ goto fail;
+ ret = uncore_cpu_init();
+ if (ret) {
+ uncore_pci_exit();
+ goto fail;
+ }
+ uncore_cpumask_init();
+
+ uncore_pmus_register();
+ return 0;
+fail:
+ return ret;
+}
+device_initcall(intel_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
new file mode 100644
index 00000000000..90236f0c94a
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -0,0 +1,696 @@
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/perf_event.h>
+#include "perf_event.h"
+
+#define UNCORE_PMU_NAME_LEN 32
+#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC)
+#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
+
+#define UNCORE_FIXED_EVENT 0xff
+#define UNCORE_PMC_IDX_MAX_GENERIC 8
+#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC
+#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1)
+
+#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx)
+#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff)
+#define UNCORE_PCI_DEV_IDX(data) (data & 0xff)
+#define UNCORE_EXTRA_PCI_DEV 0xff
+#define UNCORE_EXTRA_PCI_DEV_MAX 2
+
+/* support up to 8 sockets */
+#define UNCORE_SOCKET_MAX 8
+
+#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
+
+/* SNB event control */
+#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
+#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
+#define SNB_UNC_CTL_EDGE_DET (1 << 18)
+#define SNB_UNC_CTL_EN (1 << 22)
+#define SNB_UNC_CTL_INVERT (1 << 23)
+#define SNB_UNC_CTL_CMASK_MASK 0x1f000000
+#define NHM_UNC_CTL_CMASK_MASK 0xff000000
+#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0)
+
+#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ SNB_UNC_CTL_CMASK_MASK)
+
+#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ NHM_UNC_CTL_CMASK_MASK)
+
+/* SNB global control register */
+#define SNB_UNC_PERF_GLOBAL_CTL 0x391
+#define SNB_UNC_FIXED_CTR_CTRL 0x394
+#define SNB_UNC_FIXED_CTR 0x395
+
+/* SNB uncore global control */
+#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1)
+#define SNB_UNC_GLOBAL_CTL_EN (1 << 29)
+
+/* SNB Cbo register */
+#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700
+#define SNB_UNC_CBO_0_PER_CTR0 0x706
+#define SNB_UNC_CBO_MSR_OFFSET 0x10
+
+/* NHM global control register */
+#define NHM_UNC_PERF_GLOBAL_CTL 0x391
+#define NHM_UNC_FIXED_CTR 0x394
+#define NHM_UNC_FIXED_CTR_CTRL 0x395
+
+/* NHM uncore global control */
+#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1)
+#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32)
+
+/* NHM uncore register */
+#define NHM_UNC_PERFEVTSEL0 0x3c0
+#define NHM_UNC_UNCORE_PMC0 0x3b0
+
+/* SNB-EP Box level control */
+#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
+#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
+#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8)
+#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16)
+#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+ SNBEP_PMON_BOX_CTL_RST_CTRS | \
+ SNBEP_PMON_BOX_CTL_FRZ_EN)
+/* SNB-EP event control */
+#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff
+#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00
+#define SNBEP_PMON_CTL_RST (1 << 17)
+#define SNBEP_PMON_CTL_EDGE_DET (1 << 18)
+#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21)
+#define SNBEP_PMON_CTL_EN (1 << 22)
+#define SNBEP_PMON_CTL_INVERT (1 << 23)
+#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000
+#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_PMON_CTL_TRESH_MASK)
+
+/* SNB-EP Ubox event control */
+#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000
+#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+
+#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19)
+#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* SNB-EP PCU event control */
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000
+#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30)
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31)
+#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_EV_SEL_EXT | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_PMON_CTL_EV_SEL_EXT)
+
+/* SNB-EP pci control register */
+#define SNBEP_PCI_PMON_BOX_CTL 0xf4
+#define SNBEP_PCI_PMON_CTL0 0xd8
+/* SNB-EP pci counter register */
+#define SNBEP_PCI_PMON_CTR0 0xa0
+
+/* SNB-EP home agent register */
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44
+#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48
+/* SNB-EP memory controller register */
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0
+/* SNB-EP QPI register */
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c
+
+/* SNB-EP Ubox register */
+#define SNBEP_U_MSR_PMON_CTR0 0xc16
+#define SNBEP_U_MSR_PMON_CTL0 0xc10
+
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09
+
+/* SNB-EP Cbo register */
+#define SNBEP_C0_MSR_PMON_CTR0 0xd16
+#define SNBEP_C0_MSR_PMON_CTL0 0xd10
+#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04
+#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14
+#define SNBEP_CBO_MSR_OFFSET 0x20
+
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000
+
+#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \
+ .event = (e), \
+ .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \
+ .config_mask = (m), \
+ .idx = (i) \
+}
+
+/* SNB-EP PCU register */
+#define SNBEP_PCU_MSR_PMON_CTR0 0xc36
+#define SNBEP_PCU_MSR_PMON_CTL0 0xc30
+#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff
+#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc
+#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd
+
+/* IVT event control */
+#define IVT_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+ SNBEP_PMON_BOX_CTL_RST_CTRS)
+#define IVT_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_TRESH_MASK)
+/* IVT Ubox */
+#define IVT_U_MSR_PMON_GLOBAL_CTL 0xc00
+#define IVT_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
+#define IVT_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29)
+
+#define IVT_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+/* IVT Cbo */
+#define IVT_CBO_MSR_PMON_RAW_EVENT_MASK (IVT_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+
+#define IVT_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0)
+#define IVT_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5)
+#define IVT_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17)
+#define IVT_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32)
+#define IVT_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52)
+#define IVT_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
+#define IVT_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
+#define IVT_CB0_MSR_PMON_BOX_FILTER_IOSC (0x1ULL << 63)
+
+/* IVT home agent */
+#define IVT_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16)
+#define IVT_HA_PCI_PMON_RAW_EVENT_MASK \
+ (IVT_PMON_RAW_EVENT_MASK | \
+ IVT_HA_PCI_PMON_CTL_Q_OCC_RST)
+/* IVT PCU */
+#define IVT_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_EV_SEL_EXT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+/* IVT QPI */
+#define IVT_QPI_PCI_PMON_RAW_EVENT_MASK \
+ (IVT_PMON_RAW_EVENT_MASK | \
+ SNBEP_PMON_CTL_EV_SEL_EXT)
+
+/* NHM-EX event control */
+#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff
+#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00
+#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0)
+#define NHMEX_PMON_CTL_EDGE_DET (1 << 18)
+#define NHMEX_PMON_CTL_PMI_EN (1 << 20)
+#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22)
+#define NHMEX_PMON_CTL_INVERT (1 << 23)
+#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000
+#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \
+ NHMEX_PMON_CTL_UMASK_MASK | \
+ NHMEX_PMON_CTL_EDGE_DET | \
+ NHMEX_PMON_CTL_INVERT | \
+ NHMEX_PMON_CTL_TRESH_MASK)
+
+/* NHM-EX Ubox */
+#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00
+#define NHMEX_U_MSR_PMON_CTR 0xc11
+#define NHMEX_U_MSR_PMON_EV_SEL 0xc10
+
+#define NHMEX_U_PMON_GLOBAL_EN (1 << 0)
+#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e
+#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28)
+#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29)
+#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
+
+#define NHMEX_U_PMON_RAW_EVENT_MASK \
+ (NHMEX_PMON_CTL_EV_SEL_MASK | \
+ NHMEX_PMON_CTL_EDGE_DET)
+
+/* NHM-EX Cbox */
+#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00
+#define NHMEX_C0_MSR_PMON_CTR0 0xd11
+#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10
+#define NHMEX_C_MSR_OFFSET 0x20
+
+/* NHM-EX Bbox */
+#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20
+#define NHMEX_B0_MSR_PMON_CTR0 0xc31
+#define NHMEX_B0_MSR_PMON_CTL0 0xc30
+#define NHMEX_B_MSR_OFFSET 0x40
+#define NHMEX_B0_MSR_MATCH 0xe45
+#define NHMEX_B0_MSR_MASK 0xe46
+#define NHMEX_B1_MSR_MATCH 0xe4d
+#define NHMEX_B1_MSR_MASK 0xe4e
+
+#define NHMEX_B_PMON_CTL_EN (1 << 0)
+#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1
+#define NHMEX_B_PMON_CTL_EV_SEL_MASK \
+ (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_B_PMON_CTR_SHIFT 6
+#define NHMEX_B_PMON_CTR_MASK \
+ (0x3 << NHMEX_B_PMON_CTR_SHIFT)
+#define NHMEX_B_PMON_RAW_EVENT_MASK \
+ (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
+ NHMEX_B_PMON_CTR_MASK)
+
+/* NHM-EX Sbox */
+#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40
+#define NHMEX_S0_MSR_PMON_CTR0 0xc51
+#define NHMEX_S0_MSR_PMON_CTL0 0xc50
+#define NHMEX_S_MSR_OFFSET 0x80
+#define NHMEX_S0_MSR_MM_CFG 0xe48
+#define NHMEX_S0_MSR_MATCH 0xe49
+#define NHMEX_S0_MSR_MASK 0xe4a
+#define NHMEX_S1_MSR_MM_CFG 0xe58
+#define NHMEX_S1_MSR_MATCH 0xe59
+#define NHMEX_S1_MSR_MASK 0xe5a
+
+#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63)
+#define NHMEX_S_EVENT_TO_R_PROG_EV 0
+
+/* NHM-EX Mbox */
+#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0
+#define NHMEX_M0_MSR_PMU_DSP 0xca5
+#define NHMEX_M0_MSR_PMU_ISS 0xca6
+#define NHMEX_M0_MSR_PMU_MAP 0xca7
+#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8
+#define NHMEX_M0_MSR_PMU_PGT 0xca9
+#define NHMEX_M0_MSR_PMU_PLD 0xcaa
+#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab
+#define NHMEX_M0_MSR_PMU_CTL0 0xcb0
+#define NHMEX_M0_MSR_PMU_CNT0 0xcb1
+#define NHMEX_M_MSR_OFFSET 0x40
+#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54
+#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c
+
+#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63)
+#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34
+
+#define NHMEX_M_PMON_CTL_EN (1 << 0)
+#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1)
+#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2
+#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \
+ (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \
+ (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6)
+#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7)
+#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9
+#define NHMEX_M_PMON_CTL_INC_SEL_MASK \
+ (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \
+ (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
+#define NHMEX_M_PMON_RAW_EVENT_MASK \
+ (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \
+ NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \
+ NHMEX_M_PMON_CTL_WRAP_MODE | \
+ NHMEX_M_PMON_CTL_FLAG_MODE | \
+ NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+ NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
+
+#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23))
+#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n)))
+
+#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24))
+#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n)))
+
+/*
+ * use the 9~13 bits to select event If the 7th bit is not set,
+ * otherwise use the 19~21 bits to select event.
+ */
+#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_EXTAR_REG(c, r) \
+ EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+ MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
+#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
+ EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+ MBOX_SET_FLAG_SEL_MASK, \
+ (u64)-1, NHMEX_M_##r)
+
+/* NHM-EX Rbox */
+#define NHMEX_R_MSR_GLOBAL_CTL 0xe00
+#define NHMEX_R_MSR_PMON_CTL0 0xe10
+#define NHMEX_R_MSR_PMON_CNT0 0xe11
+#define NHMEX_R_MSR_OFFSET 0x20
+
+#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \
+ ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n))
+#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \
+ (((n) < 4 ? 0 : 0x10) + (n) * 4)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \
+ (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \
+ (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
+
+#define NHMEX_R_PMON_CTL_EN (1 << 0)
+#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1
+#define NHMEX_R_PMON_CTL_EV_SEL_MASK \
+ (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6)
+#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK
+
+/* NHM-EX Wbox */
+#define NHMEX_W_MSR_GLOBAL_CTL 0xc80
+#define NHMEX_W_MSR_PMON_CNT0 0xc90
+#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91
+#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394
+#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395
+
+#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31)
+
+struct intel_uncore_ops;
+struct intel_uncore_pmu;
+struct intel_uncore_box;
+struct uncore_event_desc;
+
+struct intel_uncore_type {
+ const char *name;
+ int num_counters;
+ int num_boxes;
+ int perf_ctr_bits;
+ int fixed_ctr_bits;
+ unsigned perf_ctr;
+ unsigned event_ctl;
+ unsigned event_mask;
+ unsigned fixed_ctr;
+ unsigned fixed_ctl;
+ unsigned box_ctl;
+ unsigned msr_offset;
+ unsigned num_shared_regs:8;
+ unsigned single_fixed:1;
+ unsigned pair_ctr_ctl:1;
+ unsigned *msr_offsets;
+ struct event_constraint unconstrainted;
+ struct event_constraint *constraints;
+ struct intel_uncore_pmu *pmus;
+ struct intel_uncore_ops *ops;
+ struct uncore_event_desc *event_descs;
+ const struct attribute_group *attr_groups[4];
+ struct pmu *pmu; /* for custom pmu ops */
+};
+
+#define pmu_group attr_groups[0]
+#define format_group attr_groups[1]
+#define events_group attr_groups[2]
+
+struct intel_uncore_ops {
+ void (*init_box)(struct intel_uncore_box *);
+ void (*disable_box)(struct intel_uncore_box *);
+ void (*enable_box)(struct intel_uncore_box *);
+ void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
+ void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
+ u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
+ int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
+ struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
+ struct perf_event *);
+ void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
+};
+
+struct intel_uncore_pmu {
+ struct pmu pmu;
+ char name[UNCORE_PMU_NAME_LEN];
+ int pmu_idx;
+ int func_id;
+ struct intel_uncore_type *type;
+ struct intel_uncore_box ** __percpu box;
+ struct list_head box_list;
+};
+
+struct intel_uncore_extra_reg {
+ raw_spinlock_t lock;
+ u64 config, config1, config2;
+ atomic_t ref;
+};
+
+struct intel_uncore_box {
+ int phys_id;
+ int n_active; /* number of active events */
+ int n_events;
+ int cpu; /* cpu to collect events */
+ unsigned long flags;
+ atomic_t refcnt;
+ struct perf_event *events[UNCORE_PMC_IDX_MAX];
+ struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
+ unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
+ u64 tags[UNCORE_PMC_IDX_MAX];
+ struct pci_dev *pci_dev;
+ struct intel_uncore_pmu *pmu;
+ u64 hrtimer_duration; /* hrtimer timeout for this box */
+ struct hrtimer hrtimer;
+ struct list_head list;
+ struct list_head active_list;
+ void *io_addr;
+ struct intel_uncore_extra_reg shared_regs[0];
+};
+
+#define UNCORE_BOX_FLAG_INITIATED 0
+
+struct uncore_event_desc {
+ struct kobj_attribute attr;
+ const char *config;
+};
+
+#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
+{ \
+ .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \
+ .config = _config, \
+}
+
+#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
+static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ char *page) \
+{ \
+ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
+ return sprintf(page, _format "\n"); \
+} \
+static struct kobj_attribute format_attr_##_var = \
+ __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
+
+
+static ssize_t uncore_event_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct uncore_event_desc *event =
+ container_of(attr, struct uncore_event_desc, attr);
+ return sprintf(buf, "%s", event->config);
+}
+
+static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
+{
+ return box->pmu->type->box_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
+{
+ return box->pmu->type->fixed_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
+{
+ return box->pmu->type->fixed_ctr;
+}
+
+static inline
+unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
+{
+ return idx * 4 + box->pmu->type->event_ctl;
+}
+
+static inline
+unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+ return idx * 8 + box->pmu->type->perf_ctr;
+}
+
+static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
+{
+ struct intel_uncore_pmu *pmu = box->pmu;
+ return pmu->type->msr_offsets ?
+ pmu->type->msr_offsets[pmu->pmu_idx] :
+ pmu->type->msr_offset * pmu->pmu_idx;
+}
+
+static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
+{
+ if (!box->pmu->type->box_ctl)
+ return 0;
+ return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
+}
+
+static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
+{
+ if (!box->pmu->type->fixed_ctl)
+ return 0;
+ return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
+}
+
+static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
+{
+ return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
+{
+ return box->pmu->type->event_ctl +
+ (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+ uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+ return box->pmu->type->perf_ctr +
+ (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+ uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
+{
+ if (box->pci_dev)
+ return uncore_pci_fixed_ctl(box);
+ else
+ return uncore_msr_fixed_ctl(box);
+}
+
+static inline
+unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
+{
+ if (box->pci_dev)
+ return uncore_pci_fixed_ctr(box);
+ else
+ return uncore_msr_fixed_ctr(box);
+}
+
+static inline
+unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
+{
+ if (box->pci_dev)
+ return uncore_pci_event_ctl(box, idx);
+ else
+ return uncore_msr_event_ctl(box, idx);
+}
+
+static inline
+unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+ if (box->pci_dev)
+ return uncore_pci_perf_ctr(box, idx);
+ else
+ return uncore_msr_perf_ctr(box, idx);
+}
+
+static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
+{
+ return box->pmu->type->perf_ctr_bits;
+}
+
+static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
+{
+ return box->pmu->type->fixed_ctr_bits;
+}
+
+static inline int uncore_num_counters(struct intel_uncore_box *box)
+{
+ return box->pmu->type->num_counters;
+}
+
+static inline void uncore_disable_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->type->ops->disable_box)
+ box->pmu->type->ops->disable_box(box);
+}
+
+static inline void uncore_enable_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->type->ops->enable_box)
+ box->pmu->type->ops->enable_box(box);
+}
+
+static inline void uncore_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ box->pmu->type->ops->disable_event(box, event);
+}
+
+static inline void uncore_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ box->pmu->type->ops->enable_event(box, event);
+}
+
+static inline u64 uncore_read_counter(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ return box->pmu->type->ops->read_counter(box, event);
+}
+
+static inline void uncore_box_init(struct intel_uncore_box *box)
+{
+ if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+ if (box->pmu->type->ops->init_box)
+ box->pmu->type->ops->init_box(box);
+ }
+}
+
+static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
+{
+ return (box->phys_id < 0);
+}
diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c
new file mode 100644
index 00000000000..838fa8772c6
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_knc.c
@@ -0,0 +1,319 @@
+/* Driver for Intel Xeon Phi "Knights Corner" PMU */
+
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include <asm/hardirq.h>
+
+#include "perf_event.h"
+
+static const u64 knc_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x002a,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x0016,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0029,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x002b,
+};
+
+static const u64 __initconst knc_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ /* On Xeon Phi event "0" is a valid DATA_READ */
+ /* (L1 Data Cache Reads) Instruction. */
+ /* We code this as ARCH_PERFMON_EVENTSEL_INT as this */
+ /* bit will always be set in x86_pmu_hw_config(). */
+ [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
+ /* DATA_READ */
+ [ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
+ [ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */
+ [ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
+ [ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */
+ [ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
+ /* DATA_READ */
+ /* see note on L1 OP_READ */
+ [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
+ [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
+ [ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */
+ [ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+
+static u64 knc_pmu_event_map(int hw_event)
+{
+ return knc_perfmon_event_map[hw_event];
+}
+
+static struct event_constraint knc_event_constraints[] =
+{
+ INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */
+ INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */
+ INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */
+ INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */
+ INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */
+ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */
+ INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */
+ INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
+ INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
+ INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */
+ INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */
+ INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */
+ INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */
+ INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */
+ INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */
+ INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */
+ INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */
+ INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */
+ INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */
+ INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* L2_DATA_PF2_MISS */
+ INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */
+ EVENT_CONSTRAINT_END
+};
+
+#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d
+#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e
+#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f
+
+#define KNC_ENABLE_COUNTER0 0x00000001
+#define KNC_ENABLE_COUNTER1 0x00000002
+
+static void knc_pmu_disable_all(void)
+{
+ u64 val;
+
+ rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+ val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
+ wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+}
+
+static void knc_pmu_enable_all(int added)
+{
+ u64 val;
+
+ rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+ val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
+ wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+}
+
+static inline void
+knc_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val;
+
+ val = hwc->config;
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+
+ (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
+}
+
+static void knc_pmu_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val;
+
+ val = hwc->config;
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+
+ (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
+}
+
+static inline u64 knc_pmu_get_status(void)
+{
+ u64 status;
+
+ rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status);
+
+ return status;
+}
+
+static inline void knc_pmu_ack_status(u64 ack)
+{
+ wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack);
+}
+
+static int knc_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc;
+ int handled = 0;
+ int bit, loops;
+ u64 status;
+
+ cpuc = &__get_cpu_var(cpu_hw_events);
+
+ knc_pmu_disable_all();
+
+ status = knc_pmu_get_status();
+ if (!status) {
+ knc_pmu_enable_all(0);
+ return handled;
+ }
+
+ loops = 0;
+again:
+ knc_pmu_ack_status(status);
+ if (++loops > 100) {
+ WARN_ONCE(1, "perf: irq loop stuck!\n");
+ perf_event_print_debug();
+ goto done;
+ }
+
+ inc_irq_stat(apic_perf_irqs);
+
+ for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+ struct perf_event *event = cpuc->events[bit];
+
+ handled++;
+
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+
+ if (!intel_pmu_save_and_restart(event))
+ continue;
+
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
+ if (perf_event_overflow(event, &data, regs))
+ x86_pmu_stop(event, 0);
+ }
+
+ /*
+ * Repeat if there is more work to be done:
+ */
+ status = knc_pmu_get_status();
+ if (status)
+ goto again;
+
+done:
+ knc_pmu_enable_all(0);
+
+ return handled;
+}
+
+
+PMU_FORMAT_ATTR(event, "config:0-7" );
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+
+static struct attribute *intel_knc_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
+static const struct x86_pmu knc_pmu __initconst = {
+ .name = "knc",
+ .handle_irq = knc_pmu_handle_irq,
+ .disable_all = knc_pmu_disable_all,
+ .enable_all = knc_pmu_enable_all,
+ .enable = knc_pmu_enable_event,
+ .disable = knc_pmu_disable_event,
+ .hw_config = x86_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_KNC_EVNTSEL0,
+ .perfctr = MSR_KNC_PERFCTR0,
+ .event_map = knc_pmu_event_map,
+ .max_events = ARRAY_SIZE(knc_perfmon_event_map),
+ .apic = 1,
+ .max_period = (1ULL << 39) - 1,
+ .version = 0,
+ .num_counters = 2,
+ .cntval_bits = 40,
+ .cntval_mask = (1ULL << 40) - 1,
+ .get_event_constraints = x86_get_event_constraints,
+ .event_constraints = knc_event_constraints,
+ .format_attrs = intel_knc_formats_attr,
+};
+
+__init int knc_pmu_init(void)
+{
+ x86_pmu = knc_pmu;
+
+ memcpy(hw_cache_event_ids, knc_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ef484d9d0a2..5d466b7d860 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -895,8 +895,8 @@ static void p4_pmu_disable_pebs(void)
* So at moment let leave metrics turned on forever -- it's
* ok for now but need to be revisited!
*
- * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
- * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
+ * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0);
+ * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0);
*/
}
@@ -909,9 +909,8 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
* state we need to clear P4_CCCR_OVF, otherwise interrupt get
* asserted again and again
*/
- (void)checking_wrmsrl(hwc->config_base,
- (u64)(p4_config_unpack_cccr(hwc->config)) &
- ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
+ (void)wrmsrl_safe(hwc->config_base,
+ p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
}
static void p4_pmu_disable_all(void)
@@ -943,8 +942,8 @@ static void p4_pmu_enable_pebs(u64 config)
bind = &p4_pebs_bind_map[idx];
- (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
- (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
+ (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
+ (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
}
static void p4_pmu_enable_event(struct perf_event *event)
@@ -957,7 +956,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
u64 escr_addr, cccr;
bind = &p4_event_bind_map[idx];
- escr_addr = (u64)bind->escr_msr[thread];
+ escr_addr = bind->escr_msr[thread];
/*
* - we dont support cascaded counters yet
@@ -978,8 +977,8 @@ static void p4_pmu_enable_event(struct perf_event *event)
*/
p4_pmu_enable_pebs(hwc->config);
- (void)checking_wrmsrl(escr_addr, escr_conf);
- (void)checking_wrmsrl(hwc->config_base,
+ (void)wrmsrl_safe(escr_addr, escr_conf);
+ (void)wrmsrl_safe(hwc->config_base,
(cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
}
@@ -1005,8 +1004,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
- perf_sample_data_init(&data, 0);
-
cpuc = &__get_cpu_var(cpu_hw_events);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -1034,10 +1031,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
handled += overflow;
/* event overflow for sure */
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, hwc->last_period);
if (!x86_perf_event_set_period(event))
continue;
+
+
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
@@ -1258,7 +1257,24 @@ again:
pass++;
goto again;
}
-
+ /*
+ * Perf does test runs to see if a whole group can be assigned
+ * together succesfully. There can be multiple rounds of this.
+ * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config
+ * bits, such that the next round of group assignments will
+ * cause the above p4_should_swap_ts to pass instead of fail.
+ * This leads to counters exclusive to thread0 being used by
+ * thread1.
+ *
+ * Solve this with a cheap hack, reset the idx back to -1 to
+ * force a new lookup (p4_next_cntr) to get the right counter
+ * for the right thread.
+ *
+ * This probably doesn't comply with the general spirit of how
+ * perf wants to work, but P4 is special. :-(
+ */
+ if (p4_should_swap_ts(hwc->config, cpu))
+ hwc->idx = -1;
p4_pmu_swap_config_ts(hwc, cpu);
if (assign)
assign[i] = cntr_idx;
@@ -1271,6 +1287,17 @@ done:
return num ? -EINVAL : 0;
}
+PMU_FORMAT_ATTR(cccr, "config:0-31" );
+PMU_FORMAT_ATTR(escr, "config:32-62");
+PMU_FORMAT_ATTR(ht, "config:63" );
+
+static struct attribute *intel_p4_formats_attr[] = {
+ &format_attr_cccr.attr,
+ &format_attr_escr.attr,
+ &format_attr_ht.attr,
+ NULL,
+};
+
static __initconst const struct x86_pmu p4_pmu = {
.name = "Netburst P4/Xeon",
.handle_irq = p4_pmu_handle_irq,
@@ -1305,14 +1332,17 @@ static __initconst const struct x86_pmu p4_pmu = {
* the former idea is taken from OProfile code
*/
.perfctr_second_write = 1,
+
+ .format_attrs = intel_p4_formats_attr,
};
__init int p4_pmu_init(void)
{
unsigned int low, high;
+ int i, reg;
/* If we get stripped -- indexing fails */
- BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
+ BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
rdmsr(MSR_IA32_MISC_ENABLE, low, high);
if (!(low & (1 << 7))) {
@@ -1328,5 +1358,19 @@ __init int p4_pmu_init(void)
x86_pmu = p4_pmu;
+ /*
+ * Even though the counters are configured to interrupt a particular
+ * logical processor when an overflow happens, testing has shown that
+ * on kdump kernels (which uses a single cpu), thread1's counter
+ * continues to run and will report an NMI on thread0. Due to the
+ * overflow bug, this leads to a stream of unknown NMIs.
+ *
+ * Solve this by zero'ing out the registers to mimic a reset.
+ */
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ reg = x86_pmu_config_addr(i);
+ wrmsrl_safe(reg, 0ULL);
+ }
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index c7181befecd..7c1a0c07b60 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -8,13 +8,106 @@
*/
static const u64 p6_perfmon_event_map[] =
{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
- [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */
+
+};
+
+static const u64 __initconst p6_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */
+ [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */
+ [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */
+ [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
};
static u64 p6_pmu_event_map(int hw_event)
@@ -34,7 +127,7 @@ static struct event_constraint p6_event_constraints[] =
{
INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
- INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
+ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
@@ -64,29 +157,46 @@ static void p6_pmu_enable_all(int added)
static inline void
p6_pmu_disable_event(struct perf_event *event)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
u64 val = P6_NOP_EVENT;
- if (cpuc->enabled)
- val |= ARCH_PERFMON_EVENTSEL_ENABLE;
-
- (void)checking_wrmsrl(hwc->config_base, val);
+ (void)wrmsrl_safe(hwc->config_base, val);
}
static void p6_pmu_enable_event(struct perf_event *event)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
u64 val;
val = hwc->config;
- if (cpuc->enabled)
- val |= ARCH_PERFMON_EVENTSEL_ENABLE;
- (void)checking_wrmsrl(hwc->config_base, val);
+ /*
+ * p6 only has a global event enable, set on PerfEvtSel0
+ * We "disable" events by programming P6_NOP_EVENT
+ * and we rely on p6_pmu_enable_all() being called
+ * to actually enable the events.
+ */
+
+ (void)wrmsrl_safe(hwc->config_base, val);
}
+PMU_FORMAT_ATTR(event, "config:0-7" );
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(pc, "config:19" );
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+
+static struct attribute *intel_p6_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_pc.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
static __initconst const struct x86_pmu p6_pmu = {
.name = "p6",
.handle_irq = x86_pmu_handle_irq,
@@ -115,29 +225,55 @@ static __initconst const struct x86_pmu p6_pmu = {
.cntval_mask = (1ULL << 32) - 1,
.get_event_constraints = x86_get_event_constraints,
.event_constraints = p6_event_constraints,
+
+ .format_attrs = intel_p6_formats_attr,
+ .events_sysfs_show = intel_event_sysfs_show,
+
};
+static __init void p6_pmu_rdpmc_quirk(void)
+{
+ if (boot_cpu_data.x86_mask < 9) {
+ /*
+ * PPro erratum 26; fixed in stepping 9 and above.
+ */
+ pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n");
+ x86_pmu.attr_rdpmc_broken = 1;
+ x86_pmu.attr_rdpmc = 0;
+ }
+}
+
__init int p6_pmu_init(void)
{
+ x86_pmu = p6_pmu;
+
switch (boot_cpu_data.x86_model) {
- case 1:
- case 3: /* Pentium Pro */
- case 5:
- case 6: /* Pentium II */
- case 7:
- case 8:
- case 11: /* Pentium III */
- case 9:
- case 13:
- /* Pentium M */
+ case 1: /* Pentium Pro */
+ x86_add_quirk(p6_pmu_rdpmc_quirk);
+ break;
+
+ case 3: /* Pentium II - Klamath */
+ case 5: /* Pentium II - Deschutes */
+ case 6: /* Pentium II - Mendocino */
break;
+
+ case 7: /* Pentium III - Katmai */
+ case 8: /* Pentium III - Coppermine */
+ case 10: /* Pentium III Xeon */
+ case 11: /* Pentium III - Tualatin */
+ break;
+
+ case 9: /* Pentium M - Banias */
+ case 13: /* Pentium M - Dothan */
+ break;
+
default:
- pr_cont("unsupported p6 CPU model %d ",
- boot_cpu_data.x86_model);
+ pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);
return -ENODEV;
}
- x86_pmu = p6_pmu;
+ memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
return 0;
}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 966512b2cac..2e8caf03f59 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -56,6 +56,8 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
switch (boot_cpu_data.x86) {
case 6:
return msr - MSR_P6_PERFCTR0;
+ case 11:
+ return msr - MSR_KNC_PERFCTR0;
case 15:
return msr - MSR_P4_BPU_PERFCTR0;
}
@@ -82,6 +84,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
switch (boot_cpu_data.x86) {
case 6:
return msr - MSR_P6_EVNTSEL0;
+ case 11:
+ return msr - MSR_KNC_EVNTSEL0;
case 15:
return msr - MSR_P4_BSU_ESCR0;
}
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
index 7b3fe56b1c2..31f0f335ed2 100644
--- a/arch/x86/kernel/cpu/powerflags.c
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -11,10 +11,10 @@ const char *const x86_power_flags[32] = {
"fid", /* frequency id control */
"vid", /* voltage id control */
"ttp", /* thermal trip */
- "tm",
- "stc",
- "100mhzsteps",
- "hwpstate",
+ "tm", /* hardware thermal control */
+ "stc", /* software thermal control */
+ "100mhzsteps", /* 100 MHz multiplier control */
+ "hwpstate", /* hardware P-state control */
"", /* tsc invariant mapped to constant_tsc */
"cpb", /* core performance boost */
"eff_freq_ro", /* Readonly aperf/mperf */
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 8022c668148..06fe3ed8b85 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -11,41 +11,31 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
unsigned int cpu)
{
#ifdef CONFIG_SMP
- if (c->x86_max_cores * smp_num_siblings > 1) {
- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
- seq_printf(m, "siblings\t: %d\n",
- cpumask_weight(cpu_core_mask(cpu)));
- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
- seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
- seq_printf(m, "apicid\t\t: %d\n", c->apicid);
- seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
- }
+ seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
+ seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu)));
+ seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
+ seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
+ seq_printf(m, "apicid\t\t: %d\n", c->apicid);
+ seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
#endif
}
#ifdef CONFIG_X86_32
static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
{
- /*
- * We use exception 16 if we have hardware math and we've either seen
- * it or the CPU claims it is internal
- */
- int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu);
seq_printf(m,
"fdiv_bug\t: %s\n"
- "hlt_bug\t\t: %s\n"
"f00f_bug\t: %s\n"
"coma_bug\t: %s\n"
"fpu\t\t: %s\n"
"fpu_exception\t: %s\n"
"cpuid level\t: %d\n"
"wp\t\t: %s\n",
- c->fdiv_bug ? "yes" : "no",
- c->hlt_works_ok ? "no" : "yes",
- c->f00f_bug ? "yes" : "no",
- c->coma_bug ? "yes" : "no",
- c->hard_math ? "yes" : "no",
- fpu_exception ? "yes" : "no",
+ static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no",
+ static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no",
+ static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no",
+ static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
+ static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
c->cpuid_level,
c->wp_works_ok ? "yes" : "no");
}
@@ -140,10 +130,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
static void *c_start(struct seq_file *m, loff_t *pos)
{
- if (*pos == 0) /* just in case, cpu 0 is not the first */
- *pos = cpumask_first(cpu_online_mask);
- else
- *pos = cpumask_next(*pos - 1, cpu_online_mask);
+ *pos = cpumask_next(*pos - 1, cpu_online_mask);
if ((*pos) < nr_cpu_ids)
return &cpu_data(*pos);
return NULL;
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index feca286c2bb..136ac74dee8 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -27,24 +27,11 @@
static int __init x86_rdrand_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_RDRAND);
+ setup_clear_cpu_cap(X86_FEATURE_RDSEED);
return 1;
}
__setup("nordrand", x86_rdrand_setup);
-/* We can't use arch_get_random_long() here since alternatives haven't run */
-static inline int rdrand_long(unsigned long *v)
-{
- int ok;
- asm volatile("1: " RDRAND_LONG "\n\t"
- "jc 2f\n\t"
- "decl %0\n\t"
- "jnz 1b\n\t"
- "2:"
- : "=r" (ok), "=a" (*v)
- : "0" (RDRAND_RETRY_LOOPS));
- return ok;
-}
-
/*
* Force a reseed cycle; we are architecturally guaranteed a reseed
* after no more than 512 128-bit chunks of random data. This also
@@ -52,7 +39,7 @@ static inline int rdrand_long(unsigned long *v)
*/
#define RESEED_LOOP ((512*128)/sizeof(unsigned long))
-void __cpuinit x86_init_rdrand(struct cpuinfo_x86 *c)
+void x86_init_rdrand(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_ARCH_RANDOM
unsigned long tmp;
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index c7f64e6f537..b6f794aa169 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -1,5 +1,5 @@
/*
- * Routines to indentify additional cpu features that are scattered in
+ * Routines to identify additional cpu features that are scattered in
* cpuid space.
*/
#include <linux/cpu.h>
@@ -24,14 +24,14 @@ enum cpuid_regs {
CR_EBX
};
-void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
+void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{
u32 max_level;
u32 regs[4];
const struct cpuid_bit *cb;
- static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
- { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 },
+ static const struct cpuid_bit cpuid_bits[] = {
+ { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 },
{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
{ X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
@@ -39,7 +39,9 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
+ { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 },
{ X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
{ X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
{ X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
deleted file mode 100644
index a640ae5ad20..00000000000
--- a/arch/x86/kernel/cpu/sched.c
+++ /dev/null
@@ -1,55 +0,0 @@
-#include <linux/sched.h>
-#include <linux/math64.h>
-#include <linux/percpu.h>
-#include <linux/irqflags.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#ifdef CONFIG_SMP
-
-static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
-
-static unsigned long scale_aperfmperf(void)
-{
- struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
- unsigned long ratio, flags;
-
- local_irq_save(flags);
- get_aperfmperf(&val);
- local_irq_restore(flags);
-
- ratio = calc_aperfmperf_ratio(old, &val);
- *old = val;
-
- return ratio;
-}
-
-unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
-{
- /*
- * do aperf/mperf on the cpu level because it includes things
- * like turbo mode, which are relevant to full cores.
- */
- if (boot_cpu_has(X86_FEATURE_APERFMPERF))
- return scale_aperfmperf();
-
- /*
- * maybe have something cpufreq here
- */
-
- return default_scale_freq_power(sd, cpu);
-}
-
-unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
-{
- /*
- * aperf/mperf already includes the smt gain
- */
- if (boot_cpu_has(X86_FEATURE_APERFMPERF))
- return SCHED_LOAD_SCALE;
-
- return default_scale_smt_power(sd, cpu);
-}
-
-#endif
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 4397e987a1c..4c60eaf0571 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -26,7 +26,7 @@
* exists, use it for populating initial_apicid and cpu topology
* detection.
*/
-void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
+void detect_extended_topology(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
unsigned int eax, ebx, ecx, edx, sub_index;
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 28000743bbb..3fa0e5ad86b 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,11 +1,10 @@
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/init.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include "cpu.h"
-static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
+static void early_init_transmeta(struct cpuinfo_x86 *c)
{
u32 xlvl;
@@ -17,7 +16,7 @@ static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
}
}
-static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
+static void init_transmeta(struct cpuinfo_x86 *c)
{
unsigned int cap_mask, uk, max, dummy;
unsigned int cms_rev1, cms_rev2;
@@ -98,7 +97,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
#endif
}
-static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = {
+static const struct cpu_dev transmeta_cpu_dev = {
.c_vendor = "Transmeta",
.c_ident = { "GenuineTMx86", "TransmetaCPU" },
.c_early_init = early_init_transmeta,
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index fd2c37bf7ac..ef9c2a0078b 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -1,5 +1,4 @@
#include <linux/kernel.h>
-#include <linux/init.h>
#include <asm/processor.h>
#include "cpu.h"
@@ -8,11 +7,11 @@
* so no special init takes place.
*/
-static const struct cpu_dev __cpuinitconst umc_cpu_dev = {
+static const struct cpu_dev umc_cpu_dev = {
.c_vendor = "UMC",
.c_ident = { "UMC UMC UMC" },
- .c_models = {
- { .vendor = X86_VENDOR_UMC, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[1] = "U5D",
[2] = "U5S",
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index d22d0c4edcf..628a059a9a0 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -33,6 +33,9 @@
#define VMWARE_PORT_CMD_GETVERSION 10
#define VMWARE_PORT_CMD_GETHZ 45
+#define VMWARE_PORT_CMD_GETVCPU_INFO 68
+#define VMWARE_PORT_CMD_LEGACY_X2APIC 3
+#define VMWARE_PORT_CMD_VCPU_RESERVED 31
#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
__asm__("inl (%%dx)" : \
@@ -90,7 +93,7 @@ static void __init vmware_platform_setup(void)
* serial key should be enough, as this will always have a VMware
* specific string when running under VMware hypervisor.
*/
-static bool __init vmware_platform(void)
+static uint32_t __init vmware_platform(void)
{
if (cpu_has_hypervisor) {
unsigned int eax;
@@ -99,12 +102,12 @@ static bool __init vmware_platform(void)
cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
&hyper_vendor_id[1], &hyper_vendor_id[2]);
if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
- return true;
+ return CPUID_VMWARE_INFO_LEAF;
} else if (dmi_available && dmi_name_in_serial("VMware") &&
__vmware_platform())
- return true;
+ return 1;
- return false;
+ return 0;
}
/*
@@ -119,16 +122,26 @@ static bool __init vmware_platform(void)
* so that the kernel could just trust the hypervisor with providing a
* reliable virtual TSC that is suitable for timekeeping.
*/
-static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
+static void vmware_set_cpu_features(struct cpuinfo_x86 *c)
{
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
}
+/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
+static bool __init vmware_legacy_x2apic_available(void)
+{
+ uint32_t eax, ebx, ecx, edx;
+ VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx);
+ return (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED)) == 0 &&
+ (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0;
+}
+
const __refconst struct hypervisor_x86 x86_hyper_vmware = {
.name = "VMware",
.detect = vmware_platform,
.set_cpu_features = vmware_set_cpu_features,
.init_platform = vmware_platform_setup,
+ .x2apic_available = vmware_legacy_x2apic_available,
};
EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index a524353d93f..3225ae6c518 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -43,7 +43,6 @@
#include <asm/processor.h>
#include <asm/msr.h>
-#include <asm/system.h>
static struct class *cpuid_class;
@@ -86,7 +85,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
{
char __user *tmp = buf;
struct cpuid_regs cmd;
- int cpu = iminor(file->f_path.dentry->d_inode);
+ int cpu = iminor(file_inode(file));
u64 pos = *ppos;
ssize_t bytes = 0;
int err = 0;
@@ -117,7 +116,7 @@ static int cpuid_open(struct inode *inode, struct file *file)
unsigned int cpu;
struct cpuinfo_x86 *c;
- cpu = iminor(file->f_path.dentry->d_inode);
+ cpu = iminor(file_inode(file));
if (cpu >= nr_cpu_ids || !cpu_online(cpu))
return -ENXIO; /* No such CPU */
@@ -138,7 +137,7 @@ static const struct file_operations cpuid_fops = {
.open = cpuid_open,
};
-static __cpuinit int cpuid_device_create(int cpu)
+static int cpuid_device_create(int cpu)
{
struct device *dev;
@@ -152,9 +151,8 @@ static void cpuid_device_destroy(int cpu)
device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
}
-static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int cpuid_class_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
int err = 0;
@@ -200,12 +198,15 @@ static int __init cpuid_init(void)
goto out_chrdev;
}
cpuid_class->devnode = cpuid_devnode;
+
+ cpu_notifier_register_begin();
for_each_online_cpu(i) {
err = cpuid_device_create(i);
if (err != 0)
goto out_class;
}
- register_hotcpu_notifier(&cpuid_class_cpu_notifier);
+ __register_hotcpu_notifier(&cpuid_class_cpu_notifier);
+ cpu_notifier_register_done();
err = 0;
goto out;
@@ -215,6 +216,7 @@ out_class:
for_each_online_cpu(i) {
cpuid_device_destroy(i);
}
+ cpu_notifier_register_done();
class_destroy(cpuid_class);
out_chrdev:
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
@@ -226,11 +228,13 @@ static void __exit cpuid_exit(void)
{
int cpu = 0;
+ cpu_notifier_register_begin();
for_each_online_cpu(cpu)
cpuid_device_destroy(cpu);
class_destroy(cpuid_class);
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
- unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
+ __unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
+ cpu_notifier_register_done();
}
module_init(cpuid_init);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 13ad89971d4..507de806659 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -7,7 +7,6 @@
*
*/
-#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/smp.h>
@@ -16,6 +15,7 @@
#include <linux/delay.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
+#include <linux/module.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
@@ -30,15 +30,34 @@
int in_crash_kexec;
+/*
+ * This is used to VMCLEAR all VMCSs loaded on the
+ * processor. And when loading kvm_intel module, the
+ * callback function pointer will be assigned.
+ *
+ * protected by rcu.
+ */
+crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
+EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
+
+static inline void cpu_crash_vmclear_loaded_vmcss(void)
+{
+ crash_vmclear_fn *do_vmclear_operation = NULL;
+
+ rcu_read_lock();
+ do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
+ if (do_vmclear_operation)
+ do_vmclear_operation();
+ rcu_read_unlock();
+}
+
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
struct pt_regs fixed_regs;
-#endif
-#ifdef CONFIG_X86_32
if (!user_mode_vm(regs)) {
crash_fixup_ss_esp(&fixed_regs, regs);
regs = &fixed_regs;
@@ -46,6 +65,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
#endif
crash_save_cpu(regs, cpu);
+ /*
+ * VMCLEAR VMCSs loaded on all cpus if needed.
+ */
+ cpu_crash_vmclear_loaded_vmcss();
+
/* Disable VMX or SVM if needed.
*
* We need to disable virtualization on all CPUs.
@@ -88,6 +112,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
kdump_nmi_shootdown_cpus();
+ /*
+ * VMCLEAR VMCSs loaded on this cpu if needed.
+ */
+ cpu_crash_vmclear_loaded_vmcss();
+
/* Booting kdump kernel with VMX or SVM enabled won't work,
* because (among other limitations) we can't disable paging
* with the virt flags.
@@ -95,10 +124,12 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
cpu_emergency_vmxoff();
cpu_emergency_svm_disable();
- lapic_shutdown();
-#if defined(CONFIG_X86_IO_APIC)
+#ifdef CONFIG_X86_IO_APIC
+ /* Prevent crash_kexec() from deadlocking on ioapic_lock. */
+ ioapic_zap_locks();
disable_IO_APIC();
#endif
+ lapic_shutdown();
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 642f75a68cd..11891ca7b71 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -62,16 +62,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
if (!userbuf) {
memcpy(buf, (vaddr + offset), csize);
- kunmap_atomic(vaddr, KM_PTE0);
+ kunmap_atomic(vaddr);
} else {
if (!kdump_buf_page) {
printk(KERN_WARNING "Kdump: Kdump buffer page not"
" allocated\n");
- kunmap_atomic(vaddr, KM_PTE0);
+ kunmap_atomic(vaddr);
return -EFAULT;
}
copy_page(kdump_buf_page, vaddr);
- kunmap_atomic(vaddr, KM_PTE0);
+ kunmap_atomic(vaddr);
if (copy_to_user(buf, (kdump_buf_page + offset), csize))
return -EFAULT;
}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 52821799a70..7db54b5d5f8 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -4,6 +4,7 @@
#include <linux/bootmem.h>
#include <linux/export.h>
#include <linux/io.h>
+#include <linux/irqdomain.h>
#include <linux/interrupt.h>
#include <linux/list.h>
#include <linux/of.h>
@@ -17,74 +18,15 @@
#include <linux/initrd.h>
#include <asm/hpet.h>
-#include <asm/irq_controller.h>
#include <asm/apic.h>
#include <asm/pci_x86.h>
+#include <asm/setup.h>
__initdata u64 initial_dtb;
char __initdata cmd_line[COMMAND_LINE_SIZE];
-static LIST_HEAD(irq_domains);
-static DEFINE_RAW_SPINLOCK(big_irq_lock);
int __initdata of_ioapic;
-#ifdef CONFIG_X86_IO_APIC
-static void add_interrupt_host(struct irq_domain *ih)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&big_irq_lock, flags);
- list_add(&ih->l, &irq_domains);
- raw_spin_unlock_irqrestore(&big_irq_lock, flags);
-}
-#endif
-
-static struct irq_domain *get_ih_from_node(struct device_node *controller)
-{
- struct irq_domain *ih, *found = NULL;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&big_irq_lock, flags);
- list_for_each_entry(ih, &irq_domains, l) {
- if (ih->controller == controller) {
- found = ih;
- break;
- }
- }
- raw_spin_unlock_irqrestore(&big_irq_lock, flags);
- return found;
-}
-
-unsigned int irq_create_of_mapping(struct device_node *controller,
- const u32 *intspec, unsigned int intsize)
-{
- struct irq_domain *ih;
- u32 virq, type;
- int ret;
-
- ih = get_ih_from_node(controller);
- if (!ih)
- return 0;
- ret = ih->xlate(ih, intspec, intsize, &virq, &type);
- if (ret)
- return 0;
- if (type == IRQ_TYPE_NONE)
- return virq;
- irq_set_irq_type(virq, type);
- return virq;
-}
-EXPORT_SYMBOL_GPL(irq_create_of_mapping);
-
-unsigned long pci_address_to_pio(phys_addr_t address)
-{
- /*
- * The ioport address can be directly used by inX / outX
- */
- BUG_ON(address >= (1 << 16));
- return (unsigned long)address;
-}
-EXPORT_SYMBOL_GPL(pci_address_to_pio);
-
void __init early_init_dt_scan_chosen_arch(unsigned long node)
{
BUG();
@@ -100,16 +42,6 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init early_init_dt_setup_initrd_arch(unsigned long start,
- unsigned long end)
-{
- initrd_start = (unsigned long)__va(start);
- initrd_end = (unsigned long)__va(end);
- initrd_below_start_ok = 1;
-}
-#endif
-
void __init add_dtb(u64 data)
{
initial_dtb = data + offsetof(struct setup_data, data);
@@ -155,7 +87,6 @@ struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
static int x86_of_pci_irq_enable(struct pci_dev *dev)
{
- struct of_irq oirq;
u32 virq;
int ret;
u8 pin;
@@ -166,12 +97,7 @@ static int x86_of_pci_irq_enable(struct pci_dev *dev)
if (!pin)
return 0;
- ret = of_irq_map_pci(dev, &oirq);
- if (ret)
- return ret;
-
- virq = irq_create_of_mapping(oirq.controller, oirq.specifier,
- oirq.size);
+ virq = of_irq_parse_and_map_pci(dev, 0, 0);
if (virq == 0)
return -EINVAL;
dev->irq = virq;
@@ -182,7 +108,7 @@ static void x86_of_pci_irq_disable(struct pci_dev *dev)
{
}
-void __cpuinit x86_of_pci_init(void)
+void x86_of_pci_init(void)
{
pcibios_enable_irq = x86_of_pci_irq_enable;
pcibios_disable_irq = x86_of_pci_irq_disable;
@@ -280,32 +206,23 @@ static void __init dtb_apic_setup(void)
static void __init x86_flattree_get_config(void)
{
u32 size, map_len;
- void *new_dtb;
+ void *dt;
if (!initial_dtb)
return;
- map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK),
- (u64)sizeof(struct boot_param_header));
+ map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
- initial_boot_params = early_memremap(initial_dtb, map_len);
- size = be32_to_cpu(initial_boot_params->totalsize);
+ initial_boot_params = dt = early_memremap(initial_dtb, map_len);
+ size = of_get_flat_dt_size();
if (map_len < size) {
- early_iounmap(initial_boot_params, map_len);
- initial_boot_params = early_memremap(initial_dtb, size);
+ early_iounmap(dt, map_len);
+ initial_boot_params = dt = early_memremap(initial_dtb, size);
map_len = size;
}
- new_dtb = alloc_bootmem(size);
- memcpy(new_dtb, initial_boot_params, size);
- early_iounmap(initial_boot_params, map_len);
-
- initial_boot_params = new_dtb;
-
- /* root level address cells */
- of_scan_flat_dt(early_init_dt_scan_root, NULL);
-
- unflatten_device_tree();
+ unflatten_and_copy_device_tree();
+ early_iounmap(dt, map_len);
}
#else
static inline void x86_flattree_get_config(void) { }
@@ -354,34 +271,80 @@ static struct of_ioapic_type of_ioapic_type[] =
},
};
-static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
- u32 *out_hwirq, u32 *out_type)
+static int ioapic_xlate(struct irq_domain *domain,
+ struct device_node *controller,
+ const u32 *intspec, u32 intsize,
+ irq_hw_number_t *out_hwirq, u32 *out_type)
{
- struct mp_ioapic_gsi *gsi_cfg;
struct io_apic_irq_attr attr;
struct of_ioapic_type *it;
- u32 line, idx, type;
+ u32 line, idx;
+ int rc;
- if (intsize < 2)
+ if (WARN_ON(intsize < 2))
return -EINVAL;
- line = *intspec;
- idx = (u32) id->priv;
- gsi_cfg = mp_ioapic_gsi_routing(idx);
- *out_hwirq = line + gsi_cfg->gsi_base;
+ line = intspec[0];
- intspec++;
- type = *intspec;
-
- if (type >= ARRAY_SIZE(of_ioapic_type))
+ if (intspec[1] >= ARRAY_SIZE(of_ioapic_type))
return -EINVAL;
- it = of_ioapic_type + type;
- *out_type = it->out_type;
+ it = &of_ioapic_type[intspec[1]];
+ idx = (u32) domain->host_data;
set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
- return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr);
+ rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line),
+ cpu_to_node(0), &attr);
+ if (rc)
+ return rc;
+
+ *out_hwirq = line;
+ *out_type = it->out_type;
+ return 0;
+}
+
+const struct irq_domain_ops ioapic_irq_domain_ops = {
+ .xlate = ioapic_xlate,
+};
+
+static void dt_add_ioapic_domain(unsigned int ioapic_num,
+ struct device_node *np)
+{
+ struct irq_domain *id;
+ struct mp_ioapic_gsi *gsi_cfg;
+ int ret;
+ int num;
+
+ gsi_cfg = mp_ioapic_gsi_routing(ioapic_num);
+ num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
+
+ id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops,
+ (void *)ioapic_num);
+ BUG_ON(!id);
+ if (gsi_cfg->gsi_base == 0) {
+ /*
+ * The first NR_IRQS_LEGACY irq descs are allocated in
+ * early_irq_init() and need just a mapping. The
+ * remaining irqs need both. All of them are preallocated
+ * and assigned so we can keep the 1:1 mapping which the ioapic
+ * is having.
+ */
+ irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY);
+
+ if (num > NR_IRQS_LEGACY) {
+ ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY,
+ NR_IRQS_LEGACY, num - NR_IRQS_LEGACY);
+ if (ret)
+ pr_err("Error creating mapping for the "
+ "remaining IRQs: %d\n", ret);
+ }
+ irq_set_default_host(id);
+ } else {
+ ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num);
+ if (ret)
+ pr_err("Error creating IRQ mapping: %d\n", ret);
+ }
}
static void __init ioapic_add_ofnode(struct device_node *np)
@@ -398,14 +361,7 @@ static void __init ioapic_add_ofnode(struct device_node *np)
for (i = 0; i < nr_ioapics; i++) {
if (r.start == mpc_ioapic_addr(i)) {
- struct irq_domain *id;
-
- id = kzalloc(sizeof(*id), GFP_KERNEL);
- BUG_ON(!id);
- id->controller = np;
- id->xlate = ioapic_xlate;
- id->priv = (void *)i;
- add_interrupt_host(id);
+ dt_add_ioapic_domain(i, np);
return;
}
}
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault.c
index 37250fe490b..f6dfd9334b6 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault.c
@@ -1,6 +1,5 @@
#include <linux/mm.h>
#include <linux/sched.h>
-#include <linux/init.h>
#include <linux/init_task.h>
#include <linux/fs.h>
@@ -9,6 +8,8 @@
#include <asm/processor.h>
#include <asm/desc.h>
+#ifdef CONFIG_X86_32
+
#define DOUBLEFAULT_STACKSIZE (1024)
static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
@@ -20,7 +21,7 @@ static void doublefault_fn(void)
struct desc_ptr gdt_desc = {0, 0};
unsigned long gdt, tss;
- store_gdt(&gdt_desc);
+ native_store_gdt(&gdt_desc);
gdt = gdt_desc.address;
printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
@@ -67,3 +68,16 @@ struct tss_struct doublefault_tss __cacheline_aligned = {
.__cr3 = __pa_nodebug(swapper_pg_dir),
}
};
+
+/* dummy for do_double_fault() call */
+void df_debug(struct pt_regs *regs, long error_code) {}
+
+#else /* !CONFIG_X86_32 */
+
+void df_debug(struct pt_regs *regs, long error_code)
+{
+ pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
+ show_regs(regs);
+ panic("Machine halted.");
+}
+#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 4025fe4f928..b74ebc7c440 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -25,10 +25,15 @@ unsigned int code_bytes = 64;
int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
static int die_counter;
-void printk_address(unsigned long address, int reliable)
+static void printk_stack_address(unsigned long address, int reliable)
{
- printk(" [<%p>] %s%pB\n", (void *) address,
- reliable ? "" : "? ", (void *) address);
+ pr_cont(" [<%p>] %s%pB\n",
+ (void *)address, reliable ? "" : "? ", (void *)address);
+}
+
+void printk_address(unsigned long address)
+{
+ pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address);
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -37,13 +42,16 @@ print_ftrace_graph_addr(unsigned long addr, void *data,
const struct stacktrace_ops *ops,
struct thread_info *tinfo, int *graph)
{
- struct task_struct *task = tinfo->task;
+ struct task_struct *task;
unsigned long ret_addr;
- int index = task->curr_ret_stack;
+ int index;
if (addr != (unsigned long)return_to_handler)
return;
+ task = tinfo->task;
+ index = task->curr_ret_stack;
+
if (!task->ret_stack || index < *graph)
return;
@@ -148,7 +156,7 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
{
touch_nmi_watchdog();
printk(data);
- printk_address(addr, reliable);
+ printk_stack_address(addr, reliable);
}
static const struct stacktrace_ops print_trace_ops = {
@@ -173,32 +181,26 @@ void show_trace(struct task_struct *task, struct pt_regs *regs,
void show_stack(struct task_struct *task, unsigned long *sp)
{
- show_stack_log_lvl(task, NULL, sp, 0, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
- unsigned long bp;
+ unsigned long bp = 0;
unsigned long stack;
- bp = stack_frame(current, NULL);
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- show_trace(NULL, NULL, &stack, bp);
+ /*
+ * Stack frames below this one aren't interesting. Don't show them
+ * if we're printing for %current.
+ */
+ if (!sp && (!task || task == current)) {
+ sp = &stack;
+ bp = stack_frame(current, NULL);
+ }
+
+ show_stack_log_lvl(task, NULL, sp, bp, "");
}
-EXPORT_SYMBOL(dump_stack);
static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static int die_owner = -1;
static unsigned int die_nest_count;
-unsigned __kprobes long oops_begin(void)
+unsigned long oops_begin(void)
{
int cpu;
unsigned long flags;
@@ -221,15 +223,16 @@ unsigned __kprobes long oops_begin(void)
return flags;
}
EXPORT_SYMBOL_GPL(oops_begin);
+NOKPROBE_SYMBOL(oops_begin);
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
if (regs && kexec_should_crash(current))
crash_kexec(regs);
bust_spinlocks(0);
die_owner = -1;
- add_taint(TAINT_DIE);
+ add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
die_nest_count--;
if (!die_nest_count)
/* Nest count reaches zero, release the lock. */
@@ -245,8 +248,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
panic("Fatal exception");
do_exit(signr);
}
+NOKPROBE_SYMBOL(oops_end);
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+int __die(const char *str, struct pt_regs *regs, long err)
{
#ifdef CONFIG_X86_32
unsigned short ss;
@@ -265,10 +269,11 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
#endif
printk("\n");
if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
return 1;
- show_registers(regs);
+ print_modules();
+ show_regs(regs);
#ifdef CONFIG_X86_32
if (user_mode_vm(regs)) {
sp = regs->sp;
@@ -283,11 +288,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
#else
/* Executive summary in case the oops scrolled away */
printk(KERN_ALERT "RIP ");
- printk_address(regs->ip, 1);
+ printk_address(regs->ip);
printk(" RSP <%016lx>\n", regs->sp);
#endif
return 0;
}
+NOKPROBE_SYMBOL(__die);
/*
* This is gone through when something in the kernel has done something bad
@@ -308,16 +314,33 @@ void die(const char *str, struct pt_regs *regs, long err)
static int __init kstack_setup(char *s)
{
+ ssize_t ret;
+ unsigned long val;
+
if (!s)
return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+
+ ret = kstrtoul(s, 0, &val);
+ if (ret)
+ return ret;
+ kstack_depth_to_print = val;
return 0;
}
early_param("kstack", kstack_setup);
static int __init code_bytes_setup(char *s)
{
- code_bytes = simple_strtoul(s, NULL, 0);
+ ssize_t ret;
+ unsigned long val;
+
+ if (!s)
+ return -EINVAL;
+
+ ret = kstrtoul(s, 0, &val);
+ if (ret)
+ return ret;
+
+ code_bytes = val;
if (code_bytes > 8192)
code_bytes = 8192;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index c99f9ed013d..5abd4cd4230 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,12 +16,35 @@
#include <asm/stacktrace.h>
+static void *is_irq_stack(void *p, void *irq)
+{
+ if (p < irq || p >= (irq + THREAD_SIZE))
+ return NULL;
+ return irq + THREAD_SIZE;
+}
+
+
+static void *is_hardirq_stack(unsigned long *stack, int cpu)
+{
+ void *irq = per_cpu(hardirq_stack, cpu);
+
+ return is_irq_stack(stack, irq);
+}
+
+static void *is_softirq_stack(unsigned long *stack, int cpu)
+{
+ void *irq = per_cpu(softirq_stack, cpu);
+
+ return is_irq_stack(stack, irq);
+}
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
{
+ const unsigned cpu = get_cpu();
int graph = 0;
+ u32 *prev_esp;
if (!task)
task = current;
@@ -30,7 +53,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long dummy;
stack = &dummy;
- if (task && task != current)
+ if (task != current)
stack = (unsigned long *)task->thread.sp;
}
@@ -39,18 +62,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
for (;;) {
struct thread_info *context;
+ void *end_stack;
+
+ end_stack = is_hardirq_stack(stack, cpu);
+ if (!end_stack)
+ end_stack = is_softirq_stack(stack, cpu);
- context = (struct thread_info *)
- ((unsigned long)stack & (~(THREAD_SIZE - 1)));
- bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph);
+ context = task_thread_info(task);
+ bp = ops->walk_stack(context, stack, bp, ops, data,
+ end_stack, &graph);
- stack = (unsigned long *)context->previous_esp;
+ /* Stop if not on irq stack */
+ if (!end_stack)
+ break;
+
+ /* The previous esp is saved on the bottom of the stack */
+ prev_esp = (u32 *)(end_stack - THREAD_SIZE);
+ stack = (unsigned long *)*prev_esp;
if (!stack)
break;
+
if (ops->stack(data, "IRQ") < 0)
break;
touch_nmi_watchdog();
}
+ put_cpu();
}
EXPORT_SYMBOL(dump_trace);
@@ -73,25 +109,22 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (kstack_end(stack))
break;
if (i && ((i % STACKSLOTS_PER_LINE) == 0))
- printk(KERN_CONT "\n");
- printk(KERN_CONT " %08lx", *stack++);
+ pr_cont("\n");
+ pr_cont(" %08lx", *stack++);
touch_nmi_watchdog();
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
-void show_registers(struct pt_regs *regs)
+void show_regs(struct pt_regs *regs)
{
int i;
- print_modules();
- __show_regs(regs, 0);
+ show_regs_print_info(KERN_EMERG);
+ __show_regs(regs, !user_mode_vm(regs));
- printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
- TASK_COMM_LEN, current->comm, task_pid_nr(current),
- current_thread_info(), current, task_thread_info(current));
/*
* When in-kernel, we also print out the stack and code at the
* time of the fault..
@@ -102,10 +135,10 @@ void show_registers(struct pt_regs *regs)
unsigned char c;
u8 *ip;
- printk(KERN_EMERG "Stack:\n");
+ pr_emerg("Stack:\n");
show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
- printk(KERN_EMERG "Code: ");
+ pr_emerg("Code:");
ip = (u8 *)regs->ip - code_prologue;
if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
@@ -116,16 +149,16 @@ void show_registers(struct pt_regs *regs)
for (i = 0; i < code_len; i++, ip++) {
if (ip < (u8 *)PAGE_OFFSET ||
probe_kernel_address(ip, c)) {
- printk(KERN_CONT " Bad EIP value.");
+ pr_cont(" Bad EIP value.");
break;
}
if (ip == (u8 *)regs->ip)
- printk(KERN_CONT "<%02x> ", c);
+ pr_cont(" <%02x>", c);
else
- printk(KERN_CONT "%02x ", c);
+ pr_cont(" %02x", c);
}
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 17107bd6e1f..1abcb50b48a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -104,6 +104,44 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
return (stack >= irq_stack && stack < irq_stack_end);
}
+static const unsigned long irq_stack_size =
+ (IRQ_STACK_SIZE - 64) / sizeof(unsigned long);
+
+enum stack_type {
+ STACK_IS_UNKNOWN,
+ STACK_IS_NORMAL,
+ STACK_IS_EXCEPTION,
+ STACK_IS_IRQ,
+};
+
+static enum stack_type
+analyze_stack(int cpu, struct task_struct *task, unsigned long *stack,
+ unsigned long **stack_end, unsigned long *irq_stack,
+ unsigned *used, char **id)
+{
+ unsigned long addr;
+
+ addr = ((unsigned long)stack & (~(THREAD_SIZE - 1)));
+ if ((unsigned long)task_stack_page(task) == addr)
+ return STACK_IS_NORMAL;
+
+ *stack_end = in_exception_stack(cpu, (unsigned long)stack,
+ used, id);
+ if (*stack_end)
+ return STACK_IS_EXCEPTION;
+
+ if (!irq_stack)
+ return STACK_IS_NORMAL;
+
+ *stack_end = irq_stack;
+ irq_stack = irq_stack - irq_stack_size;
+
+ if (in_irq_stack(stack, irq_stack, *stack_end))
+ return STACK_IS_IRQ;
+
+ return STACK_IS_UNKNOWN;
+}
+
/*
* x86-64 can have up to three kernel stacks:
* process stack
@@ -116,12 +154,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
- unsigned long *irq_stack_end =
- (unsigned long *)per_cpu(irq_stack_ptr, cpu);
- unsigned used = 0;
struct thread_info *tinfo;
- int graph = 0;
+ unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
unsigned long dummy;
+ unsigned used = 0;
+ int graph = 0;
+ int done = 0;
if (!task)
task = current;
@@ -143,49 +181,61 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
* exceptions
*/
tinfo = task_thread_info(task);
- for (;;) {
+ while (!done) {
+ unsigned long *stack_end;
+ enum stack_type stype;
char *id;
- unsigned long *estack_end;
- estack_end = in_exception_stack(cpu, (unsigned long)stack,
- &used, &id);
- if (estack_end) {
+ stype = analyze_stack(cpu, task, stack, &stack_end,
+ irq_stack, &used, &id);
+
+ /* Default finish unless specified to continue */
+ done = 1;
+
+ switch (stype) {
+
+ /* Break out early if we are on the thread stack */
+ case STACK_IS_NORMAL:
+ break;
+
+ case STACK_IS_EXCEPTION:
+
if (ops->stack(data, id) < 0)
break;
bp = ops->walk_stack(tinfo, stack, bp, ops,
- data, estack_end, &graph);
+ data, stack_end, &graph);
ops->stack(data, "<EOE>");
/*
* We link to the next stack via the
* second-to-last pointer (index -2 to end) in the
* exception stack:
*/
- stack = (unsigned long *) estack_end[-2];
- continue;
- }
- if (irq_stack_end) {
- unsigned long *irq_stack;
- irq_stack = irq_stack_end -
- (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
-
- if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
- if (ops->stack(data, "IRQ") < 0)
- break;
- bp = ops->walk_stack(tinfo, stack, bp,
- ops, data, irq_stack_end, &graph);
- /*
- * We link to the next stack (which would be
- * the process stack normally) the last
- * pointer (index -1 to end) in the IRQ stack:
- */
- stack = (unsigned long *) (irq_stack_end[-1]);
- irq_stack_end = NULL;
- ops->stack(data, "EOI");
- continue;
- }
+ stack = (unsigned long *) stack_end[-2];
+ done = 0;
+ break;
+
+ case STACK_IS_IRQ:
+
+ if (ops->stack(data, "IRQ") < 0)
+ break;
+ bp = ops->walk_stack(tinfo, stack, bp,
+ ops, data, stack_end, &graph);
+ /*
+ * We link to the next stack (which would be
+ * the process stack normally) the last
+ * pointer (index -1 to end) in the IRQ stack:
+ */
+ stack = (unsigned long *) (stack_end[-1]);
+ irq_stack = NULL;
+ ops->stack(data, "EOI");
+ done = 0;
+ break;
+
+ case STACK_IS_UNKNOWN:
+ ops->stack(data, "UNK");
+ break;
}
- break;
}
/*
@@ -228,36 +278,31 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (stack >= irq_stack && stack <= irq_stack_end) {
if (stack == irq_stack_end) {
stack = (unsigned long *) (irq_stack_end[-1]);
- printk(KERN_CONT " <EOI> ");
+ pr_cont(" <EOI> ");
}
} else {
if (((long) stack & (THREAD_SIZE-1)) == 0)
break;
}
if (i && ((i % STACKSLOTS_PER_LINE) == 0))
- printk(KERN_CONT "\n");
- printk(KERN_CONT " %016lx", *stack++);
+ pr_cont("\n");
+ pr_cont(" %016lx", *stack++);
touch_nmi_watchdog();
}
preempt_enable();
- printk(KERN_CONT "\n");
+ pr_cont("\n");
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
-void show_registers(struct pt_regs *regs)
+void show_regs(struct pt_regs *regs)
{
int i;
unsigned long sp;
- const int cpu = smp_processor_id();
- struct task_struct *cur = current;
sp = regs->sp;
- printk("CPU %d ", cpu);
- print_modules();
+ show_regs_print_info(KERN_DEFAULT);
__show_regs(regs, 1);
- printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
- cur->comm, cur->pid, task_thread_info(cur), cur);
/*
* When in-kernel, we also print out the stack and code at the
@@ -284,16 +329,16 @@ void show_registers(struct pt_regs *regs)
for (i = 0; i < code_len; i++, ip++) {
if (ip < (u8 *)PAGE_OFFSET ||
probe_kernel_address(ip, c)) {
- printk(KERN_CONT " Bad RIP value.");
+ pr_cont(" Bad RIP value.");
break;
}
if (ip == (u8 *)regs->ip)
- printk(KERN_CONT "<%02x> ", c);
+ pr_cont("<%02x> ", c);
else
- printk(KERN_CONT "%02x ", c);
+ pr_cont("%02x ", c);
}
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 62d61e9976e..988c00a1f60 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -113,7 +113,9 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
int x = e820x->nr_map;
if (x >= ARRAY_SIZE(e820x->map)) {
- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+ printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n",
+ (unsigned long long) start,
+ (unsigned long long) (start + size - 1));
return;
}
@@ -133,19 +135,19 @@ static void __init e820_print_type(u32 type)
switch (type) {
case E820_RAM:
case E820_RESERVED_KERN:
- printk(KERN_CONT "(usable)");
+ printk(KERN_CONT "usable");
break;
case E820_RESERVED:
- printk(KERN_CONT "(reserved)");
+ printk(KERN_CONT "reserved");
break;
case E820_ACPI:
- printk(KERN_CONT "(ACPI data)");
+ printk(KERN_CONT "ACPI data");
break;
case E820_NVS:
- printk(KERN_CONT "(ACPI NVS)");
+ printk(KERN_CONT "ACPI NVS");
break;
case E820_UNUSABLE:
- printk(KERN_CONT "(unusable)");
+ printk(KERN_CONT "unusable");
break;
default:
printk(KERN_CONT "type %u", type);
@@ -158,10 +160,10 @@ void __init e820_print_map(char *who)
int i;
for (i = 0; i < e820.nr_map; i++) {
- printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+ printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
(unsigned long long) e820.map[i].addr,
(unsigned long long)
- (e820.map[i].addr + e820.map[i].size));
+ (e820.map[i].addr + e820.map[i].size - 1));
e820_print_type(e820.map[i].type);
printk(KERN_CONT "\n");
}
@@ -428,9 +430,8 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
size = ULLONG_MAX - start;
end = start + size;
- printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
- (unsigned long long) start,
- (unsigned long long) end);
+ printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ",
+ (unsigned long long) start, (unsigned long long) (end - 1));
e820_print_type(old_type);
printk(KERN_CONT " ==> ");
e820_print_type(new_type);
@@ -509,9 +510,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
size = ULLONG_MAX - start;
end = start + size;
- printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
- (unsigned long long) start,
- (unsigned long long) end);
+ printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ",
+ (unsigned long long) start, (unsigned long long) (end - 1));
if (checktype)
e820_print_type(old_type);
printk(KERN_CONT "\n");
@@ -567,7 +567,7 @@ void __init update_e820(void)
if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
return;
e820.nr_map = nr_map;
- printk(KERN_INFO "modified physical RAM map:\n");
+ printk(KERN_INFO "e820: modified physical RAM map:\n");
e820_print_map("modified");
}
static void __init update_e820_saved(void)
@@ -637,8 +637,8 @@ __init void e820_setup_gap(void)
if (!found) {
gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
printk(KERN_ERR
- "PCI: Warning: Cannot find a gap in the 32bit address range\n"
- "PCI: Unassigned devices with 32bit resource registers may break!\n");
+ "e820: cannot find a gap in the 32bit address range\n"
+ "e820: PCI devices with unassigned 32bit BARs may break!\n");
}
#endif
@@ -648,8 +648,8 @@ __init void e820_setup_gap(void)
pci_mem_start = gapstart;
printk(KERN_INFO
- "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
- pci_mem_start, gapstart, gapsize);
+ "e820: [mem %#010lx-%#010lx] available for PCI devices\n",
+ gapstart, gapstart + gapsize - 1);
}
/**
@@ -658,16 +658,19 @@ __init void e820_setup_gap(void)
* boot_params.e820_map, others are passed via SETUP_E820_EXT node of
* linked list of struct setup_data, which is parsed here.
*/
-void __init parse_e820_ext(struct setup_data *sdata)
+void __init parse_e820_ext(u64 phys_addr, u32 data_len)
{
int entries;
struct e820entry *extmap;
+ struct setup_data *sdata;
+ sdata = early_memremap(phys_addr, data_len);
entries = sdata->len / sizeof(struct e820entry);
extmap = (struct e820entry *)(sdata->data);
__append_e820_map(extmap, entries);
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
- printk(KERN_INFO "extended physical RAM map:\n");
+ early_iounmap(sdata, data_len);
+ printk(KERN_INFO "e820: extended physical RAM map:\n");
e820_print_map("extended");
}
@@ -734,7 +737,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
if (addr) {
e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
- printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
+ printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n");
update_e820_saved();
}
@@ -784,7 +787,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
if (last_pfn > max_arch_pfn)
last_pfn = max_arch_pfn;
- printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
+ printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
last_pfn, max_arch_pfn);
return last_pfn;
}
@@ -835,7 +838,7 @@ static int __init parse_memopt(char *p)
}
early_param("mem", parse_memopt);
-static int __init parse_memmap_opt(char *p)
+static int __init parse_memmap_one(char *p)
{
char *oldp;
u64 start_at, mem_size;
@@ -877,6 +880,20 @@ static int __init parse_memmap_opt(char *p)
return *p == '\0' ? 0 : -EINVAL;
}
+static int __init parse_memmap_opt(char *str)
+{
+ while (str) {
+ char *k = strchr(str, ',');
+
+ if (k)
+ *k++ = 0;
+
+ parse_memmap_one(str);
+ str = k;
+ }
+
+ return 0;
+}
early_param("memmap", parse_memmap_opt);
void __init finish_e820_parsing(void)
@@ -888,7 +905,7 @@ void __init finish_e820_parsing(void)
early_panic("Invalid user supplied memory map");
e820.nr_map = nr;
- printk(KERN_INFO "user-defined physical RAM map:\n");
+ printk(KERN_INFO "e820: user-defined physical RAM map:\n");
e820_print_map("user");
}
}
@@ -944,7 +961,7 @@ void __init e820_reserve_resources(void)
for (i = 0; i < e820_saved.nr_map; i++) {
struct e820entry *entry = &e820_saved.map[i];
firmware_map_add_early(entry->addr,
- entry->addr + entry->size - 1,
+ entry->addr + entry->size,
e820_type_to_string(entry->type));
}
}
@@ -996,8 +1013,9 @@ void __init e820_reserve_resources_late(void)
end = MAX_RESOURCE_SIZE;
if (start >= end)
continue;
- printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
- start, end);
+ printk(KERN_DEBUG
+ "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n",
+ start, end);
reserve_region_with_split(&iomem_resource, start, end,
"RAM buffer");
}
@@ -1047,7 +1065,7 @@ void __init setup_memory_map(void)
who = x86_init.resources.memory_setup();
memcpy(&e820_saved, &e820, sizeof(struct e820map));
- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+ printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
e820_print_map(who);
}
@@ -1076,6 +1094,9 @@ void __init memblock_x86_fill(void)
memblock_add(ei->addr, ei->size);
}
+ /* throw away partial pages */
+ memblock_trim_memory(PAGE_SIZE);
+
memblock_dump_all();
}
@@ -1099,7 +1120,7 @@ void __init memblock_find_dma_reserve(void)
nr_pages += end_pfn - start_pfn;
}
- for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
+ for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
if (start_pfn < end_pfn)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 3755ef49439..2e1a6853e00 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -12,12 +12,15 @@
#include <linux/pci.h>
#include <linux/acpi.h>
#include <linux/pci_ids.h>
+#include <drm/i915_drm.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
+#include <asm/hpet.h>
#include <asm/iommu.h>
#include <asm/gart.h>
+#include <asm/irq_remapping.h>
static void __init fix_hypertransport_config(int num, int slot, int func)
{
@@ -192,6 +195,377 @@ static void __init ati_bugs_contd(int num, int slot, int func)
}
#endif
+static void __init intel_remapping_check(int num, int slot, int func)
+{
+ u8 revision;
+ u16 device;
+
+ device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
+ revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID);
+
+ /*
+ * Revision <= 13 of all triggering devices id in this quirk
+ * have a problem draining interrupts when irq remapping is
+ * enabled, and should be flagged as broken. Additionally
+ * revision 0x22 of device id 0x3405 has this problem.
+ */
+ if (revision <= 0x13)
+ set_irq_remapping_broken();
+ else if (device == 0x3405 && revision == 0x22)
+ set_irq_remapping_broken();
+}
+
+/*
+ * Systems with Intel graphics controllers set aside memory exclusively
+ * for gfx driver use. This memory is not marked in the E820 as reserved
+ * or as RAM, and so is subject to overlap from E820 manipulation later
+ * in the boot process. On some systems, MMIO space is allocated on top,
+ * despite the efforts of the "RAM buffer" approach, which simply rounds
+ * memory boundaries up to 64M to try to catch space that may decode
+ * as RAM and so is not suitable for MMIO.
+ *
+ * And yes, so far on current devices the base addr is always under 4G.
+ */
+static u32 __init intel_stolen_base(int num, int slot, int func, size_t stolen_size)
+{
+ u32 base;
+
+ /*
+ * For the PCI IDs in this quirk, the stolen base is always
+ * in 0x5c, aka the BDSM register (yes that's really what
+ * it's called).
+ */
+ base = read_pci_config(num, slot, func, 0x5c);
+ base &= ~((1<<20) - 1);
+
+ return base;
+}
+
+#define KB(x) ((x) * 1024UL)
+#define MB(x) (KB (KB (x)))
+#define GB(x) (MB (KB (x)))
+
+static size_t __init i830_tseg_size(void)
+{
+ u8 tmp = read_pci_config_byte(0, 0, 0, I830_ESMRAMC);
+
+ if (!(tmp & TSEG_ENABLE))
+ return 0;
+
+ if (tmp & I830_TSEG_SIZE_1M)
+ return MB(1);
+ else
+ return KB(512);
+}
+
+static size_t __init i845_tseg_size(void)
+{
+ u8 tmp = read_pci_config_byte(0, 0, 0, I845_ESMRAMC);
+
+ if (!(tmp & TSEG_ENABLE))
+ return 0;
+
+ switch (tmp & I845_TSEG_SIZE_MASK) {
+ case I845_TSEG_SIZE_512K:
+ return KB(512);
+ case I845_TSEG_SIZE_1M:
+ return MB(1);
+ default:
+ WARN_ON(1);
+ return 0;
+ }
+}
+
+static size_t __init i85x_tseg_size(void)
+{
+ u8 tmp = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC);
+
+ if (!(tmp & TSEG_ENABLE))
+ return 0;
+
+ return MB(1);
+}
+
+static size_t __init i830_mem_size(void)
+{
+ return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32);
+}
+
+static size_t __init i85x_mem_size(void)
+{
+ return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32);
+}
+
+/*
+ * On 830/845/85x the stolen memory base isn't available in any
+ * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size.
+ */
+static u32 __init i830_stolen_base(int num, int slot, int func, size_t stolen_size)
+{
+ return i830_mem_size() - i830_tseg_size() - stolen_size;
+}
+
+static u32 __init i845_stolen_base(int num, int slot, int func, size_t stolen_size)
+{
+ return i830_mem_size() - i845_tseg_size() - stolen_size;
+}
+
+static u32 __init i85x_stolen_base(int num, int slot, int func, size_t stolen_size)
+{
+ return i85x_mem_size() - i85x_tseg_size() - stolen_size;
+}
+
+static u32 __init i865_stolen_base(int num, int slot, int func, size_t stolen_size)
+{
+ /*
+ * FIXME is the graphics stolen memory region
+ * always at TOUD? Ie. is it always the last
+ * one to be allocated by the BIOS?
+ */
+ return read_pci_config_16(0, 0, 0, I865_TOUD) << 16;
+}
+
+static size_t __init i830_stolen_size(int num, int slot, int func)
+{
+ size_t stolen_size;
+ u16 gmch_ctrl;
+
+ gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
+
+ switch (gmch_ctrl & I830_GMCH_GMS_MASK) {
+ case I830_GMCH_GMS_STOLEN_512:
+ stolen_size = KB(512);
+ break;
+ case I830_GMCH_GMS_STOLEN_1024:
+ stolen_size = MB(1);
+ break;
+ case I830_GMCH_GMS_STOLEN_8192:
+ stolen_size = MB(8);
+ break;
+ case I830_GMCH_GMS_LOCAL:
+ /* local memory isn't part of the normal address space */
+ stolen_size = 0;
+ break;
+ default:
+ return 0;
+ }
+
+ return stolen_size;
+}
+
+static size_t __init gen3_stolen_size(int num, int slot, int func)
+{
+ size_t stolen_size;
+ u16 gmch_ctrl;
+
+ gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
+
+ switch (gmch_ctrl & I855_GMCH_GMS_MASK) {
+ case I855_GMCH_GMS_STOLEN_1M:
+ stolen_size = MB(1);
+ break;
+ case I855_GMCH_GMS_STOLEN_4M:
+ stolen_size = MB(4);
+ break;
+ case I855_GMCH_GMS_STOLEN_8M:
+ stolen_size = MB(8);
+ break;
+ case I855_GMCH_GMS_STOLEN_16M:
+ stolen_size = MB(16);
+ break;
+ case I855_GMCH_GMS_STOLEN_32M:
+ stolen_size = MB(32);
+ break;
+ case I915_GMCH_GMS_STOLEN_48M:
+ stolen_size = MB(48);
+ break;
+ case I915_GMCH_GMS_STOLEN_64M:
+ stolen_size = MB(64);
+ break;
+ case G33_GMCH_GMS_STOLEN_128M:
+ stolen_size = MB(128);
+ break;
+ case G33_GMCH_GMS_STOLEN_256M:
+ stolen_size = MB(256);
+ break;
+ case INTEL_GMCH_GMS_STOLEN_96M:
+ stolen_size = MB(96);
+ break;
+ case INTEL_GMCH_GMS_STOLEN_160M:
+ stolen_size = MB(160);
+ break;
+ case INTEL_GMCH_GMS_STOLEN_224M:
+ stolen_size = MB(224);
+ break;
+ case INTEL_GMCH_GMS_STOLEN_352M:
+ stolen_size = MB(352);
+ break;
+ default:
+ stolen_size = 0;
+ break;
+ }
+
+ return stolen_size;
+}
+
+static size_t __init gen6_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gmch_ctrl >>= SNB_GMCH_GMS_SHIFT;
+ gmch_ctrl &= SNB_GMCH_GMS_MASK;
+
+ return gmch_ctrl << 25; /* 32 MB units */
+}
+
+static size_t __init gen8_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gmch_ctrl >>= BDW_GMCH_GMS_SHIFT;
+ gmch_ctrl &= BDW_GMCH_GMS_MASK;
+ return gmch_ctrl << 25; /* 32 MB units */
+}
+
+static size_t __init chv_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gmch_ctrl >>= SNB_GMCH_GMS_SHIFT;
+ gmch_ctrl &= SNB_GMCH_GMS_MASK;
+
+ /*
+ * 0x0 to 0x10: 32MB increments starting at 0MB
+ * 0x11 to 0x16: 4MB increments starting at 8MB
+ * 0x17 to 0x1d: 4MB increments start at 36MB
+ */
+ if (gmch_ctrl < 0x11)
+ return gmch_ctrl << 25;
+ else if (gmch_ctrl < 0x17)
+ return (gmch_ctrl - 0x11 + 2) << 22;
+ else
+ return (gmch_ctrl - 0x17 + 9) << 22;
+}
+
+struct intel_stolen_funcs {
+ size_t (*size)(int num, int slot, int func);
+ u32 (*base)(int num, int slot, int func, size_t size);
+};
+
+static const struct intel_stolen_funcs i830_stolen_funcs __initconst = {
+ .base = i830_stolen_base,
+ .size = i830_stolen_size,
+};
+
+static const struct intel_stolen_funcs i845_stolen_funcs __initconst = {
+ .base = i845_stolen_base,
+ .size = i830_stolen_size,
+};
+
+static const struct intel_stolen_funcs i85x_stolen_funcs __initconst = {
+ .base = i85x_stolen_base,
+ .size = gen3_stolen_size,
+};
+
+static const struct intel_stolen_funcs i865_stolen_funcs __initconst = {
+ .base = i865_stolen_base,
+ .size = gen3_stolen_size,
+};
+
+static const struct intel_stolen_funcs gen3_stolen_funcs __initconst = {
+ .base = intel_stolen_base,
+ .size = gen3_stolen_size,
+};
+
+static const struct intel_stolen_funcs gen6_stolen_funcs __initconst = {
+ .base = intel_stolen_base,
+ .size = gen6_stolen_size,
+};
+
+static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = {
+ .base = intel_stolen_base,
+ .size = gen8_stolen_size,
+};
+
+static const struct intel_stolen_funcs chv_stolen_funcs __initconst = {
+ .base = intel_stolen_base,
+ .size = chv_stolen_size,
+};
+
+static const struct pci_device_id intel_stolen_ids[] __initconst = {
+ INTEL_I830_IDS(&i830_stolen_funcs),
+ INTEL_I845G_IDS(&i845_stolen_funcs),
+ INTEL_I85X_IDS(&i85x_stolen_funcs),
+ INTEL_I865G_IDS(&i865_stolen_funcs),
+ INTEL_I915G_IDS(&gen3_stolen_funcs),
+ INTEL_I915GM_IDS(&gen3_stolen_funcs),
+ INTEL_I945G_IDS(&gen3_stolen_funcs),
+ INTEL_I945GM_IDS(&gen3_stolen_funcs),
+ INTEL_VLV_M_IDS(&gen6_stolen_funcs),
+ INTEL_VLV_D_IDS(&gen6_stolen_funcs),
+ INTEL_PINEVIEW_IDS(&gen3_stolen_funcs),
+ INTEL_I965G_IDS(&gen3_stolen_funcs),
+ INTEL_G33_IDS(&gen3_stolen_funcs),
+ INTEL_I965GM_IDS(&gen3_stolen_funcs),
+ INTEL_GM45_IDS(&gen3_stolen_funcs),
+ INTEL_G45_IDS(&gen3_stolen_funcs),
+ INTEL_IRONLAKE_D_IDS(&gen3_stolen_funcs),
+ INTEL_IRONLAKE_M_IDS(&gen3_stolen_funcs),
+ INTEL_SNB_D_IDS(&gen6_stolen_funcs),
+ INTEL_SNB_M_IDS(&gen6_stolen_funcs),
+ INTEL_IVB_M_IDS(&gen6_stolen_funcs),
+ INTEL_IVB_D_IDS(&gen6_stolen_funcs),
+ INTEL_HSW_D_IDS(&gen6_stolen_funcs),
+ INTEL_HSW_M_IDS(&gen6_stolen_funcs),
+ INTEL_BDW_M_IDS(&gen8_stolen_funcs),
+ INTEL_BDW_D_IDS(&gen8_stolen_funcs),
+ INTEL_CHV_IDS(&chv_stolen_funcs),
+};
+
+static void __init intel_graphics_stolen(int num, int slot, int func)
+{
+ size_t size;
+ int i;
+ u32 start;
+ u16 device, subvendor, subdevice;
+
+ device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
+ subvendor = read_pci_config_16(num, slot, func,
+ PCI_SUBSYSTEM_VENDOR_ID);
+ subdevice = read_pci_config_16(num, slot, func, PCI_SUBSYSTEM_ID);
+
+ for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) {
+ if (intel_stolen_ids[i].device == device) {
+ const struct intel_stolen_funcs *stolen_funcs =
+ (const struct intel_stolen_funcs *)intel_stolen_ids[i].driver_data;
+ size = stolen_funcs->size(num, slot, func);
+ start = stolen_funcs->base(num, slot, func, size);
+ if (size && start) {
+ printk(KERN_INFO "Reserving Intel graphics stolen memory at 0x%x-0x%x\n",
+ start, start + (u32)size - 1);
+ /* Mark this space as reserved */
+ e820_add_region(start, size, E820_RESERVED);
+ sanitize_e820_map(e820.map,
+ ARRAY_SIZE(e820.map),
+ &e820.nr_map);
+ }
+ return;
+ }
+ }
+}
+
+static void __init force_disable_hpet(int num, int slot, int func)
+{
+#ifdef CONFIG_HPET_TIMER
+ boot_hpet_disable = 1;
+ pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
+#endif
+}
+
+
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -221,6 +595,20 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
+ { PCI_VENDOR_ID_INTEL, 0x3403, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, 0x3405, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
+ QFLAG_APPLY_ONCE, intel_graphics_stolen },
+ /*
+ * HPET on current version of Baytrail platform has accuracy
+ * problems, disable it for now:
+ */
+ { PCI_VENDOR_ID_INTEL, 0x0f00,
+ PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
{}
};
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 9b9f18b4991..01d1c187c9f 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -14,9 +14,11 @@
#include <xen/hvc-console.h>
#include <asm/pci-direct.h>
#include <asm/fixmap.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
#include <asm/pgtable.h>
#include <linux/usb/ehci_def.h>
+#include <linux/efi.h>
+#include <asm/efi.h>
/* Simple VGA output */
#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -169,25 +171,9 @@ static struct console early_serial_console = {
.index = -1,
};
-/* Direct interface for emergencies */
-static struct console *early_console = &early_vga_console;
-static int __initdata early_console_initialized;
-
-asmlinkage void early_printk(const char *fmt, ...)
-{
- char buf[512];
- int n;
- va_list ap;
-
- va_start(ap, fmt);
- n = vscnprintf(buf, sizeof(buf), fmt, ap);
- early_console->write(early_console, buf, n);
- va_end(ap);
-}
-
static inline void early_console_register(struct console *con, int keep_early)
{
- if (early_console->index != -1) {
+ if (con->index != -1) {
printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n",
con->name);
return;
@@ -207,9 +193,8 @@ static int __init setup_early_printk(char *buf)
if (!buf)
return 0;
- if (early_console_initialized)
+ if (early_console)
return 0;
- early_console_initialized = 1;
keep = (strstr(buf, "keep") != NULL);
@@ -251,6 +236,11 @@ static int __init setup_early_printk(char *buf)
early_console_register(&early_hsu_console, keep);
}
#endif
+#ifdef CONFIG_EARLY_PRINTK_EFI
+ if (!strncmp(buf, "efi", 3))
+ early_console_register(&early_efi_console, keep);
+#endif
+
buf++;
}
return 0;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 79d97e68f04..0d0c9d4ab6d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -56,6 +56,8 @@
#include <asm/irq_vectors.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
@@ -98,12 +100,6 @@
#endif
.endm
-#ifdef CONFIG_VM86
-#define resume_userspace_sig check_userspace
-#else
-#define resume_userspace_sig resume_userspace
-#endif
-
/*
* User gs save/restore
*
@@ -157,10 +153,8 @@
.pushsection .fixup, "ax"
99: movl $0, (%esp)
jmp 98b
-.section __ex_table, "a"
- .align 4
- .long 98b, 99b
.popsection
+ _ASM_EXTABLE(98b,99b)
.endm
.macro PTGS_TO_GS
@@ -170,10 +164,8 @@
.pushsection .fixup, "ax"
99: movl $0, PT_GS(%esp)
jmp 98b
-.section __ex_table, "a"
- .align 4
- .long 98b, 99b
.popsection
+ _ASM_EXTABLE(98b,99b)
.endm
.macro GS_TO_REG reg
@@ -255,12 +247,10 @@
jmp 2b
6: movl $0, (%esp)
jmp 3b
-.section __ex_table, "a"
- .align 4
- .long 1b, 4b
- .long 2b, 5b
- .long 3b, 6b
.popsection
+ _ASM_EXTABLE(1b,4b)
+ _ASM_EXTABLE(2b,5b)
+ _ASM_EXTABLE(3b,6b)
POP_GS_EX
.endm
@@ -309,10 +299,21 @@ ENTRY(ret_from_fork)
CFI_ENDPROC
END(ret_from_fork)
-/*
- * Interrupt exit functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
+ENTRY(ret_from_kernel_thread)
+ CFI_STARTPROC
+ pushl_cfi %eax
+ call schedule_tail
+ GET_THREAD_INFO(%ebp)
+ popl_cfi %eax
+ pushl_cfi $0x0202 # Reset kernel eflags
+ popfl_cfi
+ movl PT_EBP(%esp),%eax
+ call *PT_EBX(%esp)
+ movl $0,PT_EAX(%esp)
+ jmp syscall_exit
+ CFI_ENDPROC
+ENDPROC(ret_from_kernel_thread)
+
/*
* Return to user mode is not as complex as all this looks,
* but we want the default path for a system call return to
@@ -327,10 +328,17 @@ ret_from_exception:
preempt_stop(CLBR_ANY)
ret_from_intr:
GET_THREAD_INFO(%ebp)
-check_userspace:
+#ifdef CONFIG_VM86
movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
movb PT_CS(%esp), %al
andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+#else
+ /*
+ * We can be coming here from child spawned by kernel_thread().
+ */
+ movl PT_CS(%esp), %eax
+ andl $SEGMENT_RPL_MASK, %eax
+#endif
cmpl $USER_RPL, %eax
jb resume_kernel # not returning to v8086 or userspace
@@ -350,12 +358,9 @@ END(ret_from_exception)
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
DISABLE_INTERRUPTS(CLBR_ANY)
- cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
- jnz restore_all
need_resched:
- movl TI_flags(%ebp), %ecx # need_resched set ?
- testb $_TIF_NEED_RESCHED, %cl
- jz restore_all
+ cmpl $0,PER_CPU_VAR(__preempt_count)
+ jnz restore_all
testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
jz restore_all
call preempt_schedule_irq
@@ -363,10 +368,6 @@ need_resched:
END(resume_kernel)
#endif
CFI_ENDPROC
-/*
- * End of kprobes section
- */
- .popsection
/* SYSENTER_RETURN points to after the "sysenter" instruction in
the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -410,12 +411,11 @@ sysenter_past_esp:
*/
cmpl $__PAGE_OFFSET-3,%ebp
jae syscall_fault
+ ASM_STAC
1: movl (%ebp),%ebp
+ ASM_CLAC
movl %ebp,PT_EBP(%esp)
-.section __ex_table,"a"
- .align 4
- .long 1b,syscall_fault
-.previous
+ _ASM_EXTABLE(1b,syscall_fault)
GET_THREAD_INFO(%ebp)
@@ -423,8 +423,9 @@ sysenter_past_esp:
jnz sysenter_audit
sysenter_do_call:
cmpl $(NR_syscalls), %eax
- jae syscall_badsys
+ jae sysenter_badsys
call *sys_call_table(,%eax,4)
+sysenter_after_call:
movl %eax,PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
@@ -482,20 +483,15 @@ sysexit_audit:
.pushsection .fixup,"ax"
2: movl $0,PT_FS(%esp)
jmp 1b
-.section __ex_table,"a"
- .align 4
- .long 1b,2b
.popsection
+ _ASM_EXTABLE(1b,2b)
PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
-/*
- * syscall stub including irq exit should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
# system call handler stub
ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
+ ASM_CLAC
pushl_cfi %eax # save orig_eax
SAVE_ALL
GET_THREAD_INFO(%ebp)
@@ -506,6 +502,7 @@ ENTRY(system_call)
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
+syscall_after_call:
movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
@@ -520,6 +517,7 @@ syscall_exit:
restore_all:
TRACE_IRQS_IRET
restore_all_notrace:
+#ifdef CONFIG_X86_ESPFIX32
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
# are returning to the kernel.
@@ -530,6 +528,7 @@ restore_all_notrace:
cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
+#endif
restore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
irq_return:
@@ -540,18 +539,11 @@ ENTRY(iret_exc)
pushl $do_iret_error
jmp error_code
.previous
-.section __ex_table,"a"
- .align 4
- .long irq_return,iret_exc
-.previous
+ _ASM_EXTABLE(irq_return,iret_exc)
+#ifdef CONFIG_X86_ESPFIX32
CFI_RESTORE_STATE
ldt_ss:
- larl PT_OLDSS(%esp), %eax
- jnz restore_nocheck
- testl $0x00400000, %eax # returning to 32bit stack?
- jnz restore_nocheck # allright, normal return
-
#ifdef CONFIG_PARAVIRT
/*
* The kernel can't run on a non-flat stack if paravirt mode
@@ -593,6 +585,7 @@ ldt_ss:
lss (%esp), %esp /* switch to espfix segment */
CFI_ADJUST_CFA_OFFSET -8
jmp restore_nocheck
+#endif
CFI_ENDPROC
ENDPROC(system_call)
@@ -623,26 +616,29 @@ work_notifysig: # deal with pending signals and
movl %esp, %eax
jne work_notifysig_v86 # returning to kernel-space or
# vm86-space
+1:
+#else
+ movl %esp, %eax
+#endif
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
+ movb PT_CS(%esp), %bl
+ andb $SEGMENT_RPL_MASK, %bl
+ cmpb $USER_RPL, %bl
+ jb resume_kernel
xorl %edx, %edx
call do_notify_resume
- jmp resume_userspace_sig
+ jmp resume_userspace
+#ifdef CONFIG_VM86
ALIGN
work_notifysig_v86:
pushl_cfi %ecx # save ti_flags for do_notify_resume
call save_v86_state # %eax contains pt_regs pointer
popl_cfi %ecx
movl %eax, %esp
-#else
- movl %esp, %eax
+ jmp 1b
#endif
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %edx, %edx
- call do_notify_resume
- jmp resume_userspace_sig
END(work_pending)
# perform syscall exit tracing
@@ -673,85 +669,22 @@ END(syscall_exit_work)
RING0_INT_FRAME # can't unwind into user space anyway
syscall_fault:
+ ASM_CLAC
GET_THREAD_INFO(%ebp)
movl $-EFAULT,PT_EAX(%esp)
jmp resume_userspace
END(syscall_fault)
syscall_badsys:
- movl $-ENOSYS,PT_EAX(%esp)
- jmp resume_userspace
+ movl $-ENOSYS,%eax
+ jmp syscall_after_call
END(syscall_badsys)
- CFI_ENDPROC
-/*
- * End of kprobes section
- */
- .popsection
-/*
- * System calls that need a pt_regs pointer.
- */
-#define PTREGSCALL0(name) \
-ENTRY(ptregs_##name) ; \
- leal 4(%esp),%eax; \
- jmp sys_##name; \
-ENDPROC(ptregs_##name)
-
-#define PTREGSCALL1(name) \
-ENTRY(ptregs_##name) ; \
- leal 4(%esp),%edx; \
- movl (PT_EBX+4)(%esp),%eax; \
- jmp sys_##name; \
-ENDPROC(ptregs_##name)
-
-#define PTREGSCALL2(name) \
-ENTRY(ptregs_##name) ; \
- leal 4(%esp),%ecx; \
- movl (PT_ECX+4)(%esp),%edx; \
- movl (PT_EBX+4)(%esp),%eax; \
- jmp sys_##name; \
-ENDPROC(ptregs_##name)
-
-#define PTREGSCALL3(name) \
-ENTRY(ptregs_##name) ; \
- CFI_STARTPROC; \
- leal 4(%esp),%eax; \
- pushl_cfi %eax; \
- movl PT_EDX(%eax),%ecx; \
- movl PT_ECX(%eax),%edx; \
- movl PT_EBX(%eax),%eax; \
- call sys_##name; \
- addl $4,%esp; \
- CFI_ADJUST_CFA_OFFSET -4; \
- ret; \
- CFI_ENDPROC; \
-ENDPROC(ptregs_##name)
-
-PTREGSCALL1(iopl)
-PTREGSCALL0(fork)
-PTREGSCALL0(vfork)
-PTREGSCALL3(execve)
-PTREGSCALL2(sigaltstack)
-PTREGSCALL0(sigreturn)
-PTREGSCALL0(rt_sigreturn)
-PTREGSCALL2(vm86)
-PTREGSCALL1(vm86old)
-
-/* Clone is an oddball. The 4th arg is in %edi */
-ENTRY(ptregs_clone)
- CFI_STARTPROC
- leal 4(%esp),%eax
- pushl_cfi %eax
- pushl_cfi PT_EDI(%eax)
- movl PT_EDX(%eax),%ecx
- movl PT_ECX(%eax),%edx
- movl PT_EBX(%eax),%eax
- call sys_clone
- addl $8,%esp
- CFI_ADJUST_CFA_OFFSET -8
- ret
+sysenter_badsys:
+ movl $-ENOSYS,%eax
+ jmp sysenter_after_call
+END(syscall_badsys)
CFI_ENDPROC
-ENDPROC(ptregs_clone)
.macro FIXUP_ESPFIX_STACK
/*
@@ -761,6 +694,7 @@ ENDPROC(ptregs_clone)
* the high word of the segment base from the GDT and swiches to the
* normal stack and adjusts ESP with the matching offset.
*/
+#ifdef CONFIG_X86_ESPFIX32
/* fixup the stack */
mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
@@ -770,8 +704,10 @@ ENDPROC(ptregs_clone)
pushl_cfi %eax
lss (%esp), %esp /* switch to the normal stack segment */
CFI_ADJUST_CFA_OFFSET -8
+#endif
.endm
.macro UNWIND_ESPFIX_STACK
+#ifdef CONFIG_X86_ESPFIX32
movl %ss, %eax
/* see if on espfix stack */
cmpw $__ESPFIX_SS, %ax
@@ -782,6 +718,7 @@ ENDPROC(ptregs_clone)
/* switch to normal stack */
FIXUP_ESPFIX_STACK
27:
+#endif
.endm
/*
@@ -828,6 +765,7 @@ END(interrupt)
*/
.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
+ ASM_CLAC
addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
SAVE_ALL
TRACE_IRQS_OFF
@@ -837,13 +775,10 @@ common_interrupt:
ENDPROC(common_interrupt)
CFI_ENDPROC
-/*
- * Irq entries should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
+ ASM_CLAC; \
pushl_cfi $~(nr); \
SAVE_ALL; \
TRACE_IRQS_OFF \
@@ -853,13 +788,24 @@ ENTRY(name) \
CFI_ENDPROC; \
ENDPROC(name)
-#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)
+
+#ifdef CONFIG_TRACING
+#define TRACE_BUILD_INTERRUPT(name, nr) \
+ BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
+#else
+#define TRACE_BUILD_INTERRUPT(name, nr)
+#endif
+
+#define BUILD_INTERRUPT(name, nr) \
+ BUILD_INTERRUPT3(name, nr, smp_##name); \
+ TRACE_BUILD_INTERRUPT(name, nr)
/* The include is where all of the SMP etc. interrupts come from */
#include <asm/entry_arch.h>
ENTRY(coprocessor_error)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
pushl_cfi $do_coprocessor_error
jmp error_code
@@ -868,6 +814,7 @@ END(coprocessor_error)
ENTRY(simd_coprocessor_error)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
#ifdef CONFIG_X86_INVD_BUG
/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
@@ -889,6 +836,7 @@ END(simd_coprocessor_error)
ENTRY(device_not_available)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $-1 # mark this as an int
pushl_cfi $do_device_not_available
jmp error_code
@@ -898,10 +846,7 @@ END(device_not_available)
#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
iret
-.section __ex_table,"a"
- .align 4
- .long native_iret, iret_exc
-.previous
+ _ASM_EXTABLE(native_iret, iret_exc)
END(native_iret)
ENTRY(native_irq_enable_sysexit)
@@ -912,6 +857,7 @@ END(native_irq_enable_sysexit)
ENTRY(overflow)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
pushl_cfi $do_overflow
jmp error_code
@@ -920,6 +866,7 @@ END(overflow)
ENTRY(bounds)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
pushl_cfi $do_bounds
jmp error_code
@@ -928,6 +875,7 @@ END(bounds)
ENTRY(invalid_op)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
pushl_cfi $do_invalid_op
jmp error_code
@@ -936,6 +884,7 @@ END(invalid_op)
ENTRY(coprocessor_segment_overrun)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
pushl_cfi $do_coprocessor_segment_overrun
jmp error_code
@@ -944,6 +893,7 @@ END(coprocessor_segment_overrun)
ENTRY(invalid_TSS)
RING0_EC_FRAME
+ ASM_CLAC
pushl_cfi $do_invalid_TSS
jmp error_code
CFI_ENDPROC
@@ -951,6 +901,7 @@ END(invalid_TSS)
ENTRY(segment_not_present)
RING0_EC_FRAME
+ ASM_CLAC
pushl_cfi $do_segment_not_present
jmp error_code
CFI_ENDPROC
@@ -958,6 +909,7 @@ END(segment_not_present)
ENTRY(stack_segment)
RING0_EC_FRAME
+ ASM_CLAC
pushl_cfi $do_stack_segment
jmp error_code
CFI_ENDPROC
@@ -965,6 +917,7 @@ END(stack_segment)
ENTRY(alignment_check)
RING0_EC_FRAME
+ ASM_CLAC
pushl_cfi $do_alignment_check
jmp error_code
CFI_ENDPROC
@@ -972,6 +925,7 @@ END(alignment_check)
ENTRY(divide_error)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0 # no error code
pushl_cfi $do_divide_error
jmp error_code
@@ -981,6 +935,7 @@ END(divide_error)
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
pushl_cfi machine_check_vector
jmp error_code
@@ -990,25 +945,12 @@ END(machine_check)
ENTRY(spurious_interrupt_bug)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $0
pushl_cfi $do_spurious_interrupt_bug
jmp error_code
CFI_ENDPROC
END(spurious_interrupt_bug)
-/*
- * End of kprobes section
- */
- .popsection
-
-ENTRY(kernel_thread_helper)
- pushl $0 # fake return address for unwinder
- CFI_STARTPROC
- movl %edi,%eax
- call *%esi
- call do_exit
- ud2 # padding for call trace
- CFI_ENDPROC
-ENDPROC(kernel_thread_helper)
#ifdef CONFIG_XEN
/* Xen doesn't set %esp to be precisely what the normal sysenter
@@ -1022,7 +964,7 @@ ENTRY(xen_sysenter_target)
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
- pushl_cfi $0
+ pushl_cfi $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
TRACE_IRQS_OFF
@@ -1064,14 +1006,15 @@ ENTRY(xen_failsafe_callback)
2: mov 8(%esp),%es
3: mov 12(%esp),%fs
4: mov 16(%esp),%gs
+ /* EAX == 0 => Category 1 (Bad segment)
+ EAX != 0 => Category 2 (Bad IRET) */
testl %eax,%eax
popl_cfi %eax
lea 16(%esp),%esp
CFI_ADJUST_CFA_OFFSET -16
jz 5f
- addl $16,%esp
- jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
-5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment)
+ jmp iret_exc
+5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
jmp ret_from_exception
CFI_ENDPROC
@@ -1090,20 +1033,24 @@ ENTRY(xen_failsafe_callback)
movl %eax,16(%esp)
jmp 4b
.previous
-.section __ex_table,"a"
- .align 4
- .long 1b,6b
- .long 2b,7b
- .long 3b,8b
- .long 4b,9b
-.previous
+ _ASM_EXTABLE(1b,6b)
+ _ASM_EXTABLE(2b,7b)
+ _ASM_EXTABLE(3b,8b)
+ _ASM_EXTABLE(4b,9b)
ENDPROC(xen_failsafe_callback)
-BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
+BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
xen_evtchn_do_upcall)
#endif /* CONFIG_XEN */
+#if IS_ENABLED(CONFIG_HYPERV)
+
+BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
+ hyperv_vector_handler)
+
+#endif /* CONFIG_HYPERV */
+
#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -1118,17 +1065,21 @@ ENTRY(ftrace_caller)
pushl %eax
pushl %ecx
pushl %edx
- movl 0xc(%esp), %eax
+ pushl $0 /* Pass NULL as regs pointer */
+ movl 4*4(%esp), %eax
movl 0x4(%ebp), %edx
+ movl function_trace_op, %ecx
subl $MCOUNT_INSN_SIZE, %eax
.globl ftrace_call
ftrace_call:
call ftrace_stub
+ addl $4,%esp /* skip NULL pointer */
popl %edx
popl %ecx
popl %eax
+ftrace_ret:
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
.globl ftrace_graph_call
ftrace_graph_call:
@@ -1140,9 +1091,77 @@ ftrace_stub:
ret
END(ftrace_caller)
+ENTRY(ftrace_regs_caller)
+ pushf /* push flags before compare (in cs location) */
+ cmpl $0, function_trace_stop
+ jne ftrace_restore_flags
+
+ /*
+ * i386 does not save SS and ESP when coming from kernel.
+ * Instead, to get sp, &regs->sp is used (see ptrace.h).
+ * Unfortunately, that means eflags must be at the same location
+ * as the current return ip is. We move the return ip into the
+ * ip location, and move flags into the return ip location.
+ */
+ pushl 4(%esp) /* save return ip into ip slot */
+
+ pushl $0 /* Load 0 into orig_ax */
+ pushl %gs
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+
+ movl 13*4(%esp), %eax /* Get the saved flags */
+ movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */
+ /* clobbering return ip */
+ movl $__KERNEL_CS,13*4(%esp)
+
+ movl 12*4(%esp), %eax /* Load ip (1st parameter) */
+ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
+ movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
+ movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
+ pushl %esp /* Save pt_regs as 4th parameter */
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ addl $4, %esp /* Skip pt_regs */
+ movl 14*4(%esp), %eax /* Move flags back into cs */
+ movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */
+ movl 12*4(%esp), %eax /* Get return ip from regs->ip */
+ movl %eax, 14*4(%esp) /* Put return ip back for ret */
+
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+ popl %ds
+ popl %es
+ popl %fs
+ popl %gs
+ addl $8, %esp /* Skip orig_ax and ip */
+ popf /* Pop flags at end (no addl to corrupt flags) */
+ jmp ftrace_ret
+
+ftrace_restore_flags:
+ popf
+ jmp ftrace_stub
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
+ cmpl $__PAGE_OFFSET, %esp
+ jb ftrace_stub /* Paging not enabled yet? */
+
cmpl $0, function_trace_stop
jne ftrace_stub
@@ -1180,9 +1199,6 @@ END(mcount)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
pushl %eax
pushl %ecx
pushl %edx
@@ -1209,13 +1225,19 @@ return_to_handler:
jmp *%ecx
#endif
-/*
- * Some functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
+#ifdef CONFIG_TRACING
+ENTRY(trace_page_fault)
+ RING0_EC_FRAME
+ ASM_CLAC
+ pushl_cfi $trace_do_page_fault
+ jmp error_code
+ CFI_ENDPROC
+END(trace_page_fault)
+#endif
ENTRY(page_fault)
RING0_EC_FRAME
+ ASM_CLAC
pushl_cfi $do_page_fault
ALIGN
error_code:
@@ -1288,6 +1310,7 @@ END(page_fault)
ENTRY(debug)
RING0_INT_FRAME
+ ASM_CLAC
cmpl $ia32_sysenter_target,(%esp)
jne debug_stack_correct
FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
@@ -1312,11 +1335,14 @@ END(debug)
*/
ENTRY(nmi)
RING0_INT_FRAME
+ ASM_CLAC
+#ifdef CONFIG_X86_ESPFIX32
pushl_cfi %eax
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
popl_cfi %eax
je nmi_espfix_stack
+#endif
cmpl $ia32_sysenter_target,(%esp)
je nmi_stack_fixup
pushl_cfi %eax
@@ -1356,6 +1382,7 @@ nmi_debug_stack_check:
FIX_STACK 24, nmi_stack_correct, 1
jmp nmi_stack_correct
+#ifdef CONFIG_X86_ESPFIX32
nmi_espfix_stack:
/* We have a RING0_INT_FRAME here.
*
@@ -1377,11 +1404,13 @@ nmi_espfix_stack:
lss 12+4(%esp), %esp # back to espfix stack
CFI_ADJUST_CFA_OFFSET -24
jmp irq_return
+#endif
CFI_ENDPROC
END(nmi)
ENTRY(int3)
RING0_INT_FRAME
+ ASM_CLAC
pushl_cfi $-1 # mark this as an int
SAVE_ALL
TRACE_IRQS_OFF
@@ -1402,13 +1431,10 @@ END(general_protection)
#ifdef CONFIG_KVM_GUEST
ENTRY(async_page_fault)
RING0_EC_FRAME
+ ASM_CLAC
pushl_cfi $do_async_page_fault
jmp error_code
CFI_ENDPROC
END(async_page_fault)
#endif
-/*
- * End of kprobes section
- */
- .popsection
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3fe8239fd8f..c844f0816ab 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -36,7 +36,7 @@
* - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
* frame that is otherwise undefined after a SYSCALL
* - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
- * - errorentry/paranoidentry/zeroentry - Define exception entry points.
+ * - idtentry - Define exception entry points.
*/
#include <linux/linkage.h>
@@ -53,8 +53,11 @@
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
-#include <asm/ftrace.h>
#include <asm/percpu.h>
+#include <asm/asm.h>
+#include <asm/context_tracking.h>
+#include <asm/smap.h>
+#include <asm/pgtable_types.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -66,107 +69,6 @@
.code64
.section .entry.text, "ax"
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-ENTRY(mcount)
- retq
-END(mcount)
-
-ENTRY(ftrace_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- MCOUNT_SAVE_FRAME
-
- movq 0x38(%rsp), %rdi
- movq 8(%rbp), %rsi
- subq $MCOUNT_INSN_SIZE, %rdi
-
-GLOBAL(ftrace_call)
- call ftrace_stub
-
- MCOUNT_RESTORE_FRAME
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-GLOBAL(ftrace_graph_call)
- jmp ftrace_stub
-#endif
-
-GLOBAL(ftrace_stub)
- retq
-END(ftrace_caller)
-
-#else /* ! CONFIG_DYNAMIC_FTRACE */
-ENTRY(mcount)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- cmpq $ftrace_stub, ftrace_trace_function
- jnz trace
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpq $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-
-GLOBAL(ftrace_stub)
- retq
-
-trace:
- MCOUNT_SAVE_FRAME
-
- movq 0x38(%rsp), %rdi
- movq 8(%rbp), %rsi
- subq $MCOUNT_INSN_SIZE, %rdi
-
- call *ftrace_trace_function
-
- MCOUNT_RESTORE_FRAME
-
- jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- MCOUNT_SAVE_FRAME
-
- leaq 8(%rbp), %rdi
- movq 0x38(%rsp), %rsi
- movq (%rbp), %rdx
- subq $MCOUNT_INSN_SIZE, %rsi
-
- call prepare_ftrace_return
-
- MCOUNT_RESTORE_FRAME
-
- retq
-END(ftrace_graph_caller)
-
-GLOBAL(return_to_handler)
- subq $24, %rsp
-
- /* Save the return values */
- movq %rax, (%rsp)
- movq %rdx, 8(%rsp)
- movq %rbp, %rdi
-
- call ftrace_return_to_handler
-
- movq %rax, %rdi
- movq 8(%rsp), %rdx
- movq (%rsp), %rax
- addq $24, %rsp
- jmp *%rdi
-#endif
-
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
@@ -190,6 +92,44 @@ ENDPROC(native_usergs_sysret64)
.endm
/*
+ * When dynamic function tracer is enabled it will add a breakpoint
+ * to all locations that it is about to modify, sync CPUs, update
+ * all the code, sync CPUs, then remove the breakpoints. In this time
+ * if lockdep is enabled, it might jump back into the debug handler
+ * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
+ *
+ * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
+ * make sure the stack pointer does not get reset back to the top
+ * of the debug stack, and instead just reuses the current stack.
+ */
+#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
+
+.macro TRACE_IRQS_OFF_DEBUG
+ call debug_stack_set_zero
+ TRACE_IRQS_OFF
+ call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_ON_DEBUG
+ call debug_stack_set_zero
+ TRACE_IRQS_ON
+ call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
+ bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
+ jnc 1f
+ TRACE_IRQS_ON_DEBUG
+1:
+.endm
+
+#else
+# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
+# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
+# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
+#endif
+
+/*
* C code is not supposed to know about undefined top of stack. Every time
* a C function with an pt_regs argument is called from the SYSCALL based
* fast path FIXUP_TOP_OF_STACK is needed.
@@ -222,7 +162,7 @@ ENDPROC(native_usergs_sysret64)
/*CFI_REL_OFFSET ss,0*/
pushq_cfi %rax /* rsp */
CFI_REL_OFFSET rsp,0
- pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
+ pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */
/*CFI_REL_OFFSET rflags,0*/
pushq_cfi $__KERNEL_CS /* cs */
/*CFI_REL_OFFSET cs,0*/
@@ -303,15 +243,15 @@ ENDPROC(native_usergs_sysret64)
.macro SAVE_ARGS_IRQ
cld
/* start from rbp in pt_regs and jump over */
- movq_cfi rdi, RDI-RBP
- movq_cfi rsi, RSI-RBP
- movq_cfi rdx, RDX-RBP
- movq_cfi rcx, RCX-RBP
- movq_cfi rax, RAX-RBP
- movq_cfi r8, R8-RBP
- movq_cfi r9, R9-RBP
- movq_cfi r10, R10-RBP
- movq_cfi r11, R11-RBP
+ movq_cfi rdi, (RDI-RBP)
+ movq_cfi rsi, (RSI-RBP)
+ movq_cfi rdx, (RDX-RBP)
+ movq_cfi rcx, (RCX-RBP)
+ movq_cfi rax, (RAX-RBP)
+ movq_cfi r8, (R8-RBP)
+ movq_cfi r9, (R9-RBP)
+ movq_cfi r10, (R10-RBP)
+ movq_cfi r11, (R11-RBP)
/* Save rbp so that we can unwind from get_irq_regs() */
movq_cfi rbp, 0
@@ -320,7 +260,7 @@ ENDPROC(native_usergs_sysret64)
movq %rsp, %rsi
leaq -RBP(%rsp),%rdi /* arg1 for handler */
- testl $3, CS(%rdi)
+ testl $3, CS-RBP(%rsi)
je 1f
SWAPGS
/*
@@ -330,11 +270,10 @@ ENDPROC(native_usergs_sysret64)
* moving irq_enter into assembly, which would be too much work)
*/
1: incl PER_CPU_VAR(irq_count)
- jne 2f
- mov PER_CPU_VAR(irq_stack_ptr),%rsp
+ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
CFI_DEF_CFA_REGISTER rsi
-2: /* Store previous stack value */
+ /* Store previous stack value */
pushq %rsi
CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
0x77 /* DW_OP_breg7 */, 0, \
@@ -345,23 +284,6 @@ ENDPROC(native_usergs_sysret64)
TRACE_IRQS_OFF
.endm
-ENTRY(save_rest)
- PARTIAL_FRAME 1 REST_SKIP+8
- movq 5*8+16(%rsp), %r11 /* save return address */
- movq_cfi rbx, RBX+16
- movq_cfi rbp, RBP+16
- movq_cfi r12, R12+16
- movq_cfi r13, R13+16
- movq_cfi r14, R14+16
- movq_cfi r15, R15+16
- movq %r11, 8(%rsp) /* return address */
- FIXUP_TOP_OF_STACK %r11, 16
- ret
- CFI_ENDPROC
-END(save_rest)
-
-/* save complete stack frame */
- .pushsection .kprobes.text, "ax"
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
@@ -390,7 +312,6 @@ ENTRY(save_paranoid)
1: ret
CFI_ENDPROC
END(save_paranoid)
- .popsection
/*
* A newly forked process directly context switches into this address.
@@ -402,7 +323,7 @@ ENTRY(ret_from_fork)
LOCK ; btr $TIF_FORK,TI_flags(%r8)
- pushq_cfi kernel_eflags(%rip)
+ pushq_cfi $0x0002
popfq_cfi # reset kernel eflags
call schedule_tail # rdi: 'prev' task parameter
@@ -412,7 +333,7 @@ ENTRY(ret_from_fork)
RESTORE_REST
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
- jz retint_restore_args
+ jz 1f
testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
jnz int_ret_from_sys_call
@@ -420,6 +341,14 @@ ENTRY(ret_from_fork)
RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
jmp ret_from_sys_call # go to the SYSRET fastpath
+1:
+ subq $REST_SKIP, %rsp # leave space for volatiles
+ CFI_ADJUST_CFA_OFFSET REST_SKIP
+ movq %rbp, %rdi
+ call *%rbx
+ movl $0, RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
CFI_ENDPROC
END(ret_from_fork)
@@ -427,7 +356,8 @@ END(ret_from_fork)
* System call entry. Up to 6 arguments in registers are supported.
*
* SYSCALL does not save anything on the stack and does not change the
- * stack pointer.
+ * stack pointer. However, it does mask the flags register for us, so
+ * CLD and CLAC are not needed.
*/
/*
@@ -482,7 +412,12 @@ GLOBAL(system_call_after_swapgs)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz tracesys
system_call_fastpath:
+#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max,%rax
+#else
+ andl $__SYSCALL_MASK,%eax
+ cmpl $__NR_syscall_max,%eax
+#endif
ja badsys
movq %r10,%rcx
call *sys_call_table(,%rax,8) # XXX: rip relative
@@ -522,7 +457,7 @@ sysret_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ SCHEDULE_USER
popq_cfi %rdi
jmp sysret_check
@@ -596,7 +531,12 @@ tracesys:
*/
LOAD_ARGS ARGOFFSET, 1
RESTORE_REST
+#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max,%rax
+#else
+ andl $__SYSCALL_MASK,%eax
+ cmpl $__NR_syscall_max,%eax
+#endif
ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
movq %r10,%rcx /* fixup for C */
call *sys_call_table(,%rax,8)
@@ -630,7 +570,7 @@ int_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ SCHEDULE_USER
popq_cfi %rdi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -667,28 +607,38 @@ int_restore_rest:
CFI_ENDPROC
END(system_call)
-/*
- * Certain special system calls that need to save a complete full stack frame.
- */
- .macro PTREGSCALL label,func,arg
-ENTRY(\label)
- PARTIAL_FRAME 1 8 /* offset 8: return address */
- subq $REST_SKIP, %rsp
- CFI_ADJUST_CFA_OFFSET REST_SKIP
- call save_rest
+ .macro FORK_LIKE func
+ENTRY(stub_\func)
+ CFI_STARTPROC
+ popq %r11 /* save return address */
+ PARTIAL_FRAME 0
+ SAVE_REST
+ pushq %r11 /* put it back on stack */
+ FIXUP_TOP_OF_STACK %r11, 8
DEFAULT_FRAME 0 8 /* offset 8: return address */
- leaq 8(%rsp), \arg /* pt_regs pointer */
+ call sys_\func
+ RESTORE_TOP_OF_STACK %r11, 8
+ ret $REST_SKIP /* pop extended registers */
+ CFI_ENDPROC
+END(stub_\func)
+ .endm
+
+ .macro FIXED_FRAME label,func
+ENTRY(\label)
+ CFI_STARTPROC
+ PARTIAL_FRAME 0 8 /* offset 8: return address */
+ FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
call \func
- jmp ptregscall_common
+ RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
+ ret
CFI_ENDPROC
END(\label)
.endm
- PTREGSCALL stub_clone, sys_clone, %r8
- PTREGSCALL stub_fork, sys_fork, %rdi
- PTREGSCALL stub_vfork, sys_vfork, %rdi
- PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
- PTREGSCALL stub_iopl, sys_iopl, %rsi
+ FORK_LIKE clone
+ FORK_LIKE fork
+ FORK_LIKE vfork
+ FIXED_FRAME stub_iopl, sys_iopl
ENTRY(ptregscall_common)
DEFAULT_FRAME 1 8 /* offset 8: return address */
@@ -709,9 +659,7 @@ ENTRY(stub_execve)
PARTIAL_FRAME 0
SAVE_REST
FIXUP_TOP_OF_STACK %r11
- movq %rsp, %rcx
call sys_execve
- RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
RESTORE_REST
jmp int_ret_from_sys_call
@@ -727,7 +675,6 @@ ENTRY(stub_rt_sigreturn)
addq $8, %rsp
PARTIAL_FRAME 0
SAVE_REST
- movq %rsp,%rdi
FIXUP_TOP_OF_STACK %r11
call sys_rt_sigreturn
movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
@@ -736,6 +683,36 @@ ENTRY(stub_rt_sigreturn)
CFI_ENDPROC
END(stub_rt_sigreturn)
+#ifdef CONFIG_X86_X32_ABI
+ENTRY(stub_x32_rt_sigreturn)
+ CFI_STARTPROC
+ addq $8, %rsp
+ PARTIAL_FRAME 0
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ call sys32_x32_rt_sigreturn
+ movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_x32_rt_sigreturn)
+
+ENTRY(stub_x32_execve)
+ CFI_STARTPROC
+ addq $8, %rsp
+ PARTIAL_FRAME 0
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ call compat_sys_execve
+ RESTORE_TOP_OF_STACK %r11
+ movq %rax,RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_x32_execve)
+
+#endif
+
/*
* Build the entry stubs and pointer table with some assembler magic.
* We pack 7 stubs into a single 32-byte chunk, which will fit in a
@@ -792,10 +769,6 @@ END(interrupt)
call \func
.endm
-/*
- * Interrupt entry/exit should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
/*
* The interrupt stubs push (~vector+0x80) onto the stack and
* then jump to common_interrupt.
@@ -803,6 +776,7 @@ END(interrupt)
.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
XCPT_FRAME
+ ASM_CLAC
addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
interrupt do_IRQ
/* 0(%rsp): old_rsp-ARGOFFSET */
@@ -813,7 +787,7 @@ ret_from_intr:
/* Restore saved previous stack */
popq %rsi
- CFI_DEF_CFA_REGISTER rsi
+ CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
leaq ARGOFFSET-RBP(%rsi), %rsp
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
@@ -858,17 +832,44 @@ restore_args:
irq_return:
INTERRUPT_RETURN
- .section __ex_table, "a"
- .quad irq_return, bad_iret
- .previous
-
-#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
+ /*
+ * Are we returning to a stack segment from the LDT? Note: in
+ * 64-bit mode SS:RSP on the exception stack is always valid.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ testb $4,(SS-RIP)(%rsp)
+ jnz native_irq_return_ldt
+#endif
+
+native_irq_return_iret:
iretq
+ _ASM_EXTABLE(native_irq_return_iret, bad_iret)
- .section __ex_table,"a"
- .quad native_iret, bad_iret
- .previous
+#ifdef CONFIG_X86_ESPFIX64
+native_irq_return_ldt:
+ pushq_cfi %rax
+ pushq_cfi %rdi
+ SWAPGS
+ movq PER_CPU_VAR(espfix_waddr),%rdi
+ movq %rax,(0*8)(%rdi) /* RAX */
+ movq (2*8)(%rsp),%rax /* RIP */
+ movq %rax,(1*8)(%rdi)
+ movq (3*8)(%rsp),%rax /* CS */
+ movq %rax,(2*8)(%rdi)
+ movq (4*8)(%rsp),%rax /* RFLAGS */
+ movq %rax,(3*8)(%rdi)
+ movq (6*8)(%rsp),%rax /* SS */
+ movq %rax,(5*8)(%rdi)
+ movq (5*8)(%rsp),%rax /* RSP */
+ movq %rax,(4*8)(%rdi)
+ andl $0xffff0000,%eax
+ popq_cfi %rdi
+ orq PER_CPU_VAR(espfix_stack),%rax
+ SWAPGS
+ movq %rax,%rsp
+ popq_cfi %rax
+ jmp native_irq_return_iret
#endif
.section .fixup,"ax"
@@ -898,7 +899,7 @@ retint_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ SCHEDULE_USER
popq_cfi %rdi
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -925,29 +926,54 @@ retint_signal:
/* Returning to kernel space. Check if we need preemption */
/* rcx: threadinfo. interrupts off. */
ENTRY(retint_kernel)
- cmpl $0,TI_preempt_count(%rcx)
+ cmpl $0,PER_CPU_VAR(__preempt_count)
jnz retint_restore_args
- bt $TIF_NEED_RESCHED,TI_flags(%rcx)
- jnc retint_restore_args
bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
jnc retint_restore_args
call preempt_schedule_irq
jmp exit_intr
#endif
-
CFI_ENDPROC
END(common_interrupt)
-/*
- * End of kprobes section
- */
- .popsection
+
+ /*
+ * If IRET takes a fault on the espfix stack, then we
+ * end up promoting it to a doublefault. In that case,
+ * modify the stack to make it look like we just entered
+ * the #GP handler from user space, similar to bad_iret.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ ALIGN
+__do_double_fault:
+ XCPT_FRAME 1 RDI+8
+ movq RSP(%rdi),%rax /* Trap on the espfix stack? */
+ sarq $PGDIR_SHIFT,%rax
+ cmpl $ESPFIX_PGD_ENTRY,%eax
+ jne do_double_fault /* No, just deliver the fault */
+ cmpl $__KERNEL_CS,CS(%rdi)
+ jne do_double_fault
+ movq RIP(%rdi),%rax
+ cmpq $native_irq_return_iret,%rax
+ jne do_double_fault /* This shouldn't happen... */
+ movq PER_CPU_VAR(kernel_stack),%rax
+ subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */
+ movq %rax,RSP(%rdi)
+ movq $0,(%rax) /* Missing (lost) #GP error code */
+ movq $general_protection,RIP(%rdi)
+ retq
+ CFI_ENDPROC
+END(__do_double_fault)
+#else
+# define __do_double_fault do_double_fault
+#endif
/*
* APIC interrupts.
*/
-.macro apicinterrupt num sym do_sym
+.macro apicinterrupt3 num sym do_sym
ENTRY(\sym)
INTR_FRAME
+ ASM_CLAC
pushq_cfi $~(\num)
.Lcommon_\sym:
interrupt \do_sym
@@ -956,15 +982,32 @@ ENTRY(\sym)
END(\sym)
.endm
+#ifdef CONFIG_TRACING
+#define trace(sym) trace_##sym
+#define smp_trace(sym) smp_trace_##sym
+
+.macro trace_apicinterrupt num sym
+apicinterrupt3 \num trace(\sym) smp_trace(\sym)
+.endm
+#else
+.macro trace_apicinterrupt num sym do_sym
+.endm
+#endif
+
+.macro apicinterrupt num sym do_sym
+apicinterrupt3 \num \sym \do_sym
+trace_apicinterrupt \num \sym
+.endm
+
#ifdef CONFIG_SMP
-apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
+apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \
irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
-apicinterrupt REBOOT_VECTOR \
+apicinterrupt3 REBOOT_VECTOR \
reboot_interrupt smp_reboot_interrupt
#endif
#ifdef CONFIG_X86_UV
-apicinterrupt UV_BAU_MESSAGE \
+apicinterrupt3 UV_BAU_MESSAGE \
uv_bau_message_intr1 uv_bau_message_interrupt
#endif
apicinterrupt LOCAL_TIMER_VECTOR \
@@ -972,28 +1015,20 @@ apicinterrupt LOCAL_TIMER_VECTOR \
apicinterrupt X86_PLATFORM_IPI_VECTOR \
x86_platform_ipi smp_x86_platform_ipi
-#ifdef CONFIG_SMP
- ALIGN
- INTR_FRAME
-.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
- 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
-.if NUM_INVALIDATE_TLB_VECTORS > \idx
-ENTRY(invalidate_interrupt\idx)
- pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
- jmp .Lcommon_invalidate_interrupt0
- CFI_ADJUST_CFA_OFFSET -8
-END(invalidate_interrupt\idx)
-.endif
-.endr
- CFI_ENDPROC
-apicinterrupt INVALIDATE_TLB_VECTOR_START, \
- invalidate_interrupt0, smp_invalidate_interrupt
+#ifdef CONFIG_HAVE_KVM
+apicinterrupt3 POSTED_INTR_VECTOR \
+ kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
apicinterrupt THRESHOLD_APIC_VECTOR \
threshold_interrupt smp_threshold_interrupt
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
apicinterrupt THERMAL_APIC_VECTOR \
thermal_interrupt smp_thermal_interrupt
+#endif
#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
@@ -1017,109 +1052,100 @@ apicinterrupt IRQ_WORK_VECTOR \
/*
* Exception entry points.
*/
-.macro zeroentry sym do_sym
-ENTRY(\sym)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call error_entry
- DEFAULT_FRAME 0
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
-.macro paranoidzeroentry sym do_sym
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call save_paranoid
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- call \do_sym
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+ /* Sanity check */
+ .if \shift_ist != -1 && \paranoid == 0
+ .error "using shift_ist requires paranoid=1"
+ .endif
-#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
-.macro paranoidzeroentry_ist sym do_sym ist
-ENTRY(\sym)
+ .if \has_error_code
+ XCPT_FRAME
+ .else
INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call save_paranoid
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
- call \do_sym
- addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+ .endif
-.macro errorentry sym do_sym
-ENTRY(\sym)
- XCPT_FRAME
+ ASM_CLAC
PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call error_entry
- DEFAULT_FRAME 0
- movq %rsp,%rdi /* pt_regs pointer */
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
- call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
- /* error code is on the stack already */
-.macro paranoiderrorentry sym do_sym
-ENTRY(\sym)
- XCPT_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
+ .ifeq \has_error_code
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ .endif
+
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+
+ .if \paranoid
call save_paranoid
+ .else
+ call error_entry
+ .endif
+
DEFAULT_FRAME 0
+
+ .if \paranoid
+ .if \shift_ist != -1
+ TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
+ .else
TRACE_IRQS_OFF
+ .endif
+ .endif
+
movq %rsp,%rdi /* pt_regs pointer */
+
+ .if \has_error_code
movq ORIG_RAX(%rsp),%rsi /* get error code */
movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ .else
+ xorl %esi,%esi /* no error code */
+ .endif
+
+ .if \shift_ist != -1
+ subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ .endif
+
call \do_sym
+
+ .if \shift_ist != -1
+ addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ .endif
+
+ .if \paranoid
jmp paranoid_exit /* %ebx: no swapgs flag */
+ .else
+ jmp error_exit /* %ebx: no swapgs flag */
+ .endif
+
CFI_ENDPROC
END(\sym)
.endm
-zeroentry divide_error do_divide_error
-zeroentry overflow do_overflow
-zeroentry bounds do_bounds
-zeroentry invalid_op do_invalid_op
-zeroentry device_not_available do_device_not_available
-paranoiderrorentry double_fault do_double_fault
-zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
-errorentry invalid_TSS do_invalid_TSS
-errorentry segment_not_present do_segment_not_present
-zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
-zeroentry coprocessor_error do_coprocessor_error
-errorentry alignment_check do_alignment_check
-zeroentry simd_coprocessor_error do_simd_coprocessor_error
+#ifdef CONFIG_TRACING
+.macro trace_idtentry sym do_sym has_error_code:req
+idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
+idtentry \sym \do_sym has_error_code=\has_error_code
+.endm
+#else
+.macro trace_idtentry sym do_sym has_error_code:req
+idtentry \sym \do_sym has_error_code=\has_error_code
+.endm
+#endif
+
+idtentry divide_error do_divide_error has_error_code=0
+idtentry overflow do_overflow has_error_code=0
+idtentry bounds do_bounds has_error_code=0
+idtentry invalid_op do_invalid_op has_error_code=0
+idtentry device_not_available do_device_not_available has_error_code=0
+idtentry double_fault __do_double_fault has_error_code=1 paranoid=1
+idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
+idtentry invalid_TSS do_invalid_TSS has_error_code=1
+idtentry segment_not_present do_segment_not_present has_error_code=1
+idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
+idtentry coprocessor_error do_coprocessor_error has_error_code=0
+idtentry alignment_check do_alignment_check has_error_code=1
+idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
/* Reload gs selector with exception handling */
@@ -1138,10 +1164,7 @@ gs_change:
CFI_ENDPROC
END(native_load_gs_index)
- .section __ex_table,"a"
- .align 8
- .quad gs_change,bad_gs
- .previous
+ _ASM_EXTABLE(gs_change,bad_gs)
.section .fixup,"ax"
/* running with kernelgs */
bad_gs:
@@ -1151,54 +1174,8 @@ bad_gs:
jmp 2b
.previous
-ENTRY(kernel_thread_helper)
- pushq $0 # fake return address
- CFI_STARTPROC
- /*
- * Here we are in the child and the registers are set as they were
- * at kernel_thread() invocation in the parent.
- */
- call *%rsi
- # exit
- mov %eax, %edi
- call do_exit
- ud2 # padding for call trace
- CFI_ENDPROC
-END(kernel_thread_helper)
-
-/*
- * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
- *
- * C extern interface:
- * extern long execve(const char *name, char **argv, char **envp)
- *
- * asm input arguments:
- * rdi: name, rsi: argv, rdx: envp
- *
- * We want to fallback into:
- * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
- *
- * do_sys_execve asm fallback arguments:
- * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
- */
-ENTRY(kernel_execve)
- CFI_STARTPROC
- FAKE_STACK_FRAME $0
- SAVE_ALL
- movq %rsp,%rcx
- call sys_execve
- movq %rax, RAX(%rsp)
- RESTORE_REST
- testq %rax,%rax
- je int_ret_from_sys_call
- RESTORE_ARGS
- UNFAKE_STACK_FRAME
- ret
- CFI_ENDPROC
-END(kernel_execve)
-
/* Call softirq on interrupt stack. Interrupts are off. */
-ENTRY(call_softirq)
+ENTRY(do_softirq_own_stack)
CFI_STARTPROC
pushq_cfi %rbp
CFI_REL_OFFSET rbp,0
@@ -1215,10 +1192,10 @@ ENTRY(call_softirq)
decl PER_CPU_VAR(irq_count)
ret
CFI_ENDPROC
-END(call_softirq)
+END(do_softirq_own_stack)
#ifdef CONFIG_XEN
-zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
+idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
/*
* A note on the "critical region" in our callback handler.
@@ -1308,37 +1285,37 @@ ENTRY(xen_failsafe_callback)
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
- pushq_cfi $0
+ pushq_cfi $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
jmp error_exit
CFI_ENDPROC
END(xen_failsafe_callback)
-apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
+apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
xen_hvm_callback_vector xen_evtchn_do_upcall
#endif /* CONFIG_XEN */
-/*
- * Some functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
+#if IS_ENABLED(CONFIG_HYPERV)
+apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
+ hyperv_callback_vector hyperv_vector_handler
+#endif /* CONFIG_HYPERV */
-paranoidzeroentry_ist debug do_debug DEBUG_STACK
-paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
-paranoiderrorentry stack_segment do_stack_segment
+idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1
#ifdef CONFIG_XEN
-zeroentry xen_debug do_debug
-zeroentry xen_int3 do_int3
-errorentry xen_stack_segment do_stack_segment
+idtentry xen_debug do_debug has_error_code=0
+idtentry xen_int3 do_int3 has_error_code=0
+idtentry xen_stack_segment do_stack_segment has_error_code=1
#endif
-errorentry general_protection do_general_protection
-errorentry page_fault do_page_fault
+idtentry general_protection do_general_protection has_error_code=1
+trace_idtentry page_fault do_page_fault has_error_code=1
#ifdef CONFIG_KVM_GUEST
-errorentry async_page_fault do_async_page_fault
+idtentry async_page_fault do_async_page_fault has_error_code=1
#endif
#ifdef CONFIG_X86_MCE
-paranoidzeroentry machine_check *machine_check_vector(%rip)
+idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
#endif
/*
@@ -1358,7 +1335,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
ENTRY(paranoid_exit)
DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
+ TRACE_IRQS_OFF_DEBUG
testl %ebx,%ebx /* swapgs needed? */
jnz paranoid_restore
testl $3,CS(%rsp)
@@ -1369,7 +1346,7 @@ paranoid_swapgs:
RESTORE_ALL 8
jmp irq_return
paranoid_restore:
- TRACE_IRQS_IRETQ 0
+ TRACE_IRQS_IRETQ_DEBUG 0
RESTORE_ALL 8
jmp irq_return
paranoid_userspace:
@@ -1394,7 +1371,7 @@ paranoid_userspace:
paranoid_schedule:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
+ SCHEDULE_USER
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
jmp paranoid_userspace
@@ -1443,7 +1420,7 @@ error_sti:
*/
error_kernelspace:
incl %ebx
- leaq irq_return(%rip),%rcx
+ leaq native_irq_return_iret(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
movl %ecx,%eax /* zero extend */
@@ -1530,12 +1507,20 @@ ENTRY(nmi)
/* Use %rdx as out temp variable throughout */
pushq_cfi %rdx
+ CFI_REL_OFFSET rdx, 0
+
+ /*
+ * If %cs was not the kernel segment, then the NMI triggered in user
+ * space, which means it is definitely not nested.
+ */
+ cmpl $__KERNEL_CS, 16(%rsp)
+ jne first_nmi
/*
* Check the special variable on the stack to see if NMIs are
* executing.
*/
- cmp $1, -8(%rsp)
+ cmpl $1, -8(%rsp)
je nested_nmi
/*
@@ -1547,6 +1532,7 @@ ENTRY(nmi)
*/
lea 6*8(%rsp), %rdx
test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+ CFI_REMEMBER_STATE
nested_nmi:
/*
@@ -1563,9 +1549,10 @@ nested_nmi:
1:
/* Set up the interrupted NMIs stack to jump to repeat_nmi */
- leaq -6*8(%rsp), %rdx
+ leaq -1*8(%rsp), %rdx
movq %rdx, %rsp
- CFI_ADJUST_CFA_OFFSET 6*8
+ CFI_ADJUST_CFA_OFFSET 1*8
+ leaq -10*8(%rsp), %rdx
pushq_cfi $__KERNEL_DS
pushq_cfi %rdx
pushfq_cfi
@@ -1573,15 +1560,17 @@ nested_nmi:
pushq_cfi $repeat_nmi
/* Put stack back */
- addq $(11*8), %rsp
- CFI_ADJUST_CFA_OFFSET -11*8
+ addq $(6*8), %rsp
+ CFI_ADJUST_CFA_OFFSET -6*8
nested_nmi_out:
popq_cfi %rdx
+ CFI_RESTORE rdx
/* No need to check faults here */
INTERRUPT_RETURN
+ CFI_RESTORE_STATE
first_nmi:
/*
* Because nested NMIs will use the pushed location that we
@@ -1598,48 +1587,79 @@ first_nmi:
* +-------------------------+
* | NMI executing variable |
* +-------------------------+
- * | Saved SS |
- * | Saved Return RSP |
- * | Saved RFLAGS |
- * | Saved CS |
- * | Saved RIP |
- * +-------------------------+
* | copied SS |
* | copied Return RSP |
* | copied RFLAGS |
* | copied CS |
* | copied RIP |
* +-------------------------+
+ * | Saved SS |
+ * | Saved Return RSP |
+ * | Saved RFLAGS |
+ * | Saved CS |
+ * | Saved RIP |
+ * +-------------------------+
* | pt_regs |
* +-------------------------+
*
- * The saved RIP is used to fix up the copied RIP that a nested
- * NMI may zero out. The original stack frame and the temp storage
+ * The saved stack frame is used to fix up the copied stack frame
+ * that a nested NMI may change to make the interrupted NMI iret jump
+ * to the repeat_nmi. The original stack frame and the temp storage
* is also used by nested NMIs and can not be trusted on exit.
*/
+ /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+ movq (%rsp), %rdx
+ CFI_RESTORE rdx
+
/* Set the NMI executing variable on the stack. */
pushq_cfi $1
+ /*
+ * Leave room for the "copied" frame
+ */
+ subq $(5*8), %rsp
+ CFI_ADJUST_CFA_OFFSET 5*8
+
/* Copy the stack frame to the Saved frame */
.rept 5
- pushq_cfi 6*8(%rsp)
+ pushq_cfi 11*8(%rsp)
.endr
+ CFI_DEF_CFA_OFFSET SS+8-RIP
+
+ /* Everything up to here is safe from nested NMIs */
+
+ /*
+ * If there was a nested NMI, the first NMI's iret will return
+ * here. But NMIs are still enabled and we can take another
+ * nested NMI. The nested NMI checks the interrupted RIP to see
+ * if it is between repeat_nmi and end_repeat_nmi, and if so
+ * it will just return, as we are about to repeat an NMI anyway.
+ * This makes it safe to copy to the stack frame that a nested
+ * NMI will update.
+ */
+repeat_nmi:
+ /*
+ * Update the stack variable to say we are still in NMI (the update
+ * is benign for the non-repeat case, where 1 was pushed just above
+ * to this very stack slot).
+ */
+ movq $1, 10*8(%rsp)
/* Make another copy, this one may be modified by nested NMIs */
+ addq $(10*8), %rsp
+ CFI_ADJUST_CFA_OFFSET -10*8
.rept 5
- pushq_cfi 4*8(%rsp)
+ pushq_cfi -6*8(%rsp)
.endr
-
- /* Do not pop rdx, nested NMIs will corrupt it */
- movq 11*8(%rsp), %rdx
+ subq $(5*8), %rsp
+ CFI_DEF_CFA_OFFSET SS+8-RIP
+end_repeat_nmi:
/*
* Everything below this point can be preempted by a nested
- * NMI if the first NMI took an exception. Repeated NMIs
- * caused by an exception and nested NMI will start here, and
- * can still be preempted by another NMI.
+ * NMI if the first NMI took an exception and reset our iret stack
+ * so that we repeat another NMI.
*/
-restart_nmi:
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@@ -1652,42 +1672,44 @@ restart_nmi:
*/
call save_paranoid
DEFAULT_FRAME 0
+
+ /*
+ * Save off the CR2 register. If we take a page fault in the NMI then
+ * it could corrupt the CR2 value. If the NMI preempts a page fault
+ * handler before it was able to read the CR2 register, and then the
+ * NMI itself takes a page fault, the page fault that was preempted
+ * will read the information from the NMI page fault and not the
+ * origin fault. Save it off and restore it if it changes.
+ * Use the r12 callee-saved register.
+ */
+ movq %cr2, %r12
+
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp,%rdi
movq $-1,%rsi
call do_nmi
+
+ /* Did the NMI take a page fault? Restore cr2 if it did */
+ movq %cr2, %rcx
+ cmpq %rcx, %r12
+ je 1f
+ movq %r12, %cr2
+1:
+
testl %ebx,%ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
- RESTORE_ALL 8
+ /* Pop the extra iret frame at once */
+ RESTORE_ALL 6*8
+
/* Clear the NMI executing stack variable */
- movq $0, 10*8(%rsp)
+ movq $0, 5*8(%rsp)
jmp irq_return
CFI_ENDPROC
END(nmi)
- /*
- * If an NMI hit an iret because of an exception or breakpoint,
- * it can lose its NMI context, and a nested NMI may come in.
- * In that case, the nested NMI will change the preempted NMI's
- * stack to jump to here when it does the final iret.
- */
-repeat_nmi:
- INTR_FRAME
- /* Update the stack variable to say we are still in NMI */
- movq $1, 5*8(%rsp)
-
- /* copy the saved stack back to copy stack */
- .rept 5
- pushq_cfi 4*8(%rsp)
- .endr
-
- jmp restart_nmi
- CFI_ENDPROC
-end_repeat_nmi:
-
ENTRY(ignore_sysret)
CFI_STARTPROC
mov $-ENOSYS,%eax
@@ -1695,7 +1717,3 @@ ENTRY(ignore_sysret)
CFI_ENDPROC
END(ignore_sysret)
-/*
- * End of kprobes section
- */
- .popsection
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
new file mode 100644
index 00000000000..94d857fb103
--- /dev/null
+++ b/arch/x86/kernel/espfix_64.c
@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The IRET instruction, when returning to a 16-bit segment, only
+ * restores the bottom 16 bits of the user space stack pointer. This
+ * causes some 16-bit software to break, but it also leaks kernel state
+ * to user space.
+ *
+ * This works around this by creating percpu "ministacks", each of which
+ * is mapped 2^16 times 64K apart. When we detect that the return SS is
+ * on the LDT, we copy the IRET frame to the ministack and use the
+ * relevant alias to return to userspace. The ministacks are mapped
+ * readonly, so if the IRET fault we promote #GP to #DF which is an IST
+ * vector and thus has its own stack; we then do the fixup in the #DF
+ * handler.
+ *
+ * This file sets up the ministacks and the related page tables. The
+ * actual ministack invocation is in entry_64.S.
+ */
+
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/setup.h>
+#include <asm/espfix.h>
+
+/*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+ * it up to a cache line to avoid unnecessary sharing.
+ */
+#define ESPFIX_STACK_SIZE (8*8UL)
+#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+/* There is address space for how many espfix pages? */
+#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
+
+#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more than one PGD for the ESPFIX hack"
+#endif
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+static void *espfix_pages[ESPFIX_MAX_PAGES];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+ __aligned(PAGE_SIZE);
+
+static unsigned int page_random, slot_random;
+
+/*
+ * This returns the bottom address of the espfix stack for a specific CPU.
+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
+ * we have to account for some amount of padding at the end of each page.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+ unsigned long page, slot;
+ unsigned long addr;
+
+ page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
+ slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
+ addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
+ addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
+ addr += ESPFIX_BASE_ADDR;
+ return addr;
+}
+
+#define PTE_STRIDE (65536/PAGE_SIZE)
+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
+
+#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
+
+static void init_espfix_random(void)
+{
+ unsigned long rand;
+
+ /*
+ * This is run before the entropy pools are initialized,
+ * but this is hopefully better than nothing.
+ */
+ if (!arch_get_random_long(&rand)) {
+ /* The constant is an arbitrary large prime */
+ rdtscll(rand);
+ rand *= 0xc345c6b72fd16123UL;
+ }
+
+ slot_random = rand % ESPFIX_STACKS_PER_PAGE;
+ page_random = (rand / ESPFIX_STACKS_PER_PAGE)
+ & (ESPFIX_PAGE_SPACE - 1);
+}
+
+void __init init_espfix_bsp(void)
+{
+ pgd_t *pgd_p;
+ pteval_t ptemask;
+
+ ptemask = __supported_pte_mask;
+
+ /* Install the espfix pud into the kernel page directory */
+ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+
+ /* Randomize the locations */
+ init_espfix_random();
+
+ /* The rest is the same as for any other processor */
+ init_espfix_ap();
+}
+
+void init_espfix_ap(void)
+{
+ unsigned int cpu, page;
+ unsigned long addr;
+ pud_t pud, *pud_p;
+ pmd_t pmd, *pmd_p;
+ pte_t pte, *pte_p;
+ int n;
+ void *stack_page;
+ pteval_t ptemask;
+
+ /* We only have to do this once... */
+ if (likely(this_cpu_read(espfix_stack)))
+ return; /* Already initialized */
+
+ cpu = smp_processor_id();
+ addr = espfix_base_addr(cpu);
+ page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+ /* Did another CPU already set this up? */
+ stack_page = ACCESS_ONCE(espfix_pages[page]);
+ if (likely(stack_page))
+ goto done;
+
+ mutex_lock(&espfix_init_mutex);
+
+ /* Did we race on the lock? */
+ stack_page = ACCESS_ONCE(espfix_pages[page]);
+ if (stack_page)
+ goto unlock_done;
+
+ ptemask = __supported_pte_mask;
+
+ pud_p = &espfix_pud_page[pud_index(addr)];
+ pud = *pud_p;
+ if (!pud_present(pud)) {
+ pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+ pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PUD_CLONES; n++)
+ set_pud(&pud_p[n], pud);
+ }
+
+ pmd_p = pmd_offset(&pud, addr);
+ pmd = *pmd_p;
+ if (!pmd_present(pmd)) {
+ pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+ pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PMD_CLONES; n++)
+ set_pmd(&pmd_p[n], pmd);
+ }
+
+ pte_p = pte_offset_kernel(&pmd, addr);
+ stack_page = (void *)__get_free_page(GFP_KERNEL);
+ pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+ for (n = 0; n < ESPFIX_PTE_CLONES; n++)
+ set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+ /* Job is done for this CPU and any CPU which shares this page */
+ ACCESS_ONCE(espfix_pages[page]) = stack_page;
+
+unlock_done:
+ mutex_unlock(&espfix_init_mutex);
+done:
+ this_cpu_write(espfix_stack, addr);
+ this_cpu_write(espfix_waddr, (unsigned long)stack_page
+ + (addr & ~PAGE_MASK));
+}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index c9a281f272f..cbc4a91b131 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -24,40 +24,21 @@
#include <trace/syscall.h>
#include <asm/cacheflush.h>
+#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
-#include <asm/nmi.h>
-
#ifdef CONFIG_DYNAMIC_FTRACE
-/*
- * modifying_code is set to notify NMIs that they need to use
- * memory barriers when entering or exiting. But we don't want
- * to burden NMIs with unnecessary memory barriers when code
- * modification is not being done (which is most of the time).
- *
- * A mutex is already held when ftrace_arch_code_modify_prepare
- * and post_process are called. No locks need to be taken here.
- *
- * Stop machine will make sure currently running NMIs are done
- * and new NMIs will see the updated variable before we need
- * to worry about NMIs doing memory barriers.
- */
-static int modifying_code __read_mostly;
-static DEFINE_PER_CPU(int, save_modifying_code);
-
int ftrace_arch_code_modify_prepare(void)
{
set_kernel_text_rw();
set_all_modules_text_rw();
- modifying_code = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
{
- modifying_code = 0;
set_all_modules_text_ro();
set_kernel_text_ro();
return 0;
@@ -90,255 +71,577 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
return calc.code;
}
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+ return addr >= start && addr < end;
+}
+
+static unsigned long text_ip_addr(unsigned long ip)
+{
+ /*
+ * On x86_64, kernel text mappings are mapped read-only with
+ * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
+ * of the kernel text mapping to modify the kernel text.
+ *
+ * For 32bit kernels, these mappings are same and we can use
+ * kernel identity mapping to modify code.
+ */
+ if (within(ip, (unsigned long)_text, (unsigned long)_etext))
+ ip = (unsigned long)__va(__pa_symbol(ip));
+
+ return ip;
+}
+
+static const unsigned char *ftrace_nop_replace(void)
+{
+ return ideal_nops[NOP_ATOMIC5];
+}
+
+static int
+ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code)
+{
+ unsigned char replaced[MCOUNT_INSN_SIZE];
+
+ /*
+ * Note: Due to modules and __init, code can
+ * disappear and change, we need to protect against faulting
+ * as well as code changing. We do this by using the
+ * probe_kernel_* functions.
+ *
+ * No real locking needed, this code is run through
+ * kstop_machine, or before SMP starts.
+ */
+
+ /* read the text we want to modify */
+ if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ /* Make sure it is what we expect it to be */
+ if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+ return -EINVAL;
+
+ ip = text_ip_addr(ip);
+
+ /* replace the text with the new text */
+ if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
+ return -EPERM;
+
+ sync_core();
+
+ return 0;
+}
+
+int ftrace_make_nop(struct module *mod,
+ struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned const char *new, *old;
+ unsigned long ip = rec->ip;
+
+ old = ftrace_call_replace(ip, addr);
+ new = ftrace_nop_replace();
+
+ /*
+ * On boot up, and when modules are loaded, the MCOUNT_ADDR
+ * is converted to a nop, and will never become MCOUNT_ADDR
+ * again. This code is either running before SMP (on boot up)
+ * or before the code will ever be executed (module load).
+ * We do not want to use the breakpoint version in this case,
+ * just modify the code directly.
+ */
+ if (addr == MCOUNT_ADDR)
+ return ftrace_modify_code_direct(rec->ip, old, new);
+
+ /* Normal cases use add_brk_on_nop */
+ WARN_ONCE(1, "invalid use of ftrace_make_nop");
+ return -EINVAL;
+}
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned const char *new, *old;
+ unsigned long ip = rec->ip;
+
+ old = ftrace_nop_replace();
+ new = ftrace_call_replace(ip, addr);
+
+ /* Should only be called when module is loaded */
+ return ftrace_modify_code_direct(rec->ip, old, new);
+}
+
/*
- * Modifying code must take extra care. On an SMP machine, if
- * the code being modified is also being executed on another CPU
- * that CPU will have undefined results and possibly take a GPF.
- * We use kstop_machine to stop other CPUS from exectuing code.
- * But this does not stop NMIs from happening. We still need
- * to protect against that. We separate out the modification of
- * the code to take care of this.
+ * The modifying_ftrace_code is used to tell the breakpoint
+ * handler to call ftrace_int3_handler(). If it fails to
+ * call this handler for a breakpoint added by ftrace, then
+ * the kernel may crash.
+ *
+ * As atomic_writes on x86 do not need a barrier, we do not
+ * need to add smp_mb()s for this to work. It is also considered
+ * that we can not read the modifying_ftrace_code before
+ * executing the breakpoint. That would be quite remarkable if
+ * it could do that. Here's the flow that is required:
*
- * Two buffers are added: An IP buffer and a "code" buffer.
+ * CPU-0 CPU-1
*
- * 1) Put the instruction pointer into the IP buffer
- * and the new code into the "code" buffer.
- * 2) Wait for any running NMIs to finish and set a flag that says
- * we are modifying code, it is done in an atomic operation.
- * 3) Write the code
- * 4) clear the flag.
- * 5) Wait for any running NMIs to finish.
+ * atomic_inc(mfc);
+ * write int3s
+ * <trap-int3> // implicit (r)mb
+ * if (atomic_read(mfc))
+ * call ftrace_int3_handler()
*
- * If an NMI is executed, the first thing it does is to call
- * "ftrace_nmi_enter". This will check if the flag is set to write
- * and if it is, it will write what is in the IP and "code" buffers.
+ * Then when we are finished:
*
- * The trick is, it does not matter if everyone is writing the same
- * content to the code location. Also, if a CPU is executing code
- * it is OK to write to that code location if the contents being written
- * are the same as what exists.
+ * atomic_dec(mfc);
+ *
+ * If we hit a breakpoint that was not set by ftrace, it does not
+ * matter if ftrace_int3_handler() is called or not. It will
+ * simply be ignored. But it is crucial that a ftrace nop/caller
+ * breakpoint is handled. No other user should ever place a
+ * breakpoint on an ftrace nop/caller location. It must only
+ * be done by this code.
*/
+atomic_t modifying_ftrace_code __read_mostly;
-#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */
-static atomic_t nmi_running = ATOMIC_INIT(0);
-static int mod_code_status; /* holds return value of text write */
-static void *mod_code_ip; /* holds the IP to write to */
-static const void *mod_code_newcode; /* holds the text to write to the IP */
+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code);
-static unsigned nmi_wait_count;
-static atomic_t nmi_update_count = ATOMIC_INIT(0);
+/*
+ * Should never be called:
+ * As it is only called by __ftrace_replace_code() which is called by
+ * ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
+ * which is called to turn mcount into nops or nops into function calls
+ * but not to convert a function from not using regs to one that uses
+ * regs, which ftrace_modify_call() is for.
+ */
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+ unsigned long addr)
+{
+ WARN_ON(1);
+ return -EINVAL;
+}
-int ftrace_arch_read_dyn_info(char *buf, int size)
+static unsigned long ftrace_update_func;
+
+static int update_ftrace_func(unsigned long ip, void *new)
{
- int r;
+ unsigned char old[MCOUNT_INSN_SIZE];
+ int ret;
+
+ memcpy(old, (void *)ip, MCOUNT_INSN_SIZE);
+
+ ftrace_update_func = ip;
+ /* Make sure the breakpoints see the ftrace_update_func update */
+ smp_wmb();
- r = snprintf(buf, size, "%u %u",
- nmi_wait_count,
- atomic_read(&nmi_update_count));
- return r;
+ /* See comment above by declaration of modifying_ftrace_code */
+ atomic_inc(&modifying_ftrace_code);
+
+ ret = ftrace_modify_code(ip, old, new);
+
+ atomic_dec(&modifying_ftrace_code);
+
+ return ret;
}
-static void clear_mod_flag(void)
+int ftrace_update_ftrace_func(ftrace_func_t func)
{
- int old = atomic_read(&nmi_running);
-
- for (;;) {
- int new = old & ~MOD_CODE_WRITE_FLAG;
+ unsigned long ip = (unsigned long)(&ftrace_call);
+ unsigned char *new;
+ int ret;
- if (old == new)
- break;
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ ret = update_ftrace_func(ip, new);
- old = atomic_cmpxchg(&nmi_running, old, new);
+ /* Also update the regs callback function */
+ if (!ret) {
+ ip = (unsigned long)(&ftrace_regs_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ ret = update_ftrace_func(ip, new);
}
+
+ return ret;
}
-static void ftrace_mod_code(void)
+static int is_ftrace_caller(unsigned long ip)
{
- /*
- * Yes, more than one CPU process can be writing to mod_code_status.
- * (and the code itself)
- * But if one were to fail, then they all should, and if one were
- * to succeed, then they all should.
- */
- mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
- MCOUNT_INSN_SIZE);
+ if (ip == ftrace_update_func)
+ return 1;
- /* if we fail, then kill any new writers */
- if (mod_code_status)
- clear_mod_flag();
+ return 0;
}
-void ftrace_nmi_enter(void)
+/*
+ * A breakpoint was added to the code address we are about to
+ * modify, and this is the handle that will just skip over it.
+ * We are either changing a nop into a trace call, or a trace
+ * call to a nop. While the change is taking place, we treat
+ * it just like it was a nop.
+ */
+int ftrace_int3_handler(struct pt_regs *regs)
{
- __this_cpu_write(save_modifying_code, modifying_code);
+ unsigned long ip;
- if (!__this_cpu_read(save_modifying_code))
- return;
+ if (WARN_ON_ONCE(!regs))
+ return 0;
- if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
- smp_rmb();
- ftrace_mod_code();
- atomic_inc(&nmi_update_count);
- }
- /* Must have previous changes seen before executions */
- smp_mb();
+ ip = regs->ip - 1;
+ if (!ftrace_location(ip) && !is_ftrace_caller(ip))
+ return 0;
+
+ regs->ip += MCOUNT_INSN_SIZE - 1;
+
+ return 1;
}
-void ftrace_nmi_exit(void)
+static int ftrace_write(unsigned long ip, const char *val, int size)
{
- if (!__this_cpu_read(save_modifying_code))
- return;
+ ip = text_ip_addr(ip);
- /* Finish all executions before clearing nmi_running */
- smp_mb();
- atomic_dec(&nmi_running);
+ if (probe_kernel_write((void *)ip, val, size))
+ return -EPERM;
+
+ return 0;
}
-static void wait_for_nmi_and_set_mod_flag(void)
+static int add_break(unsigned long ip, const char *old)
{
- if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
- return;
+ unsigned char replaced[MCOUNT_INSN_SIZE];
+ unsigned char brk = BREAKPOINT_INSTRUCTION;
- do {
- cpu_relax();
- } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
+ if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
- nmi_wait_count++;
+ /* Make sure it is what we expect it to be */
+ if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
+ return -EINVAL;
+
+ return ftrace_write(ip, &brk, 1);
}
-static void wait_for_nmi(void)
+static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
{
- if (!atomic_read(&nmi_running))
- return;
+ unsigned const char *old;
+ unsigned long ip = rec->ip;
- do {
- cpu_relax();
- } while (atomic_read(&nmi_running));
+ old = ftrace_call_replace(ip, addr);
- nmi_wait_count++;
+ return add_break(rec->ip, old);
}
-static inline int
-within(unsigned long addr, unsigned long start, unsigned long end)
+
+static int add_brk_on_nop(struct dyn_ftrace *rec)
{
- return addr >= start && addr < end;
+ unsigned const char *old;
+
+ old = ftrace_nop_replace();
+
+ return add_break(rec->ip, old);
}
-static int
-do_ftrace_mod_code(unsigned long ip, const void *new_code)
+static int add_breakpoints(struct dyn_ftrace *rec, int enable)
{
- /*
- * On x86_64, kernel text mappings are mapped read-only with
- * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
- * of the kernel text mapping to modify the kernel text.
- *
- * For 32bit kernels, these mappings are same and we can use
- * kernel identity mapping to modify code.
- */
- if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa(ip));
+ unsigned long ftrace_addr;
+ int ret;
+
+ ftrace_addr = ftrace_get_addr_curr(rec);
+
+ ret = ftrace_test_record(rec, enable);
- mod_code_ip = (void *)ip;
- mod_code_newcode = new_code;
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
- /* The buffers need to be visible before we let NMIs write them */
- smp_mb();
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return add_brk_on_nop(rec);
- wait_for_nmi_and_set_mod_flag();
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return add_brk_on_call(rec, ftrace_addr);
+ }
+ return 0;
+}
+
+/*
+ * On error, we need to remove breakpoints. This needs to
+ * be done caefully. If the address does not currently have a
+ * breakpoint, we know we are done. Otherwise, we look at the
+ * remaining 4 bytes of the instruction. If it matches a nop
+ * we replace the breakpoint with the nop. Otherwise we replace
+ * it with the call instruction.
+ */
+static int remove_breakpoint(struct dyn_ftrace *rec)
+{
+ unsigned char ins[MCOUNT_INSN_SIZE];
+ unsigned char brk = BREAKPOINT_INSTRUCTION;
+ const unsigned char *nop;
+ unsigned long ftrace_addr;
+ unsigned long ip = rec->ip;
- /* Make sure all running NMIs have finished before we write the code */
- smp_mb();
+ /* If we fail the read, just give up */
+ if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
- ftrace_mod_code();
+ /* If this does not have a breakpoint, we are done */
+ if (ins[0] != brk)
+ return 0;
- /* Make sure the write happens before clearing the bit */
- smp_mb();
+ nop = ftrace_nop_replace();
- clear_mod_flag();
- wait_for_nmi();
+ /*
+ * If the last 4 bytes of the instruction do not match
+ * a nop, then we assume that this is a call to ftrace_addr.
+ */
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
+ /*
+ * For extra paranoidism, we check if the breakpoint is on
+ * a call that would actually jump to the ftrace_addr.
+ * If not, don't touch the breakpoint, we make just create
+ * a disaster.
+ */
+ ftrace_addr = ftrace_get_addr_new(rec);
+ nop = ftrace_call_replace(ip, ftrace_addr);
+
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
+ goto update;
+
+ /* Check both ftrace_addr and ftrace_old_addr */
+ ftrace_addr = ftrace_get_addr_curr(rec);
+ nop = ftrace_call_replace(ip, ftrace_addr);
+
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
+ return -EINVAL;
+ }
- return mod_code_status;
+ update:
+ return ftrace_write(ip, nop, 1);
}
-static const unsigned char *ftrace_nop_replace(void)
+static int add_update_code(unsigned long ip, unsigned const char *new)
{
- return ideal_nops[NOP_ATOMIC5];
+ /* skip breakpoint */
+ ip++;
+ new++;
+ return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1);
}
-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
+static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned char replaced[MCOUNT_INSN_SIZE];
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
- /*
- * Note: Due to modules and __init, code can
- * disappear and change, we need to protect against faulting
- * as well as code changing. We do this by using the
- * probe_kernel_* functions.
- *
- * No real locking needed, this code is run through
- * kstop_machine, or before SMP starts.
- */
+ new = ftrace_call_replace(ip, addr);
+ return add_update_code(ip, new);
+}
- /* read the text we want to modify */
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
+static int add_update_nop(struct dyn_ftrace *rec)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
- /* Make sure it is what we expect it to be */
- if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
- return -EINVAL;
+ new = ftrace_nop_replace();
+ return add_update_code(ip, new);
+}
- /* replace the text with the new text */
- if (do_ftrace_mod_code(ip, new_code))
- return -EPERM;
+static int add_update(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
- sync_core();
+ ret = ftrace_test_record(rec, enable);
+
+ ftrace_addr = ftrace_get_addr_new(rec);
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return add_update_call(rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return add_update_nop(rec);
+ }
return 0;
}
-int ftrace_make_nop(struct module *mod,
- struct dyn_ftrace *rec, unsigned long addr)
+static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned const char *new, *old;
unsigned long ip = rec->ip;
+ unsigned const char *new;
- old = ftrace_call_replace(ip, addr);
- new = ftrace_nop_replace();
+ new = ftrace_call_replace(ip, addr);
- return ftrace_modify_code(rec->ip, old, new);
+ return ftrace_write(ip, new, 1);
}
-int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+static int finish_update_nop(struct dyn_ftrace *rec)
{
- unsigned const char *new, *old;
unsigned long ip = rec->ip;
+ unsigned const char *new;
- old = ftrace_nop_replace();
- new = ftrace_call_replace(ip, addr);
+ new = ftrace_nop_replace();
- return ftrace_modify_code(rec->ip, old, new);
+ return ftrace_write(ip, new, 1);
}
-int ftrace_update_ftrace_func(ftrace_func_t func)
+static int finish_update(struct dyn_ftrace *rec, int enable)
{
- unsigned long ip = (unsigned long)(&ftrace_call);
- unsigned char old[MCOUNT_INSN_SIZE], *new;
+ unsigned long ftrace_addr;
int ret;
- memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
- new = ftrace_call_replace(ip, (unsigned long)func);
- ret = ftrace_modify_code(ip, old, new);
+ ret = ftrace_update_record(rec, enable);
+
+ ftrace_addr = ftrace_get_addr_new(rec);
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return finish_update_call(rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return finish_update_nop(rec);
+ }
+
+ return 0;
+}
+
+static void do_sync_core(void *data)
+{
+ sync_core();
+}
+
+static void run_sync(void)
+{
+ int enable_irqs = irqs_disabled();
+
+ /* We may be called with interrupts disbled (on bootup). */
+ if (enable_irqs)
+ local_irq_enable();
+ on_each_cpu(do_sync_core, NULL, 1);
+ if (enable_irqs)
+ local_irq_disable();
+}
+
+void ftrace_replace_code(int enable)
+{
+ struct ftrace_rec_iter *iter;
+ struct dyn_ftrace *rec;
+ const char *report = "adding breakpoints";
+ int count = 0;
+ int ret;
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = add_breakpoints(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ count++;
+ }
+
+ run_sync();
+
+ report = "updating code";
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = add_update(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ }
+
+ run_sync();
+
+ report = "removing breakpoints";
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+ ret = finish_update(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ }
+
+ run_sync();
+
+ return;
+
+ remove_breakpoints:
+ pr_warn("Failed on %s (%d):\n", report, count);
+ ftrace_bug(ret, rec ? rec->ip : 0);
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+ /*
+ * Breakpoints are handled only when this function is in
+ * progress. The system could not work with them.
+ */
+ if (remove_breakpoint(rec))
+ BUG();
+ }
+ run_sync();
+}
+
+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code)
+{
+ int ret;
+
+ ret = add_break(ip, old_code);
+ if (ret)
+ goto out;
+
+ run_sync();
+
+ ret = add_update_code(ip, new_code);
+ if (ret)
+ goto fail_update;
+
+ run_sync();
+
+ ret = ftrace_write(ip, new_code, 1);
+ /*
+ * The breakpoint is handled only when this function is in progress.
+ * The system could not work if we could not remove it.
+ */
+ BUG_ON(ret);
+ out:
+ run_sync();
return ret;
+
+ fail_update:
+ /* Also here the system could not work with the breakpoint */
+ if (ftrace_write(ip, old_code, 1))
+ BUG();
+ goto out;
}
-int __init ftrace_dyn_arch_init(void *data)
+void arch_ftrace_update_code(int command)
{
- /* The return code is retured via data */
- *(unsigned long *)data = 0;
+ /* See comment above by declaration of modifying_ftrace_code */
+ atomic_inc(&modifying_ftrace_code);
+ ftrace_modify_all_code(command);
+
+ atomic_dec(&modifying_ftrace_code);
+}
+
+int __init ftrace_dyn_arch_init(void)
+{
return 0;
}
#endif
@@ -348,45 +651,41 @@ int __init ftrace_dyn_arch_init(void *data)
#ifdef CONFIG_DYNAMIC_FTRACE
extern void ftrace_graph_call(void);
-static int ftrace_mod_jmp(unsigned long ip,
- int old_offset, int new_offset)
+static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
{
- unsigned char code[MCOUNT_INSN_SIZE];
+ static union ftrace_code_union calc;
- if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
+ /* Jmp not a call (ignore the .e8) */
+ calc.e8 = 0xe9;
+ calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
- if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
- return -EINVAL;
+ /*
+ * ftrace external locks synchronize the access to the static variable.
+ */
+ return calc.code;
+}
- *(int *)(&code[1]) = new_offset;
+static int ftrace_mod_jmp(unsigned long ip, void *func)
+{
+ unsigned char *new;
- if (do_ftrace_mod_code(ip, &code))
- return -EPERM;
+ new = ftrace_jmp_replace(ip, (unsigned long)func);
- return 0;
+ return update_ftrace_func(ip, new);
}
int ftrace_enable_ftrace_graph_caller(void)
{
unsigned long ip = (unsigned long)(&ftrace_graph_call);
- int old_offset, new_offset;
- old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
- new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
-
- return ftrace_mod_jmp(ip, old_offset, new_offset);
+ return ftrace_mod_jmp(ip, &ftrace_graph_caller);
}
int ftrace_disable_ftrace_graph_caller(void)
{
unsigned long ip = (unsigned long)(&ftrace_graph_call);
- int old_offset, new_offset;
-
- old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
- new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
- return ftrace_mod_jmp(ip, old_offset, new_offset);
+ return ftrace_mod_jmp(ip, &ftrace_stub);
}
#endif /* !CONFIG_DYNAMIC_FTRACE */
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 48d9d4ea102..992f442ca15 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -5,8 +5,6 @@
#include <asm/setup.h>
#include <asm/bios_ebda.h>
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
/*
* The BIOS places the EBDA/XBDA at the top of conventional
* memory, and usually decreases the reported amount of
@@ -16,17 +14,30 @@
* chipset: reserve a page before VGA to prevent PCI prefetch
* into it (errata #56). Usually the page is reserved anyways,
* unless you have no PS/2 mouse plugged in.
+ *
+ * This functions is deliberately very conservative. Losing
+ * memory in the bottom megabyte is rarely a problem, as long
+ * as we have enough memory to install the trampoline. Using
+ * memory that is in use by the BIOS or by some DMA device
+ * the BIOS didn't shut down *is* a big problem.
*/
+
+#define BIOS_LOWMEM_KILOBYTES 0x413
+#define LOWMEM_CAP 0x9f000U /* Absolute maximum */
+#define INSANE_CUTOFF 0x20000U /* Less than this = insane */
+
void __init reserve_ebda_region(void)
{
unsigned int lowmem, ebda_addr;
- /* To determine the position of the EBDA and the */
- /* end of conventional memory, we need to look at */
- /* the BIOS data area. In a paravirtual environment */
- /* that area is absent. We'll just have to assume */
- /* that the paravirt case can handle memory setup */
- /* correctly, without our help. */
+ /*
+ * To determine the position of the EBDA and the
+ * end of conventional memory, we need to look at
+ * the BIOS data area. In a paravirtual environment
+ * that area is absent. We'll just have to assume
+ * that the paravirt case can handle memory setup
+ * correctly, without our help.
+ */
if (paravirt_enabled())
return;
@@ -37,19 +48,23 @@ void __init reserve_ebda_region(void)
/* start of EBDA area */
ebda_addr = get_bios_ebda();
- /* Fixup: bios puts an EBDA in the top 64K segment */
- /* of conventional memory, but does not adjust lowmem. */
- if ((lowmem - ebda_addr) <= 0x10000)
- lowmem = ebda_addr;
+ /*
+ * Note: some old Dells seem to need 4k EBDA without
+ * reporting so, so just consider the memory above 0x9f000
+ * to be off limits (bugzilla 2990).
+ */
+
+ /* If the EBDA address is below 128K, assume it is bogus */
+ if (ebda_addr < INSANE_CUTOFF)
+ ebda_addr = LOWMEM_CAP;
- /* Fixup: bios does not report an EBDA at all. */
- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
- lowmem = 0x9f000;
+ /* If lowmem is less than 128K, assume it is bogus */
+ if (lowmem < INSANE_CUTOFF)
+ lowmem = LOWMEM_CAP;
- /* Paranoia: should never happen, but... */
- if ((lowmem == 0) || (lowmem >= 0x100000))
- lowmem = 0x9f000;
+ /* Use the lower of the lowmem and EBDA markers as the cutoff */
+ lowmem = min(lowmem, ebda_addr);
+ lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */
/* reserve all memory between lowmem and the 1MB mark */
memblock_reserve(lowmem, 0x100000 - lowmem);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 51ff18616d5..d6c1b983699 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -14,11 +14,11 @@
#include <asm/sections.h>
#include <asm/e820.h>
#include <asm/page.h>
-#include <asm/trampoline.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/bios_ebda.h>
#include <asm/tlbflush.h>
+#include <asm/bootparam_utils.h>
static void __init i386_default_early_setup(void)
{
@@ -29,26 +29,14 @@ static void __init i386_default_early_setup(void)
reserve_ebda_region();
}
-void __init i386_start_kernel(void)
+asmlinkage __visible void __init i386_start_kernel(void)
{
- memblock_reserve(__pa_symbol(&_text),
- __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
-
-#ifdef CONFIG_BLK_DEV_INITRD
- /* Reserve INITRD */
- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
- /* Assume only end is not page aligned */
- u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
- u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
- }
-#endif
+ sanitize_boot_params(&boot_params);
/* Call the subarch specific early setup function */
switch (boot_params.hdr.hardware_subarch) {
- case X86_SUBARCH_MRST:
- x86_mrst_early_setup();
+ case X86_SUBARCH_INTEL_MID:
+ x86_intel_mid_early_setup();
break;
case X86_SUBARCH_CE4100:
x86_ce4100_early_setup();
@@ -58,11 +46,5 @@ void __init i386_start_kernel(void)
break;
}
- /*
- * At this point everything still needed from the boot loader
- * or BIOS or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
-
start_kernel();
}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 3a3b779f41d..eda1a865641 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -24,14 +24,86 @@
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/e820.h>
-#include <asm/trampoline.h>
#include <asm/bios_ebda.h>
+#include <asm/bootparam_utils.h>
+#include <asm/microcode.h>
-static void __init zap_identity_mappings(void)
+/*
+ * Manage page tables very early on.
+ */
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt = 2;
+pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
+{
+ unsigned long i;
+
+ for (i = 0; i < PTRS_PER_PGD-1; i++)
+ early_level4_pgt[i].pgd = 0;
+
+ next_early_pgt = 0;
+
+ write_cr3(__pa(early_level4_pgt));
+}
+
+/* Create a new PMD entry */
+int __init early_make_pgtable(unsigned long address)
{
- pgd_t *pgd = pgd_offset_k(0UL);
- pgd_clear(pgd);
- __flush_tlb_all();
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ unsigned long i;
+ pgdval_t pgd, *pgd_p;
+ pudval_t pud, *pud_p;
+ pmdval_t pmd, *pmd_p;
+
+ /* Invalid address or early pgt is done ? */
+ if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+ return -1;
+
+again:
+ pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
+ pgd = *pgd_p;
+
+ /*
+ * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+ * critical -- __PAGE_OFFSET would point us back into the dynamic
+ * range and we might end up looping forever...
+ */
+ if (pgd)
+ pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ pud_p[i] = 0;
+ *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ pud_p += pud_index(address);
+ pud = *pud_p;
+
+ if (pud)
+ pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ pmd_p[i] = 0;
+ *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ pmd = (physaddr & PMD_MASK) + early_pmd_flags;
+ pmd_p[pmd_index(address)] = pmd;
+
+ return 0;
}
/* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -42,18 +114,30 @@ static void __init clear_bss(void)
(unsigned long) __bss_stop - (unsigned long) __bss_start);
}
+static unsigned long get_cmd_line_ptr(void)
+{
+ unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+ cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32;
+
+ return cmd_line_ptr;
+}
+
static void __init copy_bootdata(char *real_mode_data)
{
char * command_line;
+ unsigned long cmd_line_ptr;
memcpy(&boot_params, real_mode_data, sizeof boot_params);
- if (boot_params.hdr.cmd_line_ptr) {
- command_line = __va(boot_params.hdr.cmd_line_ptr);
+ sanitize_boot_params(&boot_params);
+ cmd_line_ptr = get_cmd_line_ptr();
+ if (cmd_line_ptr) {
+ command_line = __va(cmd_line_ptr);
memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
}
}
-void __init x86_64_start_kernel(char * real_mode_data)
+asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
{
int i;
@@ -61,64 +145,50 @@ void __init x86_64_start_kernel(char * real_mode_data)
* Build-time sanity checks on the kernel image and module
* area mappings. (these are purely build-time and produce no code)
*/
- BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START);
- BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE);
+ BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map);
+ BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE);
BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
- BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0);
+ BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
+ /* Kill off the identity-map trampoline */
+ reset_early_page_tables();
+
/* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
- /* Make NULL pointers segfault */
- zap_identity_mappings();
+ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
+ set_intr_gate(i, early_idt_handlers[i]);
+ load_idt((const struct desc_ptr *)&idt_descr);
- max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
+ copy_bootdata(__va(real_mode_data));
- for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
-#ifdef CONFIG_EARLY_PRINTK
- set_intr_gate(i, &early_idt_handlers[i]);
-#else
- set_intr_gate(i, early_idt_handler);
-#endif
- }
- load_idt((const struct desc_ptr *)&idt_descr);
+ /*
+ * Load microcode early on BSP.
+ */
+ load_ucode_bsp();
- if (console_loglevel == 10)
+ if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
early_printk("Kernel alive\n");
+ clear_page(init_level4_pgt);
+ /* set init_level4_pgt kernel high mapping*/
+ init_level4_pgt[511] = early_level4_pgt[511];
+
x86_64_start_reservations(real_mode_data);
}
void __init x86_64_start_reservations(char *real_mode_data)
{
- copy_bootdata(__va(real_mode_data));
-
- memblock_reserve(__pa_symbol(&_text),
- __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
-
-#ifdef CONFIG_BLK_DEV_INITRD
- /* Reserve INITRD */
- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
- /* Assume only end is not page aligned */
- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
- unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
- }
-#endif
+ /* version is always not zero if it is copied */
+ if (!boot_params.hdr.version)
+ copy_bootdata(__va(real_mode_data));
reserve_ebda_region();
- /*
- * At this point everything still needed from the boot loader
- * or BIOS or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
-
start_kernel();
}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index ce0be7cd085..f36bd42d6f0 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -21,6 +21,7 @@
#include <asm/msr-index.h>
#include <asm/cpufeature.h>
#include <asm/percpu.h>
+#include <asm/nops.h>
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
@@ -143,6 +144,11 @@ ENTRY(startup_32)
movl %eax, pa(olpc_ofw_pgd)
#endif
+#ifdef CONFIG_MICROCODE_EARLY
+ /* Early load ucode on BSP. */
+ call load_ucode_bsp
+#endif
+
/*
* Initialize page tables. This creates a PDE and a set of page
* tables, which are located immediately beyond __brk_base. The variable
@@ -265,6 +271,19 @@ num_subarch_entries = (. - subarch_entries) / 4
jmp default_entry
#endif /* CONFIG_PARAVIRT */
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
+ * up already except stack. We just set up stack here. Then call
+ * start_secondary().
+ */
+ENTRY(start_cpu0)
+ movl stack_start, %ecx
+ movl %ecx, %esp
+ jmp *(initial_code)
+ENDPROC(start_cpu0)
+#endif
+
/*
* Non-boot CPU entry point; entered from trampoline.S
* We can't lgdt here, because lgdt itself uses a data segment, but
@@ -273,10 +292,6 @@ num_subarch_entries = (. - subarch_entries) / 4
* If cpu hotplug is not supported then this code can go in init section
* which will be freed later
*/
-
-__CPUINIT
-
-#ifdef CONFIG_SMP
ENTRY(startup_32_smp)
cld
movl $(__BOOT_DS),%eax
@@ -287,33 +302,60 @@ ENTRY(startup_32_smp)
movl pa(stack_start),%ecx
movl %eax,%ss
leal -__PAGE_OFFSET(%ecx),%esp
-#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_MICROCODE_EARLY
+ /* Early load ucode on AP. */
+ call load_ucode_ap
+#endif
+
+
default_entry:
+#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
+ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
+ X86_CR0_PG)
+ movl $(CR0_STATE & ~X86_CR0_PG),%eax
+ movl %eax,%cr0
/*
- * New page tables may be in 4Mbyte page mode and may
- * be using the global pages.
- *
- * NOTE! If we are on a 486 we may have no cr4 at all!
- * So we do not try to touch it unless we really have
- * some bits in it to set. This won't work if the BSP
- * implements cr4 but this AP does not -- very unlikely
- * but be warned! The same applies to the pse feature
- * if not equally supported. --macro
+ * We want to start out with EFLAGS unambiguously cleared. Some BIOSes leave
+ * bits like NT set. This would confuse the debugger if this code is traced. So
+ * initialize them properly now before switching to protected mode. That means
+ * DF in particular (even though we have cleared it earlier after copying the
+ * command line) because GCC expects it.
+ */
+ pushl $0
+ popfl
+
+/*
+ * New page tables may be in 4Mbyte page mode and may be using the global pages.
*
- * NOTE! We have to correct for the fact that we're
- * not yet offset PAGE_OFFSET..
+ * NOTE! If we are on a 486 we may have no cr4 at all! Specifically, cr4 exists
+ * if and only if CPUID exists and has flags other than the FPU flag set.
*/
-#define cr4_bits pa(mmu_cr4_features)
- movl cr4_bits,%edx
- andl %edx,%edx
- jz 6f
- movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
- orl %edx,%eax
+ movl $-1,pa(X86_CPUID) # preset CPUID level
+ movl $X86_EFLAGS_ID,%ecx
+ pushl %ecx
+ popfl # set EFLAGS=ID
+ pushfl
+ popl %eax # get EFLAGS
+ testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set?
+ jz enable_paging # hw disallowed setting of ID bit
+ # which means no CPUID and no CR4
+
+ xorl %eax,%eax
+ cpuid
+ movl %eax,pa(X86_CPUID) # save largest std CPUID function
+
+ movl $1,%eax
+ cpuid
+ andl $~1,%edx # Ignore CPUID.FPU
+ jz enable_paging # No flags or only CPUID.FPU = no CR4
+
+ movl pa(mmu_cr4_features),%eax
movl %eax,%cr4
testb $X86_CR4_PAE, %al # check if PAE is enabled
- jz 6f
+ jz enable_paging
/* Check if extended functions are implemented */
movl $0x80000000, %eax
@@ -321,7 +363,7 @@ default_entry:
/* Value must be in the range 0x80000001 to 0x8000ffff */
subl $0x80000001, %eax
cmpl $(0x8000ffff-0x80000001), %eax
- ja 6f
+ ja enable_paging
/* Clear bogus XD_DISABLE bits */
call verify_cpu
@@ -330,7 +372,7 @@ default_entry:
cpuid
/* Execute Disable bit supported? */
btl $(X86_FEATURE_NX & 31), %edx
- jnc 6f
+ jnc enable_paging
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
@@ -340,15 +382,14 @@ default_entry:
/* Make changes effective */
wrmsr
-6:
+enable_paging:
/*
* Enable paging
*/
movl $pa(initial_page_table), %eax
movl %eax,%cr3 /* set the page table pointer.. */
- movl %cr0,%eax
- orl $X86_CR0_PG,%eax
+ movl $CR0_STATE,%eax
movl %eax,%cr0 /* ..and set paging (PG) bit */
ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
1:
@@ -356,52 +397,20 @@ default_entry:
addl $__PAGE_OFFSET, %esp
/*
- * Initialize eflags. Some BIOS's leave bits like NT set. This would
- * confuse the debugger if this code is traced.
- * XXX - best to initialize before switching to protected mode.
- */
- pushl $0
- popfl
-
-#ifdef CONFIG_SMP
- cmpb $0, ready
- jnz checkCPUtype
-#endif /* CONFIG_SMP */
-
-/*
* start system 32-bit setup. We need to re-do some of the things done
* in 16-bit mode for the "real" operations.
*/
- call setup_idt
-
-checkCPUtype:
-
- movl $-1,X86_CPUID # -1 for no CPUID initially
+ movl setup_once_ref,%eax
+ andl %eax,%eax
+ jz 1f # Did we do this already?
+ call *%eax
+1:
-/* check if it is 486 or 386. */
/*
- * XXX - this does a lot of unnecessary setup. Alignment checks don't
- * apply at our cpl of 0 and the stack ought to be aligned already, and
- * we don't need to preserve eflags.
+ * Check if it is 486
*/
-
- movb $3,X86 # at least 386
- pushfl # push EFLAGS
- popl %eax # get EFLAGS
- movl %eax,%ecx # save original EFLAGS
- xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
- pushl %eax # copy to EFLAGS
- popfl # set EFLAGS
- pushfl # get new EFLAGS
- popl %eax # put it in eax
- xorl %ecx,%eax # change in flags
- pushl %ecx # restore original EFLAGS
- popfl
- testl $0x40000,%eax # check if AC bit changed
- je is386
-
- movb $4,X86 # at least 486
- testl $0x200000,%eax # check if ID bit changed
+ movb $4,X86 # at least 486
+ cmpl $-1,X86_CPUID
je is486
/* get vendor info */
@@ -427,16 +436,13 @@ checkCPUtype:
movb %cl,X86_MASK
movl %edx,X86_CAPABILITY
-is486: movl $0x50022,%ecx # set AM, WP, NE and MP
- jmp 2f
-
-is386: movl $2,%ecx # set MP
-2: movl %cr0,%eax
+is486:
+ movl $0x50022,%ecx # set AM, WP, NE and MP
+ movl %cr0,%eax
andl $0x80000011,%eax # Save PG,PE,ET
orl %ecx,%eax
movl %eax,%cr0
- call check_x87
lgdt early_gdt_descr
lidt idt_descr
ljmp $(__KERNEL_CS),$1f
@@ -450,126 +456,134 @@ is386: movl $2,%ecx # set MP
movl $(__KERNEL_PERCPU), %eax
movl %eax,%fs # set this cpu's percpu
-#ifdef CONFIG_CC_STACKPROTECTOR
- /*
- * The linker can't handle this by relocation. Manually set
- * base address in stack canary segment descriptor.
- */
- cmpb $0,ready
- jne 1f
- movl $gdt_page,%eax
- movl $stack_canary,%ecx
- movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
- shrl $16, %ecx
- movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
- movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
-1:
-#endif
movl $(__KERNEL_STACK_CANARY),%eax
movl %eax,%gs
xorl %eax,%eax # Clear LDT
lldt %ax
- cld # gcc2 wants the direction flag cleared at all times
pushl $0 # fake return address for unwinder
- movb $1, ready
jmp *(initial_code)
-/*
- * We depend on ET to be correct. This checks for 287/387.
- */
-check_x87:
- movb $0,X86_HARD_MATH
- clts
- fninit
- fstsw %ax
- cmpb $0,%al
- je 1f
- movl %cr0,%eax /* no coprocessor: have to set bits */
- xorl $4,%eax /* set EM */
- movl %eax,%cr0
- ret
- ALIGN
-1: movb $1,X86_HARD_MATH
- .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
- ret
+#include "verify_cpu.S"
/*
- * setup_idt
+ * setup_once
*
- * sets up a idt with 256 entries pointing to
- * ignore_int, interrupt gates. It doesn't actually load
- * idt - that can be done only after paging has been enabled
- * and the kernel moved to PAGE_OFFSET. Interrupts
- * are enabled elsewhere, when we can be relatively
- * sure everything is ok.
+ * The setup work we only want to run on the BSP.
*
* Warning: %esi is live across this function.
*/
-setup_idt:
- lea ignore_int,%edx
- movl $(__KERNEL_CS << 16),%eax
- movw %dx,%ax /* selector = 0x0010 = cs */
- movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
+__INIT
+setup_once:
+ /*
+ * Set up a idt with 256 entries pointing to ignore_int,
+ * interrupt gates. It doesn't actually load idt - that needs
+ * to be done on each CPU. Interrupts are enabled elsewhere,
+ * when we can be relatively sure everything is ok.
+ */
- lea idt_table,%edi
- mov $256,%ecx
-rp_sidt:
+ movl $idt_table,%edi
+ movl $early_idt_handlers,%eax
+ movl $NUM_EXCEPTION_VECTORS,%ecx
+1:
movl %eax,(%edi)
- movl %edx,4(%edi)
+ movl %eax,4(%edi)
+ /* interrupt gate, dpl=0, present */
+ movl $(0x8E000000 + __KERNEL_CS),2(%edi)
+ addl $9,%eax
addl $8,%edi
- dec %ecx
- jne rp_sidt
+ loop 1b
-.macro set_early_handler handler,trapno
- lea \handler,%edx
+ movl $256 - NUM_EXCEPTION_VECTORS,%ecx
+ movl $ignore_int,%edx
movl $(__KERNEL_CS << 16),%eax
- movw %dx,%ax
+ movw %dx,%ax /* selector = 0x0010 = cs */
movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
- lea idt_table,%edi
- movl %eax,8*\trapno(%edi)
- movl %edx,8*\trapno+4(%edi)
-.endm
+2:
+ movl %eax,(%edi)
+ movl %edx,4(%edi)
+ addl $8,%edi
+ loop 2b
- set_early_handler handler=early_divide_err,trapno=0
- set_early_handler handler=early_illegal_opcode,trapno=6
- set_early_handler handler=early_protection_fault,trapno=13
- set_early_handler handler=early_page_fault,trapno=14
+#ifdef CONFIG_CC_STACKPROTECTOR
+ /*
+ * Configure the stack canary. The linker can't handle this by
+ * relocation. Manually set base address in stack canary
+ * segment descriptor.
+ */
+ movl $gdt_page,%eax
+ movl $stack_canary,%ecx
+ movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
+ shrl $16, %ecx
+ movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
+ movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
+#endif
+ andl $0,setup_once_ref /* Once is enough, thanks */
ret
-early_divide_err:
- xor %edx,%edx
- pushl $0 /* fake errcode */
- jmp early_fault
-
-early_illegal_opcode:
- movl $6,%edx
- pushl $0 /* fake errcode */
- jmp early_fault
+ENTRY(early_idt_handlers)
+ # 36(%esp) %eflags
+ # 32(%esp) %cs
+ # 28(%esp) %eip
+ # 24(%rsp) error code
+ i = 0
+ .rept NUM_EXCEPTION_VECTORS
+ .if (EXCEPTION_ERRCODE_MASK >> i) & 1
+ ASM_NOP2
+ .else
+ pushl $0 # Dummy error code, to make stack frame uniform
+ .endif
+ pushl $i # 20(%esp) Vector number
+ jmp early_idt_handler
+ i = i + 1
+ .endr
+ENDPROC(early_idt_handlers)
+
+ /* This is global to keep gas from relaxing the jumps */
+ENTRY(early_idt_handler)
+ cld
-early_protection_fault:
- movl $13,%edx
- jmp early_fault
+ cmpl $2,(%esp) # X86_TRAP_NMI
+ je is_nmi # Ignore NMI
-early_page_fault:
- movl $14,%edx
- jmp early_fault
+ cmpl $2,%ss:early_recursion_flag
+ je hlt_loop
+ incl %ss:early_recursion_flag
-early_fault:
- cld
-#ifdef CONFIG_PRINTK
- pusha
+ push %eax # 16(%esp)
+ push %ecx # 12(%esp)
+ push %edx # 8(%esp)
+ push %ds # 4(%esp)
+ push %es # 0(%esp)
movl $(__KERNEL_DS),%eax
movl %eax,%ds
movl %eax,%es
- cmpl $2,early_recursion_flag
- je hlt_loop
- incl early_recursion_flag
+
+ cmpl $(__KERNEL_CS),32(%esp)
+ jne 10f
+
+ leal 28(%esp),%eax # Pointer to %eip
+ call early_fixup_exception
+ andl %eax,%eax
+ jnz ex_entry /* found an exception entry */
+
+10:
+#ifdef CONFIG_PRINTK
+ xorl %eax,%eax
+ movw %ax,2(%esp) /* clean up the segment values on some cpus */
+ movw %ax,6(%esp)
+ movw %ax,34(%esp)
+ leal 40(%esp),%eax
+ pushl %eax /* %esp before the exception */
+ pushl %ebx
+ pushl %ebp
+ pushl %esi
+ pushl %edi
movl %cr2,%eax
pushl %eax
- pushl %edx /* trapno */
+ pushl (20+6*4)(%esp) /* trapno */
pushl $fault_msg
call printk
#endif
@@ -578,6 +592,18 @@ hlt_loop:
hlt
jmp hlt_loop
+ex_entry:
+ pop %es
+ pop %ds
+ pop %edx
+ pop %ecx
+ pop %eax
+ decl %ss:early_recursion_flag
+is_nmi:
+ addl $8,%esp /* drop vector number and error code */
+ iret
+ENDPROC(early_idt_handler)
+
/* This is the default interrupt "handler" :-) */
ALIGN
ignore_int:
@@ -611,13 +637,18 @@ ignore_int:
popl %eax
#endif
iret
+ENDPROC(ignore_int)
+__INITDATA
+ .align 4
+early_recursion_flag:
+ .long 0
-#include "verify_cpu.S"
-
- __REFDATA
-.align 4
+__REFDATA
+ .align 4
ENTRY(initial_code)
.long i386_start_kernel
+ENTRY(setup_once_ref)
+ .long setup_once
/*
* BSS section
@@ -670,22 +701,19 @@ ENTRY(initial_page_table)
ENTRY(stack_start)
.long init_thread_union+THREAD_SIZE
-early_recursion_flag:
- .long 0
-
-ready: .byte 0
-
+__INITRODATA
int_msg:
.asciz "Unknown interrupt or fault at: %p %p %p\n"
fault_msg:
/* fault info: */
.ascii "BUG: Int %d: CR2 %p\n"
-/* pusha regs: */
- .ascii " EDI %p ESI %p EBP %p ESP %p\n"
- .ascii " EBX %p EDX %p ECX %p EAX %p\n"
+/* regs pushed in early_idt_handler: */
+ .ascii " EDI %p ESI %p EBP %p EBX %p\n"
+ .ascii " ESP %p ES %p DS %p\n"
+ .ascii " EDX %p ECX %p EAX %p\n"
/* fault frame: */
- .ascii " err %p EIP %p CS %p flg %p\n"
+ .ascii " vec %p err %p EIP %p CS %p flg %p\n"
.ascii "Stack: %p %p %p %p %p %p %p %p\n"
.ascii " %p %p %p %p %p %p %p %p\n"
.asciz " %p %p %p %p %p %p %p %p\n"
@@ -699,6 +727,7 @@ fault_msg:
* segment size, and 32-bit linear address value:
*/
+ .data
.globl boot_gdt_descr
.globl idt_descr
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 40f4eb3766d..a468c0a65c4 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,12 +19,15 @@
#include <asm/cache.h>
#include <asm/processor-flags.h>
#include <asm/percpu.h>
+#include <asm/nops.h>
#ifdef CONFIG_PARAVIRT
#include <asm/asm-offsets.h>
#include <asm/paravirt.h>
+#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
#else
-#define GET_CR2_INTO_RCX movq %cr2, %rcx
+#define GET_CR2_INTO(reg) movq %cr2, reg
+#define INTERRUPT_RETURN iretq
#endif
/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
@@ -44,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
.code64
.globl startup_64
startup_64:
-
/*
- * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+ * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded an identity mapped page table
* for us. These identity mapped page tables map all of the
* kernel pages and possibly all of memory.
*
- * %esi holds a physical pointer to real_mode_data.
+ * %rsi holds a physical pointer to real_mode_data.
*
* We come here either directly from a 64bit bootloader, or from
* arch/x86_64/boot/compressed/head.S.
@@ -63,7 +65,8 @@ startup_64:
* tables and then reload them.
*/
- /* Compute the delta between the address I am compiled to run at and the
+ /*
+ * Compute the delta between the address I am compiled to run at and the
* address I am actually running at.
*/
leaq _text(%rip), %rbp
@@ -75,45 +78,64 @@ startup_64:
testl %eax, %eax
jnz bad_address
- /* Is the address too large? */
- leaq _text(%rip), %rdx
- movq $PGDIR_SIZE, %rax
- cmpq %rax, %rdx
- jae bad_address
-
- /* Fixup the physical addresses in the page table
+ /*
+ * Is the address too large?
*/
- addq %rbp, init_level4_pgt + 0(%rip)
- addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
- addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
+ leaq _text(%rip), %rax
+ shrq $MAX_PHYSMEM_BITS, %rax
+ jnz bad_address
- addq %rbp, level3_ident_pgt + 0(%rip)
+ /*
+ * Fixup the physical addresses in the page table
+ */
+ addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
addq %rbp, level3_kernel_pgt + (510*8)(%rip)
addq %rbp, level3_kernel_pgt + (511*8)(%rip)
addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
- /* Add an Identity mapping if I am above 1G */
+ /*
+ * Set up the identity mapping for the switchover. These
+ * entries should *NOT* have the global bit set! This also
+ * creates a bunch of nonsense entries but that is fine --
+ * it avoids problems around wraparound.
+ */
leaq _text(%rip), %rdi
- andq $PMD_PAGE_MASK, %rdi
+ leaq early_level4_pgt(%rip), %rbx
movq %rdi, %rax
- shrq $PUD_SHIFT, %rax
- andq $(PTRS_PER_PUD - 1), %rax
- jz ident_complete
+ shrq $PGDIR_SHIFT, %rax
+
+ leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx
+ movq %rdx, 0(%rbx,%rax,8)
+ movq %rdx, 8(%rbx,%rax,8)
- leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
- leaq level3_ident_pgt(%rip), %rbx
- movq %rdx, 0(%rbx, %rax, 8)
+ addq $4096, %rdx
+ movq %rdi, %rax
+ shrq $PUD_SHIFT, %rax
+ andl $(PTRS_PER_PUD-1), %eax
+ movq %rdx, 4096(%rbx,%rax,8)
+ incl %eax
+ andl $(PTRS_PER_PUD-1), %eax
+ movq %rdx, 4096(%rbx,%rax,8)
+ addq $8192, %rbx
movq %rdi, %rax
- shrq $PMD_SHIFT, %rax
- andq $(PTRS_PER_PMD - 1), %rax
- leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
- leaq level2_spare_pgt(%rip), %rbx
- movq %rdx, 0(%rbx, %rax, 8)
-ident_complete:
+ shrq $PMD_SHIFT, %rdi
+ addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
+ leaq (_end - 1)(%rip), %rcx
+ shrq $PMD_SHIFT, %rcx
+ subq %rdi, %rcx
+ incl %ecx
+
+1:
+ andq $(PTRS_PER_PMD - 1), %rdi
+ movq %rax, (%rbx,%rdi,8)
+ incq %rdi
+ addq $PMD_SIZE, %rax
+ decl %ecx
+ jnz 1b
/*
* Fixup the kernel text+data virtual addresses. Note that
@@ -121,7 +143,6 @@ ident_complete:
* cleanup_highmap() fixes this up along with the mappings
* beyond _end.
*/
-
leaq level2_kernel_pgt(%rip), %rdi
leaq 4096(%rdi), %r8
/* See if it is a valid page table entry */
@@ -136,21 +157,14 @@ ident_complete:
/* Fixup phys_base */
addq %rbp, phys_base(%rip)
- /* Fixup trampoline */
- addq %rbp, trampoline_level4_pgt + 0(%rip)
- addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
-
- /* Due to ENTRY(), sometimes the empty space gets filled with
- * zeros. Better take a jmp than relying on empty space being
- * filled with 0x90 (nop)
- */
- jmp secondary_startup_64
+ movq $(early_level4_pgt - __START_KERNEL_map), %rax
+ jmp 1f
ENTRY(secondary_startup_64)
/*
- * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+ * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded a mapped page table.
*
- * %esi holds a physical pointer to real_mode_data.
+ * %rsi holds a physical pointer to real_mode_data.
*
* We come here either from startup_64 (using physical addresses)
* or from trampoline.S (using virtual addresses).
@@ -160,12 +174,14 @@ ENTRY(secondary_startup_64)
* after the boot processor executes this code.
*/
+ movq $(init_level4_pgt - __START_KERNEL_map), %rax
+1:
+
/* Enable PAE mode and PGE */
- movl $(X86_CR4_PAE | X86_CR4_PGE), %eax
- movq %rax, %cr4
+ movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+ movq %rcx, %cr4
/* Setup early boot stage 4 level pagetables. */
- movq $(init_level4_pgt - __START_KERNEL_map), %rax
addq phys_base(%rip), %rax
movq %rax, %cr3
@@ -186,6 +202,7 @@ ENTRY(secondary_startup_64)
btl $20,%edi /* No Execute supported? */
jnc 1f
btsl $_EFER_NX, %eax
+ btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
1: wrmsr /* Make changes effective */
/* Setup cr0 */
@@ -197,7 +214,7 @@ ENTRY(secondary_startup_64)
movq %rax, %cr0
/* Setup a boot time stack */
- movq stack_start(%rip),%rsp
+ movq stack_start(%rip), %rsp
/* zero EFLAGS after setting rsp */
pushq $0
@@ -237,15 +254,33 @@ ENTRY(secondary_startup_64)
movl initial_gs+4(%rip),%edx
wrmsr
- /* esi is pointer to real mode structure with interesting info.
+ /* rsi is pointer to real mode structure with interesting info.
pass it to C */
- movl %esi, %edi
+ movq %rsi, %rdi
/* Finally jump to run C code and to be on real kernel address
* Since we are running on identity-mapped space we have to jump
* to the full 64bit address, this is only possible as indirect
* jump. In addition we need to ensure %cs is set so we make this
* a far return.
+ *
+ * Note: do not change to far jump indirect with 64bit offset.
+ *
+ * AMD does not support far jump indirect with 64bit offset.
+ * AMD64 Architecture Programmer's Manual, Volume 3: states only
+ * JMP FAR mem16:16 FF /5 Far jump indirect,
+ * with the target specified by a far pointer in memory.
+ * JMP FAR mem16:32 FF /5 Far jump indirect,
+ * with the target specified by a far pointer in memory.
+ *
+ * Intel64 does support 64bit offset.
+ * Software Developer Manual Vol 2: states:
+ * FF /5 JMP m16:16 Jump far, absolute indirect,
+ * address given in m16:16
+ * FF /5 JMP m16:32 Jump far, absolute indirect,
+ * address given in m16:32.
+ * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
+ * address given in m16:64.
*/
movq initial_code(%rip),%rax
pushq $0 # fake return address to stop unwinder
@@ -253,15 +288,31 @@ ENTRY(secondary_startup_64)
pushq %rax # target address in negative space
lretq
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
+ * up already except stack. We just set up stack here. Then call
+ * start_secondary().
+ */
+ENTRY(start_cpu0)
+ movq stack_start(%rip),%rsp
+ movq initial_code(%rip),%rax
+ pushq $0 # fake return address to stop unwinder
+ pushq $__KERNEL_CS # set correct cs
+ pushq %rax # target address in negative space
+ lretq
+ENDPROC(start_cpu0)
+#endif
+
/* SMP bootup changes these two */
__REFDATA
- .align 8
- ENTRY(initial_code)
+ .balign 8
+ GLOBAL(initial_code)
.quad x86_64_start_kernel
- ENTRY(initial_gs)
+ GLOBAL(initial_gs)
.quad INIT_PER_CPU_VAR(irq_stack_union)
- ENTRY(stack_start)
+ GLOBAL(stack_start)
.quad init_thread_union+THREAD_SIZE-8
.word 0
__FINITDATA
@@ -269,37 +320,69 @@ ENTRY(secondary_startup_64)
bad_address:
jmp bad_address
- .section ".init.text","ax"
-#ifdef CONFIG_EARLY_PRINTK
+ __INIT
.globl early_idt_handlers
early_idt_handlers:
+ # 104(%rsp) %rflags
+ # 96(%rsp) %cs
+ # 88(%rsp) %rip
+ # 80(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- movl $i, %esi
+ .if (EXCEPTION_ERRCODE_MASK >> i) & 1
+ ASM_NOP2
+ .else
+ pushq $0 # Dummy error code, to make stack frame uniform
+ .endif
+ pushq $i # 72(%rsp) Vector number
jmp early_idt_handler
i = i + 1
.endr
-#endif
+/* This is global to keep gas from relaxing the jumps */
ENTRY(early_idt_handler)
-#ifdef CONFIG_EARLY_PRINTK
+ cld
+
+ cmpl $2,(%rsp) # X86_TRAP_NMI
+ je is_nmi # Ignore NMI
+
cmpl $2,early_recursion_flag(%rip)
jz 1f
incl early_recursion_flag(%rip)
- GET_CR2_INTO_RCX
- movq %rcx,%r9
- xorl %r8d,%r8d # zero for error code
- movl %esi,%ecx # get vector number
- # Test %ecx against mask of vectors that push error code.
- cmpl $31,%ecx
- ja 0f
- movl $1,%eax
- salq %cl,%rax
- testl $0x27d00,%eax
- je 0f
- popq %r8 # get error code
-0: movq 0(%rsp),%rcx # get ip
- movq 8(%rsp),%rdx # get cs
+
+ pushq %rax # 64(%rsp)
+ pushq %rcx # 56(%rsp)
+ pushq %rdx # 48(%rsp)
+ pushq %rsi # 40(%rsp)
+ pushq %rdi # 32(%rsp)
+ pushq %r8 # 24(%rsp)
+ pushq %r9 # 16(%rsp)
+ pushq %r10 # 8(%rsp)
+ pushq %r11 # 0(%rsp)
+
+ cmpl $__KERNEL_CS,96(%rsp)
+ jne 11f
+
+ cmpl $14,72(%rsp) # Page fault?
+ jnz 10f
+ GET_CR2_INTO(%rdi) # can clobber any volatile register if pv
+ call early_make_pgtable
+ andl %eax,%eax
+ jz 20f # All good
+
+10:
+ leaq 88(%rsp),%rdi # Pointer to %rip
+ call early_fixup_exception
+ andl %eax,%eax
+ jnz 20f # Found an exception entry
+
+11:
+#ifdef CONFIG_EARLY_PRINTK
+ GET_CR2_INTO(%r9) # can clobber any volatile register if pv
+ movl 80(%rsp),%r8d # error code
+ movl 72(%rsp),%esi # vector number
+ movl 96(%rsp),%edx # %cs
+ movq 88(%rsp),%rcx # %rip
xorl %eax,%eax
leaq early_idt_msg(%rip),%rdi
call early_printk
@@ -308,27 +391,45 @@ ENTRY(early_idt_handler)
call dump_stack
#ifdef CONFIG_KALLSYMS
leaq early_idt_ripmsg(%rip),%rdi
- movq 0(%rsp),%rsi # get rip again
+ movq 40(%rsp),%rsi # %rip again
call __print_symbol
#endif
#endif /* EARLY_PRINTK */
1: hlt
jmp 1b
-#ifdef CONFIG_EARLY_PRINTK
+20: # Exception table entry found or page table generated
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rdi
+ popq %rsi
+ popq %rdx
+ popq %rcx
+ popq %rax
+ decl early_recursion_flag(%rip)
+is_nmi:
+ addq $16,%rsp # drop vector number and error code
+ INTERRUPT_RETURN
+ENDPROC(early_idt_handler)
+
+ __INITDATA
+
+ .balign 4
early_recursion_flag:
.long 0
+#ifdef CONFIG_EARLY_PRINTK
early_idt_msg:
.asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"
early_idt_ripmsg:
.asciz "RIP %s\n"
#endif /* CONFIG_EARLY_PRINTK */
- .previous
#define NEXT_PAGE(name) \
.balign PAGE_SIZE; \
-ENTRY(name)
+GLOBAL(name)
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
@@ -338,24 +439,37 @@ ENTRY(name)
i = i + 1 ; \
.endr
+ __INITDATA
+NEXT_PAGE(early_level4_pgt)
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+
+NEXT_PAGE(early_dynamic_pgts)
+ .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+
.data
- /*
- * This default setting generates an ident mapping at address 0x100000
- * and a mapping for the kernel that precisely maps virtual address
- * 0xffffffff80000000 to physical address 0x000000. (always using
- * 2Mbyte large pages provided by PAE mode)
- */
+
+#ifndef CONFIG_XEN
+NEXT_PAGE(init_level4_pgt)
+ .fill 512,8,0
+#else
NEXT_PAGE(init_level4_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_START_KERNEL*8, 0
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+ .org init_level4_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .fill 511,8,0
+ .fill 511, 8, 0
+NEXT_PAGE(level2_ident_pgt)
+ /* Since I easily can, map the first 1G.
+ * Don't set NX because code runs from these pages.
+ */
+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#endif
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
@@ -363,21 +477,6 @@ NEXT_PAGE(level3_kernel_pgt)
.quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
.quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
-NEXT_PAGE(level2_fixmap_pgt)
- .fill 506,8,0
- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
- /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
- .fill 5,8,0
-
-NEXT_PAGE(level1_fixmap_pgt)
- .fill 512,8,0
-
-NEXT_PAGE(level2_ident_pgt)
- /* Since I easily can, map the first 1G.
- * Don't set NX because code runs from these pages.
- */
- PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
-
NEXT_PAGE(level2_kernel_pgt)
/*
* 512 MB kernel mapping. We spend a full page on this pagetable
@@ -392,11 +491,16 @@ NEXT_PAGE(level2_kernel_pgt)
PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
KERNEL_IMAGE_SIZE/PMD_SIZE)
-NEXT_PAGE(level2_spare_pgt)
- .fill 512, 8, 0
+NEXT_PAGE(level2_fixmap_pgt)
+ .fill 506,8,0
+ .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+ /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
+ .fill 5,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+ .fill 512,8,0
#undef PMDS
-#undef NEXT_PAGE
.data
.align 16
@@ -412,16 +516,6 @@ ENTRY(phys_base)
#include "../../x86/xen/xen-head.S"
- .section .bss, "aw", @nobits
- .align L1_CACHE_BYTES
-ENTRY(idt_table)
- .skip IDT_ENTRIES * 16
-
- .align L1_CACHE_BYTES
-ENTRY(nmi_idt_table)
- .skip IDT_ENTRIES * 16
-
__PAGE_ALIGNED_BSS
- .align PAGE_SIZE
-ENTRY(empty_zero_page)
+NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ad0de0c2714..319bcb9372f 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -74,9 +74,6 @@ static inline void hpet_writel(unsigned int d, unsigned int a)
static inline void hpet_set_mapping(void)
{
hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
-#ifdef CONFIG_X86_64
- __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
-#endif
}
static inline void hpet_clear_mapping(void)
@@ -88,19 +85,24 @@ static inline void hpet_clear_mapping(void)
/*
* HPET command line enable / disable
*/
-static int boot_hpet_disable;
+int boot_hpet_disable;
int hpet_force_user;
static int hpet_verbose;
static int __init hpet_setup(char *str)
{
- if (str) {
+ while (str) {
+ char *next = strchr(str, ',');
+
+ if (next)
+ *next++ = 0;
if (!strncmp("disable", str, 7))
boot_hpet_disable = 1;
if (!strncmp("force", str, 5))
hpet_force_user = 1;
if (!strncmp("verbose", str, 7))
hpet_verbose = 1;
+ str = next;
}
return 1;
}
@@ -319,8 +321,6 @@ static void hpet_set_mode(enum clock_event_mode mode,
now = hpet_readl(HPET_COUNTER);
cmp = now + (unsigned int) delta;
cfg = hpet_readl(HPET_Tn_CFG(timer));
- /* Make sure we use edge triggered interrupts */
- cfg &= ~HPET_TN_LEVEL;
cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
HPET_TN_SETVAL | HPET_TN_32BIT;
hpet_writel(cfg, HPET_Tn_CFG(timer));
@@ -431,7 +431,7 @@ void hpet_msi_unmask(struct irq_data *data)
/* unmask it */
cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
- cfg |= HPET_TN_FSB;
+ cfg |= HPET_TN_ENABLE | HPET_TN_FSB;
hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
}
@@ -442,7 +442,7 @@ void hpet_msi_mask(struct irq_data *data)
/* mask it */
cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
- cfg &= ~HPET_TN_FSB;
+ cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB);
hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
}
@@ -475,8 +475,8 @@ static int hpet_msi_next_event(unsigned long delta,
static int hpet_setup_msi_irq(unsigned int irq)
{
- if (arch_setup_hpet_msi(irq, hpet_blockid)) {
- destroy_irq(irq);
+ if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) {
+ irq_free_hwirq(irq);
return -EINVAL;
}
return 0;
@@ -484,9 +484,8 @@ static int hpet_setup_msi_irq(unsigned int irq)
static int hpet_assign_irq(struct hpet_dev *dev)
{
- unsigned int irq;
+ unsigned int irq = irq_alloc_hwirq(-1);
- irq = create_irq_nr(0, -1);
if (!irq)
return -EINVAL;
@@ -518,7 +517,7 @@ static int hpet_setup_irq(struct hpet_dev *dev)
{
if (request_irq(dev->irq, hpet_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
+ IRQF_TIMER | IRQF_NOBALANCING,
dev->name, dev))
return -1;
@@ -696,7 +695,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
/* FIXME: add schedule_work_on() */
schedule_delayed_work_on(cpu, &work.work, 0);
wait_for_completion(&work.complete);
- destroy_timer_on_stack(&work.work.timer);
+ destroy_delayed_work_on_stack(&work.work);
break;
case CPU_DEAD:
if (hdev) {
@@ -749,9 +748,7 @@ static struct clocksource clocksource_hpet = {
.mask = HPET_MASK,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = hpet_resume_counter,
-#ifdef CONFIG_X86_64
.archdata = { .vclock_mode = VCLOCK_HPET },
-#endif
};
static int hpet_clocksource_register(void)
@@ -787,15 +784,16 @@ static int hpet_clocksource_register(void)
return 0;
}
+static u32 *hpet_boot_cfg;
+
/**
* hpet_enable - Try to setup the HPET timer. Returns 1 on success.
*/
int __init hpet_enable(void)
{
- unsigned long hpet_period;
- unsigned int id;
+ u32 hpet_period, cfg, id;
u64 freq;
- int i;
+ unsigned int i, last;
if (!is_hpet_capable())
return 0;
@@ -847,15 +845,45 @@ int __init hpet_enable(void)
id = hpet_readl(HPET_ID);
hpet_print_config();
+ last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
+
#ifdef CONFIG_HPET_EMULATE_RTC
/*
* The legacy routing mode needs at least two channels, tick timer
* and the rtc emulation channel.
*/
- if (!(id & HPET_ID_NUMBER))
+ if (!last)
goto out_nohpet;
#endif
+ cfg = hpet_readl(HPET_CFG);
+ hpet_boot_cfg = kmalloc((last + 2) * sizeof(*hpet_boot_cfg),
+ GFP_KERNEL);
+ if (hpet_boot_cfg)
+ *hpet_boot_cfg = cfg;
+ else
+ pr_warn("HPET initial state will not be saved\n");
+ cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
+ hpet_writel(cfg, HPET_CFG);
+ if (cfg)
+ pr_warn("HPET: Unrecognized bits %#x set in global cfg\n",
+ cfg);
+
+ for (i = 0; i <= last; ++i) {
+ cfg = hpet_readl(HPET_Tn_CFG(i));
+ if (hpet_boot_cfg)
+ hpet_boot_cfg[i + 1] = cfg;
+ cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB);
+ hpet_writel(cfg, HPET_Tn_CFG(i));
+ cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP
+ | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
+ | HPET_TN_FSB | HPET_TN_FSB_CAP);
+ if (cfg)
+ pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n",
+ cfg, i);
+ }
+ hpet_print_config();
+
if (hpet_clocksource_register())
goto out_nohpet;
@@ -909,12 +937,14 @@ static __init int hpet_late_init(void)
if (boot_cpu_has(X86_FEATURE_ARAT))
return 0;
+ cpu_notifier_register_begin();
for_each_online_cpu(cpu) {
hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
}
/* This notifier should be called after workqueue is ready */
- hotcpu_notifier(hpet_cpuhp_notify, -20);
+ __hotcpu_notifier(hpet_cpuhp_notify, -20);
+ cpu_notifier_register_done();
return 0;
}
@@ -923,14 +953,28 @@ fs_initcall(hpet_late_init);
void hpet_disable(void)
{
if (is_hpet_capable() && hpet_virt_address) {
- unsigned int cfg = hpet_readl(HPET_CFG);
+ unsigned int cfg = hpet_readl(HPET_CFG), id, last;
- if (hpet_legacy_int_enabled) {
+ if (hpet_boot_cfg)
+ cfg = *hpet_boot_cfg;
+ else if (hpet_legacy_int_enabled) {
cfg &= ~HPET_CFG_LEGACY;
hpet_legacy_int_enabled = 0;
}
cfg &= ~HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
+
+ if (!hpet_boot_cfg)
+ return;
+
+ id = hpet_readl(HPET_ID);
+ last = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
+
+ for (id = 0; id <= last; ++id)
+ hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id));
+
+ if (*hpet_boot_cfg & HPET_CFG_ENABLE)
+ hpet_writel(*hpet_boot_cfg, HPET_CFG);
}
}
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 02f07634d26..5f9cf20cdb6 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -32,13 +32,11 @@
#include <linux/irqflags.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
-#include <linux/kprobes.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
-#include <linux/init.h>
#include <linux/smp.h>
#include <asm/hw_breakpoint.h>
@@ -393,6 +391,9 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
unregister_hw_breakpoint(t->ptrace_bps[i]);
t->ptrace_bps[i] = NULL;
}
+
+ t->debugreg6 = 0;
+ t->ptrace_dr7 = 0;
}
void hw_breakpoint_restore(void)
@@ -422,7 +423,7 @@ EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
* NOTIFY_STOP returned for all other cases
*
*/
-static int __kprobes hw_breakpoint_handler(struct die_args *args)
+static int hw_breakpoint_handler(struct die_args *args)
{
int i, cpu, rc = NOTIFY_STOP;
struct perf_event *bp;
@@ -509,7 +510,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
/*
* Handle debug exception notifications.
*/
-int __kprobes hw_breakpoint_exceptions_notify(
+int hw_breakpoint_exceptions_notify(
struct notifier_block *unused, unsigned long val, void *data)
{
if (val != DIE_DEBUG)
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 9c3bd4a2050..05fd74f537d 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(csum_partial_copy_generic);
EXPORT_SYMBOL(__get_user_1);
EXPORT_SYMBOL(__get_user_2);
EXPORT_SYMBOL(__get_user_4);
+EXPORT_SYMBOL(__get_user_8);
EXPORT_SYMBOL(__put_user_1);
EXPORT_SYMBOL(__put_user_2);
@@ -36,3 +37,10 @@ EXPORT_SYMBOL(strstr);
EXPORT_SYMBOL(csum_partial);
EXPORT_SYMBOL(empty_zero_page);
+
+#ifdef CONFIG_PREEMPT
+EXPORT_SYMBOL(___preempt_schedule);
+#ifdef CONFIG_CONTEXT_TRACKING
+EXPORT_SYMBOL(___preempt_schedule_context);
+#endif
+#endif
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 739d8598f78..d5dd8081441 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -16,58 +16,131 @@
#include <asm/uaccess.h>
#include <asm/ptrace.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/user.h>
-#ifdef CONFIG_X86_64
-# include <asm/sigcontext32.h>
-# include <asm/user32.h>
-#else
-# define save_i387_xstate_ia32 save_i387_xstate
-# define restore_i387_xstate_ia32 restore_i387_xstate
-# define _fpstate_ia32 _fpstate
-# define _xstate_ia32 _xstate
-# define sig_xstate_ia32_size sig_xstate_size
-# define fx_sw_reserved_ia32 fx_sw_reserved
-# define user_i387_ia32_struct user_i387_struct
-# define user32_fxsr_struct user_fxsr_struct
-#endif
+/*
+ * Were we in an interrupt that interrupted kernel mode?
+ *
+ * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that
+ * pair does nothing at all: the thread must not have fpu (so
+ * that we don't try to save the FPU state), and TS must
+ * be set (so that the clts/stts pair does nothing that is
+ * visible in the interrupted kernel thread).
+ *
+ * Except for the eagerfpu case when we return 1 unless we've already
+ * been eager and saved the state in kernel_fpu_begin().
+ */
+static inline bool interrupted_kernel_fpu_idle(void)
+{
+ if (use_eager_fpu())
+ return __thread_has_fpu(current);
-#ifdef CONFIG_MATH_EMULATION
-# define HAVE_HWFP (boot_cpu_data.hard_math)
-#else
-# define HAVE_HWFP 1
-#endif
+ return !__thread_has_fpu(current) &&
+ (read_cr0() & X86_CR0_TS);
+}
+
+/*
+ * Were we in user mode (or vm86 mode) when we were
+ * interrupted?
+ *
+ * Doing kernel_fpu_begin/end() is ok if we are running
+ * in an interrupt context from user mode - we'll just
+ * save the FPU state as required.
+ */
+static inline bool interrupted_user_mode(void)
+{
+ struct pt_regs *regs = get_irq_regs();
+ return regs && user_mode_vm(regs);
+}
+
+/*
+ * Can we use the FPU in kernel mode with the
+ * whole "kernel_fpu_begin/end()" sequence?
+ *
+ * It's always ok in process context (ie "not interrupt")
+ * but it is sometimes ok even from an irq.
+ */
+bool irq_fpu_usable(void)
+{
+ return !in_interrupt() ||
+ interrupted_user_mode() ||
+ interrupted_kernel_fpu_idle();
+}
+EXPORT_SYMBOL(irq_fpu_usable);
+
+void __kernel_fpu_begin(void)
+{
+ struct task_struct *me = current;
+
+ if (__thread_has_fpu(me)) {
+ __thread_clear_has_fpu(me);
+ __save_init_fpu(me);
+ /* We do 'stts()' in __kernel_fpu_end() */
+ } else if (!use_eager_fpu()) {
+ this_cpu_write(fpu_owner_task, NULL);
+ clts();
+ }
+}
+EXPORT_SYMBOL(__kernel_fpu_begin);
+
+void __kernel_fpu_end(void)
+{
+ if (use_eager_fpu()) {
+ /*
+ * For eager fpu, most the time, tsk_used_math() is true.
+ * Restore the user math as we are done with the kernel usage.
+ * At few instances during thread exit, signal handling etc,
+ * tsk_used_math() is false. Those few places will take proper
+ * actions, so we don't need to restore the math here.
+ */
+ if (likely(tsk_used_math(current)))
+ math_state_restore();
+ } else {
+ stts();
+ }
+}
+EXPORT_SYMBOL(__kernel_fpu_end);
+
+void unlazy_fpu(struct task_struct *tsk)
+{
+ preempt_disable();
+ if (__thread_has_fpu(tsk)) {
+ __save_init_fpu(tsk);
+ __thread_fpu_end(tsk);
+ } else
+ tsk->thread.fpu_counter = 0;
+ preempt_enable();
+}
+EXPORT_SYMBOL(unlazy_fpu);
-static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
+unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
unsigned int xstate_size;
EXPORT_SYMBOL_GPL(xstate_size);
-unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
-static struct i387_fxsave_struct fx_scratch __cpuinitdata;
+static struct i387_fxsave_struct fx_scratch;
-void __cpuinit mxcsr_feature_mask_init(void)
+static void mxcsr_feature_mask_init(void)
{
unsigned long mask = 0;
- clts();
if (cpu_has_fxsr) {
memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));
- asm volatile("fxsave %0" : : "m" (fx_scratch));
+ asm volatile("fxsave %0" : "+m" (fx_scratch));
mask = fx_scratch.mxcsr_mask;
if (mask == 0)
mask = 0x0000ffbf;
}
mxcsr_feature_mask &= mask;
- stts();
}
-static void __cpuinit init_thread_xstate(void)
+static void init_thread_xstate(void)
{
/*
* Note that xstate_size might be overwriten later during
* xsave_init().
*/
- if (!HAVE_HWFP) {
+ if (!cpu_has_fpu) {
/*
* Disable xsave as we do not support it if i387
* emulation is enabled.
@@ -89,11 +162,19 @@ static void __cpuinit init_thread_xstate(void)
* into all processes.
*/
-void __cpuinit fpu_init(void)
+void fpu_init(void)
{
unsigned long cr0;
unsigned long cr4_mask = 0;
+#ifndef CONFIG_MATH_EMULATION
+ if (!cpu_has_fpu) {
+ pr_emerg("No FPU found and no math emulation present\n");
+ pr_emerg("Giving up\n");
+ for (;;)
+ asm volatile("hlt");
+ }
+#endif
if (cpu_has_fxsr)
cr4_mask |= X86_CR4_OSFXSR;
if (cpu_has_xmm)
@@ -103,33 +184,31 @@ void __cpuinit fpu_init(void)
cr0 = read_cr0();
cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
- if (!HAVE_HWFP)
+ if (!cpu_has_fpu)
cr0 |= X86_CR0_EM;
write_cr0(cr0);
- if (!smp_processor_id())
+ /*
+ * init_thread_xstate is only called once to avoid overriding
+ * xstate_size during boot time or during CPU hotplug.
+ */
+ if (xstate_size == 0)
init_thread_xstate();
mxcsr_feature_mask_init();
- /* clean state in init */
- current_thread_info()->status = 0;
- clear_used_math();
+ xsave_init();
+ eager_fpu_init();
}
void fpu_finit(struct fpu *fpu)
{
- if (!HAVE_HWFP) {
+ if (!cpu_has_fpu) {
finit_soft_fpu(&fpu->state->soft);
return;
}
if (cpu_has_fxsr) {
- struct i387_fxsave_struct *fx = &fpu->state->fxsave;
-
- memset(fx, 0, xstate_size);
- fx->cwd = 0x37f;
- if (cpu_has_xmm)
- fx->mxcsr = MXCSR_DEFAULT;
+ fx_finit(&fpu->state->fxsave);
} else {
struct i387_fsave_struct *fp = &fpu->state->fsave;
memset(fp, 0, xstate_size);
@@ -152,8 +231,9 @@ int init_fpu(struct task_struct *tsk)
int ret;
if (tsk_used_math(tsk)) {
- if (HAVE_HWFP && tsk == current)
+ if (cpu_has_fpu && tsk == current)
unlazy_fpu(tsk);
+ tsk->thread.fpu.last_cpu = ~0;
return 0;
}
@@ -372,7 +452,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
* FXSR floating point environment conversions.
*/
-static void
+void
convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
{
struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
@@ -409,8 +489,8 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
memcpy(&to[i], &from[i], sizeof(to[0]));
}
-static void convert_to_fxsr(struct task_struct *tsk,
- const struct user_i387_ia32_struct *env)
+void convert_to_fxsr(struct task_struct *tsk,
+ const struct user_i387_ia32_struct *env)
{
struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
@@ -448,14 +528,13 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
- if (!HAVE_HWFP)
+ if (!static_cpu_has(X86_FEATURE_FPU))
return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
- if (!cpu_has_fxsr) {
+ if (!cpu_has_fxsr)
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
&target->thread.fpu.state->fsave, 0,
-1);
- }
sanitize_i387_state(target);
@@ -482,13 +561,13 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
sanitize_i387_state(target);
- if (!HAVE_HWFP)
+ if (!static_cpu_has(X86_FEATURE_FPU))
return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
- if (!cpu_has_fxsr) {
+ if (!cpu_has_fxsr)
return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.fpu.state->fsave, 0, -1);
- }
+ &target->thread.fpu.state->fsave, 0,
+ -1);
if (pos > 0 || count < sizeof(env))
convert_from_fxsr(&env, target);
@@ -507,223 +586,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
}
/*
- * Signal frame handlers.
- */
-
-static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
-{
- struct task_struct *tsk = current;
- struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
-
- fp->status = fp->swd;
- if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
- return -1;
- return 1;
-}
-
-static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
-{
- struct task_struct *tsk = current;
- struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
- struct user_i387_ia32_struct env;
- int err = 0;
-
- convert_from_fxsr(&env, tsk);
- if (__copy_to_user(buf, &env, sizeof(env)))
- return -1;
-
- err |= __put_user(fx->swd, &buf->status);
- err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
- if (err)
- return -1;
-
- if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
- return -1;
- return 1;
-}
-
-static int save_i387_xsave(void __user *buf)
-{
- struct task_struct *tsk = current;
- struct _fpstate_ia32 __user *fx = buf;
- int err = 0;
-
-
- sanitize_i387_state(tsk);
-
- /*
- * For legacy compatible, we always set FP/SSE bits in the bit
- * vector while saving the state to the user context.
- * This will enable us capturing any changes(during sigreturn) to
- * the FP/SSE bits by the legacy applications which don't touch
- * xstate_bv in the xsave header.
- *
- * xsave aware applications can change the xstate_bv in the xsave
- * header as well as change any contents in the memory layout.
- * xrestore as part of sigreturn will capture all the changes.
- */
- tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
-
- if (save_i387_fxsave(fx) < 0)
- return -1;
-
- err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
- sizeof(struct _fpx_sw_bytes));
- err |= __put_user(FP_XSTATE_MAGIC2,
- (__u32 __user *) (buf + sig_xstate_ia32_size
- - FP_XSTATE_MAGIC2_SIZE));
- if (err)
- return -1;
-
- return 1;
-}
-
-int save_i387_xstate_ia32(void __user *buf)
-{
- struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
- struct task_struct *tsk = current;
-
- if (!used_math())
- return 0;
-
- if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
- return -EACCES;
- /*
- * This will cause a "finit" to be triggered by the next
- * attempted FPU operation by the 'current' process.
- */
- clear_used_math();
-
- if (!HAVE_HWFP) {
- return fpregs_soft_get(current, NULL,
- 0, sizeof(struct user_i387_ia32_struct),
- NULL, fp) ? -1 : 1;
- }
-
- unlazy_fpu(tsk);
-
- if (cpu_has_xsave)
- return save_i387_xsave(fp);
- if (cpu_has_fxsr)
- return save_i387_fxsave(fp);
- else
- return save_i387_fsave(fp);
-}
-
-static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
-{
- struct task_struct *tsk = current;
-
- return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
- sizeof(struct i387_fsave_struct));
-}
-
-static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
- unsigned int size)
-{
- struct task_struct *tsk = current;
- struct user_i387_ia32_struct env;
- int err;
-
- err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
- size);
- /* mxcsr reserved bits must be masked to zero for security reasons */
- tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
- if (err || __copy_from_user(&env, buf, sizeof(env)))
- return 1;
- convert_to_fxsr(tsk, &env);
-
- return 0;
-}
-
-static int restore_i387_xsave(void __user *buf)
-{
- struct _fpx_sw_bytes fx_sw_user;
- struct _fpstate_ia32 __user *fx_user =
- ((struct _fpstate_ia32 __user *) buf);
- struct i387_fxsave_struct __user *fx =
- (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
- struct xsave_hdr_struct *xsave_hdr =
- &current->thread.fpu.state->xsave.xsave_hdr;
- u64 mask;
- int err;
-
- if (check_for_xstate(fx, buf, &fx_sw_user))
- goto fx_only;
-
- mask = fx_sw_user.xstate_bv;
-
- err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
-
- xsave_hdr->xstate_bv &= pcntxt_mask;
- /*
- * These bits must be zero.
- */
- xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
-
- /*
- * Init the state that is not present in the memory layout
- * and enabled by the OS.
- */
- mask = ~(pcntxt_mask & ~mask);
- xsave_hdr->xstate_bv &= mask;
-
- return err;
-fx_only:
- /*
- * Couldn't find the extended state information in the memory
- * layout. Restore the FP/SSE and init the other extended state
- * enabled by the OS.
- */
- xsave_hdr->xstate_bv = XSTATE_FPSSE;
- return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
-}
-
-int restore_i387_xstate_ia32(void __user *buf)
-{
- int err;
- struct task_struct *tsk = current;
- struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
-
- if (HAVE_HWFP)
- clear_fpu(tsk);
-
- if (!buf) {
- if (used_math()) {
- clear_fpu(tsk);
- clear_used_math();
- }
-
- return 0;
- } else
- if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
- return -EACCES;
-
- if (!used_math()) {
- err = init_fpu(tsk);
- if (err)
- return err;
- }
-
- if (HAVE_HWFP) {
- if (cpu_has_xsave)
- err = restore_i387_xsave(buf);
- else if (cpu_has_fxsr)
- err = restore_i387_fxsave(fp, sizeof(struct
- i387_fxsave_struct));
- else
- err = restore_i387_fsave(fp);
- } else {
- err = fpregs_soft_set(current, NULL,
- 0, sizeof(struct user_i387_ia32_struct),
- NULL, fp) != 0;
- }
- set_used_math();
-
- return err;
-}
-
-/*
* FPU state for core dumps.
* This is only used for a.out dumps now.
* It is declared generically using elf_fpregset_t (which is
@@ -746,3 +608,33 @@ int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu)
EXPORT_SYMBOL(dump_fpu);
#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
+
+static int __init no_387(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+ return 1;
+}
+
+__setup("no387", no_387);
+
+void fpu_detect(struct cpuinfo_x86 *c)
+{
+ unsigned long cr0;
+ u16 fsw, fcw;
+
+ fsw = fcw = 0xffff;
+
+ cr0 = read_cr0();
+ cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
+ write_cr0(cr0);
+
+ asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
+ : "+m" (fsw), "+m" (fcw));
+
+ if (fsw == 0 && (fcw & 0x103f) == 0x003f)
+ set_cpu_cap(c, X86_FEATURE_FPU);
+ else
+ clear_cpu_cap(c, X86_FEATURE_FPU);
+
+ /* The final cr0 value is set in fpu_init() */
+}
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 610485223bd..8af817105e2 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -15,7 +15,6 @@
#include <linux/delay.h>
#include <linux/atomic.h>
-#include <asm/system.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
#include <asm/pgtable.h>
@@ -264,7 +263,7 @@ static void i8259A_shutdown(void)
* out of.
*/
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
- outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
+ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
}
static struct syscore_ops i8259_syscore_ops = {
@@ -300,21 +299,38 @@ static void unmask_8259A(void)
static void init_8259A(int auto_eoi)
{
unsigned long flags;
+ unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
+ unsigned char new_val;
i8259A_auto_eoi = auto_eoi;
raw_spin_lock_irqsave(&i8259A_lock, flags);
- outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
+ /*
+ * Check to see if we have a PIC.
+ * Mask all except the cascade and read
+ * back the value we just wrote. If we don't
+ * have a PIC, we will read 0xff as opposed to the
+ * value we wrote.
+ */
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
+ outb(probe_val, PIC_MASTER_IMR);
+ new_val = inb(PIC_MASTER_IMR);
+ if (new_val != probe_val) {
+ printk(KERN_INFO "Using NULL legacy PIC\n");
+ legacy_pic = &null_legacy_pic;
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+ return;
+ }
+
+ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
/*
* outb_pic - this has to work on a wide range of PC hardware.
*/
outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
- /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64,
- to 0x20-0x27 on i386 */
+ /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
/* 8259A-1 (the master) has a slave on IR2 */
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
deleted file mode 100644
index 43e9ccf4494..00000000000
--- a/arch/x86/kernel/init_task.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-#include <linux/mqueue.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-
-static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
-static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-
-/*
- * Initial thread structure.
- *
- * We need to make sure that this is THREAD_SIZE aligned due to the
- * way process stacks are handled. This is done by having a special
- * "init_task" linker map entry..
- */
-union thread_union init_thread_union __init_task_data =
- { INIT_THREAD_INFO(init_task) };
-
-/*
- * Initial task structure.
- *
- * All other task structs will be allocated on slabs in fork.c
- */
-struct task_struct init_task = INIT_TASK(init_task);
-EXPORT_SYMBOL(init_task);
-
-/*
- * per-CPU TSS segments. Threads are completely 'soft' on Linux,
- * no more per-task TSS's. The TSS size is kept cacheline-aligned
- * so they are allowed to end up in the .data..cacheline_aligned
- * section. Since TSS's are completely CPU-local, we want them
- * on exact cacheline boundaries, to eliminate cacheline ping-pong.
- */
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
-
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 8c968974253..4ddaf66ea35 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -93,8 +93,9 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* on system-call entry - see also fork() and the signal handling
* code.
*/
-long sys_iopl(unsigned int level, struct pt_regs *regs)
+SYSCALL_DEFINE1(iopl, unsigned int, level)
{
+ struct pt_regs *regs = current_pt_regs();
unsigned int old = (regs->flags >> 12) & 3;
struct thread_struct *t = &current->thread;
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c
new file mode 100644
index 00000000000..d30acdc1229
--- /dev/null
+++ b/arch/x86/kernel/iosf_mbi.c
@@ -0,0 +1,237 @@
+/*
+ * IOSF-SB MailBox Interface Driver
+ * Copyright (c) 2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ *
+ * The IOSF-SB is a fabric bus available on Atom based SOC's that uses a
+ * mailbox interface (MBI) to communicate with mutiple devices. This
+ * driver implements access to this interface for those platforms that can
+ * enumerate the device using PCI.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+
+#include <asm/iosf_mbi.h>
+
+#define PCI_DEVICE_ID_BAYTRAIL 0x0F00
+#define PCI_DEVICE_ID_QUARK_X1000 0x0958
+
+static DEFINE_SPINLOCK(iosf_mbi_lock);
+
+static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
+{
+ return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE;
+}
+
+static struct pci_dev *mbi_pdev; /* one mbi device */
+
+static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr)
+{
+ int result;
+
+ if (!mbi_pdev)
+ return -ENODEV;
+
+ if (mcrx) {
+ result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
+ mcrx);
+ if (result < 0)
+ goto fail_read;
+ }
+
+ result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
+ if (result < 0)
+ goto fail_read;
+
+ result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
+ if (result < 0)
+ goto fail_read;
+
+ return 0;
+
+fail_read:
+ dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
+ return result;
+}
+
+static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr)
+{
+ int result;
+
+ if (!mbi_pdev)
+ return -ENODEV;
+
+ result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
+ if (result < 0)
+ goto fail_write;
+
+ if (mcrx) {
+ result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
+ mcrx);
+ if (result < 0)
+ goto fail_write;
+ }
+
+ result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
+ if (result < 0)
+ goto fail_write;
+
+ return 0;
+
+fail_write:
+ dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
+ return result;
+}
+
+int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
+{
+ u32 mcr, mcrx;
+ unsigned long flags;
+ int ret;
+
+ /*Access to the GFX unit is handled by GPU code */
+ if (port == BT_MBI_UNIT_GFX) {
+ WARN_ON(1);
+ return -EPERM;
+ }
+
+ mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+ mcrx = offset & MBI_MASK_HI;
+
+ spin_lock_irqsave(&iosf_mbi_lock, flags);
+ ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr);
+ spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_read);
+
+int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
+{
+ u32 mcr, mcrx;
+ unsigned long flags;
+ int ret;
+
+ /*Access to the GFX unit is handled by GPU code */
+ if (port == BT_MBI_UNIT_GFX) {
+ WARN_ON(1);
+ return -EPERM;
+ }
+
+ mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+ mcrx = offset & MBI_MASK_HI;
+
+ spin_lock_irqsave(&iosf_mbi_lock, flags);
+ ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr);
+ spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_write);
+
+int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
+{
+ u32 mcr, mcrx;
+ u32 value;
+ unsigned long flags;
+ int ret;
+
+ /*Access to the GFX unit is handled by GPU code */
+ if (port == BT_MBI_UNIT_GFX) {
+ WARN_ON(1);
+ return -EPERM;
+ }
+
+ mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+ mcrx = offset & MBI_MASK_HI;
+
+ spin_lock_irqsave(&iosf_mbi_lock, flags);
+
+ /* Read current mdr value */
+ ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value);
+ if (ret < 0) {
+ spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+ return ret;
+ }
+
+ /* Apply mask */
+ value &= ~mask;
+ mdr &= mask;
+ value |= mdr;
+
+ /* Write back */
+ ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value);
+
+ spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_modify);
+
+bool iosf_mbi_available(void)
+{
+ /* Mbi isn't hot-pluggable. No remove routine is provided */
+ return mbi_pdev;
+}
+EXPORT_SYMBOL(iosf_mbi_available);
+
+static int iosf_mbi_probe(struct pci_dev *pdev,
+ const struct pci_device_id *unused)
+{
+ int ret;
+
+ ret = pci_enable_device(pdev);
+ if (ret < 0) {
+ dev_err(&pdev->dev, "error: could not enable device\n");
+ return ret;
+ }
+
+ mbi_pdev = pci_dev_get(pdev);
+ return 0;
+}
+
+static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) },
+ { 0, },
+};
+MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
+
+static struct pci_driver iosf_mbi_pci_driver = {
+ .name = "iosf_mbi_pci",
+ .probe = iosf_mbi_probe,
+ .id_table = iosf_mbi_pci_ids,
+};
+
+static int __init iosf_mbi_init(void)
+{
+ return pci_register_driver(&iosf_mbi_pci_driver);
+}
+
+static void __exit iosf_mbi_exit(void)
+{
+ pci_unregister_driver(&iosf_mbi_pci_driver);
+ if (mbi_pdev) {
+ pci_dev_put(mbi_pdev);
+ mbi_pdev = NULL;
+ }
+}
+
+module_init(iosf_mbi_init);
+module_exit(iosf_mbi_exit);
+
+MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
+MODULE_DESCRIPTION("IOSF Mailbox Interface accessor");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 7943e0c21bd..922d2858102 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -17,6 +17,10 @@
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
+#include <asm/desc.h>
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/irq_vectors.h>
atomic_t irq_err_count;
@@ -92,7 +96,8 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, " Rescheduling interrupts\n");
seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+ seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
+ irq_stats(j)->irq_tlb_count);
seq_printf(p, " Function call interrupts\n");
seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
@@ -121,6 +126,12 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
seq_printf(p, " Machine check polls\n");
#endif
+#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
+ seq_printf(p, "%*s: ", prec, "THR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count);
+ seq_printf(p, " Hypervisor callback interrupts\n");
+#endif
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
@@ -147,7 +158,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
- sum += irq_stats(cpu)->irq_tlb_count;
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
sum += irq_stats(cpu)->irq_thermal_count;
@@ -165,10 +175,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
u64 arch_irq_stat(void)
{
u64 sum = atomic_read(&irq_err_count);
-
-#ifdef CONFIG_X86_IO_APIC
- sum += atomic_read(&irq_mis_count);
-#endif
return sum;
}
@@ -178,7 +184,7 @@ u64 arch_irq_stat(void)
* SMP cross-CPU interrupts have their own specific
* handlers).
*/
-unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
+__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
@@ -194,9 +200,13 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
if (!handle_irq(irq, regs)) {
ack_APIC_irq();
- if (printk_ratelimit())
- pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
- __func__, smp_processor_id(), vector, irq);
+ if (irq != VECTOR_RETRIGGERED) {
+ pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
+ __func__, smp_processor_id(),
+ vector, irq);
+ } else {
+ __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+ }
}
irq_exit();
@@ -208,7 +218,29 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
/*
* Handler for X86_PLATFORM_IPI_VECTOR.
*/
-void smp_x86_platform_ipi(struct pt_regs *regs)
+void __smp_x86_platform_ipi(void)
+{
+ inc_irq_stat(x86_platform_ipis);
+
+ if (x86_platform_ipi_callback)
+ x86_platform_ipi_callback();
+}
+
+__visible void smp_x86_platform_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_ack_irq();
+ __smp_x86_platform_ipi();
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
+
+#ifdef CONFIG_HAVE_KVM
+/*
+ * Handler for POSTED_INTERRUPT_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
@@ -218,19 +250,113 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
exit_idle();
- inc_irq_stat(x86_platform_ipis);
-
- if (x86_platform_ipi_callback)
- x86_platform_ipi_callback();
+ inc_irq_stat(kvm_posted_intr_ipis);
irq_exit();
set_irq_regs(old_regs);
}
+#endif
+
+__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_ack_irq();
+ trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
+ __smp_x86_platform_ipi();
+ trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
#ifdef CONFIG_HOTPLUG_CPU
+
+/* These two declarations are only used in check_irq_vectors_for_cpu_disable()
+ * below, which is protected by stop_machine(). Putting them on the stack
+ * results in a stack frame overflow. Dynamically allocating could result in a
+ * failure so declare these two cpumasks as global.
+ */
+static struct cpumask affinity_new, online_new;
+
+/*
+ * This cpu is going to be removed and its vectors migrated to the remaining
+ * online cpus. Check to see if there are enough vectors in the remaining cpus.
+ * This function is protected by stop_machine().
+ */
+int check_irq_vectors_for_cpu_disable(void)
+{
+ int irq, cpu;
+ unsigned int this_cpu, vector, this_count, count;
+ struct irq_desc *desc;
+ struct irq_data *data;
+
+ this_cpu = smp_processor_id();
+ cpumask_copy(&online_new, cpu_online_mask);
+ cpu_clear(this_cpu, online_new);
+
+ this_count = 0;
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+ irq = __this_cpu_read(vector_irq[vector]);
+ if (irq >= 0) {
+ desc = irq_to_desc(irq);
+ data = irq_desc_get_irq_data(desc);
+ cpumask_copy(&affinity_new, data->affinity);
+ cpu_clear(this_cpu, affinity_new);
+
+ /* Do not count inactive or per-cpu irqs. */
+ if (!irq_has_action(irq) || irqd_is_per_cpu(data))
+ continue;
+
+ /*
+ * A single irq may be mapped to multiple
+ * cpu's vector_irq[] (for example IOAPIC cluster
+ * mode). In this case we have two
+ * possibilities:
+ *
+ * 1) the resulting affinity mask is empty; that is
+ * this the down'd cpu is the last cpu in the irq's
+ * affinity mask, or
+ *
+ * 2) the resulting affinity mask is no longer
+ * a subset of the online cpus but the affinity
+ * mask is not zero; that is the down'd cpu is the
+ * last online cpu in a user set affinity mask.
+ */
+ if (cpumask_empty(&affinity_new) ||
+ !cpumask_subset(&affinity_new, &online_new))
+ this_count++;
+ }
+ }
+
+ count = 0;
+ for_each_online_cpu(cpu) {
+ if (cpu == this_cpu)
+ continue;
+ /*
+ * We scan from FIRST_EXTERNAL_VECTOR to first system
+ * vector. If the vector is marked in the used vectors
+ * bitmap or an irq is assigned to it, we don't count
+ * it as available.
+ */
+ for (vector = FIRST_EXTERNAL_VECTOR;
+ vector < first_system_vector; vector++) {
+ if (!test_bit(vector, used_vectors) &&
+ per_cpu(vector_irq, cpu)[vector] < 0)
+ count++;
+ }
+ }
+
+ if (count < this_count) {
+ pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n",
+ this_cpu, this_count, count);
+ return -ERANGE;
+ }
+ return 0;
+}
+
/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
void fixup_irqs(void)
{
@@ -239,6 +365,7 @@ void fixup_irqs(void)
struct irq_desc *desc;
struct irq_data *data;
struct irq_chip *chip;
+ int ret;
for_each_irq_desc(irq, desc) {
int break_affinity = 0;
@@ -270,28 +397,37 @@ void fixup_irqs(void)
if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
break_affinity = 1;
- affinity = cpu_all_mask;
+ affinity = cpu_online_mask;
}
chip = irq_data_get_irq_chip(data);
if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
chip->irq_mask(data);
- if (chip->irq_set_affinity)
- chip->irq_set_affinity(data, affinity, true);
- else if (!(warned++))
- set_affinity = 0;
+ if (chip->irq_set_affinity) {
+ ret = chip->irq_set_affinity(data, affinity, true);
+ if (ret == -ENOSPC)
+ pr_crit("IRQ %d set affinity failed because there are no available vectors. The device assigned to this IRQ is unstable.\n", irq);
+ } else {
+ if (!(warned++))
+ set_affinity = 0;
+ }
+ /*
+ * We unmask if the irq was not marked masked by the
+ * core code. That respects the lazy irq disable
+ * behaviour.
+ */
if (!irqd_can_move_in_process_context(data) &&
- !irqd_irq_disabled(data) && chip->irq_unmask)
+ !irqd_irq_masked(data) && chip->irq_unmask)
chip->irq_unmask(data);
raw_spin_unlock(&desc->lock);
if (break_affinity && set_affinity)
- printk("Broke affinity for irq %i\n", irq);
+ pr_notice("Broke affinity for irq %i\n", irq);
else if (!set_affinity)
- printk("Cannot set affinity for irq %i\n", irq);
+ pr_notice("Cannot set affinity for irq %i\n", irq);
}
/*
@@ -308,7 +444,7 @@ void fixup_irqs(void)
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
unsigned int irr;
- if (__this_cpu_read(vector_irq[vector]) < 0)
+ if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)
continue;
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -319,10 +455,14 @@ void fixup_irqs(void)
data = irq_desc_get_irq_data(desc);
chip = irq_data_get_irq_chip(data);
raw_spin_lock(&desc->lock);
- if (chip->irq_retrigger)
+ if (chip->irq_retrigger) {
chip->irq_retrigger(data);
+ __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
+ }
raw_spin_unlock(&desc->lock);
}
+ if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
+ __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
}
}
#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 40fc86161d9..63ce838e5a5 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -55,16 +55,8 @@ static inline int check_stack_overflow(void) { return 0; }
static inline void print_stack_overflow(void) { }
#endif
-/*
- * per-CPU IRQ handling contexts (thread information and stack)
- */
-union irq_ctx {
- struct thread_info tinfo;
- u32 stack[THREAD_SIZE/sizeof(u32)];
-} __attribute__((aligned(THREAD_SIZE)));
-
-static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
-static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
+DEFINE_PER_CPU(struct irq_stack *, hardirq_stack);
+DEFINE_PER_CPU(struct irq_stack *, softirq_stack);
static void call_on_stack(void *func, void *stack)
{
@@ -77,14 +69,26 @@ static void call_on_stack(void *func, void *stack)
: "memory", "cc", "edx", "ecx", "eax");
}
+/* how to get the current stack pointer from C */
+#define current_stack_pointer ({ \
+ unsigned long sp; \
+ asm("mov %%esp,%0" : "=g" (sp)); \
+ sp; \
+})
+
+static inline void *current_stack(void)
+{
+ return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+}
+
static inline int
execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
{
- union irq_ctx *curctx, *irqctx;
- u32 *isp, arg1, arg2;
+ struct irq_stack *curstk, *irqstk;
+ u32 *isp, *prev_esp, arg1, arg2;
- curctx = (union irq_ctx *) current_thread_info();
- irqctx = __this_cpu_read(hardirq_ctx);
+ curstk = (struct irq_stack *) current_stack();
+ irqstk = __this_cpu_read(hardirq_stack);
/*
* this is where we switch to the IRQ stack. However, if we are
@@ -92,21 +96,14 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
* handler) we can't do that and just have to keep using the
* current stack (which is the irq stack already after all)
*/
- if (unlikely(curctx == irqctx))
+ if (unlikely(curstk == irqstk))
return 0;
- /* build the stack frame on the IRQ stack */
- isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
- irqctx->tinfo.task = curctx->tinfo.task;
- irqctx->tinfo.previous_esp = current_stack_pointer;
+ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
- /*
- * Copy the softirq bits in preempt_count so that the
- * softirq checks work in the hardirq context.
- */
- irqctx->tinfo.preempt_count =
- (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
- (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+ /* Save the next esp at the bottom of the stack */
+ prev_esp = (u32 *)irqstk;
+ *prev_esp = current_stack_pointer;
if (unlikely(overflow))
call_on_stack(print_stack_overflow, isp);
@@ -124,65 +121,44 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
/*
* allocate per-cpu stacks for hardirq and for softirq processing
*/
-void __cpuinit irq_ctx_init(int cpu)
+void irq_ctx_init(int cpu)
{
- union irq_ctx *irqctx;
+ struct irq_stack *irqstk;
- if (per_cpu(hardirq_ctx, cpu))
+ if (per_cpu(hardirq_stack, cpu))
return;
- irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
- THREAD_FLAGS,
- THREAD_ORDER));
- memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
- irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
-
- per_cpu(hardirq_ctx, cpu) = irqctx;
+ irqstk = page_address(alloc_pages_node(cpu_to_node(cpu),
+ THREADINFO_GFP,
+ THREAD_SIZE_ORDER));
+ per_cpu(hardirq_stack, cpu) = irqstk;
- irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
- THREAD_FLAGS,
- THREAD_ORDER));
- memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
- irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
-
- per_cpu(softirq_ctx, cpu) = irqctx;
+ irqstk = page_address(alloc_pages_node(cpu_to_node(cpu),
+ THREADINFO_GFP,
+ THREAD_SIZE_ORDER));
+ per_cpu(softirq_stack, cpu) = irqstk;
printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
- cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
+ cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
}
-asmlinkage void do_softirq(void)
+void do_softirq_own_stack(void)
{
- unsigned long flags;
- struct thread_info *curctx;
- union irq_ctx *irqctx;
- u32 *isp;
+ struct thread_info *curstk;
+ struct irq_stack *irqstk;
+ u32 *isp, *prev_esp;
- if (in_interrupt())
- return;
+ curstk = current_stack();
+ irqstk = __this_cpu_read(softirq_stack);
- local_irq_save(flags);
+ /* build the stack frame on the softirq stack */
+ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
- if (local_softirq_pending()) {
- curctx = current_thread_info();
- irqctx = __this_cpu_read(softirq_ctx);
- irqctx->tinfo.task = curctx->task;
- irqctx->tinfo.previous_esp = current_stack_pointer;
-
- /* build the stack frame on the softirq stack */
- isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
-
- call_on_stack(__do_softirq, isp);
- /*
- * Shouldn't happen, we returned above if in_interrupt():
- */
- WARN_ON_ONCE(softirq_count());
- }
+ /* Push the previous esp onto the stack */
+ prev_esp = (u32 *)irqstk;
+ *prev_esp = current_stack_pointer;
- local_irq_restore(flags);
+ call_on_stack(__do_softirq, isp);
}
bool handle_irq(unsigned irq, struct pt_regs *regs)
@@ -196,7 +172,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
if (unlikely(!desc))
return false;
- if (!execute_on_irq_stack(overflow, desc, irq)) {
+ if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
if (unlikely(overflow))
print_stack_overflow();
desc->handle_irq(irq, desc);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index d04d3ecded6..4d1c746892e 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -87,24 +87,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
generic_handle_irq_desc(irq, desc);
return true;
}
-
-
-extern void call_softirq(void);
-
-asmlinkage void do_softirq(void)
-{
- __u32 pending;
- unsigned long flags;
-
- if (in_interrupt())
- return;
-
- local_irq_save(flags);
- pending = local_softirq_pending();
- /* Switch to interrupt stack */
- if (pending) {
- call_softirq();
- WARN_ON_ONCE(softirq_count());
- }
- local_irq_restore(flags);
-}
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index ca8f703a1e7..1de84e3ab4e 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -8,14 +8,34 @@
#include <linux/irq_work.h>
#include <linux/hardirq.h>
#include <asm/apic.h>
+#include <asm/trace/irq_vectors.h>
-void smp_irq_work_interrupt(struct pt_regs *regs)
+static inline void irq_work_entering_irq(void)
{
irq_enter();
ack_APIC_irq();
+}
+
+static inline void __smp_irq_work_interrupt(void)
+{
inc_irq_stat(apic_irq_work_irqs);
irq_work_run();
- irq_exit();
+}
+
+__visible void smp_irq_work_interrupt(struct pt_regs *regs)
+{
+ irq_work_entering_irq();
+ __smp_irq_work_interrupt();
+ exiting_irq();
+}
+
+__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs)
+{
+ irq_work_entering_irq();
+ trace_irq_work_entry(IRQ_WORK_VECTOR);
+ __smp_irq_work_interrupt();
+ trace_irq_work_exit(IRQ_WORK_VECTOR);
+ exiting_irq();
}
void arch_irq_work_raise(void)
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 313fb5cddbc..7f50156542f 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -16,7 +16,6 @@
#include <linux/delay.h>
#include <linux/atomic.h>
-#include <asm/system.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
#include <asm/pgtable.h>
@@ -43,39 +42,6 @@
* (these are usually mapped into the 0x30-0xff vector range)
*/
-#ifdef CONFIG_X86_32
-/*
- * Note that on a 486, we don't want to do a SIGFPE on an irq13
- * as the irq is unreliable, and exception 16 works correctly
- * (ie as explained in the intel literature). On a 386, you
- * can't use exception 16 due to bad IBM design, so we have to
- * rely on the less exact irq13.
- *
- * Careful.. Not only is IRQ13 unreliable, but it is also
- * leads to races. IBM designers who came up with it should
- * be shot.
- */
-
-static irqreturn_t math_error_irq(int cpl, void *dev_id)
-{
- outb(0, 0xF0);
- if (ignore_fpu_irq || !boot_cpu_data.hard_math)
- return IRQ_NONE;
- math_error(get_irq_regs(), 0, 16);
- return IRQ_HANDLED;
-}
-
-/*
- * New motherboards sometimes make IRQ 13 be a PCI interrupt,
- * so allow interrupt sharing.
- */
-static struct irqaction fpu_irq = {
- .handler = math_error_irq,
- .name = "fpu",
- .flags = IRQF_NO_THREAD,
-};
-#endif
-
/*
* IRQ2 is cascade interrupt to second interrupt controller
*/
@@ -86,7 +52,7 @@ static struct irqaction irq2 = {
};
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... NR_VECTORS - 1] = -1,
+ [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED,
};
int vector_used_by_percpu_irq(unsigned int vector)
@@ -94,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
int cpu;
for_each_online_cpu(cpu) {
- if (per_cpu(vector_irq, cpu)[vector] != -1)
+ if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED)
return 1;
}
@@ -172,79 +138,6 @@ static void __init smp_intr_init(void)
*/
alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
- /* IPIs for invalidation */
-#define ALLOC_INVTLB_VEC(NR) \
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
- invalidate_interrupt##NR)
-
- switch (NUM_INVALIDATE_TLB_VECTORS) {
- default:
- ALLOC_INVTLB_VEC(31);
- case 31:
- ALLOC_INVTLB_VEC(30);
- case 30:
- ALLOC_INVTLB_VEC(29);
- case 29:
- ALLOC_INVTLB_VEC(28);
- case 28:
- ALLOC_INVTLB_VEC(27);
- case 27:
- ALLOC_INVTLB_VEC(26);
- case 26:
- ALLOC_INVTLB_VEC(25);
- case 25:
- ALLOC_INVTLB_VEC(24);
- case 24:
- ALLOC_INVTLB_VEC(23);
- case 23:
- ALLOC_INVTLB_VEC(22);
- case 22:
- ALLOC_INVTLB_VEC(21);
- case 21:
- ALLOC_INVTLB_VEC(20);
- case 20:
- ALLOC_INVTLB_VEC(19);
- case 19:
- ALLOC_INVTLB_VEC(18);
- case 18:
- ALLOC_INVTLB_VEC(17);
- case 17:
- ALLOC_INVTLB_VEC(16);
- case 16:
- ALLOC_INVTLB_VEC(15);
- case 15:
- ALLOC_INVTLB_VEC(14);
- case 14:
- ALLOC_INVTLB_VEC(13);
- case 13:
- ALLOC_INVTLB_VEC(12);
- case 12:
- ALLOC_INVTLB_VEC(11);
- case 11:
- ALLOC_INVTLB_VEC(10);
- case 10:
- ALLOC_INVTLB_VEC(9);
- case 9:
- ALLOC_INVTLB_VEC(8);
- case 8:
- ALLOC_INVTLB_VEC(7);
- case 7:
- ALLOC_INVTLB_VEC(6);
- case 6:
- ALLOC_INVTLB_VEC(5);
- case 5:
- ALLOC_INVTLB_VEC(4);
- case 4:
- ALLOC_INVTLB_VEC(3);
- case 3:
- ALLOC_INVTLB_VEC(2);
- case 2:
- ALLOC_INVTLB_VEC(1);
- case 1:
- ALLOC_INVTLB_VEC(0);
- break;
- }
-
/* IPI for generic function call */
alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
@@ -279,6 +172,10 @@ static void __init apic_intr_init(void)
/* IPI for X86 platform specific use */
alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
+#ifdef CONFIG_HAVE_KVM
+ /* IPI for KVM to deliver posted interrupt */
+ alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
+#endif
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
@@ -306,23 +203,16 @@ void __init native_init_IRQ(void)
* us. (some of these will be overridden and become
* 'special' SMP interrupts)
*/
- for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+ i = FIRST_EXTERNAL_VECTOR;
+ for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
- if (!test_bit(i, used_vectors))
- set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+ set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
if (!acpi_ioapic && !of_ioapic)
setup_irq(2, &irq2);
#ifdef CONFIG_X86_32
- /*
- * External FPU? Set up irq13 if so, for
- * original braindamaged IBM FERR coupling.
- */
- if (boot_cpu_data.hard_math && !cpu_has_fpu)
- setup_irq(FPU_IRQ, &fpu_irq);
-
irq_ctx_init(smp_processor_id());
#endif
}
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 2889b3d4388..26d5a55a273 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -24,20 +24,82 @@ union jump_code_union {
} __attribute__((packed));
};
+static void bug_at(unsigned char *ip, int line)
+{
+ /*
+ * The location is not an op that we were expecting.
+ * Something went wrong. Crash the box, as something could be
+ * corrupting the kernel.
+ */
+ pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n",
+ ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line);
+ BUG();
+}
+
static void __jump_label_transform(struct jump_entry *entry,
enum jump_label_type type,
- void *(*poker)(void *, const void *, size_t))
+ void *(*poker)(void *, const void *, size_t),
+ int init)
{
union jump_code_union code;
+ const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
+ const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
if (type == JUMP_LABEL_ENABLE) {
+ if (init) {
+ /*
+ * Jump label is enabled for the first time.
+ * So we expect a default_nop...
+ */
+ if (unlikely(memcmp((void *)entry->code, default_nop, 5)
+ != 0))
+ bug_at((void *)entry->code, __LINE__);
+ } else {
+ /*
+ * ...otherwise expect an ideal_nop. Otherwise
+ * something went horribly wrong.
+ */
+ if (unlikely(memcmp((void *)entry->code, ideal_nop, 5)
+ != 0))
+ bug_at((void *)entry->code, __LINE__);
+ }
+
code.jump = 0xe9;
code.offset = entry->target -
(entry->code + JUMP_LABEL_NOP_SIZE);
- } else
+ } else {
+ /*
+ * We are disabling this jump label. If it is not what
+ * we think it is, then something must have gone wrong.
+ * If this is the first initialization call, then we
+ * are converting the default nop to the ideal nop.
+ */
+ if (init) {
+ if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0))
+ bug_at((void *)entry->code, __LINE__);
+ } else {
+ code.jump = 0xe9;
+ code.offset = entry->target -
+ (entry->code + JUMP_LABEL_NOP_SIZE);
+ if (unlikely(memcmp((void *)entry->code, &code, 5) != 0))
+ bug_at((void *)entry->code, __LINE__);
+ }
memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
+ }
- (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
+ /*
+ * Make text_poke_bp() a default fallback poker.
+ *
+ * At the time the change is being done, just ignore whether we
+ * are doing nop -> jump or jump -> nop transition, and assume
+ * always nop being the 'currently valid' instruction
+ *
+ */
+ if (poker)
+ (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
+ else
+ text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE,
+ (void *)entry->code + JUMP_LABEL_NOP_SIZE);
}
void arch_jump_label_transform(struct jump_entry *entry,
@@ -45,15 +107,38 @@ void arch_jump_label_transform(struct jump_entry *entry,
{
get_online_cpus();
mutex_lock(&text_mutex);
- __jump_label_transform(entry, type, text_poke_smp);
+ __jump_label_transform(entry, type, NULL, 0);
mutex_unlock(&text_mutex);
put_online_cpus();
}
+static enum {
+ JL_STATE_START,
+ JL_STATE_NO_UPDATE,
+ JL_STATE_UPDATE,
+} jlstate __initdata_or_module = JL_STATE_START;
+
__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
enum jump_label_type type)
{
- __jump_label_transform(entry, type, text_poke_early);
+ /*
+ * This function is called at boot up and when modules are
+ * first loaded. Check if the default nop, the one that is
+ * inserted at compile time, is the ideal nop. If it is, then
+ * we do not need to update the nop, and we can leave it as is.
+ * If it is not, then we need to update the nop to the ideal nop.
+ */
+ if (jlstate == JL_STATE_START) {
+ const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
+ const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
+
+ if (memcmp(ideal_nop, default_nop, 5) != 0)
+ jlstate = JL_STATE_UPDATE;
+ else
+ jlstate = JL_STATE_NO_UPDATE;
+ }
+ if (jlstate == JL_STATE_UPDATE)
+ __jump_label_transform(entry, type, text_poke_early, 1);
}
#endif
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 90fcf62854b..dc1404bf8e4 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -68,16 +68,9 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
return count;
}
-static int setup_data_open(struct inode *inode, struct file *file)
-{
- file->private_data = inode->i_private;
-
- return 0;
-}
-
static const struct file_operations fops_setup_data = {
.read = setup_data_read,
- .open = setup_data_open,
+ .open = simple_open,
.llseek = default_llseek,
};
@@ -114,7 +107,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
{
struct setup_data_node *node;
struct setup_data *data;
- int error = -ENOMEM;
+ int error;
struct dentry *d;
struct page *pg;
u64 pa_data;
@@ -128,8 +121,10 @@ static int __init create_setup_data_nodes(struct dentry *parent)
while (pa_data) {
node = kmalloc(sizeof(*node), GFP_KERNEL);
- if (!node)
+ if (!node) {
+ error = -ENOMEM;
goto err_dir;
+ }
pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
if (PageHighMem(pg)) {
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index faba5771aca..7ec1d5f8d28 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -39,14 +39,14 @@
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/kgdb.h>
-#include <linux/init.h>
#include <linux/smp.h>
#include <linux/nmi.h>
#include <linux/hw_breakpoint.h>
+#include <linux/uaccess.h>
+#include <linux/memory.h>
#include <asm/debugreg.h>
#include <asm/apicdef.h>
-#include <asm/system.h>
#include <asm/apic.h>
#include <asm/nmi.h>
@@ -67,8 +67,6 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{ "ss", 4, offsetof(struct pt_regs, ss) },
{ "ds", 4, offsetof(struct pt_regs, ds) },
{ "es", 4, offsetof(struct pt_regs, es) },
- { "fs", 4, -1 },
- { "gs", 4, -1 },
#else
{ "ax", 8, offsetof(struct pt_regs, ax) },
{ "bx", 8, offsetof(struct pt_regs, bx) },
@@ -90,7 +88,11 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{ "flags", 4, offsetof(struct pt_regs, flags) },
{ "cs", 4, offsetof(struct pt_regs, cs) },
{ "ss", 4, offsetof(struct pt_regs, ss) },
+ { "ds", 4, -1 },
+ { "es", 4, -1 },
#endif
+ { "fs", 4, -1 },
+ { "gs", 4, -1 },
};
int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
@@ -441,12 +443,12 @@ void kgdb_roundup_cpus(unsigned long flags)
/**
* kgdb_arch_handle_exception - Handle architecture specific GDB packets.
- * @vector: The error vector of the exception that happened.
+ * @e_vector: The error vector of the exception that happened.
* @signo: The signal number of the exception that happened.
* @err_code: The error code of the exception that happened.
- * @remcom_in_buffer: The buffer of the packet we have read.
- * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into.
- * @regs: The &struct pt_regs of the current process.
+ * @remcomInBuffer: The buffer of the packet we have read.
+ * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into.
+ * @linux_regs: The &struct pt_regs of the current process.
*
* This function MUST handle the 'c' and 's' command packets,
* as well packets to set / remove a hardware breakpoint, if used.
@@ -740,6 +742,66 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
regs->ip = ip;
}
+int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+ int err;
+#ifdef CONFIG_DEBUG_RODATA
+ char opc[BREAK_INSTR_SIZE];
+#endif /* CONFIG_DEBUG_RODATA */
+
+ bpt->type = BP_BREAKPOINT;
+ err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+ err = probe_kernel_write((char *)bpt->bpt_addr,
+ arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+#ifdef CONFIG_DEBUG_RODATA
+ if (!err)
+ return err;
+ /*
+ * It is safe to call text_poke() because normal kernel execution
+ * is stopped on all cores, so long as the text_mutex is not locked.
+ */
+ if (mutex_is_locked(&text_mutex))
+ return -EBUSY;
+ text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
+ BREAK_INSTR_SIZE);
+ err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+ if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
+ return -EINVAL;
+ bpt->type = BP_POKE_BREAKPOINT;
+#endif /* CONFIG_DEBUG_RODATA */
+ return err;
+}
+
+int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+#ifdef CONFIG_DEBUG_RODATA
+ int err;
+ char opc[BREAK_INSTR_SIZE];
+
+ if (bpt->type != BP_POKE_BREAKPOINT)
+ goto knl_write;
+ /*
+ * It is safe to call text_poke() because normal kernel execution
+ * is stopped on all cores, so long as the text_mutex is not locked.
+ */
+ if (mutex_is_locked(&text_mutex))
+ goto knl_write;
+ text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
+ err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
+ if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
+ goto knl_write;
+ return err;
+knl_write:
+#endif /* CONFIG_DEBUG_RODATA */
+ return probe_kernel_write((char *)bpt->bpt_addr,
+ (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
+}
+
struct kgdb_arch arch_kgdb_ops = {
/* Breakpoint instruction: */
.gdb_bpt_instr = { 0xcc },
diff --git a/arch/x86/kernel/kprobes/Makefile b/arch/x86/kernel/kprobes/Makefile
new file mode 100644
index 00000000000..0d33169cc1a
--- /dev/null
+++ b/arch/x86/kernel/kprobes/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for kernel probes
+#
+
+obj-$(CONFIG_KPROBES) += core.o
+obj-$(CONFIG_OPTPROBES) += opt.o
+obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
new file mode 100644
index 00000000000..c6ee63f927a
--- /dev/null
+++ b/arch/x86/kernel/kprobes/common.h
@@ -0,0 +1,108 @@
+#ifndef __X86_KERNEL_KPROBES_COMMON_H
+#define __X86_KERNEL_KPROBES_COMMON_H
+
+/* Kprobes and Optprobes common header */
+
+#ifdef CONFIG_X86_64
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax. */ \
+ " subq $24, %rsp\n" \
+ " pushq %rdi\n" \
+ " pushq %rsi\n" \
+ " pushq %rdx\n" \
+ " pushq %rcx\n" \
+ " pushq %rax\n" \
+ " pushq %r8\n" \
+ " pushq %r9\n" \
+ " pushq %r10\n" \
+ " pushq %r11\n" \
+ " pushq %rbx\n" \
+ " pushq %rbp\n" \
+ " pushq %r12\n" \
+ " pushq %r13\n" \
+ " pushq %r14\n" \
+ " pushq %r15\n"
+#define RESTORE_REGS_STRING \
+ " popq %r15\n" \
+ " popq %r14\n" \
+ " popq %r13\n" \
+ " popq %r12\n" \
+ " popq %rbp\n" \
+ " popq %rbx\n" \
+ " popq %r11\n" \
+ " popq %r10\n" \
+ " popq %r9\n" \
+ " popq %r8\n" \
+ " popq %rax\n" \
+ " popq %rcx\n" \
+ " popq %rdx\n" \
+ " popq %rsi\n" \
+ " popq %rdi\n" \
+ /* Skip orig_ax, ip, cs */ \
+ " addq $24, %rsp\n"
+#else
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax and gs. */ \
+ " subl $16, %esp\n" \
+ " pushl %fs\n" \
+ " pushl %es\n" \
+ " pushl %ds\n" \
+ " pushl %eax\n" \
+ " pushl %ebp\n" \
+ " pushl %edi\n" \
+ " pushl %esi\n" \
+ " pushl %edx\n" \
+ " pushl %ecx\n" \
+ " pushl %ebx\n"
+#define RESTORE_REGS_STRING \
+ " popl %ebx\n" \
+ " popl %ecx\n" \
+ " popl %edx\n" \
+ " popl %esi\n" \
+ " popl %edi\n" \
+ " popl %ebp\n" \
+ " popl %eax\n" \
+ /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
+ " addl $24, %esp\n"
+#endif
+
+/* Ensure if the instruction can be boostable */
+extern int can_boost(kprobe_opcode_t *instruction);
+/* Recover instruction if given address is probed */
+extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
+ unsigned long addr);
+/*
+ * Copy an instruction and adjust the displacement if the instruction
+ * uses the %rip-relative addressing mode.
+ */
+extern int __copy_instruction(u8 *dest, u8 *src);
+
+/* Generate a relative-jump/call instruction */
+extern void synthesize_reljump(void *from, void *to);
+extern void synthesize_relcall(void *from, void *to);
+
+#ifdef CONFIG_OPTPROBES
+extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
+extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr);
+#else /* !CONFIG_OPTPROBES */
+static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+{
+ return 0;
+}
+static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ return addr;
+}
+#endif
+
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb);
+#else
+static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ return 0;
+}
+#endif
+#endif
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes/core.c
index 7da647d8b64..67e6d19ef1b 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -30,16 +30,15 @@
* <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
* <prasanna@in.ibm.com> added function-return probes.
* 2005-May Rusty Lynch <rusty.lynch@intel.com>
- * Added function return probes functionality
+ * Added function return probes functionality
* 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
- * kprobe-booster and kretprobe-booster for i386.
+ * kprobe-booster and kretprobe-booster for i386.
* 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
- * and kretprobe-booster for x86-64
+ * and kretprobe-booster for x86-64
* 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
- * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
- * unified x86 kprobes code.
+ * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
+ * unified x86 kprobes code.
*/
-
#include <linux/kprobes.h>
#include <linux/ptrace.h>
#include <linux/string.h>
@@ -59,6 +58,8 @@
#include <asm/insn.h>
#include <asm/debugreg.h>
+#include "common.h"
+
void jprobe_return_end(void);
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
@@ -77,7 +78,7 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
* Groups, and some special opcodes can not boost.
* This is non-const and volatile to keep gcc from statically
* optimizing it out, as variable_test_bit makes gcc think only
- * *(unsigned long*) is used.
+ * *(unsigned long*) is used.
*/
static volatile u32 twobyte_is_boostable[256 / 32] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
@@ -108,14 +109,16 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
doesn't switch kernel stack.*/
{NULL, NULL} /* Terminator */
};
+
const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
-static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
+static nokprobe_inline void
+__synthesize_relative_insn(void *from, void *to, u8 op)
{
struct __arch_relative_insn {
u8 op;
s32 raddr;
- } __attribute__((packed)) *insn;
+ } __packed *insn;
insn = (struct __arch_relative_insn *)from;
insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
@@ -123,15 +126,23 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
}
/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-static void __kprobes synthesize_reljump(void *from, void *to)
+void synthesize_reljump(void *from, void *to)
{
__synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
}
+NOKPROBE_SYMBOL(synthesize_reljump);
+
+/* Insert a call instruction at address 'from', which calls address 'to'.*/
+void synthesize_relcall(void *from, void *to)
+{
+ __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
+}
+NOKPROBE_SYMBOL(synthesize_relcall);
/*
* Skip the prefixes of the instruction.
*/
-static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
+static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
{
insn_attr_t attr;
@@ -146,12 +157,13 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
#endif
return insn;
}
+NOKPROBE_SYMBOL(skip_prefixes);
/*
* Returns non-zero if opcode is boostable.
* RIP relative instructions are adjusted at copying time in 64 bits mode
*/
-static int __kprobes can_boost(kprobe_opcode_t *opcodes)
+int can_boost(kprobe_opcode_t *opcodes)
{
kprobe_opcode_t opcode;
kprobe_opcode_t *orig_opcodes = opcodes;
@@ -207,13 +219,15 @@ retry:
}
}
-/* Recover the probed instruction at addr for further analysis. */
-static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+static unsigned long
+__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
{
struct kprobe *kp;
+
kp = get_kprobe((void *)addr);
+ /* There is no probe, return original address */
if (!kp)
- return -EINVAL;
+ return addr;
/*
* Basically, kp->ainsn.insn has an original instruction.
@@ -230,14 +244,29 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
*/
memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
buf[0] = kp->opcode;
- return 0;
+ return (unsigned long)buf;
+}
+
+/*
+ * Recover the probed instruction at addr for further analysis.
+ * Caller must lock kprobes by kprobe_mutex, or disable preemption
+ * for preventing to release referencing kprobes.
+ */
+unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+{
+ unsigned long __addr;
+
+ __addr = __recover_optprobed_insn(buf, addr);
+ if (__addr != addr)
+ return __addr;
+
+ return __recover_probed_insn(buf, addr);
}
/* Check if paddr is at an instruction boundary */
-static int __kprobes can_probe(unsigned long paddr)
+static int can_probe(unsigned long paddr)
{
- int ret;
- unsigned long addr, offset = 0;
+ unsigned long addr, __addr, offset = 0;
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
@@ -247,26 +276,24 @@ static int __kprobes can_probe(unsigned long paddr)
/* Decode instructions */
addr = paddr - offset;
while (addr < paddr) {
- kernel_insn_init(&insn, (void *)addr);
- insn_get_opcode(&insn);
-
/*
* Check if the instruction has been modified by another
* kprobe, in which case we replace the breakpoint by the
* original instruction in our buffer.
+ * Also, jump optimization will change the breakpoint to
+ * relative-jump. Since the relative-jump itself is
+ * normally used, we just go through if there is no kprobe.
*/
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
- ret = recover_probed_instruction(buf, addr);
- if (ret)
- /*
- * Another debugging subsystem might insert
- * this breakpoint. In that case, we can't
- * recover it.
- */
- return 0;
- kernel_insn_init(&insn, buf);
- }
+ __addr = recover_probed_instruction(buf, addr);
+ kernel_insn_init(&insn, (void *)__addr);
insn_get_length(&insn);
+
+ /*
+ * Another debugging subsystem might insert this breakpoint.
+ * In that case, we can't recover it.
+ */
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ return 0;
addr += insn.length;
}
@@ -276,7 +303,7 @@ static int __kprobes can_probe(unsigned long paddr)
/*
* Returns non-zero if opcode modifies the interrupt flag.
*/
-static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
+static int is_IF_modifier(kprobe_opcode_t *insn)
{
/* Skip prefixes */
insn = skip_prefixes(insn);
@@ -299,24 +326,16 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
* If not, return null.
* Only applicable to 64-bit x86.
*/
-static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
+int __copy_instruction(u8 *dest, u8 *src)
{
struct insn insn;
- int ret;
kprobe_opcode_t buf[MAX_INSN_SIZE];
- kernel_insn_init(&insn, src);
- if (recover) {
- insn_get_opcode(&insn);
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
- ret = recover_probed_instruction(buf,
- (unsigned long)src);
- if (ret)
- return 0;
- kernel_insn_init(&insn, buf);
- }
- }
+ kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src));
insn_get_length(&insn);
+ /* Another subsystem puts a breakpoint, failed to recover */
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ return 0;
memcpy(dest, insn.kaddr, insn.length);
#ifdef CONFIG_X86_64
@@ -337,9 +356,12 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
* extension of the original signed 32-bit displacement would
* have given.
*/
- newdisp = (u8 *) src + (s64) insn.displacement.value -
- (u8 *) dest;
- BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
+ newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;
+ if ((s64) (s32) newdisp != newdisp) {
+ pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
+ pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", src, dest, insn.displacement.value);
+ return 0;
+ }
disp = (u8 *) dest + insn_offset_displacement(&insn);
*(s32 *) disp = (s32) newdisp;
}
@@ -347,23 +369,34 @@ static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
return insn.length;
}
-static void __kprobes arch_copy_kprobe(struct kprobe *p)
+static int arch_copy_kprobe(struct kprobe *p)
{
+ int ret;
+
+ /* Copy an instruction with recovering if other optprobe modifies it.*/
+ ret = __copy_instruction(p->ainsn.insn, p->addr);
+ if (!ret)
+ return -EINVAL;
+
/*
- * Copy an instruction without recovering int3, because it will be
- * put by another subsystem.
+ * __copy_instruction can modify the displacement of the instruction,
+ * but it doesn't affect boostable check.
*/
- __copy_instruction(p->ainsn.insn, p->addr, 0);
-
- if (can_boost(p->addr))
+ if (can_boost(p->ainsn.insn))
p->ainsn.boostable = 0;
else
p->ainsn.boostable = -1;
- p->opcode = *p->addr;
+ /* Check whether the instruction modifies Interrupt Flag or not */
+ p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn);
+
+ /* Also, displacement change doesn't affect the first byte */
+ p->opcode = p->ainsn.insn[0];
+
+ return 0;
}
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
+int arch_prepare_kprobe(struct kprobe *p)
{
if (alternatives_text_reserved(p->addr, p->addr))
return -EINVAL;
@@ -374,21 +407,21 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
p->ainsn.insn = get_insn_slot();
if (!p->ainsn.insn)
return -ENOMEM;
- arch_copy_kprobe(p);
- return 0;
+
+ return arch_copy_kprobe(p);
}
-void __kprobes arch_arm_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
{
text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
}
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
+void arch_disarm_kprobe(struct kprobe *p)
{
text_poke(p->addr, &p->opcode, 1);
}
-void __kprobes arch_remove_kprobe(struct kprobe *p)
+void arch_remove_kprobe(struct kprobe *p)
{
if (p->ainsn.insn) {
free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
@@ -396,7 +429,8 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
}
}
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+static nokprobe_inline void
+save_previous_kprobe(struct kprobe_ctlblk *kcb)
{
kcb->prev_kprobe.kp = kprobe_running();
kcb->prev_kprobe.status = kcb->kprobe_status;
@@ -404,7 +438,8 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
}
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+static nokprobe_inline void
+restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
kcb->kprobe_status = kcb->prev_kprobe.status;
@@ -412,17 +447,18 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
}
-static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+static nokprobe_inline void
+set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
__this_cpu_write(current_kprobe, p);
kcb->kprobe_saved_flags = kcb->kprobe_old_flags
= (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
- if (is_IF_modifier(p->ainsn.insn))
+ if (p->ainsn.if_modifier)
kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
}
-static void __kprobes clear_btf(void)
+static nokprobe_inline void clear_btf(void)
{
if (test_thread_flag(TIF_BLOCKSTEP)) {
unsigned long debugctl = get_debugctlmsr();
@@ -432,7 +468,7 @@ static void __kprobes clear_btf(void)
}
}
-static void __kprobes restore_btf(void)
+static nokprobe_inline void restore_btf(void)
{
if (test_thread_flag(TIF_BLOCKSTEP)) {
unsigned long debugctl = get_debugctlmsr();
@@ -442,8 +478,7 @@ static void __kprobes restore_btf(void)
}
}
-void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
- struct pt_regs *regs)
+void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
{
unsigned long *sara = stack_addr(regs);
@@ -452,17 +487,10 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
/* Replace the return addr with trampoline addr */
*sara = (unsigned long) &kretprobe_trampoline;
}
+NOKPROBE_SYMBOL(arch_prepare_kretprobe);
-#ifdef CONFIG_OPTPROBES
-static int __kprobes setup_detour_execution(struct kprobe *p,
- struct pt_regs *regs,
- int reenter);
-#else
-#define setup_detour_execution(p, regs, reenter) (0)
-#endif
-
-static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb, int reenter)
+static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb, int reenter)
{
if (setup_detour_execution(p, regs, reenter))
return;
@@ -498,22 +526,24 @@ static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
else
regs->ip = (unsigned long)p->ainsn.insn;
}
+NOKPROBE_SYMBOL(setup_singlestep);
/*
* We have reentered the kprobe_handler(), since another probe was hit while
* within the handler. We save the original kprobes variables and just single
* step on the instruction of the new probe without calling any user handlers.
*/
-static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
switch (kcb->kprobe_status) {
case KPROBE_HIT_SSDONE:
case KPROBE_HIT_ACTIVE:
+ case KPROBE_HIT_SS:
kprobes_inc_nmissed_count(p);
setup_singlestep(p, regs, kcb, 1);
break;
- case KPROBE_HIT_SS:
+ case KPROBE_REENTER:
/* A probe has been hit in the codepath leading up to, or just
* after, single-stepping of a probed instruction. This entire
* codepath should strictly reside in .kprobes.text section.
@@ -532,17 +562,21 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
return 1;
}
+NOKPROBE_SYMBOL(reenter_kprobe);
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate and they
* remain disabled throughout this function.
*/
-static int __kprobes kprobe_handler(struct pt_regs *regs)
+int kprobe_int3_handler(struct pt_regs *regs)
{
kprobe_opcode_t *addr;
struct kprobe *p;
struct kprobe_ctlblk *kcb;
+ if (user_mode_vm(regs))
+ return 0;
+
addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
/*
* We don't want to be preempted for the entire
@@ -591,7 +625,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
} else if (kprobe_running()) {
p = __this_cpu_read(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
- setup_singlestep(p, regs, kcb, 0);
+ if (!skip_singlestep(p, regs, kcb))
+ setup_singlestep(p, regs, kcb, 0);
return 1;
}
} /* else: not a kprobe fault; let the kernel handle it */
@@ -599,75 +634,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
preempt_enable_no_resched();
return 0;
}
-
-#ifdef CONFIG_X86_64
-#define SAVE_REGS_STRING \
- /* Skip cs, ip, orig_ax. */ \
- " subq $24, %rsp\n" \
- " pushq %rdi\n" \
- " pushq %rsi\n" \
- " pushq %rdx\n" \
- " pushq %rcx\n" \
- " pushq %rax\n" \
- " pushq %r8\n" \
- " pushq %r9\n" \
- " pushq %r10\n" \
- " pushq %r11\n" \
- " pushq %rbx\n" \
- " pushq %rbp\n" \
- " pushq %r12\n" \
- " pushq %r13\n" \
- " pushq %r14\n" \
- " pushq %r15\n"
-#define RESTORE_REGS_STRING \
- " popq %r15\n" \
- " popq %r14\n" \
- " popq %r13\n" \
- " popq %r12\n" \
- " popq %rbp\n" \
- " popq %rbx\n" \
- " popq %r11\n" \
- " popq %r10\n" \
- " popq %r9\n" \
- " popq %r8\n" \
- " popq %rax\n" \
- " popq %rcx\n" \
- " popq %rdx\n" \
- " popq %rsi\n" \
- " popq %rdi\n" \
- /* Skip orig_ax, ip, cs */ \
- " addq $24, %rsp\n"
-#else
-#define SAVE_REGS_STRING \
- /* Skip cs, ip, orig_ax and gs. */ \
- " subl $16, %esp\n" \
- " pushl %fs\n" \
- " pushl %es\n" \
- " pushl %ds\n" \
- " pushl %eax\n" \
- " pushl %ebp\n" \
- " pushl %edi\n" \
- " pushl %esi\n" \
- " pushl %edx\n" \
- " pushl %ecx\n" \
- " pushl %ebx\n"
-#define RESTORE_REGS_STRING \
- " popl %ebx\n" \
- " popl %ecx\n" \
- " popl %edx\n" \
- " popl %esi\n" \
- " popl %edi\n" \
- " popl %ebp\n" \
- " popl %eax\n" \
- /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
- " addl $24, %esp\n"
-#endif
+NOKPROBE_SYMBOL(kprobe_int3_handler);
/*
* When a retprobed function returns, this code saves registers and
* calls trampoline_handler() runs, which calls the kretprobe's handler.
*/
-static void __used __kprobes kretprobe_trampoline_holder(void)
+static void __used kretprobe_trampoline_holder(void)
{
asm volatile (
".global kretprobe_trampoline\n"
@@ -698,15 +671,17 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
#endif
" ret\n");
}
+NOKPROBE_SYMBOL(kretprobe_trampoline_holder);
+NOKPROBE_SYMBOL(kretprobe_trampoline);
/*
* Called from kretprobe_trampoline
*/
-static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
+__visible __used void *trampoline_handler(struct pt_regs *regs)
{
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
kprobe_opcode_t *correct_ret_addr = NULL;
@@ -736,7 +711,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
* will be the real return address, and all the rest will
* point to kretprobe_trampoline.
*/
- hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
@@ -755,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
kretprobe_assert(ri, orig_ret_address, trampoline_address);
correct_ret_addr = ri->ret_addr;
- hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
@@ -782,12 +757,13 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
kretprobe_hash_unlock(current, &flags);
- hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+ hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
return (void *)orig_ret_address;
}
+NOKPROBE_SYMBOL(trampoline_handler);
/*
* Called after single-stepping. p->addr is the address of the
@@ -816,8 +792,8 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
* jump instruction after the copied instruction, that jumps to the next
* instruction after the probepoint.
*/
-static void __kprobes resume_execution(struct kprobe *p,
- struct pt_regs *regs, struct kprobe_ctlblk *kcb)
+static void resume_execution(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
unsigned long *tos = stack_addr(regs);
unsigned long copy_ip = (unsigned long)p->ainsn.insn;
@@ -892,12 +868,13 @@ static void __kprobes resume_execution(struct kprobe *p,
no_change:
restore_btf();
}
+NOKPROBE_SYMBOL(resume_execution);
/*
* Interrupts are disabled on entry as trap1 is an interrupt gate and they
* remain disabled throughout this function.
*/
-static int __kprobes post_kprobe_handler(struct pt_regs *regs)
+int kprobe_debug_handler(struct pt_regs *regs)
{
struct kprobe *cur = kprobe_running();
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -932,15 +909,17 @@ out:
return 1;
}
+NOKPROBE_SYMBOL(kprobe_debug_handler);
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
struct kprobe *cur = kprobe_running();
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
- switch (kcb->kprobe_status) {
- case KPROBE_HIT_SS:
- case KPROBE_REENTER:
+ if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) {
+ /* This must happen on single-stepping */
+ WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS &&
+ kcb->kprobe_status != KPROBE_REENTER);
/*
* We are here because the instruction being single
* stepped caused a page fault. We reset the current
@@ -955,9 +934,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
else
reset_current_kprobe();
preempt_enable_no_resched();
- break;
- case KPROBE_HIT_ACTIVE:
- case KPROBE_HIT_SSDONE:
+ } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE ||
+ kcb->kprobe_status == KPROBE_HIT_SSDONE) {
/*
* We increment the nmissed count for accounting,
* we can also use npre/npostfault count for accounting
@@ -986,18 +964,17 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
* fixup routine could not handle it,
* Let do_page_fault() fix it.
*/
- break;
- default:
- break;
}
+
return 0;
}
+NOKPROBE_SYMBOL(kprobe_fault_handler);
/*
* Wrapper routine for handling exceptions.
*/
-int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
- unsigned long val, void *data)
+int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
+ void *data)
{
struct die_args *args = data;
int ret = NOTIFY_DONE;
@@ -1005,22 +982,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
if (args->regs && user_mode_vm(args->regs))
return ret;
- switch (val) {
- case DIE_INT3:
- if (kprobe_handler(args->regs))
- ret = NOTIFY_STOP;
- break;
- case DIE_DEBUG:
- if (post_kprobe_handler(args->regs)) {
- /*
- * Reset the BS bit in dr6 (pointed by args->err) to
- * denote completion of processing
- */
- (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
- ret = NOTIFY_STOP;
- }
- break;
- case DIE_GPF:
+ if (val == DIE_GPF) {
/*
* To be potentially processing a kprobe fault and to
* trust the result from kprobe_running(), we have
@@ -1029,14 +991,12 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
if (!preemptible() && kprobe_running() &&
kprobe_fault_handler(args->regs, args->trapnr))
ret = NOTIFY_STOP;
- break;
- default:
- break;
}
return ret;
}
+NOKPROBE_SYMBOL(kprobe_exceptions_notify);
-int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct jprobe *jp = container_of(p, struct jprobe, kp);
unsigned long addr;
@@ -1060,8 +1020,9 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
regs->ip = (unsigned long)(jp->entry);
return 1;
}
+NOKPROBE_SYMBOL(setjmp_pre_handler);
-void __kprobes jprobe_return(void)
+void jprobe_return(void)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -1077,8 +1038,10 @@ void __kprobes jprobe_return(void)
" nop \n"::"b"
(kcb->jprobe_saved_sp):"memory");
}
+NOKPROBE_SYMBOL(jprobe_return);
+NOKPROBE_SYMBOL(jprobe_return_end);
-int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
u8 *addr = (u8 *) (regs->ip - 1);
@@ -1092,9 +1055,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
"current sp %p does not match saved sp %p\n",
stack_addr(regs), kcb->jprobe_saved_sp);
printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
- show_registers(saved_regs);
+ show_regs(saved_regs);
printk(KERN_ERR "Current registers\n");
- show_registers(regs);
+ show_regs(regs);
BUG();
}
*regs = kcb->jprobe_saved_regs;
@@ -1106,470 +1069,22 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
}
return 0;
}
+NOKPROBE_SYMBOL(longjmp_break_handler);
-
-#ifdef CONFIG_OPTPROBES
-
-/* Insert a call instruction at address 'from', which calls address 'to'.*/
-static void __kprobes synthesize_relcall(void *from, void *to)
+bool arch_within_kprobe_blacklist(unsigned long addr)
{
- __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
+ return (addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end) ||
+ (addr >= (unsigned long)__entry_text_start &&
+ addr < (unsigned long)__entry_text_end);
}
-/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
-static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
- unsigned long val)
-{
-#ifdef CONFIG_X86_64
- *addr++ = 0x48;
- *addr++ = 0xbf;
-#else
- *addr++ = 0xb8;
-#endif
- *(unsigned long *)addr = val;
-}
-
-static void __used __kprobes kprobes_optinsn_template_holder(void)
-{
- asm volatile (
- ".global optprobe_template_entry\n"
- "optprobe_template_entry: \n"
-#ifdef CONFIG_X86_64
- /* We don't bother saving the ss register */
- " pushq %rsp\n"
- " pushfq\n"
- SAVE_REGS_STRING
- " movq %rsp, %rsi\n"
- ".global optprobe_template_val\n"
- "optprobe_template_val: \n"
- ASM_NOP5
- ASM_NOP5
- ".global optprobe_template_call\n"
- "optprobe_template_call: \n"
- ASM_NOP5
- /* Move flags to rsp */
- " movq 144(%rsp), %rdx\n"
- " movq %rdx, 152(%rsp)\n"
- RESTORE_REGS_STRING
- /* Skip flags entry */
- " addq $8, %rsp\n"
- " popfq\n"
-#else /* CONFIG_X86_32 */
- " pushf\n"
- SAVE_REGS_STRING
- " movl %esp, %edx\n"
- ".global optprobe_template_val\n"
- "optprobe_template_val: \n"
- ASM_NOP5
- ".global optprobe_template_call\n"
- "optprobe_template_call: \n"
- ASM_NOP5
- RESTORE_REGS_STRING
- " addl $4, %esp\n" /* skip cs */
- " popf\n"
-#endif
- ".global optprobe_template_end\n"
- "optprobe_template_end: \n");
-}
-
-#define TMPL_MOVE_IDX \
- ((long)&optprobe_template_val - (long)&optprobe_template_entry)
-#define TMPL_CALL_IDX \
- ((long)&optprobe_template_call - (long)&optprobe_template_entry)
-#define TMPL_END_IDX \
- ((long)&optprobe_template_end - (long)&optprobe_template_entry)
-
-#define INT3_SIZE sizeof(kprobe_opcode_t)
-
-/* Optimized kprobe call back function: called from optinsn */
-static void __kprobes optimized_callback(struct optimized_kprobe *op,
- struct pt_regs *regs)
-{
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
- unsigned long flags;
-
- /* This is possible if op is under delayed unoptimizing */
- if (kprobe_disabled(&op->kp))
- return;
-
- local_irq_save(flags);
- if (kprobe_running()) {
- kprobes_inc_nmissed_count(&op->kp);
- } else {
- /* Save skipped registers */
-#ifdef CONFIG_X86_64
- regs->cs = __KERNEL_CS;
-#else
- regs->cs = __KERNEL_CS | get_kernel_rpl();
- regs->gs = 0;
-#endif
- regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
- regs->orig_ax = ~0UL;
-
- __this_cpu_write(current_kprobe, &op->kp);
- kcb->kprobe_status = KPROBE_HIT_ACTIVE;
- opt_pre_handler(&op->kp, regs);
- __this_cpu_write(current_kprobe, NULL);
- }
- local_irq_restore(flags);
-}
-
-static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
-{
- int len = 0, ret;
-
- while (len < RELATIVEJUMP_SIZE) {
- ret = __copy_instruction(dest + len, src + len, 1);
- if (!ret || !can_boost(dest + len))
- return -EINVAL;
- len += ret;
- }
- /* Check whether the address range is reserved */
- if (ftrace_text_reserved(src, src + len - 1) ||
- alternatives_text_reserved(src, src + len - 1) ||
- jump_label_text_reserved(src, src + len - 1))
- return -EBUSY;
-
- return len;
-}
-
-/* Check whether insn is indirect jump */
-static int __kprobes insn_is_indirect_jump(struct insn *insn)
-{
- return ((insn->opcode.bytes[0] == 0xff &&
- (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
- insn->opcode.bytes[0] == 0xea); /* Segment based jump */
-}
-
-/* Check whether insn jumps into specified address range */
-static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
-{
- unsigned long target = 0;
-
- switch (insn->opcode.bytes[0]) {
- case 0xe0: /* loopne */
- case 0xe1: /* loope */
- case 0xe2: /* loop */
- case 0xe3: /* jcxz */
- case 0xe9: /* near relative jump */
- case 0xeb: /* short relative jump */
- break;
- case 0x0f:
- if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
- break;
- return 0;
- default:
- if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
- break;
- return 0;
- }
- target = (unsigned long)insn->next_byte + insn->immediate.value;
-
- return (start <= target && target <= start + len);
-}
-
-/* Decode whole function to ensure any instructions don't jump into target */
-static int __kprobes can_optimize(unsigned long paddr)
-{
- int ret;
- unsigned long addr, size = 0, offset = 0;
- struct insn insn;
- kprobe_opcode_t buf[MAX_INSN_SIZE];
-
- /* Lookup symbol including addr */
- if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
- return 0;
-
- /*
- * Do not optimize in the entry code due to the unstable
- * stack handling.
- */
- if ((paddr >= (unsigned long )__entry_text_start) &&
- (paddr < (unsigned long )__entry_text_end))
- return 0;
-
- /* Check there is enough space for a relative jump. */
- if (size - offset < RELATIVEJUMP_SIZE)
- return 0;
-
- /* Decode instructions */
- addr = paddr - offset;
- while (addr < paddr - offset + size) { /* Decode until function end */
- if (search_exception_tables(addr))
- /*
- * Since some fixup code will jumps into this function,
- * we can't optimize kprobe in this function.
- */
- return 0;
- kernel_insn_init(&insn, (void *)addr);
- insn_get_opcode(&insn);
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
- ret = recover_probed_instruction(buf, addr);
- if (ret)
- return 0;
- kernel_insn_init(&insn, buf);
- }
- insn_get_length(&insn);
- /* Recover address */
- insn.kaddr = (void *)addr;
- insn.next_byte = (void *)(addr + insn.length);
- /* Check any instructions don't jump into target */
- if (insn_is_indirect_jump(&insn) ||
- insn_jump_into_range(&insn, paddr + INT3_SIZE,
- RELATIVE_ADDR_SIZE))
- return 0;
- addr += insn.length;
- }
-
- return 1;
-}
-
-/* Check optimized_kprobe can actually be optimized. */
-int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
-{
- int i;
- struct kprobe *p;
-
- for (i = 1; i < op->optinsn.size; i++) {
- p = get_kprobe(op->kp.addr + i);
- if (p && !kprobe_disabled(p))
- return -EEXIST;
- }
-
- return 0;
-}
-
-/* Check the addr is within the optimized instructions. */
-int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
- unsigned long addr)
-{
- return ((unsigned long)op->kp.addr <= addr &&
- (unsigned long)op->kp.addr + op->optinsn.size > addr);
-}
-
-/* Free optimized instruction slot */
-static __kprobes
-void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
-{
- if (op->optinsn.insn) {
- free_optinsn_slot(op->optinsn.insn, dirty);
- op->optinsn.insn = NULL;
- op->optinsn.size = 0;
- }
-}
-
-void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
-{
- __arch_remove_optimized_kprobe(op, 1);
-}
-
-/*
- * Copy replacing target instructions
- * Target instructions MUST be relocatable (checked inside)
- */
-int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
-{
- u8 *buf;
- int ret;
- long rel;
-
- if (!can_optimize((unsigned long)op->kp.addr))
- return -EILSEQ;
-
- op->optinsn.insn = get_optinsn_slot();
- if (!op->optinsn.insn)
- return -ENOMEM;
-
- /*
- * Verify if the address gap is in 2GB range, because this uses
- * a relative jump.
- */
- rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
- if (abs(rel) > 0x7fffffff)
- return -ERANGE;
-
- buf = (u8 *)op->optinsn.insn;
-
- /* Copy instructions into the out-of-line buffer */
- ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
- if (ret < 0) {
- __arch_remove_optimized_kprobe(op, 0);
- return ret;
- }
- op->optinsn.size = ret;
-
- /* Copy arch-dep-instance from template */
- memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
-
- /* Set probe information */
- synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
-
- /* Set probe function call */
- synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
-
- /* Set returning jmp instruction at the tail of out-of-line buffer */
- synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
- (u8 *)op->kp.addr + op->optinsn.size);
-
- flush_icache_range((unsigned long) buf,
- (unsigned long) buf + TMPL_END_IDX +
- op->optinsn.size + RELATIVEJUMP_SIZE);
- return 0;
-}
-
-#define MAX_OPTIMIZE_PROBES 256
-static struct text_poke_param *jump_poke_params;
-static struct jump_poke_buffer {
- u8 buf[RELATIVEJUMP_SIZE];
-} *jump_poke_bufs;
-
-static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
- u8 *insn_buf,
- struct optimized_kprobe *op)
-{
- s32 rel = (s32)((long)op->optinsn.insn -
- ((long)op->kp.addr + RELATIVEJUMP_SIZE));
-
- /* Backup instructions which will be replaced by jump address */
- memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
- RELATIVE_ADDR_SIZE);
-
- insn_buf[0] = RELATIVEJUMP_OPCODE;
- *(s32 *)(&insn_buf[1]) = rel;
-
- tprm->addr = op->kp.addr;
- tprm->opcode = insn_buf;
- tprm->len = RELATIVEJUMP_SIZE;
-}
-
-/*
- * Replace breakpoints (int3) with relative jumps.
- * Caller must call with locking kprobe_mutex and text_mutex.
- */
-void __kprobes arch_optimize_kprobes(struct list_head *oplist)
-{
- struct optimized_kprobe *op, *tmp;
- int c = 0;
-
- list_for_each_entry_safe(op, tmp, oplist, list) {
- WARN_ON(kprobe_disabled(&op->kp));
- /* Setup param */
- setup_optimize_kprobe(&jump_poke_params[c],
- jump_poke_bufs[c].buf, op);
- list_del_init(&op->list);
- if (++c >= MAX_OPTIMIZE_PROBES)
- break;
- }
-
- /*
- * text_poke_smp doesn't support NMI/MCE code modifying.
- * However, since kprobes itself also doesn't support NMI/MCE
- * code probing, it's not a problem.
- */
- text_poke_smp_batch(jump_poke_params, c);
-}
-
-static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
- u8 *insn_buf,
- struct optimized_kprobe *op)
-{
- /* Set int3 to first byte for kprobes */
- insn_buf[0] = BREAKPOINT_INSTRUCTION;
- memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
-
- tprm->addr = op->kp.addr;
- tprm->opcode = insn_buf;
- tprm->len = RELATIVEJUMP_SIZE;
-}
-
-/*
- * Recover original instructions and breakpoints from relative jumps.
- * Caller must call with locking kprobe_mutex.
- */
-extern void arch_unoptimize_kprobes(struct list_head *oplist,
- struct list_head *done_list)
-{
- struct optimized_kprobe *op, *tmp;
- int c = 0;
-
- list_for_each_entry_safe(op, tmp, oplist, list) {
- /* Setup param */
- setup_unoptimize_kprobe(&jump_poke_params[c],
- jump_poke_bufs[c].buf, op);
- list_move(&op->list, done_list);
- if (++c >= MAX_OPTIMIZE_PROBES)
- break;
- }
-
- /*
- * text_poke_smp doesn't support NMI/MCE code modifying.
- * However, since kprobes itself also doesn't support NMI/MCE
- * code probing, it's not a problem.
- */
- text_poke_smp_batch(jump_poke_params, c);
-}
-
-/* Replace a relative jump with a breakpoint (int3). */
-void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
-{
- u8 buf[RELATIVEJUMP_SIZE];
-
- /* Set int3 to first byte for kprobes */
- buf[0] = BREAKPOINT_INSTRUCTION;
- memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
- text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
-}
-
-static int __kprobes setup_detour_execution(struct kprobe *p,
- struct pt_regs *regs,
- int reenter)
-{
- struct optimized_kprobe *op;
-
- if (p->flags & KPROBE_FLAG_OPTIMIZED) {
- /* This kprobe is really able to run optimized path. */
- op = container_of(p, struct optimized_kprobe, kp);
- /* Detour through copied instructions */
- regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
- if (!reenter)
- reset_current_kprobe();
- preempt_enable_no_resched();
- return 1;
- }
- return 0;
-}
-
-static int __kprobes init_poke_params(void)
-{
- /* Allocate code buffer and parameter array */
- jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
- MAX_OPTIMIZE_PROBES, GFP_KERNEL);
- if (!jump_poke_bufs)
- return -ENOMEM;
-
- jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
- MAX_OPTIMIZE_PROBES, GFP_KERNEL);
- if (!jump_poke_params) {
- kfree(jump_poke_bufs);
- jump_poke_bufs = NULL;
- return -ENOMEM;
- }
-
- return 0;
-}
-#else /* !CONFIG_OPTPROBES */
-static int __kprobes init_poke_params(void)
-{
- return 0;
-}
-#endif
-
int __init arch_init_kprobes(void)
{
- return init_poke_params();
+ return 0;
}
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+int arch_trampoline_kprobe(struct kprobe *p)
{
return 0;
}
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
new file mode 100644
index 00000000000..717b02a22e6
--- /dev/null
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -0,0 +1,96 @@
+/*
+ * Dynamic Ftrace based Kprobes Optimization
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) Hitachi Ltd., 2012
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/ftrace.h>
+
+#include "common.h"
+
+static nokprobe_inline
+int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ /*
+ * Emulate singlestep (and also recover regs->ip)
+ * as if there is a 5byte nop
+ */
+ regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
+ if (unlikely(p->post_handler)) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ p->post_handler(p, regs, 0);
+ }
+ __this_cpu_write(current_kprobe, NULL);
+ return 1;
+}
+
+int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ if (kprobe_ftrace(p))
+ return __skip_singlestep(p, regs, kcb);
+ else
+ return 0;
+}
+NOKPROBE_SYMBOL(skip_singlestep);
+
+/* Ftrace callback handler for kprobes */
+void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
+{
+ struct kprobe *p;
+ struct kprobe_ctlblk *kcb;
+ unsigned long flags;
+
+ /* Disable irq for emulating a breakpoint and avoiding preempt */
+ local_irq_save(flags);
+
+ p = get_kprobe((kprobe_opcode_t *)ip);
+ if (unlikely(!p) || kprobe_disabled(p))
+ goto end;
+
+ kcb = get_kprobe_ctlblk();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(p);
+ } else {
+ /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
+ regs->ip = ip + sizeof(kprobe_opcode_t);
+
+ __this_cpu_write(current_kprobe, p);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ if (!p->pre_handler || !p->pre_handler(p, regs))
+ __skip_singlestep(p, regs, kcb);
+ /*
+ * If pre_handler returns !0, it sets regs->ip and
+ * resets current kprobe.
+ */
+ }
+end:
+ local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
+
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+ p->ainsn.insn = NULL;
+ p->ainsn.boostable = -1;
+ return 0;
+}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
new file mode 100644
index 00000000000..f304773285a
--- /dev/null
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -0,0 +1,445 @@
+/*
+ * Kernel Probes Jump Optimization (Optprobes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) Hitachi Ltd., 2012
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/kallsyms.h>
+#include <linux/ftrace.h>
+
+#include <asm/cacheflush.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
+
+#include "common.h"
+
+unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ struct optimized_kprobe *op;
+ struct kprobe *kp;
+ long offs;
+ int i;
+
+ for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
+ kp = get_kprobe((void *)addr - i);
+ /* This function only handles jump-optimized kprobe */
+ if (kp && kprobe_optimized(kp)) {
+ op = container_of(kp, struct optimized_kprobe, kp);
+ /* If op->list is not empty, op is under optimizing */
+ if (list_empty(&op->list))
+ goto found;
+ }
+ }
+
+ return addr;
+found:
+ /*
+ * If the kprobe can be optimized, original bytes which can be
+ * overwritten by jump destination address. In this case, original
+ * bytes must be recovered from op->optinsn.copied_insn buffer.
+ */
+ memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+ if (addr == (unsigned long)kp->addr) {
+ buf[0] = kp->opcode;
+ memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ } else {
+ offs = addr - (unsigned long)kp->addr - 1;
+ memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
+ }
+
+ return (unsigned long)buf;
+}
+
+/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
+static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
+{
+#ifdef CONFIG_X86_64
+ *addr++ = 0x48;
+ *addr++ = 0xbf;
+#else
+ *addr++ = 0xb8;
+#endif
+ *(unsigned long *)addr = val;
+}
+
+asm (
+ ".global optprobe_template_entry\n"
+ "optprobe_template_entry:\n"
+#ifdef CONFIG_X86_64
+ /* We don't bother saving the ss register */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ " movq %rsp, %rsi\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val:\n"
+ ASM_NOP5
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call:\n"
+ ASM_NOP5
+ /* Move flags to rsp */
+ " movq 144(%rsp), %rdx\n"
+ " movq %rdx, 152(%rsp)\n"
+ RESTORE_REGS_STRING
+ /* Skip flags entry */
+ " addq $8, %rsp\n"
+ " popfq\n"
+#else /* CONFIG_X86_32 */
+ " pushf\n"
+ SAVE_REGS_STRING
+ " movl %esp, %edx\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val:\n"
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call:\n"
+ ASM_NOP5
+ RESTORE_REGS_STRING
+ " addl $4, %esp\n" /* skip cs */
+ " popf\n"
+#endif
+ ".global optprobe_template_end\n"
+ "optprobe_template_end:\n");
+
+#define TMPL_MOVE_IDX \
+ ((long)&optprobe_template_val - (long)&optprobe_template_entry)
+#define TMPL_CALL_IDX \
+ ((long)&optprobe_template_call - (long)&optprobe_template_entry)
+#define TMPL_END_IDX \
+ ((long)&optprobe_template_end - (long)&optprobe_template_entry)
+
+#define INT3_SIZE sizeof(kprobe_opcode_t)
+
+/* Optimized kprobe call back function: called from optinsn */
+static void
+optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
+{
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ unsigned long flags;
+
+ /* This is possible if op is under delayed unoptimizing */
+ if (kprobe_disabled(&op->kp))
+ return;
+
+ local_irq_save(flags);
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(&op->kp);
+ } else {
+ /* Save skipped registers */
+#ifdef CONFIG_X86_64
+ regs->cs = __KERNEL_CS;
+#else
+ regs->cs = __KERNEL_CS | get_kernel_rpl();
+ regs->gs = 0;
+#endif
+ regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+ regs->orig_ax = ~0UL;
+
+ __this_cpu_write(current_kprobe, &op->kp);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ opt_pre_handler(&op->kp, regs);
+ __this_cpu_write(current_kprobe, NULL);
+ }
+ local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(optimized_callback);
+
+static int copy_optimized_instructions(u8 *dest, u8 *src)
+{
+ int len = 0, ret;
+
+ while (len < RELATIVEJUMP_SIZE) {
+ ret = __copy_instruction(dest + len, src + len);
+ if (!ret || !can_boost(dest + len))
+ return -EINVAL;
+ len += ret;
+ }
+ /* Check whether the address range is reserved */
+ if (ftrace_text_reserved(src, src + len - 1) ||
+ alternatives_text_reserved(src, src + len - 1) ||
+ jump_label_text_reserved(src, src + len - 1))
+ return -EBUSY;
+
+ return len;
+}
+
+/* Check whether insn is indirect jump */
+static int insn_is_indirect_jump(struct insn *insn)
+{
+ return ((insn->opcode.bytes[0] == 0xff &&
+ (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
+ insn->opcode.bytes[0] == 0xea); /* Segment based jump */
+}
+
+/* Check whether insn jumps into specified address range */
+static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
+{
+ unsigned long target = 0;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0xe0: /* loopne */
+ case 0xe1: /* loope */
+ case 0xe2: /* loop */
+ case 0xe3: /* jcxz */
+ case 0xe9: /* near relative jump */
+ case 0xeb: /* short relative jump */
+ break;
+ case 0x0f:
+ if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
+ break;
+ return 0;
+ default:
+ if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
+ break;
+ return 0;
+ }
+ target = (unsigned long)insn->next_byte + insn->immediate.value;
+
+ return (start <= target && target <= start + len);
+}
+
+/* Decode whole function to ensure any instructions don't jump into target */
+static int can_optimize(unsigned long paddr)
+{
+ unsigned long addr, size = 0, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+ /* Lookup symbol including addr */
+ if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
+ return 0;
+
+ /*
+ * Do not optimize in the entry code due to the unstable
+ * stack handling.
+ */
+ if ((paddr >= (unsigned long)__entry_text_start) &&
+ (paddr < (unsigned long)__entry_text_end))
+ return 0;
+
+ /* Check there is enough space for a relative jump. */
+ if (size - offset < RELATIVEJUMP_SIZE)
+ return 0;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr - offset + size) { /* Decode until function end */
+ if (search_exception_tables(addr))
+ /*
+ * Since some fixup code will jumps into this function,
+ * we can't optimize kprobe in this function.
+ */
+ return 0;
+ kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr));
+ insn_get_length(&insn);
+ /* Another subsystem puts a breakpoint */
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ return 0;
+ /* Recover address */
+ insn.kaddr = (void *)addr;
+ insn.next_byte = (void *)(addr + insn.length);
+ /* Check any instructions don't jump into target */
+ if (insn_is_indirect_jump(&insn) ||
+ insn_jump_into_range(&insn, paddr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE))
+ return 0;
+ addr += insn.length;
+ }
+
+ return 1;
+}
+
+/* Check optimized_kprobe can actually be optimized. */
+int arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+ int i;
+ struct kprobe *p;
+
+ for (i = 1; i < op->optinsn.size; i++) {
+ p = get_kprobe(op->kp.addr + i);
+ if (p && !kprobe_disabled(p))
+ return -EEXIST;
+ }
+
+ return 0;
+}
+
+/* Check the addr is within the optimized instructions. */
+int arch_within_optimized_kprobe(struct optimized_kprobe *op,
+ unsigned long addr)
+{
+ return ((unsigned long)op->kp.addr <= addr &&
+ (unsigned long)op->kp.addr + op->optinsn.size > addr);
+}
+
+/* Free optimized instruction slot */
+static
+void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+ if (op->optinsn.insn) {
+ free_optinsn_slot(op->optinsn.insn, dirty);
+ op->optinsn.insn = NULL;
+ op->optinsn.size = 0;
+ }
+}
+
+void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+ __arch_remove_optimized_kprobe(op, 1);
+}
+
+/*
+ * Copy replacing target instructions
+ * Target instructions MUST be relocatable (checked inside)
+ * This is called when new aggr(opt)probe is allocated or reused.
+ */
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+{
+ u8 *buf;
+ int ret;
+ long rel;
+
+ if (!can_optimize((unsigned long)op->kp.addr))
+ return -EILSEQ;
+
+ op->optinsn.insn = get_optinsn_slot();
+ if (!op->optinsn.insn)
+ return -ENOMEM;
+
+ /*
+ * Verify if the address gap is in 2GB range, because this uses
+ * a relative jump.
+ */
+ rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
+ if (abs(rel) > 0x7fffffff)
+ return -ERANGE;
+
+ buf = (u8 *)op->optinsn.insn;
+
+ /* Copy instructions into the out-of-line buffer */
+ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
+ if (ret < 0) {
+ __arch_remove_optimized_kprobe(op, 0);
+ return ret;
+ }
+ op->optinsn.size = ret;
+
+ /* Copy arch-dep-instance from template */
+ memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
+
+ /* Set probe information */
+ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
+
+ /* Set probe function call */
+ synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
+
+ /* Set returning jmp instruction at the tail of out-of-line buffer */
+ synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
+ (u8 *)op->kp.addr + op->optinsn.size);
+
+ flush_icache_range((unsigned long) buf,
+ (unsigned long) buf + TMPL_END_IDX +
+ op->optinsn.size + RELATIVEJUMP_SIZE);
+ return 0;
+}
+
+/*
+ * Replace breakpoints (int3) with relative jumps.
+ * Caller must call with locking kprobe_mutex and text_mutex.
+ */
+void arch_optimize_kprobes(struct list_head *oplist)
+{
+ struct optimized_kprobe *op, *tmp;
+ u8 insn_buf[RELATIVEJUMP_SIZE];
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ s32 rel = (s32)((long)op->optinsn.insn -
+ ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
+ WARN_ON(kprobe_disabled(&op->kp));
+
+ /* Backup instructions which will be replaced by jump address */
+ memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE);
+
+ insn_buf[0] = RELATIVEJUMP_OPCODE;
+ *(s32 *)(&insn_buf[1]) = rel;
+
+ text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
+ op->optinsn.insn);
+
+ list_del_init(&op->list);
+ }
+}
+
+/* Replace a relative jump with a breakpoint (int3). */
+void arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+ u8 insn_buf[RELATIVEJUMP_SIZE];
+
+ /* Set int3 to first byte for kprobes */
+ insn_buf[0] = BREAKPOINT_INSTRUCTION;
+ memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
+ op->optinsn.insn);
+}
+
+/*
+ * Recover original instructions and breakpoints from relative jumps.
+ * Caller must call with locking kprobe_mutex.
+ */
+extern void arch_unoptimize_kprobes(struct list_head *oplist,
+ struct list_head *done_list)
+{
+ struct optimized_kprobe *op, *tmp;
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ arch_unoptimize_kprobe(op);
+ list_move(&op->list, done_list);
+ }
+}
+
+int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+{
+ struct optimized_kprobe *op;
+
+ if (p->flags & KPROBE_FLAG_OPTIMIZED) {
+ /* This kprobe is really able to run optimized path. */
+ op = container_of(p, struct optimized_kprobe, kp);
+ /* Detour through copied instructions */
+ regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
+ if (!reenter)
+ reset_current_kprobe();
+ preempt_enable_no_resched();
+ return 1;
+ }
+ return 0;
+}
+NOKPROBE_SYMBOL(setup_detour_execution);
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
new file mode 100644
index 00000000000..c2bedaea11f
--- /dev/null
+++ b/arch/x86/kernel/ksysfs.c
@@ -0,0 +1,340 @@
+/*
+ * Architecture specific sysfs attributes in /sys/kernel
+ *
+ * Copyright (C) 2007, Intel Corp.
+ * Huang Ying <ying.huang@intel.com>
+ * Copyright (C) 2013, 2013 Red Hat, Inc.
+ * Dave Young <dyoung@redhat.com>
+ *
+ * This file is released under the GPLv2
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+#include <asm/io.h>
+#include <asm/setup.h>
+
+static ssize_t version_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "0x%04x\n", boot_params.hdr.version);
+}
+
+static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version);
+
+static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
+{
+ memcpy(buf, (void *)&boot_params + off, count);
+ return count;
+}
+
+static struct bin_attribute boot_params_data_attr = {
+ .attr = {
+ .name = "data",
+ .mode = S_IRUGO,
+ },
+ .read = boot_params_data_read,
+ .size = sizeof(boot_params),
+};
+
+static struct attribute *boot_params_version_attrs[] = {
+ &boot_params_version_attr.attr,
+ NULL,
+};
+
+static struct bin_attribute *boot_params_data_attrs[] = {
+ &boot_params_data_attr,
+ NULL,
+};
+
+static struct attribute_group boot_params_attr_group = {
+ .attrs = boot_params_version_attrs,
+ .bin_attrs = boot_params_data_attrs,
+};
+
+static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
+{
+ const char *name;
+
+ name = kobject_name(kobj);
+ return kstrtoint(name, 10, nr);
+}
+
+static int get_setup_data_paddr(int nr, u64 *paddr)
+{
+ int i = 0;
+ struct setup_data *data;
+ u64 pa_data = boot_params.hdr.setup_data;
+
+ while (pa_data) {
+ if (nr == i) {
+ *paddr = pa_data;
+ return 0;
+ }
+ data = ioremap_cache(pa_data, sizeof(*data));
+ if (!data)
+ return -ENOMEM;
+
+ pa_data = data->next;
+ iounmap(data);
+ i++;
+ }
+ return -EINVAL;
+}
+
+static int __init get_setup_data_size(int nr, size_t *size)
+{
+ int i = 0;
+ struct setup_data *data;
+ u64 pa_data = boot_params.hdr.setup_data;
+
+ while (pa_data) {
+ data = ioremap_cache(pa_data, sizeof(*data));
+ if (!data)
+ return -ENOMEM;
+ if (nr == i) {
+ *size = data->len;
+ iounmap(data);
+ return 0;
+ }
+
+ pa_data = data->next;
+ iounmap(data);
+ i++;
+ }
+ return -EINVAL;
+}
+
+static ssize_t type_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int nr, ret;
+ u64 paddr;
+ struct setup_data *data;
+
+ ret = kobj_to_setup_data_nr(kobj, &nr);
+ if (ret)
+ return ret;
+
+ ret = get_setup_data_paddr(nr, &paddr);
+ if (ret)
+ return ret;
+ data = ioremap_cache(paddr, sizeof(*data));
+ if (!data)
+ return -ENOMEM;
+
+ ret = sprintf(buf, "0x%x\n", data->type);
+ iounmap(data);
+ return ret;
+}
+
+static ssize_t setup_data_data_read(struct file *fp,
+ struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf,
+ loff_t off, size_t count)
+{
+ int nr, ret = 0;
+ u64 paddr;
+ struct setup_data *data;
+ void *p;
+
+ ret = kobj_to_setup_data_nr(kobj, &nr);
+ if (ret)
+ return ret;
+
+ ret = get_setup_data_paddr(nr, &paddr);
+ if (ret)
+ return ret;
+ data = ioremap_cache(paddr, sizeof(*data));
+ if (!data)
+ return -ENOMEM;
+
+ if (off > data->len) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (count > data->len - off)
+ count = data->len - off;
+
+ if (!count)
+ goto out;
+
+ ret = count;
+ p = ioremap_cache(paddr + sizeof(*data), data->len);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(buf, p + off, count);
+ iounmap(p);
+out:
+ iounmap(data);
+ return ret;
+}
+
+static struct kobj_attribute type_attr = __ATTR_RO(type);
+
+static struct bin_attribute data_attr = {
+ .attr = {
+ .name = "data",
+ .mode = S_IRUGO,
+ },
+ .read = setup_data_data_read,
+};
+
+static struct attribute *setup_data_type_attrs[] = {
+ &type_attr.attr,
+ NULL,
+};
+
+static struct bin_attribute *setup_data_data_attrs[] = {
+ &data_attr,
+ NULL,
+};
+
+static struct attribute_group setup_data_attr_group = {
+ .attrs = setup_data_type_attrs,
+ .bin_attrs = setup_data_data_attrs,
+};
+
+static int __init create_setup_data_node(struct kobject *parent,
+ struct kobject **kobjp, int nr)
+{
+ int ret = 0;
+ size_t size;
+ struct kobject *kobj;
+ char name[16]; /* should be enough for setup_data nodes numbers */
+ snprintf(name, 16, "%d", nr);
+
+ kobj = kobject_create_and_add(name, parent);
+ if (!kobj)
+ return -ENOMEM;
+
+ ret = get_setup_data_size(nr, &size);
+ if (ret)
+ goto out_kobj;
+
+ data_attr.size = size;
+ ret = sysfs_create_group(kobj, &setup_data_attr_group);
+ if (ret)
+ goto out_kobj;
+ *kobjp = kobj;
+
+ return 0;
+out_kobj:
+ kobject_put(kobj);
+ return ret;
+}
+
+static void __init cleanup_setup_data_node(struct kobject *kobj)
+{
+ sysfs_remove_group(kobj, &setup_data_attr_group);
+ kobject_put(kobj);
+}
+
+static int __init get_setup_data_total_num(u64 pa_data, int *nr)
+{
+ int ret = 0;
+ struct setup_data *data;
+
+ *nr = 0;
+ while (pa_data) {
+ *nr += 1;
+ data = ioremap_cache(pa_data, sizeof(*data));
+ if (!data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pa_data = data->next;
+ iounmap(data);
+ }
+
+out:
+ return ret;
+}
+
+static int __init create_setup_data_nodes(struct kobject *parent)
+{
+ struct kobject *setup_data_kobj, **kobjp;
+ u64 pa_data;
+ int i, j, nr, ret = 0;
+
+ pa_data = boot_params.hdr.setup_data;
+ if (!pa_data)
+ return 0;
+
+ setup_data_kobj = kobject_create_and_add("setup_data", parent);
+ if (!setup_data_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = get_setup_data_total_num(pa_data, &nr);
+ if (ret)
+ goto out_setup_data_kobj;
+
+ kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL);
+ if (!kobjp) {
+ ret = -ENOMEM;
+ goto out_setup_data_kobj;
+ }
+
+ for (i = 0; i < nr; i++) {
+ ret = create_setup_data_node(setup_data_kobj, kobjp + i, i);
+ if (ret)
+ goto out_clean_nodes;
+ }
+
+ kfree(kobjp);
+ return 0;
+
+out_clean_nodes:
+ for (j = i - 1; j > 0; j--)
+ cleanup_setup_data_node(*(kobjp + j));
+ kfree(kobjp);
+out_setup_data_kobj:
+ kobject_put(setup_data_kobj);
+out:
+ return ret;
+}
+
+static int __init boot_params_ksysfs_init(void)
+{
+ int ret;
+ struct kobject *boot_params_kobj;
+
+ boot_params_kobj = kobject_create_and_add("boot_params",
+ kernel_kobj);
+ if (!boot_params_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group);
+ if (ret)
+ goto out_boot_params_kobj;
+
+ ret = create_setup_data_nodes(boot_params_kobj);
+ if (ret)
+ goto out_create_group;
+
+ return 0;
+out_create_group:
+ sysfs_remove_group(boot_params_kobj, &boot_params_attr_group);
+out_boot_params_kobj:
+ kobject_put(boot_params_kobj);
+out:
+ return ret;
+}
+
+arch_initcall(boot_params_ksysfs_init);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f0c6fd6f176..3dd8e2c4d74 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -20,6 +20,7 @@
* Authors: Anthony Liguori <aliguori@us.ibm.com>
*/
+#include <linux/context_tracking.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
@@ -33,11 +34,17 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
+#include <linux/debugfs.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
+#include <asm/idle.h>
+#include <asm/apic.h>
+#include <asm/apicdef.h>
+#include <asm/hypervisor.h>
+#include <asm/kvm_guest.h>
static int kvmapf = 1;
@@ -58,6 +65,15 @@ static int parse_no_stealacc(char *arg)
early_param("no-steal-acc", parse_no_stealacc);
+static int kvmclock_vsyscall = 1;
+static int parse_no_kvmclock_vsyscall(char *arg)
+{
+ kvmclock_vsyscall = 0;
+ return 0;
+}
+
+early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
+
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
static int has_steal_clock = 0;
@@ -78,7 +94,6 @@ struct kvm_task_sleep_node {
u32 token;
int cpu;
bool halted;
- struct mm_struct *mm;
};
static struct kvm_task_sleep_head {
@@ -107,11 +122,8 @@ void kvm_async_pf_task_wait(u32 token)
struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
struct kvm_task_sleep_node n, *e;
DEFINE_WAIT(wait);
- int cpu, idle;
- cpu = get_cpu();
- idle = idle_cpu(cpu);
- put_cpu();
+ rcu_irq_enter();
spin_lock(&b->lock);
e = _find_apf_task(b, token);
@@ -120,14 +132,14 @@ void kvm_async_pf_task_wait(u32 token)
hlist_del(&e->link);
kfree(e);
spin_unlock(&b->lock);
+
+ rcu_irq_exit();
return;
}
n.token = token;
n.cpu = smp_processor_id();
- n.mm = current->active_mm;
- n.halted = idle || preempt_count() > 1;
- atomic_inc(&n.mm->mm_count);
+ n.halted = is_idle_task(current) || preempt_count() > 1;
init_waitqueue_head(&n.wq);
hlist_add_head(&n.link, &b->list);
spin_unlock(&b->lock);
@@ -146,13 +158,16 @@ void kvm_async_pf_task_wait(u32 token)
/*
* We cannot reschedule. So halt.
*/
+ rcu_irq_exit();
native_safe_halt();
+ rcu_irq_enter();
local_irq_disable();
}
}
if (!n.halted)
finish_wait(&n.wq, &wait);
+ rcu_irq_exit();
return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
@@ -160,9 +175,6 @@ EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
hlist_del_init(&n->link);
- if (!n->mm)
- return;
- mmdrop(n->mm);
if (n->halted)
smp_send_reschedule(n->cpu);
else if (waitqueue_active(&n->wq))
@@ -206,7 +218,7 @@ again:
* async PF was not yet handled.
* Add dummy entry for the token.
*/
- n = kmalloc(sizeof(*n), GFP_ATOMIC);
+ n = kzalloc(sizeof(*n), GFP_ATOMIC);
if (!n) {
/*
* Allocation failed! Busy wait while other cpu
@@ -218,7 +230,6 @@ again:
}
n->token = token;
n->cpu = smp_processor_id();
- n->mm = NULL;
init_waitqueue_head(&n->wq);
hlist_add_head(&n->link, &b->list);
} else
@@ -240,23 +251,33 @@ u32 kvm_read_and_reset_pf_reason(void)
return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
+NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
-dotraplinkage void __kprobes
+dotraplinkage void
do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
{
+ enum ctx_state prev_state;
+
switch (kvm_read_and_reset_pf_reason()) {
default:
- do_page_fault(regs, error_code);
+ trace_do_page_fault(regs, error_code);
break;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
+ prev_state = exception_enter();
+ exit_idle();
kvm_async_pf_task_wait((u32)read_cr2());
+ exception_exit(prev_state);
break;
case KVM_PV_REASON_PAGE_READY:
+ rcu_irq_enter();
+ exit_idle();
kvm_async_pf_task_wake((u32)read_cr2());
+ rcu_irq_exit();
break;
}
}
+NOKPROBE_SYMBOL(do_async_page_fault);
static void __init paravirt_ops_setup(void)
{
@@ -281,18 +302,34 @@ static void kvm_register_steal_time(void)
memset(st, 0, sizeof(*st));
- wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
- printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
- cpu, __pa(st));
+ wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
+ pr_info("kvm-stealtime: cpu %d, msr %llx\n",
+ cpu, (unsigned long long) slow_virt_to_phys(st));
+}
+
+static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
+
+static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
+{
+ /**
+ * This relies on __test_and_clear_bit to modify the memory
+ * in a way that is atomic with respect to the local CPU.
+ * The hypervisor only accesses this memory from the local CPU so
+ * there's no need for lock or memory barriers.
+ * An optimization barrier is implied in apic write.
+ */
+ if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi)))
+ return;
+ apic_write(APIC_EOI, APIC_EOI_ACK);
}
-void __cpuinit kvm_guest_cpu_init(void)
+void kvm_guest_cpu_init(void)
{
if (!kvm_para_available())
return;
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
- u64 pa = __pa(&__get_cpu_var(apf_reason));
+ u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason));
#ifdef CONFIG_PREEMPT
pa |= KVM_ASYNC_PF_SEND_ALWAYS;
@@ -303,11 +340,21 @@ void __cpuinit kvm_guest_cpu_init(void)
smp_processor_id());
}
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
+ unsigned long pa;
+ /* Size alignment is implied but just to make it explicit. */
+ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
+ __get_cpu_var(kvm_apic_eoi) = 0;
+ pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi))
+ | KVM_MSR_ENABLED;
+ wrmsrl(MSR_KVM_PV_EOI_EN, pa);
+ }
+
if (has_steal_clock)
kvm_register_steal_time();
}
-static void kvm_pv_disable_apf(void *unused)
+static void kvm_pv_disable_apf(void)
{
if (!__get_cpu_var(apf_reason).enabled)
return;
@@ -319,11 +366,24 @@ static void kvm_pv_disable_apf(void *unused)
smp_processor_id());
}
+static void kvm_pv_guest_cpu_reboot(void *unused)
+{
+ /*
+ * We disable PV EOI before we load a new kernel by kexec,
+ * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
+ * New kernel can re-enable when it boots.
+ */
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+ kvm_pv_disable_apf();
+ kvm_disable_steal_time();
+}
+
static int kvm_pv_reboot_notify(struct notifier_block *nb,
unsigned long code, void *unused)
{
if (code == SYS_RESTART)
- on_each_cpu(kvm_pv_disable_apf, NULL, 1);
+ on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
return NOTIFY_DONE;
}
@@ -359,14 +419,12 @@ void kvm_disable_steal_time(void)
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
-#ifdef CONFIG_KVM_CLOCK
- WARN_ON(kvm_register_clock("primary cpu clock"));
-#endif
kvm_guest_cpu_init();
native_smp_prepare_boot_cpu();
+ kvm_spinlock_init();
}
-static void __cpuinit kvm_guest_cpu_online(void *dummy)
+static void kvm_guest_cpu_online(void *dummy)
{
kvm_guest_cpu_init();
}
@@ -374,12 +432,14 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
static void kvm_guest_cpu_offline(void *dummy)
{
kvm_disable_steal_time();
- kvm_pv_disable_apf(NULL);
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+ kvm_pv_disable_apf();
apf_task_wake_all();
}
-static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int kvm_cpu_notify(struct notifier_block *self, unsigned long action,
+ void *hcpu)
{
int cpu = (unsigned long)hcpu;
switch (action) {
@@ -398,14 +458,14 @@ static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
+static struct notifier_block kvm_cpu_notifier = {
.notifier_call = kvm_cpu_notify,
};
#endif
static void __init kvm_apf_trap_init(void)
{
- set_intr_gate(14, &async_page_fault);
+ set_intr_gate(14, async_page_fault);
}
void __init kvm_guest_init(void)
@@ -427,6 +487,12 @@ void __init kvm_guest_init(void)
pv_time_ops.steal_clock = kvm_steal_clock;
}
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ apic_set_eoi_write(kvm_guest_apic_eoi_write);
+
+ if (kvmclock_vsyscall)
+ kvm_setup_vsyscall_timeinfo();
+
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
register_cpu_notifier(&kvm_cpu_notifier);
@@ -435,14 +501,329 @@ void __init kvm_guest_init(void)
#endif
}
+static noinline uint32_t __kvm_cpuid_base(void)
+{
+ if (boot_cpu_data.cpuid_level < 0)
+ return 0; /* So we don't blow up on old processors */
+
+ if (cpu_has_hypervisor)
+ return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);
+
+ return 0;
+}
+
+static inline uint32_t kvm_cpuid_base(void)
+{
+ static int kvm_cpuid_base = -1;
+
+ if (kvm_cpuid_base == -1)
+ kvm_cpuid_base = __kvm_cpuid_base();
+
+ return kvm_cpuid_base;
+}
+
+bool kvm_para_available(void)
+{
+ return kvm_cpuid_base() != 0;
+}
+EXPORT_SYMBOL_GPL(kvm_para_available);
+
+unsigned int kvm_arch_para_features(void)
+{
+ return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
+}
+
+static uint32_t __init kvm_detect(void)
+{
+ return kvm_cpuid_base();
+}
+
+const struct hypervisor_x86 x86_hyper_kvm __refconst = {
+ .name = "KVM",
+ .detect = kvm_detect,
+ .x2apic_available = kvm_para_available,
+};
+EXPORT_SYMBOL_GPL(x86_hyper_kvm);
+
static __init int activate_jump_labels(void)
{
if (has_steal_clock) {
- jump_label_inc(&paravirt_steal_enabled);
+ static_key_slow_inc(&paravirt_steal_enabled);
if (steal_acc)
- jump_label_inc(&paravirt_steal_rq_enabled);
+ static_key_slow_inc(&paravirt_steal_rq_enabled);
}
return 0;
}
arch_initcall(activate_jump_labels);
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+
+/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
+static void kvm_kick_cpu(int cpu)
+{
+ int apicid;
+ unsigned long flags = 0;
+
+ apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
+}
+
+enum kvm_contention_stat {
+ TAKEN_SLOW,
+ TAKEN_SLOW_PICKUP,
+ RELEASED_SLOW,
+ RELEASED_SLOW_KICKED,
+ NR_CONTENTION_STATS
+};
+
+#ifdef CONFIG_KVM_DEBUG_FS
+#define HISTO_BUCKETS 30
+
+static struct kvm_spinlock_stats
+{
+ u32 contention_stats[NR_CONTENTION_STATS];
+ u32 histo_spin_blocked[HISTO_BUCKETS+1];
+ u64 time_blocked;
+} spinlock_stats;
+
+static u8 zero_stats;
+
+static inline void check_zero(void)
+{
+ u8 ret;
+ u8 old;
+
+ old = ACCESS_ONCE(zero_stats);
+ if (unlikely(old)) {
+ ret = cmpxchg(&zero_stats, old, 0);
+ /* This ensures only one fellow resets the stat */
+ if (ret == old)
+ memset(&spinlock_stats, 0, sizeof(spinlock_stats));
+ }
+}
+
+static inline void add_stats(enum kvm_contention_stat var, u32 val)
+{
+ check_zero();
+ spinlock_stats.contention_stats[var] += val;
+}
+
+
+static inline u64 spin_time_start(void)
+{
+ return sched_clock();
+}
+
+static void __spin_time_accum(u64 delta, u32 *array)
+{
+ unsigned index;
+
+ index = ilog2(delta);
+ check_zero();
+
+ if (index < HISTO_BUCKETS)
+ array[index]++;
+ else
+ array[HISTO_BUCKETS]++;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+ u32 delta;
+
+ delta = sched_clock() - start;
+ __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
+ spinlock_stats.time_blocked += delta;
+}
+
+static struct dentry *d_spin_debug;
+static struct dentry *d_kvm_debug;
+
+struct dentry *kvm_init_debugfs(void)
+{
+ d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
+ if (!d_kvm_debug)
+ printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n");
+
+ return d_kvm_debug;
+}
+
+static int __init kvm_spinlock_debugfs(void)
+{
+ struct dentry *d_kvm;
+
+ d_kvm = kvm_init_debugfs();
+ if (d_kvm == NULL)
+ return -ENOMEM;
+
+ d_spin_debug = debugfs_create_dir("spinlocks", d_kvm);
+
+ debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
+
+ debugfs_create_u32("taken_slow", 0444, d_spin_debug,
+ &spinlock_stats.contention_stats[TAKEN_SLOW]);
+ debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
+ &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
+
+ debugfs_create_u32("released_slow", 0444, d_spin_debug,
+ &spinlock_stats.contention_stats[RELEASED_SLOW]);
+ debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
+ &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
+
+ debugfs_create_u64("time_blocked", 0444, d_spin_debug,
+ &spinlock_stats.time_blocked);
+
+ debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
+ spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
+
+ return 0;
+}
+fs_initcall(kvm_spinlock_debugfs);
+#else /* !CONFIG_KVM_DEBUG_FS */
+static inline void add_stats(enum kvm_contention_stat var, u32 val)
+{
+}
+
+static inline u64 spin_time_start(void)
+{
+ return 0;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+}
+#endif /* CONFIG_KVM_DEBUG_FS */
+
+struct kvm_lock_waiting {
+ struct arch_spinlock *lock;
+ __ticket_t want;
+};
+
+/* cpus 'waiting' on a spinlock to become available */
+static cpumask_t waiting_cpus;
+
+/* Track spinlock on which a cpu is waiting */
+static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting);
+
+__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
+{
+ struct kvm_lock_waiting *w;
+ int cpu;
+ u64 start;
+ unsigned long flags;
+
+ if (in_nmi())
+ return;
+
+ w = &__get_cpu_var(klock_waiting);
+ cpu = smp_processor_id();
+ start = spin_time_start();
+
+ /*
+ * Make sure an interrupt handler can't upset things in a
+ * partially setup state.
+ */
+ local_irq_save(flags);
+
+ /*
+ * The ordering protocol on this is that the "lock" pointer
+ * may only be set non-NULL if the "want" ticket is correct.
+ * If we're updating "want", we must first clear "lock".
+ */
+ w->lock = NULL;
+ smp_wmb();
+ w->want = want;
+ smp_wmb();
+ w->lock = lock;
+
+ add_stats(TAKEN_SLOW, 1);
+
+ /*
+ * This uses set_bit, which is atomic but we should not rely on its
+ * reordering gurantees. So barrier is needed after this call.
+ */
+ cpumask_set_cpu(cpu, &waiting_cpus);
+
+ barrier();
+
+ /*
+ * Mark entry to slowpath before doing the pickup test to make
+ * sure we don't deadlock with an unlocker.
+ */
+ __ticket_enter_slowpath(lock);
+
+ /*
+ * check again make sure it didn't become free while
+ * we weren't looking.
+ */
+ if (ACCESS_ONCE(lock->tickets.head) == want) {
+ add_stats(TAKEN_SLOW_PICKUP, 1);
+ goto out;
+ }
+
+ /*
+ * halt until it's our turn and kicked. Note that we do safe halt
+ * for irq enabled case to avoid hang when lock info is overwritten
+ * in irq spinlock slowpath and no spurious interrupt occur to save us.
+ */
+ if (arch_irqs_disabled_flags(flags))
+ halt();
+ else
+ safe_halt();
+
+out:
+ cpumask_clear_cpu(cpu, &waiting_cpus);
+ w->lock = NULL;
+ local_irq_restore(flags);
+ spin_time_accum_blocked(start);
+}
+PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
+
+/* Kick vcpu waiting on @lock->head to reach value @ticket */
+static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
+{
+ int cpu;
+
+ add_stats(RELEASED_SLOW, 1);
+ for_each_cpu(cpu, &waiting_cpus) {
+ const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
+ if (ACCESS_ONCE(w->lock) == lock &&
+ ACCESS_ONCE(w->want) == ticket) {
+ add_stats(RELEASED_SLOW_KICKED, 1);
+ kvm_kick_cpu(cpu);
+ break;
+ }
+ }
+}
+
+/*
+ * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
+ */
+void __init kvm_spinlock_init(void)
+{
+ if (!kvm_para_available())
+ return;
+ /* Does host kernel support KVM_FEATURE_PV_UNHALT? */
+ if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
+ return;
+
+ pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
+ pv_lock_ops.unlock_kick = kvm_unlock_kick;
+}
+
+static __init int kvm_spinlock_init_jump(void)
+{
+ if (!kvm_para_available())
+ return 0;
+ if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
+ return 0;
+
+ static_key_slow_inc(&paravirt_ticketlocks_enabled);
+ printk(KERN_INFO "KVM setup paravirtual spinlock\n");
+
+ return 0;
+}
+early_initcall(kvm_spinlock_init_jump);
+
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 44842d756b2..d9156ceecdf 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -22,6 +22,8 @@
#include <asm/msr.h>
#include <asm/apic.h>
#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/memblock.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>
@@ -38,7 +40,7 @@ static int parse_no_kvmclock(char *arg)
early_param("no-kvmclock", parse_no_kvmclock);
/* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
+static struct pvclock_vsyscall_time_info *hv_clock;
static struct pvclock_wall_clock wall_clock;
/*
@@ -46,25 +48,27 @@ static struct pvclock_wall_clock wall_clock;
* have elapsed since the hypervisor wrote the data. So we try to account for
* that with system time
*/
-static unsigned long kvm_get_wallclock(void)
+static void kvm_get_wallclock(struct timespec *now)
{
struct pvclock_vcpu_time_info *vcpu_time;
- struct timespec ts;
int low, high;
+ int cpu;
low = (int)__pa_symbol(&wall_clock);
high = ((u64)__pa_symbol(&wall_clock) >> 32);
native_write_msr(msr_kvm_wall_clock, low, high);
- vcpu_time = &get_cpu_var(hv_clock);
- pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
- put_cpu_var(hv_clock);
+ preempt_disable();
+ cpu = smp_processor_id();
- return ts.tv_sec;
+ vcpu_time = &hv_clock[cpu].pvti;
+ pvclock_read_wallclock(&wall_clock, vcpu_time, now);
+
+ preempt_enable();
}
-static int kvm_set_wallclock(unsigned long now)
+static int kvm_set_wallclock(const struct timespec *now)
{
return -1;
}
@@ -73,9 +77,11 @@ static cycle_t kvm_clock_read(void)
{
struct pvclock_vcpu_time_info *src;
cycle_t ret;
+ int cpu;
preempt_disable_notrace();
- src = &__get_cpu_var(hv_clock);
+ cpu = smp_processor_id();
+ src = &hv_clock[cpu].pvti;
ret = pvclock_clocksource_read(src);
preempt_enable_notrace();
return ret;
@@ -98,8 +104,15 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
static unsigned long kvm_get_tsc_khz(void)
{
struct pvclock_vcpu_time_info *src;
- src = &per_cpu(hv_clock, 0);
- return pvclock_tsc_khz(src);
+ int cpu;
+ unsigned long tsc_khz;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+ src = &hv_clock[cpu].pvti;
+ tsc_khz = pvclock_tsc_khz(src);
+ preempt_enable();
+ return tsc_khz;
}
static void kvm_get_preset_lpj(void)
@@ -114,6 +127,25 @@ static void kvm_get_preset_lpj(void)
preset_lpj = lpj;
}
+bool kvm_check_and_clear_guest_paused(void)
+{
+ bool ret = false;
+ struct pvclock_vcpu_time_info *src;
+ int cpu = smp_processor_id();
+
+ if (!hv_clock)
+ return ret;
+
+ src = &hv_clock[cpu].pvti;
+ if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
+ src->flags &= ~PVCLOCK_GUEST_STOPPED;
+ pvclock_touch_watchdogs();
+ ret = true;
+ }
+
+ return ret;
+}
+
static struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_get_cycles,
@@ -126,9 +158,14 @@ int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
int low, high, ret;
+ struct pvclock_vcpu_time_info *src;
+
+ if (!hv_clock)
+ return 0;
- low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
- high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
+ src = &hv_clock[cpu].pvti;
+ low = (int)slow_virt_to_phys(src) | 1;
+ high = ((u64)slow_virt_to_phys(src) >> 32);
ret = native_write_msr_safe(msr_kvm_system_time, low, high);
printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
cpu, high, low, txt);
@@ -136,16 +173,23 @@ int kvm_register_clock(char *txt)
return ret;
}
+static void kvm_save_sched_clock_state(void)
+{
+}
+
+static void kvm_restore_sched_clock_state(void)
+{
+ kvm_register_clock("primary cpu clock, resume");
+}
+
#ifdef CONFIG_X86_LOCAL_APIC
-static void __cpuinit kvm_setup_secondary_clock(void)
+static void kvm_setup_secondary_clock(void)
{
/*
* Now that the first cpu already had this clocksource initialized,
* we shouldn't fail.
*/
WARN_ON(kvm_register_clock("secondary cpu clock"));
- /* ok, done with our trickery, call native */
- setup_secondary_APIC_clock();
}
#endif
@@ -175,6 +219,11 @@ static void kvm_shutdown(void)
void __init kvmclock_init(void)
{
+ unsigned long mem;
+ int size;
+
+ size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
+
if (!kvm_para_available())
return;
@@ -187,16 +236,27 @@ void __init kvmclock_init(void)
printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
msr_kvm_system_time, msr_kvm_wall_clock);
- if (kvm_register_clock("boot clock"))
+ mem = memblock_alloc(size, PAGE_SIZE);
+ if (!mem)
return;
+ hv_clock = __va(mem);
+ memset(hv_clock, 0, size);
+
+ if (kvm_register_clock("primary cpu clock")) {
+ hv_clock = NULL;
+ memblock_free(mem, size);
+ return;
+ }
pv_time_ops.sched_clock = kvm_clock_read;
x86_platform.calibrate_tsc = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
- x86_cpuinit.setup_percpu_clockev =
+ x86_cpuinit.early_percpu_clock_init =
kvm_setup_secondary_clock;
#endif
+ x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
+ x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown;
@@ -209,3 +269,40 @@ void __init kvmclock_init(void)
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
}
+
+int __init kvm_setup_vsyscall_timeinfo(void)
+{
+#ifdef CONFIG_X86_64
+ int cpu;
+ int ret;
+ u8 flags;
+ struct pvclock_vcpu_time_info *vcpu_time;
+ unsigned int size;
+
+ if (!hv_clock)
+ return 0;
+
+ size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
+
+ preempt_disable();
+ cpu = smp_processor_id();
+
+ vcpu_time = &hv_clock[cpu].pvti;
+ flags = pvclock_read_flags(vcpu_time);
+
+ if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
+ preempt_enable();
+ return 1;
+ }
+
+ if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
+ preempt_enable();
+ return ret;
+ }
+
+ preempt_enable();
+
+ kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
+#endif
+ return 0;
+}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ea697263b37..c37886d759c 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -15,7 +15,6 @@
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
-#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
@@ -230,6 +229,11 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
}
}
+ if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
fill_ldt(&ldt, &ldt_info);
if (oldmode)
ldt.avl = 0;
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index a3fa43ba5d3..1667b1de8d5 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -9,7 +9,6 @@
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/delay.h>
-#include <linux/init.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/suspend.h>
@@ -23,7 +22,6 @@
#include <asm/apic.h>
#include <asm/cpufeature.h>
#include <asm/desc.h>
-#include <asm/system.h>
#include <asm/cacheflush.h>
#include <asm/debugreg.h>
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index b3ea9db39db..679cef0791c 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -16,125 +16,12 @@
#include <linux/io.h>
#include <linux/suspend.h>
+#include <asm/init.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/debugreg.h>
-static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
- unsigned long addr)
-{
- pud_t *pud;
- pmd_t *pmd;
- struct page *page;
- int result = -ENOMEM;
-
- addr &= PMD_MASK;
- pgd += pgd_index(addr);
- if (!pgd_present(*pgd)) {
- page = kimage_alloc_control_pages(image, 0);
- if (!page)
- goto out;
- pud = (pud_t *)page_address(page);
- clear_page(pud);
- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
- }
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud)) {
- page = kimage_alloc_control_pages(image, 0);
- if (!page)
- goto out;
- pmd = (pmd_t *)page_address(page);
- clear_page(pmd);
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
- }
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
- result = 0;
-out:
- return result;
-}
-
-static void init_level2_page(pmd_t *level2p, unsigned long addr)
-{
- unsigned long end_addr;
-
- addr &= PAGE_MASK;
- end_addr = addr + PUD_SIZE;
- while (addr < end_addr) {
- set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
- addr += PMD_SIZE;
- }
-}
-
-static int init_level3_page(struct kimage *image, pud_t *level3p,
- unsigned long addr, unsigned long last_addr)
-{
- unsigned long end_addr;
- int result;
-
- result = 0;
- addr &= PAGE_MASK;
- end_addr = addr + PGDIR_SIZE;
- while ((addr < last_addr) && (addr < end_addr)) {
- struct page *page;
- pmd_t *level2p;
-
- page = kimage_alloc_control_pages(image, 0);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- level2p = (pmd_t *)page_address(page);
- init_level2_page(level2p, addr);
- set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
- addr += PUD_SIZE;
- }
- /* clear the unused entries */
- while (addr < end_addr) {
- pud_clear(level3p++);
- addr += PUD_SIZE;
- }
-out:
- return result;
-}
-
-
-static int init_level4_page(struct kimage *image, pgd_t *level4p,
- unsigned long addr, unsigned long last_addr)
-{
- unsigned long end_addr;
- int result;
-
- result = 0;
- addr &= PAGE_MASK;
- end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
- while ((addr < last_addr) && (addr < end_addr)) {
- struct page *page;
- pud_t *level3p;
-
- page = kimage_alloc_control_pages(image, 0);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- level3p = (pud_t *)page_address(page);
- result = init_level3_page(image, level3p, addr, last_addr);
- if (result)
- goto out;
- set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
- addr += PGDIR_SIZE;
- }
- /* clear the unused entries */
- while (addr < end_addr) {
- pgd_clear(level4p++);
- addr += PGDIR_SIZE;
- }
-out:
- return result;
-}
-
static void free_transition_pgtable(struct kimage *image)
{
free_page((unsigned long)image->arch.pud);
@@ -184,22 +71,62 @@ err:
return result;
}
+static void *alloc_pgt_page(void *data)
+{
+ struct kimage *image = (struct kimage *)data;
+ struct page *page;
+ void *p = NULL;
+
+ page = kimage_alloc_control_pages(image, 0);
+ if (page) {
+ p = page_address(page);
+ clear_page(p);
+ }
+
+ return p;
+}
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .context = image,
+ .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
+ };
+ unsigned long mstart, mend;
pgd_t *level4p;
int result;
+ int i;
+
level4p = (pgd_t *)__va(start_pgtable);
- result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
- if (result)
- return result;
+ clear_page(level4p);
+ for (i = 0; i < nr_pfn_mapped; i++) {
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+
+ result = kernel_ident_mapping_init(&info,
+ level4p, mstart, mend);
+ if (result)
+ return result;
+ }
+
/*
- * image->start may be outside 0 ~ max_pfn, for example when
- * jump back to original kernel from kexeced kernel
+ * segments's mem ranges could be outside 0 ~ max_pfn,
+ * for example when jump back to original kernel from kexeced kernel.
+ * or first kernel is booted with user mem map, and second kernel
+ * could be loaded out of that range.
*/
- result = init_one_level2_page(image, level4p, image->start);
- if (result)
- return result;
+ for (i = 0; i < image->nr_segments; i++) {
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+
+ result = kernel_ident_mapping_init(&info,
+ level4p, mstart, mend);
+
+ if (result)
+ return result;
+ }
+
return init_transition_pgtable(image, level4p);
}
@@ -352,5 +279,7 @@ void arch_crash_save_vmcoreinfo(void)
VMCOREINFO_SYMBOL(node_data);
VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
#endif
+ vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
+ (unsigned long)&_text - __START_KERNEL);
}
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
deleted file mode 100644
index 177183cbb6a..00000000000
--- a/arch/x86/kernel/mca_32.c
+++ /dev/null
@@ -1,477 +0,0 @@
-/*
- * Written by Martin Kolinek, February 1996
- *
- * Changes:
- *
- * Chris Beauregard July 28th, 1996
- * - Fixed up integrated SCSI detection
- *
- * Chris Beauregard August 3rd, 1996
- * - Made mca_info local
- * - Made integrated registers accessible through standard function calls
- * - Added name field
- * - More sanity checking
- *
- * Chris Beauregard August 9th, 1996
- * - Rewrote /proc/mca
- *
- * Chris Beauregard January 7th, 1997
- * - Added basic NMI-processing
- * - Added more information to mca_info structure
- *
- * David Weinehall October 12th, 1998
- * - Made a lot of cleaning up in the source
- * - Added use of save_flags / restore_flags
- * - Added the 'driver_loaded' flag in MCA_adapter
- * - Added an alternative implemention of ZP Gu's mca_find_unused_adapter
- *
- * David Weinehall March 24th, 1999
- * - Fixed the output of 'Driver Installed' in /proc/mca/pos
- * - Made the Integrated Video & SCSI show up even if they have id 0000
- *
- * Alexander Viro November 9th, 1999
- * - Switched to regular procfs methods
- *
- * Alfred Arnold & David Weinehall August 23rd, 2000
- * - Added support for Planar POS-registers
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mca.h>
-#include <linux/kprobes.h>
-#include <linux/slab.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <linux/proc_fs.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/ioport.h>
-#include <asm/uaccess.h>
-#include <linux/init.h>
-
-static unsigned char which_scsi;
-
-int MCA_bus;
-EXPORT_SYMBOL(MCA_bus);
-
-/*
- * Motherboard register spinlock. Untested on SMP at the moment, but
- * are there any MCA SMP boxes?
- *
- * Yes - Alan
- */
-static DEFINE_SPINLOCK(mca_lock);
-
-/* Build the status info for the adapter */
-
-static void mca_configure_adapter_status(struct mca_device *mca_dev)
-{
- mca_dev->status = MCA_ADAPTER_NONE;
-
- mca_dev->pos_id = mca_dev->pos[0]
- + (mca_dev->pos[1] << 8);
-
- if (!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
-
- /*
- * id = 0x0000 usually indicates hardware failure,
- * however, ZP Gu (zpg@castle.net> reports that his 9556
- * has 0x0000 as id and everything still works. There
- * also seem to be an adapter with id = 0x0000; the
- * NCR Parallel Bus Memory Card. Until this is confirmed,
- * however, this code will stay.
- */
-
- mca_dev->status = MCA_ADAPTER_ERROR;
-
- return;
- } else if (mca_dev->pos_id != 0xffff) {
-
- /*
- * 0xffff usually indicates that there's no adapter,
- * however, some integrated adapters may have 0xffff as
- * their id and still be valid. Examples are on-board
- * VGA of the 55sx, the integrated SCSI of the 56 & 57,
- * and possibly also the 95 ULTIMEDIA.
- */
-
- mca_dev->status = MCA_ADAPTER_NORMAL;
- }
-
- if ((mca_dev->pos_id == 0xffff ||
- mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
- int j;
-
- for (j = 2; j < 8; j++) {
- if (mca_dev->pos[j] != 0xff) {
- mca_dev->status = MCA_ADAPTER_NORMAL;
- break;
- }
- }
- }
-
- if (!(mca_dev->pos[2] & MCA_ENABLED)) {
-
- /* enabled bit is in POS 2 */
-
- mca_dev->status = MCA_ADAPTER_DISABLED;
- }
-} /* mca_configure_adapter_status */
-
-/*--------------------------------------------------------------------*/
-
-static struct resource mca_standard_resources[] = {
- { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
- { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
- { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
- { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
- { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
- { .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
- { .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
-};
-
-#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources)
-
-/*
- * mca_read_and_store_pos - read the POS registers into a memory buffer
- * @pos: a char pointer to 8 bytes, contains the POS register value on
- * successful return
- *
- * Returns 1 if a card actually exists (i.e. the pos isn't
- * all 0xff) or 0 otherwise
- */
-static int mca_read_and_store_pos(unsigned char *pos)
-{
- int j;
- int found = 0;
-
- for (j = 0; j < 8; j++) {
- pos[j] = inb_p(MCA_POS_REG(j));
- if (pos[j] != 0xff) {
- /* 0xff all across means no device. 0x00 means
- * something's broken, but a device is
- * probably there. However, if you get 0x00
- * from a motherboard register it won't matter
- * what we find. For the record, on the
- * 57SLC, the integrated SCSI adapter has
- * 0xffff for the adapter ID, but nonzero for
- * other registers. */
-
- found = 1;
- }
- }
- return found;
-}
-
-static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
-{
- unsigned char byte;
- unsigned long flags;
-
- if (reg < 0 || reg >= 8)
- return 0;
-
- spin_lock_irqsave(&mca_lock, flags);
- if (mca_dev->pos_register) {
- /* Disable adapter setup, enable motherboard setup */
-
- outb_p(0, MCA_ADAPTER_SETUP_REG);
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
-
- byte = inb_p(MCA_POS_REG(reg));
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
- } else {
-
- /* Make sure motherboard setup is off */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /* Read the appropriate register */
-
- outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
- byte = inb_p(MCA_POS_REG(reg));
- outb_p(0, MCA_ADAPTER_SETUP_REG);
- }
- spin_unlock_irqrestore(&mca_lock, flags);
-
- mca_dev->pos[reg] = byte;
-
- return byte;
-}
-
-static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
- unsigned char byte)
-{
- unsigned long flags;
-
- if (reg < 0 || reg >= 8)
- return;
-
- spin_lock_irqsave(&mca_lock, flags);
-
- /* Make sure motherboard setup is off */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /* Read in the appropriate register */
-
- outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
- outb_p(byte, MCA_POS_REG(reg));
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- spin_unlock_irqrestore(&mca_lock, flags);
-
- /* Update the global register list, while we have the byte */
-
- mca_dev->pos[reg] = byte;
-
-}
-
-/* for the primary MCA bus, we have identity transforms */
-static int mca_dummy_transform_irq(struct mca_device *mca_dev, int irq)
-{
- return irq;
-}
-
-static int mca_dummy_transform_ioport(struct mca_device *mca_dev, int port)
-{
- return port;
-}
-
-static void *mca_dummy_transform_memory(struct mca_device *mca_dev, void *mem)
-{
- return mem;
-}
-
-
-static int __init mca_init(void)
-{
- unsigned int i, j;
- struct mca_device *mca_dev;
- unsigned char pos[8];
- short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
- struct mca_bus *bus;
-
- /*
- * WARNING: Be careful when making changes here. Putting an adapter
- * and the motherboard simultaneously into setup mode may result in
- * damage to chips (according to The Indispensable PC Hardware Book
- * by Hans-Peter Messmer). Also, we disable system interrupts (so
- * that we are not disturbed in the middle of this).
- */
-
- /* Make sure the MCA bus is present */
-
- if (mca_system_init()) {
- printk(KERN_ERR "MCA bus system initialisation failed\n");
- return -ENODEV;
- }
-
- if (!MCA_bus)
- return -ENODEV;
-
- printk(KERN_INFO "Micro Channel bus detected.\n");
-
- /* All MCA systems have at least a primary bus */
- bus = mca_attach_bus(MCA_PRIMARY_BUS);
- if (!bus)
- goto out_nomem;
- bus->default_dma_mask = 0xffffffffLL;
- bus->f.mca_write_pos = mca_pc_write_pos;
- bus->f.mca_read_pos = mca_pc_read_pos;
- bus->f.mca_transform_irq = mca_dummy_transform_irq;
- bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
- bus->f.mca_transform_memory = mca_dummy_transform_memory;
-
- /* get the motherboard device */
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
- if (unlikely(!mca_dev))
- goto out_nomem;
-
- /*
- * We do not expect many MCA interrupts during initialization,
- * but let us be safe:
- */
- spin_lock_irq(&mca_lock);
-
- /* Make sure adapter setup is off */
-
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- /* Read motherboard POS registers */
-
- mca_dev->pos_register = 0x7f;
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
- mca_dev->name[0] = 0;
- mca_read_and_store_pos(mca_dev->pos);
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for a motherboard */
- mca_dev->pos_id = MCA_MOTHERBOARD_POS;
- mca_dev->slot = MCA_MOTHERBOARD;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- /* Put motherboard into video setup mode, read integrated video
- * POS registers, and turn motherboard setup off.
- */
-
- mca_dev->pos_register = 0xdf;
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
- mca_dev->name[0] = 0;
- mca_read_and_store_pos(mca_dev->pos);
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for the integrated video */
- mca_dev->pos_id = MCA_INTEGVIDEO_POS;
- mca_dev->slot = MCA_INTEGVIDEO;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-
- /*
- * Put motherboard into scsi setup mode, read integrated scsi
- * POS registers, and turn motherboard setup off.
- *
- * It seems there are two possible SCSI registers. Martin says that
- * for the 56,57, 0xf7 is the one, but fails on the 76.
- * Alfredo (apena@vnet.ibm.com) says
- * 0xfd works on his machine. We'll try both of them. I figure it's
- * a good bet that only one could be valid at a time. This could
- * screw up though if one is used for something else on the other
- * machine.
- */
-
- for (i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
- outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
- if (mca_read_and_store_pos(pos))
- break;
- }
- if (which_scsi) {
- /* found a scsi card */
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- for (j = 0; j < 8; j++)
- mca_dev->pos[j] = pos[j];
-
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for integrated SCSI controller */
- mca_dev->pos_id = MCA_INTEGSCSI_POS;
- mca_dev->slot = MCA_INTEGSCSI;
- mca_dev->pos_register = which_scsi;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
- }
-
- /* Turn off motherboard setup */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /*
- * Now loop over MCA slots: put each adapter into setup mode, and
- * read its POS registers. Then put adapter setup off.
- */
-
- for (i = 0; i < MCA_MAX_SLOT_NR; i++) {
- outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
- if (!mca_read_and_store_pos(pos))
- continue;
-
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- for (j = 0; j < 8; j++)
- mca_dev->pos[j] = pos[j];
-
- mca_dev->driver_loaded = 0;
- mca_dev->slot = i;
- mca_dev->pos_register = 0;
- mca_configure_adapter_status(mca_dev);
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
- }
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- /* Enable interrupts and return memory start */
- spin_unlock_irq(&mca_lock);
-
- for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
- request_resource(&ioport_resource, mca_standard_resources + i);
-
- mca_do_proc_init();
-
- return 0;
-
- out_unlock_nomem:
- spin_unlock_irq(&mca_lock);
- out_nomem:
- printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
- return -ENOMEM;
-}
-
-subsys_initcall(mca_init);
-
-/*--------------------------------------------------------------------*/
-
-static __kprobes void
-mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
-{
- int slot = mca_dev->slot;
-
- if (slot == MCA_INTEGSCSI) {
- printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
- mca_dev->name);
- } else if (slot == MCA_INTEGVIDEO) {
- printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
- mca_dev->name);
- } else if (slot == MCA_MOTHERBOARD) {
- printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
- mca_dev->name);
- }
-
- /* More info available in POS 6 and 7? */
-
- if (check_flag) {
- unsigned char pos6, pos7;
-
- pos6 = mca_device_read_pos(mca_dev, 6);
- pos7 = mca_device_read_pos(mca_dev, 7);
-
- printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
- }
-
-} /* mca_handle_nmi_slot */
-
-/*--------------------------------------------------------------------*/
-
-static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
-{
- struct mca_device *mca_dev = to_mca_device(dev);
- unsigned char pos5;
-
- pos5 = mca_device_read_pos(mca_dev, 5);
-
- if (!(pos5 & 0x80)) {
- /*
- * Bit 7 of POS 5 is reset when this adapter has a hardware
- * error. Bit 7 it reset if there's error information
- * available in POS 6 and 7.
- */
- mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
- return 1;
- }
- return 0;
-}
-
-void __kprobes mca_handle_nmi(void)
-{
- /*
- * First try - scan the various adapters and see if a specific
- * adapter was responsible for the error.
- */
- bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
-}
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
new file mode 100644
index 00000000000..c050a015316
--- /dev/null
+++ b/arch/x86/kernel/mcount_64.S
@@ -0,0 +1,217 @@
+/*
+ * linux/arch/x86_64/mcount_64.S
+ *
+ * Copyright (C) 2014 Steven Rostedt, Red Hat Inc
+ */
+
+#include <linux/linkage.h>
+#include <asm/ptrace.h>
+#include <asm/ftrace.h>
+
+
+ .code64
+ .section .entry.text, "ax"
+
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+#ifdef CC_USING_FENTRY
+# define function_hook __fentry__
+#else
+# define function_hook mcount
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(function_hook)
+ retq
+END(function_hook)
+
+/* skip is set if stack has been adjusted */
+.macro ftrace_caller_setup skip=0
+ MCOUNT_SAVE_FRAME \skip
+
+ /* Load the ftrace_ops into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
+
+ /* Load ip into the first parameter */
+ movq RIP(%rsp), %rdi
+ subq $MCOUNT_INSN_SIZE, %rdi
+ /* Load the parent_ip into the second parameter */
+#ifdef CC_USING_FENTRY
+ movq SS+16(%rsp), %rsi
+#else
+ movq 8(%rbp), %rsi
+#endif
+.endm
+
+ENTRY(ftrace_caller)
+ /* Check if tracing was disabled (quick check) */
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ ftrace_caller_setup
+ /* regs go into 4th parameter (but make it NULL) */
+ movq $0, %rcx
+
+GLOBAL(ftrace_call)
+ call ftrace_stub
+
+ MCOUNT_RESTORE_FRAME
+ftrace_return:
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+GLOBAL(ftrace_graph_call)
+ jmp ftrace_stub
+#endif
+
+GLOBAL(ftrace_stub)
+ retq
+END(ftrace_caller)
+
+ENTRY(ftrace_regs_caller)
+ /* Save the current flags before compare (in SS location)*/
+ pushfq
+
+ /* Check if tracing was disabled (quick check) */
+ cmpl $0, function_trace_stop
+ jne ftrace_restore_flags
+
+ /* skip=8 to skip flags saved in SS */
+ ftrace_caller_setup 8
+
+ /* Save the rest of pt_regs */
+ movq %r15, R15(%rsp)
+ movq %r14, R14(%rsp)
+ movq %r13, R13(%rsp)
+ movq %r12, R12(%rsp)
+ movq %r11, R11(%rsp)
+ movq %r10, R10(%rsp)
+ movq %rbp, RBP(%rsp)
+ movq %rbx, RBX(%rsp)
+ /* Copy saved flags */
+ movq SS(%rsp), %rcx
+ movq %rcx, EFLAGS(%rsp)
+ /* Kernel segments */
+ movq $__KERNEL_DS, %rcx
+ movq %rcx, SS(%rsp)
+ movq $__KERNEL_CS, %rcx
+ movq %rcx, CS(%rsp)
+ /* Stack - skipping return address */
+ leaq SS+16(%rsp), %rcx
+ movq %rcx, RSP(%rsp)
+
+ /* regs go into 4th parameter */
+ leaq (%rsp), %rcx
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ /* Copy flags back to SS, to restore them */
+ movq EFLAGS(%rsp), %rax
+ movq %rax, SS(%rsp)
+
+ /* Handlers can change the RIP */
+ movq RIP(%rsp), %rax
+ movq %rax, SS+8(%rsp)
+
+ /* restore the rest of pt_regs */
+ movq R15(%rsp), %r15
+ movq R14(%rsp), %r14
+ movq R13(%rsp), %r13
+ movq R12(%rsp), %r12
+ movq R10(%rsp), %r10
+ movq RBP(%rsp), %rbp
+ movq RBX(%rsp), %rbx
+
+ /* skip=8 to skip flags saved in SS */
+ MCOUNT_RESTORE_FRAME 8
+
+ /* Restore flags */
+ popfq
+
+ jmp ftrace_return
+ftrace_restore_flags:
+ popfq
+ jmp ftrace_stub
+
+END(ftrace_regs_caller)
+
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(function_hook)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ cmpq $ftrace_stub, ftrace_trace_function
+ jnz trace
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpq $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+
+GLOBAL(ftrace_stub)
+ retq
+
+trace:
+ MCOUNT_SAVE_FRAME
+
+ movq RIP(%rsp), %rdi
+#ifdef CC_USING_FENTRY
+ movq SS+16(%rsp), %rsi
+#else
+ movq 8(%rbp), %rsi
+#endif
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+ call *ftrace_trace_function
+
+ MCOUNT_RESTORE_FRAME
+
+ jmp ftrace_stub
+END(function_hook)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ MCOUNT_SAVE_FRAME
+
+#ifdef CC_USING_FENTRY
+ leaq SS+16(%rsp), %rdi
+ movq $0, %rdx /* No framepointers needed */
+#else
+ leaq 8(%rbp), %rdi
+ movq (%rbp), %rdx
+#endif
+ movq RIP(%rsp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rsi
+
+ call prepare_ftrace_return
+
+ MCOUNT_RESTORE_FRAME
+
+ retq
+END(ftrace_graph_caller)
+
+GLOBAL(return_to_handler)
+ subq $24, %rsp
+
+ /* Save the return values */
+ movq %rax, (%rsp)
+ movq %rdx, 8(%rsp)
+ movq %rbp, %rdi
+
+ call ftrace_return_to_handler
+
+ movq %rax, %rdi
+ movq 8(%rsp), %rdx
+ movq (%rsp), %rax
+ addq $24, %rsp
+ jmp *%rdi
+#endif
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
deleted file mode 100644
index ac0417be913..00000000000
--- a/arch/x86/kernel/microcode_amd.c
+++ /dev/null
@@ -1,394 +0,0 @@
-/*
- * AMD CPU Microcode Update Driver for Linux
- * Copyright (C) 2008-2011 Advanced Micro Devices Inc.
- *
- * Author: Peter Oruba <peter.oruba@amd.com>
- *
- * Based on work by:
- * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *
- * Maintainers:
- * Andreas Herrmann <andreas.herrmann3@amd.com>
- * Borislav Petkov <borislav.petkov@amd.com>
- *
- * This driver allows to upgrade microcode on F10h AMD
- * CPUs and later.
- *
- * Licensed under the terms of the GNU General Public
- * License version 2. See file COPYING for details.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/firmware.h>
-#include <linux/pci_ids.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-
-#include <asm/microcode.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-
-MODULE_DESCRIPTION("AMD Microcode Update Driver");
-MODULE_AUTHOR("Peter Oruba");
-MODULE_LICENSE("GPL v2");
-
-#define UCODE_MAGIC 0x00414d44
-#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
-#define UCODE_UCODE_TYPE 0x00000001
-
-struct equiv_cpu_entry {
- u32 installed_cpu;
- u32 fixed_errata_mask;
- u32 fixed_errata_compare;
- u16 equiv_cpu;
- u16 res;
-} __attribute__((packed));
-
-struct microcode_header_amd {
- u32 data_code;
- u32 patch_id;
- u16 mc_patch_data_id;
- u8 mc_patch_data_len;
- u8 init_flag;
- u32 mc_patch_data_checksum;
- u32 nb_dev_id;
- u32 sb_dev_id;
- u16 processor_rev_id;
- u8 nb_rev_id;
- u8 sb_rev_id;
- u8 bios_api_rev;
- u8 reserved1[3];
- u32 match_reg[8];
-} __attribute__((packed));
-
-struct microcode_amd {
- struct microcode_header_amd hdr;
- unsigned int mpb[0];
-};
-
-#define SECTION_HDR_SIZE 8
-#define CONTAINER_HDR_SZ 12
-
-static struct equiv_cpu_entry *equiv_cpu_table;
-
-/* page-sized ucode patch buffer */
-void *patch;
-
-static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
-{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
- pr_warning("CPU%d: family %d not supported\n", cpu, c->x86);
- return -1;
- }
-
- csig->rev = c->microcode;
- pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
-
- return 0;
-}
-
-static unsigned int verify_ucode_size(int cpu, u32 patch_size,
- unsigned int size)
-{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- u32 max_size;
-
-#define F1XH_MPB_MAX_SIZE 2048
-#define F14H_MPB_MAX_SIZE 1824
-#define F15H_MPB_MAX_SIZE 4096
-
- switch (c->x86) {
- case 0x14:
- max_size = F14H_MPB_MAX_SIZE;
- break;
- case 0x15:
- max_size = F15H_MPB_MAX_SIZE;
- break;
- default:
- max_size = F1XH_MPB_MAX_SIZE;
- break;
- }
-
- if (patch_size > min_t(u32, size, max_size)) {
- pr_err("patch size mismatch\n");
- return 0;
- }
-
- return patch_size;
-}
-
-static u16 find_equiv_id(void)
-{
- unsigned int current_cpu_id, i = 0;
-
- BUG_ON(equiv_cpu_table == NULL);
-
- current_cpu_id = cpuid_eax(0x00000001);
-
- while (equiv_cpu_table[i].installed_cpu != 0) {
- if (current_cpu_id == equiv_cpu_table[i].installed_cpu)
- return equiv_cpu_table[i].equiv_cpu;
-
- i++;
- }
- return 0;
-}
-
-/*
- * we signal a good patch is found by returning its size > 0
- */
-static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
- unsigned int leftover_size, int rev,
- unsigned int *current_size)
-{
- struct microcode_header_amd *mc_hdr;
- unsigned int actual_size;
- u16 equiv_cpu_id;
-
- /* size of the current patch we're staring at */
- *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE;
-
- equiv_cpu_id = find_equiv_id();
- if (!equiv_cpu_id)
- return 0;
-
- /*
- * let's look at the patch header itself now
- */
- mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE);
-
- if (mc_hdr->processor_rev_id != equiv_cpu_id)
- return 0;
-
- /* ucode might be chipset specific -- currently we don't support this */
- if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
- pr_err("CPU%d: chipset specific code not yet supported\n",
- cpu);
- return 0;
- }
-
- if (mc_hdr->patch_id <= rev)
- return 0;
-
- /*
- * now that the header looks sane, verify its size
- */
- actual_size = verify_ucode_size(cpu, *current_size, leftover_size);
- if (!actual_size)
- return 0;
-
- /* clear the patch buffer */
- memset(patch, 0, PAGE_SIZE);
-
- /* all looks ok, get the binary patch */
- get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size);
-
- return actual_size;
-}
-
-static int apply_microcode_amd(int cpu)
-{
- u32 rev, dummy;
- int cpu_num = raw_smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- struct microcode_amd *mc_amd = uci->mc;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- /* We should bind the task to the CPU */
- BUG_ON(cpu_num != cpu);
-
- if (mc_amd == NULL)
- return 0;
-
- wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
- /* get patch id after patching */
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
- /* check current patch id and patch's id for match */
- if (rev != mc_amd->hdr.patch_id) {
- pr_err("CPU%d: update failed for patch_level=0x%08x\n",
- cpu, mc_amd->hdr.patch_id);
- return -1;
- }
-
- pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
- uci->cpu_sig.rev = rev;
- c->microcode = rev;
-
- return 0;
-}
-
-static int install_equiv_cpu_table(const u8 *buf)
-{
- unsigned int *ibuf = (unsigned int *)buf;
- unsigned int type = ibuf[1];
- unsigned int size = ibuf[2];
-
- if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
- pr_err("empty section/"
- "invalid type field in container file section header\n");
- return -EINVAL;
- }
-
- equiv_cpu_table = vmalloc(size);
- if (!equiv_cpu_table) {
- pr_err("failed to allocate equivalent CPU table\n");
- return -ENOMEM;
- }
-
- get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
-
- /* add header length */
- return size + CONTAINER_HDR_SZ;
-}
-
-static void free_equiv_cpu_table(void)
-{
- vfree(equiv_cpu_table);
- equiv_cpu_table = NULL;
-}
-
-static enum ucode_state
-generic_load_microcode(int cpu, const u8 *data, size_t size)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- struct microcode_header_amd *mc_hdr = NULL;
- unsigned int mc_size, leftover, current_size = 0;
- int offset;
- const u8 *ucode_ptr = data;
- void *new_mc = NULL;
- unsigned int new_rev = uci->cpu_sig.rev;
- enum ucode_state state = UCODE_ERROR;
-
- offset = install_equiv_cpu_table(ucode_ptr);
- if (offset < 0) {
- pr_err("failed to create equivalent cpu table\n");
- goto out;
- }
- ucode_ptr += offset;
- leftover = size - offset;
-
- if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) {
- pr_err("invalid type field in container file section header\n");
- goto free_table;
- }
-
- while (leftover) {
- mc_size = get_matching_microcode(cpu, ucode_ptr, leftover,
- new_rev, &current_size);
- if (mc_size) {
- mc_hdr = patch;
- new_mc = patch;
- new_rev = mc_hdr->patch_id;
- goto out_ok;
- }
-
- ucode_ptr += current_size;
- leftover -= current_size;
- }
-
- if (!new_mc) {
- state = UCODE_NFOUND;
- goto free_table;
- }
-
-out_ok:
- uci->mc = new_mc;
- state = UCODE_OK;
- pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
- cpu, uci->cpu_sig.rev, new_rev);
-
-free_table:
- free_equiv_cpu_table();
-
-out:
- return state;
-}
-
-/*
- * AMD microcode firmware naming convention, up to family 15h they are in
- * the legacy file:
- *
- * amd-ucode/microcode_amd.bin
- *
- * This legacy file is always smaller than 2K in size.
- *
- * Starting at family 15h they are in family specific firmware files:
- *
- * amd-ucode/microcode_amd_fam15h.bin
- * amd-ucode/microcode_amd_fam16h.bin
- * ...
- *
- * These might be larger than 2K.
- */
-static enum ucode_state request_microcode_amd(int cpu, struct device *device)
-{
- char fw_name[36] = "amd-ucode/microcode_amd.bin";
- const struct firmware *fw;
- enum ucode_state ret = UCODE_NFOUND;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- if (c->x86 >= 0x15)
- snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
-
- if (request_firmware(&fw, (const char *)fw_name, device)) {
- pr_err("failed to load file %s\n", fw_name);
- goto out;
- }
-
- ret = UCODE_ERROR;
- if (*(u32 *)fw->data != UCODE_MAGIC) {
- pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
- goto fw_release;
- }
-
- ret = generic_load_microcode(cpu, fw->data, fw->size);
-
-fw_release:
- release_firmware(fw);
-
-out:
- return ret;
-}
-
-static enum ucode_state
-request_microcode_user(int cpu, const void __user *buf, size_t size)
-{
- pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
- return UCODE_ERROR;
-}
-
-static void microcode_fini_cpu_amd(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- uci->mc = NULL;
-}
-
-static struct microcode_ops microcode_amd_ops = {
- .request_microcode_user = request_microcode_user,
- .request_microcode_fw = request_microcode_amd,
- .collect_cpu_info = collect_cpu_info_amd,
- .apply_microcode = apply_microcode_amd,
- .microcode_fini_cpu = microcode_fini_cpu_amd,
-};
-
-struct microcode_ops * __init init_amd_microcode(void)
-{
- patch = (void *)get_zeroed_page(GFP_KERNEL);
- if (!patch)
- return NULL;
-
- return &microcode_amd_ops;
-}
-
-void __exit exit_amd_microcode(void)
-{
- free_page((unsigned long)patch);
-}
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index ac861b8348e..f4c886d9165 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -24,14 +24,14 @@ struct pci_hostbridge_probe {
u32 device;
};
-static u64 __cpuinitdata fam10h_pci_mmconf_base;
+static u64 fam10h_pci_mmconf_base;
-static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
+static struct pci_hostbridge_probe pci_probes[] = {
{ 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 },
{ 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
};
-static int __cpuinit cmp_range(const void *x1, const void *x2)
+static int cmp_range(const void *x1, const void *x2)
{
const struct range *r1 = x1;
const struct range *r2 = x2;
@@ -49,7 +49,7 @@ static int __cpuinit cmp_range(const void *x1, const void *x2)
/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */
#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32)
#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40))
-static void __cpuinit get_fam10h_pci_mmconf_base(void)
+static void get_fam10h_pci_mmconf_base(void)
{
int i;
unsigned bus;
@@ -166,7 +166,7 @@ out:
fam10h_pci_mmconf_base = base;
}
-void __cpuinit fam10h_check_enable_mmcfg(void)
+void fam10h_check_enable_mmcfg(void)
{
u64 val;
u32 address;
@@ -230,7 +230,7 @@ static const struct dmi_system_id __initconst mmconf_dmi_table[] = {
{}
};
-/* Called from a __cpuinit function, but only on the BSP. */
+/* Called from a non __init function, but only on the BSP. */
void __ref check_enable_amd_mmconf_dmi(void)
{
dmi_check_system(mmconf_dmi_table);
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 925179f871d..e69f9882bf9 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -15,6 +15,9 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/moduleloader.h>
#include <linux/elf.h>
#include <linux/vmalloc.h>
@@ -25,24 +28,68 @@
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/jump_label.h>
+#include <linux/random.h>
-#include <asm/system.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#if 0
-#define DEBUGP printk
+#define DEBUGP(fmt, ...) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__)
#else
-#define DEBUGP(fmt...)
+#define DEBUGP(fmt, ...) \
+do { \
+ if (0) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+} while (0)
+#endif
+
+#ifdef CONFIG_RANDOMIZE_BASE
+static unsigned long module_load_offset;
+static int randomize_modules = 1;
+
+/* Mutex protects the module_load_offset. */
+static DEFINE_MUTEX(module_kaslr_mutex);
+
+static int __init parse_nokaslr(char *p)
+{
+ randomize_modules = 0;
+ return 0;
+}
+early_param("nokaslr", parse_nokaslr);
+
+static unsigned long int get_module_load_offset(void)
+{
+ if (randomize_modules) {
+ mutex_lock(&module_kaslr_mutex);
+ /*
+ * Calculate the module_load_offset the first time this
+ * code is called. Once calculated it stays the same until
+ * reboot.
+ */
+ if (module_load_offset == 0)
+ module_load_offset =
+ (get_random_int() % 1024 + 1) * PAGE_SIZE;
+ mutex_unlock(&module_kaslr_mutex);
+ }
+ return module_load_offset;
+}
+#else
+static unsigned long int get_module_load_offset(void)
+{
+ return 0;
+}
#endif
void *module_alloc(unsigned long size)
{
if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL;
- return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
- GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
- -1, __builtin_return_address(0));
+ return __vmalloc_node_range(size, 1,
+ MODULES_VADDR + get_module_load_offset(),
+ MODULES_END, GFP_KERNEL | __GFP_HIGHMEM,
+ PAGE_KERNEL_EXEC, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
#ifdef CONFIG_X86_32
@@ -57,8 +104,8 @@ int apply_relocate(Elf32_Shdr *sechdrs,
Elf32_Sym *sym;
uint32_t *location;
- DEBUGP("Applying relocate section %u to %u\n", relsec,
- sechdrs[relsec].sh_info);
+ DEBUGP("Applying relocate section %u to %u\n",
+ relsec, sechdrs[relsec].sh_info);
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
/* This is where to make the change */
location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -74,11 +121,11 @@ int apply_relocate(Elf32_Shdr *sechdrs,
*location += sym->st_value;
break;
case R_386_PC32:
- /* Add the value, subtract its postition */
+ /* Add the value, subtract its position */
*location += sym->st_value - (uint32_t)location;
break;
default:
- printk(KERN_ERR "module %s: Unknown relocation: %u\n",
+ pr_err("%s: Unknown relocation: %u\n",
me->name, ELF32_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
@@ -98,8 +145,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
void *loc;
u64 val;
- DEBUGP("Applying relocate section %u to %u\n", relsec,
- sechdrs[relsec].sh_info);
+ DEBUGP("Applying relocate section %u to %u\n",
+ relsec, sechdrs[relsec].sh_info);
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
/* This is where to make the change */
loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -111,8 +158,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
+ ELF64_R_SYM(rel[i].r_info);
DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info),
- sym->st_value, rel[i].r_addend, (u64)loc);
+ (int)ELF64_R_TYPE(rel[i].r_info),
+ sym->st_value, rel[i].r_addend, (u64)loc);
val = sym->st_value + rel[i].r_addend;
@@ -141,7 +188,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
#endif
break;
default:
- printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",
+ pr_err("%s: Unknown rela relocation: %llu\n",
me->name, ELF64_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
@@ -149,9 +196,9 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
return 0;
overflow:
- printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
+ pr_err("overflow in relocation type %d val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), val);
- printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
+ pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
me->name);
return -ENOEXEC;
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index ca470e4c92d..d2b56489d70 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,7 +27,6 @@
#include <asm/proto.h>
#include <asm/bios_ebda.h>
#include <asm/e820.h>
-#include <asm/trampoline.h>
#include <asm/setup.h>
#include <asm/smp.h>
@@ -97,7 +96,7 @@ static void __init MP_bus_info(struct mpc_bus *m)
set_bit(m->busid, mp_bus_not_pci);
if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
@@ -105,12 +104,10 @@ static void __init MP_bus_info(struct mpc_bus *m)
x86_init.mpparse.mpc_oem_pci_bus(m);
clear_bit(m->busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
- } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
- mp_bus_id_to_type[m->busid] = MP_BUS_MCA;
#endif
} else
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
@@ -368,9 +365,6 @@ static void __init construct_ioapic_table(int mpc_default_type)
case 3:
memcpy(bus.bustype, "EISA ", 6);
break;
- case 4:
- case 7:
- memcpy(bus.bustype, "MCA ", 6);
}
MP_bus_info(&bus);
if (mpc_default_type > 4) {
@@ -573,8 +567,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
struct mpf_intel *mpf;
unsigned long mem;
- apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
- bp, length);
+ apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
+ base, base + length - 1);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
@@ -589,8 +583,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
#endif
mpf_found = mpf;
- printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
- mpf, (u64)virt_to_phys(mpf));
+ printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
+ (unsigned long long) virt_to_phys(mpf),
+ (unsigned long long) virt_to_phys(mpf) +
+ sizeof(*mpf) - 1, mpf);
mem = virt_to_phys(mpf);
memblock_reserve(mem, sizeof(*mpf));
@@ -623,7 +619,7 @@ void __init default_find_smp_config(void)
return;
/*
* If it is an SMP machine we should know now, unless the
- * configuration is in an EISA/MCA bus machine with an
+ * configuration is in an EISA bus machine with an
* extended bios data area.
*
* there is a real-mode segmented pointer pointing to the
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 96356762a51..c9603ac80de 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -40,14 +40,13 @@
#include <asm/processor.h>
#include <asm/msr.h>
-#include <asm/system.h>
static struct class *msr_class;
static loff_t msr_seek(struct file *file, loff_t offset, int orig)
{
loff_t ret;
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = file_inode(file);
mutex_lock(&inode->i_mutex);
switch (orig) {
@@ -72,7 +71,7 @@ static ssize_t msr_read(struct file *file, char __user *buf,
u32 __user *tmp = (u32 __user *) buf;
u32 data[2];
u32 reg = *ppos;
- int cpu = iminor(file->f_path.dentry->d_inode);
+ int cpu = iminor(file_inode(file));
int err = 0;
ssize_t bytes = 0;
@@ -100,7 +99,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
const u32 __user *tmp = (const u32 __user *)buf;
u32 data[2];
u32 reg = *ppos;
- int cpu = iminor(file->f_path.dentry->d_inode);
+ int cpu = iminor(file_inode(file));
int err = 0;
ssize_t bytes = 0;
@@ -126,7 +125,7 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
{
u32 __user *uregs = (u32 __user *)arg;
u32 regs[8];
- int cpu = iminor(file->f_path.dentry->d_inode);
+ int cpu = iminor(file_inode(file));
int err;
switch (ioc) {
@@ -172,10 +171,12 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
static int msr_open(struct inode *inode, struct file *file)
{
- unsigned int cpu;
+ unsigned int cpu = iminor(file_inode(file));
struct cpuinfo_x86 *c;
- cpu = iminor(file->f_path.dentry->d_inode);
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
if (cpu >= nr_cpu_ids || !cpu_online(cpu))
return -ENXIO; /* No such CPU */
@@ -199,7 +200,7 @@ static const struct file_operations msr_fops = {
.compat_ioctl = msr_ioctl,
};
-static int __cpuinit msr_device_create(int cpu)
+static int msr_device_create(int cpu)
{
struct device *dev;
@@ -213,8 +214,8 @@ static void msr_device_destroy(int cpu)
device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
}
-static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int msr_class_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
int err = 0;
@@ -258,12 +259,15 @@ static int __init msr_init(void)
goto out_chrdev;
}
msr_class->devnode = msr_devnode;
+
+ cpu_notifier_register_begin();
for_each_online_cpu(i) {
err = msr_device_create(i);
if (err != 0)
goto out_class;
}
- register_hotcpu_notifier(&msr_class_cpu_notifier);
+ __register_hotcpu_notifier(&msr_class_cpu_notifier);
+ cpu_notifier_register_done();
err = 0;
goto out;
@@ -272,6 +276,7 @@ out_class:
i = 0;
for_each_online_cpu(i)
msr_device_destroy(i);
+ cpu_notifier_register_done();
class_destroy(msr_class);
out_chrdev:
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
@@ -282,11 +287,14 @@ out:
static void __exit msr_exit(void)
{
int cpu = 0;
+
+ cpu_notifier_register_begin();
for_each_online_cpu(cpu)
msr_device_destroy(cpu);
class_destroy(msr_class);
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
- unregister_hotcpu_notifier(&msr_class_cpu_notifier);
+ __unregister_hotcpu_notifier(&msr_class_cpu_notifier);
+ cpu_notifier_register_done();
}
module_init(msr_init);
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 47acaf31916..c3e985d1751 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -14,13 +14,12 @@
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/nmi.h>
+#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/hardirq.h>
#include <linux/slab.h>
#include <linux/export.h>
-#include <linux/mca.h>
-
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
@@ -31,13 +30,8 @@
#include <asm/nmi.h>
#include <asm/x86_init.h>
-#define NMI_MAX_NAMELEN 16
-struct nmiaction {
- struct list_head list;
- nmi_handler_t handler;
- unsigned int flags;
- char *name;
-};
+#define CREATE_TRACE_POINTS
+#include <trace/events/nmi.h>
struct nmi_desc {
spinlock_t lock;
@@ -54,6 +48,14 @@ static struct nmi_desc nmi_desc[NMI_MAX] =
.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
.head = LIST_HEAD_INIT(nmi_desc[1].head),
},
+ {
+ .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[2].head),
+ },
+ {
+ .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[3].head),
+ },
};
@@ -84,7 +86,31 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
#define nmi_to_desc(type) (&nmi_desc[type])
-static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
+
+static int __init nmi_warning_debugfs(void)
+{
+ debugfs_create_u64("nmi_longest_ns", 0644,
+ arch_debugfs_dir, &nmi_longest_ns);
+ return 0;
+}
+fs_initcall(nmi_warning_debugfs);
+
+static void nmi_max_handler(struct irq_work *w)
+{
+ struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
+ int remainder_ns, decimal_msecs;
+ u64 whole_msecs = ACCESS_ONCE(a->max_duration);
+
+ remainder_ns = do_div(whole_msecs, (1000 * 1000));
+ decimal_msecs = remainder_ns / 1000;
+
+ printk_ratelimited(KERN_INFO
+ "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
+ a->handler, whole_msecs, decimal_msecs);
+}
+
+static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *a;
@@ -98,20 +124,40 @@ static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs,
* can be latched at any given time. Walk the whole list
* to handle those situations.
*/
- list_for_each_entry_rcu(a, &desc->head, list)
- handled += a->handler(type, regs);
+ list_for_each_entry_rcu(a, &desc->head, list) {
+ int thishandled;
+ u64 delta;
+
+ delta = sched_clock();
+ thishandled = a->handler(type, regs);
+ handled += thishandled;
+ delta = sched_clock() - delta;
+ trace_nmi_handler(a->handler, (int)delta, thishandled);
+
+ if (delta < nmi_longest_ns || delta < a->max_duration)
+ continue;
+
+ a->max_duration = delta;
+ irq_work_queue(&a->irq_work);
+ }
rcu_read_unlock();
/* return total number of NMI events handled */
return handled;
}
+NOKPROBE_SYMBOL(nmi_handle);
-static int __setup_nmi(unsigned int type, struct nmiaction *action)
+int __register_nmi_handler(unsigned int type, struct nmiaction *action)
{
struct nmi_desc *desc = nmi_to_desc(type);
unsigned long flags;
+ if (!action->handler)
+ return -EINVAL;
+
+ init_irq_work(&action->irq_work, nmi_max_handler);
+
spin_lock_irqsave(&desc->lock, flags);
/*
@@ -120,6 +166,8 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action)
* to manage expectations
*/
WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
+ WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
+ WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
/*
* some handlers need to be executed first otherwise a fake
@@ -133,8 +181,9 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action)
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
+EXPORT_SYMBOL(__register_nmi_handler);
-static struct nmiaction *__free_nmi(unsigned int type, const char *name)
+void unregister_nmi_handler(unsigned int type, const char *name)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *n;
@@ -157,61 +206,16 @@ static struct nmiaction *__free_nmi(unsigned int type, const char *name)
spin_unlock_irqrestore(&desc->lock, flags);
synchronize_rcu();
- return (n);
}
-
-int register_nmi_handler(unsigned int type, nmi_handler_t handler,
- unsigned long nmiflags, const char *devname)
-{
- struct nmiaction *action;
- int retval = -ENOMEM;
-
- if (!handler)
- return -EINVAL;
-
- action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL);
- if (!action)
- goto fail_action;
-
- action->handler = handler;
- action->flags = nmiflags;
- action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL);
- if (!action->name)
- goto fail_action_name;
-
- retval = __setup_nmi(type, action);
-
- if (retval)
- goto fail_setup_nmi;
-
- return retval;
-
-fail_setup_nmi:
- kfree(action->name);
-fail_action_name:
- kfree(action);
-fail_action:
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(register_nmi_handler);
-
-void unregister_nmi_handler(unsigned int type, const char *name)
-{
- struct nmiaction *a;
-
- a = __free_nmi(type, name);
- if (a) {
- kfree(a->name);
- kfree(a);
- }
-}
-
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
-static notrace __kprobes void
+static void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_SERR, regs, false))
+ return;
+
pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
reason, smp_processor_id());
@@ -235,16 +239,21 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
outb(reason, NMI_REASON_PORT);
}
+NOKPROBE_SYMBOL(pci_serr_error);
-static notrace __kprobes void
+static void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_IO_CHECK, regs, false))
+ return;
+
pr_emerg(
"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
reason, smp_processor_id());
- show_registers(regs);
+ show_regs(regs);
if (panic_on_io_nmi)
panic("NMI IOCK error: Not continuing");
@@ -262,8 +271,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
reason &= ~NMI_REASON_CLEAR_IOCHK;
outb(reason, NMI_REASON_PORT);
}
+NOKPROBE_SYMBOL(io_check_error);
-static notrace __kprobes void
+static void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
int handled;
@@ -282,16 +292,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
__this_cpu_add(nmi_stats.unknown, 1);
-#ifdef CONFIG_MCA
- /*
- * Might actually be able to figure out what the guilty party
- * is:
- */
- if (MCA_bus) {
- mca_handle_nmi();
- return;
- }
-#endif
pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
reason, smp_processor_id());
@@ -301,11 +301,12 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
pr_emerg("Dazed and confused, but trying to continue\n");
}
+NOKPROBE_SYMBOL(unknown_nmi_error);
static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
-static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+static void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int handled;
@@ -404,6 +405,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
else
unknown_nmi_error(reason, regs);
}
+NOKPROBE_SYMBOL(default_do_nmi);
/*
* NMIs can hit breakpoints which will cause it to lose its
@@ -412,8 +414,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
#ifdef CONFIG_X86_32
/*
* For i386, NMIs use the same stack as the kernel, and we can
- * add a workaround to the iret problem in C. Simply have 3 states
- * the NMI can be in.
+ * add a workaround to the iret problem in C (preventing nested
+ * NMIs if an NMI takes a trap). Simply have 3 states the NMI
+ * can be in:
*
* 1) not running
* 2) executing
@@ -430,32 +433,50 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
* If an NMI hits a breakpoint that executes an iret, another
* NMI can preempt it. We do not want to allow this new NMI
* to run, but we want to execute it when the first one finishes.
- * We set the state to "latched", and the first NMI will perform
- * an cmpxchg on the state, and if it doesn't successfully
- * reset the state to "not running" it will restart the next
- * NMI.
+ * We set the state to "latched", and the exit of the first NMI will
+ * perform a dec_return, if the result is zero (NOT_RUNNING), then
+ * it will simply exit the NMI handler. If not, the dec_return
+ * would have set the state to NMI_EXECUTING (what we want it to
+ * be when we are running). In this case, we simply jump back
+ * to rerun the NMI handler again, and restart the 'latched' NMI.
+ *
+ * No trap (breakpoint or page fault) should be hit before nmi_restart,
+ * thus there is no race between the first check of state for NOT_RUNNING
+ * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
+ * at this point.
+ *
+ * In case the NMI takes a page fault, we need to save off the CR2
+ * because the NMI could have preempted another page fault and corrupt
+ * the CR2 that is about to be read. As nested NMIs must be restarted
+ * and they can not take breakpoints or page faults, the update of the
+ * CR2 must be done before converting the nmi state back to NOT_RUNNING.
+ * Otherwise, there would be a race of another nested NMI coming in
+ * after setting state to NOT_RUNNING but before updating the nmi_cr2.
*/
enum nmi_states {
- NMI_NOT_RUNNING,
+ NMI_NOT_RUNNING = 0,
NMI_EXECUTING,
NMI_LATCHED,
};
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
+static DEFINE_PER_CPU(unsigned long, nmi_cr2);
#define nmi_nesting_preprocess(regs) \
do { \
- if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \
- __get_cpu_var(nmi_state) = NMI_LATCHED; \
+ if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
+ this_cpu_write(nmi_state, NMI_LATCHED); \
return; \
} \
- nmi_restart: \
- __get_cpu_var(nmi_state) = NMI_EXECUTING; \
- } while (0)
+ this_cpu_write(nmi_state, NMI_EXECUTING); \
+ this_cpu_write(nmi_cr2, read_cr2()); \
+ } while (0); \
+ nmi_restart:
#define nmi_nesting_postprocess() \
do { \
- if (cmpxchg(&__get_cpu_var(nmi_state), \
- NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \
+ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
+ write_cr2(this_cpu_read(nmi_cr2)); \
+ if (this_cpu_dec_return(nmi_state)) \
goto nmi_restart; \
} while (0)
#else /* x86_64 */
@@ -491,18 +512,20 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
*/
if (unlikely(is_debug_stack(regs->sp))) {
debug_stack_set_zero();
- __get_cpu_var(update_debug_stack) = 1;
+ this_cpu_write(update_debug_stack, 1);
}
}
static inline void nmi_nesting_postprocess(void)
{
- if (unlikely(__get_cpu_var(update_debug_stack)))
+ if (unlikely(this_cpu_read(update_debug_stack))) {
debug_stack_reset();
+ this_cpu_write(update_debug_stack, 0);
+ }
}
#endif
-dotraplinkage notrace __kprobes void
+dotraplinkage notrace void
do_nmi(struct pt_regs *regs, long error_code)
{
nmi_nesting_preprocess(regs);
@@ -519,6 +542,7 @@ do_nmi(struct pt_regs *regs, long error_code)
/* On i386, may loop back to preprocess */
nmi_nesting_postprocess();
}
+NOKPROBE_SYMBOL(do_nmi);
void stop_nmi(void)
{
@@ -535,3 +559,4 @@ void local_touch_nmi(void)
{
__this_cpu_write(last_nmi_rip, 0);
}
+EXPORT_SYMBOL_GPL(local_touch_nmi);
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 0d01a8ea4e1..6d9582ec032 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -12,6 +12,8 @@
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
#include <asm/apic.h>
#include <asm/nmi.h>
@@ -20,35 +22,36 @@
#define FAILURE 1
#define TIMEOUT 2
-static int nmi_fail;
+static int __initdata nmi_fail;
/* check to see if NMI IPIs work on this machine */
-static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly;
+static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata;
-static int testcase_total;
-static int testcase_successes;
-static int expected_testcase_failures;
-static int unexpected_testcase_failures;
-static int unexpected_testcase_unknowns;
+static int __initdata testcase_total;
+static int __initdata testcase_successes;
+static int __initdata expected_testcase_failures;
+static int __initdata unexpected_testcase_failures;
+static int __initdata unexpected_testcase_unknowns;
-static int nmi_unk_cb(unsigned int val, struct pt_regs *regs)
+static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
{
unexpected_testcase_unknowns++;
return NMI_HANDLED;
}
-static void init_nmi_testsuite(void)
+static void __init init_nmi_testsuite(void)
{
/* trap all the unknown NMIs we may generate */
- register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
+ register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk",
+ __initdata);
}
-static void cleanup_nmi_testsuite(void)
+static void __init cleanup_nmi_testsuite(void)
{
unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
}
-static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
+static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
{
int cpu = raw_smp_processor_id();
@@ -58,12 +61,12 @@ static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
return NMI_DONE;
}
-static void test_nmi_ipi(struct cpumask *mask)
+static void __init test_nmi_ipi(struct cpumask *mask)
{
unsigned long timeout;
if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
- NMI_FLAG_FIRST, "nmi_selftest")) {
+ NMI_FLAG_FIRST, "nmi_selftest", __initdata)) {
nmi_fail = FAILURE;
return;
}
@@ -86,7 +89,7 @@ static void test_nmi_ipi(struct cpumask *mask)
return;
}
-static void remote_ipi(void)
+static void __init remote_ipi(void)
{
cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
@@ -94,19 +97,19 @@ static void remote_ipi(void)
test_nmi_ipi(to_cpumask(nmi_ipi_mask));
}
-static void local_ipi(void)
+static void __init local_ipi(void)
{
cpumask_clear(to_cpumask(nmi_ipi_mask));
cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
test_nmi_ipi(to_cpumask(nmi_ipi_mask));
}
-static void reset_nmi(void)
+static void __init reset_nmi(void)
{
nmi_fail = 0;
}
-static void dotest(void (*testcase_fn)(void), int expected)
+static void __init dotest(void (*testcase_fn)(void), int expected)
{
testcase_fn();
/*
@@ -116,27 +119,27 @@ static void dotest(void (*testcase_fn)(void), int expected)
unexpected_testcase_failures++;
if (nmi_fail == FAILURE)
- printk("FAILED |");
+ printk(KERN_CONT "FAILED |");
else if (nmi_fail == TIMEOUT)
- printk("TIMEOUT|");
+ printk(KERN_CONT "TIMEOUT|");
else
- printk("ERROR |");
+ printk(KERN_CONT "ERROR |");
dump_stack();
} else {
testcase_successes++;
- printk(" ok |");
+ printk(KERN_CONT " ok |");
}
testcase_total++;
reset_nmi();
}
-static inline void print_testname(const char *testname)
+static inline void __init print_testname(const char *testname)
{
printk("%12s:", testname);
}
-void nmi_selftest(void)
+void __init nmi_selftest(void)
{
init_nmi_testsuite();
@@ -149,10 +152,10 @@ void nmi_selftest(void)
print_testname("remote IPI");
dotest(remote_ipi, SUCCESS);
- printk("\n");
+ printk(KERN_CONT "\n");
print_testname("local IPI");
dotest(local_ipi, SUCCESS);
- printk("\n");
+ printk(KERN_CONT "\n");
cleanup_nmi_testsuite();
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 676b8c77a97..bbb6c731634 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -4,25 +4,17 @@
*/
#include <linux/spinlock.h>
#include <linux/module.h>
+#include <linux/jump_label.h>
#include <asm/paravirt.h>
-static inline void
-default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
-{
- arch_spin_lock(lock);
-}
-
struct pv_lock_ops pv_lock_ops = {
#ifdef CONFIG_SMP
- .spin_is_locked = __ticket_spin_is_locked,
- .spin_is_contended = __ticket_spin_is_contended,
-
- .spin_lock = __ticket_spin_lock,
- .spin_lock_flags = default_spin_lock_flags,
- .spin_trylock = __ticket_spin_trylock,
- .spin_unlock = __ticket_spin_unlock,
+ .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
+ .unlock_kick = paravirt_nop,
#endif
};
EXPORT_SYMBOL(pv_lock_ops);
+struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL(paravirt_ticketlocks_enabled);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index d90272e6bc4..548d25f00c9 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -23,9 +23,11 @@
#include <linux/efi.h>
#include <linux/bcd.h>
#include <linux/highmem.h>
+#include <linux/kprobes.h>
#include <asm/bug.h>
#include <asm/paravirt.h>
+#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/pgtable.h>
@@ -37,6 +39,7 @@
#include <asm/apic.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
+#include <asm/special_insns.h>
/* nop stub */
void _paravirt_nop(void)
@@ -60,11 +63,6 @@ void __init default_banner(void)
pv_info.name);
}
-/* Simple instruction patching code. */
-#define DEF_NATIVE(ops, name, code) \
- extern const char start_##ops##_##name[], end_##ops##_##name[]; \
- asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
-
/* Undefined instruction for dealing with missing ops pointers. */
static const unsigned char ud2a[] = { 0x0f, 0x0b };
@@ -202,8 +200,8 @@ static void native_flush_tlb_single(unsigned long addr)
__native_flush_tlb_single(addr);
}
-struct jump_label_key paravirt_steal_enabled;
-struct jump_label_key paravirt_steal_rq_enabled;
+struct static_key paravirt_steal_enabled;
+struct static_key paravirt_steal_rq_enabled;
static u64 native_steal_clock(int cpu)
{
@@ -239,16 +237,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
static inline void enter_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
+ BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
- percpu_write(paravirt_lazy_mode, mode);
+ this_cpu_write(paravirt_lazy_mode, mode);
}
static void leave_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
+ BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode);
- percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
+ this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
}
void paravirt_enter_lazy_mmu(void)
@@ -261,11 +259,23 @@ void paravirt_leave_lazy_mmu(void)
leave_lazy(PARAVIRT_LAZY_MMU);
}
+void paravirt_flush_lazy_mmu(void)
+{
+ preempt_disable();
+
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ arch_enter_lazy_mmu_mode();
+ }
+
+ preempt_enable();
+}
+
void paravirt_start_context_switch(struct task_struct *prev)
{
BUG_ON(preemptible());
- if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+ if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
arch_leave_lazy_mmu_mode();
set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
}
@@ -287,19 +297,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
if (in_interrupt())
return PARAVIRT_LAZY_NONE;
- return percpu_read(paravirt_lazy_mode);
-}
-
-void arch_flush_lazy_mmu_mode(void)
-{
- preempt_disable();
-
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- arch_leave_lazy_mmu_mode();
- arch_enter_lazy_mmu_mode();
- }
-
- preempt_enable();
+ return this_cpu_read(paravirt_lazy_mode);
}
struct pv_info pv_info = {
@@ -322,7 +320,7 @@ struct pv_time_ops pv_time_ops = {
.steal_clock = native_steal_clock,
};
-struct pv_irq_ops pv_irq_ops = {
+__visible struct pv_irq_ops pv_irq_ops = {
.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
@@ -334,7 +332,7 @@ struct pv_irq_ops pv_irq_ops = {
#endif
};
-struct pv_cpu_ops pv_cpu_ops = {
+__visible struct pv_cpu_ops pv_cpu_ops = {
.cpuid = native_cpuid,
.get_debugreg = native_get_debugreg,
.set_debugreg = native_set_debugreg,
@@ -350,9 +348,7 @@ struct pv_cpu_ops pv_cpu_ops = {
#endif
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
- .rdmsr_regs = native_rdmsr_safe_regs,
.write_msr = native_write_msr_safe,
- .wrmsr_regs = native_wrmsr_safe_regs,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
.read_tscp = native_read_tscp,
@@ -360,7 +356,6 @@ struct pv_cpu_ops pv_cpu_ops = {
.set_ldt = native_set_ldt,
.load_gdt = native_load_gdt,
.load_idt = native_load_idt,
- .store_gdt = native_store_gdt,
.store_idt = native_store_idt,
.store_tr = native_store_tr,
.load_tls = native_load_tls,
@@ -395,6 +390,11 @@ struct pv_cpu_ops pv_cpu_ops = {
.end_context_switch = paravirt_nop,
};
+/* At this point, native_get/set_debugreg has real function entries */
+NOKPROBE_SYMBOL(native_get_debugreg);
+NOKPROBE_SYMBOL(native_set_debugreg);
+NOKPROBE_SYMBOL(native_load_idt);
+
struct pv_apic_ops pv_apic_ops = {
#ifdef CONFIG_X86_LOCAL_APIC
.startup_ipi_hook = paravirt_nop,
@@ -475,6 +475,7 @@ struct pv_mmu_ops pv_mmu_ops = {
.lazy_mode = {
.enter = paravirt_nop,
.leave = paravirt_nop,
+ .flush = paravirt_nop,
},
.set_fixmap = native_set_fixmap,
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 3f08f34f93e..a1da6737ba5 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
-DEF_NATIVE(pv_cpu_ops, iret, "iretq");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
@@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, iret);
PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 726494b5834..0497f719977 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -22,6 +22,8 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) "Calgary: " fmt
+
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/types.h>
@@ -42,7 +44,6 @@
#include <asm/calgary.h>
#include <asm/tce.h>
#include <asm/pci-direct.h>
-#include <asm/system.h>
#include <asm/dma.h>
#include <asm/rio.h>
#include <asm/bios_ebda.h>
@@ -246,7 +247,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
npages, 0, boundary_size, 0);
if (offset == ~0UL) {
- printk(KERN_WARNING "Calgary: IOMMU full.\n");
+ pr_warn("IOMMU full\n");
spin_unlock_irqrestore(&tbl->it_lock, flags);
if (panic_on_overflow)
panic("Calgary: fix the allocator.\n");
@@ -272,8 +273,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
entry = iommu_range_alloc(dev, tbl, npages);
if (unlikely(entry == DMA_ERROR_CODE)) {
- printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
- "iommu %p\n", npages, tbl);
+ pr_warn("failed to allocate %u pages in iommu %p\n",
+ npages, tbl);
return DMA_ERROR_CODE;
}
@@ -431,7 +432,7 @@ static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
}
static void* calgary_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag)
+ dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs)
{
void *ret = NULL;
dma_addr_t mapping;
@@ -464,7 +465,8 @@ error:
}
static void calgary_free_coherent(struct device *dev, size_t size,
- void *vaddr, dma_addr_t dma_handle)
+ void *vaddr, dma_addr_t dma_handle,
+ struct dma_attrs *attrs)
{
unsigned int npages;
struct iommu_table *tbl = find_iommu_table(dev);
@@ -477,8 +479,8 @@ static void calgary_free_coherent(struct device *dev, size_t size,
}
static struct dma_map_ops calgary_dma_ops = {
- .alloc_coherent = calgary_alloc_coherent,
- .free_coherent = calgary_free_coherent,
+ .alloc = calgary_alloc_coherent,
+ .free = calgary_free_coherent,
.map_sg = calgary_map_sg,
.unmap_sg = calgary_unmap_sg,
.map_page = calgary_map_page,
@@ -561,8 +563,7 @@ static void calgary_tce_cache_blast(struct iommu_table *tbl)
i++;
} while ((val & 0xff) != 0xff && i < 100);
if (i == 100)
- printk(KERN_WARNING "Calgary: PCI bus not quiesced, "
- "continuing anyway\n");
+ pr_warn("PCI bus not quiesced, continuing anyway\n");
/* invalidate TCE cache */
target = calgary_reg(bbar, tar_offset(tbl->it_busno));
@@ -604,8 +605,7 @@ begin:
i++;
} while ((val64 & 0xff) != 0xff && i < 100);
if (i == 100)
- printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, "
- "continuing anyway\n");
+ pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n");
/* 3. poll Page Migration DEBUG for SoftStopFault */
target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
@@ -617,8 +617,7 @@ begin:
if (++count < 100)
goto begin;
else {
- printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, "
- "aborting TCE cache flush sequence!\n");
+ pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n");
return; /* pray for the best */
}
}
@@ -840,8 +839,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl)
plssr = be32_to_cpu(readl(target));
/* If no error, the agent ID in the CSR is not valid */
- printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, "
- "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr);
+ pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n",
+ tbl->it_busno, csr, plssr);
}
static void calioc2_dump_error_regs(struct iommu_table *tbl)
@@ -867,22 +866,21 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl)
target = calgary_reg(bbar, phboff | 0x800);
mck = be32_to_cpu(readl(target));
- printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n",
- tbl->it_busno);
+ pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno);
- printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
- csr, plssr, csmr, mck);
+ pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
+ csr, plssr, csmr, mck);
/* dump rest of error regs */
- printk(KERN_EMERG "Calgary: ");
+ pr_emerg("");
for (i = 0; i < ARRAY_SIZE(errregs); i++) {
/* err regs are at 0x810 - 0x870 */
erroff = (0x810 + (i * 0x10));
target = calgary_reg(bbar, phboff | erroff);
errregs[i] = be32_to_cpu(readl(target));
- printk("0x%08x@0x%lx ", errregs[i], erroff);
+ pr_cont("0x%08x@0x%lx ", errregs[i], erroff);
}
- printk("\n");
+ pr_cont("\n");
/* root complex status */
target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
@@ -1209,23 +1207,31 @@ error:
return ret;
}
-static inline int __init determine_tce_table_size(u64 ram)
+static inline int __init determine_tce_table_size(void)
{
int ret;
if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
return specified_table_size;
- /*
- * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
- * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
- * larger table size has twice as many entries, so shift the
- * max ram address by 13 to divide by 8K and then look at the
- * order of the result to choose between 0-7.
- */
- ret = get_order(ram >> 13);
- if (ret > TCE_TABLE_SIZE_8M)
+ if (is_kdump_kernel() && saved_max_pfn) {
+ /*
+ * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
+ * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
+ * larger table size has twice as many entries, so shift the
+ * max ram address by 13 to divide by 8K and then look at the
+ * order of the result to choose between 0-7.
+ */
+ ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13);
+ if (ret > TCE_TABLE_SIZE_8M)
+ ret = TCE_TABLE_SIZE_8M;
+ } else {
+ /*
+ * Use 8M by default (suggested by Muli) if it's not
+ * kdump kernel and saved_max_pfn isn't set.
+ */
ret = TCE_TABLE_SIZE_8M;
+ }
return ret;
}
@@ -1420,8 +1426,7 @@ int __init detect_calgary(void)
return -ENOMEM;
}
- specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
- saved_max_pfn : max_pfn) * PAGE_SIZE);
+ specified_table_size = determine_tce_table_size();
for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
struct calgary_bus_info *info = &bus_info[bus];
@@ -1480,8 +1485,9 @@ cleanup:
static int __init calgary_parse_options(char *p)
{
unsigned int bridge;
+ unsigned long val;
size_t len;
- char* endp;
+ ssize_t ret;
while (*p) {
if (!strncmp(p, "64k", 3))
@@ -1512,10 +1518,11 @@ static int __init calgary_parse_options(char *p)
++p;
if (*p == '\0')
break;
- bridge = simple_strtoul(p, &endp, 0);
- if (p == endp)
+ ret = kstrtoul(p, 0, &val);
+ if (ret)
break;
+ bridge = val;
if (bridge < MAX_PHB_BUS_NUM) {
printk(KERN_INFO "Calgary: disabling "
"translation for PHB %#x\n", bridge);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1c4d769e21e..a25e202bb31 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -45,15 +45,6 @@ int iommu_detected __read_mostly = 0;
*/
int iommu_pass_through __read_mostly;
-/*
- * Group multi-function PCI devices into a single device-group for the
- * iommu_device_group interface. This tells the iommu driver to pretend
- * it cannot distinguish between functions of a device, exposing only one
- * group for the device. Useful for disallowing use of individual PCI
- * functions from userspace drivers.
- */
-int iommu_group_mf __read_mostly;
-
extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
/* Dummy device used for NULL arguments (normally ISA). */
@@ -65,7 +56,7 @@ struct device x86_dma_fallback_dev = {
EXPORT_SYMBOL(x86_dma_fallback_dev);
/* Number of entries preallocated for DMA-API debugging */
-#define PREALLOC_DMA_DEBUG_ENTRIES 32768
+#define PREALLOC_DMA_DEBUG_ENTRIES 65536
int dma_set_mask(struct device *dev, u64 mask)
{
@@ -96,17 +87,30 @@ void __init pci_iommu_alloc(void)
}
}
void *dma_generic_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_addr, gfp_t flag)
+ dma_addr_t *dma_addr, gfp_t flag,
+ struct dma_attrs *attrs)
{
unsigned long dma_mask;
struct page *page;
+ unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
dma_addr_t addr;
dma_mask = dma_alloc_coherent_mask(dev, flag);
- flag |= __GFP_ZERO;
+ flag &= ~__GFP_ZERO;
again:
- page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
+ page = NULL;
+ /* CMA can be used only in the context which permits sleeping */
+ if (flag & __GFP_WAIT) {
+ page = dma_alloc_from_contiguous(dev, count, get_order(size));
+ if (page && page_to_phys(page) + size > dma_mask) {
+ dma_release_from_contiguous(dev, page, count);
+ page = NULL;
+ }
+ }
+ /* fallback */
+ if (!page)
+ page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
if (!page)
return NULL;
@@ -121,11 +125,21 @@ again:
return NULL;
}
-
+ memset(page_address(page), 0, size);
*dma_addr = addr;
return page_address(page);
}
+void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_addr, struct dma_attrs *attrs)
+{
+ unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ struct page *page = virt_to_page(vaddr);
+
+ if (!dma_release_from_contiguous(dev, page, count))
+ free_pages((unsigned long)vaddr, get_order(size));
+}
+
/*
* See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
* parameter documentation.
@@ -178,8 +192,6 @@ static __init int iommu_setup(char *p)
#endif
if (!strncmp(p, "pt", 2))
iommu_pass_through = 1;
- if (!strncmp(p, "group_mf", 8))
- iommu_group_mf = 1;
gart_parse_options(p);
@@ -260,12 +272,13 @@ rootfs_initcall(pci_iommu_init);
#ifdef CONFIG_PCI
/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
-static __devinit void via_no_dac(struct pci_dev *dev)
+static void via_no_dac(struct pci_dev *dev)
{
- if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+ if (forbid_dac == 0) {
dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
forbid_dac = 1;
}
}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
+ PCI_CLASS_BRIDGE_PCI, 8, via_no_dac);
#endif
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 3af4af810c0..da15918d1c8 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -3,7 +3,6 @@
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>
#include <linux/string.h>
-#include <linux/init.h>
#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/mm.h>
@@ -74,12 +73,6 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
return nents;
}
-static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_addr)
-{
- free_pages((unsigned long)vaddr, get_order(size));
-}
-
static void nommu_sync_single_for_device(struct device *dev,
dma_addr_t addr, size_t size,
enum dma_data_direction dir)
@@ -96,8 +89,8 @@ static void nommu_sync_sg_for_device(struct device *dev,
}
struct dma_map_ops nommu_dma_ops = {
- .alloc_coherent = dma_generic_alloc_coherent,
- .free_coherent = nommu_free_coherent,
+ .alloc = dma_generic_alloc_coherent,
+ .free = dma_generic_free_coherent,
.map_sg = nommu_map_sg,
.map_page = nommu_map_page,
.sync_single_for_device = nommu_sync_single_for_device,
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 8f972cbddef..77dd0ad58be 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -14,22 +14,34 @@
#include <asm/iommu_table.h>
int swiotlb __read_mostly;
-static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
- dma_addr_t *dma_handle, gfp_t flags)
+void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags,
+ struct dma_attrs *attrs)
{
void *vaddr;
- vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags);
+ vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags,
+ attrs);
if (vaddr)
return vaddr;
return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
}
+void x86_swiotlb_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_addr,
+ struct dma_attrs *attrs)
+{
+ if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr)))
+ swiotlb_free_coherent(dev, size, vaddr, dma_addr);
+ else
+ dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
+}
+
static struct dma_map_ops swiotlb_dma_ops = {
.mapping_error = swiotlb_dma_mapping_error,
- .alloc_coherent = x86_swiotlb_alloc_coherent,
- .free_coherent = swiotlb_free_coherent,
+ .alloc = x86_swiotlb_alloc_coherent,
+ .free = x86_swiotlb_free_coherent,
.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
.sync_single_for_device = swiotlb_sync_single_for_device,
.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
new file mode 100644
index 00000000000..e309cc5c276
--- /dev/null
+++ b/arch/x86/kernel/perf_regs.c
@@ -0,0 +1,105 @@
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <linux/stddef.h>
+#include <asm/perf_regs.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_X86_32
+#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX
+#else
+#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX
+#endif
+
+#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
+
+static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {
+ PT_REGS_OFFSET(PERF_REG_X86_AX, ax),
+ PT_REGS_OFFSET(PERF_REG_X86_BX, bx),
+ PT_REGS_OFFSET(PERF_REG_X86_CX, cx),
+ PT_REGS_OFFSET(PERF_REG_X86_DX, dx),
+ PT_REGS_OFFSET(PERF_REG_X86_SI, si),
+ PT_REGS_OFFSET(PERF_REG_X86_DI, di),
+ PT_REGS_OFFSET(PERF_REG_X86_BP, bp),
+ PT_REGS_OFFSET(PERF_REG_X86_SP, sp),
+ PT_REGS_OFFSET(PERF_REG_X86_IP, ip),
+ PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags),
+ PT_REGS_OFFSET(PERF_REG_X86_CS, cs),
+ PT_REGS_OFFSET(PERF_REG_X86_SS, ss),
+#ifdef CONFIG_X86_32
+ PT_REGS_OFFSET(PERF_REG_X86_DS, ds),
+ PT_REGS_OFFSET(PERF_REG_X86_ES, es),
+ PT_REGS_OFFSET(PERF_REG_X86_FS, fs),
+ PT_REGS_OFFSET(PERF_REG_X86_GS, gs),
+#else
+ /*
+ * The pt_regs struct does not store
+ * ds, es, fs, gs in 64 bit mode.
+ */
+ (unsigned int) -1,
+ (unsigned int) -1,
+ (unsigned int) -1,
+ (unsigned int) -1,
+#endif
+#ifdef CONFIG_X86_64
+ PT_REGS_OFFSET(PERF_REG_X86_R8, r8),
+ PT_REGS_OFFSET(PERF_REG_X86_R9, r9),
+ PT_REGS_OFFSET(PERF_REG_X86_R10, r10),
+ PT_REGS_OFFSET(PERF_REG_X86_R11, r11),
+ PT_REGS_OFFSET(PERF_REG_X86_R12, r12),
+ PT_REGS_OFFSET(PERF_REG_X86_R13, r13),
+ PT_REGS_OFFSET(PERF_REG_X86_R14, r14),
+ PT_REGS_OFFSET(PERF_REG_X86_R15, r15),
+#endif
+};
+
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+ if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset)))
+ return 0;
+
+ return regs_get_register(regs, pt_regs_offset[idx]);
+}
+
+#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL))
+
+#ifdef CONFIG_X86_32
+int perf_reg_validate(u64 mask)
+{
+ if (!mask || mask & REG_RESERVED)
+ return -EINVAL;
+
+ return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+ return PERF_SAMPLE_REGS_ABI_32;
+}
+#else /* CONFIG_X86_64 */
+#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
+ (1ULL << PERF_REG_X86_ES) | \
+ (1ULL << PERF_REG_X86_FS) | \
+ (1ULL << PERF_REG_X86_GS))
+
+int perf_reg_validate(u64 mask)
+{
+ if (!mask || mask & REG_RESERVED)
+ return -EINVAL;
+
+ if (mask & REG_NOSUPPORT)
+ return -EINVAL;
+
+ return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+ if (test_tsk_thread_flag(task, TIF_IA32))
+ return PERF_SAMPLE_REGS_ABI_32;
+ else
+ return PERF_SAMPLE_REGS_ABI_64;
+}
+#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S
new file mode 100644
index 00000000000..ca7f0d58a87
--- /dev/null
+++ b/arch/x86/kernel/preempt.S
@@ -0,0 +1,25 @@
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/asm.h>
+#include <asm/calling.h>
+
+ENTRY(___preempt_schedule)
+ CFI_STARTPROC
+ SAVE_ALL
+ call preempt_schedule
+ RESTORE_ALL
+ ret
+ CFI_ENDPROC
+
+#ifdef CONFIG_CONTEXT_TRACKING
+
+ENTRY(___preempt_schedule_context)
+ CFI_STARTPROC
+ SAVE_ALL
+ call preempt_schedule_context
+ RESTORE_ALL
+ ret
+ CFI_ENDPROC
+
+#endif
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 34e06e84ce3..d5f15c3f7b2 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -12,6 +12,7 @@
#include <linux/pci.h>
#include <linux/export.h>
+#include <asm/probe_roms.h>
#include <asm/pci-direct.h>
#include <asm/e820.h>
#include <asm/mmzone.h>
@@ -149,7 +150,7 @@ static struct resource *find_oprom(struct pci_dev *pdev)
return oprom;
}
-void *pci_map_biosrom(struct pci_dev *pdev)
+void __iomem *pci_map_biosrom(struct pci_dev *pdev)
{
struct resource *oprom = find_oprom(pdev);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 15763af7bfe..4505e2a950d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
@@ -12,20 +14,54 @@
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
+#include <linux/stackprotector.h>
+#include <linux/tick.h>
+#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
-#include <asm/system.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
+#include <asm/nmi.h>
+
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's. The TSS size is kept cacheline-aligned
+ * so they are allowed to end up in the .data..cacheline_aligned
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+
+#ifdef CONFIG_X86_64
+static DEFINE_PER_CPU(unsigned char, is_idle);
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+ atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
+#endif
struct kmem_cache *task_xstate_cachep;
EXPORT_SYMBOL_GPL(task_xstate_cachep);
+/*
+ * this gets called so that we can store lazy state into memory and copy the
+ * current task into the new thread.
+ */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
int ret;
@@ -36,7 +72,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
ret = fpu_alloc(&dst->thread.fpu);
if (ret)
return ret;
- fpu_copy(&dst->thread.fpu, &src->thread.fpu);
+ fpu_copy(dst, src);
}
return 0;
}
@@ -46,10 +82,9 @@ void free_thread_xstate(struct task_struct *tsk)
fpu_free(&tsk->thread.fpu);
}
-void free_thread_info(struct thread_info *ti)
+void arch_release_task_struct(struct task_struct *tsk)
{
- free_thread_xstate(ti->task);
- free_pages((unsigned long)ti, THREAD_ORDER);
+ free_thread_xstate(tsk);
}
void arch_task_cache_init(void)
@@ -82,38 +117,8 @@ void exit_thread(void)
put_cpu();
kfree(bp);
}
-}
-
-void show_regs(struct pt_regs *regs)
-{
- show_registers(regs);
- show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
-}
-
-void show_regs_common(void)
-{
- const char *vendor, *product, *board;
-
- vendor = dmi_get_system_info(DMI_SYS_VENDOR);
- if (!vendor)
- vendor = "";
- product = dmi_get_system_info(DMI_PRODUCT_NAME);
- if (!product)
- product = "";
- /* Board Name is optional */
- board = dmi_get_system_info(DMI_BOARD_NAME);
-
- printk(KERN_CONT "\n");
- printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- printk(KERN_CONT " %s %s", vendor, product);
- if (board)
- printk(KERN_CONT "/%s", board);
- printk(KERN_CONT "\n");
+ drop_fpu(me);
}
void flush_thread(void)
@@ -122,12 +127,13 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ drop_init_fpu(tsk);
/*
- * Forget coprocessor state..
+ * Free the FPU state for non xsave platforms. They get reallocated
+ * lazily at the first use.
*/
- tsk->fpu_counter = 0;
- clear_fpu(tsk);
- clear_used_math();
+ if (!use_eager_fpu())
+ free_thread_xstate(tsk);
}
static void hard_disable_TSC(void)
@@ -232,185 +238,92 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
propagate_user_return_notify(prev_p, next_p);
}
-int sys_fork(struct pt_regs *regs)
-{
- return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
-}
-
/*
- * This is trivial, and on the face of it looks like it
- * could equally well be done in user mode.
- *
- * Not so, for quite unobvious reasons - register pressure.
- * In user mode vfork() cannot have a stack frame, and if
- * done by calling the "clone()" system call directly, you
- * do not have enough call-clobbered registers to hold all
- * the information you need.
+ * Idle related variables and functions
*/
-int sys_vfork(struct pt_regs *regs)
+unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+static void (*x86_idle)(void);
+
+#ifndef CONFIG_SMP
+static inline void play_dead(void)
{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
- NULL, NULL);
+ BUG();
}
+#endif
-long
-sys_clone(unsigned long clone_flags, unsigned long newsp,
- void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+#ifdef CONFIG_X86_64
+void enter_idle(void)
{
- if (!newsp)
- newsp = regs->sp;
- return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
+ this_cpu_write(is_idle, 1);
+ atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
-/*
- * This gets run with %si containing the
- * function to call, and %di containing
- * the "args".
- */
-extern void kernel_thread_helper(void);
-
-/*
- * Create a kernel thread
- */
-int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+static void __exit_idle(void)
{
- struct pt_regs regs;
-
- memset(&regs, 0, sizeof(regs));
-
- regs.si = (unsigned long) fn;
- regs.di = (unsigned long) arg;
-
-#ifdef CONFIG_X86_32
- regs.ds = __USER_DS;
- regs.es = __USER_DS;
- regs.fs = __KERNEL_PERCPU;
- regs.gs = __KERNEL_STACK_CANARY;
-#else
- regs.ss = __KERNEL_DS;
-#endif
-
- regs.orig_ax = -1;
- regs.ip = (unsigned long) kernel_thread_helper;
- regs.cs = __KERNEL_CS | get_kernel_rpl();
- regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
-
- /* Ok, create the new process.. */
- return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
+ if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
+ return;
+ atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
-EXPORT_SYMBOL(kernel_thread);
-/*
- * sys_execve() executes a new program.
- */
-long sys_execve(const char __user *name,
- const char __user *const __user *argv,
- const char __user *const __user *envp, struct pt_regs *regs)
-{
- long error;
- char *filename;
-
- filename = getname(name);
- error = PTR_ERR(filename);
- if (IS_ERR(filename))
- return error;
- error = do_execve(filename, argv, envp, regs);
-
-#ifdef CONFIG_X86_32
- if (error == 0) {
- /* Make sure we don't return using sysenter.. */
- set_thread_flag(TIF_IRET);
- }
-#endif
-
- putname(filename);
- return error;
+/* Called from interrupts to signify idle end */
+void exit_idle(void)
+{
+ /* idle loop has pid 0 */
+ if (current->pid)
+ return;
+ __exit_idle();
}
-
-/*
- * Idle related variables and functions
- */
-unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
-EXPORT_SYMBOL(boot_option_idle_override);
-
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(pm_idle);
#endif
-#ifdef CONFIG_X86_32
-/*
- * This halt magic was a workaround for ancient floppy DMA
- * wreckage. It should be safe to remove.
- */
-static int hlt_counter;
-void disable_hlt(void)
+void arch_cpu_idle_enter(void)
{
- hlt_counter++;
+ local_touch_nmi();
+ enter_idle();
}
-EXPORT_SYMBOL(disable_hlt);
-void enable_hlt(void)
+void arch_cpu_idle_exit(void)
{
- hlt_counter--;
+ __exit_idle();
}
-EXPORT_SYMBOL(enable_hlt);
-static inline int hlt_use_halt(void)
+void arch_cpu_idle_dead(void)
{
- return (!hlt_counter && boot_cpu_data.hlt_works_ok);
+ play_dead();
}
-#else
-static inline int hlt_use_halt(void)
+
+/*
+ * Called from the generic idle code.
+ */
+void arch_cpu_idle(void)
{
- return 1;
+ x86_idle();
}
-#endif
/*
- * We use this if we don't have any better
- * idle routine..
+ * We use this if we don't have any better idle routine..
*/
void default_idle(void)
{
- if (hlt_use_halt()) {
- trace_power_start(POWER_CSTATE, 1, smp_processor_id());
- trace_cpu_idle(1, smp_processor_id());
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
-
- if (!need_resched())
- safe_halt(); /* enables interrupts racelessly */
- else
- local_irq_enable();
- current_thread_info()->status |= TS_POLLING;
- trace_power_end(smp_processor_id());
- trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
- } else {
- local_irq_enable();
- /* loop is done by the caller */
- cpu_relax();
- }
+ trace_cpu_idle_rcuidle(1, smp_processor_id());
+ safe_halt();
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif
-bool set_pm_idle_to_default(void)
+#ifdef CONFIG_XEN
+bool xen_set_default_idle(void)
{
- bool ret = !!pm_idle;
+ bool ret = !!x86_idle;
- pm_idle = default_idle;
+ x86_idle = default_idle;
return ret;
}
+#endif
void stop_this_cpu(void *dummy)
{
local_irq_disable();
@@ -420,106 +333,8 @@ void stop_this_cpu(void *dummy)
set_cpu_online(smp_processor_id(), false);
disable_local_APIC();
- for (;;) {
- if (hlt_works(smp_processor_id()))
- halt();
- }
-}
-
-static void do_nothing(void *unused)
-{
-}
-
-/*
- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
- * pm_idle and update to new pm_idle value. Required while changing pm_idle
- * handler on SMP systems.
- *
- * Caller must have changed pm_idle to the new value before the call. Old
- * pm_idle value will not be used by any CPU after the return of this function.
- */
-void cpu_idle_wait(void)
-{
- smp_mb();
- /* kick all the CPUs so that they exit out of pm_idle */
- smp_call_function(do_nothing, NULL, 1);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
-/* Default MONITOR/MWAIT with no hints, used for default C1 state */
-static void mwait_idle(void)
-{
- if (!need_resched()) {
- trace_power_start(POWER_CSTATE, 1, smp_processor_id());
- trace_cpu_idle(1, smp_processor_id());
- if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
- clflush((void *)&current_thread_info()->flags);
-
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __sti_mwait(0, 0);
- else
- local_irq_enable();
- trace_power_end(smp_processor_id());
- trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
- } else
- local_irq_enable();
-}
-
-/*
- * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->work.need_resched flag instead of waiting for the
- * cross-CPU IPI to arrive. Use this option with caution.
- */
-static void poll_idle(void)
-{
- trace_power_start(POWER_CSTATE, 0, smp_processor_id());
- trace_cpu_idle(0, smp_processor_id());
- local_irq_enable();
- while (!need_resched())
- cpu_relax();
- trace_power_end(smp_processor_id());
- trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
-}
-
-/*
- * mwait selection logic:
- *
- * It depends on the CPU. For AMD CPUs that support MWAIT this is
- * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
- * then depend on a clock divisor and current Pstate of the core. If
- * all cores of a processor are in halt state (C1) the processor can
- * enter the C1E (C1 enhanced) state. If mwait is used this will never
- * happen.
- *
- * idle=mwait overrides this decision and forces the usage of mwait.
- */
-
-#define MWAIT_INFO 0x05
-#define MWAIT_ECX_EXTENDED_INFO 0x01
-#define MWAIT_EDX_C1 0xf0
-
-int mwait_usable(const struct cpuinfo_x86 *c)
-{
- u32 eax, ebx, ecx, edx;
-
- if (boot_option_idle_override == IDLE_FORCE_MWAIT)
- return 1;
-
- if (c->cpuid_level < MWAIT_INFO)
- return 0;
-
- cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
- /* Check, whether EDX has extended info about MWAIT */
- if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
- return 1;
-
- /*
- * edx enumeratios MONITOR/MWAIT extensions. Check, whether
- * C1 supports MWAIT
- */
- return (edx & MWAIT_EDX_C1);
+ for (;;)
+ halt();
}
bool amd_e400_c1e_detected;
@@ -540,9 +355,6 @@ void amd_e400_remove_cpu(int cpu)
*/
static void amd_e400_idle(void)
{
- if (need_resched())
- return;
-
if (!amd_e400_c1e_detected) {
u32 lo, hi;
@@ -552,7 +364,7 @@ static void amd_e400_idle(void)
amd_e400_c1e_detected = true;
if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halt in AMD C1E");
- printk(KERN_INFO "System has AMD C1E enabled\n");
+ pr_info("System has AMD C1E enabled\n");
}
}
@@ -566,8 +378,7 @@ static void amd_e400_idle(void)
*/
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
&cpu);
- printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
- cpu);
+ pr_info("Switch to broadcast mode on CPU%d\n", cpu);
}
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
@@ -577,42 +388,34 @@ static void amd_e400_idle(void)
* The switch back from broadcast mode needs to be
* called with interrupts disabled.
*/
- local_irq_disable();
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
- local_irq_enable();
+ local_irq_disable();
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+ local_irq_enable();
} else
default_idle();
}
-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
- if (pm_idle == poll_idle && smp_num_siblings > 1) {
- printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"
- " performance may degrade.\n");
- }
+ if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
+ pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
- if (pm_idle)
+ if (x86_idle || boot_option_idle_override == IDLE_POLL)
return;
- if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
- /*
- * One CPU supports mwait => All CPUs supports mwait
- */
- printk(KERN_INFO "using mwait in idle threads.\n");
- pm_idle = mwait_idle;
- } else if (cpu_has_amd_erratum(amd_erratum_400)) {
+ if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
/* E400: APIC timer interrupt does not wake up CPU from C1e */
- printk(KERN_INFO "using AMD E400 aware idle routine\n");
- pm_idle = amd_e400_idle;
+ pr_info("using AMD E400 aware idle routine\n");
+ x86_idle = amd_e400_idle;
} else
- pm_idle = default_idle;
+ x86_idle = default_idle;
}
void __init init_amd_e400_c1e_mask(void)
{
/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
- if (pm_idle == amd_e400_idle)
+ if (x86_idle == amd_e400_idle)
zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}
@@ -622,12 +425,9 @@ static int __init idle_setup(char *str)
return -EINVAL;
if (!strcmp(str, "poll")) {
- printk("using polling idle threads.\n");
- pm_idle = poll_idle;
+ pr_info("using polling idle threads\n");
boot_option_idle_override = IDLE_POLL;
- } else if (!strcmp(str, "mwait")) {
- boot_option_idle_override = IDLE_FORCE_MWAIT;
- WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n");
+ cpu_idle_poll_ctrl(true);
} else if (!strcmp(str, "halt")) {
/*
* When the boot option of idle=halt is added, halt is
@@ -636,7 +436,7 @@ static int __init idle_setup(char *str)
* To continue to load the CPU idle driver, don't touch
* the boot_option_idle_override.
*/
- pm_idle = default_idle;
+ x86_idle = default_idle;
boot_option_idle_override = IDLE_HALT;
} else if (!strcmp(str, "nomwait")) {
/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 80bfe1ab003..7bc86bbe748 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,7 +9,6 @@
* This file handles the architecture-dependent parts of process handling..
*/
-#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
@@ -25,26 +24,23 @@
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/reboot.h>
-#include <linux/init.h>
#include <linux/mc146818rtc.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/ptrace.h>
#include <linux/personality.h>
-#include <linux/tick.h>
#include <linux/percpu.h>
#include <linux/prctl.h>
#include <linux/ftrace.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/kdebug.h>
-#include <linux/cpuidle.h>
#include <asm/pgtable.h>
-#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/desc.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
@@ -57,9 +53,10 @@
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
-#include <asm/nmi.h>
+#include <asm/switch_to.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
/*
* Return saved PC of a blocked thread.
@@ -69,62 +66,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
return ((unsigned long *)tsk->thread.sp)[3];
}
-#ifndef CONFIG_SMP
-static inline void play_dead(void)
-{
- BUG();
-}
-#endif
-
-/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
- */
-void cpu_idle(void)
-{
- int cpu = smp_processor_id();
-
- /*
- * If we're the non-boot CPU, nothing set the stack canary up
- * for us. CPU0 already has it initialized but no harm in
- * doing it again. This is a good place for updating it, as
- * we wont ever return from this function (so the invalid
- * canaries already on the stack wont ever trigger).
- */
- boot_init_stack_canary();
-
- current_thread_info()->status |= TS_POLLING;
-
- /* endless idle loop with no priority at all */
- while (1) {
- tick_nohz_idle_enter();
- rcu_idle_enter();
- while (!need_resched()) {
-
- check_pgt_cache();
- rmb();
-
- if (cpu_is_offline(cpu))
- play_dead();
-
- local_touch_nmi();
- local_irq_disable();
- /* Don't trace irqs off for idle */
- stop_critical_timings();
- if (cpuidle_idle_call())
- pm_idle();
- start_critical_timings();
- }
- rcu_idle_exit();
- tick_nohz_idle_exit();
- preempt_enable_no_resched();
- schedule();
- preempt_disable();
- }
-}
-
void __show_regs(struct pt_regs *regs, int all)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
@@ -142,8 +83,6 @@ void __show_regs(struct pt_regs *regs, int all)
savesegment(gs, gs);
}
- show_regs_common();
-
printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
(u16)regs->cs, regs->ip, regs->flags,
smp_processor_id());
@@ -170,11 +109,16 @@ void __show_regs(struct pt_regs *regs, int all)
get_debugreg(d1, 1);
get_debugreg(d2, 2);
get_debugreg(d3, 3);
- printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
- d0, d1, d2, d3);
-
get_debugreg(d6, 6);
get_debugreg(d7, 7);
+
+ /* Only print out debug registers if they are in their non-default state. */
+ if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
+ (d6 == DR6_RESERVED) && (d7 == 0x400))
+ return;
+
+ printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+ d0, d1, d2, d3);
printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
d6, d7);
}
@@ -185,35 +129,43 @@ void release_thread(struct task_struct *dead_task)
release_vm86_irqs(dead_task);
}
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
-{
- unlazy_fpu(tsk);
-}
-
int copy_thread(unsigned long clone_flags, unsigned long sp,
- unsigned long unused,
- struct task_struct *p, struct pt_regs *regs)
+ unsigned long arg, struct task_struct *p)
{
- struct pt_regs *childregs;
+ struct pt_regs *childregs = task_pt_regs(p);
struct task_struct *tsk;
int err;
- childregs = task_pt_regs(p);
- *childregs = *regs;
- childregs->ax = 0;
- childregs->sp = sp;
-
p->thread.sp = (unsigned long) childregs;
p->thread.sp0 = (unsigned long) (childregs+1);
- p->thread.ip = (unsigned long) ret_from_fork;
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ /* kernel thread */
+ memset(childregs, 0, sizeof(struct pt_regs));
+ p->thread.ip = (unsigned long) ret_from_kernel_thread;
+ task_user_gs(p) = __KERNEL_STACK_CANARY;
+ childregs->ds = __USER_DS;
+ childregs->es = __USER_DS;
+ childregs->fs = __KERNEL_PERCPU;
+ childregs->bx = sp; /* function */
+ childregs->bp = arg;
+ childregs->orig_ax = -1;
+ childregs->cs = __KERNEL_CS | get_kernel_rpl();
+ childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+ p->thread.fpu_counter = 0;
+ p->thread.io_bitmap_ptr = NULL;
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+ return 0;
+ }
+ *childregs = *current_pt_regs();
+ childregs->ax = 0;
+ if (sp)
+ childregs->sp = sp;
- task_user_gs(p) = get_user_gs(regs);
+ p->thread.ip = (unsigned long) ret_from_fork;
+ task_user_gs(p) = get_user_gs(current_pt_regs());
+ p->thread.fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
tsk = current;
err = -ENOMEM;
@@ -257,10 +209,12 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
regs->cs = __USER_CS;
regs->ip = new_ip;
regs->sp = new_sp;
+ regs->flags = X86_EFLAGS_IF;
/*
- * Free the old FP and other extended state
+ * force it to the iret return path by making it look as if there was
+ * some work pending.
*/
- free_thread_xstate(current);
+ set_thread_flag(TIF_NOTIFY_RESUME);
}
EXPORT_SYMBOL_GPL(start_thread);
@@ -292,7 +246,7 @@ EXPORT_SYMBOL_GPL(start_thread);
* the task-switch, and shows up in ret_from_fork in entry.S,
* for example.
*/
-__notrace_funcgraph struct task_struct *
+__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
@@ -303,7 +257,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
- fpu = switch_fpu_prepare(prev_p, next_p);
+ fpu = switch_fpu_prepare(prev_p, next_p, cpu);
/*
* Reload esp0.
@@ -337,6 +291,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
set_iopl_mask(next->iopl);
/*
+ * If it were not for PREEMPT_ACTIVE we could guarantee that the
+ * preempt_count of all tasks was equal here and this would not be
+ * needed.
+ */
+ task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
+ this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
+
+ /*
* Now maybe handle debug registers and/or IO bitmaps
*/
if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
@@ -352,6 +314,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
arch_end_context_switch(next_p);
+ this_cpu_write(kernel_stack,
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE - KERNEL_STACK_OFFSET);
+
/*
* Restore %gs if needed (which is common)
*/
@@ -360,7 +326,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
switch_fpu_finish(next_p, fpu);
- percpu_write(current_task, next_p);
+ this_cpu_write(current_task, next_p);
return prev_p;
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 1fd94bc4279..ca5b02d405c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,7 +14,6 @@
* This file handles the architecture-dependent parts of process handling..
*/
-#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
@@ -32,17 +31,15 @@
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
-#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
-#include <linux/cpuidle.h>
#include <asm/pgtable.h>
-#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
@@ -51,116 +48,11 @@
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
-#include <asm/nmi.h>
+#include <asm/switch_to.h>
asmlinkage extern void ret_from_fork(void);
-DEFINE_PER_CPU(unsigned long, old_rsp);
-static DEFINE_PER_CPU(unsigned char, is_idle);
-
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
-
-void idle_notifier_register(struct notifier_block *n)
-{
- atomic_notifier_chain_register(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
- atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_unregister);
-
-void enter_idle(void)
-{
- percpu_write(is_idle, 1);
- atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
-}
-
-static void __exit_idle(void)
-{
- if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
- return;
- atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
-}
-
-/* Called from interrupts to signify idle end */
-void exit_idle(void)
-{
- /* idle loop has pid 0 */
- if (current->pid)
- return;
- __exit_idle();
-}
-
-#ifndef CONFIG_SMP
-static inline void play_dead(void)
-{
- BUG();
-}
-#endif
-
-/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
- */
-void cpu_idle(void)
-{
- current_thread_info()->status |= TS_POLLING;
-
- /*
- * If we're the non-boot CPU, nothing set the stack canary up
- * for us. CPU0 already has it initialized but no harm in
- * doing it again. This is a good place for updating it, as
- * we wont ever return from this function (so the invalid
- * canaries already on the stack wont ever trigger).
- */
- boot_init_stack_canary();
-
- /* endless idle loop with no priority at all */
- while (1) {
- tick_nohz_idle_enter();
- while (!need_resched()) {
-
- rmb();
-
- if (cpu_is_offline(smp_processor_id()))
- play_dead();
- /*
- * Idle routines should keep interrupts disabled
- * from here on, until they go to idle.
- * Otherwise, idle callbacks can misfire.
- */
- local_touch_nmi();
- local_irq_disable();
- enter_idle();
- /* Don't trace irqs off for idle */
- stop_critical_timings();
-
- /* enter_idle() needs rcu for notifiers */
- rcu_idle_enter();
-
- if (cpuidle_idle_call())
- pm_idle();
-
- rcu_idle_exit();
- start_critical_timings();
-
- /* In many cases the interrupt that ended idle
- has already called exit_idle. But some idle
- loops can be woken up without interrupt. */
- __exit_idle();
- }
-
- tick_nohz_idle_exit();
- preempt_enable_no_resched();
- schedule();
- preempt_disable();
- }
-}
+__visible DEFINE_PER_CPU(unsigned long, old_rsp);
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
@@ -170,9 +62,8 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
- show_regs_common();
printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
- printk_address(regs->ip, 1);
+ printk_address(regs->ip);
printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
regs->sp, regs->flags);
printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
@@ -214,21 +105,28 @@ void __show_regs(struct pt_regs *regs, int all)
get_debugreg(d0, 0);
get_debugreg(d1, 1);
get_debugreg(d2, 2);
- printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
get_debugreg(d3, 3);
get_debugreg(d6, 6);
get_debugreg(d7, 7);
+
+ /* Only print out debug registers if they are in their non-default state. */
+ if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
+ (d6 == DR6_RESERVED) && (d7 == 0x400))
+ return;
+
+ printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+
}
void release_thread(struct task_struct *dead_task)
{
if (dead_task->mm) {
if (dead_task->mm->context.size) {
- printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
- dead_task->comm,
- dead_task->mm->context.ldt,
- dead_task->mm->context.size);
+ pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
+ dead_task->comm,
+ dead_task->mm->context.ldt,
+ dead_task->mm->context.size);
BUG();
}
}
@@ -253,39 +151,19 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls)
return get_desc_base(&t->thread.tls_array[tls]);
}
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
-{
- unlazy_fpu(tsk);
-}
-
int copy_thread(unsigned long clone_flags, unsigned long sp,
- unsigned long unused,
- struct task_struct *p, struct pt_regs *regs)
+ unsigned long arg, struct task_struct *p)
{
int err;
struct pt_regs *childregs;
struct task_struct *me = current;
- childregs = ((struct pt_regs *)
- (THREAD_SIZE + task_stack_page(p))) - 1;
- *childregs = *regs;
-
- childregs->ax = 0;
- if (user_mode(regs))
- childregs->sp = sp;
- else
- childregs->sp = (unsigned long)childregs;
-
+ p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
+ childregs = task_pt_regs(p);
p->thread.sp = (unsigned long) childregs;
- p->thread.sp0 = (unsigned long) (childregs+1);
p->thread.usersp = me->thread.usersp;
-
set_tsk_thread_flag(p, TIF_FORK);
-
+ p->thread.fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
savesegment(gs, p->thread.gsindex);
@@ -294,6 +172,25 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ /* kernel thread */
+ memset(childregs, 0, sizeof(struct pt_regs));
+ childregs->sp = (unsigned long)childregs;
+ childregs->ss = __KERNEL_DS;
+ childregs->bx = sp; /* function */
+ childregs->bp = arg;
+ childregs->orig_ax = -1;
+ childregs->cs = __KERNEL_CS | get_kernel_rpl();
+ childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+ return 0;
+ }
+ *childregs = *current_pt_regs();
+
+ childregs->ax = 0;
+ if (sp)
+ childregs->sp = sp;
err = -ENOMEM;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
@@ -341,16 +238,13 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
loadsegment(es, _ds);
loadsegment(ds, _ds);
load_gs_index(0);
+ current->thread.usersp = new_sp;
regs->ip = new_ip;
regs->sp = new_sp;
- percpu_write(old_rsp, new_sp);
+ this_cpu_write(old_rsp, new_sp);
regs->cs = _cs;
regs->ss = _ss;
regs->flags = X86_EFLAGS_IF;
- /*
- * Free the old FP and other extended state
- */
- free_thread_xstate(current);
}
void
@@ -364,7 +258,9 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
start_thread_common(regs, new_ip, new_sp,
- __USER32_CS, __USER32_DS, __USER32_DS);
+ test_thread_flag(TIF_X32)
+ ? __USER_CS : __USER32_CS,
+ __USER_DS, __USER_DS);
}
#endif
@@ -378,7 +274,7 @@ void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
* Kprobes not supported here. Set the probe on schedule instead.
* Function graph tracer not supported too.
*/
-__notrace_funcgraph struct task_struct *
+__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread;
@@ -388,7 +284,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
unsigned fsindex, gsindex;
fpu_switch_t fpu;
- fpu = switch_fpu_prepare(prev_p, next_p);
+ fpu = switch_fpu_prepare(prev_p, next_p, cpu);
/*
* Reload esp0, LDT and the page table pointer:
@@ -463,11 +359,19 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch the PDA and FPU contexts.
*/
- prev->usersp = percpu_read(old_rsp);
- percpu_write(old_rsp, next->usersp);
- percpu_write(current_task, next_p);
+ prev->usersp = this_cpu_read(old_rsp);
+ this_cpu_write(old_rsp, next->usersp);
+ this_cpu_write(current_task, next_p);
+
+ /*
+ * If it were not for PREEMPT_ACTIVE we could guarantee that the
+ * preempt_count of all tasks was equal here and this would not be
+ * needed.
+ */
+ task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
+ this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
- percpu_write(kernel_stack,
+ this_cpu_write(kernel_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE - KERNEL_STACK_OFFSET);
@@ -487,6 +391,8 @@ void set_personality_64bit(void)
/* Make sure to be in 64bit mode */
clear_thread_flag(TIF_IA32);
+ clear_thread_flag(TIF_ADDR32);
+ clear_thread_flag(TIF_X32);
/* Ensure the corresponding mm is not marked. */
if (current->mm)
@@ -499,21 +405,34 @@ void set_personality_64bit(void)
current->personality &= ~READ_IMPLIES_EXEC;
}
-void set_personality_ia32(void)
+void set_personality_ia32(bool x32)
{
/* inherit personality from parent */
/* Make sure to be in 32bit mode */
- set_thread_flag(TIF_IA32);
- current->personality |= force_personality32;
+ set_thread_flag(TIF_ADDR32);
/* Mark the associated mm as containing 32-bit tasks. */
- if (current->mm)
- current->mm->context.ia32_compat = 1;
-
- /* Prepare the first "return" to user space */
- current_thread_info()->status |= TS_COMPAT;
+ if (x32) {
+ clear_thread_flag(TIF_IA32);
+ set_thread_flag(TIF_X32);
+ if (current->mm)
+ current->mm->context.ia32_compat = TIF_X32;
+ current->personality &= ~READ_IMPLIES_EXEC;
+ /* is_compat_task() uses the presence of the x32
+ syscall bit flag to determine compat status */
+ current_thread_info()->status &= ~TS_COMPAT;
+ } else {
+ set_thread_flag(TIF_IA32);
+ clear_thread_flag(TIF_X32);
+ if (current->mm)
+ current->mm->context.ia32_compat = TIF_IA32;
+ current->personality |= force_personality32;
+ /* Prepare the first "return" to user space */
+ current_thread_info()->status |= TS_COMPAT;
+ }
}
+EXPORT_SYMBOL_GPL(set_personality_ia32);
unsigned long get_wchan(struct task_struct *p)
{
@@ -565,7 +484,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
task->thread.gs = addr;
if (doit) {
load_gs_index(0);
- ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
+ ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
}
}
put_cpu();
@@ -593,7 +512,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
/* set the selector to 0 to not confuse
__switch_to */
loadsegment(fs, 0);
- ret = checking_wrmsrl(MSR_FS_BASE, addr);
+ ret = wrmsrl_safe(MSR_FS_BASE, addr);
}
}
put_cpu();
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 50267386b76..678c0ada3b3 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,18 +21,22 @@
#include <linux/signal.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <linux/context_tracking.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/ldt.h>
#include <asm/desc.h>
#include <asm/prctl.h>
#include <asm/proto.h>
#include <asm/hw_breakpoint.h>
+#include <asm/traps.h>
#include "tls.h"
@@ -164,6 +168,35 @@ static inline bool invalid_selector(u16 value)
#define FLAG_MASK FLAG_MASK_32
+/*
+ * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
+ * when it traps. The previous stack will be directly underneath the saved
+ * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
+ *
+ * Now, if the stack is empty, '&regs->sp' is out of range. In this
+ * case we try to take the previous stack. To always return a non-null
+ * stack pointer we fall back to regs as stack if no previous stack
+ * exists.
+ *
+ * This is valid only for kernel mode traps.
+ */
+unsigned long kernel_stack_pointer(struct pt_regs *regs)
+{
+ unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1);
+ unsigned long sp = (unsigned long)&regs->sp;
+ u32 *prev_esp;
+
+ if (context == (sp & ~(THREAD_SIZE - 1)))
+ return sp;
+
+ prev_esp = (u32 *)(context);
+ if (prev_esp)
+ return (unsigned long)prev_esp;
+
+ return (unsigned long)regs;
+}
+EXPORT_SYMBOL_GPL(kernel_stack_pointer);
+
static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
{
BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
@@ -568,30 +601,48 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[])
return dr7;
}
-static int
-ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
- struct task_struct *tsk, int disabled)
+static int ptrace_fill_bp_fields(struct perf_event_attr *attr,
+ int len, int type, bool disabled)
+{
+ int err, bp_len, bp_type;
+
+ err = arch_bp_generic_fields(len, type, &bp_len, &bp_type);
+ if (!err) {
+ attr->bp_len = bp_len;
+ attr->bp_type = bp_type;
+ attr->disabled = disabled;
+ }
+
+ return err;
+}
+
+static struct perf_event *
+ptrace_register_breakpoint(struct task_struct *tsk, int len, int type,
+ unsigned long addr, bool disabled)
{
- int err;
- int gen_len, gen_type;
struct perf_event_attr attr;
+ int err;
- /*
- * We should have at least an inactive breakpoint at this
- * slot. It means the user is writing dr7 without having
- * written the address register first
- */
- if (!bp)
- return -EINVAL;
+ ptrace_breakpoint_init(&attr);
+ attr.bp_addr = addr;
- err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+ err = ptrace_fill_bp_fields(&attr, len, type, disabled);
if (err)
- return err;
+ return ERR_PTR(err);
+
+ return register_user_hw_breakpoint(&attr, ptrace_triggered,
+ NULL, tsk);
+}
+
+static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+ int disabled)
+{
+ struct perf_event_attr attr = bp->attr;
+ int err;
- attr = bp->attr;
- attr.bp_len = gen_len;
- attr.bp_type = gen_type;
- attr.disabled = disabled;
+ err = ptrace_fill_bp_fields(&attr, len, type, disabled);
+ if (err)
+ return err;
return modify_user_hw_breakpoint(bp, &attr);
}
@@ -601,67 +652,50 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
*/
static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
{
- struct thread_struct *thread = &(tsk->thread);
+ struct thread_struct *thread = &tsk->thread;
unsigned long old_dr7;
- int i, orig_ret = 0, rc = 0;
- int enabled, second_pass = 0;
- unsigned len, type;
- struct perf_event *bp;
-
- if (ptrace_get_breakpoints(tsk) < 0)
- return -ESRCH;
+ bool second_pass = false;
+ int i, rc, ret = 0;
data &= ~DR_CONTROL_RESERVED;
old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+
restore:
- /*
- * Loop through all the hardware breakpoints, making the
- * appropriate changes to each.
- */
+ rc = 0;
for (i = 0; i < HBP_NUM; i++) {
- enabled = decode_dr7(data, i, &len, &type);
- bp = thread->ptrace_bps[i];
-
- if (!enabled) {
- if (bp) {
- /*
- * Don't unregister the breakpoints right-away,
- * unless all register_user_hw_breakpoint()
- * requests have succeeded. This prevents
- * any window of opportunity for debug
- * register grabbing by other users.
- */
- if (!second_pass)
- continue;
-
- rc = ptrace_modify_breakpoint(bp, len, type,
- tsk, 1);
- if (rc)
- break;
+ unsigned len, type;
+ bool disabled = !decode_dr7(data, i, &len, &type);
+ struct perf_event *bp = thread->ptrace_bps[i];
+
+ if (!bp) {
+ if (disabled)
+ continue;
+
+ bp = ptrace_register_breakpoint(tsk,
+ len, type, 0, disabled);
+ if (IS_ERR(bp)) {
+ rc = PTR_ERR(bp);
+ break;
}
+
+ thread->ptrace_bps[i] = bp;
continue;
}
- rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
+ rc = ptrace_modify_breakpoint(bp, len, type, disabled);
if (rc)
break;
}
- /*
- * Make a second pass to free the remaining unused breakpoints
- * or to restore the original breakpoints if an error occurred.
- */
- if (!second_pass) {
- second_pass = 1;
- if (rc < 0) {
- orig_ret = rc;
- data = old_dr7;
- }
+
+ /* Restore if the first pass failed, second_pass shouldn't fail. */
+ if (rc && !WARN_ON(second_pass)) {
+ ret = rc;
+ data = old_dr7;
+ second_pass = true;
goto restore;
}
- ptrace_put_breakpoints(tsk);
-
- return ((orig_ret < 0) ? orig_ret : rc);
+ return ret;
}
/*
@@ -669,25 +703,17 @@ restore:
*/
static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
{
- struct thread_struct *thread = &(tsk->thread);
+ struct thread_struct *thread = &tsk->thread;
unsigned long val = 0;
if (n < HBP_NUM) {
- struct perf_event *bp;
+ struct perf_event *bp = thread->ptrace_bps[n];
- if (ptrace_get_breakpoints(tsk) < 0)
- return -ESRCH;
-
- bp = thread->ptrace_bps[n];
- if (!bp)
- val = 0;
- else
+ if (bp)
val = bp->hw.info.address;
-
- ptrace_put_breakpoints(tsk);
} else if (n == 6) {
val = thread->debugreg6;
- } else if (n == 7) {
+ } else if (n == 7) {
val = thread->ptrace_dr7;
}
return val;
@@ -696,29 +722,14 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
unsigned long addr)
{
- struct perf_event *bp;
struct thread_struct *t = &tsk->thread;
- struct perf_event_attr attr;
+ struct perf_event *bp = t->ptrace_bps[nr];
int err = 0;
- if (ptrace_get_breakpoints(tsk) < 0)
- return -ESRCH;
-
- if (!t->ptrace_bps[nr]) {
- ptrace_breakpoint_init(&attr);
- /*
- * Put stub len and type to register (reserve) an inactive but
- * correct bp
- */
- attr.bp_addr = addr;
- attr.bp_len = HW_BREAKPOINT_LEN_1;
- attr.bp_type = HW_BREAKPOINT_W;
- attr.disabled = 1;
-
- bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
- NULL, tsk);
-
+ if (!bp) {
/*
+ * Put stub len and type to create an inactive but correct bp.
+ *
* CHECKME: the previous code returned -EIO if the addr wasn't
* a valid task virtual addr. The new one will return -EINVAL in
* this case.
@@ -727,22 +738,20 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
* writing for the user. And anyway this is the previous
* behaviour.
*/
- if (IS_ERR(bp)) {
+ bp = ptrace_register_breakpoint(tsk,
+ X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE,
+ addr, true);
+ if (IS_ERR(bp))
err = PTR_ERR(bp);
- goto put;
- }
-
- t->ptrace_bps[nr] = bp;
+ else
+ t->ptrace_bps[nr] = bp;
} else {
- bp = t->ptrace_bps[nr];
+ struct perf_event_attr attr = bp->attr;
- attr = bp->attr;
attr.bp_addr = addr;
err = modify_user_hw_breakpoint(bp, &attr);
}
-put:
- ptrace_put_breakpoints(tsk);
return err;
}
@@ -752,30 +761,20 @@ put:
static int ptrace_set_debugreg(struct task_struct *tsk, int n,
unsigned long val)
{
- struct thread_struct *thread = &(tsk->thread);
- int rc = 0;
-
+ struct thread_struct *thread = &tsk->thread;
/* There are no DR4 or DR5 registers */
- if (n == 4 || n == 5)
- return -EIO;
+ int rc = -EIO;
- if (n == 6) {
- thread->debugreg6 = val;
- goto ret_path;
- }
if (n < HBP_NUM) {
rc = ptrace_set_breakpoint_addr(tsk, n, val);
- if (rc)
- return rc;
- }
- /* All that's left is DR7 */
- if (n == 7) {
+ } else if (n == 6) {
+ thread->debugreg6 = val;
+ rc = 0;
+ } else if (n == 7) {
rc = ptrace_write_dr7(tsk, val);
if (!rc)
thread->ptrace_dr7 = val;
}
-
-ret_path:
return rc;
}
@@ -1130,6 +1129,94 @@ static int genregs32_set(struct task_struct *target,
return ret;
}
+#ifdef CONFIG_X86_X32_ABI
+static long x32_arch_ptrace(struct task_struct *child,
+ compat_long_t request, compat_ulong_t caddr,
+ compat_ulong_t cdata)
+{
+ unsigned long addr = caddr;
+ unsigned long data = cdata;
+ void __user *datap = compat_ptr(data);
+ int ret;
+
+ switch (request) {
+ /* Read 32bits at location addr in the USER area. Only allow
+ to return the lower 32bits of segment and debug registers. */
+ case PTRACE_PEEKUSR: {
+ u32 tmp;
+
+ ret = -EIO;
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+ addr < offsetof(struct user_regs_struct, cs))
+ break;
+
+ tmp = 0; /* Default return condition */
+ if (addr < sizeof(struct user_regs_struct))
+ tmp = getreg(child, addr);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ tmp = ptrace_get_debugreg(child, addr / sizeof(data));
+ }
+ ret = put_user(tmp, (__u32 __user *)datap);
+ break;
+ }
+
+ /* Write the word at location addr in the USER area. Only allow
+ to update segment and debug registers with the upper 32bits
+ zero-extended. */
+ case PTRACE_POKEUSR:
+ ret = -EIO;
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+ addr < offsetof(struct user_regs_struct, cs))
+ break;
+
+ if (addr < sizeof(struct user_regs_struct))
+ ret = putreg(child, addr, data);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ ret = ptrace_set_debugreg(child,
+ addr / sizeof(data), data);
+ }
+ break;
+
+ case PTRACE_GETREGS: /* Get all gp regs from the child. */
+ return copy_regset_to_user(child,
+ task_user_regset_view(current),
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_SETREGS: /* Set all gp regs in the child. */
+ return copy_regset_from_user(child,
+ task_user_regset_view(current),
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_GETFPREGS: /* Get the child FPU state. */
+ return copy_regset_to_user(child,
+ task_user_regset_view(current),
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+ case PTRACE_SETFPREGS: /* Set the child FPU state. */
+ return copy_regset_from_user(child,
+ task_user_regset_view(current),
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+ default:
+ return compat_ptrace_request(child, request, addr, data);
+ }
+
+ return ret;
+}
+#endif
+
long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
compat_ulong_t caddr, compat_ulong_t cdata)
{
@@ -1139,6 +1226,11 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
int ret;
__u32 val;
+#ifdef CONFIG_X86_X32_ABI
+ if (!is_ia32_task())
+ return x32_arch_ptrace(child, request, caddr, cdata);
+#endif
+
switch (request) {
case PTRACE_PEEKUSR:
ret = getreg32(child, addr, &val);
@@ -1238,9 +1330,6 @@ static const struct user_regset_view user_x86_64_view = {
#define genregs32_get genregs_get
#define genregs32_set genregs_set
-#define user_i387_ia32_struct user_i387_struct
-#define user32_fxsr_struct user_fxsr_struct
-
#endif /* CONFIG_X86_64 */
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1326,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk,
int error_code, int si_code,
struct siginfo *info)
{
- tsk->thread.trap_no = 1;
+ tsk->thread.trap_nr = X86_TRAP_DB;
tsk->thread.error_code = error_code;
memset(info, 0, sizeof(*info));
@@ -1369,6 +1458,8 @@ long syscall_trace_enter(struct pt_regs *regs)
{
long ret = 0;
+ user_exit();
+
/*
* If we stepped into a sysenter/syscall insn, it trapped in
* kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
@@ -1380,7 +1471,11 @@ long syscall_trace_enter(struct pt_regs *regs)
regs->flags |= X86_EFLAGS_TF;
/* do the secure computing check first */
- secure_computing(regs->orig_ax);
+ if (secure_computing(regs->orig_ax)) {
+ /* seccomp failures shouldn't expose any additional code. */
+ ret = -1L;
+ goto out;
+ }
if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
ret = -1L;
@@ -1405,6 +1500,7 @@ long syscall_trace_enter(struct pt_regs *regs)
regs->dx, regs->r10);
#endif
+out:
return ret ?: regs->orig_ax;
}
@@ -1412,6 +1508,13 @@ void syscall_trace_leave(struct pt_regs *regs)
{
bool step;
+ /*
+ * We may come here right after calling schedule_user()
+ * or do_notify_resume(), in which case we can be in RCU
+ * user mode.
+ */
+ user_exit();
+
audit_syscall_exit(regs);
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
@@ -1427,4 +1530,6 @@ void syscall_trace_leave(struct pt_regs *regs)
!test_thread_flag(TIF_SYSCALL_EMU);
if (step || test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall_exit(regs, step);
+
+ user_enter();
}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 42eb3300dfc..2f355d229a5 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -17,23 +17,13 @@
#include <linux/kernel.h>
#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/bootmem.h>
+#include <asm/fixmap.h>
#include <asm/pvclock.h>
-/*
- * These are perodically updated
- * xen: magic shared_info page
- * kvm: gpa registered via msr
- * and then copied here.
- */
-struct pvclock_shadow_time {
- u64 tsc_timestamp; /* TSC at last update of time vals. */
- u64 system_timestamp; /* Time, in nanosecs, since boot. */
- u32 tsc_to_nsec_mul;
- int tsc_shift;
- u32 version;
- u8 flags;
-};
-
static u8 valid_flags __read_mostly = 0;
void pvclock_set_flags(u8 flags)
@@ -41,34 +31,6 @@ void pvclock_set_flags(u8 flags)
valid_flags = flags;
}
-static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
-{
- u64 delta = native_read_tsc() - shadow->tsc_timestamp;
- return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
- shadow->tsc_shift);
-}
-
-/*
- * Reads a consistent set of time-base values from hypervisor,
- * into a shadow data area.
- */
-static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
- struct pvclock_vcpu_time_info *src)
-{
- do {
- dst->version = src->version;
- rmb(); /* fetch version before data */
- dst->tsc_timestamp = src->tsc_timestamp;
- dst->system_timestamp = src->system_time;
- dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
- dst->tsc_shift = src->tsc_shift;
- dst->flags = src->flags;
- rmb(); /* test version after fetching data */
- } while ((src->version & 1) || (dst->version != src->version));
-
- return dst->version;
-}
-
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
{
u64 pv_tsc_khz = 1000000ULL << 32;
@@ -81,6 +43,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
return pv_tsc_khz;
}
+void pvclock_touch_watchdogs(void)
+{
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+ reset_hung_task_detector();
+}
+
static atomic64_t last_value = ATOMIC64_INIT(0);
void pvclock_resume(void)
@@ -88,23 +58,37 @@ void pvclock_resume(void)
atomic64_set(&last_value, 0);
}
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
+{
+ unsigned version;
+ cycle_t ret;
+ u8 flags;
+
+ do {
+ version = __pvclock_read_cycles(src, &ret, &flags);
+ } while ((src->version & 1) || version != src->version);
+
+ return flags & valid_flags;
+}
+
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
- struct pvclock_shadow_time shadow;
unsigned version;
- cycle_t ret, offset;
+ cycle_t ret;
u64 last;
+ u8 flags;
do {
- version = pvclock_get_time_values(&shadow, src);
- barrier();
- offset = pvclock_get_nsec_offset(&shadow);
- ret = shadow.system_timestamp + offset;
- barrier();
- } while (version != src->version);
+ version = __pvclock_read_cycles(src, &ret, &flags);
+ } while ((src->version & 1) || version != src->version);
+
+ if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
+ src->flags &= ~PVCLOCK_GUEST_STOPPED;
+ pvclock_touch_watchdogs();
+ }
if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
- (shadow.flags & PVCLOCK_TSC_STABLE_BIT))
+ (flags & PVCLOCK_TSC_STABLE_BIT))
return ret;
/*
@@ -156,3 +140,27 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
+
+#ifdef CONFIG_X86_64
+/*
+ * Initialize the generic pvclock vsyscall state. This will allocate
+ * a/some page(s) for the per-vcpu pvclock information, set up a
+ * fixmap mapping for the page(s)
+ */
+
+int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
+ int size)
+{
+ int idx;
+
+ WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
+
+ for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
+ __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
+ __pa(i) + (idx*PAGE_SIZE),
+ PAGE_KERNEL_VVAR);
+ }
+
+ return 0;
+}
+#endif
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 03920a15a63..ff898bbf579 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -8,7 +8,7 @@
#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
-static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
+static void quirk_intel_irqbalance(struct pci_dev *dev)
{
u8 config;
u16 word;
@@ -354,18 +354,22 @@ static void ati_force_hpet_resume(void)
static u32 ati_ixp4x0_rev(struct pci_dev *dev)
{
- u32 d;
- u8 b;
+ int err = 0;
+ u32 d = 0;
+ u8 b = 0;
- pci_read_config_byte(dev, 0xac, &b);
+ err = pci_read_config_byte(dev, 0xac, &b);
b &= ~(1<<5);
- pci_write_config_byte(dev, 0xac, b);
- pci_read_config_dword(dev, 0x70, &d);
+ err |= pci_write_config_byte(dev, 0xac, b);
+ err |= pci_read_config_dword(dev, 0x70, &d);
d |= 1<<8;
- pci_write_config_dword(dev, 0x70, d);
- pci_read_config_dword(dev, 0x8, &d);
+ err |= pci_write_config_dword(dev, 0x70, d);
+ err |= pci_read_config_dword(dev, 0x8, &d);
d &= 0xff;
dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d);
+
+ WARN_ON_ONCE(err);
+
return d;
}
@@ -512,7 +516,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
/* Set correct numa_node information for AMD NB functions */
-static void __init quirk_amd_nb_node(struct pci_dev *dev)
+static void quirk_amd_nb_node(struct pci_dev *dev)
{
struct pci_dev *nb_ht;
unsigned int devfn;
@@ -525,7 +529,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
return;
pci_read_config_dword(nb_ht, 0x60, &val);
- node = val & 7;
+ node = pcibus_to_node(dev->bus) | (val & 7);
/*
* Some hardware may return an invalid node ID,
* so check it first:
@@ -567,3 +571,40 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5,
quirk_amd_nb_node);
#endif
+
+#ifdef CONFIG_PCI
+/*
+ * Processor does not ensure DRAM scrub read/write sequence
+ * is atomic wrt accesses to CC6 save state area. Therefore
+ * if a concurrent scrub read/write access is to same address
+ * the entry may appear as if it is not written. This quirk
+ * applies to Fam16h models 00h-0Fh
+ *
+ * See "Revision Guide" for AMD F16h models 00h-0fh,
+ * document 51810 rev. 3.04, Nov 2013
+ */
+static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev)
+{
+ u32 val;
+
+ /*
+ * Suggested workaround:
+ * set D18F3x58[4:0] = 00h and set D18F3x5C[0] = 0b
+ */
+ pci_read_config_dword(dev, 0x58, &val);
+ if (val & 0x1F) {
+ val &= ~(0x1F);
+ pci_write_config_dword(dev, 0x58, val);
+ }
+
+ pci_read_config_dword(dev, 0x5C, &val);
+ if (val & BIT(0)) {
+ val &= ~BIT(0);
+ pci_write_config_dword(dev, 0x5c, val);
+ }
+}
+
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3,
+ amd_disable_seq_and_redirect_scrub);
+
+#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d840e69a853..52b1157c53e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/init.h>
@@ -20,13 +22,12 @@
#include <asm/virtext.h>
#include <asm/cpu.h>
#include <asm/nmi.h>
+#include <asm/smp.h>
-#ifdef CONFIG_X86_32
-# include <linux/ctype.h>
-# include <linux/mc146818rtc.h>
-#else
-# include <asm/x86_init.h>
-#endif
+#include <linux/ctype.h>
+#include <linux/mc146818rtc.h>
+#include <asm/realmode.h>
+#include <asm/x86_init.h>
/*
* Power off function, if any
@@ -35,23 +36,9 @@ void (*pm_power_off)(void);
EXPORT_SYMBOL(pm_power_off);
static const struct desc_ptr no_idt = {};
-static int reboot_mode;
-enum reboot_type reboot_type = BOOT_ACPI;
-int reboot_force;
-
-/* This variable is used privately to keep track of whether or not
- * reboot_type is still set to its default value (i.e., reboot= hasn't
- * been set on the command line). This is needed so that we can
- * suppress DMI scanning for reboot quirks. Without it, it's
- * impossible to override a faulty reboot quirk without recompiling.
- */
-static int reboot_default = 1;
-
-#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
-static int reboot_cpu = -1;
-#endif
-/* This is set if we need to go through the 'emergency' path.
+/*
+ * This is set if we need to go through the 'emergency' path.
* When machine_emergency_restart() is called, we may be on
* an inconsistent state and won't be able to do a clean cleanup
*/
@@ -60,78 +47,6 @@ static int reboot_emergency;
/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
bool port_cf9_safe = false;
-/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
- warm Don't set the cold reboot flag
- cold Set the cold reboot flag
- bios Reboot by jumping through the BIOS (only for X86_32)
- smp Reboot by executing reset on BSP or other CPU (only for X86_32)
- triple Force a triple fault (init)
- kbd Use the keyboard controller. cold reset (default)
- acpi Use the RESET_REG in the FADT
- efi Use efi reset_system runtime service
- pci Use the so-called "PCI reset register", CF9
- force Avoid anything that could hang.
- */
-static int __init reboot_setup(char *str)
-{
- for (;;) {
- /* Having anything passed on the command line via
- * reboot= will cause us to disable DMI checking
- * below.
- */
- reboot_default = 0;
-
- switch (*str) {
- case 'w':
- reboot_mode = 0x1234;
- break;
-
- case 'c':
- reboot_mode = 0;
- break;
-
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_SMP
- case 's':
- if (isdigit(*(str+1))) {
- reboot_cpu = (int) (*(str+1) - '0');
- if (isdigit(*(str+2)))
- reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
- }
- /* we will leave sorting out the final value
- when we are ready to reboot, since we might not
- have detected BSP APIC ID or smp_num_cpu */
- break;
-#endif /* CONFIG_SMP */
-
- case 'b':
-#endif
- case 'a':
- case 'k':
- case 't':
- case 'e':
- case 'p':
- reboot_type = *str;
- break;
-
- case 'f':
- reboot_force = 1;
- break;
- }
-
- str = strchr(str, ',');
- if (str)
- str++;
- else
- break;
- }
- return 1;
-}
-
-__setup("reboot=", reboot_setup);
-
-
-#ifdef CONFIG_X86_32
/*
* Reboot options and system auto-detection code provided by
* Dell Inc. so their systems "just work". :-)
@@ -145,7 +60,64 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
{
if (reboot_type != BOOT_BIOS) {
reboot_type = BOOT_BIOS;
- printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
+ pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+ d->ident, "BIOS");
+ }
+ return 0;
+}
+
+void __noreturn machine_real_restart(unsigned int type)
+{
+ local_irq_disable();
+
+ /*
+ * Write zero to CMOS register number 0x0f, which the BIOS POST
+ * routine will recognize as telling it to do a proper reboot. (Well
+ * that's what this book in front of me says -- it may only apply to
+ * the Phoenix BIOS though, it's not clear). At the same time,
+ * disable NMIs by setting the top bit in the CMOS address register,
+ * as we're about to do peculiar things to the CPU. I'm not sure if
+ * `outb_p' is needed instead of just `outb'. Use it to be on the
+ * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
+ */
+ spin_lock(&rtc_lock);
+ CMOS_WRITE(0x00, 0x8f);
+ spin_unlock(&rtc_lock);
+
+ /*
+ * Switch back to the initial page table.
+ */
+#ifdef CONFIG_X86_32
+ load_cr3(initial_page_table);
+#else
+ write_cr3(real_mode_header->trampoline_pgd);
+#endif
+
+ /* Jump to the identity-mapped low memory code */
+#ifdef CONFIG_X86_32
+ asm volatile("jmpl *%0" : :
+ "rm" (real_mode_header->machine_real_restart_asm),
+ "a" (type));
+#else
+ asm volatile("ljmpl *%0" : :
+ "m" (real_mode_header->machine_real_restart_asm),
+ "D" (type));
+#endif
+ unreachable();
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(machine_real_restart);
+#endif
+
+/*
+ * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
+ */
+static int __init set_pci_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_CF9_FORCE) {
+ reboot_type = BOOT_CF9_FORCE;
+ pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+ d->ident, "PCI");
}
return 0;
}
@@ -154,337 +126,290 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
{
if (reboot_type != BOOT_KBD) {
reboot_type = BOOT_KBD;
- printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident);
+ pr_info("%s series board detected. Selecting %s-method for reboot.\n",
+ d->ident, "KBD");
}
return 0;
}
+/*
+ * This is a single dmi_table handling all reboot quirks.
+ */
static struct dmi_system_id __initdata reboot_dmi_table[] = {
- { /* Handle problems with rebooting on Dell E520's */
- .callback = set_bios_reboot,
- .ident = "Dell E520",
+
+ /* Acer */
+ { /* Handle reboot issue on Acer Aspire one */
+ .callback = set_kbd_reboot,
+ .ident = "Acer Aspire One A110",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
},
},
- { /* Handle problems with rebooting on Dell 1300's */
- .callback = set_bios_reboot,
- .ident = "Dell PowerEdge 1300",
+
+ /* Apple */
+ { /* Handle problems with rebooting on Apple MacBook5 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBook5",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
},
},
- { /* Handle problems with rebooting on Dell 300's */
- .callback = set_bios_reboot,
- .ident = "Dell PowerEdge 300",
+ { /* Handle problems with rebooting on Apple MacBookPro5 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBookPro5",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
+ { /* Handle problems with rebooting on Apple Macmini3,1 */
+ .callback = set_pci_reboot,
+ .ident = "Apple Macmini3,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
+ },
+ },
+ { /* Handle problems with rebooting on the iMac9,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac9,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
+ },
+ },
+
+ /* ASUS */
+ { /* Handle problems with rebooting on ASUS P4S800 */
.callback = set_bios_reboot,
- .ident = "Dell OptiPlex 745",
+ .ident = "ASUS P4S800",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+ DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/
+
+ /* Certec */
+ { /* Handle problems with rebooting on Certec BPC600 */
+ .callback = set_pci_reboot,
+ .ident = "Certec BPC600",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Certec"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "BPC600"),
+ },
+ },
+
+ /* Dell */
+ { /* Handle problems with rebooting on Dell DXP061 */
.callback = set_bios_reboot,
- .ident = "Dell OptiPlex 745",
+ .ident = "Dell DXP061",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
- DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
+ { /* Handle problems with rebooting on Dell E520's */
.callback = set_bios_reboot,
- .ident = "Dell OptiPlex 745",
+ .ident = "Dell E520",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
- DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
- .callback = set_bios_reboot,
- .ident = "Dell OptiPlex 330",
+ { /* Handle problems with rebooting on the Latitude E5410. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E5410",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
- DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5410"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
- .callback = set_bios_reboot,
- .ident = "Dell OptiPlex 360",
+ { /* Handle problems with rebooting on the Latitude E5420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E5420",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
- DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
},
},
- { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/
- .callback = set_bios_reboot,
- .ident = "Dell OptiPlex 760",
+ { /* Handle problems with rebooting on the Latitude E6320. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6320",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
- DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
},
},
- { /* Handle problems with rebooting on Dell 2400's */
- .callback = set_bios_reboot,
- .ident = "Dell PowerEdge 2400",
+ { /* Handle problems with rebooting on the Latitude E6420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6420",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
},
},
- { /* Handle problems with rebooting on Dell T5400's */
+ { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
.callback = set_bios_reboot,
- .ident = "Dell Precision T5400",
+ .ident = "Dell OptiPlex 330",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
+ DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
},
},
- { /* Handle problems with rebooting on Dell T7400's */
+ { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
.callback = set_bios_reboot,
- .ident = "Dell Precision T7400",
+ .ident = "Dell OptiPlex 360",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
+ DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
},
},
- { /* Handle problems with rebooting on HP laptops */
+ { /* Handle problems with rebooting on Dell Optiplex 745's SFF */
.callback = set_bios_reboot,
- .ident = "HP Compaq Laptop",
+ .ident = "Dell OptiPlex 745",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
- DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
},
},
- { /* Handle problems with rebooting on Dell XPS710 */
+ { /* Handle problems with rebooting on Dell Optiplex 745's DFF */
.callback = set_bios_reboot,
- .ident = "Dell XPS710",
+ .ident = "Dell OptiPlex 745",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+ DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
},
},
- { /* Handle problems with rebooting on Dell DXP061 */
+ { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
.callback = set_bios_reboot,
- .ident = "Dell DXP061",
+ .ident = "Dell OptiPlex 745",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+ DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
},
},
- { /* Handle problems with rebooting on Sony VGN-Z540N */
+ { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */
.callback = set_bios_reboot,
- .ident = "Sony VGN-Z540N",
+ .ident = "Dell OptiPlex 760",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
+ DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
},
},
- { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */
- .callback = set_bios_reboot,
- .ident = "CompuLab SBC-FITPC2",
+ { /* Handle problems with rebooting on the OptiPlex 990. */
+ .callback = set_pci_reboot,
+ .ident = "Dell OptiPlex 990",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"),
- DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
},
},
- { /* Handle problems with rebooting on ASUS P4S800 */
+ { /* Handle problems with rebooting on Dell 300's */
.callback = set_bios_reboot,
- .ident = "ASUS P4S800",
+ .ident = "Dell PowerEdge 300",
.matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
- DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
},
},
- { /* Handle reboot issue on Acer Aspire one */
- .callback = set_kbd_reboot,
- .ident = "Acer Aspire One A110",
+ { /* Handle problems with rebooting on Dell 1300's */
+ .callback = set_bios_reboot,
+ .ident = "Dell PowerEdge 1300",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
- DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
},
},
- { }
-};
-
-static int __init reboot_init(void)
-{
- /* Only do the DMI check if reboot_type hasn't been overridden
- * on the command line
- */
- if (reboot_default) {
- dmi_check_system(reboot_dmi_table);
- }
- return 0;
-}
-core_initcall(reboot_init);
-
-extern const unsigned char machine_real_restart_asm[];
-extern const u64 machine_real_restart_gdt[3];
-
-void machine_real_restart(unsigned int type)
-{
- void *restart_va;
- unsigned long restart_pa;
- void (*restart_lowmem)(unsigned int);
- u64 *lowmem_gdt;
-
- local_irq_disable();
-
- /* Write zero to CMOS register number 0x0f, which the BIOS POST
- routine will recognize as telling it to do a proper reboot. (Well
- that's what this book in front of me says -- it may only apply to
- the Phoenix BIOS though, it's not clear). At the same time,
- disable NMIs by setting the top bit in the CMOS address register,
- as we're about to do peculiar things to the CPU. I'm not sure if
- `outb_p' is needed instead of just `outb'. Use it to be on the
- safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
- */
- spin_lock(&rtc_lock);
- CMOS_WRITE(0x00, 0x8f);
- spin_unlock(&rtc_lock);
-
- /*
- * Switch back to the initial page table.
- */
- load_cr3(initial_page_table);
-
- /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
- this on booting to tell it to "Bypass memory test (also warm
- boot)". This seems like a fairly standard thing that gets set by
- REBOOT.COM programs, and the previous reset routine did this
- too. */
- *((unsigned short *)0x472) = reboot_mode;
-
- /* Patch the GDT in the low memory trampoline */
- lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
-
- restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
- restart_pa = virt_to_phys(restart_va);
- restart_lowmem = (void (*)(unsigned int))restart_pa;
-
- /* GDT[0]: GDT self-pointer */
- lowmem_gdt[0] =
- (u64)(sizeof(machine_real_restart_gdt) - 1) +
- ((u64)virt_to_phys(lowmem_gdt) << 16);
- /* GDT[1]: 64K real mode code segment */
- lowmem_gdt[1] =
- GDT_ENTRY(0x009b, restart_pa, 0xffff);
-
- /* Jump to the identity-mapped low memory code */
- restart_lowmem(type);
-}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(machine_real_restart);
-#endif
-
-#endif /* CONFIG_X86_32 */
-
-/*
- * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
- */
-static int __init set_pci_reboot(const struct dmi_system_id *d)
-{
- if (reboot_type != BOOT_CF9) {
- reboot_type = BOOT_CF9;
- printk(KERN_INFO "%s series board detected. "
- "Selecting PCI-method for reboots.\n", d->ident);
- }
- return 0;
-}
-
-static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
- { /* Handle problems with rebooting on Apple MacBook5 */
- .callback = set_pci_reboot,
- .ident = "Apple MacBook5",
+ { /* Handle problems with rebooting on Dell 2400's */
+ .callback = set_bios_reboot,
+ .ident = "Dell PowerEdge 2400",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
},
},
- { /* Handle problems with rebooting on Apple MacBookPro5 */
+ { /* Handle problems with rebooting on the Dell PowerEdge C6100. */
.callback = set_pci_reboot,
- .ident = "Apple MacBookPro5",
+ .ident = "Dell PowerEdge C6100",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
},
},
- { /* Handle problems with rebooting on Apple Macmini3,1 */
+ { /* Handle problems with rebooting on the Precision M6600. */
.callback = set_pci_reboot,
- .ident = "Apple Macmini3,1",
+ .ident = "Dell Precision M6600",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
},
},
- { /* Handle problems with rebooting on the iMac9,1. */
- .callback = set_pci_reboot,
- .ident = "Apple iMac9,1",
+ { /* Handle problems with rebooting on Dell T5400's */
+ .callback = set_bios_reboot,
+ .ident = "Dell Precision T5400",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
},
},
- { /* Handle problems with rebooting on the Latitude E6320. */
- .callback = set_pci_reboot,
- .ident = "Dell Latitude E6320",
+ { /* Handle problems with rebooting on Dell T7400's */
+ .callback = set_bios_reboot,
+ .ident = "Dell Precision T7400",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"),
},
},
- { /* Handle problems with rebooting on the Latitude E5420. */
- .callback = set_pci_reboot,
- .ident = "Dell Latitude E5420",
+ { /* Handle problems with rebooting on Dell XPS710 */
+ .callback = set_bios_reboot,
+ .ident = "Dell XPS710",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
},
},
- { /* Handle problems with rebooting on the Latitude E6420. */
- .callback = set_pci_reboot,
- .ident = "Dell Latitude E6420",
+
+ /* Hewlett-Packard */
+ { /* Handle problems with rebooting on HP laptops */
+ .callback = set_bios_reboot,
+ .ident = "HP Compaq Laptop",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
},
},
- { /* Handle problems with rebooting on the OptiPlex 990. */
- .callback = set_pci_reboot,
- .ident = "Dell OptiPlex 990",
+
+ /* Sony */
+ { /* Handle problems with rebooting on Sony VGN-Z540N */
+ .callback = set_bios_reboot,
+ .ident = "Sony VGN-Z540N",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
},
},
+
{ }
};
-static int __init pci_reboot_init(void)
+static int __init reboot_init(void)
{
- /* Only do the DMI check if reboot_type hasn't been overridden
+ /*
+ * Only do the DMI check if reboot_type hasn't been overridden
* on the command line
*/
- if (reboot_default) {
- dmi_check_system(pci_reboot_dmi_table);
- }
+ if (reboot_default)
+ dmi_check_system(reboot_dmi_table);
return 0;
}
-core_initcall(pci_reboot_init);
+core_initcall(reboot_init);
static inline void kb_wait(void)
{
@@ -502,14 +427,14 @@ static void vmxoff_nmi(int cpu, struct pt_regs *regs)
cpu_emergency_vmxoff();
}
-/* Use NMIs as IPIs to tell all CPUs to disable virtualization
- */
+/* Use NMIs as IPIs to tell all CPUs to disable virtualization */
static void emergency_vmx_disable_all(void)
{
/* Just make sure we won't change CPUs while doing this */
local_irq_disable();
- /* We need to disable VMX on all CPUs before rebooting, otherwise
+ /*
+ * We need to disable VMX on all CPUs before rebooting, otherwise
* we risk hanging up the machine, because the CPU ignore INIT
* signals when VMX is enabled.
*
@@ -528,8 +453,7 @@ static void emergency_vmx_disable_all(void)
* is still enabling VMX.
*/
if (cpu_has_vmx() && cpu_vmx_enabled()) {
- /* Disable VMX on this CPU.
- */
+ /* Disable VMX on this CPU. */
cpu_vmxoff();
/* Halt and disable VMX on the other CPUs */
@@ -544,23 +468,30 @@ void __attribute__((weak)) mach_reboot_fixups(void)
}
/*
- * Windows compatible x86 hardware expects the following on reboot:
+ * To the best of our knowledge Windows compatible x86 hardware expects
+ * the following on reboot:
*
* 1) If the FADT has the ACPI reboot register flag set, try it
* 2) If still alive, write to the keyboard controller
* 3) If still alive, write to the ACPI reboot register again
* 4) If still alive, write to the keyboard controller again
+ * 5) If still alive, call the EFI runtime service to reboot
+ * 6) If no EFI runtime service, call the BIOS to do a reboot
*
- * If the machine is still alive at this stage, it gives up. We default to
- * following the same pattern, except that if we're still alive after (4) we'll
- * try to force a triple fault and then cycle between hitting the keyboard
- * controller and doing that
+ * We default to following the same pattern. We also have
+ * two other reboot methods: 'triple fault' and 'PCI', which
+ * can be triggered via the reboot= kernel boot option or
+ * via quirks.
+ *
+ * This means that this function can never return, it can misbehave
+ * by not rebooting properly and hanging.
*/
static void native_machine_emergency_restart(void)
{
int i;
int attempt = 0;
int orig_reboot_type = reboot_type;
+ unsigned short mode;
if (reboot_emergency)
emergency_vmx_disable_all();
@@ -568,69 +499,72 @@ static void native_machine_emergency_restart(void)
tboot_shutdown(TB_SHUTDOWN_REBOOT);
/* Tell the BIOS if we want cold or warm reboot */
- *((unsigned short *)__va(0x472)) = reboot_mode;
+ mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0;
+ *((unsigned short *)__va(0x472)) = mode;
for (;;) {
/* Could also try the reset bit in the Hammer NB */
switch (reboot_type) {
+ case BOOT_ACPI:
+ acpi_reboot();
+ reboot_type = BOOT_KBD;
+ break;
+
case BOOT_KBD:
- mach_reboot_fixups(); /* for board specific fixups */
+ mach_reboot_fixups(); /* For board specific fixups */
for (i = 0; i < 10; i++) {
kb_wait();
udelay(50);
- outb(0xfe, 0x64); /* pulse reset low */
+ outb(0xfe, 0x64); /* Pulse reset low */
udelay(50);
}
if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
attempt = 1;
reboot_type = BOOT_ACPI;
} else {
- reboot_type = BOOT_TRIPLE;
+ reboot_type = BOOT_EFI;
}
break;
- case BOOT_TRIPLE:
- load_idt(&no_idt);
- __asm__ __volatile__("int3");
-
- reboot_type = BOOT_KBD;
+ case BOOT_EFI:
+ if (efi_enabled(EFI_RUNTIME_SERVICES))
+ efi.reset_system(reboot_mode == REBOOT_WARM ?
+ EFI_RESET_WARM :
+ EFI_RESET_COLD,
+ EFI_SUCCESS, 0, NULL);
+ reboot_type = BOOT_BIOS;
break;
-#ifdef CONFIG_X86_32
case BOOT_BIOS:
machine_real_restart(MRR_BIOS);
- reboot_type = BOOT_KBD;
- break;
-#endif
-
- case BOOT_ACPI:
- acpi_reboot();
- reboot_type = BOOT_KBD;
+ /* We're probably dead after this, but... */
+ reboot_type = BOOT_CF9_SAFE;
break;
- case BOOT_EFI:
- if (efi_enabled)
- efi.reset_system(reboot_mode ?
- EFI_RESET_WARM :
- EFI_RESET_COLD,
- EFI_SUCCESS, 0, NULL);
- reboot_type = BOOT_KBD;
- break;
-
- case BOOT_CF9:
+ case BOOT_CF9_FORCE:
port_cf9_safe = true;
- /* fall through */
+ /* Fall through */
- case BOOT_CF9_COND:
+ case BOOT_CF9_SAFE:
if (port_cf9_safe) {
- u8 cf9 = inb(0xcf9) & ~6;
+ u8 reboot_code = reboot_mode == REBOOT_WARM ? 0x06 : 0x0E;
+ u8 cf9 = inb(0xcf9) & ~reboot_code;
outb(cf9|2, 0xcf9); /* Request hard reset */
udelay(50);
- outb(cf9|6, 0xcf9); /* Actually do the reset */
+ /* Actually do the reset */
+ outb(cf9|reboot_code, 0xcf9);
udelay(50);
}
+ reboot_type = BOOT_TRIPLE;
+ break;
+
+ case BOOT_TRIPLE:
+ load_idt(&no_idt);
+ __asm__ __volatile__("int3");
+
+ /* We're probably dead after this, but... */
reboot_type = BOOT_KBD;
break;
}
@@ -640,37 +574,33 @@ static void native_machine_emergency_restart(void)
void native_machine_shutdown(void)
{
/* Stop the cpus and apics */
-#ifdef CONFIG_SMP
-
- /* The boot cpu is always logical cpu 0 */
- int reboot_cpu_id = 0;
-
-#ifdef CONFIG_X86_32
- /* See if there has been given a command line override */
- if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&
- cpu_online(reboot_cpu))
- reboot_cpu_id = reboot_cpu;
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * Disabling IO APIC before local APIC is a workaround for
+ * erratum AVR31 in "Intel Atom Processor C2000 Product Family
+ * Specification Update". In this situation, interrupts that target
+ * a Logical Processor whose Local APIC is either in the process of
+ * being hardware disabled or software disabled are neither delivered
+ * nor discarded. When this erratum occurs, the processor may hang.
+ *
+ * Even without the erratum, it still makes sense to quiet IO APIC
+ * before disabling Local APIC.
+ */
+ disable_IO_APIC();
#endif
- /* Make certain the cpu I'm about to reboot on is online */
- if (!cpu_online(reboot_cpu_id))
- reboot_cpu_id = smp_processor_id();
-
- /* Make certain I only run on the appropriate processor */
- set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
-
- /* O.K Now that I'm on the appropriate processor,
- * stop all of the others.
+#ifdef CONFIG_SMP
+ /*
+ * Stop all of the others. Also disable the local irq to
+ * not receive the per-cpu timer interrupt which may trigger
+ * scheduler's load balance.
*/
+ local_irq_disable();
stop_other_cpus();
#endif
lapic_shutdown();
-#ifdef CONFIG_X86_IO_APIC
- disable_IO_APIC();
-#endif
-
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
@@ -688,7 +618,7 @@ static void __machine_emergency_restart(int emergency)
static void native_machine_restart(char *__unused)
{
- printk("machine restart\n");
+ pr_notice("machine restart\n");
if (!reboot_force)
machine_shutdown();
@@ -697,12 +627,11 @@ static void native_machine_restart(char *__unused)
static void native_machine_halt(void)
{
- /* stop other cpus and apics */
+ /* Stop other cpus and apics */
machine_shutdown();
tboot_shutdown(TB_SHUTDOWN_HALT);
- /* stop this cpu */
stop_this_cpu(NULL);
}
@@ -713,7 +642,7 @@ static void native_machine_power_off(void)
machine_shutdown();
pm_power_off();
}
- /* a fallback in case there is no PM info available */
+ /* A fallback in case there is no PM info available */
tboot_shutdown(TB_SHUTDOWN_HALT);
}
@@ -775,7 +704,8 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
cpu = raw_smp_processor_id();
- /* Don't do anything if this handler is invoked on crashing cpu.
+ /*
+ * Don't do anything if this handler is invoked on crashing cpu.
* Otherwise, system will completely hang. Crashing cpu can get
* an NMI if system was initially booted with nmi_watchdog parameter.
*/
@@ -799,7 +729,8 @@ static void smp_send_nmi_allbutself(void)
apic->send_IPI_allbutself(NMI_VECTOR);
}
-/* Halt all other CPUs, calling the specified function on each of them
+/*
+ * Halt all other CPUs, calling the specified function on each of them
*
* This function can be used to halt all other CPUs on crash
* or emergency reboot time. The function passed as parameter
@@ -810,7 +741,7 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
unsigned long msecs;
local_irq_disable();
- /* Make a note of crashing cpu. Will be used in NMI callback.*/
+ /* Make a note of crashing cpu. Will be used in NMI callback. */
crashing_cpu = safe_smp_processor_id();
shootdown_callback = callback;
@@ -819,8 +750,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
/* Would it be better to replace the trap vector here? */
if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
NMI_FLAG_FIRST, "crash"))
- return; /* return what? */
- /* Ensure the new callback function is set before sending
+ return; /* Return what? */
+ /*
+ * Ensure the new callback function is set before sending
* out the NMI
*/
wmb();
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
deleted file mode 100644
index 1d5c46df0d7..00000000000
--- a/arch/x86/kernel/reboot_32.S
+++ /dev/null
@@ -1,135 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/segment.h>
-#include <asm/page_types.h>
-
-/*
- * The following code and data reboots the machine by switching to real
- * mode and jumping to the BIOS reset entry point, as if the CPU has
- * really been reset. The previous version asked the keyboard
- * controller to pulse the CPU reset line, which is more thorough, but
- * doesn't work with at least one type of 486 motherboard. It is easy
- * to stop this code working; hence the copious comments.
- *
- * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax.
- */
- .section ".x86_trampoline","a"
- .balign 16
- .code32
-ENTRY(machine_real_restart_asm)
-r_base = .
- /* Get our own relocated address */
- call 1f
-1: popl %ebx
- subl $(1b - r_base), %ebx
-
- /* Compute the equivalent real-mode segment */
- movl %ebx, %ecx
- shrl $4, %ecx
-
- /* Patch post-real-mode segment jump */
- movw (dispatch_table - r_base)(%ebx,%eax,2),%ax
- movw %ax, (101f - r_base)(%ebx)
- movw %cx, (102f - r_base)(%ebx)
-
- /* Set up the IDT for real mode. */
- lidtl (machine_real_restart_idt - r_base)(%ebx)
-
- /*
- * Set up a GDT from which we can load segment descriptors for real
- * mode. The GDT is not used in real mode; it is just needed here to
- * prepare the descriptors.
- */
- lgdtl (machine_real_restart_gdt - r_base)(%ebx)
-
- /*
- * Load the data segment registers with 16-bit compatible values
- */
- movl $16, %ecx
- movl %ecx, %ds
- movl %ecx, %es
- movl %ecx, %fs
- movl %ecx, %gs
- movl %ecx, %ss
- ljmpl $8, $1f - r_base
-
-/*
- * This is 16-bit protected mode code to disable paging and the cache,
- * switch to real mode and jump to the BIOS reset code.
- *
- * The instruction that switches to real mode by writing to CR0 must be
- * followed immediately by a far jump instruction, which set CS to a
- * valid value for real mode, and flushes the prefetch queue to avoid
- * running instructions that have already been decoded in protected
- * mode.
- *
- * Clears all the flags except ET, especially PG (paging), PE
- * (protected-mode enable) and TS (task switch for coprocessor state
- * save). Flushes the TLB after paging has been disabled. Sets CD and
- * NW, to disable the cache on a 486, and invalidates the cache. This
- * is more like the state of a 486 after reset. I don't know if
- * something else should be done for other chips.
- *
- * More could be done here to set up the registers as if a CPU reset had
- * occurred; hopefully real BIOSs don't assume much. This is not the
- * actual BIOS entry point, anyway (that is at 0xfffffff0).
- *
- * Most of this work is probably excessive, but it is what is tested.
- */
- .code16
-1:
- xorl %ecx, %ecx
- movl %cr0, %eax
- andl $0x00000011, %eax
- orl $0x60000000, %eax
- movl %eax, %cr0
- movl %ecx, %cr3
- movl %cr0, %edx
- andl $0x60000000, %edx /* If no cache bits -> no wbinvd */
- jz 2f
- wbinvd
-2:
- andb $0x10, %al
- movl %eax, %cr0
- .byte 0xea /* ljmpw */
-101: .word 0 /* Offset */
-102: .word 0 /* Segment */
-
-bios:
- ljmpw $0xf000, $0xfff0
-
-apm:
- movw $0x1000, %ax
- movw %ax, %ss
- movw $0xf000, %sp
- movw $0x5307, %ax
- movw $0x0001, %bx
- movw $0x0003, %cx
- int $0x15
-
-END(machine_real_restart_asm)
-
- .balign 16
- /* These must match <asm/reboot.h */
-dispatch_table:
- .word bios - r_base
- .word apm - r_base
-END(dispatch_table)
-
- .balign 16
-machine_real_restart_idt:
- .word 0xffff /* Length - real mode default value */
- .long 0 /* Base - real mode default value */
-END(machine_real_restart_idt)
-
- .balign 16
-ENTRY(machine_real_restart_gdt)
- .quad 0 /* Self-pointer, filled in by PM code */
- .quad 0 /* 16-bit code segment, filled in by PM code */
- /*
- * 16-bit data segment with the selector value 16 = 0x10 and
- * base value 0x100; since this is consistent with real mode
- * semantics we don't have to reload the segments once CR0.PE = 0.
- */
- .quad GDT_ENTRY(0x0093, 0x100, 0xffff)
-END(machine_real_restart_gdt)
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 36818f8ec2b..e13f8e7c22a 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -186,7 +186,7 @@ identity_mapped:
movl CP_PA_PGD(%ebx), %eax
movl %eax, %cr3
movl %cr0, %eax
- orl $(1<<31), %eax
+ orl $X86_CR0_PG, %eax
movl %eax, %cr0
lea PAGE_SIZE(%edi), %esp
movl %edi, %eax
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 7a6f3b3be3c..3fd2c693e47 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -151,21 +151,21 @@ identity_mapped:
testq %r11, %r11
jnz 1f
- xorq %rax, %rax
- xorq %rbx, %rbx
- xorq %rcx, %rcx
- xorq %rdx, %rdx
- xorq %rsi, %rsi
- xorq %rdi, %rdi
- xorq %rbp, %rbp
- xorq %r8, %r8
- xorq %r9, %r9
- xorq %r10, %r9
- xorq %r11, %r11
- xorq %r12, %r12
- xorq %r13, %r13
- xorq %r14, %r14
- xorq %r15, %r15
+ xorl %eax, %eax
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ xorl %edx, %edx
+ xorl %esi, %esi
+ xorl %edi, %edi
+ xorl %ebp, %ebp
+ xorl %r8d, %r8d
+ xorl %r9d, %r9d
+ xorl %r10d, %r10d
+ xorl %r11d, %r11d
+ xorl %r12d, %r12d
+ xorl %r13d, %r13d
+ xorl %r14d, %r14d
+ xorl %r15d, %r15d
ret
@@ -212,8 +212,8 @@ virtual_mapped:
/* Do the copies */
swap_pages:
movq %rdi, %rcx /* Put the page_list in %rcx */
- xorq %rdi, %rdi
- xorq %rsi, %rsi
+ xorl %edi, %edi
+ xorl %esi, %esi
jmp 1f
0: /* top, read another word for the indirection page */
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index af6db6ec5b2..ca9622a25e9 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -12,7 +12,8 @@
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
#include <asm/time.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
+#include <asm/rtc.h>
#ifdef CONFIG_X86_32
/*
@@ -36,74 +37,29 @@ EXPORT_SYMBOL(rtc_lock);
* nowtime is written into the registers of the CMOS clock, it will
* jump to the next second precisely 500 ms later. Check the Motorola
* MC146818A or Dallas DS12887 data sheet for details.
- *
- * BUG: This routine does not handle hour overflow properly; it just
- * sets the minutes. Usually you'll only notice that after reboot!
*/
-int mach_set_rtc_mmss(unsigned long nowtime)
+int mach_set_rtc_mmss(const struct timespec *now)
{
- int real_seconds, real_minutes, cmos_minutes;
- unsigned char save_control, save_freq_select;
- unsigned long flags;
+ unsigned long nowtime = now->tv_sec;
+ struct rtc_time tm;
int retval = 0;
- spin_lock_irqsave(&rtc_lock, flags);
-
- /* tell the clock it's being set */
- save_control = CMOS_READ(RTC_CONTROL);
- CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
-
- /* stop and reset prescaler */
- save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
- CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
-
- cmos_minutes = CMOS_READ(RTC_MINUTES);
- if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
- cmos_minutes = bcd2bin(cmos_minutes);
-
- /*
- * since we're only adjusting minutes and seconds,
- * don't interfere with hour overflow. This avoids
- * messing with unknown time zones but requires your
- * RTC not to be off by more than 15 minutes
- */
- real_seconds = nowtime % 60;
- real_minutes = nowtime / 60;
- /* correct for half hour time zone */
- if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
- real_minutes += 30;
- real_minutes %= 60;
-
- if (abs(real_minutes - cmos_minutes) < 30) {
- if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
- real_seconds = bin2bcd(real_seconds);
- real_minutes = bin2bcd(real_minutes);
- }
- CMOS_WRITE(real_seconds, RTC_SECONDS);
- CMOS_WRITE(real_minutes, RTC_MINUTES);
+ rtc_time_to_tm(nowtime, &tm);
+ if (!rtc_valid_tm(&tm)) {
+ retval = set_rtc_time(&tm);
+ if (retval)
+ printk(KERN_ERR "%s: RTC write failed with error %d\n",
+ __FUNCTION__, retval);
} else {
- printk_once(KERN_NOTICE
- "set_rtc_mmss: can't update from %d to %d\n",
- cmos_minutes, real_minutes);
- retval = -1;
+ printk(KERN_ERR
+ "%s: Invalid RTC value: write of %lx to RTC failed\n",
+ __FUNCTION__, nowtime);
+ retval = -EINVAL;
}
-
- /* The following flags have to be released exactly in this order,
- * otherwise the DS12887 (popular MC146818A clone with integrated
- * battery and quartz) will not reset the oscillator and will not
- * update precisely 500 ms later. You won't find this mentioned in
- * the Dallas Semiconductor data sheets, but who believes data
- * sheets anyway ... -- Markus Kuhn
- */
- CMOS_WRITE(save_control, RTC_CONTROL);
- CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
-
- spin_unlock_irqrestore(&rtc_lock, flags);
-
return retval;
}
-unsigned long mach_get_cmos_time(void)
+void mach_get_cmos_time(struct timespec *now)
{
unsigned int status, year, mon, day, hour, min, sec, century = 0;
unsigned long flags;
@@ -149,11 +105,11 @@ unsigned long mach_get_cmos_time(void)
if (century) {
century = bcd2bin(century);
year += century * 100;
- printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
} else
year += CMOS_YEARS_OFFS;
- return mktime(year, mon, day, hour, min, sec);
+ now->tv_sec = mktime(year, mon, day, hour, min, sec);
+ now->tv_nsec = 0;
}
/* Routines for accessing the CMOS RAM/RTC. */
@@ -181,25 +137,14 @@ EXPORT_SYMBOL(rtc_cmos_write);
int update_persistent_clock(struct timespec now)
{
- return x86_platform.set_wallclock(now.tv_sec);
+ return x86_platform.set_wallclock(&now);
}
/* not static: needed by APM */
void read_persistent_clock(struct timespec *ts)
{
- unsigned long retval;
-
- retval = x86_platform.get_wallclock();
-
- ts->tv_sec = retval;
- ts->tv_nsec = 0;
-}
-
-unsigned long long native_read_tsc(void)
-{
- return __native_read_tsc();
+ x86_platform.get_wallclock(ts);
}
-EXPORT_SYMBOL(native_read_tsc);
static struct resource rtc_resources[] = {
@@ -225,7 +170,7 @@ static struct platform_device rtc_device = {
static __init int add_rtc_cmos(void)
{
#ifdef CONFIG_PNP
- static const char *ids[] __initconst =
+ static const char * const const ids[] __initconst =
{ "PNP0b00", "PNP0b01", "PNP0b02", };
struct pnp_dev *dev;
struct pnp_id *id;
@@ -244,8 +189,16 @@ static __init int add_rtc_cmos(void)
return 0;
/* Intel MID platforms don't have ioport rtc */
- if (mrst_identify_cpu())
+ if (intel_mid_identify_cpu())
+ return -ENODEV;
+
+#ifdef CONFIG_ACPI
+ if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
+ /* This warning can likely go away again in a year or two. */
+ pr_info("ACPI: not registering RTC platform device\n");
return -ENODEV;
+ }
+#endif
platform_device_register(&rtc_device);
dev_info(&rtc_device.dev,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d7d5099fe87..78a0e629892 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -34,7 +34,6 @@
#include <linux/memblock.h>
#include <linux/seq_file.h>
#include <linux/console.h>
-#include <linux/mca.h>
#include <linux/root_dev.h>
#include <linux/highmem.h>
#include <linux/module.h>
@@ -50,6 +49,7 @@
#include <asm/pci-direct.h>
#include <linux/init_ohci1394_dma.h>
#include <linux/kvm_para.h>
+#include <linux/dma-contiguous.h>
#include <linux/errno.h>
#include <linux/kernel.h>
@@ -68,12 +68,13 @@
#include <linux/percpu.h>
#include <linux/crash_dump.h>
#include <linux/tboot.h>
+#include <linux/jiffies.h>
#include <video/edid.h>
#include <asm/mtrr.h>
#include <asm/apic.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
#include <asm/e820.h>
#include <asm/mpspec.h>
#include <asm/setup.h>
@@ -81,7 +82,6 @@
#include <asm/timer.h>
#include <asm/i8259.h>
#include <asm/sections.h>
-#include <asm/dmi.h>
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/setup_arch.h>
@@ -90,7 +90,6 @@
#include <asm/processor.h>
#include <asm/bugs.h>
-#include <asm/system.h>
#include <asm/vsyscall.h>
#include <asm/cpu.h>
#include <asm/desc.h>
@@ -108,17 +107,16 @@
#include <asm/topology.h>
#include <asm/apicdef.h>
#include <asm/amd_nb.h>
-#ifdef CONFIG_X86_64
-#include <asm/numa_64.h>
-#endif
#include <asm/mce.h>
#include <asm/alternative.h>
#include <asm/prom.h>
/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
+ * max_low_pfn_mapped: highest direct mapped pfn under 4GB
+ * max_pfn_mapped: highest direct mapped pfn over 4GB
+ *
+ * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
+ * represented by pfn_mapped
*/
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
@@ -143,11 +141,7 @@ int default_check_phys_apicid_present(int phys_apicid)
}
#endif
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-struct boot_params __initdata boot_params;
-#else
struct boot_params boot_params;
-#endif
/*
* Machine setup..
@@ -176,16 +170,14 @@ static struct resource bss_resource = {
#ifdef CONFIG_X86_32
/* cpu data as detected by the assembly code in head.S */
-struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
+struct cpuinfo_x86 new_cpu_data = {
+ .wp_works_ok = -1,
+};
/* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {
+ .wp_works_ok = -1,
+};
EXPORT_SYMBOL(boot_cpu_data);
-static void set_mca_bus(int x)
-{
-#ifdef CONFIG_MCA
- MCA_bus = x;
-#endif
-}
unsigned int def_to_bigsmp;
@@ -214,9 +206,9 @@ EXPORT_SYMBOL(boot_cpu_data);
#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
-unsigned long mmu_cr4_features;
+__visible unsigned long mmu_cr4_features;
#else
-unsigned long mmu_cr4_features = X86_CR4_PAE;
+__visible unsigned long mmu_cr4_features = X86_CR4_PAE;
#endif
/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
@@ -286,18 +278,7 @@ void * __init extend_brk(size_t size, size_t align)
return ret;
}
-#ifdef CONFIG_X86_64
-static void __init init_gbpages(void)
-{
- if (direct_gbpages && cpu_has_gbpages)
- printk(KERN_INFO "Using GB pages for direct mapping\n");
- else
- direct_gbpages = 0;
-}
-#else
-static inline void init_gbpages(void)
-{
-}
+#ifdef CONFIG_X86_32
static void __init cleanup_highmap(void)
{
}
@@ -306,57 +287,64 @@ static void __init cleanup_highmap(void)
static void __init reserve_brk(void)
{
if (_brk_end > _brk_start)
- memblock_reserve(__pa(_brk_start),
- __pa(_brk_end) - __pa(_brk_start));
+ memblock_reserve(__pa_symbol(_brk_start),
+ _brk_end - _brk_start);
/* Mark brk area as locked down and no longer taking any
new allocations */
_brk_start = 0;
}
+u64 relocated_ramdisk;
+
#ifdef CONFIG_BLK_DEV_INITRD
+static u64 __init get_ramdisk_image(void)
+{
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+
+ ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
+
+ return ramdisk_image;
+}
+static u64 __init get_ramdisk_size(void)
+{
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+
+ ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
+
+ return ramdisk_size;
+}
+
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
static void __init relocate_initrd(void)
{
/* Assume only end is not page aligned */
- u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
u64 area_size = PAGE_ALIGN(ramdisk_size);
- u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
- u64 ramdisk_here;
unsigned long slop, clen, mapaddr;
char *p, *q;
- /* We need to move the initrd down into lowmem */
- ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
- PAGE_SIZE);
+ /* We need to move the initrd down into directly mapped mem */
+ relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+ area_size, PAGE_SIZE);
- if (!ramdisk_here)
+ if (!relocated_ramdisk)
panic("Cannot find place for new RAMDISK of size %lld\n",
- ramdisk_size);
+ ramdisk_size);
- /* Note: this includes all the lowmem currently occupied by
+ /* Note: this includes all the mem currently occupied by
the initrd, we rely on that fact to keep the data intact. */
- memblock_reserve(ramdisk_here, area_size);
- initrd_start = ramdisk_here + PAGE_OFFSET;
+ memblock_reserve(relocated_ramdisk, area_size);
+ initrd_start = relocated_ramdisk + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
- printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
- ramdisk_here, ramdisk_here + ramdisk_size);
+ printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
+ relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
q = (char *)initrd_start;
- /* Copy any lowmem portion of the initrd */
- if (ramdisk_image < end_of_lowmem) {
- clen = end_of_lowmem - ramdisk_image;
- p = (char *)__va(ramdisk_image);
- memcpy(q, p, clen);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
-
- /* Copy the highmem portion of the initrd */
+ /* Copy the initrd */
while (ramdisk_size) {
slop = ramdisk_image & ~PAGE_MASK;
clen = ramdisk_size;
@@ -370,22 +358,35 @@ static void __init relocate_initrd(void)
ramdisk_image += clen;
ramdisk_size -= clen;
}
- /* high pages is not converted by early_res_to_bootmem */
- ramdisk_image = boot_params.hdr.ramdisk_image;
- ramdisk_size = boot_params.hdr.ramdisk_size;
- printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
- " %08llx - %08llx\n",
+
+ ramdisk_image = get_ramdisk_image();
+ ramdisk_size = get_ramdisk_size();
+ printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
+ " [mem %#010llx-%#010llx]\n",
ramdisk_image, ramdisk_image + ramdisk_size - 1,
- ramdisk_here, ramdisk_here + ramdisk_size - 1);
+ relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
}
+static void __init early_reserve_initrd(void)
+{
+ /* Assume only end is not page aligned */
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
+ u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
+
+ if (!boot_params.hdr.type_of_loader ||
+ !ramdisk_image || !ramdisk_size)
+ return; /* No initrd provided by bootloader */
+
+ memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
+}
static void __init reserve_initrd(void)
{
/* Assume only end is not page aligned */
- u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
+ u64 mapped_size;
if (!boot_params.hdr.type_of_loader ||
!ramdisk_image || !ramdisk_size)
@@ -393,23 +394,18 @@ static void __init reserve_initrd(void)
initrd_start = 0;
- if (ramdisk_size >= (end_of_lowmem>>1)) {
- memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
- printk(KERN_ERR "initrd too large to handle, "
- "disabling initrd\n");
- return;
- }
-
- printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
- ramdisk_end);
+ mapped_size = memblock_mem_size(max_pfn_mapped);
+ if (ramdisk_size >= (mapped_size>>1))
+ panic("initrd too large to handle, "
+ "disabling initrd (%lld needed, %lld available)\n",
+ ramdisk_size, mapped_size>>1);
+ printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
+ ramdisk_end - 1);
- if (ramdisk_end <= end_of_lowmem) {
- /* All in lowmem, easy case */
- /*
- * don't need to reserve again, already reserved early
- * in i386_start_kernel
- */
+ if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
+ PFN_DOWN(ramdisk_end))) {
+ /* All are mapped, easy case */
initrd_start = ramdisk_image + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
return;
@@ -420,6 +416,9 @@ static void __init reserve_initrd(void)
memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
}
#else
+static void __init early_reserve_initrd(void)
+{
+}
static void __init reserve_initrd(void)
{
}
@@ -428,36 +427,34 @@ static void __init reserve_initrd(void)
static void __init parse_setup_data(void)
{
struct setup_data *data;
- u64 pa_data;
+ u64 pa_data, pa_next;
- if (boot_params.hdr.version < 0x0209)
- return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
- u32 data_len, map_len;
+ u32 data_len, map_len, data_type;
map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
(u64)sizeof(struct setup_data));
data = early_memremap(pa_data, map_len);
data_len = data->len + sizeof(struct setup_data);
- if (data_len > map_len) {
- early_iounmap(data, map_len);
- data = early_memremap(pa_data, data_len);
- map_len = data_len;
- }
+ data_type = data->type;
+ pa_next = data->next;
+ early_iounmap(data, map_len);
- switch (data->type) {
+ switch (data_type) {
case SETUP_E820_EXT:
- parse_e820_ext(data);
+ parse_e820_ext(pa_data, data_len);
break;
case SETUP_DTB:
add_dtb(pa_data);
break;
+ case SETUP_EFI:
+ parse_efi_setup(pa_data, data_len);
+ break;
default:
break;
}
- pa_data = data->next;
- early_iounmap(data, map_len);
+ pa_data = pa_next;
}
}
@@ -467,8 +464,6 @@ static void __init e820_reserve_setup_data(void)
u64 pa_data;
int found = 0;
- if (boot_params.hdr.version < 0x0209)
- return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
@@ -492,8 +487,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
struct setup_data *data;
u64 pa_data;
- if (boot_params.hdr.version < 0x0209)
- return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
@@ -509,54 +502,107 @@ static void __init memblock_x86_reserve_range_setup_data(void)
#ifdef CONFIG_KEXEC
-static inline unsigned long long get_total_mem(void)
-{
- unsigned long long total;
-
- total = max_pfn - min_low_pfn;
-
- return total << PAGE_SHIFT;
-}
-
/*
* Keep the crash kernel below this limit. On 32 bits earlier kernels
* would limit the kernel to the low 512 MiB due to mapping restrictions.
- * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
- * limit once kexec-tools are fixed.
+ * On 64bit, old kexec-tools need to under 896MiB.
*/
#ifdef CONFIG_X86_32
-# define CRASH_KERNEL_ADDR_MAX (512 << 20)
+# define CRASH_KERNEL_ADDR_LOW_MAX (512 << 20)
+# define CRASH_KERNEL_ADDR_HIGH_MAX (512 << 20)
#else
-# define CRASH_KERNEL_ADDR_MAX (896 << 20)
+# define CRASH_KERNEL_ADDR_LOW_MAX (896UL<<20)
+# define CRASH_KERNEL_ADDR_HIGH_MAX MAXMEM
+#endif
+
+static void __init reserve_crashkernel_low(void)
+{
+#ifdef CONFIG_X86_64
+ const unsigned long long alignment = 16<<20; /* 16M */
+ unsigned long long low_base = 0, low_size = 0;
+ unsigned long total_low_mem;
+ unsigned long long base;
+ bool auto_set = false;
+ int ret;
+
+ total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT));
+ /* crashkernel=Y,low */
+ ret = parse_crashkernel_low(boot_command_line, total_low_mem,
+ &low_size, &base);
+ if (ret != 0) {
+ /*
+ * two parts from lib/swiotlb.c:
+ * swiotlb size: user specified with swiotlb= or default.
+ * swiotlb overflow buffer: now is hardcoded to 32k.
+ * We round it to 8M for other buffers that
+ * may need to stay low too.
+ */
+ low_size = swiotlb_size_or_default() + (8UL<<20);
+ auto_set = true;
+ } else {
+ /* passed with crashkernel=0,low ? */
+ if (!low_size)
+ return;
+ }
+
+ low_base = memblock_find_in_range(low_size, (1ULL<<32),
+ low_size, alignment);
+
+ if (!low_base) {
+ if (!auto_set)
+ pr_info("crashkernel low reservation failed - No suitable area found.\n");
+
+ return;
+ }
+
+ memblock_reserve(low_base, low_size);
+ pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n",
+ (unsigned long)(low_size >> 20),
+ (unsigned long)(low_base >> 20),
+ (unsigned long)(total_low_mem >> 20));
+ crashk_low_res.start = low_base;
+ crashk_low_res.end = low_base + low_size - 1;
+ insert_resource(&iomem_resource, &crashk_low_res);
#endif
+}
static void __init reserve_crashkernel(void)
{
+ const unsigned long long alignment = 16<<20; /* 16M */
unsigned long long total_mem;
unsigned long long crash_size, crash_base;
+ bool high = false;
int ret;
- total_mem = get_total_mem();
+ total_mem = memblock_phys_mem_size();
+ /* crashkernel=XM */
ret = parse_crashkernel(boot_command_line, total_mem,
&crash_size, &crash_base);
- if (ret != 0 || crash_size <= 0)
- return;
+ if (ret != 0 || crash_size <= 0) {
+ /* crashkernel=X,high */
+ ret = parse_crashkernel_high(boot_command_line, total_mem,
+ &crash_size, &crash_base);
+ if (ret != 0 || crash_size <= 0)
+ return;
+ high = true;
+ }
/* 0 means: find the address automatically */
if (crash_base <= 0) {
- const unsigned long long alignment = 16<<20; /* 16M */
-
/*
* kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
*/
crash_base = memblock_find_in_range(alignment,
- CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
+ high ? CRASH_KERNEL_ADDR_HIGH_MAX :
+ CRASH_KERNEL_ADDR_LOW_MAX,
+ crash_size, alignment);
if (!crash_base) {
pr_info("crashkernel reservation failed - No suitable area found.\n");
return;
}
+
} else {
unsigned long long start;
@@ -578,6 +624,9 @@ static void __init reserve_crashkernel(void)
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
insert_resource(&iomem_resource, &crashk_res);
+
+ if (crash_base >= (1ULL<<32))
+ reserve_crashkernel_low();
}
#else
static void __init reserve_crashkernel(void)
@@ -628,7 +677,82 @@ static __init void reserve_ibft_region(void)
memblock_reserve(addr, size);
}
-static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
+static bool __init snb_gfx_workaround_needed(void)
+{
+#ifdef CONFIG_PCI
+ int i;
+ u16 vendor, devid;
+ static const __initconst u16 snb_ids[] = {
+ 0x0102,
+ 0x0112,
+ 0x0122,
+ 0x0106,
+ 0x0116,
+ 0x0126,
+ 0x010a,
+ };
+
+ /* Assume no if something weird is going on with PCI */
+ if (!early_pci_allowed())
+ return false;
+
+ vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID);
+ if (vendor != 0x8086)
+ return false;
+
+ devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
+ for (i = 0; i < ARRAY_SIZE(snb_ids); i++)
+ if (devid == snb_ids[i])
+ return true;
+#endif
+
+ return false;
+}
+
+/*
+ * Sandy Bridge graphics has trouble with certain ranges, exclude
+ * them from allocation.
+ */
+static void __init trim_snb_memory(void)
+{
+ static const __initconst unsigned long bad_pages[] = {
+ 0x20050000,
+ 0x20110000,
+ 0x20130000,
+ 0x20138000,
+ 0x40004000,
+ };
+ int i;
+
+ if (!snb_gfx_workaround_needed())
+ return;
+
+ printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
+
+ /*
+ * Reserve all memory below the 1 MB mark that has not
+ * already been reserved.
+ */
+ memblock_reserve(0, 1<<20);
+
+ for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
+ if (memblock_reserve(bad_pages[i], PAGE_SIZE))
+ printk(KERN_WARNING "failed to reserve 0x%08lx\n",
+ bad_pages[i]);
+ }
+}
+
+/*
+ * Here we put platform-specific memory range workarounds, i.e.
+ * memory known to be corrupt or otherwise in need to be reserved on
+ * specific platforms.
+ *
+ * If this gets used more widely it could use a real dispatch mechanism.
+ */
+static void __init trim_platform_memory_ranges(void)
+{
+ trim_snb_memory();
+}
static void __init trim_bios_range(void)
{
@@ -641,8 +765,7 @@ static void __init trim_bios_range(void)
* since some BIOSes are known to corrupt low memory. See the
* Kconfig help text for X86_RESERVE_LOW.
*/
- e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
- E820_RAM, E820_RESERVED);
+ e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
/*
* special case: Some BIOSen report the PC BIOS
@@ -650,9 +773,33 @@ static void __init trim_bios_range(void)
* take them out.
*/
e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
+
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
}
+/* called before trim_bios_range() to spare extra sanitize */
+static void __init e820_add_kernel_range(void)
+{
+ u64 start = __pa_symbol(_text);
+ u64 size = __pa_symbol(_end) - start;
+
+ /*
+ * Complain if .text .data and .bss are not marked as E820_RAM and
+ * attempt to fix it by adding the range. We may have a confused BIOS,
+ * or the user may have used memmap=exactmap or memmap=xxM$yyM to
+ * exclude kernel range. If we really are running on top non-RAM,
+ * we will crash later anyways.
+ */
+ if (e820_all_mapped(start, start + size, E820_RAM))
+ return;
+
+ pr_warn(".text .data .bss are not marked as E820_RAM!\n");
+ e820_remove_range(start, size, E820_RAM, 0);
+ e820_add_region(start, size, E820_RAM);
+}
+
+static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
+
static int __init parse_reservelow(char *p)
{
unsigned long long size;
@@ -675,6 +822,25 @@ static int __init parse_reservelow(char *p)
early_param("reservelow", parse_reservelow);
+static void __init trim_low_memory_range(void)
+{
+ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+}
+
+/*
+ * Dump out kernel offset information on panic.
+ */
+static int
+dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
+{
+ pr_emerg("Kernel Offset: 0x%lx from 0x%lx "
+ "(relocation range: 0x%lx-0x%lx)\n",
+ (unsigned long)&_text - __START_KERNEL, __START_KERNEL,
+ __START_KERNEL_map, MODULES_VADDR-1);
+
+ return 0;
+}
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -690,9 +856,19 @@ early_param("reservelow", parse_reservelow);
void __init setup_arch(char **cmdline_p)
{
+ memblock_reserve(__pa_symbol(_text),
+ (unsigned long)__bss_stop - (unsigned long)_text);
+
+ early_reserve_initrd();
+
+ /*
+ * At this point everything still needed from the boot loader
+ * or BIOS or kernel text should be early reserved or marked not
+ * RAM in e820. All other memory is free game.
+ */
+
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
- visws_early_detect();
/*
* copy kernel address range established so far and switch
@@ -727,7 +903,6 @@ void __init setup_arch(char **cmdline_p)
apm_info.bios = boot_params.apm_bios_info;
ist_info = boot_params.ist_info;
if (boot_params.sys_desc_table.length != 0) {
- set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
machine_id = boot_params.sys_desc_table.table[0];
machine_submodel_id = boot_params.sys_desc_table.table[1];
BIOS_revision = boot_params.sys_desc_table.table[2];
@@ -749,10 +924,16 @@ void __init setup_arch(char **cmdline_p)
#endif
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- EFI_LOADER_SIGNATURE, 4)) {
- efi_enabled = 1;
- efi_memblock_x86_reserve_range();
+ "EL32", 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ "EL64", 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
}
+
+ if (efi_enabled(EFI_BOOT))
+ efi_memblock_x86_reserve_range();
#endif
x86_init.oem.arch_setup();
@@ -760,8 +941,6 @@ void __init setup_arch(char **cmdline_p)
iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
setup_memory_map();
parse_setup_data();
- /* update the e820_saved too */
- e820_reserve_setup_data();
copy_edd();
@@ -772,12 +951,12 @@ void __init setup_arch(char **cmdline_p)
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = _brk_end;
- code_resource.start = virt_to_phys(_text);
- code_resource.end = virt_to_phys(_etext)-1;
- data_resource.start = virt_to_phys(_etext);
- data_resource.end = virt_to_phys(_edata)-1;
- bss_resource.start = virt_to_phys(&__bss_start);
- bss_resource.end = virt_to_phys(&__bss_stop)-1;
+ code_resource.start = __pa_symbol(_text);
+ code_resource.end = __pa_symbol(_etext)-1;
+ data_resource.start = __pa_symbol(_etext);
+ data_resource.end = __pa_symbol(_edata)-1;
+ bss_resource.start = __pa_symbol(__bss_start);
+ bss_resource.end = __pa_symbol(__bss_stop)-1;
#ifdef CONFIG_CMDLINE_BOOL
#ifdef CONFIG_CMDLINE_OVERRIDE
@@ -823,12 +1002,16 @@ void __init setup_arch(char **cmdline_p)
early_dump_pci_devices();
#endif
+ /* update the e820_saved too */
+ e820_reserve_setup_data();
finish_e820_parsing();
- if (efi_enabled)
+ if (efi_enabled(EFI_BOOT))
efi_init();
dmi_scan_machine();
+ dmi_memdev_walk();
+ dmi_set_dump_stack_arch_desc();
/*
* VMware detection requires dmi to be available, so this
@@ -843,6 +1026,7 @@ void __init setup_arch(char **cmdline_p)
insert_resource(&iomem_resource, &data_resource);
insert_resource(&iomem_resource, &bss_resource);
+ e820_add_kernel_range();
trim_bios_range();
#ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
@@ -871,8 +1055,6 @@ void __init setup_arch(char **cmdline_p)
/* max_low_pfn get updated here */
find_low_pfn_range();
#else
- num_physpages = max_pfn;
-
check_x2apic();
/* How many end-of-memory variables you have, grandma! */
@@ -892,6 +1074,8 @@ void __init setup_arch(char **cmdline_p)
reserve_ibft_region();
+ early_alloc_pgt_buf();
+
/*
* Need to conclude brk, before memblock_x86_fill()
* it could use memblock_find_in_range, could overlap with
@@ -901,14 +1085,14 @@ void __init setup_arch(char **cmdline_p)
cleanup_highmap();
- memblock.current_limit = get_max_mapped();
+ memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
/*
* The EFI specification says that boot service code won't be called
* after ExitBootServices(). This is, in fact, a lie.
*/
- if (efi_enabled)
+ if (efi_enabled(EFI_MEMMAP))
efi_reserve_boot_services();
/* preallocate 4k for mptable mpc */
@@ -918,26 +1102,24 @@ void __init setup_arch(char **cmdline_p)
setup_bios_corruption_check();
#endif
- printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
- max_pfn_mapped<<PAGE_SHIFT);
+#ifdef CONFIG_X86_32
+ printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
+ (max_pfn_mapped<<PAGE_SHIFT) - 1);
+#endif
- setup_trampolines();
+ reserve_real_mode();
- init_gbpages();
+ trim_platform_memory_ranges();
+ trim_low_memory_range();
- /* max_pfn_mapped is updated here */
- max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
- max_pfn_mapped = max_low_pfn_mapped;
+ init_mem_mapping();
-#ifdef CONFIG_X86_64
- if (max_pfn > max_low_pfn) {
- max_pfn_mapped = init_memory_mapping(1UL<<32,
- max_pfn<<PAGE_SHIFT);
- /* can we preseve max_low_pfn ?*/
- max_low_pfn = max_pfn;
- }
-#endif
- memblock.current_limit = get_max_mapped();
+ early_trap_pf_init();
+
+ setup_real_mode();
+
+ memblock_set_current_limit(get_max_mapped());
+ dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
/*
* NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -952,7 +1134,9 @@ void __init setup_arch(char **cmdline_p)
reserve_initrd();
- reserve_crashkernel();
+#if defined(CONFIG_ACPI) && defined(CONFIG_BLK_DEV_INITRD)
+ acpi_initrd_override((void *)initrd_start, initrd_end - initrd_start);
+#endif
vsmp_init();
@@ -966,19 +1150,26 @@ void __init setup_arch(char **cmdline_p)
early_acpi_boot_init();
initmem_init();
+
+ /*
+ * Reserve memory for crash kernel after SRAT is parsed so that it
+ * won't consume hotpluggable memory.
+ */
+ reserve_crashkernel();
+
memblock_find_dma_reserve();
-#ifdef CONFIG_KVM_CLOCK
+#ifdef CONFIG_KVM_GUEST
kvmclock_init();
#endif
- x86_init.paging.pagetable_setup_start(swapper_pg_dir);
- paging_init();
- x86_init.paging.pagetable_setup_done(swapper_pg_dir);
+ x86_init.paging.pagetable_init();
if (boot_cpu_data.cpuid_level >= 0) {
/* A CPU has %cr4 if and only if it has CPUID */
mmu_cr4_features = read_cr4();
+ if (trampoline_cr4_features)
+ *trampoline_cr4_features = mmu_cr4_features;
}
#ifdef CONFIG_X86_32
@@ -1016,7 +1207,8 @@ void __init setup_arch(char **cmdline_p)
init_cpu_to_node();
init_apic_mappings();
- ioapic_and_gsi_init();
+ if (x86_io_apic_ops.init)
+ x86_io_apic_ops.init();
kvm_guest_init();
@@ -1029,7 +1221,7 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
- if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
conswitchp = &vga_con;
#elif defined(CONFIG_DUMMY_CONSOLE)
conswitchp = &dummy_con;
@@ -1039,11 +1231,16 @@ void __init setup_arch(char **cmdline_p)
x86_init.timers.wallclock_init();
- x86_platform.wallclock_init();
-
mcheck_init();
arch_init_ideal_nops();
+
+ register_refined_jiffies(CLOCK_TICK_RATE);
+
+#ifdef CONFIG_EFI
+ if (efi_enabled(EFI_BOOT))
+ efi_apply_memmap_quirks();
+#endif
}
#ifdef CONFIG_X86_32
@@ -1062,3 +1259,15 @@ void __init i386_reserve_resources(void)
}
#endif /* CONFIG_X86_32 */
+
+static struct notifier_block kernel_offset_notifier = {
+ .notifier_call = dump_kernel_offset
+};
+
+static int __init register_kernel_offset_dumper(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &kernel_offset_notifier);
+ return 0;
+}
+__initcall(register_kernel_offset_dumper);
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 71f4727da37..5cdff035774 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -21,7 +21,7 @@
#include <asm/cpu.h>
#include <asm/stackprotector.h>
-DEFINE_PER_CPU(int, cpu_number);
+DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);
#ifdef CONFIG_X86_64
@@ -185,10 +185,22 @@ void __init setup_per_cpu_areas(void)
#endif
rc = -EINVAL;
if (pcpu_chosen_fc != PCPU_FC_PAGE) {
- const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
const size_t dyn_size = PERCPU_MODULE_RESERVE +
PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+ size_t atom_size;
+ /*
+ * On 64bit, use PMD_SIZE for atom_size so that embedded
+ * percpu areas are aligned to PMD. This, in the future,
+ * can also allow using PMD mappings in vmalloc area. Use
+ * PAGE_SIZE on 32bit as vmalloc space is highly contended
+ * and large vmalloc area allocs can easily fail.
+ */
+#ifdef CONFIG_X86_64
+ atom_size = PMD_SIZE;
+#else
+ atom_size = PAGE_SIZE;
+#endif
rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
dyn_size, atom_size,
pcpu_cpu_distance,
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 46a01bdc27e..2851d63c120 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,30 +6,36 @@
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
* 2000-2002 x86-64 support by Andi Kleen
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
-#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/wait.h>
-#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/unistd.h>
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/uaccess.h>
#include <linux/user-return-notifier.h>
+#include <linux/uprobes.h>
+#include <linux/context_tracking.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/vdso.h>
#include <asm/mce.h>
+#include <asm/sighandling.h>
#ifdef CONFIG_X86_64
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
+#include <asm/sys_ia32.h>
#endif /* CONFIG_X86_64 */
#include <asm/syscall.h>
@@ -37,19 +43,6 @@
#include <asm/sigframe.h>
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
- X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
- X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
- X86_EFLAGS_CF)
-
-#ifdef CONFIG_X86_32
-# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF)
-#else
-# define FIX_EFLAGS __FIX_EFLAGS
-#endif
-
#define COPY(x) do { \
get_user_ex(regs->x, &sc->x); \
} while (0)
@@ -68,9 +61,8 @@
regs->seg = GET_SEG(seg) | 3; \
} while (0)
-static int
-restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
- unsigned long *pax)
+int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
+ unsigned long *pax)
{
void __user *buf;
unsigned int tmpflags;
@@ -117,17 +109,17 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
regs->orig_ax = -1; /* disable syscall checks */
get_user_ex(buf, &sc->fpstate);
- err |= restore_i387_xstate(buf);
get_user_ex(*pax, &sc->ax);
} get_user_catch(err);
+ err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
+
return err;
}
-static int
-setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
- struct pt_regs *regs, unsigned long mask)
+int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
+ struct pt_regs *regs, unsigned long mask)
{
int err = 0;
@@ -159,7 +151,7 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
put_user_ex(regs->r15, &sc->r15);
#endif /* CONFIG_X86_64 */
- put_user_ex(current->thread.trap_no, &sc->trapno);
+ put_user_ex(current->thread.trap_nr, &sc->trapno);
put_user_ex(current->thread.error_code, &sc->err);
put_user_ex(regs->ip, &sc->ip);
#ifdef CONFIG_X86_32
@@ -210,35 +202,32 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
void __user **fpstate)
{
/* Default to using normal stack */
+ unsigned long math_size = 0;
unsigned long sp = regs->sp;
+ unsigned long buf_fx = 0;
int onsigstack = on_sig_stack(sp);
-#ifdef CONFIG_X86_64
/* redzone */
- sp -= 128;
-#endif /* CONFIG_X86_64 */
+ if (config_enabled(CONFIG_X86_64))
+ sp -= 128;
if (!onsigstack) {
/* This is the X/Open sanctioned signal stack switching. */
if (ka->sa.sa_flags & SA_ONSTACK) {
if (current->sas_ss_size)
sp = current->sas_ss_sp + current->sas_ss_size;
- } else {
-#ifdef CONFIG_X86_32
- /* This is the legacy signal stack switching. */
- if ((regs->ss & 0xffff) != __USER_DS &&
- !(ka->sa.sa_flags & SA_RESTORER) &&
- ka->sa.sa_restorer)
+ } else if (config_enabled(CONFIG_X86_32) &&
+ (regs->ss & 0xffff) != __USER_DS &&
+ !(ka->sa.sa_flags & SA_RESTORER) &&
+ ka->sa.sa_restorer) {
+ /* This is the legacy signal stack switching. */
sp = (unsigned long) ka->sa.sa_restorer;
-#endif /* CONFIG_X86_32 */
}
}
if (used_math()) {
- sp -= sig_xstate_size;
-#ifdef CONFIG_X86_64
- sp = round_down(sp, 64);
-#endif /* CONFIG_X86_64 */
+ sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32),
+ &buf_fx, &math_size);
*fpstate = (void __user *)sp;
}
@@ -251,8 +240,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
if (onsigstack && !likely(on_sig_stack(sp)))
return (void __user *)-1L;
- /* save i387 state */
- if (used_math() && save_i387_xstate(*fpstate) < 0)
+ /* save i387 and extended state */
+ if (used_math() &&
+ save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0)
return (void __user *)-1L;
return (void __user *)sp;
@@ -282,7 +272,7 @@ static const struct {
};
static int
-__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
+__setup_frame(int sig, struct ksignal *ksig, sigset_t *set,
struct pt_regs *regs)
{
struct sigframe __user *frame;
@@ -290,7 +280,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
int err = 0;
void __user *fpstate = NULL;
- frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
+ frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
return -EFAULT;
@@ -308,11 +298,12 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
}
if (current->mm->context.vdso)
- restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);
+ restorer = current->mm->context.vdso +
+ selected_vdso32->sym___kernel_sigreturn;
else
restorer = &frame->retcode;
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
/* Set up to return from userspace. */
err |= __put_user(restorer, &frame->pretcode);
@@ -331,7 +322,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
/* Set up registers for signal handler */
regs->sp = (unsigned long)frame;
- regs->ip = (unsigned long)ka->sa.sa_handler;
+ regs->ip = (unsigned long)ksig->ka.sa.sa_handler;
regs->ax = (unsigned long)sig;
regs->dx = 0;
regs->cx = 0;
@@ -344,7 +335,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
return 0;
}
-static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+static int __setup_rt_frame(int sig, struct ksignal *ksig,
sigset_t *set, struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
@@ -352,7 +343,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
int err = 0;
void __user *fpstate = NULL;
- frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
+ frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
return -EFAULT;
@@ -361,7 +352,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
put_user_ex(sig, &frame->sig);
put_user_ex(&frame->info, &frame->pinfo);
put_user_ex(&frame->uc, &frame->puc);
- err |= copy_siginfo_to_user(&frame->info, info);
/* Create the ucontext. */
if (cpu_has_xsave)
@@ -369,18 +359,13 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
else
put_user_ex(0, &frame->uc.uc_flags);
put_user_ex(0, &frame->uc.uc_link);
- put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- put_user_ex(sas_ss_flags(regs->sp),
- &frame->uc.uc_stack.ss_flags);
- put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
- regs, set->sig[0]);
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+ save_altstack_ex(&frame->uc.uc_stack, regs->sp);
/* Set up to return from userspace. */
- restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
+ restorer = current->mm->context.vdso +
+ selected_vdso32->sym___kernel_rt_sigreturn;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
put_user_ex(restorer, &frame->pretcode);
/*
@@ -392,13 +377,18 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
*/
put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
} put_user_catch(err);
+
+ err |= copy_siginfo_to_user(&frame->info, &ksig->info);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
if (err)
return -EFAULT;
/* Set up registers for signal handler */
regs->sp = (unsigned long)frame;
- regs->ip = (unsigned long)ka->sa.sa_handler;
+ regs->ip = (unsigned long)ksig->ka.sa.sa_handler;
regs->ax = (unsigned long)sig;
regs->dx = (unsigned long)&frame->info;
regs->cx = (unsigned long)&frame->uc;
@@ -411,21 +401,20 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
return 0;
}
#else /* !CONFIG_X86_32 */
-static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+static int __setup_rt_frame(int sig, struct ksignal *ksig,
sigset_t *set, struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
void __user *fp = NULL;
int err = 0;
- struct task_struct *me = current;
- frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);
+ frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp);
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
return -EFAULT;
- if (ka->sa.sa_flags & SA_SIGINFO) {
- if (copy_siginfo_to_user(&frame->info, info))
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ if (copy_siginfo_to_user(&frame->info, &ksig->info))
return -EFAULT;
}
@@ -436,24 +425,22 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
else
put_user_ex(0, &frame->uc.uc_flags);
put_user_ex(0, &frame->uc.uc_link);
- put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- put_user_ex(sas_ss_flags(regs->sp),
- &frame->uc.uc_stack.ss_flags);
- put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+ save_altstack_ex(&frame->uc.uc_stack, regs->sp);
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
/* x86-64 should always use SA_RESTORER. */
- if (ka->sa.sa_flags & SA_RESTORER) {
- put_user_ex(ka->sa.sa_restorer, &frame->pretcode);
+ if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode);
} else {
/* could use a vstub here */
err |= -EFAULT;
}
} put_user_catch(err);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
if (err)
return -EFAULT;
@@ -466,7 +453,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
next argument after the signal number on the stack. */
regs->si = (unsigned long)&frame->info;
regs->dx = (unsigned long)&frame->uc;
- regs->ip = (unsigned long) ka->sa.sa_handler;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
regs->sp = (unsigned long)frame;
@@ -478,87 +465,79 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
}
#endif /* CONFIG_X86_32 */
-#ifdef CONFIG_X86_32
-/*
- * Atomically swap in the new signal mask, and wait for a signal.
- */
-asmlinkage int
-sys_sigsuspend(int history0, int history1, old_sigset_t mask)
+static int x32_setup_rt_frame(struct ksignal *ksig,
+ compat_sigset_t *set,
+ struct pt_regs *regs)
{
- sigset_t blocked;
-
- current->saved_sigmask = current->blocked;
-
- mask &= _BLOCKABLE;
- siginitset(&blocked, mask);
- set_current_blocked(&blocked);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
-
- set_restore_sigmask();
- return -ERESTARTNOHAND;
-}
+#ifdef CONFIG_X86_X32_ABI
+ struct rt_sigframe_x32 __user *frame;
+ void __user *restorer;
+ int err = 0;
+ void __user *fpstate = NULL;
-asmlinkage int
-sys_sigaction(int sig, const struct old_sigaction __user *act,
- struct old_sigaction __user *oact)
-{
- struct k_sigaction new_ka, old_ka;
- int ret = 0;
+ frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
- if (act) {
- old_sigset_t mask;
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
- if (!access_ok(VERIFY_READ, act, sizeof(*act)))
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ if (copy_siginfo_to_user32(&frame->info, &ksig->info))
return -EFAULT;
+ }
- get_user_try {
- get_user_ex(new_ka.sa.sa_handler, &act->sa_handler);
- get_user_ex(new_ka.sa.sa_flags, &act->sa_flags);
- get_user_ex(mask, &act->sa_mask);
- get_user_ex(new_ka.sa.sa_restorer, &act->sa_restorer);
- } get_user_catch(ret);
+ put_user_try {
+ /* Create the ucontext. */
+ if (cpu_has_xsave)
+ put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ put_user_ex(0, &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
+ put_user_ex(0, &frame->uc.uc__pad0);
- if (ret)
- return -EFAULT;
- siginitset(&new_ka.sa.sa_mask, mask);
- }
+ if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ restorer = ksig->ka.sa.sa_restorer;
+ } else {
+ /* could use a vstub here */
+ restorer = NULL;
+ err |= -EFAULT;
+ }
+ put_user_ex(restorer, &frame->pretcode);
+ } put_user_catch(err);
- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
- if (!ret && oact) {
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)))
- return -EFAULT;
+ if (err)
+ return -EFAULT;
- put_user_try {
- put_user_ex(old_ka.sa.sa_handler, &oact->sa_handler);
- put_user_ex(old_ka.sa.sa_flags, &oact->sa_flags);
- put_user_ex(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
- put_user_ex(old_ka.sa.sa_restorer, &oact->sa_restorer);
- } put_user_catch(ret);
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
- if (ret)
- return -EFAULT;
- }
+ /* We use the x32 calling convention here... */
+ regs->di = ksig->sig;
+ regs->si = (unsigned long) &frame->info;
+ regs->dx = (unsigned long) &frame->uc;
- return ret;
-}
-#endif /* CONFIG_X86_32 */
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
-long
-sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
- struct pt_regs *regs)
-{
- return do_sigaltstack(uss, uoss, regs->sp);
+ regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
+#endif /* CONFIG_X86_X32_ABI */
+
+ return 0;
}
/*
* Do a signal return; undo the signal stack.
*/
#ifdef CONFIG_X86_32
-unsigned long sys_sigreturn(struct pt_regs *regs)
+asmlinkage unsigned long sys_sigreturn(void)
{
+ struct pt_regs *regs = current_pt_regs();
struct sigframe __user *frame;
unsigned long ax;
sigset_t set;
@@ -572,7 +551,6 @@ unsigned long sys_sigreturn(struct pt_regs *regs)
sizeof(frame->extramask))))
goto badframe;
- sigdelsetmask(&set, ~_BLOCKABLE);
set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->sc, &ax))
@@ -586,8 +564,9 @@ badframe:
}
#endif /* CONFIG_X86_32 */
-long sys_rt_sigreturn(struct pt_regs *regs)
+asmlinkage long sys_rt_sigreturn(void)
{
+ struct pt_regs *regs = current_pt_regs();
struct rt_sigframe __user *frame;
unsigned long ax;
sigset_t set;
@@ -598,13 +577,12 @@ long sys_rt_sigreturn(struct pt_regs *regs)
if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
goto badframe;
- sigdelsetmask(&set, ~_BLOCKABLE);
set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
goto badframe;
- if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
+ if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
return ax;
@@ -628,62 +606,30 @@ static int signr_convert(int sig)
return sig;
}
-#ifdef CONFIG_X86_32
-
-#define is_ia32 1
-#define ia32_setup_frame __setup_frame
-#define ia32_setup_rt_frame __setup_rt_frame
-
-#else /* !CONFIG_X86_32 */
-
-#ifdef CONFIG_IA32_EMULATION
-#define is_ia32 test_thread_flag(TIF_IA32)
-#else /* !CONFIG_IA32_EMULATION */
-#define is_ia32 0
-#endif /* CONFIG_IA32_EMULATION */
-
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs);
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
- sigset_t *set, struct pt_regs *regs);
-
-#endif /* CONFIG_X86_32 */
-
static int
-setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- struct pt_regs *regs)
+setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
- int usig = signr_convert(sig);
- sigset_t *set = &current->blocked;
- int ret;
-
- if (current_thread_info()->status & TS_RESTORE_SIGMASK)
- set = &current->saved_sigmask;
+ int usig = signr_convert(ksig->sig);
+ sigset_t *set = sigmask_to_save();
+ compat_sigset_t *cset = (compat_sigset_t *) set;
/* Set up the stack frame */
- if (is_ia32) {
- if (ka->sa.sa_flags & SA_SIGINFO)
- ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
+ if (is_ia32_frame()) {
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO)
+ return ia32_setup_rt_frame(usig, ksig, cset, regs);
else
- ret = ia32_setup_frame(usig, ka, set, regs);
- } else
- ret = __setup_rt_frame(sig, ka, info, set, regs);
-
- if (ret) {
- force_sigsegv(sig, current);
- return -EFAULT;
+ return ia32_setup_frame(usig, ksig, cset, regs);
+ } else if (is_x32_frame()) {
+ return x32_setup_rt_frame(ksig, cset, regs);
+ } else {
+ return __setup_rt_frame(ksig->sig, ksig, set, regs);
}
-
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- return ret;
}
-static int
-handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
- struct pt_regs *regs)
+static void
+handle_signal(struct ksignal *ksig, struct pt_regs *regs)
{
- int ret;
-
+ bool failed;
/* Are we from a system call? */
if (syscall_get_nr(current, regs) >= 0) {
/* If so, check system call restarting.. */
@@ -694,7 +640,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
break;
case -ERESTARTSYS:
- if (!(ka->sa.sa_flags & SA_RESTART)) {
+ if (!(ksig->ka.sa.sa_flags & SA_RESTART)) {
regs->ax = -EINTR;
break;
}
@@ -714,30 +660,23 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
regs->flags &= ~X86_EFLAGS_TF;
- ret = setup_rt_frame(sig, ka, info, regs);
-
- if (ret)
- return ret;
-
- /*
- * Clear the direction flag as per the ABI for function entry.
- */
- regs->flags &= ~X86_EFLAGS_DF;
-
- /*
- * Clear TF when entering the signal handler, but
- * notify any tracer that was single-stepping it.
- * The tracer may want to single-step inside the
- * handler too.
- */
- regs->flags &= ~X86_EFLAGS_TF;
-
- block_sigmask(ka, sig);
-
- tracehook_signal_handler(sig, info, ka, regs,
- test_thread_flag(TIF_SINGLESTEP));
-
- return 0;
+ failed = (setup_rt_frame(ksig, regs) < 0);
+ if (!failed) {
+ /*
+ * Clear the direction flag as per the ABI for function entry.
+ *
+ * Clear RF when entering the signal handler, because
+ * it might disable possible debug exception from the
+ * signal handler.
+ *
+ * Clear TF when entering the signal handler, but
+ * notify any tracer that was single-stepping it.
+ * The tracer may want to single-step inside the
+ * handler too.
+ */
+ regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
+ }
+ signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
}
#ifdef CONFIG_X86_32
@@ -754,24 +693,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
*/
static void do_signal(struct pt_regs *regs)
{
- struct k_sigaction ka;
- siginfo_t info;
- int signr;
+ struct ksignal ksig;
- /*
- * We want the common case to go fast, which is why we may in certain
- * cases get here from kernel mode. Just return without doing anything
- * if so.
- * X86_32: vm86 regs switched out by assembly code before reaching
- * here, so testing against kernel CS suffices.
- */
- if (!user_mode(regs))
- return;
-
- signr = get_signal_to_deliver(&info, &ka, regs, NULL);
- if (signr > 0) {
+ if (get_signal(&ksig)) {
/* Whee! Actually deliver the signal. */
- handle_signal(signr, &info, &ka, regs);
+ handle_signal(&ksig, regs);
return;
}
@@ -797,25 +723,27 @@ static void do_signal(struct pt_regs *regs)
* If there's no signal to deliver, we just put the saved sigmask
* back.
*/
- if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- set_current_blocked(&current->saved_sigmask);
- }
+ restore_saved_sigmask();
}
/*
* notification of userspace execution resumption
* - triggered by the TIF_WORK_MASK flags
*/
-void
+__visible void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
+ user_exit();
+
#ifdef CONFIG_X86_MCE
/* notify userspace of pending MCEs */
if (thread_info_flags & _TIF_MCE_NOTIFY)
mce_notify_process();
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
+ if (thread_info_flags & _TIF_UPROBE)
+ uprobe_notify_resume(regs);
+
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
@@ -823,15 +751,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
if (thread_info_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
- if (current->replacement_session_keyring)
- key_replace_session_keyring();
}
if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
fire_user_return_notifiers();
-#ifdef CONFIG_X86_32
- clear_thread_flag(TIF_IRET);
-#endif /* CONFIG_X86_32 */
+ user_enter();
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
@@ -845,8 +769,39 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
me->comm, me->pid, where, frame,
regs->ip, regs->sp, regs->orig_ax);
print_vma_addr(" in ", regs->ip);
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
force_sig(SIGSEGV, me);
}
+
+#ifdef CONFIG_X86_X32_ABI
+asmlinkage long sys32_x32_rt_sigreturn(void)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe_x32 __user *frame;
+ sigset_t set;
+ unsigned long ax;
+
+ frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+ goto badframe;
+
+ if (compat_restore_altstack(&frame->uc.uc_stack))
+ goto badframe;
+
+ return ax;
+
+badframe:
+ signal_fault(regs, frame, "x32 rt_sigreturn");
+ return 0;
+}
+#endif
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 66c74f481ca..be8e1bde07a 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -30,6 +30,7 @@
#include <asm/proto.h>
#include <asm/apic.h>
#include <asm/nmi.h>
+#include <asm/trace/irq_vectors.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
*
@@ -109,6 +110,9 @@
* about nothing of note with C stepping upwards.
*/
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+static bool smp_no_nmi_ipi = false;
+
/*
* this function sends a 'reschedule' IPI to another CPU.
* it goes straight through and wastes no time serializing
@@ -149,8 +153,6 @@ void native_send_call_func_ipi(const struct cpumask *mask)
free_cpumask_var(allbutself);
}
-static atomic_t stopping_cpu = ATOMIC_INIT(-1);
-
static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
{
/* We are registered on stopping cpu too, avoid spurious NMI */
@@ -162,7 +164,19 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
}
-static void native_nmi_stop_other_cpus(int wait)
+/*
+ * this function calls the 'stop' function on all other CPUs in the system.
+ */
+
+asmlinkage __visible void smp_reboot_interrupt(void)
+{
+ ack_APIC_irq();
+ irq_enter();
+ stop_this_cpu(NULL);
+ irq_exit();
+}
+
+static void native_stop_other_cpus(int wait)
{
unsigned long flags;
unsigned long timeout;
@@ -174,20 +188,25 @@ static void native_nmi_stop_other_cpus(int wait)
* Use an own vector here because smp_call_function
* does lots of things not suitable in a panic situation.
*/
+
+ /*
+ * We start by using the REBOOT_VECTOR irq.
+ * The irq is treated as a sync point to allow critical
+ * regions of code on other cpus to release their spin locks
+ * and re-enable irqs. Jumping straight to an NMI might
+ * accidentally cause deadlocks with further shutdown/panic
+ * code. By syncing, we give the cpus up to one second to
+ * finish their work before we force them off with the NMI.
+ */
if (num_online_cpus() > 1) {
/* did someone beat us here? */
if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
return;
- if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
- NMI_FLAG_FIRST, "smp_stop"))
- /* Note: we ignore failures here */
- return;
-
- /* sync above data before sending NMI */
+ /* sync above data before sending IRQ */
wmb();
- apic->send_IPI_allbutself(NMI_VECTOR);
+ apic->send_IPI_allbutself(REBOOT_VECTOR);
/*
* Don't wait longer than a second if the caller
@@ -197,98 +216,127 @@ static void native_nmi_stop_other_cpus(int wait)
while (num_online_cpus() > 1 && (wait || timeout--))
udelay(1);
}
+
+ /* if the REBOOT_VECTOR didn't work, try with the NMI */
+ if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) {
+ if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+ NMI_FLAG_FIRST, "smp_stop"))
+ /* Note: we ignore failures here */
+ /* Hope the REBOOT_IRQ is good enough */
+ goto finish;
- local_irq_save(flags);
- disable_local_APIC();
- local_irq_restore(flags);
-}
-
-/*
- * this function calls the 'stop' function on all other CPUs in the system.
- */
-
-asmlinkage void smp_reboot_interrupt(void)
-{
- ack_APIC_irq();
- irq_enter();
- stop_this_cpu(NULL);
- irq_exit();
-}
-
-static void native_irq_stop_other_cpus(int wait)
-{
- unsigned long flags;
- unsigned long timeout;
+ /* sync above data before sending IRQ */
+ wmb();
- if (reboot_force)
- return;
+ pr_emerg("Shutting down cpus with NMI\n");
- /*
- * Use an own vector here because smp_call_function
- * does lots of things not suitable in a panic situation.
- * On most systems we could also use an NMI here,
- * but there are a few systems around where NMI
- * is problematic so stay with an non NMI for now
- * (this implies we cannot stop CPUs spinning with irq off
- * currently)
- */
- if (num_online_cpus() > 1) {
- apic->send_IPI_allbutself(REBOOT_VECTOR);
+ apic->send_IPI_allbutself(NMI_VECTOR);
/*
- * Don't wait longer than a second if the caller
+ * Don't wait longer than a 10 ms if the caller
* didn't ask us to wait.
*/
- timeout = USEC_PER_SEC;
+ timeout = USEC_PER_MSEC * 10;
while (num_online_cpus() > 1 && (wait || timeout--))
udelay(1);
}
+finish:
local_irq_save(flags);
disable_local_APIC();
local_irq_restore(flags);
}
-static void native_smp_disable_nmi_ipi(void)
-{
- smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
-}
-
/*
* Reschedule call back.
*/
-void smp_reschedule_interrupt(struct pt_regs *regs)
+static inline void __smp_reschedule_interrupt(void)
{
- ack_APIC_irq();
inc_irq_stat(irq_resched_count);
scheduler_ipi();
+}
+
+__visible void smp_reschedule_interrupt(struct pt_regs *regs)
+{
+ ack_APIC_irq();
+ __smp_reschedule_interrupt();
/*
* KVM uses this interrupt to force a cpu out of guest mode
*/
}
-void smp_call_function_interrupt(struct pt_regs *regs)
+static inline void smp_entering_irq(void)
{
ack_APIC_irq();
irq_enter();
+}
+
+__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
+{
+ /*
+ * Need to call irq_enter() before calling the trace point.
+ * __smp_reschedule_interrupt() calls irq_enter/exit() too (in
+ * scheduler_ipi(). This is OK, since those functions are allowed
+ * to nest.
+ */
+ smp_entering_irq();
+ trace_reschedule_entry(RESCHEDULE_VECTOR);
+ __smp_reschedule_interrupt();
+ trace_reschedule_exit(RESCHEDULE_VECTOR);
+ exiting_irq();
+ /*
+ * KVM uses this interrupt to force a cpu out of guest mode
+ */
+}
+
+static inline void __smp_call_function_interrupt(void)
+{
generic_smp_call_function_interrupt();
inc_irq_stat(irq_call_count);
- irq_exit();
}
-void smp_call_function_single_interrupt(struct pt_regs *regs)
+__visible void smp_call_function_interrupt(struct pt_regs *regs)
+{
+ smp_entering_irq();
+ __smp_call_function_interrupt();
+ exiting_irq();
+}
+
+__visible void smp_trace_call_function_interrupt(struct pt_regs *regs)
+{
+ smp_entering_irq();
+ trace_call_function_entry(CALL_FUNCTION_VECTOR);
+ __smp_call_function_interrupt();
+ trace_call_function_exit(CALL_FUNCTION_VECTOR);
+ exiting_irq();
+}
+
+static inline void __smp_call_function_single_interrupt(void)
{
- ack_APIC_irq();
- irq_enter();
generic_smp_call_function_single_interrupt();
inc_irq_stat(irq_call_count);
- irq_exit();
+}
+
+__visible void smp_call_function_single_interrupt(struct pt_regs *regs)
+{
+ smp_entering_irq();
+ __smp_call_function_single_interrupt();
+ exiting_irq();
+}
+
+__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
+{
+ smp_entering_irq();
+ trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
+ __smp_call_function_single_interrupt();
+ trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
+ exiting_irq();
}
static int __init nonmi_ipi_setup(char *str)
{
- native_smp_disable_nmi_ipi();
- return 1;
+ smp_no_nmi_ipi = true;
+ return 1;
}
__setup("nonmi_ipi", nonmi_ipi_setup);
@@ -298,7 +346,7 @@ struct smp_ops smp_ops = {
.smp_prepare_cpus = native_smp_prepare_cpus,
.smp_cpus_done = native_smp_cpus_done,
- .stop_other_cpus = native_nmi_stop_other_cpus,
+ .stop_other_cpus = native_stop_other_cpus,
.smp_send_reschedule = native_smp_send_reschedule,
.cpu_up = native_cpu_up,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 66d250c00d1..5492798930e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1,4 +1,4 @@
-/*
+ /*
* x86 SMP booting functions
*
* (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
@@ -39,6 +39,8 @@
* Glauber Costa : i386 and x86_64 integration
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/module.h>
@@ -50,13 +52,14 @@
#include <linux/tboot.h>
#include <linux/stackprotector.h>
#include <linux/gfp.h>
+#include <linux/cpuidle.h>
#include <asm/acpi.h>
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
#include <asm/idle.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
#include <asm/cpu.h>
#include <asm/numa.h>
#include <asm/pgtable.h>
@@ -65,69 +68,35 @@
#include <asm/mwait.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
+#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/setup.h>
#include <asm/uv/uv.h>
#include <linux/mc146818rtc.h>
-
#include <asm/smpboot_hooks.h>
#include <asm/i8259.h>
+#include <asm/realmode.h>
+#include <asm/misc.h>
/* State of each CPU */
DEFINE_PER_CPU(int, cpu_state) = { 0 };
-/* Store all idle threads, this can be reused instead of creating
-* a new thread. Also avoids complicated thread destroy functionality
-* for idle threads.
-*/
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
- * removed after init for !CONFIG_HOTPLUG_CPU.
- */
-static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
-#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
-#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
-
-/*
- * We need this for trampoline_base protection from concurrent accesses when
- * off- and onlining cores wildly.
- */
-static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
-
-void cpu_hotplug_driver_lock(void)
-{
- mutex_lock(&x86_cpu_hotplug_driver_mutex);
-}
-
-void cpu_hotplug_driver_unlock(void)
-{
- mutex_unlock(&x86_cpu_hotplug_driver_mutex);
-}
-
-ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
-ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
-#else
-static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
-#define get_idle_for_cpu(x) (idle_thread_array[(x)])
-#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
-#endif
-
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);
/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
/* representing HT siblings of each logical CPU */
-DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
/* representing HT and core siblings of each logical CPU */
-DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
EXPORT_PER_CPU_SYMBOL(cpu_core_map);
-DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
/* Per CPU bogomips and other parameters */
DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
@@ -136,10 +105,10 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
atomic_t init_deasserted;
/*
- * Report back to the Boot Processor.
- * Running on AP.
+ * Report back to the Boot Processor during boot time or to the caller processor
+ * during CPU online.
*/
-static void __cpuinit smp_callin(void)
+static void smp_callin(void)
{
int cpuid, phys_id;
unsigned long timeout;
@@ -149,15 +118,18 @@ static void __cpuinit smp_callin(void)
* we may get here before an INIT-deassert IPI reaches
* our local APIC. We have to wait for the IPI or we'll
* lock up on an APIC access.
+ *
+ * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI.
*/
- if (apic->wait_for_init_deassert)
- apic->wait_for_init_deassert(&init_deasserted);
+ cpuid = smp_processor_id();
+ if (apic->wait_for_init_deassert && cpuid)
+ while (!atomic_read(&init_deasserted))
+ cpu_relax();
/*
* (This works even if the APIC is not enabled.)
*/
phys_id = read_apic_id();
- cpuid = smp_processor_id();
if (cpumask_test_cpu(cpuid, cpu_callin_mask)) {
panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
phys_id, cpuid);
@@ -197,7 +169,7 @@ static void __cpuinit smp_callin(void)
* boards)
*/
- pr_debug("CALLIN, before setup_local_APIC().\n");
+ pr_debug("CALLIN, before setup_local_APIC()\n");
if (apic->smp_callin_clear_local_apic)
apic->smp_callin_clear_local_apic();
setup_local_APIC();
@@ -219,14 +191,9 @@ static void __cpuinit smp_callin(void)
* Update loops_per_jiffy in cpu_data. Previous call to
* smp_store_cpu_info() stored a value that is close but not as
* accurate as the value just calculated.
- *
- * Need to enable IRQs because it can take longer and then
- * the NMI watchdog might kill us.
*/
- local_irq_enable();
calibrate_delay();
cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
- local_irq_disable();
pr_debug("Stack at about %p\n", &cpuid);
/*
@@ -244,10 +211,12 @@ static void __cpuinit smp_callin(void)
cpumask_set_cpu(cpuid, cpu_callin_mask);
}
+static int cpu0_logical_apicid;
+static int enable_start_cpu0;
/*
* Activate a secondary processor.
*/
-notrace static void __cpuinit start_secondary(void *unused)
+static void notrace start_secondary(void *unused)
{
/*
* Don't put *anything* before cpu_init(), SMP booting is too
@@ -255,9 +224,12 @@ notrace static void __cpuinit start_secondary(void *unused)
* most necessary things.
*/
cpu_init();
+ x86_cpuinit.early_percpu_clock_init();
preempt_disable();
smp_callin();
+ enable_start_cpu0 = 0;
+
#ifdef CONFIG_X86_32
/* switch away from the initial page table */
load_cr3(swapper_pg_dir);
@@ -272,38 +244,23 @@ notrace static void __cpuinit start_secondary(void *unused)
check_tsc_sync_target();
/*
- * We need to hold call_lock, so there is no inconsistency
- * between the time smp_call_function() determines number of
- * IPI recipients, and the time when the determination is made
- * for which cpus receive the IPI. Holding this
- * lock helps us to not include this cpu in a currently in progress
- * smp_call_function().
- *
+ * Enable the espfix hack for this CPU
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ init_espfix_ap();
+#endif
+
+ /*
* We need to hold vector_lock so there the set of online cpus
* does not change while we are assigning vectors to cpus. Holding
* this lock ensures we don't half assign or remove an irq from a cpu.
*/
- ipi_call_lock();
lock_vector_lock();
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
- ipi_call_unlock();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
x86_platform.nmi_init();
- /*
- * Wait until the cpu which brought this one up marked it
- * online before enabling interrupts. If we don't do that then
- * we can end up waking up the softirq thread before this cpu
- * reached the active state, which makes the scheduler unhappy
- * and schedule the softirq thread on the wrong cpu. This is
- * only observable with forced threaded interrupts, but in
- * theory it could also happen w/o them. It's just way harder
- * to achieve.
- */
- while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
- cpu_relax();
-
/* enable local interrupts */
local_irq_enable();
@@ -313,77 +270,131 @@ notrace static void __cpuinit start_secondary(void *unused)
x86_cpuinit.setup_percpu_clockev();
wmb();
- cpu_idle();
+ cpu_startup_entry(CPUHP_ONLINE);
+}
+
+void __init smp_store_boot_cpu_info(void)
+{
+ int id = 0; /* CPU 0 */
+ struct cpuinfo_x86 *c = &cpu_data(id);
+
+ *c = boot_cpu_data;
+ c->cpu_index = id;
}
/*
* The bootstrap kernel entry code has set these up. Save them for
* a given CPU
*/
-
-void __cpuinit smp_store_cpu_info(int id)
+void smp_store_cpu_info(int id)
{
struct cpuinfo_x86 *c = &cpu_data(id);
*c = boot_cpu_data;
c->cpu_index = id;
- if (id != 0)
- identify_secondary_cpu(c);
+ /*
+ * During boot time, CPU0 has this setup already. Save the info when
+ * bringing up AP or offlined CPU0.
+ */
+ identify_secondary_cpu(c);
}
-static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+static bool
+topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
{
- cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
- cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
- cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
- cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
- cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
- cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
+ "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+ "[node: %d != %d]. Ignoring dependency.\n",
+ cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
}
+#define link_mask(_m, c1, c2) \
+do { \
+ cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \
+ cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \
+} while (0)
-void __cpuinit set_cpu_sibling_map(int cpu)
+static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- int i;
+ if (cpu_has_topoext) {
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ if (c->phys_proc_id == o->phys_proc_id &&
+ per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
+ c->compute_unit_id == o->compute_unit_id)
+ return topology_sane(c, o, "smt");
+
+ } else if (c->phys_proc_id == o->phys_proc_id &&
+ c->cpu_core_id == o->cpu_core_id) {
+ return topology_sane(c, o, "smt");
+ }
+
+ return false;
+}
+
+static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
+ per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
+ return topology_sane(c, o, "llc");
+
+ return false;
+}
+
+static bool match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (c->phys_proc_id == o->phys_proc_id) {
+ if (cpu_has(c, X86_FEATURE_AMD_DCM))
+ return true;
+
+ return topology_sane(c, o, "mc");
+ }
+ return false;
+}
+
+void set_cpu_sibling_map(int cpu)
+{
+ bool has_smt = smp_num_siblings > 1;
+ bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1;
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct cpuinfo_x86 *o;
+ int i;
cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
- if (smp_num_siblings > 1) {
- for_each_cpu(i, cpu_sibling_setup_mask) {
- struct cpuinfo_x86 *o = &cpu_data(i);
-
- if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
- if (c->phys_proc_id == o->phys_proc_id &&
- per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
- c->compute_unit_id == o->compute_unit_id)
- link_thread_siblings(cpu, i);
- } else if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_core_id == o->cpu_core_id) {
- link_thread_siblings(cpu, i);
- }
- }
- } else {
+ if (!has_mp) {
cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_core_mask(cpu));
+ c->booted_cores = 1;
+ return;
}
- cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ for_each_cpu(i, cpu_sibling_setup_mask) {
+ o = &cpu_data(i);
+
+ if ((i == cpu) || (has_smt && match_smt(c, o)))
+ link_mask(sibling, cpu, i);
+
+ if ((i == cpu) || (has_mp && match_llc(c, o)))
+ link_mask(llc_shared, cpu, i);
- if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
- cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
- c->booted_cores = 1;
- return;
}
+ /*
+ * This needs a separate iteration over the cpus because we rely on all
+ * cpu_sibling_mask links to be set-up.
+ */
for_each_cpu(i, cpu_sibling_setup_mask) {
- if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
- per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
- cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
- cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
- }
- if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
- cpumask_set_cpu(i, cpu_core_mask(cpu));
- cpumask_set_cpu(cpu, cpu_core_mask(i));
+ o = &cpu_data(i);
+
+ if ((i == cpu) || (has_mp && match_mc(c, o))) {
+ link_mask(core, cpu, i);
+
/*
* Does this new cpu bringup a new core?
*/
@@ -409,16 +420,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
/* maps the cpu to the sched domain representing multi-core */
const struct cpumask *cpu_coregroup_mask(int cpu)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- /*
- * For perf, we return last level cache shared map.
- * And for power savings, we return cpu_core_map
- */
- if ((sched_mc_power_savings || sched_smt_power_savings) &&
- !(cpu_has(c, X86_FEATURE_AMD_DCM)))
- return cpu_core_mask(cpu);
- else
- return cpu_llc_shared_mask(cpu);
+ return cpu_llc_shared_mask(cpu);
}
static void impress_friends(void)
@@ -428,17 +430,16 @@ static void impress_friends(void)
/*
* Allow the user to impress friends.
*/
- pr_debug("Before bogomips.\n");
+ pr_debug("Before bogomips\n");
for_each_possible_cpu(cpu)
if (cpumask_test_cpu(cpu, cpu_callout_mask))
bogosum += cpu_data(cpu).loops_per_jiffy;
- printk(KERN_INFO
- "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
+ pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
num_online_cpus(),
bogosum/(500000/HZ),
(bogosum/(5000/HZ))%100);
- pr_debug("Before bogocount - setting activated=1.\n");
+ pr_debug("Before bogocount - setting activated=1\n");
}
void __inquire_remote_apic(int apicid)
@@ -448,18 +449,17 @@ void __inquire_remote_apic(int apicid)
int timeout;
u32 status;
- printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
+ pr_info("Inquiring remote APIC 0x%x...\n", apicid);
for (i = 0; i < ARRAY_SIZE(regs); i++) {
- printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
+ pr_info("... APIC 0x%x %s: ", apicid, names[i]);
/*
* Wait for idle.
*/
status = safe_apic_wait_icr_idle();
if (status)
- printk(KERN_CONT
- "a previous APIC delivery may have failed\n");
+ pr_cont("a previous APIC delivery may have failed\n");
apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
@@ -472,10 +472,10 @@ void __inquire_remote_apic(int apicid)
switch (status) {
case APIC_ICR_RR_VALID:
status = apic_read(APIC_RRR);
- printk(KERN_CONT "%08x\n", status);
+ pr_cont("%08x\n", status);
break;
default:
- printk(KERN_CONT "failed\n");
+ pr_cont("failed\n");
}
}
}
@@ -485,8 +485,8 @@ void __inquire_remote_apic(int apicid)
* INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
* won't ... remember to clear down the APIC, etc later.
*/
-int __cpuinit
-wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
+int
+wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
int maxlvt;
@@ -494,7 +494,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
/* Target chip */
/* Boot on the stack */
/* Kick the second */
- apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid);
+ apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid);
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
@@ -509,17 +509,17 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
apic_write(APIC_ESR, 0);
accept_status = (apic_read(APIC_ESR) & 0xEF);
}
- pr_debug("NMI sent.\n");
+ pr_debug("NMI sent\n");
if (send_status)
- printk(KERN_ERR "APIC never delivered???\n");
+ pr_err("APIC never delivered???\n");
if (accept_status)
- printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
+ pr_err("APIC delivery error (%lx)\n", accept_status);
return (send_status | accept_status);
}
-static int __cpuinit
+static int
wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
@@ -536,7 +536,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
apic_read(APIC_ESR);
}
- pr_debug("Asserting INIT.\n");
+ pr_debug("Asserting INIT\n");
/*
* Turn INIT on target chip
@@ -552,7 +552,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
mdelay(10);
- pr_debug("Deasserting INIT.\n");
+ pr_debug("Deasserting INIT\n");
/* Target chip */
/* Send IPI */
@@ -585,14 +585,14 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
/*
* Run STARTUP IPI loop.
*/
- pr_debug("#startup loops: %d.\n", num_starts);
+ pr_debug("#startup loops: %d\n", num_starts);
for (j = 1; j <= num_starts; j++) {
- pr_debug("Sending STARTUP #%d.\n", j);
+ pr_debug("Sending STARTUP #%d\n", j);
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
- pr_debug("After apic_write.\n");
+ pr_debug("After apic_write\n");
/*
* STARTUP IPI
@@ -609,7 +609,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
*/
udelay(300);
- pr_debug("Startup point 1.\n");
+ pr_debug("Startup point 1\n");
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
@@ -624,113 +624,162 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
if (send_status || accept_status)
break;
}
- pr_debug("After Startup.\n");
+ pr_debug("After Startup\n");
if (send_status)
- printk(KERN_ERR "APIC never delivered???\n");
+ pr_err("APIC never delivered???\n");
if (accept_status)
- printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
+ pr_err("APIC delivery error (%lx)\n", accept_status);
return (send_status | accept_status);
}
-struct create_idle {
- struct work_struct work;
- struct task_struct *idle;
- struct completion done;
- int cpu;
-};
-
-static void __cpuinit do_fork_idle(struct work_struct *work)
+void smp_announce(void)
{
- struct create_idle *c_idle =
- container_of(work, struct create_idle, work);
+ int num_nodes = num_online_nodes();
- c_idle->idle = fork_idle(c_idle->cpu);
- complete(&c_idle->done);
+ printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n",
+ num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus());
}
/* reduce the number of lines printed when booting a large cpu count system */
-static void __cpuinit announce_cpu(int cpu, int apicid)
+static void announce_cpu(int cpu, int apicid)
{
static int current_node = -1;
int node = early_cpu_to_node(cpu);
+ static int width, node_width;
+
+ if (!width)
+ width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */
+
+ if (!node_width)
+ node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */
+
+ if (cpu == 1)
+ printk(KERN_INFO "x86: Booting SMP configuration:\n");
if (system_state == SYSTEM_BOOTING) {
if (node != current_node) {
if (current_node > (-1))
- pr_cont(" Ok.\n");
+ pr_cont("\n");
current_node = node;
- pr_info("Booting Node %3d, Processors ", node);
+
+ printk(KERN_INFO ".... node %*s#%d, CPUs: ",
+ node_width - num_digits(node), " ", node);
}
- pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : "");
- return;
+
+ /* Add padding for the BSP */
+ if (cpu == 1)
+ pr_cont("%*s", width + 1, " ");
+
+ pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
+
} else
pr_info("Booting Node %d Processor %d APIC 0x%x\n",
node, cpu, apicid);
}
-/*
- * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
- * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
- * Returns zero if CPU booted OK, else error code from
- * ->wakeup_secondary_cpu.
- */
-static int __cpuinit do_boot_cpu(int apicid, int cpu)
+static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs)
{
- unsigned long boot_error = 0;
- unsigned long start_ip;
- int timeout;
- struct create_idle c_idle = {
- .cpu = cpu,
- .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
- };
+ int cpu;
- INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
+ cpu = smp_processor_id();
+ if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0)
+ return NMI_HANDLED;
- alternatives_smp_switch(1);
+ return NMI_DONE;
+}
+
+/*
+ * Wake up AP by INIT, INIT, STARTUP sequence.
+ *
+ * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS
+ * boot-strap code which is not a desired behavior for waking up BSP. To
+ * void the boot-strap code, wake up CPU0 by NMI instead.
+ *
+ * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined
+ * (i.e. physically hot removed and then hot added), NMI won't wake it up.
+ * We'll change this code in the future to wake up hard offlined CPU0 if
+ * real platform and request are available.
+ */
+static int
+wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
+ int *cpu0_nmi_registered)
+{
+ int id;
+ int boot_error;
- c_idle.idle = get_idle_for_cpu(cpu);
+ preempt_disable();
/*
- * We can't use kernel_thread since we must avoid to
- * reschedule the child.
+ * Wake up AP by INIT, INIT, STARTUP sequence.
*/
- if (c_idle.idle) {
- c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
- (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
- init_idle(c_idle.idle, cpu);
- goto do_rest;
+ if (cpu) {
+ boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
+ goto out;
}
- schedule_work(&c_idle.work);
- wait_for_completion(&c_idle.done);
+ /*
+ * Wake up BSP by nmi.
+ *
+ * Register a NMI handler to help wake up CPU0.
+ */
+ boot_error = register_nmi_handler(NMI_LOCAL,
+ wakeup_cpu0_nmi, 0, "wake_cpu0");
- if (IS_ERR(c_idle.idle)) {
- printk("failed fork for CPU %d\n", cpu);
- destroy_work_on_stack(&c_idle.work);
- return PTR_ERR(c_idle.idle);
+ if (!boot_error) {
+ enable_start_cpu0 = 1;
+ *cpu0_nmi_registered = 1;
+ if (apic->dest_logical == APIC_DEST_LOGICAL)
+ id = cpu0_logical_apicid;
+ else
+ id = apicid;
+ boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
}
- set_idle_for_cpu(cpu, c_idle.idle);
-do_rest:
- per_cpu(current_task, cpu) = c_idle.idle;
+out:
+ preempt_enable();
+
+ return boot_error;
+}
+
+/*
+ * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
+ * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
+ * Returns zero if CPU booted OK, else error code from
+ * ->wakeup_secondary_cpu.
+ */
+static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
+{
+ volatile u32 *trampoline_status =
+ (volatile u32 *) __va(real_mode_header->trampoline_status);
+ /* start_ip had better be page-aligned! */
+ unsigned long start_ip = real_mode_header->trampoline_start;
+
+ unsigned long boot_error = 0;
+ int timeout;
+ int cpu0_nmi_registered = 0;
+
+ /* Just in case we booted with a single CPU. */
+ alternatives_enable_smp();
+
+ idle->thread.sp = (unsigned long) (((struct pt_regs *)
+ (THREAD_SIZE + task_stack_page(idle))) - 1);
+ per_cpu(current_task, cpu) = idle;
+
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
- clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+ clear_tsk_thread_flag(idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
+#endif
per_cpu(kernel_stack, cpu) =
- (unsigned long)task_stack_page(c_idle.idle) -
+ (unsigned long)task_stack_page(idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
-#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
- stack_start = c_idle.idle->thread.sp;
-
- /* start_ip had better be page-aligned! */
- start_ip = trampoline_address();
+ stack_start = idle->thread.sp;
/* So we see what's up */
announce_cpu(cpu, apicid);
@@ -740,8 +789,6 @@ do_rest:
* the targeted processor.
*/
- printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip);
-
atomic_set(&init_deasserted, 0);
if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
@@ -759,21 +806,24 @@ do_rest:
}
/*
- * Kick the secondary CPU. Use the method in the APIC driver
- * if it's defined - or use an INIT boot APIC message otherwise:
+ * Wake up a CPU in difference cases:
+ * - Use the method in the APIC driver if it's defined
+ * Otherwise,
+ * - Use an INIT boot APIC message for APs or NMI for BSP.
*/
if (apic->wakeup_secondary_cpu)
boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
else
- boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
+ boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
+ &cpu0_nmi_registered);
if (!boot_error) {
/*
* allow APs to start initializing.
*/
- pr_debug("Before Callout %d.\n", cpu);
+ pr_debug("Before Callout %d\n", cpu);
cpumask_set_cpu(cpu, cpu_callout_mask);
- pr_debug("After Callout %d.\n", cpu);
+ pr_debug("After Callout %d\n", cpu);
/*
* Wait 5s total for a response
@@ -791,17 +841,17 @@ do_rest:
schedule();
}
- if (cpumask_test_cpu(cpu, cpu_callin_mask))
+ if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
+ print_cpu_msr(&cpu_data(cpu));
pr_debug("CPU%d: has booted.\n", cpu);
- else {
+ } else {
boot_error = 1;
- if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
- == 0xA5A5A5A5)
+ if (*trampoline_status == 0xA5A5A5A5)
/* trampoline started but...? */
pr_err("CPU%d: Stuck ??\n", cpu);
else
/* trampoline code not run */
- pr_err("CPU%d: Not responding.\n", cpu);
+ pr_err("CPU%d: Not responding\n", cpu);
if (apic->inquire_remote_apic)
apic->inquire_remote_apic(apicid);
}
@@ -816,13 +866,10 @@ do_rest:
/* was set by cpu_init() */
cpumask_clear_cpu(cpu, cpu_initialized_mask);
-
- set_cpu_present(cpu, false);
- per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
}
/* mark "stuck" area as not stuck */
- *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0;
+ *trampoline_status = 0;
if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
/*
@@ -830,12 +877,17 @@ do_rest:
*/
smpboot_restore_warm_reset_vector();
}
+ /*
+ * Clean up the nmi handler. Do this after the callin and callout sync
+ * to avoid impact of possible long unregister time.
+ */
+ if (cpu0_nmi_registered)
+ unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
- destroy_work_on_stack(&c_idle.work);
return boot_error;
}
-int __cpuinit native_cpu_up(unsigned int cpu)
+int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int apicid = apic->cpu_present_to_apicid(cpu);
unsigned long flags;
@@ -845,10 +897,10 @@ int __cpuinit native_cpu_up(unsigned int cpu)
pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
- if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
+ if (apicid == BAD_APICID ||
!physid_isset(apicid, phys_cpu_present_map) ||
- (!x2apic_mode && apicid >= 255)) {
- printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
+ !apic->apic_id_valid(apicid)) {
+ pr_err("%s: bad cpu %d\n", __func__, cpu);
return -EINVAL;
}
@@ -868,9 +920,12 @@ int __cpuinit native_cpu_up(unsigned int cpu)
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
- err = do_boot_cpu(apicid, cpu);
+ /* the FPU context is blank, nobody can own it */
+ __cpu_disable_lazy_restore(cpu);
+
+ err = do_boot_cpu(apicid, cpu, tidle);
if (err) {
- pr_debug("do_boot_cpu failed %d\n", err);
+ pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
return -EIO;
}
@@ -929,9 +984,8 @@ static int __init smp_sanity_check(unsigned max_cpus)
unsigned int cpu;
unsigned nr;
- printk(KERN_WARNING
- "More than 8 CPUs detected - skipping them.\n"
- "Use CONFIG_X86_BIGSMP.\n");
+ pr_warn("More than 8 CPUs detected - skipping them\n"
+ "Use CONFIG_X86_BIGSMP\n");
nr = 0;
for_each_present_cpu(cpu) {
@@ -952,8 +1006,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
#endif
if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
- printk(KERN_WARNING
- "weird, boot CPU (#%d) not listed by the BIOS.\n",
+ pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
hard_smp_processor_id());
physid_set(hard_smp_processor_id(), phys_cpu_present_map);
@@ -965,11 +1018,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
*/
if (!smp_found_config && !acpi_lapic) {
preempt_enable();
- printk(KERN_NOTICE "SMP motherboard not detected.\n");
+ pr_notice("SMP motherboard not detected\n");
disable_smp();
if (APIC_init_uniprocessor())
- printk(KERN_NOTICE "Local APIC not detected."
- " Using dummy APIC emulation.\n");
+ pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
return -1;
}
@@ -978,9 +1030,8 @@ static int __init smp_sanity_check(unsigned max_cpus)
* CPU too, but we do it for the sake of robustness anyway.
*/
if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
- printk(KERN_NOTICE
- "weird, boot CPU (#%d) not listed by the BIOS.\n",
- boot_cpu_physical_apicid);
+ pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
+ boot_cpu_physical_apicid);
physid_set(hard_smp_processor_id(), phys_cpu_present_map);
}
preempt_enable();
@@ -993,8 +1044,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
if (!disable_apic) {
pr_err("BIOS bug, local APIC #%d not detected!...\n",
boot_cpu_physical_apicid);
- pr_err("... forcing use of dummy APIC emulation."
- "(tell your hw vendor)\n");
+ pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");
}
smpboot_clear_io_apic();
disable_ioapic_support();
@@ -1007,7 +1057,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
* If SMP should be disabled, then really disable it!
*/
if (!max_cpus) {
- printk(KERN_INFO "SMP mode deactivated.\n");
+ pr_info("SMP mode deactivated\n");
smpboot_clear_io_apic();
connect_bsp_APIC();
@@ -1045,7 +1095,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
/*
* Setup boot CPU information
*/
- smp_store_cpu_info(0); /* Final full version of the data */
+ smp_store_boot_cpu_info(); /* Final full version of the data */
cpumask_copy(cpu_callin_mask, cpumask_of(0));
mb();
@@ -1059,7 +1109,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
if (smp_sanity_check(max_cpus) < 0) {
- printk(KERN_INFO "SMP disabled\n");
+ pr_info("SMP disabled\n");
disable_smp();
goto out;
}
@@ -1081,6 +1131,11 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
*/
setup_local_APIC();
+ if (x2apic_mode)
+ cpu0_logical_apicid = apic_read(APIC_LDR);
+ else
+ cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+
/*
* Enable IO APIC before setting up error vector
*/
@@ -1097,7 +1152,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
* Set up local APIC timer on boot CPU.
*/
- printk(KERN_INFO "CPU%d: ", 0);
+ pr_info("CPU%d: ", 0);
print_cpu_info(&cpu_data(0));
x86_init.timers.setup_percpu_clockev();
@@ -1109,20 +1164,6 @@ out:
preempt_enable();
}
-void arch_disable_nonboot_cpus_begin(void)
-{
- /*
- * Avoid the smp alternatives switch during the disable_nonboot_cpus().
- * In the suspend path, we will be back in the SMP mode shortly anyways.
- */
- skip_smp_alternatives = true;
-}
-
-void arch_disable_nonboot_cpus_end(void)
-{
- skip_smp_alternatives = false;
-}
-
void arch_enable_nonboot_cpus_begin(void)
{
set_mtrr_aps_delayed_init();
@@ -1147,7 +1188,7 @@ void __init native_smp_prepare_boot_cpu(void)
void __init native_smp_cpus_done(unsigned int max_cpus)
{
- pr_debug("Boot done.\n");
+ pr_debug("Boot done\n");
nmi_selftest();
impress_friends();
@@ -1208,8 +1249,7 @@ __init void prefill_possible_map(void)
/* nr_cpu_ids could be reduced via nr_cpus= */
if (possible > nr_cpu_ids) {
- printk(KERN_WARNING
- "%d Processors exceeds NR_CPUS limit of %d\n",
+ pr_warn("%d Processors exceeds NR_CPUS limit of %d\n",
possible, nr_cpu_ids);
possible = nr_cpu_ids;
}
@@ -1218,13 +1258,12 @@ __init void prefill_possible_map(void)
if (!setup_max_cpus)
#endif
if (possible > i) {
- printk(KERN_WARNING
- "%d Processors exceeds max_cpus limit of %u\n",
+ pr_warn("%d Processors exceeds max_cpus limit of %u\n",
possible, setup_max_cpus);
possible = i;
}
- printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
+ pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
possible, max_t(int, possible - num_processors, 0));
for (i = 0; i < possible; i++)
@@ -1285,18 +1324,11 @@ void cpu_disable_common(void)
int native_cpu_disable(void)
{
- int cpu = smp_processor_id();
+ int ret;
- /*
- * Perhaps use cpufreq to drop frequency, but that could go
- * into generic code.
- *
- * We won't take down the boot processor on i386 due to some
- * interrupts only being able to be serviced by the BSP.
- * Especially so if we're not using an IOAPIC -zwane
- */
- if (cpu == 0)
- return -EBUSY;
+ ret = check_irq_vectors_for_cpu_disable();
+ if (ret)
+ return ret;
clear_local_APIC();
@@ -1314,9 +1346,6 @@ void native_cpu_die(unsigned int cpu)
if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
if (system_state == SYSTEM_RUNNING)
pr_info("CPU %u is now offline\n", cpu);
-
- if (1 == num_online_cpus())
- alternatives_smp_switch(0);
return;
}
msleep(100);
@@ -1340,6 +1369,14 @@ void play_dead_common(void)
local_irq_disable();
}
+static bool wakeup_cpu0(void)
+{
+ if (smp_processor_id() == 0 && enable_start_cpu0)
+ return true;
+
+ return false;
+}
+
/*
* We need to flush the caches before going to sleep, lest we have
* dirty data in our caches when we come back up.
@@ -1349,13 +1386,12 @@ static inline void mwait_play_dead(void)
unsigned int eax, ebx, ecx, edx;
unsigned int highest_cstate = 0;
unsigned int highest_subcstate = 0;
- int i;
void *mwait_ptr;
- struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
+ int i;
- if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
+ if (!this_cpu_has(X86_FEATURE_MWAIT))
return;
- if (!this_cpu_has(X86_FEATURE_CLFLSH))
+ if (!this_cpu_has(X86_FEATURE_CLFLUSH))
return;
if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
return;
@@ -1399,10 +1435,17 @@ static inline void mwait_play_dead(void)
* The WBINVD is insufficient due to the spurious-wakeup
* case where we return around the loop.
*/
+ mb();
clflush(mwait_ptr);
+ mb();
__monitor(mwait_ptr, 0, 0);
mb();
__mwait(eax, 0);
+ /*
+ * If NMI wants to wake up CPU0, start CPU0.
+ */
+ if (wakeup_cpu0())
+ start_cpu0();
}
}
@@ -1413,6 +1456,11 @@ static inline void hlt_play_dead(void)
while (1) {
native_halt();
+ /*
+ * If NMI wants to wake up CPU0, start CPU0.
+ */
+ if (wakeup_cpu0())
+ start_cpu0();
}
}
@@ -1422,7 +1470,8 @@ void native_play_dead(void)
tboot_shutdown(TB_SHUTDOWN_WFS);
mwait_play_dead(); /* Only returns on failure */
- hlt_play_dead();
+ if (cpuidle_play_dead())
+ hlt_play_dead();
}
#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index c346d116148..9b4d51d0c0d 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -157,6 +157,34 @@ static int enable_single_step(struct task_struct *child)
return 1;
}
+void set_task_blockstep(struct task_struct *task, bool on)
+{
+ unsigned long debugctl;
+
+ /*
+ * Ensure irq/preemption can't change debugctl in between.
+ * Note also that both TIF_BLOCKSTEP and debugctl should
+ * be changed atomically wrt preemption.
+ *
+ * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if
+ * task is current or it can't be running, otherwise we can race
+ * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but
+ * PTRACE_KILL is not safe.
+ */
+ local_irq_disable();
+ debugctl = get_debugctlmsr();
+ if (on) {
+ debugctl |= DEBUGCTLMSR_BTF;
+ set_tsk_thread_flag(task, TIF_BLOCKSTEP);
+ } else {
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ clear_tsk_thread_flag(task, TIF_BLOCKSTEP);
+ }
+ if (task == current)
+ update_debugctlmsr(debugctl);
+ local_irq_enable();
+}
+
/*
* Enable single or block step.
*/
@@ -169,19 +197,10 @@ static void enable_step(struct task_struct *child, bool block)
* So no one should try to use debugger block stepping in a program
* that uses user-mode single stepping itself.
*/
- if (enable_single_step(child) && block) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl |= DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- set_tsk_thread_flag(child, TIF_BLOCKSTEP);
- } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl &= ~DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
- }
+ if (enable_single_step(child) && block)
+ set_task_blockstep(child, true);
+ else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
+ set_task_blockstep(child, false);
}
void user_enable_single_step(struct task_struct *child)
@@ -199,13 +218,8 @@ void user_disable_single_step(struct task_struct *child)
/*
* Make sure block stepping (BTF) is disabled.
*/
- if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl &= ~DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
- }
+ if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
+ set_task_blockstep(child, false);
/* Always clear TIF_SINGLESTEP... */
clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
deleted file mode 100644
index 0b0cb5fede1..00000000000
--- a/arch/x86/kernel/sys_i386_32.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * This file contains various random system calls that
- * have a non-standard calling sequence on the Linux/i386
- * platform.
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/smp.h>
-#include <linux/sem.h>
-#include <linux/msg.h>
-#include <linux/shm.h>
-#include <linux/stat.h>
-#include <linux/syscalls.h>
-#include <linux/mman.h>
-#include <linux/file.h>
-#include <linux/utsname.h>
-#include <linux/ipc.h>
-
-#include <linux/uaccess.h>
-#include <linux/unistd.h>
-
-#include <asm/syscalls.h>
-
-/*
- * Do a system call from kernel instead of calling sys_execve so we
- * end up with proper pt_regs.
- */
-int kernel_execve(const char *filename,
- const char *const argv[],
- const char *const envp[])
-{
- long __res;
- asm volatile ("int $0x80"
- : "=a" (__res)
- : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory");
- return __res;
-}
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 051489082d5..30277e27431 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -21,37 +21,23 @@
/*
* Align a virtual address to avoid aliasing in the I$ on AMD F15h.
- *
- * @flags denotes the allocation direction - bottomup or topdown -
- * or vDSO; see call sites below.
*/
-unsigned long align_addr(unsigned long addr, struct file *filp,
- enum align_flags flags)
+static unsigned long get_align_mask(void)
{
- unsigned long tmp_addr;
-
/* handle 32- and 64-bit case with a single conditional */
if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
- return addr;
+ return 0;
if (!(current->flags & PF_RANDOMIZE))
- return addr;
-
- if (!((flags & ALIGN_VDSO) || filp))
- return addr;
-
- tmp_addr = addr;
-
- /*
- * We need an address which is <= than the original
- * one only when in topdown direction.
- */
- if (!(flags & ALIGN_TOPDOWN))
- tmp_addr += va_align.mask;
+ return 0;
- tmp_addr &= ~va_align.mask;
+ return va_align.mask;
+}
- return tmp_addr;
+unsigned long align_vdso_addr(unsigned long addr)
+{
+ unsigned long align_mask = get_align_mask();
+ return (addr + align_mask) & ~align_mask;
}
static int __init control_va_addr_alignment(char *str)
@@ -98,7 +84,7 @@ out:
static void find_start_end(unsigned long flags, unsigned long *begin,
unsigned long *end)
{
- if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
+ if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) {
unsigned long new_begin;
/* This is usually used needed to map code in small
model, so it needs to be in the first 31bit. Limit
@@ -115,7 +101,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
*begin = new_begin;
}
} else {
- *begin = TASK_UNMAPPED_BASE;
+ *begin = current->mm->mmap_legacy_base;
*end = TASK_SIZE;
}
}
@@ -126,7 +112,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long start_addr;
+ struct vm_unmapped_area_info info;
unsigned long begin, end;
if (flags & MAP_FIXED)
@@ -144,50 +130,16 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
(!vma || addr + len <= vma->vm_start))
return addr;
}
- if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
- && len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = begin;
- }
- addr = mm->free_area_cache;
- if (addr < begin)
- addr = begin;
- start_addr = addr;
-
-full_search:
-
- addr = align_addr(addr, filp, 0);
-
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
- /* At this point: (!vma || addr < vma->vm_end). */
- if (end - len < addr) {
- /*
- * Start a new search - just in case we missed
- * some holes.
- */
- if (start_addr != begin) {
- start_addr = addr = begin;
- mm->cached_hole_size = 0;
- goto full_search;
- }
- return -ENOMEM;
- }
- if (!vma || addr + len <= vma->vm_start) {
- /*
- * Remember the place where we stopped the search:
- */
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = vma->vm_end;
- addr = align_addr(addr, filp, 0);
- }
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = begin;
+ info.high_limit = end;
+ info.align_mask = filp ? get_align_mask() : 0;
+ info.align_offset = pgoff << PAGE_SHIFT;
+ return vm_unmapped_area(&info);
}
-
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
const unsigned long len, const unsigned long pgoff,
@@ -196,6 +148,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
+ struct vm_unmapped_area_info info;
/* requested length too big for entire address space */
if (len > TASK_SIZE)
@@ -204,8 +157,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
if (flags & MAP_FIXED)
return addr;
- /* for MAP_32BIT mappings we force the legact mmap base */
- if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT))
+ /* for MAP_32BIT mappings we force the legacy mmap base */
+ if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT))
goto bottomup;
/* requesting a specific address */
@@ -217,51 +170,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return addr;
}
- /* check if free_area_cache is useful for us */
- if (len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = mm->mmap_base;
- }
-
- /* either no address requested or can't fit in requested address hole */
- addr = mm->free_area_cache;
-
- /* make sure it can fit in the remaining address space */
- if (addr > len) {
- unsigned long tmp_addr = align_addr(addr - len, filp,
- ALIGN_TOPDOWN);
-
- vma = find_vma(mm, tmp_addr);
- if (!vma || tmp_addr + len <= vma->vm_start)
- /* remember the address as a hint for next time */
- return mm->free_area_cache = tmp_addr;
- }
-
- if (mm->mmap_base < len)
- goto bottomup;
-
- addr = mm->mmap_base-len;
-
- do {
- addr = align_addr(addr, filp, ALIGN_TOPDOWN);
-
- /*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
- */
- vma = find_vma(mm, addr);
- if (!vma || addr+len <= vma->vm_start)
- /* remember the address as a hint for next time */
- return mm->free_area_cache = addr;
-
- /* remember the largest hole we saw so far */
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = vma->vm_start-len;
- } while (len < vma->vm_start);
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = PAGE_SIZE;
+ info.high_limit = mm->mmap_base;
+ info.align_mask = filp ? get_align_mask() : 0;
+ info.align_offset = pgoff << PAGE_SHIFT;
+ addr = vm_unmapped_area(&info);
+ if (!(addr & ~PAGE_MASK))
+ return addr;
+ VM_BUG_ON(addr != -ENOMEM);
bottomup:
/*
@@ -270,14 +188,5 @@ bottomup:
* can happen with large stack limits and large mmap()
* allocations.
*/
- mm->cached_hole_size = ~0UL;
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
- /*
- * Restore the topdown base:
- */
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = ~0UL;
-
- return addr;
+ return arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
}
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
index 147fcd4941c..e9bcd57d8a9 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/kernel/syscall_32.c
@@ -15,7 +15,7 @@ typedef asmlinkage void (*sys_call_ptr_t)(void);
extern asmlinkage void sys_ni_syscall(void);
-const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
+__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
/*
* Smells like a compiler bug -- it doesn't work
* when the & below is removed.
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 7ac7943be02..4ac730b37f0 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -4,6 +4,15 @@
#include <linux/sys.h>
#include <linux/cache.h>
#include <asm/asm-offsets.h>
+#include <asm/syscall.h>
+
+#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
+
+#ifdef CONFIG_X86_X32_ABI
+# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
+#else
+# define __SYSCALL_X32(nr, sym, compat) /* nothing */
+#endif
#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
#include <asm/syscalls_64.h>
@@ -11,11 +20,9 @@
#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
-typedef void (*sys_call_ptr_t)(void);
-
extern void sys_ni_syscall(void);
-const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
+asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
/*
* Smells like a compiler bug -- it doesn't work
* when the & below is removed.
diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c
new file mode 100644
index 00000000000..193ec2ce46c
--- /dev/null
+++ b/arch/x86/kernel/sysfb.c
@@ -0,0 +1,74 @@
+/*
+ * Generic System Framebuffers on x86
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+/*
+ * Simple-Framebuffer support for x86 systems
+ * Create a platform-device for any available boot framebuffer. The
+ * simple-framebuffer platform device is already available on DT systems, so
+ * this module parses the global "screen_info" object and creates a suitable
+ * platform device compatible with the "simple-framebuffer" DT object. If
+ * the framebuffer is incompatible, we instead create a legacy
+ * "vesa-framebuffer", "efi-framebuffer" or "platform-framebuffer" device and
+ * pass the screen_info as platform_data. This allows legacy drivers
+ * to pick these devices up without messing with simple-framebuffer drivers.
+ * The global "screen_info" is still valid at all times.
+ *
+ * If CONFIG_X86_SYSFB is not selected, we never register "simple-framebuffer"
+ * platform devices, but only use legacy framebuffer devices for
+ * backwards compatibility.
+ *
+ * TODO: We set the dev_id field of all platform-devices to 0. This allows
+ * other x86 OF/DT parsers to create such devices, too. However, they must
+ * start at offset 1 for this to work.
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/platform_data/simplefb.h>
+#include <linux/platform_device.h>
+#include <linux/screen_info.h>
+#include <asm/sysfb.h>
+
+static __init int sysfb_init(void)
+{
+ struct screen_info *si = &screen_info;
+ struct simplefb_platform_data mode;
+ struct platform_device *pd;
+ const char *name;
+ bool compatible;
+ int ret;
+
+ sysfb_apply_efi_quirks();
+
+ /* try to create a simple-framebuffer device */
+ compatible = parse_mode(si, &mode);
+ if (compatible) {
+ ret = create_simplefb(si, &mode);
+ if (!ret)
+ return 0;
+ }
+
+ /* if the FB is incompatible, create a legacy framebuffer device */
+ if (si->orig_video_isVGA == VIDEO_TYPE_EFI)
+ name = "efi-framebuffer";
+ else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
+ name = "vesa-framebuffer";
+ else
+ name = "platform-framebuffer";
+
+ pd = platform_device_register_resndata(NULL, name, 0,
+ NULL, 0, si, sizeof(*si));
+ return IS_ERR(pd) ? PTR_ERR(pd) : 0;
+}
+
+/* must execute after PCI subsystem for EFI quirks */
+device_initcall(sysfb_init);
diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c
new file mode 100644
index 00000000000..b285d4e8c68
--- /dev/null
+++ b/arch/x86/kernel/sysfb_efi.c
@@ -0,0 +1,214 @@
+/*
+ * Generic System Framebuffers on x86
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * EFI Quirks Copyright (c) 2006 Edgar Hucek <gimli@dark-green.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+/*
+ * EFI Quirks
+ * Several EFI systems do not correctly advertise their boot framebuffers.
+ * Hence, we use this static table of known broken machines and fix up the
+ * information so framebuffer drivers can load corectly.
+ */
+
+#include <linux/dmi.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
+#include <linux/screen_info.h>
+#include <video/vga.h>
+#include <asm/sysfb.h>
+
+enum {
+ OVERRIDE_NONE = 0x0,
+ OVERRIDE_BASE = 0x1,
+ OVERRIDE_STRIDE = 0x2,
+ OVERRIDE_HEIGHT = 0x4,
+ OVERRIDE_WIDTH = 0x8,
+};
+
+struct efifb_dmi_info efifb_dmi_list[] = {
+ [M_I17] = { "i17", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_I20] = { "i20", 0x80010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, /* guess */
+ [M_I20_SR] = { "imac7", 0x40010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE },
+ [M_I24] = { "i24", 0x80010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, /* guess */
+ [M_I24_8_1] = { "imac8", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_I24_10_1] = { "imac10", 0xc0010000, 2048 * 4, 1920, 1080, OVERRIDE_NONE },
+ [M_I27_11_1] = { "imac11", 0xc0010000, 2560 * 4, 2560, 1440, OVERRIDE_NONE },
+ [M_MINI]= { "mini", 0x80000000, 2048 * 4, 1024, 768, OVERRIDE_NONE },
+ [M_MINI_3_1] = { "mini31", 0x40010000, 1024 * 4, 1024, 768, OVERRIDE_NONE },
+ [M_MINI_4_1] = { "mini41", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MB] = { "macbook", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MB_5_1] = { "macbook51", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MB_6_1] = { "macbook61", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MB_7_1] = { "macbook71", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MBA] = { "mba", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ /* 11" Macbook Air 3,1 passes the wrong stride */
+ [M_MBA_3] = { "mba3", 0, 2048 * 4, 0, 0, OVERRIDE_STRIDE },
+ [M_MBP] = { "mbp", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_2] = { "mbp2", 0, 0, 0, 0, OVERRIDE_NONE }, /* placeholder */
+ [M_MBP_2_2] = { "mbp22", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_SR] = { "mbp3", 0x80030000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_4] = { "mbp4", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MBP_5_1] = { "mbp51", 0xc0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_5_2] = { "mbp52", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MBP_5_3] = { "mbp53", 0xd0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_6_1] = { "mbp61", 0x90030000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MBP_6_2] = { "mbp62", 0x90030000, 2048 * 4, 1680, 1050, OVERRIDE_NONE },
+ [M_MBP_7_1] = { "mbp71", 0xc0010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MBP_8_2] = { "mbp82", 0x90010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE }
+};
+
+#define choose_value(dmivalue, fwvalue, field, flags) ({ \
+ typeof(fwvalue) _ret_ = fwvalue; \
+ if ((flags) & (field)) \
+ _ret_ = dmivalue; \
+ else if ((fwvalue) == 0) \
+ _ret_ = dmivalue; \
+ _ret_; \
+ })
+
+static int __init efifb_set_system(const struct dmi_system_id *id)
+{
+ struct efifb_dmi_info *info = id->driver_data;
+
+ if (info->base == 0 && info->height == 0 && info->width == 0 &&
+ info->stride == 0)
+ return 0;
+
+ /* Trust the bootloader over the DMI tables */
+ if (screen_info.lfb_base == 0) {
+#if defined(CONFIG_PCI)
+ struct pci_dev *dev = NULL;
+ int found_bar = 0;
+#endif
+ if (info->base) {
+ screen_info.lfb_base = choose_value(info->base,
+ screen_info.lfb_base, OVERRIDE_BASE,
+ info->flags);
+
+#if defined(CONFIG_PCI)
+ /* make sure that the address in the table is actually
+ * on a VGA device's PCI BAR */
+
+ for_each_pci_dev(dev) {
+ int i;
+ if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
+ continue;
+ for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+ resource_size_t start, end;
+
+ start = pci_resource_start(dev, i);
+ if (start == 0)
+ break;
+ end = pci_resource_end(dev, i);
+ if (screen_info.lfb_base >= start &&
+ screen_info.lfb_base < end) {
+ found_bar = 1;
+ }
+ }
+ }
+ if (!found_bar)
+ screen_info.lfb_base = 0;
+#endif
+ }
+ }
+ if (screen_info.lfb_base) {
+ screen_info.lfb_linelength = choose_value(info->stride,
+ screen_info.lfb_linelength, OVERRIDE_STRIDE,
+ info->flags);
+ screen_info.lfb_width = choose_value(info->width,
+ screen_info.lfb_width, OVERRIDE_WIDTH,
+ info->flags);
+ screen_info.lfb_height = choose_value(info->height,
+ screen_info.lfb_height, OVERRIDE_HEIGHT,
+ info->flags);
+ if (screen_info.orig_video_isVGA == 0)
+ screen_info.orig_video_isVGA = VIDEO_TYPE_EFI;
+ } else {
+ screen_info.lfb_linelength = 0;
+ screen_info.lfb_width = 0;
+ screen_info.lfb_height = 0;
+ screen_info.orig_video_isVGA = 0;
+ return 0;
+ }
+
+ printk(KERN_INFO "efifb: dmi detected %s - framebuffer at 0x%08x "
+ "(%dx%d, stride %d)\n", id->ident,
+ screen_info.lfb_base, screen_info.lfb_width,
+ screen_info.lfb_height, screen_info.lfb_linelength);
+
+ return 1;
+}
+
+#define EFIFB_DMI_SYSTEM_ID(vendor, name, enumid) \
+ { \
+ efifb_set_system, \
+ name, \
+ { \
+ DMI_MATCH(DMI_BIOS_VENDOR, vendor), \
+ DMI_MATCH(DMI_PRODUCT_NAME, name) \
+ }, \
+ &efifb_dmi_list[enumid] \
+ }
+
+static const struct dmi_system_id efifb_dmi_system_table[] __initconst = {
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac4,1", M_I17),
+ /* At least one of these two will be right; maybe both? */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac5,1", M_I20),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac5,1", M_I20),
+ /* At least one of these two will be right; maybe both? */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac6,1", M_I24),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac6,1", M_I24),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac7,1", M_I20_SR),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac8,1", M_I24_8_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac10,1", M_I24_10_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac11,1", M_I27_11_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "Macmini1,1", M_MINI),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini3,1", M_MINI_3_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini4,1", M_MINI_4_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook1,1", M_MB),
+ /* At least one of these two will be right; maybe both? */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook2,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook2,1", M_MB),
+ /* At least one of these two will be right; maybe both? */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook3,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook3,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook4,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook5,1", M_MB_5_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook6,1", M_MB_6_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook7,1", M_MB_7_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir1,1", M_MBA),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir3,1", M_MBA_3),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro1,1", M_MBP),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,1", M_MBP_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,2", M_MBP_2_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro2,1", M_MBP_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro3,1", M_MBP_SR),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro3,1", M_MBP_SR),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro4,1", M_MBP_4),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,1", M_MBP_5_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,2", M_MBP_5_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,3", M_MBP_5_3),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,1", M_MBP_6_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,2", M_MBP_6_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro7,1", M_MBP_7_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro8,2", M_MBP_8_2),
+ {},
+};
+
+__init void sysfb_apply_efi_quirks(void)
+{
+ if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI ||
+ !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS))
+ dmi_check_system(efifb_dmi_system_table);
+}
diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
new file mode 100644
index 00000000000..86179d40989
--- /dev/null
+++ b/arch/x86/kernel/sysfb_simplefb.c
@@ -0,0 +1,95 @@
+/*
+ * Generic System Framebuffers on x86
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+/*
+ * simple-framebuffer probing
+ * Try to convert "screen_info" into a "simple-framebuffer" compatible mode.
+ * If the mode is incompatible, we return "false" and let the caller create
+ * legacy nodes instead.
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/platform_data/simplefb.h>
+#include <linux/platform_device.h>
+#include <linux/screen_info.h>
+#include <asm/sysfb.h>
+
+static const char simplefb_resname[] = "BOOTFB";
+static const struct simplefb_format formats[] = SIMPLEFB_FORMATS;
+
+/* try parsing x86 screen_info into a simple-framebuffer mode struct */
+__init bool parse_mode(const struct screen_info *si,
+ struct simplefb_platform_data *mode)
+{
+ const struct simplefb_format *f;
+ __u8 type;
+ unsigned int i;
+
+ type = si->orig_video_isVGA;
+ if (type != VIDEO_TYPE_VLFB && type != VIDEO_TYPE_EFI)
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(formats); ++i) {
+ f = &formats[i];
+ if (si->lfb_depth == f->bits_per_pixel &&
+ si->red_size == f->red.length &&
+ si->red_pos == f->red.offset &&
+ si->green_size == f->green.length &&
+ si->green_pos == f->green.offset &&
+ si->blue_size == f->blue.length &&
+ si->blue_pos == f->blue.offset &&
+ si->rsvd_size == f->transp.length &&
+ si->rsvd_pos == f->transp.offset) {
+ mode->format = f->name;
+ mode->width = si->lfb_width;
+ mode->height = si->lfb_height;
+ mode->stride = si->lfb_linelength;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+__init int create_simplefb(const struct screen_info *si,
+ const struct simplefb_platform_data *mode)
+{
+ struct platform_device *pd;
+ struct resource res;
+ unsigned long len;
+
+ /* don't use lfb_size as it may contain the whole VMEM instead of only
+ * the part that is occupied by the framebuffer */
+ len = mode->height * mode->stride;
+ len = PAGE_ALIGN(len);
+ if (len > (u64)si->lfb_size << 16) {
+ printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
+ return -EINVAL;
+ }
+
+ /* setup IORESOURCE_MEM as framebuffer memory */
+ memset(&res, 0, sizeof(res));
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ res.name = simplefb_resname;
+ res.start = si->lfb_base;
+ res.end = si->lfb_base + len - 1;
+ if (res.end <= res.start)
+ return -EINVAL;
+
+ pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0,
+ &res, 1, mode, sizeof(*mode));
+ if (IS_ERR(pd))
+ return PTR_ERR(pd);
+
+ return 0;
+}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index e2410e27f97..91a4496db43 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -31,8 +31,9 @@
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/tboot.h>
+#include <linux/debugfs.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
#include <asm/processor.h>
#include <asm/bootparam.h>
#include <asm/pgtable.h>
@@ -44,7 +45,7 @@
#include <asm/e820.h>
#include <asm/io.h>
-#include "acpi/realmode/wakeup.h"
+#include "../realmode/rm/wakeup.h"
/* Global pointer to shared data; NULL means no measured launch. */
struct tboot *tboot __read_mostly;
@@ -201,7 +202,8 @@ static int tboot_setup_sleep(void)
add_mac_region(e820.map[i].addr, e820.map[i].size);
}
- tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
+ tboot->acpi_sinfo.kernel_s3_resume_vector =
+ real_mode_header->wakeup_start;
return 0;
}
@@ -272,7 +274,7 @@ static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
offsetof(struct acpi_table_facs, firmware_waking_vector);
}
-void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
+static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
{
static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
/* S0,1,2: */ -1, -1, -1,
@@ -281,7 +283,7 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
/* S5: */ TB_SHUTDOWN_S5 };
if (!tboot_enabled())
- return;
+ return 0;
tboot_copy_fadt(&acpi_gbl_FADT);
tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
@@ -292,10 +294,20 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
if (sleep_state >= ACPI_S_STATE_COUNT ||
acpi_shutdown_map[sleep_state] == -1) {
pr_warning("unsupported sleep state 0x%x\n", sleep_state);
- return;
+ return -1;
}
tboot_shutdown(acpi_shutdown_map[sleep_state]);
+ return 0;
+}
+
+static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)");
+ return -ENODEV;
}
static atomic_t ap_wfs_count;
@@ -317,8 +329,8 @@ static int tboot_wait_for_aps(int num_aps)
return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
}
-static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int tboot_cpu_callback(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
{
switch (action) {
case CPU_DYING:
@@ -331,11 +343,78 @@ static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb,
return NOTIFY_OK;
}
-static struct notifier_block tboot_cpu_notifier __cpuinitdata =
+static struct notifier_block tboot_cpu_notifier =
{
.notifier_call = tboot_cpu_callback,
};
+#ifdef CONFIG_DEBUG_FS
+
+#define TBOOT_LOG_UUID { 0x26, 0x25, 0x19, 0xc0, 0x30, 0x6b, 0xb4, 0x4d, \
+ 0x4c, 0x84, 0xa3, 0xe9, 0x53, 0xb8, 0x81, 0x74 }
+
+#define TBOOT_SERIAL_LOG_ADDR 0x60000
+#define TBOOT_SERIAL_LOG_SIZE 0x08000
+#define LOG_MAX_SIZE_OFF 16
+#define LOG_BUF_OFF 24
+
+static uint8_t tboot_log_uuid[16] = TBOOT_LOG_UUID;
+
+static ssize_t tboot_log_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos)
+{
+ void __iomem *log_base;
+ u8 log_uuid[16];
+ u32 max_size;
+ void *kbuf;
+ int ret = -EFAULT;
+
+ log_base = ioremap_nocache(TBOOT_SERIAL_LOG_ADDR, TBOOT_SERIAL_LOG_SIZE);
+ if (!log_base)
+ return ret;
+
+ memcpy_fromio(log_uuid, log_base, sizeof(log_uuid));
+ if (memcmp(&tboot_log_uuid, log_uuid, sizeof(log_uuid)))
+ goto err_iounmap;
+
+ max_size = readl(log_base + LOG_MAX_SIZE_OFF);
+ if (*ppos >= max_size) {
+ ret = 0;
+ goto err_iounmap;
+ }
+
+ if (*ppos + count > max_size)
+ count = max_size - *ppos;
+
+ kbuf = kmalloc(count, GFP_KERNEL);
+ if (!kbuf) {
+ ret = -ENOMEM;
+ goto err_iounmap;
+ }
+
+ memcpy_fromio(kbuf, log_base + LOG_BUF_OFF + *ppos, count);
+ if (copy_to_user(user_buf, kbuf, count))
+ goto err_kfree;
+
+ *ppos += count;
+
+ ret = count;
+
+err_kfree:
+ kfree(kbuf);
+
+err_iounmap:
+ iounmap(log_base);
+
+ return ret;
+}
+
+static const struct file_operations tboot_log_fops = {
+ .read = tboot_log_read,
+ .llseek = default_llseek,
+};
+
+#endif /* CONFIG_DEBUG_FS */
+
static __init int tboot_late_init(void)
{
if (!tboot_enabled())
@@ -345,6 +424,14 @@ static __init int tboot_late_init(void)
atomic_set(&ap_wfs_count, 0);
register_hotcpu_notifier(&tboot_cpu_notifier);
+
+#ifdef CONFIG_DEBUG_FS
+ debugfs_create_file("tboot_log", S_IRUSR,
+ arch_debugfs_dir, NULL, &tboot_log_fops);
+#endif
+
+ acpi_os_set_prepare_sleep(&tboot_sleep);
+ acpi_os_set_prepare_extended_sleep(&tboot_extended_sleep);
return 0;
}
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
index 9e540fee700..ab40954e113 100644
--- a/arch/x86/kernel/tce_64.c
+++ b/arch/x86/kernel/tce_64.c
@@ -34,6 +34,7 @@
#include <asm/tce.h>
#include <asm/calgary.h>
#include <asm/proto.h>
+#include <asm/cacheflush.h>
/* flush a tce at 'tceaddr' to main memory */
static inline void flush_tce(void* tceaddr)
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index c29e235792a..b79133abda4 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
+#include <asm/asm.h>
int rodata_test(void)
{
@@ -42,14 +43,7 @@ int rodata_test(void)
".section .fixup,\"ax\"\n"
"2: jmp 1b\n"
".previous\n"
- ".section __ex_table,\"a\"\n"
- " .align 16\n"
-#ifdef CONFIG_X86_32
- " .long 0b,2b\n"
-#else
- " .quad 0b,2b\n"
-#endif
- ".previous"
+ _ASM_EXTABLE(0b,2b)
: [rslt] "=r" (result)
: [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
);
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index dd5fbf4101f..bf7ef5ce29d 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -14,7 +14,6 @@
#include <linux/i8253.h>
#include <linux/time.h>
#include <linux/export.h>
-#include <linux/mca.h>
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
@@ -24,7 +23,7 @@
#include <asm/time.h>
#ifdef CONFIG_X86_64
-DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
+__visible DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
#endif
unsigned long profile_pc(struct pt_regs *regs)
@@ -57,21 +56,13 @@ EXPORT_SYMBOL(profile_pc);
*/
static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
- /* Keep nmi watchdog up to date */
- inc_irq_stat(irq0_irqs);
-
global_clock_event->event_handler(global_clock_event);
-
- /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
- if (MCA_bus)
- outb_p(inb_p(0x61)| 0x80, 0x61);
-
return IRQ_HANDLED;
}
static struct irqaction irq0 = {
.handler = timer_interrupt,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
+ .flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
.name = "timer"
};
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 6bb7b8579e7..f7fec09e3e3 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -3,14 +3,13 @@
#include <linux/sched.h>
#include <linux/user.h>
#include <linux/regset.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
-#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/proto.h>
-#include <asm/syscalls.h>
#include "tls.h"
@@ -90,11 +89,9 @@ int do_set_thread_area(struct task_struct *p, int idx,
return 0;
}
-asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
+SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, u_info)
{
- int ret = do_set_thread_area(current, -1, u_info, 1);
- asmlinkage_protect(1, ret, u_info);
- return ret;
+ return do_set_thread_area(current, -1, u_info, 1);
}
@@ -140,11 +137,9 @@ int do_get_thread_area(struct task_struct *p, int idx,
return 0;
}
-asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
+SYSCALL_DEFINE1(get_thread_area, struct user_desc __user *, u_info)
{
- int ret = do_get_thread_area(current, -1, u_info);
- asmlinkage_protect(1, ret, u_info);
- return ret;
+ return do_get_thread_area(current, -1, u_info);
}
int regset_tls_active(struct task_struct *target,
@@ -163,7 +158,7 @@ int regset_tls_get(struct task_struct *target, const struct user_regset *regset,
{
const struct desc_struct *tls;
- if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+ if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
(pos % sizeof(struct user_desc)) != 0 ||
(count % sizeof(struct user_desc)) != 0)
return -EINVAL;
@@ -198,7 +193,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
const struct user_desc *info;
- if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+ if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
(pos % sizeof(struct user_desc)) != 0 ||
(count % sizeof(struct user_desc)) != 0)
return -EINVAL;
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 76ee97709a0..649b010da00 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -30,23 +30,113 @@
#include <linux/mmzone.h>
#include <linux/init.h>
#include <linux/smp.h>
+#include <linux/irq.h>
#include <asm/cpu.h>
static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
#ifdef CONFIG_HOTPLUG_CPU
+
+#ifdef CONFIG_BOOTPARAM_HOTPLUG_CPU0
+static int cpu0_hotpluggable = 1;
+#else
+static int cpu0_hotpluggable;
+static int __init enable_cpu0_hotplug(char *str)
+{
+ cpu0_hotpluggable = 1;
+ return 1;
+}
+
+__setup("cpu0_hotplug", enable_cpu0_hotplug);
+#endif
+
+#ifdef CONFIG_DEBUG_HOTPLUG_CPU0
+/*
+ * This function offlines a CPU as early as possible and allows userspace to
+ * boot up without the CPU. The CPU can be onlined back by user after boot.
+ *
+ * This is only called for debugging CPU offline/online feature.
+ */
+int __ref _debug_hotplug_cpu(int cpu, int action)
+{
+ struct device *dev = get_cpu_device(cpu);
+ int ret;
+
+ if (!cpu_is_hotpluggable(cpu))
+ return -EINVAL;
+
+ lock_device_hotplug();
+
+ switch (action) {
+ case 0:
+ ret = cpu_down(cpu);
+ if (!ret) {
+ pr_info("CPU %u is now offline\n", cpu);
+ dev->offline = true;
+ kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+ } else
+ pr_debug("Can't offline CPU%d.\n", cpu);
+ break;
+ case 1:
+ ret = cpu_up(cpu);
+ if (!ret) {
+ dev->offline = false;
+ kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+ } else {
+ pr_debug("Can't online CPU%d.\n", cpu);
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ unlock_device_hotplug();
+
+ return ret;
+}
+
+static int __init debug_hotplug_cpu(void)
+{
+ _debug_hotplug_cpu(0, 0);
+ return 0;
+}
+
+late_initcall_sync(debug_hotplug_cpu);
+#endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */
+
int __ref arch_register_cpu(int num)
{
+ struct cpuinfo_x86 *c = &cpu_data(num);
+
+ /*
+ * Currently CPU0 is only hotpluggable on Intel platforms. Other
+ * vendors can add hotplug support later.
+ */
+ if (c->x86_vendor != X86_VENDOR_INTEL)
+ cpu0_hotpluggable = 0;
+
/*
- * CPU0 cannot be offlined due to several
- * restrictions and assumptions in kernel. This basically
- * doesn't add a control file, one cannot attempt to offline
- * BSP.
+ * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate
+ * depends on BSP. PIC interrupts depend on BSP.
*
- * Also certain PCI quirks require not to enable hotplug control
- * for all CPU's.
+ * If the BSP depencies are under control, one can tell kernel to
+ * enable BSP hotplug. This basically adds a control file and
+ * one can attempt to offline BSP.
*/
- if (num)
+ if (num == 0 && cpu0_hotpluggable) {
+ unsigned int irq;
+ /*
+ * We won't take down the boot processor on i386 if some
+ * interrupts only are able to be serviced by the BSP in PIC.
+ */
+ for_each_active_irq(irq) {
+ if (!IO_APIC_IRQ(irq) && irq_has_action(irq)) {
+ cpu0_hotpluggable = 0;
+ break;
+ }
+ }
+ }
+ if (num || cpu0_hotpluggable)
per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
new file mode 100644
index 00000000000..25b993729f9
--- /dev/null
+++ b/arch/x86/kernel/trace_clock.c
@@ -0,0 +1,21 @@
+/*
+ * X86 trace clocks
+ */
+#include <asm/trace_clock.h>
+#include <asm/barrier.h>
+#include <asm/msr.h>
+
+/*
+ * trace_clock_x86_tsc(): A clock that is just the cycle counter.
+ *
+ * Unlike the other clocks, this is not in nanoseconds.
+ */
+u64 notrace trace_clock_x86_tsc(void)
+{
+ u64 ret;
+
+ rdtsc_barrier();
+ rdtscll(ret);
+
+ return ret;
+}
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
new file mode 100644
index 00000000000..1c113db9ed5
--- /dev/null
+++ b/arch/x86/kernel/tracepoint.c
@@ -0,0 +1,59 @@
+/*
+ * Code for supporting irq vector tracepoints.
+ *
+ * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com>
+ *
+ */
+#include <asm/hw_irq.h>
+#include <asm/desc.h>
+#include <linux/atomic.h>
+
+atomic_t trace_idt_ctr = ATOMIC_INIT(0);
+struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
+ (unsigned long) trace_idt_table };
+
+/* No need to be aligned, but done to keep all IDTs defined the same way. */
+gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
+
+static int trace_irq_vector_refcount;
+static DEFINE_MUTEX(irq_vector_mutex);
+
+static void set_trace_idt_ctr(int val)
+{
+ atomic_set(&trace_idt_ctr, val);
+ /* Ensure the trace_idt_ctr is set before sending IPI */
+ wmb();
+}
+
+static void switch_idt(void *arg)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ load_current_idt();
+ local_irq_restore(flags);
+}
+
+void trace_irq_vector_regfunc(void)
+{
+ mutex_lock(&irq_vector_mutex);
+ if (!trace_irq_vector_refcount) {
+ set_trace_idt_ctr(1);
+ smp_call_function(switch_idt, NULL, 0);
+ switch_idt(NULL);
+ }
+ trace_irq_vector_refcount++;
+ mutex_unlock(&irq_vector_mutex);
+}
+
+void trace_irq_vector_unregfunc(void)
+{
+ mutex_lock(&irq_vector_mutex);
+ trace_irq_vector_refcount--;
+ if (!trace_irq_vector_refcount) {
+ set_trace_idt_ctr(0);
+ smp_call_function(switch_idt, NULL, 0);
+ switch_idt(NULL);
+ }
+ mutex_unlock(&irq_vector_mutex);
+}
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
deleted file mode 100644
index a73b61055ad..00000000000
--- a/arch/x86/kernel/trampoline.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <linux/io.h>
-#include <linux/memblock.h>
-
-#include <asm/trampoline.h>
-#include <asm/cacheflush.h>
-#include <asm/pgtable.h>
-
-unsigned char *x86_trampoline_base;
-
-void __init setup_trampolines(void)
-{
- phys_addr_t mem;
- size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
-
- /* Has to be in very low memory so we can execute real-mode AP code. */
- mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
- if (!mem)
- panic("Cannot allocate trampoline\n");
-
- x86_trampoline_base = __va(mem);
- memblock_reserve(mem, size);
-
- printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
- x86_trampoline_base, (unsigned long long)mem, size);
-
- memcpy(x86_trampoline_base, x86_trampoline_start, size);
-}
-
-/*
- * setup_trampolines() gets called very early, to guarantee the
- * availability of low memory. This is before the proper kernel page
- * tables are set up, so we cannot set page permissions in that
- * function. Thus, we use an arch_initcall instead.
- */
-static int __init configure_trampolines(void)
-{
- size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
-
- set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT);
- return 0;
-}
-arch_initcall(configure_trampolines);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
deleted file mode 100644
index 451c0a7ef7f..00000000000
--- a/arch/x86/kernel/trampoline_32.S
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- *
- * Trampoline.S Derived from Setup.S by Linus Torvalds
- *
- * 4 Jan 1997 Michael Chastain: changed to gnu as.
- *
- * This is only used for booting secondary CPUs in SMP machine
- *
- * Entry: CS:IP point to the start of our code, we are
- * in real mode with no stack, but the rest of the
- * trampoline page to make our stack and everything else
- * is a mystery.
- *
- * We jump into arch/x86/kernel/head_32.S.
- *
- * On entry to trampoline_data, the processor is in real mode
- * with 16-bit addressing and 16-bit data. CS has some value
- * and IP is zero. Thus, data addresses need to be absolute
- * (no relocation) and are taken with regard to r_base.
- *
- * If you work on this file, check the object module with
- * objdump --reloc to make sure there are no relocation
- * entries except for:
- *
- * TYPE VALUE
- * R_386_32 startup_32_smp
- * R_386_32 boot_gdt
- */
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/segment.h>
-#include <asm/page_types.h>
-
-#ifdef CONFIG_SMP
-
- .section ".x86_trampoline","a"
- .balign PAGE_SIZE
- .code16
-
-ENTRY(trampoline_data)
-r_base = .
- wbinvd # Needed for NUMA-Q should be harmless for others
- mov %cs, %ax # Code and data in the same place
- mov %ax, %ds
-
- cli # We should be safe anyway
-
- movl $0xA5A5A5A5, trampoline_status - r_base
- # write marker for master knows we're running
-
- /* GDT tables in non default location kernel can be beyond 16MB and
- * lgdt will not be able to load the address as in real mode default
- * operand size is 16bit. Use lgdtl instead to force operand size
- * to 32 bit.
- */
-
- lidtl boot_idt_descr - r_base # load idt with 0, 0
- lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate
-
- xor %ax, %ax
- inc %ax # protected mode (PE) bit
- lmsw %ax # into protected mode
- # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
- ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
-
- # These need to be in the same 64K segment as the above;
- # hence we don't use the boot_gdt_descr defined in head.S
-boot_gdt_descr:
- .word __BOOT_DS + 7 # gdt limit
- .long boot_gdt - __PAGE_OFFSET # gdt base
-
-boot_idt_descr:
- .word 0 # idt limit = 0
- .long 0 # idt base = 0L
-
-ENTRY(trampoline_status)
- .long 0
-
-.globl trampoline_end
-trampoline_end:
-
-#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
deleted file mode 100644
index 09ff51799e9..00000000000
--- a/arch/x86/kernel/trampoline_64.S
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- *
- * Trampoline.S Derived from Setup.S by Linus Torvalds
- *
- * 4 Jan 1997 Michael Chastain: changed to gnu as.
- * 15 Sept 2005 Eric Biederman: 64bit PIC support
- *
- * Entry: CS:IP point to the start of our code, we are
- * in real mode with no stack, but the rest of the
- * trampoline page to make our stack and everything else
- * is a mystery.
- *
- * On entry to trampoline_data, the processor is in real mode
- * with 16-bit addressing and 16-bit data. CS has some value
- * and IP is zero. Thus, data addresses need to be absolute
- * (no relocation) and are taken with regard to r_base.
- *
- * With the addition of trampoline_level4_pgt this code can
- * now enter a 64bit kernel that lives at arbitrary 64bit
- * physical addresses.
- *
- * If you work on this file, check the object module with objdump
- * --full-contents --reloc to make sure there are no relocation
- * entries.
- */
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/pgtable_types.h>
-#include <asm/page_types.h>
-#include <asm/msr.h>
-#include <asm/segment.h>
-#include <asm/processor-flags.h>
-
- .section ".x86_trampoline","a"
- .balign PAGE_SIZE
- .code16
-
-ENTRY(trampoline_data)
-r_base = .
- cli # We should be safe anyway
- wbinvd
- mov %cs, %ax # Code and data in the same place
- mov %ax, %ds
- mov %ax, %es
- mov %ax, %ss
-
-
- movl $0xA5A5A5A5, trampoline_status - r_base
- # write marker for master knows we're running
-
- # Setup stack
- movw $(trampoline_stack_end - r_base), %sp
-
- call verify_cpu # Verify the cpu supports long mode
- testl %eax, %eax # Check for return code
- jnz no_longmode
-
- mov %cs, %ax
- movzx %ax, %esi # Find the 32bit trampoline location
- shll $4, %esi
-
- # Fixup the absolute vectors
- leal (startup_32 - r_base)(%esi), %eax
- movl %eax, startup_32_vector - r_base
- leal (startup_64 - r_base)(%esi), %eax
- movl %eax, startup_64_vector - r_base
- leal (tgdt - r_base)(%esi), %eax
- movl %eax, (tgdt + 2 - r_base)
-
- /*
- * GDT tables in non default location kernel can be beyond 16MB and
- * lgdt will not be able to load the address as in real mode default
- * operand size is 16bit. Use lgdtl instead to force operand size
- * to 32 bit.
- */
-
- lidtl tidt - r_base # load idt with 0, 0
- lgdtl tgdt - r_base # load gdt with whatever is appropriate
-
- mov $X86_CR0_PE, %ax # protected mode (PE) bit
- lmsw %ax # into protected mode
-
- # flush prefetch and jump to startup_32
- ljmpl *(startup_32_vector - r_base)
-
- .code32
- .balign 4
-startup_32:
- movl $__KERNEL_DS, %eax # Initialize the %ds segment register
- movl %eax, %ds
-
- movl $X86_CR4_PAE, %eax
- movl %eax, %cr4 # Enable PAE mode
-
- # Setup trampoline 4 level pagetables
- leal (trampoline_level4_pgt - r_base)(%esi), %eax
- movl %eax, %cr3
-
- movl $MSR_EFER, %ecx
- movl $(1 << _EFER_LME), %eax # Enable Long Mode
- xorl %edx, %edx
- wrmsr
-
- # Enable paging and in turn activate Long Mode
- # Enable protected mode
- movl $(X86_CR0_PG | X86_CR0_PE), %eax
- movl %eax, %cr0
-
- /*
- * At this point we're in long mode but in 32bit compatibility mode
- * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
- * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
- * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
- */
- ljmp *(startup_64_vector - r_base)(%esi)
-
- .code64
- .balign 4
-startup_64:
- # Now jump into the kernel using virtual addresses
- movq $secondary_startup_64, %rax
- jmp *%rax
-
- .code16
-no_longmode:
- hlt
- jmp no_longmode
-#include "verify_cpu.S"
-
- .balign 4
- # Careful these need to be in the same 64K segment as the above;
-tidt:
- .word 0 # idt limit = 0
- .word 0, 0 # idt base = 0L
-
- # Duplicate the global descriptor table
- # so the kernel can live anywhere
- .balign 4
-tgdt:
- .short tgdt_end - tgdt # gdt limit
- .long tgdt - r_base
- .short 0
- .quad 0x00cf9b000000ffff # __KERNEL32_CS
- .quad 0x00af9b000000ffff # __KERNEL_CS
- .quad 0x00cf93000000ffff # __KERNEL_DS
-tgdt_end:
-
- .balign 4
-startup_32_vector:
- .long startup_32 - r_base
- .word __KERNEL32_CS, 0
-
- .balign 4
-startup_64_vector:
- .long startup_64 - r_base
- .word __KERNEL_CS, 0
-
- .balign 4
-ENTRY(trampoline_status)
- .long 0
-
-trampoline_stack:
- .org 0x1000
-trampoline_stack_end:
-ENTRY(trampoline_level4_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .fill 510,8,0
- .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
-
-ENTRY(trampoline_end)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 77da5b475ad..0d0e922fafc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -9,6 +9,10 @@
/*
* Handle hardware traps and faults.
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/context_tracking.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/spinlock.h>
@@ -19,6 +23,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/ptrace.h>
+#include <linux/uprobes.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
@@ -37,10 +42,6 @@
#include <linux/eisa.h>
#endif
-#ifdef CONFIG_MCA
-#include <linux/mca.h>
-#endif
-
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
@@ -50,34 +51,33 @@
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <linux/atomic.h>
-#include <asm/system.h>
+#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/mce.h>
-
+#include <asm/fixmap.h>
#include <asm/mach_traps.h>
+#include <asm/alternative.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
+
+/* No need to be aligned, but done to keep all IDTs defined the same way. */
+gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss;
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
asmlinkage int system_call(void);
-
-/* Do we ignore FPU interrupts ? */
-char ignore_fpu_irq;
-
-/*
- * The IDT has to be page-aligned to simplify the Pentium
- * F0 0F bug workaround.
- */
-gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, };
#endif
+/* Must be page-aligned because the real IDT is used in a fixmap. */
+gate_desc idt_table[NR_VECTORS] __page_aligned_bss;
+
DECLARE_BITMAP(used_vectors, NR_VECTORS);
EXPORT_SYMBOL_GPL(used_vectors);
@@ -89,7 +89,7 @@ static inline void conditional_sti(struct pt_regs *regs)
static inline void preempt_conditional_sti(struct pt_regs *regs)
{
- inc_preempt_count();
+ preempt_count_inc();
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
@@ -104,35 +104,81 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
{
if (regs->flags & X86_EFLAGS_IF)
local_irq_disable();
- dec_preempt_count();
+ preempt_count_dec();
}
-static void __kprobes
-do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
- long error_code, siginfo_t *info)
+static nokprobe_inline int
+do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
+ struct pt_regs *regs, long error_code)
{
- struct task_struct *tsk = current;
-
#ifdef CONFIG_X86_32
if (regs->flags & X86_VM_MASK) {
/*
- * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
+ * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
* On nmi (interrupt 2), do_trap should not be called.
*/
- if (trapnr < 6)
- goto vm86_trap;
- goto trap_signal;
+ if (trapnr < X86_TRAP_UD) {
+ if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
+ error_code, trapnr))
+ return 0;
+ }
+ return -1;
}
#endif
+ if (!user_mode(regs)) {
+ if (!fixup_exception(regs)) {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = trapnr;
+ die(str, regs, error_code);
+ }
+ return 0;
+ }
- if (!user_mode(regs))
- goto kernel_trap;
+ return -1;
+}
-#ifdef CONFIG_X86_32
-trap_signal:
-#endif
+static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr,
+ siginfo_t *info)
+{
+ unsigned long siaddr;
+ int sicode;
+
+ switch (trapnr) {
+ default:
+ return SEND_SIG_PRIV;
+
+ case X86_TRAP_DE:
+ sicode = FPE_INTDIV;
+ siaddr = uprobe_get_trap_addr(regs);
+ break;
+ case X86_TRAP_UD:
+ sicode = ILL_ILLOPN;
+ siaddr = uprobe_get_trap_addr(regs);
+ break;
+ case X86_TRAP_AC:
+ sicode = BUS_ADRALN;
+ siaddr = 0;
+ break;
+ }
+
+ info->si_signo = signr;
+ info->si_errno = 0;
+ info->si_code = sicode;
+ info->si_addr = (void __user *)siaddr;
+ return info;
+}
+
+static void
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+ long error_code, siginfo_t *info)
+{
+ struct task_struct *tsk = current;
+
+
+ if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
+ return;
/*
- * We want error_code and trap_no set for userspace faults and
+ * We want error_code and trap_nr set for userspace faults and
* kernelspace faults which result in die(), but not
* kernelspace faults which are fixed up. die() gives the
* process no chance to handle the signal and notice the
@@ -141,90 +187,71 @@ trap_signal:
* delivered, faults. See also do_general_protection below.
*/
tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
+ tsk->thread.trap_nr = trapnr;
#ifdef CONFIG_X86_64
if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
- tsk->comm, tsk->pid, str,
- regs->ip, regs->sp, error_code);
+ pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid, str,
+ regs->ip, regs->sp, error_code);
print_vma_addr(" in ", regs->ip);
- printk("\n");
+ pr_cont("\n");
}
#endif
- if (info)
- force_sig_info(signr, info, tsk);
- else
- force_sig(signr, tsk);
- return;
+ force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
+}
+NOKPROBE_SYMBOL(do_trap);
-kernel_trap:
- if (!fixup_exception(regs)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
- die(str, regs, error_code);
+static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
+ unsigned long trapnr, int signr)
+{
+ enum ctx_state prev_state = exception_enter();
+ siginfo_t info;
+
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
+ NOTIFY_STOP) {
+ conditional_sti(regs);
+ do_trap(trapnr, signr, str, regs, error_code,
+ fill_trap_info(regs, signr, trapnr, &info));
}
- return;
-#ifdef CONFIG_X86_32
-vm86_trap:
- if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
- error_code, trapnr))
- goto trap_signal;
- return;
-#endif
+ exception_exit(prev_state);
}
#define DO_ERROR(trapnr, signr, str, name) \
dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, NULL); \
-}
-
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
-{ \
- siginfo_t info; \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, &info); \
+ do_error_trap(regs, error_code, str, trapnr, signr); \
}
-DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-DO_ERROR(4, SIGSEGV, "overflow", overflow)
-DO_ERROR(5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
-DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error)
+DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
+DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds)
+DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op)
+DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun)
+DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
#ifdef CONFIG_X86_32
-DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
+DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
#endif
-DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
#ifdef CONFIG_X86_64
/* Runs on IST stack */
dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
{
+ enum ctx_state prev_state;
+
+ prev_state = exception_enter();
if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
- 12, SIGBUS) == NOTIFY_STOP)
- return;
- preempt_conditional_sti(regs);
- do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
- preempt_conditional_cli(regs);
+ X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) {
+ preempt_conditional_sti(regs);
+ do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
+ preempt_conditional_cli(regs);
+ }
+ exception_exit(prev_state);
}
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
@@ -232,12 +259,16 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
static const char str[] = "double fault";
struct task_struct *tsk = current;
+ exception_enter();
/* Return not checked because double check cannot be ignored */
- notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
+ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 8;
+ tsk->thread.trap_nr = X86_TRAP_DF;
+#ifdef CONFIG_DOUBLEFAULT
+ df_debug(regs, error_code);
+#endif
/*
* This is always a kernel trap and never fixable (and thus must
* never return).
@@ -247,69 +278,86 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
}
#endif
-dotraplinkage void __kprobes
+dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk;
+ enum ctx_state prev_state;
+ prev_state = exception_enter();
conditional_sti(regs);
#ifdef CONFIG_X86_32
- if (regs->flags & X86_VM_MASK)
- goto gp_in_vm86;
+ if (regs->flags & X86_VM_MASK) {
+ local_irq_enable();
+ handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+ goto exit;
+ }
#endif
tsk = current;
- if (!user_mode(regs))
- goto gp_in_kernel;
+ if (!user_mode(regs)) {
+ if (fixup_exception(regs))
+ goto exit;
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_GP;
+ if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
+ X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
+ die("general protection fault", regs, error_code);
+ goto exit;
+ }
tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
+ tsk->thread.trap_nr = X86_TRAP_GP;
if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
+ pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
tsk->comm, task_pid_nr(tsk),
regs->ip, regs->sp, error_code);
print_vma_addr(" in ", regs->ip);
- printk("\n");
+ pr_cont("\n");
}
- force_sig(SIGSEGV, tsk);
- return;
+ force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
+exit:
+ exception_exit(prev_state);
+}
+NOKPROBE_SYMBOL(do_general_protection);
-#ifdef CONFIG_X86_32
-gp_in_vm86:
- local_irq_enable();
- handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
- return;
-#endif
+/* May run on IST stack. */
+dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
+{
+ enum ctx_state prev_state;
-gp_in_kernel:
- if (fixup_exception(regs))
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /*
+ * ftrace must be first, everything else may cause a recursive crash.
+ * See note by declaration of modifying_ftrace_code in ftrace.c
+ */
+ if (unlikely(atomic_read(&modifying_ftrace_code)) &&
+ ftrace_int3_handler(regs))
return;
-
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
- if (notify_die(DIE_GPF, "general protection fault", regs,
- error_code, 13, SIGSEGV) == NOTIFY_STOP)
+#endif
+ if (poke_int3_handler(regs))
return;
- die("general protection fault", regs, error_code);
-}
-/* May run on IST stack. */
-dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
-{
+ prev_state = exception_enter();
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
- if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
- == NOTIFY_STOP)
- return;
+ if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+ SIGTRAP) == NOTIFY_STOP)
+ goto exit;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
- if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
- == NOTIFY_STOP)
- return;
+#ifdef CONFIG_KPROBES
+ if (kprobe_int3_handler(regs))
+ goto exit;
+#endif
+
+ if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+ SIGTRAP) == NOTIFY_STOP)
+ goto exit;
/*
* Let others (NMI) know that the debug stack is in use
@@ -317,10 +365,13 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
*/
debug_stack_usage_inc();
preempt_conditional_sti(regs);
- do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
+ do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
preempt_conditional_cli(regs);
debug_stack_usage_dec();
+exit:
+ exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
@@ -328,7 +379,7 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
* for scheduling or signal handling. The actual stack switch is done in
* entry.S
*/
-asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
+asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = eregs;
/* Did already sync */
@@ -347,6 +398,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
*regs = *eregs;
return regs;
}
+NOKPROBE_SYMBOL(sync_regs);
#endif
/*
@@ -373,13 +425,16 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
*
* May run on IST stack.
*/
-dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
+dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk = current;
+ enum ctx_state prev_state;
int user_icebp = 0;
unsigned long dr6;
int si_code;
+ prev_state = exception_enter();
+
get_debugreg(dr6, 6);
/* Filter out all the reserved bits which are preset to 1 */
@@ -395,7 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
/* Catch kmemcheck conditions first of all! */
if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
- return;
+ goto exit;
/* DR6 may or may not be cleared by the CPU */
set_debugreg(0, 6);
@@ -408,9 +463,14 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
/* Store the virtualized DR6 value */
tsk->thread.debugreg6 = dr6;
- if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
+#ifdef CONFIG_KPROBES
+ if (kprobe_debug_handler(regs))
+ goto exit;
+#endif
+
+ if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
SIGTRAP) == NOTIFY_STOP)
- return;
+ goto exit;
/*
* Let others (NMI) know that the debug stack is in use
@@ -422,11 +482,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
preempt_conditional_sti(regs);
if (regs->flags & X86_VM_MASK) {
- handle_vm86_trap((struct kernel_vm86_regs *) regs,
- error_code, 1);
+ handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
+ X86_TRAP_DB);
preempt_conditional_cli(regs);
debug_stack_usage_dec();
- return;
+ goto exit;
}
/*
@@ -447,20 +507,23 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
preempt_conditional_cli(regs);
debug_stack_usage_dec();
- return;
+exit:
+ exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_debug);
/*
* Note that we play around with the 'TS' bit in an attempt to get
* the correct behaviour even in the presence of the asynchronous
* IRQ13 behaviour
*/
-void math_error(struct pt_regs *regs, int error_code, int trapnr)
+static void math_error(struct pt_regs *regs, int error_code, int trapnr)
{
struct task_struct *task = current;
siginfo_t info;
unsigned short err;
- char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
+ char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
+ "simd exception";
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
return;
@@ -470,7 +533,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
{
if (!fixup_exception(regs)) {
task->thread.error_code = error_code;
- task->thread.trap_no = trapnr;
+ task->thread.trap_nr = trapnr;
die(str, regs, error_code);
}
return;
@@ -480,12 +543,12 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
* Save the info for the exception handler and clear the error.
*/
save_init_fpu(task);
- task->thread.trap_no = trapnr;
+ task->thread.trap_nr = trapnr;
task->thread.error_code = error_code;
info.si_signo = SIGFPE;
info.si_errno = 0;
- info.si_addr = (void __user *)regs->ip;
- if (trapnr == 16) {
+ info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
+ if (trapnr == X86_TRAP_MF) {
unsigned short cwd, swd;
/*
* (~cwd & swd) will mask out exceptions that are not set to unmasked
@@ -529,27 +592,32 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
info.si_code = FPE_FLTRES;
} else {
/*
- * If we're using IRQ 13, or supposedly even some trap 16
- * implementations, it's possible we get a spurious trap...
+ * If we're using IRQ 13, or supposedly even some trap
+ * X86_TRAP_MF implementations, it's possible
+ * we get a spurious trap, which is not an error.
*/
- return; /* Spurious trap, no error */
+ return;
}
force_sig_info(SIGFPE, &info, task);
}
dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
{
-#ifdef CONFIG_X86_32
- ignore_fpu_irq = 1;
-#endif
+ enum ctx_state prev_state;
- math_error(regs, error_code, 16);
+ prev_state = exception_enter();
+ math_error(regs, error_code, X86_TRAP_MF);
+ exception_exit(prev_state);
}
dotraplinkage void
do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
{
- math_error(regs, error_code, 19);
+ enum ctx_state prev_state;
+
+ prev_state = exception_enter();
+ math_error(regs, error_code, X86_TRAP_XF);
+ exception_exit(prev_state);
}
dotraplinkage void
@@ -558,50 +626,19 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
conditional_sti(regs);
#if 0
/* No need to warn about this any longer. */
- printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
+ pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
#endif
}
-asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
+asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void)
{
}
-asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
+asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void)
{
}
/*
- * This gets called with the process already owning the
- * FPU state, and with CR0.TS cleared. It just needs to
- * restore the FPU register state.
- */
-void __math_state_restore(struct task_struct *tsk)
-{
- /* We need a safe address that is cheap to find and that is already
- in L1. We've just brought in "tsk->thread.has_fpu", so use that */
-#define safe_address (tsk->thread.has_fpu)
-
- /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
- is pending. Clear the x87 state here by setting it to fixed
- values. safe_address is a random variable that should be in L1 */
- alternative_input(
- ASM_NOP8 ASM_NOP2,
- "emms\n\t" /* clear stack tags */
- "fildl %P[addr]", /* set F?P to defined value */
- X86_FEATURE_FXSAVE_LEAK,
- [addr] "m" (safe_address));
-
- /*
- * Paranoid restore. send a SIGSEGV if we fail to restore the state.
- */
- if (unlikely(restore_fpu_checking(tsk))) {
- __thread_fpu_end(tsk);
- force_sig(SIGSEGV, tsk);
- return;
- }
-}
-
-/*
* 'math_state_restore()' saves the current math information in the
* old math state array, and gets the new ones from the current task
*
@@ -631,15 +668,28 @@ void math_state_restore(void)
}
__thread_fpu_begin(tsk);
- __math_state_restore(tsk);
- tsk->fpu_counter++;
+ /*
+ * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+ */
+ if (unlikely(restore_fpu_checking(tsk))) {
+ drop_init_fpu(tsk);
+ force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
+ return;
+ }
+
+ tsk->thread.fpu_counter++;
}
EXPORT_SYMBOL_GPL(math_state_restore);
-dotraplinkage void __kprobes
+dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
{
+ enum ctx_state prev_state;
+
+ prev_state = exception_enter();
+ BUG_ON(use_eager_fpu());
+
#ifdef CONFIG_MATH_EMULATION
if (read_cr0() & X86_CR0_EM) {
struct math_emu_info info = { };
@@ -648,6 +698,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
info.regs = regs;
math_emulate(&info);
+ exception_exit(prev_state);
return;
}
#endif
@@ -655,35 +706,51 @@ do_device_not_available(struct pt_regs *regs, long error_code)
#ifdef CONFIG_X86_32
conditional_sti(regs);
#endif
+ exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_device_not_available);
#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
{
siginfo_t info;
+ enum ctx_state prev_state;
+
+ prev_state = exception_enter();
local_irq_enable();
info.si_signo = SIGILL;
info.si_errno = 0;
info.si_code = ILL_BADSTK;
info.si_addr = NULL;
- if (notify_die(DIE_TRAP, "iret exception",
- regs, error_code, 32, SIGILL) == NOTIFY_STOP)
- return;
- do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
+ if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
+ X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
+ do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
+ &info);
+ }
+ exception_exit(prev_state);
}
#endif
/* Set of traps needed for early debugging. */
void __init early_trap_init(void)
{
- set_intr_gate_ist(1, &debug, DEBUG_STACK);
+ set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
/* int3 can be called from all */
- set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
- set_intr_gate(14, &page_fault);
+ set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+#ifdef CONFIG_X86_32
+ set_intr_gate(X86_TRAP_PF, page_fault);
+#endif
load_idt(&idt_descr);
}
+void __init early_trap_pf_init(void)
+{
+#ifdef CONFIG_X86_64
+ set_intr_gate(X86_TRAP_PF, page_fault);
+#endif
+}
+
void __init trap_init(void)
{
int i;
@@ -696,30 +763,30 @@ void __init trap_init(void)
early_iounmap(p, 4);
#endif
- set_intr_gate(0, &divide_error);
- set_intr_gate_ist(2, &nmi, NMI_STACK);
+ set_intr_gate(X86_TRAP_DE, divide_error);
+ set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
/* int4 can be called from all */
- set_system_intr_gate(4, &overflow);
- set_intr_gate(5, &bounds);
- set_intr_gate(6, &invalid_op);
- set_intr_gate(7, &device_not_available);
+ set_system_intr_gate(X86_TRAP_OF, &overflow);
+ set_intr_gate(X86_TRAP_BR, bounds);
+ set_intr_gate(X86_TRAP_UD, invalid_op);
+ set_intr_gate(X86_TRAP_NM, device_not_available);
#ifdef CONFIG_X86_32
- set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
+ set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
#else
- set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
+ set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
#endif
- set_intr_gate(9, &coprocessor_segment_overrun);
- set_intr_gate(10, &invalid_TSS);
- set_intr_gate(11, &segment_not_present);
- set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
- set_intr_gate(13, &general_protection);
- set_intr_gate(15, &spurious_interrupt_bug);
- set_intr_gate(16, &coprocessor_error);
- set_intr_gate(17, &alignment_check);
+ set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
+ set_intr_gate(X86_TRAP_TS, invalid_TSS);
+ set_intr_gate(X86_TRAP_NP, segment_not_present);
+ set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
+ set_intr_gate(X86_TRAP_GP, general_protection);
+ set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
+ set_intr_gate(X86_TRAP_MF, coprocessor_error);
+ set_intr_gate(X86_TRAP_AC, alignment_check);
#ifdef CONFIG_X86_MCE
- set_intr_gate_ist(18, &machine_check, MCE_STACK);
+ set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
#endif
- set_intr_gate(19, &simd_coprocessor_error);
+ set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);
/* Reserve all the builtin and the syscall vector: */
for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
@@ -736,6 +803,14 @@ void __init trap_init(void)
#endif
/*
+ * Set the IDT descriptor to a fixed read-only location, so that the
+ * "sidt" instruction will not leak the location of the kernel, and
+ * to defend the IDT against arbitrary memory write vulnerabilities.
+ * It will be reloaded in cpu_init() */
+ __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
+ idt_descr.address = fix_to_virt(FIX_RO_IDT);
+
+ /*
* Should be a barrier for any external CPU state:
*/
cpu_init();
@@ -743,8 +818,8 @@ void __init trap_init(void)
x86_init.irqs.trap_init();
#ifdef CONFIG_X86_64
- memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
- set_nmi_gate(1, &debug);
- set_nmi_gate(3, &int3);
+ memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
+ set_nmi_gate(X86_TRAP_DB, &debug);
+ set_nmi_gate(X86_TRAP_BP, &int3);
#endif
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a62c201c97e..ea030319b32 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
@@ -9,6 +11,7 @@
#include <linux/clocksource.h>
#include <linux/percpu.h>
#include <linux/timex.h>
+#include <linux/static_key.h>
#include <asm/hpet.h>
#include <asm/timer.h>
@@ -35,13 +38,244 @@ static int __read_mostly tsc_unstable;
erroneous rdtsc usage on !cpu_has_tsc processors */
static int __read_mostly tsc_disabled = -1;
+static struct static_key __use_tsc = STATIC_KEY_INIT;
+
int tsc_clocksource_reliable;
+
+/*
+ * Use a ring-buffer like data structure, where a writer advances the head by
+ * writing a new data entry and a reader advances the tail when it observes a
+ * new entry.
+ *
+ * Writers are made to wait on readers until there's space to write a new
+ * entry.
+ *
+ * This means that we can always use an {offset, mul} pair to compute a ns
+ * value that is 'roughly' in the right direction, even if we're writing a new
+ * {offset, mul} pair during the clock read.
+ *
+ * The down-side is that we can no longer guarantee strict monotonicity anymore
+ * (assuming the TSC was that to begin with), because while we compute the
+ * intersection point of the two clock slopes and make sure the time is
+ * continuous at the point of switching; we can no longer guarantee a reader is
+ * strictly before or after the switch point.
+ *
+ * It does mean a reader no longer needs to disable IRQs in order to avoid
+ * CPU-Freq updates messing with his times, and similarly an NMI reader will
+ * no longer run the risk of hitting half-written state.
+ */
+
+struct cyc2ns {
+ struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
+ struct cyc2ns_data *head; /* 48 + 8 = 56 */
+ struct cyc2ns_data *tail; /* 56 + 8 = 64 */
+}; /* exactly fits one cacheline */
+
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
+
+struct cyc2ns_data *cyc2ns_read_begin(void)
+{
+ struct cyc2ns_data *head;
+
+ preempt_disable();
+
+ head = this_cpu_read(cyc2ns.head);
+ /*
+ * Ensure we observe the entry when we observe the pointer to it.
+ * matches the wmb from cyc2ns_write_end().
+ */
+ smp_read_barrier_depends();
+ head->__count++;
+ barrier();
+
+ return head;
+}
+
+void cyc2ns_read_end(struct cyc2ns_data *head)
+{
+ barrier();
+ /*
+ * If we're the outer most nested read; update the tail pointer
+ * when we're done. This notifies possible pending writers
+ * that we've observed the head pointer and that the other
+ * entry is now free.
+ */
+ if (!--head->__count) {
+ /*
+ * x86-TSO does not reorder writes with older reads;
+ * therefore once this write becomes visible to another
+ * cpu, we must be finished reading the cyc2ns_data.
+ *
+ * matches with cyc2ns_write_begin().
+ */
+ this_cpu_write(cyc2ns.tail, head);
+ }
+ preempt_enable();
+}
+
+/*
+ * Begin writing a new @data entry for @cpu.
+ *
+ * Assumes some sort of write side lock; currently 'provided' by the assumption
+ * that cpufreq will call its notifiers sequentially.
+ */
+static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+{
+ struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+ struct cyc2ns_data *data = c2n->data;
+
+ if (data == c2n->head)
+ data++;
+
+ /* XXX send an IPI to @cpu in order to guarantee a read? */
+
+ /*
+ * When we observe the tail write from cyc2ns_read_end(),
+ * the cpu must be done with that entry and its safe
+ * to start writing to it.
+ */
+ while (c2n->tail == data)
+ cpu_relax();
+
+ return data;
+}
+
+static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+{
+ struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+
+ /*
+ * Ensure the @data writes are visible before we publish the
+ * entry. Matches the data-depencency in cyc2ns_read_begin().
+ */
+ smp_wmb();
+
+ ACCESS_ONCE(c2n->head) = data;
+}
+
+/*
+ * Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ * ns = cycles / (freq / ns_per_sec)
+ * ns = cycles * (ns_per_sec / freq)
+ * ns = cycles * (10^9 / (cpu_khz * 10^3))
+ * ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ * ns = cycles * (10^6 * SC / cpu_khz) / SC
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift.
+ *
+ * We can use khz divisor instead of mhz to keep a better precision, since
+ * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ * -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static void cyc2ns_data_init(struct cyc2ns_data *data)
+{
+ data->cyc2ns_mul = 0;
+ data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+ data->cyc2ns_offset = 0;
+ data->__count = 0;
+}
+
+static void cyc2ns_init(int cpu)
+{
+ struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+
+ cyc2ns_data_init(&c2n->data[0]);
+ cyc2ns_data_init(&c2n->data[1]);
+
+ c2n->head = c2n->data;
+ c2n->tail = c2n->data;
+}
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+ struct cyc2ns_data *data, *tail;
+ unsigned long long ns;
+
+ /*
+ * See cyc2ns_read_*() for details; replicated in order to avoid
+ * an extra few instructions that came with the abstraction.
+ * Notable, it allows us to only do the __count and tail update
+ * dance when its actually needed.
+ */
+
+ preempt_disable_notrace();
+ data = this_cpu_read(cyc2ns.head);
+ tail = this_cpu_read(cyc2ns.tail);
+
+ if (likely(data == tail)) {
+ ns = data->cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+ } else {
+ data->__count++;
+
+ barrier();
+
+ ns = data->cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+
+ barrier();
+
+ if (!--data->__count)
+ this_cpu_write(cyc2ns.tail, data);
+ }
+ preempt_enable_notrace();
+
+ return ns;
+}
+
+/* XXX surely we already have this someplace in the kernel?! */
+#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
+
+static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
+{
+ unsigned long long tsc_now, ns_now;
+ struct cyc2ns_data *data;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ sched_clock_idle_sleep_event();
+
+ if (!cpu_khz)
+ goto done;
+
+ data = cyc2ns_write_begin(cpu);
+
+ rdtscll(tsc_now);
+ ns_now = cycles_2_ns(tsc_now);
+
+ /*
+ * Compute a new multiplier as per the above comment and ensure our
+ * time function is continuous; see the comment near struct
+ * cyc2ns_data.
+ */
+ data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
+ data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+ data->cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+
+ cyc2ns_write_end(cpu, data);
+
+done:
+ sched_clock_idle_wakeup_event(0);
+ local_irq_restore(flags);
+}
/*
* Scheduler clock - returns current time in nanosec units.
*/
u64 native_sched_clock(void)
{
- u64 this_offset;
+ u64 tsc_now;
/*
* Fall back to jiffies if there's no TSC available:
@@ -51,16 +285,16 @@ u64 native_sched_clock(void)
* very important for it to be as fast as the platform
* can achieve it. )
*/
- if (unlikely(tsc_disabled)) {
+ if (!static_key_false(&__use_tsc)) {
/* No locking but a rare wrong value is not a big deal: */
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
}
/* read the Time Stamp Counter: */
- rdtscll(this_offset);
+ rdtscll(tsc_now);
/* return the value in ns */
- return __cycles_2_ns(this_offset);
+ return cycles_2_ns(tsc_now);
}
/* We need to define a real function for sched_clock, to override the
@@ -75,17 +309,28 @@ unsigned long long
sched_clock(void) __attribute__((alias("native_sched_clock")));
#endif
+unsigned long long native_read_tsc(void)
+{
+ return __native_read_tsc();
+}
+EXPORT_SYMBOL(native_read_tsc);
+
int check_tsc_unstable(void)
{
return tsc_unstable;
}
EXPORT_SYMBOL_GPL(check_tsc_unstable);
+int check_tsc_disabled(void)
+{
+ return tsc_disabled;
+}
+EXPORT_SYMBOL_GPL(check_tsc_disabled);
+
#ifdef CONFIG_X86_TSC
int __init notsc_setup(char *str)
{
- printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
- "cannot disable TSC completely.\n");
+ pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n");
tsc_disabled = 1;
return 1;
}
@@ -373,7 +618,7 @@ static unsigned long quick_pit_calibrate(void)
goto success;
}
}
- printk("Fast TSC calibration failed\n");
+ pr_err("Fast TSC calibration failed\n");
return 0;
success:
@@ -392,7 +637,7 @@ success:
*/
delta *= PIT_TICK_RATE;
do_div(delta, i*256*1000);
- printk("Fast TSC calibration using PIT\n");
+ pr_info("Fast TSC calibration using PIT\n");
return delta;
}
@@ -406,6 +651,13 @@ unsigned long native_calibrate_tsc(void)
unsigned long flags, latch, ms, fast_calibrate;
int hpet = is_hpet_enabled(), i, loopmin;
+ /* Calibrate TSC using MSR for Intel Atom SoCs */
+ local_irq_save(flags);
+ fast_calibrate = try_msr_calibrate_tsc();
+ local_irq_restore(flags);
+ if (fast_calibrate)
+ return fast_calibrate;
+
local_irq_save(flags);
fast_calibrate = quick_pit_calibrate();
local_irq_restore(flags);
@@ -487,9 +739,8 @@ unsigned long native_calibrate_tsc(void)
* use the reference value, as it is more precise.
*/
if (delta >= 90 && delta <= 110) {
- printk(KERN_INFO
- "TSC: PIT calibration matches %s. %d loops\n",
- hpet ? "HPET" : "PMTIMER", i + 1);
+ pr_info("PIT calibration matches %s. %d loops\n",
+ hpet ? "HPET" : "PMTIMER", i + 1);
return tsc_ref_min;
}
@@ -511,38 +762,36 @@ unsigned long native_calibrate_tsc(void)
*/
if (tsc_pit_min == ULONG_MAX) {
/* PIT gave no useful value */
- printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
+ pr_warn("Unable to calibrate against PIT\n");
/* We don't have an alternative source, disable TSC */
if (!hpet && !ref1 && !ref2) {
- printk("TSC: No reference (HPET/PMTIMER) available\n");
+ pr_notice("No reference (HPET/PMTIMER) available\n");
return 0;
}
/* The alternative source failed as well, disable TSC */
if (tsc_ref_min == ULONG_MAX) {
- printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
- "failed.\n");
+ pr_warn("HPET/PMTIMER calibration failed\n");
return 0;
}
/* Use the alternative source */
- printk(KERN_INFO "TSC: using %s reference calibration\n",
- hpet ? "HPET" : "PMTIMER");
+ pr_info("using %s reference calibration\n",
+ hpet ? "HPET" : "PMTIMER");
return tsc_ref_min;
}
/* We don't have an alternative source, use the PIT calibration value */
if (!hpet && !ref1 && !ref2) {
- printk(KERN_INFO "TSC: Using PIT calibration value\n");
+ pr_info("Using PIT calibration value\n");
return tsc_pit_min;
}
/* The alternative source failed, use the PIT calibration value */
if (tsc_ref_min == ULONG_MAX) {
- printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
- "Using PIT calibration\n");
+ pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
return tsc_pit_min;
}
@@ -551,9 +800,9 @@ unsigned long native_calibrate_tsc(void)
* the PIT value as we know that there are PMTIMERs around
* running at double speed. At least we let the user know:
*/
- printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
- hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
- printk(KERN_INFO "TSC: Using PIT calibration value\n");
+ pr_warn("PIT calibration deviates from %s: %lu %lu\n",
+ hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
+ pr_info("Using PIT calibration value\n");
return tsc_pit_min;
}
@@ -579,59 +828,11 @@ int recalibrate_cpu_khz(void)
EXPORT_SYMBOL(recalibrate_cpu_khz);
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- * basic equation:
- * ns = cycles / (freq / ns_per_sec)
- * ns = cycles * (ns_per_sec / freq)
- * ns = cycles * (10^9 / (cpu_khz * 10^3))
- * ns = cycles * (10^6 / cpu_khz)
- *
- * Then we use scaling math (suggested by george@mvista.com) to get:
- * ns = cycles * (10^6 * SC / cpu_khz) / SC
- * ns = cycles * cyc2ns_scale / SC
- *
- * And since SC is a constant power of two, we can convert the div
- * into a shift.
- *
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- * (mathieu.desnoyers@polymtl.ca)
- *
- * -johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-
-DEFINE_PER_CPU(unsigned long, cyc2ns);
-DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
-
-static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
-{
- unsigned long long tsc_now, ns_now, *offset;
- unsigned long flags, *scale;
-
- local_irq_save(flags);
- sched_clock_idle_sleep_event();
-
- scale = &per_cpu(cyc2ns, cpu);
- offset = &per_cpu(cyc2ns_offset, cpu);
-
- rdtscll(tsc_now);
- ns_now = __cycles_2_ns(tsc_now);
-
- if (cpu_khz) {
- *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
- *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
- }
-
- sched_clock_idle_wakeup_event(0);
- local_irq_restore(flags);
-}
-
static unsigned long long cyc2ns_suspend;
-void save_sched_clock_state(void)
+void tsc_save_sched_clock_state(void)
{
- if (!sched_clock_stable)
+ if (!sched_clock_stable())
return;
cyc2ns_suspend = sched_clock();
@@ -645,22 +846,32 @@ void save_sched_clock_state(void)
* that sched_clock() continues from the point where it was left off during
* suspend.
*/
-void restore_sched_clock_state(void)
+void tsc_restore_sched_clock_state(void)
{
unsigned long long offset;
unsigned long flags;
int cpu;
- if (!sched_clock_stable)
+ if (!sched_clock_stable())
return;
local_irq_save(flags);
- __this_cpu_write(cyc2ns_offset, 0);
+ /*
+ * We're comming out of suspend, there's no concurrency yet; don't
+ * bother being nice about the RCU stuff, just write to both
+ * data fields.
+ */
+
+ this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
+ this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
+
offset = cyc2ns_suspend - sched_clock();
- for_each_possible_cpu(cpu)
- per_cpu(cyc2ns_offset, cpu) = offset;
+ for_each_possible_cpu(cpu) {
+ per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
+ per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
+ }
local_irq_restore(flags);
}
@@ -703,16 +914,15 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
tsc_khz_ref = tsc_khz;
}
if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
- (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
- (val == CPUFREQ_RESUMECHANGE)) {
+ (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- }
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu);
+ }
return 0;
}
@@ -762,7 +972,8 @@ static cycle_t read_tsc(struct clocksource *cs)
static void resume_tsc(struct clocksource *cs)
{
- clocksource_tsc.cycle_last = 0;
+ if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
+ clocksource_tsc.cycle_last = 0;
}
static struct clocksource clocksource_tsc = {
@@ -773,18 +984,16 @@ static struct clocksource clocksource_tsc = {
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
-#ifdef CONFIG_X86_64
.archdata = { .vclock_mode = VCLOCK_TSC },
-#endif
};
void mark_tsc_unstable(char *reason)
{
if (!tsc_unstable) {
tsc_unstable = 1;
- sched_clock_stable = 0;
+ clear_sched_clock_stable();
disable_sched_clock_irqtime();
- printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
+ pr_info("Marking TSC unstable due to %s\n", reason);
/* Change only the rating, when not registered */
if (clocksource_tsc.mult)
clocksource_mark_unstable(&clocksource_tsc);
@@ -817,7 +1026,7 @@ static void __init check_system_tsc_reliable(void)
* Make an educated guess if the TSC is trustworthy and synchronized
* over all CPUs.
*/
-__cpuinit int unsynchronized_tsc(void)
+int unsynchronized_tsc(void)
{
if (!cpu_has_tsc || tsc_unstable)
return 1;
@@ -911,9 +1120,9 @@ static void tsc_refine_calibration_work(struct work_struct *work)
goto out;
tsc_khz = freq;
- printk(KERN_INFO "Refined TSC clocksource calibration: "
- "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
- (unsigned long)tsc_khz % 1000);
+ pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
+ (unsigned long)tsc_khz / 1000,
+ (unsigned long)tsc_khz % 1000);
out:
clocksource_register_khz(&clocksource_tsc, tsc_khz);
@@ -932,6 +1141,19 @@ static int __init init_tsc_clocksource(void)
clocksource_tsc.rating = 0;
clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
}
+
+ if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
+ clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
+
+ /*
+ * Trust the results of the earlier calibration on systems
+ * exporting a reliable TSC.
+ */
+ if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
+ return 0;
+ }
+
schedule_delayed_work(&tsc_irqwork, 0);
return 0;
}
@@ -959,9 +1181,9 @@ void __init tsc_init(void)
return;
}
- printk("Detected %lu.%03lu MHz processor.\n",
- (unsigned long)cpu_khz / 1000,
- (unsigned long)cpu_khz % 1000);
+ pr_info("Detected %lu.%03lu MHz processor\n",
+ (unsigned long)cpu_khz / 1000,
+ (unsigned long)cpu_khz % 1000);
/*
* Secondary CPUs do not run through tsc_init(), so set up
@@ -969,14 +1191,18 @@ void __init tsc_init(void)
* speed as the bootup CPU. (cpufreq notifiers will fix this
* up if their speed diverges)
*/
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
+ cyc2ns_init(cpu);
set_cyc2ns_scale(cpu_khz, cpu);
+ }
if (tsc_disabled > 0)
return;
/* now allow native_sched_clock() to use rdtsc */
+
tsc_disabled = 0;
+ static_key_slow_inc(&__use_tsc);
if (!no_sched_irq_time)
enable_sched_clock_irqtime();
@@ -1000,7 +1226,7 @@ void __init tsc_init(void)
* been calibrated. This assumes that CONSTANT_TSC applies to all
* cpus in the socket - this should be a safe assumption.
*/
-unsigned long __cpuinit calibrate_delay_is_known(void)
+unsigned long calibrate_delay_is_known(void)
{
int i, cpu = smp_processor_id();
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
new file mode 100644
index 00000000000..92ae6acac8a
--- /dev/null
+++ b/arch/x86/kernel/tsc_msr.c
@@ -0,0 +1,127 @@
+/*
+ * tsc_msr.c - MSR based TSC calibration on Intel Atom SoC platforms.
+ *
+ * TSC in Intel Atom SoC runs at a constant rate which can be figured
+ * by this formula:
+ * <maximum core-clock to bus-clock ratio> * <maximum resolved frequency>
+ * See Intel 64 and IA-32 System Programming Guid section 16.12 and 30.11.5
+ * for details.
+ * Especially some Intel Atom SoCs don't have PIT(i8254) or HPET, so MSR
+ * based calibration is the only option.
+ *
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Bin Gao <bin.gao@intel.com>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/kernel.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/param.h>
+
+/* CPU reference clock frequency: in KHz */
+#define FREQ_83 83200
+#define FREQ_100 99840
+#define FREQ_133 133200
+#define FREQ_166 166400
+
+#define MAX_NUM_FREQS 8
+
+/*
+ * According to Intel 64 and IA-32 System Programming Guide,
+ * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be
+ * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40].
+ * Unfortunately some Intel Atom SoCs aren't quite compliant to this,
+ * so we need manually differentiate SoC families. This is what the
+ * field msr_plat does.
+ */
+struct freq_desc {
+ u8 x86_family; /* CPU family */
+ u8 x86_model; /* model */
+ u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */
+ u32 freqs[MAX_NUM_FREQS];
+};
+
+static struct freq_desc freq_desc_tables[] = {
+ /* PNW */
+ { 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
+ /* CLV+ */
+ { 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
+ /* TNG */
+ { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } },
+ /* VLV2 */
+ { 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } },
+ /* ANN */
+ { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } },
+};
+
+static int match_cpu(u8 family, u8 model)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) {
+ if ((family == freq_desc_tables[i].x86_family) &&
+ (model == freq_desc_tables[i].x86_model))
+ return i;
+ }
+
+ return -1;
+}
+
+/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */
+#define id_to_freq(cpu_index, freq_id) \
+ (freq_desc_tables[cpu_index].freqs[freq_id])
+
+/*
+ * Do MSR calibration only for known/supported CPUs.
+ *
+ * Returns the calibration value or 0 if MSR calibration failed.
+ */
+unsigned long try_msr_calibrate_tsc(void)
+{
+ u32 lo, hi, ratio, freq_id, freq;
+ unsigned long res;
+ int cpu_index;
+
+ cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model);
+ if (cpu_index < 0)
+ return 0;
+
+ if (freq_desc_tables[cpu_index].msr_plat) {
+ rdmsr(MSR_PLATFORM_INFO, lo, hi);
+ ratio = (lo >> 8) & 0x1f;
+ } else {
+ rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+ ratio = (hi >> 8) & 0x1f;
+ }
+ pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio);
+
+ if (!ratio)
+ goto fail;
+
+ /* Get FSB FREQ ID */
+ rdmsr(MSR_FSB_FREQ, lo, hi);
+ freq_id = lo & 0x7;
+ freq = id_to_freq(cpu_index, freq_id);
+ pr_info("Resolved frequency ID: %u, frequency: %u KHz\n",
+ freq_id, freq);
+ if (!freq)
+ goto fail;
+
+ /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
+ res = freq * ratio;
+ pr_info("TSC runs at %lu KHz\n", res);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ lapic_timer_frequency = (freq * 1000) / HZ;
+ pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency);
+#endif
+ return res;
+
+fail:
+ pr_warn("Fast TSC calibration using MSR failed\n");
+ return 0;
+}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9eba29b46cb..26488487bc6 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -16,7 +16,6 @@
*/
#include <linux/spinlock.h>
#include <linux/kernel.h>
-#include <linux/init.h>
#include <linux/smp.h>
#include <linux/nmi.h>
#include <asm/tsc.h>
@@ -25,24 +24,24 @@
* Entry/exit counters that make sure that both CPUs
* run the measurement code at once:
*/
-static __cpuinitdata atomic_t start_count;
-static __cpuinitdata atomic_t stop_count;
+static atomic_t start_count;
+static atomic_t stop_count;
/*
* We use a raw spinlock in this exceptional case, because
* we want to have the fastest, inlined, non-debug version
* of a critical section, to be able to prove TSC time-warps:
*/
-static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
-static __cpuinitdata cycles_t last_tsc;
-static __cpuinitdata cycles_t max_warp;
-static __cpuinitdata int nr_warps;
+static cycles_t last_tsc;
+static cycles_t max_warp;
+static int nr_warps;
/*
* TSC-warp measurement loop running on both CPUs:
*/
-static __cpuinit void check_tsc_warp(void)
+static void check_tsc_warp(unsigned int timeout)
{
cycles_t start, now, prev, end;
int i;
@@ -51,9 +50,9 @@ static __cpuinit void check_tsc_warp(void)
start = get_cycles();
rdtsc_barrier();
/*
- * The measurement runs for 20 msecs:
+ * The measurement runs for 'timeout' msecs:
*/
- end = start + tsc_khz * 20ULL;
+ end = start + (cycles_t) tsc_khz * timeout;
now = start;
for (i = 0; ; i++) {
@@ -99,10 +98,29 @@ static __cpuinit void check_tsc_warp(void)
}
/*
+ * If the target CPU coming online doesn't have any of its core-siblings
+ * online, a timeout of 20msec will be used for the TSC-warp measurement
+ * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
+ * information about this socket already (and this information grows as we
+ * have more and more logical-siblings in that socket).
+ *
+ * Ideally we should be able to skip the TSC sync check on the other
+ * core-siblings, if the first logical CPU in a socket passed the sync test.
+ * But as the TSC is per-logical CPU and can potentially be modified wrongly
+ * by the bios, TSC sync test for smaller duration should be able
+ * to catch such errors. Also this will catch the condition where all the
+ * cores in the socket doesn't get reset at the same time.
+ */
+static inline unsigned int loop_timeout(int cpu)
+{
+ return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20;
+}
+
+/*
* Source CPU calls into this - it waits for the freshly booted
* target CPU to arrive and then starts the measurement:
*/
-void __cpuinit check_tsc_sync_source(int cpu)
+void check_tsc_sync_source(int cpu)
{
int cpus = 2;
@@ -135,7 +153,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
*/
atomic_inc(&start_count);
- check_tsc_warp();
+ check_tsc_warp(loop_timeout(cpu));
while (atomic_read(&stop_count) != cpus-1)
cpu_relax();
@@ -168,7 +186,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
/*
* Freshly booted CPUs call into this:
*/
-void __cpuinit check_tsc_sync_target(void)
+void check_tsc_sync_target(void)
{
int cpus = 2;
@@ -183,7 +201,7 @@ void __cpuinit check_tsc_sync_target(void)
while (atomic_read(&start_count) != cpus)
cpu_relax();
- check_tsc_warp();
+ check_tsc_warp(loop_timeout(smp_processor_id()));
/*
* Ok, we are done:
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
new file mode 100644
index 00000000000..5d1cbfe4ae5
--- /dev/null
+++ b/arch/x86/kernel/uprobes.c
@@ -0,0 +1,928 @@
+/*
+ * User-space Probes (UProbes) for x86
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ * Srikar Dronamraju
+ * Jim Keniston
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/uprobes.h>
+#include <linux/uaccess.h>
+
+#include <linux/kdebug.h>
+#include <asm/processor.h>
+#include <asm/insn.h>
+
+/* Post-execution fixups. */
+
+/* Adjust IP back to vicinity of actual insn */
+#define UPROBE_FIX_IP 0x01
+
+/* Adjust the return address of a call insn */
+#define UPROBE_FIX_CALL 0x02
+
+/* Instruction will modify TF, don't change it */
+#define UPROBE_FIX_SETF 0x04
+
+#define UPROBE_FIX_RIP_SI 0x08
+#define UPROBE_FIX_RIP_DI 0x10
+#define UPROBE_FIX_RIP_BX 0x20
+#define UPROBE_FIX_RIP_MASK \
+ (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX)
+
+#define UPROBE_TRAP_NR UINT_MAX
+
+/* Adaptations for mhiramat x86 decoder v14. */
+#define OPCODE1(insn) ((insn)->opcode.bytes[0])
+#define OPCODE2(insn) ((insn)->opcode.bytes[1])
+#define OPCODE3(insn) ((insn)->opcode.bytes[2])
+#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value)
+
+#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
+ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
+ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
+ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
+ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
+ << (row % 32))
+
+/*
+ * Good-instruction tables for 32-bit apps. This is non-const and volatile
+ * to keep gcc from statically optimizing it out, as variable_test_bit makes
+ * some versions of gcc to think only *(unsigned long*) is used.
+ */
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+static volatile u32 good_insns_32[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */
+ W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
+ W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#else
+#define good_insns_32 NULL
+#endif
+
+/* Good-instruction tables for 64-bit apps */
+#if defined(CONFIG_X86_64)
+static volatile u32 good_insns_64[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
+ W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
+ W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
+ W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#else
+#define good_insns_64 NULL
+#endif
+
+/* Using this for both 64-bit and 32-bit apps */
+static volatile u32 good_2byte_insns[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+ W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
+ W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#undef W
+
+/*
+ * opcodes we'll probably never support:
+ *
+ * 6c-6d, e4-e5, ec-ed - in
+ * 6e-6f, e6-e7, ee-ef - out
+ * cc, cd - int3, int
+ * cf - iret
+ * d6 - illegal instruction
+ * f1 - int1/icebp
+ * f4 - hlt
+ * fa, fb - cli, sti
+ * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
+ *
+ * invalid opcodes in 64-bit mode:
+ *
+ * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
+ * 63 - we support this opcode in x86_64 but not in i386.
+ *
+ * opcodes we may need to refine support for:
+ *
+ * 0f - 2-byte instructions: For many of these instructions, the validity
+ * depends on the prefix and/or the reg field. On such instructions, we
+ * just consider the opcode combination valid if it corresponds to any
+ * valid instruction.
+ *
+ * 8f - Group 1 - only reg = 0 is OK
+ * c6-c7 - Group 11 - only reg = 0 is OK
+ * d9-df - fpu insns with some illegal encodings
+ * f2, f3 - repnz, repz prefixes. These are also the first byte for
+ * certain floating-point instructions, such as addsd.
+ *
+ * fe - Group 4 - only reg = 0 or 1 is OK
+ * ff - Group 5 - only reg = 0-6 is OK
+ *
+ * others -- Do we need to support these?
+ *
+ * 0f - (floating-point?) prefetch instructions
+ * 07, 17, 1f - pop es, pop ss, pop ds
+ * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
+ * but 64 and 65 (fs: and gs:) seem to be used, so we support them
+ * 67 - addr16 prefix
+ * ce - into
+ * f0 - lock prefix
+ */
+
+/*
+ * TODO:
+ * - Where necessary, examine the modrm byte and allow only valid instructions
+ * in the different Groups and fpu instructions.
+ */
+
+static bool is_prefix_bad(struct insn *insn)
+{
+ int i;
+
+ for (i = 0; i < insn->prefixes.nbytes; i++) {
+ switch (insn->prefixes.bytes[i]) {
+ case 0x26: /* INAT_PFX_ES */
+ case 0x2E: /* INAT_PFX_CS */
+ case 0x36: /* INAT_PFX_DS */
+ case 0x3E: /* INAT_PFX_SS */
+ case 0xF0: /* INAT_PFX_LOCK */
+ return true;
+ }
+ }
+ return false;
+}
+
+static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
+{
+ u32 volatile *good_insns;
+
+ insn_init(insn, auprobe->insn, x86_64);
+ /* has the side-effect of processing the entire instruction */
+ insn_get_length(insn);
+ if (WARN_ON_ONCE(!insn_complete(insn)))
+ return -ENOEXEC;
+
+ if (is_prefix_bad(insn))
+ return -ENOTSUPP;
+
+ if (x86_64)
+ good_insns = good_insns_64;
+ else
+ good_insns = good_insns_32;
+
+ if (test_bit(OPCODE1(insn), (unsigned long *)good_insns))
+ return 0;
+
+ if (insn->opcode.nbytes == 2) {
+ if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
+ return 0;
+ }
+
+ return -ENOTSUPP;
+}
+
+#ifdef CONFIG_X86_64
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+ return !config_enabled(CONFIG_IA32_EMULATION) ||
+ !(mm->context.ia32_compat == TIF_IA32);
+}
+/*
+ * If arch_uprobe->insn doesn't use rip-relative addressing, return
+ * immediately. Otherwise, rewrite the instruction so that it accesses
+ * its memory operand indirectly through a scratch register. Set
+ * defparam->fixups accordingly. (The contents of the scratch register
+ * will be saved before we single-step the modified instruction,
+ * and restored afterward).
+ *
+ * We do this because a rip-relative instruction can access only a
+ * relatively small area (+/- 2 GB from the instruction), and the XOL
+ * area typically lies beyond that area. At least for instructions
+ * that store to memory, we can't execute the original instruction
+ * and "fix things up" later, because the misdirected store could be
+ * disastrous.
+ *
+ * Some useful facts about rip-relative instructions:
+ *
+ * - There's always a modrm byte with bit layout "00 reg 101".
+ * - There's never a SIB byte.
+ * - The displacement is always 4 bytes.
+ * - REX.B=1 bit in REX prefix, which normally extends r/m field,
+ * has no effect on rip-relative mode. It doesn't make modrm byte
+ * with r/m=101 refer to register 1101 = R13.
+ */
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 *cursor;
+ u8 reg;
+ u8 reg2;
+
+ if (!insn_rip_relative(insn))
+ return;
+
+ /*
+ * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm.
+ * Clear REX.b bit (extension of MODRM.rm field):
+ * we want to encode low numbered reg, not r8+.
+ */
+ if (insn->rex_prefix.nbytes) {
+ cursor = auprobe->insn + insn_offset_rex_prefix(insn);
+ /* REX byte has 0100wrxb layout, clearing REX.b bit */
+ *cursor &= 0xfe;
+ }
+ /*
+ * Similar treatment for VEX3 prefix.
+ * TODO: add XOP/EVEX treatment when insn decoder supports them
+ */
+ if (insn->vex_prefix.nbytes == 3) {
+ /*
+ * vex2: c5 rvvvvLpp (has no b bit)
+ * vex3/xop: c4/8f rxbmmmmm wvvvvLpp
+ * evex: 62 rxbR00mm wvvvv1pp zllBVaaa
+ * (evex will need setting of both b and x since
+ * in non-sib encoding evex.x is 4th bit of MODRM.rm)
+ * Setting VEX3.b (setting because it has inverted meaning):
+ */
+ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
+ *cursor |= 0x20;
+ }
+
+ /*
+ * Convert from rip-relative addressing to register-relative addressing
+ * via a scratch register.
+ *
+ * This is tricky since there are insns with modrm byte
+ * which also use registers not encoded in modrm byte:
+ * [i]div/[i]mul: implicitly use dx:ax
+ * shift ops: implicitly use cx
+ * cmpxchg: implicitly uses ax
+ * cmpxchg8/16b: implicitly uses dx:ax and bx:cx
+ * Encoding: 0f c7/1 modrm
+ * The code below thinks that reg=1 (cx), chooses si as scratch.
+ * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m.
+ * First appeared in Haswell (BMI2 insn). It is vex-encoded.
+ * Example where none of bx,cx,dx can be used as scratch reg:
+ * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx
+ * [v]pcmpistri: implicitly uses cx, xmm0
+ * [v]pcmpistrm: implicitly uses xmm0
+ * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0
+ * [v]pcmpestrm: implicitly uses ax, dx, xmm0
+ * Evil SSE4.2 string comparison ops from hell.
+ * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination.
+ * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm.
+ * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi).
+ * AMD says it has no 3-operand form (vex.vvvv must be 1111)
+ * and that it can have only register operands, not mem
+ * (its modrm byte must have mode=11).
+ * If these restrictions will ever be lifted,
+ * we'll need code to prevent selection of di as scratch reg!
+ *
+ * Summary: I don't know any insns with modrm byte which
+ * use SI register implicitly. DI register is used only
+ * by one insn (maskmovq) and BX register is used
+ * only by one too (cmpxchg8b).
+ * BP is stack-segment based (may be a problem?).
+ * AX, DX, CX are off-limits (many implicit users).
+ * SP is unusable (it's stack pointer - think about "pop mem";
+ * also, rsp+disp32 needs sib encoding -> insn length change).
+ */
+
+ reg = MODRM_REG(insn); /* Fetch modrm.reg */
+ reg2 = 0xff; /* Fetch vex.vvvv */
+ if (insn->vex_prefix.nbytes == 2)
+ reg2 = insn->vex_prefix.bytes[1];
+ else if (insn->vex_prefix.nbytes == 3)
+ reg2 = insn->vex_prefix.bytes[2];
+ /*
+ * TODO: add XOP, EXEV vvvv reading.
+ *
+ * vex.vvvv field is in bits 6-3, bits are inverted.
+ * But in 32-bit mode, high-order bit may be ignored.
+ * Therefore, let's consider only 3 low-order bits.
+ */
+ reg2 = ((reg2 >> 3) & 0x7) ^ 0x7;
+ /*
+ * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15.
+ *
+ * Choose scratch reg. Order is important: must not select bx
+ * if we can use si (cmpxchg8b case!)
+ */
+ if (reg != 6 && reg2 != 6) {
+ reg2 = 6;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI;
+ } else if (reg != 7 && reg2 != 7) {
+ reg2 = 7;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI;
+ /* TODO (paranoia): force maskmovq to not use di */
+ } else {
+ reg2 = 3;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX;
+ }
+ /*
+ * Point cursor at the modrm byte. The next 4 bytes are the
+ * displacement. Beyond the displacement, for some instructions,
+ * is the immediate operand.
+ */
+ cursor = auprobe->insn + insn_offset_modrm(insn);
+ /*
+ * Change modrm from "00 reg 101" to "10 reg reg2". Example:
+ * 89 05 disp32 mov %eax,disp32(%rip) becomes
+ * 89 86 disp32 mov %eax,disp32(%rsi)
+ */
+ *cursor = 0x80 | (reg << 3) | reg2;
+}
+
+static inline unsigned long *
+scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI)
+ return &regs->si;
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI)
+ return &regs->di;
+ return &regs->bx;
+}
+
+/*
+ * If we're emulating a rip-relative instruction, save the contents
+ * of the scratch register and store the target address in that register.
+ */
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
+ struct uprobe_task *utask = current->utask;
+ unsigned long *sr = scratch_reg(auprobe, regs);
+
+ utask->autask.saved_scratch_register = *sr;
+ *sr = utask->vaddr + auprobe->defparam.ilen;
+ }
+}
+
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
+ struct uprobe_task *utask = current->utask;
+ unsigned long *sr = scratch_reg(auprobe, regs);
+
+ *sr = utask->autask.saved_scratch_register;
+ }
+}
+#else /* 32-bit: */
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+ return false;
+}
+/*
+ * No RIP-relative addressing on 32-bit
+ */
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
+{
+}
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+}
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+}
+#endif /* CONFIG_X86_64 */
+
+struct uprobe_xol_ops {
+ bool (*emulate)(struct arch_uprobe *, struct pt_regs *);
+ int (*pre_xol)(struct arch_uprobe *, struct pt_regs *);
+ int (*post_xol)(struct arch_uprobe *, struct pt_regs *);
+ void (*abort)(struct arch_uprobe *, struct pt_regs *);
+};
+
+static inline int sizeof_long(void)
+{
+ return is_ia32_task() ? 4 : 8;
+}
+
+static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ riprel_pre_xol(auprobe, regs);
+ return 0;
+}
+
+static int push_ret_address(struct pt_regs *regs, unsigned long ip)
+{
+ unsigned long new_sp = regs->sp - sizeof_long();
+
+ if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
+ return -EFAULT;
+
+ regs->sp = new_sp;
+ return 0;
+}
+
+/*
+ * We have to fix things up as follows:
+ *
+ * Typically, the new ip is relative to the copied instruction. We need
+ * to make it relative to the original instruction (FIX_IP). Exceptions
+ * are return instructions and absolute or indirect jump or call instructions.
+ *
+ * If the single-stepped instruction was a call, the return address that
+ * is atop the stack is the address following the copied instruction. We
+ * need to make it the address following the original instruction (FIX_CALL).
+ *
+ * If the original instruction was a rip-relative instruction such as
+ * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
+ * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)".
+ * We need to restore the contents of the scratch register
+ * (FIX_RIP_reg).
+ */
+static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ riprel_post_xol(auprobe, regs);
+ if (auprobe->defparam.fixups & UPROBE_FIX_IP) {
+ long correction = utask->vaddr - utask->xol_vaddr;
+ regs->ip += correction;
+ } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
+ regs->sp += sizeof_long(); /* Pop incorrect return address */
+ if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
+ return -ERESTART;
+ }
+ /* popf; tell the caller to not touch TF */
+ if (auprobe->defparam.fixups & UPROBE_FIX_SETF)
+ utask->autask.saved_tf = true;
+
+ return 0;
+}
+
+static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ riprel_post_xol(auprobe, regs);
+}
+
+static struct uprobe_xol_ops default_xol_ops = {
+ .pre_xol = default_pre_xol_op,
+ .post_xol = default_post_xol_op,
+ .abort = default_abort_op,
+};
+
+static bool branch_is_call(struct arch_uprobe *auprobe)
+{
+ return auprobe->branch.opc1 == 0xe8;
+}
+
+#define CASE_COND \
+ COND(70, 71, XF(OF)) \
+ COND(72, 73, XF(CF)) \
+ COND(74, 75, XF(ZF)) \
+ COND(78, 79, XF(SF)) \
+ COND(7a, 7b, XF(PF)) \
+ COND(76, 77, XF(CF) || XF(ZF)) \
+ COND(7c, 7d, XF(SF) != XF(OF)) \
+ COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF))
+
+#define COND(op_y, op_n, expr) \
+ case 0x ## op_y: DO((expr) != 0) \
+ case 0x ## op_n: DO((expr) == 0)
+
+#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf))
+
+static bool is_cond_jmp_opcode(u8 opcode)
+{
+ switch (opcode) {
+ #define DO(expr) \
+ return true;
+ CASE_COND
+ #undef DO
+
+ default:
+ return false;
+ }
+}
+
+static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long flags = regs->flags;
+
+ switch (auprobe->branch.opc1) {
+ #define DO(expr) \
+ return expr;
+ CASE_COND
+ #undef DO
+
+ default: /* not a conditional jmp */
+ return true;
+ }
+}
+
+#undef XF
+#undef COND
+#undef CASE_COND
+
+static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long new_ip = regs->ip += auprobe->branch.ilen;
+ unsigned long offs = (long)auprobe->branch.offs;
+
+ if (branch_is_call(auprobe)) {
+ /*
+ * If it fails we execute this (mangled, see the comment in
+ * branch_clear_offset) insn out-of-line. In the likely case
+ * this should trigger the trap, and the probed application
+ * should die or restart the same insn after it handles the
+ * signal, arch_uprobe_post_xol() won't be even called.
+ *
+ * But there is corner case, see the comment in ->post_xol().
+ */
+ if (push_ret_address(regs, new_ip))
+ return false;
+ } else if (!check_jmp_cond(auprobe, regs)) {
+ offs = 0;
+ }
+
+ regs->ip = new_ip + offs;
+ return true;
+}
+
+static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ BUG_ON(!branch_is_call(auprobe));
+ /*
+ * We can only get here if branch_emulate_op() failed to push the ret
+ * address _and_ another thread expanded our stack before the (mangled)
+ * "call" insn was executed out-of-line. Just restore ->sp and restart.
+ * We could also restore ->ip and try to call branch_emulate_op() again.
+ */
+ regs->sp += sizeof_long();
+ return -ERESTART;
+}
+
+static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ /*
+ * Turn this insn into "call 1f; 1:", this is what we will execute
+ * out-of-line if ->emulate() fails. We only need this to generate
+ * a trap, so that the probed task receives the correct signal with
+ * the properly filled siginfo.
+ *
+ * But see the comment in ->post_xol(), in the unlikely case it can
+ * succeed. So we need to ensure that the new ->ip can not fall into
+ * the non-canonical area and trigger #GP.
+ *
+ * We could turn it into (say) "pushf", but then we would need to
+ * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte
+ * of ->insn[] for set_orig_insn().
+ */
+ memset(auprobe->insn + insn_offset_immediate(insn),
+ 0, insn->immediate.nbytes);
+}
+
+static struct uprobe_xol_ops branch_xol_ops = {
+ .emulate = branch_emulate_op,
+ .post_xol = branch_post_xol_op,
+};
+
+/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
+static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 opc1 = OPCODE1(insn);
+ int i;
+
+ switch (opc1) {
+ case 0xeb: /* jmp 8 */
+ case 0xe9: /* jmp 32 */
+ case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
+ break;
+
+ case 0xe8: /* call relative */
+ branch_clear_offset(auprobe, insn);
+ break;
+
+ case 0x0f:
+ if (insn->opcode.nbytes != 2)
+ return -ENOSYS;
+ /*
+ * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches
+ * OPCODE1() of the "short" jmp which checks the same condition.
+ */
+ opc1 = OPCODE2(insn) - 0x10;
+ default:
+ if (!is_cond_jmp_opcode(opc1))
+ return -ENOSYS;
+ }
+
+ /*
+ * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported.
+ * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
+ * No one uses these insns, reject any branch insns with such prefix.
+ */
+ for (i = 0; i < insn->prefixes.nbytes; i++) {
+ if (insn->prefixes.bytes[i] == 0x66)
+ return -ENOTSUPP;
+ }
+
+ auprobe->branch.opc1 = opc1;
+ auprobe->branch.ilen = insn->length;
+ auprobe->branch.offs = insn->immediate.value;
+
+ auprobe->ops = &branch_xol_ops;
+ return 0;
+}
+
+/**
+ * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
+ * @mm: the probed address space.
+ * @arch_uprobe: the probepoint information.
+ * @addr: virtual address at which to install the probepoint
+ * Return 0 on success or a -ve number on error.
+ */
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
+{
+ struct insn insn;
+ u8 fix_ip_or_call = UPROBE_FIX_IP;
+ int ret;
+
+ ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
+ if (ret)
+ return ret;
+
+ ret = branch_setup_xol_ops(auprobe, &insn);
+ if (ret != -ENOSYS)
+ return ret;
+
+ /*
+ * Figure out which fixups default_post_xol_op() will need to perform,
+ * and annotate defparam->fixups accordingly.
+ */
+ switch (OPCODE1(&insn)) {
+ case 0x9d: /* popf */
+ auprobe->defparam.fixups |= UPROBE_FIX_SETF;
+ break;
+ case 0xc3: /* ret or lret -- ip is correct */
+ case 0xcb:
+ case 0xc2:
+ case 0xca:
+ case 0xea: /* jmp absolute -- ip is correct */
+ fix_ip_or_call = 0;
+ break;
+ case 0x9a: /* call absolute - Fix return addr, not ip */
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
+ case 0xff:
+ switch (MODRM_REG(&insn)) {
+ case 2: case 3: /* call or lcall, indirect */
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
+ case 4: case 5: /* jmp or ljmp, indirect */
+ fix_ip_or_call = 0;
+ break;
+ }
+ /* fall through */
+ default:
+ riprel_analyze(auprobe, &insn);
+ }
+
+ auprobe->defparam.ilen = insn.length;
+ auprobe->defparam.fixups |= fix_ip_or_call;
+
+ auprobe->ops = &default_xol_ops;
+ return 0;
+}
+
+/*
+ * arch_uprobe_pre_xol - prepare to execute out of line.
+ * @auprobe: the probepoint information.
+ * @regs: reflects the saved user state of current task.
+ */
+int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (auprobe->ops->pre_xol) {
+ int err = auprobe->ops->pre_xol(auprobe, regs);
+ if (err)
+ return err;
+ }
+
+ regs->ip = utask->xol_vaddr;
+ utask->autask.saved_trap_nr = current->thread.trap_nr;
+ current->thread.trap_nr = UPROBE_TRAP_NR;
+
+ utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF);
+ regs->flags |= X86_EFLAGS_TF;
+ if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
+ set_task_blockstep(current, false);
+
+ return 0;
+}
+
+/*
+ * If xol insn itself traps and generates a signal(Say,
+ * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped
+ * instruction jumps back to its own address. It is assumed that anything
+ * like do_page_fault/do_trap/etc sets thread.trap_nr != -1.
+ *
+ * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
+ * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
+ * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol().
+ */
+bool arch_uprobe_xol_was_trapped(struct task_struct *t)
+{
+ if (t->thread.trap_nr != UPROBE_TRAP_NR)
+ return true;
+
+ return false;
+}
+
+/*
+ * Called after single-stepping. To avoid the SMP problems that can
+ * occur when we temporarily put back the original opcode to
+ * single-step, we single-stepped a copy of the instruction.
+ *
+ * This function prepares to resume execution after the single-step.
+ */
+int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+ bool send_sigtrap = utask->autask.saved_tf;
+ int err = 0;
+
+ WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+
+ if (auprobe->ops->post_xol) {
+ err = auprobe->ops->post_xol(auprobe, regs);
+ if (err) {
+ /*
+ * Restore ->ip for restart or post mortem analysis.
+ * ->post_xol() must not return -ERESTART unless this
+ * is really possible.
+ */
+ regs->ip = utask->vaddr;
+ if (err == -ERESTART)
+ err = 0;
+ send_sigtrap = false;
+ }
+ }
+ /*
+ * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
+ * so we can get an extra SIGTRAP if we do not clear TF. We need
+ * to examine the opcode to make it right.
+ */
+ if (send_sigtrap)
+ send_sig(SIGTRAP, current, 0);
+
+ if (!utask->autask.saved_tf)
+ regs->flags &= ~X86_EFLAGS_TF;
+
+ return err;
+}
+
+/* callback routine for handling exceptions. */
+int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
+{
+ struct die_args *args = data;
+ struct pt_regs *regs = args->regs;
+ int ret = NOTIFY_DONE;
+
+ /* We are only interested in userspace traps */
+ if (regs && !user_mode_vm(regs))
+ return NOTIFY_DONE;
+
+ switch (val) {
+ case DIE_INT3:
+ if (uprobe_pre_sstep_notifier(regs))
+ ret = NOTIFY_STOP;
+
+ break;
+
+ case DIE_DEBUG:
+ if (uprobe_post_sstep_notifier(regs))
+ ret = NOTIFY_STOP;
+
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * This function gets called when XOL instruction either gets trapped or
+ * the thread has a fatal signal. Reset the instruction pointer to its
+ * probed address for the potential restart or for post mortem analysis.
+ */
+void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (auprobe->ops->abort)
+ auprobe->ops->abort(auprobe, regs);
+
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+ regs->ip = utask->vaddr;
+ /* clear TF if it was set by us in arch_uprobe_pre_xol() */
+ if (!utask->autask.saved_tf)
+ regs->flags &= ~X86_EFLAGS_TF;
+}
+
+static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->ops->emulate)
+ return auprobe->ops->emulate(auprobe, regs);
+ return false;
+}
+
+bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ bool ret = __skip_sstep(auprobe, regs);
+ if (ret && (regs->flags & X86_EFLAGS_TF))
+ send_sig(SIGTRAP, current, 0);
+ return ret;
+}
+
+unsigned long
+arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
+{
+ int rasize = sizeof_long(), nleft;
+ unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
+
+ if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
+ return -1;
+
+ /* check whether address has been already hijacked */
+ if (orig_ret_vaddr == trampoline_vaddr)
+ return orig_ret_vaddr;
+
+ nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
+ if (likely(!nleft))
+ return orig_ret_vaddr;
+
+ if (nleft != rasize) {
+ pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, "
+ "%%ip=%#lx\n", current->pid, regs->sp, regs->ip);
+
+ force_sig_info(SIGSEGV, SEND_SIG_FORCED, current);
+ }
+
+ return -1;
+}
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index b466cab5ba1..e8edcf52e06 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -28,9 +28,12 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
+#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/signal.h>
@@ -46,7 +49,6 @@
#include <asm/io.h>
#include <asm/tlbflush.h>
#include <asm/irq.h>
-#include <asm/syscalls.h>
/*
* Known problems:
@@ -137,14 +139,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
local_irq_enable();
if (!current->thread.vm86_info) {
- printk("no vm86_info: BAD\n");
+ pr_alert("no vm86_info: BAD\n");
do_exit(SIGSEGV);
}
set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
if (tmp) {
- printk("vm86: could not access userspace vm86_info\n");
+ pr_alert("could not access userspace vm86_info\n");
do_exit(SIGSEGV);
}
@@ -172,6 +174,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
spinlock_t *ptl;
int i;
+ down_write(&mm->mmap_sem);
pgd = pgd_offset(mm, 0xA0000);
if (pgd_none_or_clear_bad(pgd))
goto out;
@@ -179,7 +182,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
- split_huge_page_pmd(mm, pmd);
+ split_huge_page_pmd_mm(mm, 0xA0000, pmd);
if (pmd_none_or_clear_bad(pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
@@ -190,6 +193,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
}
pte_unmap_unlock(pte, ptl);
out:
+ up_write(&mm->mmap_sem);
flush_tlb();
}
@@ -198,36 +202,32 @@ out:
static int do_vm86_irq_handling(int subfunction, int irqnumber);
static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
-int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs)
+SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86)
{
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
* This remains on the stack until we
* return to 32 bit user space.
*/
- struct task_struct *tsk;
- int tmp, ret = -EPERM;
+ struct task_struct *tsk = current;
+ int tmp;
- tsk = current;
if (tsk->thread.saved_sp0)
- goto out;
+ return -EPERM;
tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
offsetof(struct kernel_vm86_struct, vm86plus) -
sizeof(info.regs));
- ret = -EFAULT;
if (tmp)
- goto out;
+ return -EFAULT;
memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
- info.regs32 = regs;
+ info.regs32 = current_pt_regs();
tsk->thread.vm86_info = v86;
do_sys_vm86(&info, tsk);
- ret = 0; /* we never return here */
-out:
- return ret;
+ return 0; /* we never return here */
}
-int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
+SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
{
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
@@ -235,7 +235,7 @@ int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
* return to 32 bit user space.
*/
struct task_struct *tsk;
- int tmp, ret;
+ int tmp;
struct vm86plus_struct __user *v86;
tsk = current;
@@ -244,8 +244,7 @@ int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
case VM86_FREE_IRQ:
case VM86_GET_IRQ_BITS:
case VM86_GET_AND_RESET_IRQ:
- ret = do_vm86_irq_handling(cmd, (int)arg);
- goto out;
+ return do_vm86_irq_handling(cmd, (int)arg);
case VM86_PLUS_INSTALL_CHECK:
/*
* NOTE: on old vm86 stuff this will return the error
@@ -253,28 +252,23 @@ int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
* interpreted as (invalid) address to vm86_struct.
* So the installation check works.
*/
- ret = 0;
- goto out;
+ return 0;
}
/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
- ret = -EPERM;
if (tsk->thread.saved_sp0)
- goto out;
+ return -EPERM;
v86 = (struct vm86plus_struct __user *)arg;
tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
offsetof(struct kernel_vm86_struct, regs32) -
sizeof(info.regs));
- ret = -EFAULT;
if (tmp)
- goto out;
- info.regs32 = regs;
+ return -EFAULT;
+ info.regs32 = current_pt_regs();
info.vm86plus.is_vm86pus = 1;
tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
do_sys_vm86(&info, tsk);
- ret = 0; /* we never return here */
-out:
- return ret;
+ return 0; /* we never return here */
}
@@ -557,9 +551,9 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
if ((trapno == 3) || (trapno == 1)) {
KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
/* setting this flag forces the code in entry_32.S to
- call save_v86_state() and change the stack pointer
- to KVM86->regs32 */
- set_thread_flag(TIF_IRET);
+ the path where we call save_v86_state() and change
+ the stack pointer to KVM86->regs32 */
+ set_thread_flag(TIF_NOTIFY_RESUME);
return 0;
}
do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
@@ -567,7 +561,7 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
}
if (trapno != 1)
return 1; /* we let this handle by the calling routine */
- current->thread.trap_no = trapno;
+ current->thread.trap_nr = trapno;
current->thread.error_code = error_code;
force_sig(SIGTRAP, current);
return 0;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 0f703f10901..49edf2dd361 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -94,10 +94,6 @@ SECTIONS
_text = .;
/* bootstrapping code */
HEAD_TEXT
-#ifdef CONFIG_X86_32
- . = ALIGN(PAGE_SIZE);
- *(.text..page_aligned)
-#endif
. = ALIGN(8);
_stext = .;
TEXT_TEXT
@@ -151,7 +147,6 @@ SECTIONS
_edata = .;
} :data
-#ifdef CONFIG_X86_64
. = ALIGN(PAGE_SIZE);
__vvar_page = .;
@@ -169,12 +164,15 @@ SECTIONS
#undef __VVAR_KERNEL_LDS
#undef EMIT_VVAR
+ /*
+ * Pad the rest of the page with zeros. Otherwise the loader
+ * can leave garbage here.
+ */
+ . = __vvar_beginning_hack + PAGE_SIZE;
} :data
. = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
-#endif /* CONFIG_X86_64 */
-
/* Init code and data - will be freed after init */
. = ALIGN(PAGE_SIZE);
.init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
@@ -197,24 +195,21 @@ SECTIONS
INIT_DATA_SECTION(16)
- /*
- * Code and data for a variety of lowlevel trampolines, to be
- * copied into base memory (< 1 MiB) during initialization.
- * Since it is copied early, the main copy can be discarded
- * afterwards.
- */
- .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) {
- x86_trampoline_start = .;
- *(.x86_trampoline)
- x86_trampoline_end = .;
- }
-
.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
__x86_cpu_dev_start = .;
*(.x86_cpu_dev.init)
__x86_cpu_dev_end = .;
}
+#ifdef CONFIG_X86_INTEL_MID
+ .x86_intel_mid_dev.init : AT(ADDR(.x86_intel_mid_dev.init) - \
+ LOAD_OFFSET) {
+ __x86_intel_mid_dev_start = .;
+ *(.x86_intel_mid_dev.init)
+ __x86_intel_mid_dev_end = .;
+ }
+#endif
+
/*
* start address and size of operations which during runtime
* can be patched with virtualization friendly instructions or
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a1d804bcd48..b99b9ad8540 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -15,6 +15,8 @@
#include <linux/init.h>
#include <linux/pci_ids.h>
#include <linux/pci_regs.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
#include <asm/apic.h>
#include <asm/pci-direct.h>
@@ -22,6 +24,11 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
+#define TOPOLOGY_REGISTER_OFFSET 0x10
+
+/* Flag below is initialized once during vSMP PCI initialization. */
+static int irq_routing_comply = 1;
+
#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
/*
* Interrupt control on vSMPowered systems:
@@ -29,7 +36,7 @@
* and vice versa.
*/
-static unsigned long vsmp_save_fl(void)
+asmlinkage __visible unsigned long vsmp_save_fl(void)
{
unsigned long flags = native_save_fl();
@@ -39,7 +46,7 @@ static unsigned long vsmp_save_fl(void)
}
PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
-static void vsmp_restore_fl(unsigned long flags)
+__visible void vsmp_restore_fl(unsigned long flags)
{
if (flags & X86_EFLAGS_IF)
flags &= ~X86_EFLAGS_AC;
@@ -49,7 +56,7 @@ static void vsmp_restore_fl(unsigned long flags)
}
PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
-static void vsmp_irq_disable(void)
+asmlinkage __visible void vsmp_irq_disable(void)
{
unsigned long flags = native_save_fl();
@@ -57,7 +64,7 @@ static void vsmp_irq_disable(void)
}
PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
-static void vsmp_irq_enable(void)
+asmlinkage __visible void vsmp_irq_enable(void)
{
unsigned long flags = native_save_fl();
@@ -92,6 +99,22 @@ static void __init set_vsmp_pv_ops(void)
ctl = readl(address + 4);
printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n",
cap, ctl);
+
+ /* If possible, let the vSMP foundation route the interrupt optimally */
+#ifdef CONFIG_SMP
+ if (cap & ctl & BIT(8)) {
+ ctl &= ~BIT(8);
+
+ /* Interrupt routing set to ignore */
+ irq_routing_comply = 0;
+
+#ifdef CONFIG_PROC_FS
+ /* Don't let users change irq affinity via procfs */
+ no_irq_affinity = 1;
+#endif
+ }
+#endif
+
if (cap & ctl & (1 << 4)) {
/* Setup irq ops and turn on vSMP IRQ fastpath handling */
pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
@@ -99,12 +122,11 @@ static void __init set_vsmp_pv_ops(void)
pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
pv_init_ops.patch = vsmp_patch;
-
ctl &= ~(1 << 4);
- writel(ctl, address + 4);
- ctl = readl(address + 4);
- printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl);
}
+ writel(ctl, address + 4);
+ ctl = readl(address + 4);
+ pr_info("vSMP CTL: control set to:0x%08x\n", ctl);
early_iounmap(address, 8);
}
@@ -149,12 +171,75 @@ int is_vsmp_box(void)
return 0;
}
#endif
+
+static void __init vsmp_cap_cpus(void)
+{
+#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP)
+ void __iomem *address;
+ unsigned int cfg, topology, node_shift, maxcpus;
+
+ /*
+ * CONFIG_X86_VSMP is not configured, so limit the number CPUs to the
+ * ones present in the first board, unless explicitly overridden by
+ * setup_max_cpus
+ */
+ if (setup_max_cpus != NR_CPUS)
+ return;
+
+ /* Read the vSMP Foundation topology register */
+ cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
+ address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4);
+ if (WARN_ON(!address))
+ return;
+
+ topology = readl(address);
+ node_shift = (topology >> 16) & 0x7;
+ if (!node_shift)
+ /* The value 0 should be decoded as 8 */
+ node_shift = 8;
+ maxcpus = (topology & ((1 << node_shift) - 1)) + 1;
+
+ pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n",
+ maxcpus);
+ setup_max_cpus = maxcpus;
+ early_iounmap(address, 4);
+#endif
+}
+
+static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return hard_smp_processor_id() >> index_msb;
+}
+
+/*
+ * In vSMP, all cpus should be capable of handling interrupts, regardless of
+ * the APIC used.
+ */
+static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask,
+ const struct cpumask *mask)
+{
+ cpumask_setall(retmask);
+}
+
+static void vsmp_apic_post_init(void)
+{
+ /* need to update phys_pkg_id */
+ apic->phys_pkg_id = apicid_phys_pkg_id;
+
+ if (!irq_routing_comply)
+ apic->vector_allocation_domain = fill_vector_allocation_domain;
+}
+
void __init vsmp_init(void)
{
detect_vsmp_box();
if (!is_vsmp_box())
return;
+ x86_platform.apic_post_init = vsmp_apic_post_init;
+
+ vsmp_cap_cpus();
+
set_vsmp_pv_ops();
return;
}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index b07ba939356..ea5b5709aa7 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -18,6 +18,8 @@
* use the vDSO.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -26,7 +28,7 @@
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/topology.h>
-#include <linux/clocksource.h>
+#include <linux/timekeeper_internal.h>
#include <linux/getcpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
@@ -45,17 +47,12 @@
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
-#include <asm/vgtod.h>
#include <asm/traps.h>
#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"
DEFINE_VVAR(int, vgetcpu_mode);
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
-{
- .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
-};
static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
@@ -78,59 +75,23 @@ static int __init vsyscall_setup(char *str)
}
early_param("vsyscall", vsyscall_setup);
-void update_vsyscall_tz(void)
-{
- unsigned long flags;
-
- write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
- /* sys_tz has changed */
- vsyscall_gtod_data.sys_tz = sys_tz;
- write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
-}
-
-void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
- struct clocksource *clock, u32 mult)
-{
- unsigned long flags;
-
- write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
-
- /* copy vsyscall data */
- vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
- vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
- vsyscall_gtod_data.clock.mask = clock->mask;
- vsyscall_gtod_data.clock.mult = mult;
- vsyscall_gtod_data.clock.shift = clock->shift;
- vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
- vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
- vsyscall_gtod_data.wall_to_monotonic = *wtm;
- vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
-
- write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
-}
-
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
const char *message)
{
- static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
- struct task_struct *tsk;
-
- if (!show_unhandled_signals || !__ratelimit(&rs))
+ if (!show_unhandled_signals)
return;
- tsk = current;
-
- printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
- level, tsk->comm, task_pid_nr(tsk),
- message, regs->ip, regs->cs,
- regs->sp, regs->ax, regs->si, regs->di);
+ pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+ level, current->comm, task_pid_nr(current),
+ message, regs->ip, regs->cs,
+ regs->sp, regs->ax, regs->si, regs->di);
}
static int addr_to_vsyscall_nr(unsigned long addr)
{
int nr;
- if ((addr & ~0xC00UL) != VSYSCALL_START)
+ if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
return -EINVAL;
nr = (addr & 0xC00UL) >> 10;
@@ -153,7 +114,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
thread->error_code = 6; /* user fault, no page, write */
thread->cr2 = ptr;
- thread->trap_no = 14;
+ thread->trap_nr = X86_TRAP_PF;
memset(&info, 0, sizeof(info));
info.si_signo = SIGSEGV;
@@ -172,7 +133,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
struct task_struct *tsk;
unsigned long caller;
- int vsyscall_nr;
+ int vsyscall_nr, syscall_nr, tmp;
int prev_sig_on_uaccess_error;
long ret;
@@ -206,8 +167,63 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
}
tsk = current;
- if (seccomp_mode(&tsk->seccomp))
- do_exit(SIGKILL);
+
+ /*
+ * Check for access_ok violations and find the syscall nr.
+ *
+ * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
+ * 64-bit, so we don't need to special-case it here. For all the
+ * vsyscalls, NULL means "don't write anything" not "write it at
+ * address 0".
+ */
+ switch (vsyscall_nr) {
+ case 0:
+ if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+ !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_gettimeofday;
+ break;
+
+ case 1:
+ if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_time;
+ break;
+
+ case 2:
+ if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+ !write_ok_or_segv(regs->si, sizeof(unsigned))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_getcpu;
+ break;
+ }
+
+ /*
+ * Handle seccomp. regs->ip must be the original value.
+ * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
+ *
+ * We could optimize the seccomp disabled case, but performance
+ * here doesn't matter.
+ */
+ regs->orig_ax = syscall_nr;
+ regs->ax = -ENOSYS;
+ tmp = secure_computing(syscall_nr);
+ if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
+ warn_bad_vsyscall(KERN_DEBUG, regs,
+ "seccomp tried to change syscall nr or ip");
+ do_exit(SIGSYS);
+ }
+ if (tmp)
+ goto do_ret; /* skip requested */
/*
* With a real vsyscall, page faults cause SIGSEGV. We want to
@@ -216,44 +232,28 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
current_thread_info()->sig_on_uaccess_error = 1;
- /*
- * 0 is a valid user pointer (in the access_ok sense) on 32-bit and
- * 64-bit, so we don't need to special-case it here. For all the
- * vsyscalls, 0 means "don't write anything" not "write it at
- * address 0".
- */
ret = -EFAULT;
switch (vsyscall_nr) {
case 0:
- if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
- !write_ok_or_segv(regs->si, sizeof(struct timezone)))
- break;
-
ret = sys_gettimeofday(
(struct timeval __user *)regs->di,
(struct timezone __user *)regs->si);
break;
case 1:
- if (!write_ok_or_segv(regs->di, sizeof(time_t)))
- break;
-
ret = sys_time((time_t __user *)regs->di);
break;
case 2:
- if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
- !write_ok_or_segv(regs->si, sizeof(unsigned)))
- break;
-
ret = sys_getcpu((unsigned __user *)regs->di,
(unsigned __user *)regs->si,
- 0);
+ NULL);
break;
}
current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
+check_fault:
if (ret == -EFAULT) {
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
@@ -272,10 +272,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
regs->ax = ret;
+do_ret:
/* Emulate a ret instruction. */
regs->ip = caller;
regs->sp += 8;
-
return true;
sigsegv:
@@ -287,7 +287,7 @@ sigsegv:
* Assume __initcall executes before all user space. Hopefully kmod
* doesn't violate that. We'll find out if it does.
*/
-static void __cpuinit vsyscall_set_cpu(int cpu)
+static void vsyscall_set_cpu(int cpu)
{
unsigned long d;
unsigned long node = 0;
@@ -309,13 +309,13 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
-static void __cpuinit cpu_vsyscall_init(void *arg)
+static void cpu_vsyscall_init(void *arg)
{
/* preemption should be already off */
vsyscall_set_cpu(raw_smp_processor_id());
}
-static int __cpuinit
+static int
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
long cpu = (long)arg;
@@ -330,28 +330,24 @@ void __init map_vsyscall(void)
{
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
- extern char __vvar_page;
- unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
- __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
+ __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
vsyscall_mode == NATIVE
? PAGE_KERNEL_VSYSCALL
: PAGE_KERNEL_VVAR);
- BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) !=
- (unsigned long)VSYSCALL_START);
-
- __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
- BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
- (unsigned long)VVAR_ADDRESS);
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
+ (unsigned long)VSYSCALL_ADDR);
}
static int __init vsyscall_init(void)
{
- BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
+ cpu_notifier_register_begin();
on_each_cpu(cpu_vsyscall_init, NULL, 1);
/* notifier priority > KVM */
- hotcpu_notifier(cpu_vsyscall_notifier, 30);
+ __hotcpu_notifier(cpu_vsyscall_notifier, 30);
+
+ cpu_notifier_register_done();
return 0;
}
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
new file mode 100644
index 00000000000..9531fbb123b
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ * Modified for x86 32 bit architecture by
+ * Stefani Seibold <stefani@seibold.net>
+ * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
+ *
+ * Thanks to hpa@transmeta.com for some useful hint.
+ * Special thanks to Ingo Molnar for his early experience with
+ * a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ */
+
+#include <linux/timekeeper_internal.h>
+#include <asm/vgtod.h>
+#include <asm/vvar.h>
+
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
+
+void update_vsyscall_tz(void)
+{
+ vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest;
+ vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime;
+}
+
+void update_vsyscall(struct timekeeper *tk)
+{
+ struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
+
+ gtod_write_begin(vdata);
+
+ /* copy vsyscall data */
+ vdata->vclock_mode = tk->clock->archdata.vclock_mode;
+ vdata->cycle_last = tk->clock->cycle_last;
+ vdata->mask = tk->clock->mask;
+ vdata->mult = tk->mult;
+ vdata->shift = tk->shift;
+
+ vdata->wall_time_sec = tk->xtime_sec;
+ vdata->wall_time_snsec = tk->xtime_nsec;
+
+ vdata->monotonic_time_sec = tk->xtime_sec
+ + tk->wall_to_monotonic.tv_sec;
+ vdata->monotonic_time_snsec = tk->xtime_nsec
+ + ((u64)tk->wall_to_monotonic.tv_nsec
+ << tk->shift);
+ while (vdata->monotonic_time_snsec >=
+ (((u64)NSEC_PER_SEC) << tk->shift)) {
+ vdata->monotonic_time_snsec -=
+ ((u64)NSEC_PER_SEC) << tk->shift;
+ vdata->monotonic_time_sec++;
+ }
+
+ vdata->wall_time_coarse_sec = tk->xtime_sec;
+ vdata->wall_time_coarse_nsec = (long)(tk->xtime_nsec >> tk->shift);
+
+ vdata->monotonic_time_coarse_sec =
+ vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
+ vdata->monotonic_time_coarse_nsec =
+ vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
+
+ while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) {
+ vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC;
+ vdata->monotonic_time_coarse_sec++;
+ }
+
+ gtod_write_end(vdata);
+}
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 9796c2f3d07..040681928e9 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -13,9 +13,13 @@
#include <asm/ftrace.h>
#ifdef CONFIG_FUNCTION_TRACER
-/* mcount is defined in assembly */
+/* mcount and __fentry__ are defined in assembly */
+#ifdef CC_USING_FENTRY
+EXPORT_SYMBOL(__fentry__);
+#else
EXPORT_SYMBOL(mcount);
#endif
+#endif
EXPORT_SYMBOL(__get_user_1);
EXPORT_SYMBOL(__get_user_2);
@@ -28,6 +32,7 @@ EXPORT_SYMBOL(__put_user_8);
EXPORT_SYMBOL(copy_user_generic_string);
EXPORT_SYMBOL(copy_user_generic_unrolled);
+EXPORT_SYMBOL(copy_user_enhanced_fast_string);
EXPORT_SYMBOL(__copy_user_nocache);
EXPORT_SYMBOL(_copy_from_user);
EXPORT_SYMBOL(_copy_to_user);
@@ -54,7 +59,17 @@ EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(memmove);
+#ifndef CONFIG_DEBUG_VIRTUAL
+EXPORT_SYMBOL(phys_base);
+#endif
EXPORT_SYMBOL(empty_zero_page);
#ifndef CONFIG_PARAVIRT
EXPORT_SYMBOL(native_load_gs_index);
#endif
+
+#ifdef CONFIG_PREEMPT
+EXPORT_SYMBOL(___preempt_schedule);
+#ifdef CONFIG_CONTEXT_TRACKING
+EXPORT_SYMBOL(___preempt_schedule_context);
+#endif
+#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 947a06ccc67..e48b674639c 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -18,17 +18,17 @@
#include <asm/e820.h>
#include <asm/time.h>
#include <asm/irq.h>
+#include <asm/io_apic.h>
+#include <asm/hpet.h>
#include <asm/pat.h>
#include <asm/tsc.h>
#include <asm/iommu.h>
#include <asm/mach_traps.h>
-void __cpuinit x86_init_noop(void) { }
+void x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
-void __init x86_init_pgd_noop(pgd_t *unused) { }
int __init iommu_init_noop(void) { return 0; }
void iommu_shutdown_noop(void) { }
-void wallclock_init_noop(void) { }
/*
* The platform setup functions are preset with the default functions
@@ -63,13 +63,8 @@ struct x86_init_ops x86_init __initdata = {
.banner = default_banner,
},
- .mapping = {
- .pagetable_reserve = native_pagetable_reserve,
- },
-
.paging = {
- .pagetable_setup_start = native_pagetable_setup_start,
- .pagetable_setup_done = native_pagetable_setup_done,
+ .pagetable_init = native_pagetable_init,
},
.timers = {
@@ -90,9 +85,9 @@ struct x86_init_ops x86_init __initdata = {
},
};
-struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
+struct x86_cpuinit_ops x86_cpuinit = {
+ .early_percpu_clock_init = x86_init_noop,
.setup_percpu_clockev = setup_secondary_APIC_clock,
- .fixup_cpu_id = x86_default_fixup_cpu_id,
};
static void default_nmi_init(void) { };
@@ -100,20 +95,69 @@ static int default_i8042_detect(void) { return 1; };
struct x86_platform_ops x86_platform = {
.calibrate_tsc = native_calibrate_tsc,
- .wallclock_init = wallclock_init_noop,
.get_wallclock = mach_get_cmos_time,
.set_wallclock = mach_set_rtc_mmss,
.iommu_shutdown = iommu_shutdown_noop,
.is_untracked_pat_range = is_ISA_range,
.nmi_init = default_nmi_init,
.get_nmi_reason = default_get_nmi_reason,
- .i8042_detect = default_i8042_detect
+ .i8042_detect = default_i8042_detect,
+ .save_sched_clock_state = tsc_save_sched_clock_state,
+ .restore_sched_clock_state = tsc_restore_sched_clock_state,
};
EXPORT_SYMBOL_GPL(x86_platform);
+
+#if defined(CONFIG_PCI_MSI)
struct x86_msi_ops x86_msi = {
- .setup_msi_irqs = native_setup_msi_irqs,
- .teardown_msi_irq = native_teardown_msi_irq,
- .teardown_msi_irqs = default_teardown_msi_irqs,
- .restore_msi_irqs = default_restore_msi_irqs,
+ .setup_msi_irqs = native_setup_msi_irqs,
+ .compose_msi_msg = native_compose_msi_msg,
+ .teardown_msi_irq = native_teardown_msi_irq,
+ .teardown_msi_irqs = default_teardown_msi_irqs,
+ .restore_msi_irqs = default_restore_msi_irqs,
+ .setup_hpet_msi = default_setup_hpet_msi,
+ .msi_mask_irq = default_msi_mask_irq,
+ .msix_mask_irq = default_msix_mask_irq,
+};
+
+/* MSI arch specific hooks */
+int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ return x86_msi.setup_msi_irqs(dev, nvec, type);
+}
+
+void arch_teardown_msi_irqs(struct pci_dev *dev)
+{
+ x86_msi.teardown_msi_irqs(dev);
+}
+
+void arch_teardown_msi_irq(unsigned int irq)
+{
+ x86_msi.teardown_msi_irq(irq);
+}
+
+void arch_restore_msi_irqs(struct pci_dev *dev)
+{
+ x86_msi.restore_msi_irqs(dev);
+}
+u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
+{
+ return x86_msi.msi_mask_irq(desc, mask, flag);
+}
+u32 arch_msix_mask_irq(struct msi_desc *desc, u32 flag)
+{
+ return x86_msi.msix_mask_irq(desc, flag);
+}
+#endif
+
+struct x86_io_apic_ops x86_io_apic_ops = {
+ .init = native_io_apic_init_mappings,
+ .read = native_io_apic_read,
+ .write = native_io_apic_write,
+ .modify = native_io_apic_modify,
+ .disable = native_disable_io_apic,
+ .print_entries = native_io_apic_print_entries,
+ .set_affinity = native_ioapic_set_affinity,
+ .setup_entry = native_setup_ioapic_entry,
+ .eoi_ioapic_pin = native_eoi_ioapic_pin,
};
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 71109111411..a4b451c6add 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -3,12 +3,14 @@
*
* Author: Suresh Siddha <suresh.b.siddha@intel.com>
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/bootmem.h>
#include <linux/compat.h>
#include <asm/i387.h>
-#ifdef CONFIG_IA32_EMULATION
-#include <asm/sigcontext32.h>
-#endif
+#include <asm/fpu-internal.h>
+#include <asm/sigframe.h>
#include <asm/xcr.h>
/*
@@ -19,13 +21,9 @@ u64 pcntxt_mask;
/*
* Represents init state for the supported extended state.
*/
-static struct xsave_struct *init_xstate_buf;
-
-struct _fpx_sw_bytes fx_sw_reserved;
-#ifdef CONFIG_IA32_EMULATION
-struct _fpx_sw_bytes fx_sw_reserved_ia32;
-#endif
+struct xsave_struct *init_xstate_buf;
+static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
/*
@@ -40,15 +38,13 @@ static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
*/
void __sanitize_i387_state(struct task_struct *tsk)
{
- u64 xstate_bv;
- int feature_bit = 0x2;
struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
+ int feature_bit = 0x2;
+ u64 xstate_bv;
if (!fx)
return;
- BUG_ON(__thread_has_fpu(tsk));
-
xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
/*
@@ -102,213 +98,325 @@ void __sanitize_i387_state(struct task_struct *tsk)
* Check for the presence of extended state information in the
* user fpstate pointer in the sigcontext.
*/
-int check_for_xstate(struct i387_fxsave_struct __user *buf,
- void __user *fpstate,
- struct _fpx_sw_bytes *fx_sw_user)
+static inline int check_for_xstate(struct i387_fxsave_struct __user *buf,
+ void __user *fpstate,
+ struct _fpx_sw_bytes *fx_sw)
{
int min_xstate_size = sizeof(struct i387_fxsave_struct) +
sizeof(struct xsave_hdr_struct);
unsigned int magic2;
- int err;
- err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
- sizeof(struct _fpx_sw_bytes));
- if (err)
- return -EFAULT;
+ if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw)))
+ return -1;
- /*
- * First Magic check failed.
- */
- if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
- return -EINVAL;
+ /* Check for the first magic field and other error scenarios. */
+ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
+ fx_sw->xstate_size < min_xstate_size ||
+ fx_sw->xstate_size > xstate_size ||
+ fx_sw->xstate_size > fx_sw->extended_size)
+ return -1;
/*
- * Check for error scenarios.
- */
- if (fx_sw_user->xstate_size < min_xstate_size ||
- fx_sw_user->xstate_size > xstate_size ||
- fx_sw_user->xstate_size > fx_sw_user->extended_size)
- return -EINVAL;
-
- err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
- fx_sw_user->extended_size -
- FP_XSTATE_MAGIC2_SIZE));
- if (err)
- return err;
- /*
* Check for the presence of second magic word at the end of memory
* layout. This detects the case where the user just copied the legacy
* fpstate layout with out copying the extended state information
* in the memory layout.
*/
- if (magic2 != FP_XSTATE_MAGIC2)
- return -EFAULT;
+ if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))
+ || magic2 != FP_XSTATE_MAGIC2)
+ return -1;
return 0;
}
-#ifdef CONFIG_X86_64
/*
* Signal frame handlers.
*/
-
-int save_i387_xstate(void __user *buf)
+static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
{
- struct task_struct *tsk = current;
- int err = 0;
-
- if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size))
- return -EACCES;
-
- BUG_ON(sig_xstate_size < xstate_size);
-
- if ((unsigned long)buf % 64)
- printk("save_i387_xstate: bad fpstate %p\n", buf);
+ if (use_fxsr()) {
+ struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
+ struct user_i387_ia32_struct env;
+ struct _fpstate_ia32 __user *fp = buf;
- if (!used_math())
- return 0;
+ convert_from_fxsr(&env, tsk);
- if (user_has_fpu()) {
- if (use_xsave())
- err = xsave_user(buf);
- else
- err = fxsave_user(buf);
-
- if (err)
- return err;
- user_fpu_end();
+ if (__copy_to_user(buf, &env, sizeof(env)) ||
+ __put_user(xsave->i387.swd, &fp->status) ||
+ __put_user(X86_FXSR_MAGIC, &fp->magic))
+ return -1;
} else {
- sanitize_i387_state(tsk);
- if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
- xstate_size))
+ struct i387_fsave_struct __user *fp = buf;
+ u32 swd;
+ if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
return -1;
}
- clear_used_math(); /* trigger finit */
+ return 0;
+}
- if (use_xsave()) {
- struct _fpstate __user *fx = buf;
- struct _xstate __user *x = buf;
- u64 xstate_bv;
+static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
+{
+ struct xsave_struct __user *x = buf;
+ struct _fpx_sw_bytes *sw_bytes;
+ u32 xstate_bv;
+ int err;
- err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
- sizeof(struct _fpx_sw_bytes));
+ /* Setup the bytes not touched by the [f]xsave and reserved for SW. */
+ sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
+ err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
- err |= __put_user(FP_XSTATE_MAGIC2,
- (__u32 __user *) (buf + sig_xstate_size
- - FP_XSTATE_MAGIC2_SIZE));
+ if (!use_xsave())
+ return err;
- /*
- * Read the xstate_bv which we copied (directly from the cpu or
- * from the state in task struct) to the user buffers and
- * set the FP/SSE bits.
- */
- err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
+ err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size));
- /*
- * For legacy compatible, we always set FP/SSE bits in the bit
- * vector while saving the state to the user context. This will
- * enable us capturing any changes(during sigreturn) to
- * the FP/SSE bits by the legacy applications which don't touch
- * xstate_bv in the xsave header.
- *
- * xsave aware apps can change the xstate_bv in the xsave
- * header as well as change any contents in the memory layout.
- * xrestore as part of sigreturn will capture all the changes.
- */
- xstate_bv |= XSTATE_FPSSE;
+ /*
+ * Read the xstate_bv which we copied (directly from the cpu or
+ * from the state in task struct) to the user buffers.
+ */
+ err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv);
- err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
+ /*
+ * For legacy compatible, we always set FP/SSE bits in the bit
+ * vector while saving the state to the user context. This will
+ * enable us capturing any changes(during sigreturn) to
+ * the FP/SSE bits by the legacy applications which don't touch
+ * xstate_bv in the xsave header.
+ *
+ * xsave aware apps can change the xstate_bv in the xsave
+ * header as well as change any contents in the memory layout.
+ * xrestore as part of sigreturn will capture all the changes.
+ */
+ xstate_bv |= XSTATE_FPSSE;
- if (err)
- return err;
- }
+ err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv);
- return 1;
+ return err;
+}
+
+static inline int save_user_xstate(struct xsave_struct __user *buf)
+{
+ int err;
+
+ if (use_xsave())
+ err = xsave_user(buf);
+ else if (use_fxsr())
+ err = fxsave_user((struct i387_fxsave_struct __user *) buf);
+ else
+ err = fsave_user((struct i387_fsave_struct __user *) buf);
+
+ if (unlikely(err) && __clear_user(buf, xstate_size))
+ err = -EFAULT;
+ return err;
}
/*
- * Restore the extended state if present. Otherwise, restore the FP/SSE
- * state.
+ * Save the fpu, extended register state to the user signal frame.
+ *
+ * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
+ * state is copied.
+ * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
+ *
+ * buf == buf_fx for 64-bit frames and 32-bit fsave frame.
+ * buf != buf_fx for 32-bit frames with fxstate.
+ *
+ * If the fpu, extended register state is live, save the state directly
+ * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise,
+ * copy the thread's fpu state to the user frame starting at 'buf_fx'.
+ *
+ * If this is a 32-bit frame with fxstate, put a fsave header before
+ * the aligned state at 'buf_fx'.
+ *
+ * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
+ * indicating the absence/presence of the extended state to the user.
*/
-static int restore_user_xstate(void __user *buf)
+int save_xstate_sig(void __user *buf, void __user *buf_fx, int size)
{
- struct _fpx_sw_bytes fx_sw_user;
- u64 mask;
- int err;
+ struct xsave_struct *xsave = &current->thread.fpu.state->xsave;
+ struct task_struct *tsk = current;
+ int ia32_fxstate = (buf != buf_fx);
- if (((unsigned long)buf % 64) ||
- check_for_xstate(buf, buf, &fx_sw_user))
- goto fx_only;
+ ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
+ config_enabled(CONFIG_IA32_EMULATION));
- mask = fx_sw_user.xstate_bv;
+ if (!access_ok(VERIFY_WRITE, buf, size))
+ return -EACCES;
- /*
- * restore the state passed by the user.
- */
- err = xrestore_user(buf, mask);
- if (err)
- return err;
+ if (!static_cpu_has(X86_FEATURE_FPU))
+ return fpregs_soft_get(current, NULL, 0,
+ sizeof(struct user_i387_ia32_struct), NULL,
+ (struct _fpstate_ia32 __user *) buf) ? -1 : 1;
- /*
- * init the state skipped by the user.
- */
- mask = pcntxt_mask & ~mask;
- if (unlikely(mask))
- xrstor_state(init_xstate_buf, mask);
+ if (user_has_fpu()) {
+ /* Save the live register state to the user directly. */
+ if (save_user_xstate(buf_fx))
+ return -1;
+ /* Update the thread's fxstate to save the fsave header. */
+ if (ia32_fxstate)
+ fpu_fxsave(&tsk->thread.fpu);
+ } else {
+ sanitize_i387_state(tsk);
+ if (__copy_to_user(buf_fx, xsave, xstate_size))
+ return -1;
+ }
+
+ /* Save the fsave header for the 32-bit frames. */
+ if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
+ return -1;
+
+ if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
+ return -1;
+
+ drop_init_fpu(tsk); /* trigger finit */
return 0;
+}
-fx_only:
- /*
- * couldn't find the extended state information in the
- * memory layout. Restore just the FP/SSE and init all
- * the other extended state.
- */
- xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE);
- return fxrstor_checking((__force struct i387_fxsave_struct *)buf);
+static inline void
+sanitize_restored_xstate(struct task_struct *tsk,
+ struct user_i387_ia32_struct *ia32_env,
+ u64 xstate_bv, int fx_only)
+{
+ struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
+ struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr;
+
+ if (use_xsave()) {
+ /* These bits must be zero. */
+ xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+
+ /*
+ * Init the state that is not present in the memory
+ * layout and not enabled by the OS.
+ */
+ if (fx_only)
+ xsave_hdr->xstate_bv = XSTATE_FPSSE;
+ else
+ xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv);
+ }
+
+ if (use_fxsr()) {
+ /*
+ * mscsr reserved bits must be masked to zero for security
+ * reasons.
+ */
+ xsave->i387.mxcsr &= mxcsr_feature_mask;
+
+ convert_to_fxsr(tsk, ia32_env);
+ }
}
/*
- * This restores directly out of user space. Exceptions are handled.
+ * Restore the extended state if present. Otherwise, restore the FP/SSE state.
*/
-int restore_i387_xstate(void __user *buf)
+static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only)
+{
+ if (use_xsave()) {
+ if ((unsigned long)buf % 64 || fx_only) {
+ u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE;
+ xrstor_state(init_xstate_buf, init_bv);
+ return fxrstor_user(buf);
+ } else {
+ u64 init_bv = pcntxt_mask & ~xbv;
+ if (unlikely(init_bv))
+ xrstor_state(init_xstate_buf, init_bv);
+ return xrestore_user(buf, xbv);
+ }
+ } else if (use_fxsr()) {
+ return fxrstor_user(buf);
+ } else
+ return frstor_user(buf);
+}
+
+int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
{
+ int ia32_fxstate = (buf != buf_fx);
struct task_struct *tsk = current;
- int err = 0;
+ int state_size = xstate_size;
+ u64 xstate_bv = 0;
+ int fx_only = 0;
+
+ ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
+ config_enabled(CONFIG_IA32_EMULATION));
if (!buf) {
- if (used_math())
- goto clear;
+ drop_init_fpu(tsk);
return 0;
- } else
- if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
- return -EACCES;
+ }
- if (!used_math()) {
- err = init_fpu(tsk);
- if (err)
- return err;
+ if (!access_ok(VERIFY_READ, buf, size))
+ return -EACCES;
+
+ if (!used_math() && init_fpu(tsk))
+ return -1;
+
+ if (!static_cpu_has(X86_FEATURE_FPU))
+ return fpregs_soft_set(current, NULL,
+ 0, sizeof(struct user_i387_ia32_struct),
+ NULL, buf) != 0;
+
+ if (use_xsave()) {
+ struct _fpx_sw_bytes fx_sw_user;
+ if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) {
+ /*
+ * Couldn't find the extended state information in the
+ * memory layout. Restore just the FP/SSE and init all
+ * the other extended state.
+ */
+ state_size = sizeof(struct i387_fxsave_struct);
+ fx_only = 1;
+ } else {
+ state_size = fx_sw_user.xstate_size;
+ xstate_bv = fx_sw_user.xstate_bv;
+ }
}
- user_fpu_begin();
- if (use_xsave())
- err = restore_user_xstate(buf);
- else
- err = fxrstor_checking((__force struct i387_fxsave_struct *)
- buf);
- if (unlikely(err)) {
+ if (ia32_fxstate) {
+ /*
+ * For 32-bit frames with fxstate, copy the user state to the
+ * thread's fpu state, reconstruct fxstate from the fsave
+ * header. Sanitize the copied state etc.
+ */
+ struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
+ struct user_i387_ia32_struct env;
+ int err = 0;
+
+ /*
+ * Drop the current fpu which clears used_math(). This ensures
+ * that any context-switch during the copy of the new state,
+ * avoids the intermediate state from getting restored/saved.
+ * Thus avoiding the new restored state from getting corrupted.
+ * We will be ready to restore/save the state only after
+ * set_used_math() is again set.
+ */
+ drop_fpu(tsk);
+
+ if (__copy_from_user(xsave, buf_fx, state_size) ||
+ __copy_from_user(&env, buf, sizeof(env))) {
+ err = -1;
+ } else {
+ sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only);
+ set_used_math();
+ }
+
+ if (use_eager_fpu())
+ math_state_restore();
+
+ return err;
+ } else {
/*
- * Encountered an error while doing the restore from the
- * user buffer, clear the fpu state.
+ * For 64-bit frames and 32-bit fsave frames, restore the user
+ * state to the registers directly (with exceptions handled).
*/
-clear:
- clear_fpu(tsk);
- clear_used_math();
+ user_fpu_begin();
+ if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) {
+ drop_init_fpu(tsk);
+ return -1;
+ }
}
- return err;
+
+ return 0;
}
-#endif
/*
* Prepare the SW reserved portion of the fxsave memory layout, indicating
@@ -319,31 +427,22 @@ clear:
*/
static void prepare_fx_sw_frame(void)
{
- int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
- FP_XSTATE_MAGIC2_SIZE;
-
- sig_xstate_size = sizeof(struct _fpstate) + size_extended;
-
-#ifdef CONFIG_IA32_EMULATION
- sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
-#endif
+ int fsave_header_size = sizeof(struct i387_fsave_struct);
+ int size = xstate_size + FP_XSTATE_MAGIC2_SIZE;
- memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
+ if (config_enabled(CONFIG_X86_32))
+ size += fsave_header_size;
fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
- fx_sw_reserved.extended_size = sig_xstate_size;
+ fx_sw_reserved.extended_size = size;
fx_sw_reserved.xstate_bv = pcntxt_mask;
fx_sw_reserved.xstate_size = xstate_size;
-#ifdef CONFIG_IA32_EMULATION
- memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
- sizeof(struct _fpx_sw_bytes));
- fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
-#endif
-}
-#ifdef CONFIG_X86_64
-unsigned int sig_xstate_size = sizeof(struct _fpstate);
-#endif
+ if (config_enabled(CONFIG_IA32_EMULATION)) {
+ fx_sw_reserved_ia32 = fx_sw_reserved;
+ fx_sw_reserved_ia32.extended_size += fsave_header_size;
+ }
+}
/*
* Enable the extended processor state save/restore feature
@@ -382,19 +481,21 @@ static void __init setup_xstate_features(void)
/*
* setup the xstate image representing the init state
*/
-static void __init setup_xstate_init(void)
+static void __init setup_init_fpu_buf(void)
{
- setup_xstate_features();
-
/*
* Setup init_xstate_buf to represent the init state of
* all the features managed by the xsave
*/
init_xstate_buf = alloc_bootmem_align(xstate_size,
__alignof__(struct xsave_struct));
- init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
+ fx_finit(&init_xstate_buf->i387);
+
+ if (!cpu_has_xsave)
+ return;
+
+ setup_xstate_features();
- clts();
/*
* Init all the features state with header_bv being 0x0
*/
@@ -404,9 +505,21 @@ static void __init setup_xstate_init(void)
* of any feature which is not represented by all zero's.
*/
xsave_state(init_xstate_buf, -1);
- stts();
}
+static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
+static int __init eager_fpu_setup(char *s)
+{
+ if (!strcmp(s, "on"))
+ eagerfpu = ENABLE;
+ else if (!strcmp(s, "off"))
+ eagerfpu = DISABLE;
+ else if (!strcmp(s, "auto"))
+ eagerfpu = AUTO;
+ return 1;
+}
+__setup("eagerfpu=", eager_fpu_setup);
+
/*
* Enable and initialize the xsave feature.
*/
@@ -423,7 +536,7 @@ static void __init xstate_enable_boot_cpu(void)
pcntxt_mask = eax + ((u64)edx << 32);
if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
- printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
+ pr_err("FP/SSE not shown under xsave features 0x%llx\n",
pcntxt_mask);
BUG();
}
@@ -443,12 +556,24 @@ static void __init xstate_enable_boot_cpu(void)
update_regset_xstate_info(xstate_size, pcntxt_mask);
prepare_fx_sw_frame();
+ setup_init_fpu_buf();
+
+ /* Auto enable eagerfpu for xsaveopt */
+ if (cpu_has_xsaveopt && eagerfpu != DISABLE)
+ eagerfpu = ENABLE;
+
+ if (pcntxt_mask & XSTATE_EAGER) {
+ if (eagerfpu == DISABLE) {
+ pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n",
+ pcntxt_mask & XSTATE_EAGER);
+ pcntxt_mask &= ~XSTATE_EAGER;
+ } else {
+ eagerfpu = ENABLE;
+ }
+ }
- setup_xstate_init();
-
- printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
- "cntxt size 0x%x\n",
- pcntxt_mask, xstate_size);
+ pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
+ pcntxt_mask, xstate_size);
}
/*
@@ -458,7 +583,7 @@ static void __init xstate_enable_boot_cpu(void)
* This is somewhat obfuscated due to the lack of powerful enough
* overrides for the section checks.
*/
-void __cpuinit xsave_init(void)
+void xsave_init(void)
{
static __refdata void (*next_func)(void) = xstate_enable_boot_cpu;
void (*this_func)(void);
@@ -470,3 +595,43 @@ void __cpuinit xsave_init(void)
next_func = xstate_enable;
this_func();
}
+
+static inline void __init eager_fpu_init_bp(void)
+{
+ current->thread.fpu.state =
+ alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
+ if (!init_xstate_buf)
+ setup_init_fpu_buf();
+}
+
+void eager_fpu_init(void)
+{
+ static __refdata void (*boot_func)(void) = eager_fpu_init_bp;
+
+ clear_used_math();
+ current_thread_info()->status = 0;
+
+ if (eagerfpu == ENABLE)
+ setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
+
+ if (!cpu_has_eager_fpu) {
+ stts();
+ return;
+ }
+
+ if (boot_func) {
+ boot_func();
+ boot_func = NULL;
+ }
+
+ /*
+ * This is same as math_state_restore(). But use_xsave() is
+ * not yet patched to use math_state_restore().
+ */
+ init_fpu(current);
+ __thread_fpu_begin(current);
+ if (cpu_has_xsave)
+ xrstor_state(init_xstate_buf, -1);
+ else
+ fxrstor_checking(&init_xstate_buf->i387);
+}